diff --git a/.github/workflows/clang-format-check.yml b/.github/workflows/clang-format-check.yml new file mode 100644 index 00000000000..5a7e0e1c662 --- /dev/null +++ b/.github/workflows/clang-format-check.yml @@ -0,0 +1,15 @@ +name: clang-format check + +on: [push, pull_request] + +permissions: read-all + +jobs: + formatting-check: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - name: Run clang-format style check. + uses: DoozyX/clang-format-lint-action@v0.16.2 + with: + clangFormatVersion: 8 diff --git a/.github/workflows/codeql.yml b/.github/workflows/codeql.yml new file mode 100644 index 00000000000..2ed86a14751 --- /dev/null +++ b/.github/workflows/codeql.yml @@ -0,0 +1,51 @@ +name: "CodeQL" + +on: + push: + branches: [ "master", "develop", "release-*" ] + pull_request: + branches: [ "develop" ] + +permissions: read-all + +jobs: + analyze: + name: Analyze + runs-on: ubuntu-latest + timeout-minutes: 360 + permissions: + # required for all workflows + security-events: write + + # only required for workflows in private repositories + actions: read + contents: read + + steps: + - name: Checkout repository + uses: actions/checkout@v4 + + # Initializes the CodeQL tools for scanning. + - name: Initialize CodeQL + uses: github/codeql-action/init@v3 + with: + languages: c-cpp + + - name: configure + run: + cmake -B build . + -DKokkos_ENABLE_OPENMP=ON + -DCMAKE_CXX_STANDARD=17 + -DKokkos_ENABLE_DEPRECATED_CODE_4=OFF + -DKokkos_ENABLE_TESTS=ON + -DKokkos_ENABLE_EXAMPLES=ON + -DKokkos_ENABLE_BENCHMARKS=ON + -DCMAKE_BUILD_TYPE=Debug + - name: build + run: + cmake --build build --parallel 2 + + - name: Perform CodeQL Analysis + uses: github/codeql-action/analyze@v3 + with: + category: "/language:c-cpp" diff --git a/.github/workflows/continuous-integration-workflow-32bit.yml b/.github/workflows/continuous-integration-workflow-32bit.yml index 68fbdbe8a47..87c21d3a6e7 100644 --- a/.github/workflows/continuous-integration-workflow-32bit.yml +++ b/.github/workflows/continuous-integration-workflow-32bit.yml @@ -9,6 +9,8 @@ on: - '**/*.md' types: [ opened, reopened, synchronize ] +permissions: read-all + concurrency: group: ${ {github.event_name }}-${{ github.workflow }}-${{ github.ref }} cancel-in-progress: ${{github.event_name == 'pull_request'}} @@ -21,7 +23,7 @@ jobs: image: ghcr.io/kokkos/ci-containers/ubuntu:latest steps: - name: Checkout code - uses: actions/checkout@v3 + uses: actions/checkout@v4 - name: install_multilib run: sudo apt-get update && sudo apt-get install -y gcc-multilib g++-multilib gfortran-multilib - name: Configure Kokkos diff --git a/.github/workflows/continuous-integration-workflow-hpx.yml b/.github/workflows/continuous-integration-workflow-hpx.yml index 8b39350dc87..35704a28cf3 100644 --- a/.github/workflows/continuous-integration-workflow-hpx.yml +++ b/.github/workflows/continuous-integration-workflow-hpx.yml @@ -13,6 +13,8 @@ concurrency: group: ${ {github.event_name }}-${{ github.workflow }}-${{ github.ref }} cancel-in-progress: ${{github.event_name == 'pull_request'}} +permissions: read-all + jobs: hpx: name: hpx @@ -20,7 +22,7 @@ jobs: steps: - name: checkout code - uses: actions/checkout@v3 + uses: actions/checkout@v4 with: path: kokkos - name: setup hpx dependencies @@ -33,12 +35,12 @@ jobs: libboost-all-dev \ ninja-build - name: checkout hpx - uses: actions/checkout@v3 + uses: actions/checkout@v4 with: repository: STELLAR-GROUP/hpx - ref: 1.8.0 + ref: v1.9.0 path: hpx - - uses: actions/cache@v3 + - uses: actions/cache@v4 id: cache-hpx with: path: 
./hpx/install diff --git a/.github/workflows/continuous-integration-workflow.yml b/.github/workflows/continuous-integration-workflow.yml index 8c226c3766c..6425cc2668e 100644 --- a/.github/workflows/continuous-integration-workflow.yml +++ b/.github/workflows/continuous-integration-workflow.yml @@ -13,6 +13,8 @@ concurrency: group: ${ {github.event_name }}-${{ github.workflow }}-${{ github.ref }} cancel-in-progress: ${{github.event_name == 'pull_request'}} +permissions: read-all + jobs: CI: continue-on-error: true @@ -25,31 +27,39 @@ jobs: backend: ['OPENMP'] clang-tidy: [''] include: - - distro: 'fedora:intel' + - distro: 'ubuntu:intel' cxx: 'icpc' cxx_extra_flags: '-diag-disable=177,10441' cmake_build_type: 'Release' backend: 'OPENMP' - - distro: 'fedora:intel' + - distro: 'ubuntu:intel' cxx: 'icpc' cxx_extra_flags: '-diag-disable=177,10441' cmake_build_type: 'Debug' backend: 'OPENMP' - - distro: 'fedora:intel' + - distro: 'ubuntu:intel' cxx: 'icpx' cxx_extra_flags: '-fp-model=precise -Wno-pass-failed' cmake_build_type: 'Release' backend: 'OPENMP' - - distro: 'fedora:intel' + - distro: 'ubuntu:intel' cxx: 'icpx' cxx_extra_flags: '-fp-model=precise -Wno-pass-failed' cmake_build_type: 'Debug' backend: 'OPENMP' - distro: 'ubuntu:latest' cxx: 'clang++' + cxx_extra_flags: '-fsanitize=address' + extra_linker_flags: '-fsanitize=address' cmake_build_type: 'RelWithDebInfo' backend: 'THREADS' clang-tidy: '-DCMAKE_CXX_CLANG_TIDY="clang-tidy;-warnings-as-errors=*"' + - distro: 'ubuntu:latest' + cxx: 'clang++' + cxx_extra_flags: '-fsanitize=address' + extra_linker_flags: '-fsanitize=address' + cmake_build_type: 'RelWithDebInfo' + backend: 'SERIAL' - distro: 'ubuntu:latest' cxx: 'g++' cmake_build_type: 'RelWithDebInfo' @@ -59,7 +69,7 @@ jobs: image: ghcr.io/kokkos/ci-containers/${{ matrix.distro }} steps: - name: Checkout desul - uses: actions/checkout@v3 + uses: actions/checkout@v4 with: repository: desul/desul ref: 477da9c8f40f8db369c28dd3f93a67e376d8511b @@ -74,8 +84,8 @@ jobs: cmake -DDESUL_ENABLE_TESTS=OFF -DCMAKE_INSTALL_PREFIX=/usr/desul-install .. sudo cmake --build . 
--target install --parallel 2 - name: Checkout code - uses: actions/checkout@v3 - - uses: actions/cache@v3 + uses: actions/checkout@v4 + - uses: actions/cache@v4 with: path: ~/.cache/ccache key: kokkos-${{ matrix.distro }}-${{ matrix.cxx }}-${{ matrix.cmake_build_type }}-${{ matrix.openmp }}-${{ github.ref }}-${{ github.sha }} @@ -106,6 +116,7 @@ jobs: -DKokkos_ENABLE_COMPILER_WARNINGS=ON \ -DKokkos_ENABLE_IMPL_MDSPAN=ON \ -DCMAKE_CXX_FLAGS="-Werror ${{ matrix.cxx_extra_flags }}" \ + -DCMAKE_EXE_LINKER_FLAGS="${{ matrix.extra_linker_flags }}" \ -DCMAKE_CXX_COMPILER=${{ matrix.cxx }} \ -DCMAKE_CXX_COMPILER_LAUNCHER=ccache \ -DCMAKE_BUILD_TYPE=${{ matrix.cmake_build_type }} @@ -118,6 +129,7 @@ jobs: working-directory: builddir run: ctest --output-on-failure - name: Test linking against build dir + if: ${{ matrix.cxx_extra_flags != '-fsanitize=address' }} working-directory: example/build_cmake_installed run: | cmake -B builddir_buildtree -DCMAKE_CXX_COMPILER=${{ matrix.cxx }} -DKokkos_ROOT=../../builddir @@ -128,6 +140,7 @@ jobs: - name: Install run: sudo cmake --build builddir --target install - name: Test install + if: ${{ matrix.cxx_extra_flags != '-fsanitize=address' }} working-directory: example/build_cmake_installed run: | cmake -B builddir -DCMAKE_CXX_COMPILER=${{ matrix.cxx }} diff --git a/.github/workflows/osx.yml b/.github/workflows/osx.yml index 85b079e56c8..2bcf41a0d30 100644 --- a/.github/workflows/osx.yml +++ b/.github/workflows/osx.yml @@ -13,6 +13,8 @@ concurrency: group: ${ {github.event_name }}-${{ github.workflow }}-${{ github.ref }} cancel-in-progress: ${{github.event_name == 'pull_request'}} +permissions: read-all + jobs: osxci: name: osx-ci @@ -31,7 +33,7 @@ jobs: cmake_build_type: "Release" steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - name: configure run: cmake -B build . diff --git a/.github/workflows/performance-benchmark.yml b/.github/workflows/performance-benchmark.yml index 59eed4f6096..bfbbeea4a85 100644 --- a/.github/workflows/performance-benchmark.yml +++ b/.github/workflows/performance-benchmark.yml @@ -8,6 +8,8 @@ on: - '**/*.md' types: [ opened, reopened, synchronize ] +permissions: read-all + jobs: CI: continue-on-error: true @@ -23,8 +25,8 @@ jobs: BUILD_ID: ${{ matrix.distro }}-${{ matrix.cxx }}-${{ matrix.backend }} steps: - name: Checkout code - uses: actions/checkout@v3 - - uses: actions/cache@v3 + uses: actions/checkout@v4 + - uses: actions/cache@v4 with: path: ~/.cache/ccache key: kokkos-${{ matrix.distro }}-${{ matrix.cxx }}-${{ matrix.backend }}-${{ github.ref }}-${{ github.sha }} diff --git a/.github/workflows/scorecard.yml b/.github/workflows/scorecard.yml new file mode 100644 index 00000000000..3d7ede20773 --- /dev/null +++ b/.github/workflows/scorecard.yml @@ -0,0 +1,73 @@ +# This workflow uses actions that are not certified by GitHub. They are provided +# by a third-party and are governed by separate terms of service, privacy +# policy, and support documentation. + +name: Scorecard supply-chain security +on: + # For Branch-Protection check. Only the default branch is supported. See + # https://github.com/ossf/scorecard/blob/main/docs/checks.md#branch-protection + branch_protection_rule: + # To guarantee Maintained check is occasionally updated. See + # https://github.com/ossf/scorecard/blob/main/docs/checks.md#maintained + schedule: + # Weekly on Saturdays. + - cron: '30 1 * * 6' + push: + branches: [ master, develop ] + +# Declare default permissions as read only. 
+permissions: read-all + +jobs: + analysis: + name: Scorecard analysis + runs-on: ubuntu-latest + permissions: + # Needed to upload the results to code-scanning dashboard. + security-events: write + # Needed to publish results and get a badge (see publish_results below). + id-token: write + # Uncomment the permissions below if installing in a private repository. + # contents: read + # actions: read + + steps: + - name: "Checkout code" + uses: actions/checkout@8e5e7e5ab8b370d6c329ec480221332ada57f0ab # v3.5.2 + with: + persist-credentials: false + + - name: "Run analysis" + uses: ossf/scorecard-action@80e868c13c90f172d68d1f4501dee99e2479f7af # v2.1.3 + with: + results_file: results.sarif + results_format: sarif + # (Optional) "write" PAT token. Uncomment the `repo_token` line below if: + # - you want to enable the Branch-Protection check on a *public* repository, or + # - you are installing Scorecard on a *private* repository + # To create the PAT, follow the steps in https://github.com/ossf/scorecard-action#authentication-with-pat. + # repo_token: ${{ secrets.SCORECARD_TOKEN }} + + # Public repositories: + # - Publish results to OpenSSF REST API for easy access by consumers + # - Allows the repository to include the Scorecard badge. + # - See https://github.com/ossf/scorecard-action#publishing-results. + # For private repositories: + # - `publish_results` will always be set to `false`, regardless + # of the value entered here. + publish_results: true + + # Upload the results as artifacts (optional). Commenting out will disable uploads of run results in SARIF + # format to the repository Actions tab. + - name: "Upload artifact" + uses: actions/upload-artifact@0b7f8abb1508181956e8e162db84b466c27e18ce # v3.1.2 + with: + name: SARIF file + path: results.sarif + retention-days: 5 + + # Upload the results to GitHub's code scanning dashboard. + - name: "Upload SARIF results to code scanning" + uses: github/codeql-action/upload-sarif@83f0fe6c4988d98a455712a27f0255212bba9bd4 # v2.3.6 + with: + sarif_file: results.sarif diff --git a/.github/workflows/windows.yml b/.github/workflows/windows.yml new file mode 100644 index 00000000000..1bea94a721b --- /dev/null +++ b/.github/workflows/windows.yml @@ -0,0 +1,35 @@ +name: github-windows + +on: + push: + pull_request: + +concurrency: + group: ${{ github.event_name }}-${{ github.workflow }}-${{ github.ref }} + cancel-in-progress: ${{github.event_name == 'pull_request'}} + +permissions: read-all + +jobs: + windows-cuda: + # Cuda build on Windows + name: Windows Cuda + runs-on: windows-2022 + + steps: + - uses: Jimver/cuda-toolkit@v0.2.14 + id: cuda-toolkit + with: + cuda: '12.1.0' + - uses: actions/checkout@v4 + - name: configure + shell: bash + run: | + mkdir build + mkdir c:/project + cd build + cmake -DKokkos_ENABLE_CUDA=ON -DKokkos_ARCH_VOLTA70=ON -DKokkos_ENABLE_TESTS=ON -DKokkos_ENABLE_COMPILE_AS_CMAKE_LANGUAGE=ON .. 
+ - name: build library + shell: bash + run: | + cmake --build build --parallel 2 --config Release diff --git a/.jenkins b/.jenkins index 6f5cf80033f..ae3bffd92d7 100644 --- a/.jenkins +++ b/.jenkins @@ -8,16 +8,21 @@ pipeline { } options { + disableConcurrentBuilds(abortPrevious: true) timeout(time: 6, unit: 'HOURS') } + triggers { + issueCommentTrigger('.*test this please.*') + } + stages { stage('Clang-Format') { agent { dockerfile { filename 'Dockerfile.clang' dir 'scripts/docker' - label 'nvidia-docker || rocm-docker || docker' + label 'nvidia-docker || docker' args '-v /tmp/ccache.kokkos:/tmp/ccache' } } @@ -102,12 +107,11 @@ pipeline { } steps { sh 'ccache --zero-stats' - sh '''. /opt/intel/oneapi/setvars.sh --include-intel-llvm && \ - rm -rf build && mkdir -p build && cd build && \ + sh '''rm -rf build && mkdir -p build && cd build && \ cmake \ -DCMAKE_BUILD_TYPE=Release \ -DCMAKE_CXX_COMPILER_LAUNCHER=ccache \ - -DCMAKE_CXX_COMPILER=/opt/intel/oneapi/compiler/2023.0.0/linux/bin-llvm/clang++ \ + -DCMAKE_CXX_COMPILER=clang++ \ -DCMAKE_CXX_FLAGS="-fsycl-device-code-split=per_kernel -Wno-deprecated-declarations -Werror -Wno-gnu-zero-variadic-macro-arguments -Wno-unknown-cuda-version -Wno-sycl-target" \ -DKOKKOS_IMPL_SYCL_DEVICE_GLOBAL_SUPPORTED=0 \ -DKokkos_ARCH_NATIVE=ON \ @@ -135,8 +139,8 @@ pipeline { dockerfile { filename 'Dockerfile.hipcc' dir 'scripts/docker' - additionalBuildArgs '--build-arg BASE=rocm/dev-ubuntu-20.04:5.2' - label 'rocm-docker && vega' + additionalBuildArgs '--build-arg BASE=rocm/dev-ubuntu-20.04:5.2-complete' + label 'rocm-docker ' args '-v /tmp/ccache.kokkos:/tmp/ccache --device=/dev/kfd --device=/dev/dri --security-opt seccomp=unconfined --group-add video --env HIP_VISIBLE_DEVICES=$HIP_VISIBLE_DEVICES' } } @@ -177,8 +181,8 @@ pipeline { dockerfile { filename 'Dockerfile.hipcc' dir 'scripts/docker' - additionalBuildArgs '--build-arg BASE=rocm/dev-ubuntu-20.04:5.6' - label 'rocm-docker && vega' + additionalBuildArgs '--build-arg BASE=rocm/dev-ubuntu-20.04:5.6-complete' + label 'rocm-docker' args '-v /tmp/ccache.kokkos:/tmp/ccache --device=/dev/kfd --device=/dev/dri --security-opt seccomp=unconfined --group-add video --env HIP_VISIBLE_DEVICES=$HIP_VISIBLE_DEVICES' } } @@ -390,7 +394,6 @@ pipeline { -DKokkos_ENABLE_CUDA_LAMBDA=OFF \ -DKokkos_ENABLE_CUDA_UVM=ON \ -DKokkos_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE=ON \ - -DKokkos_ENABLE_DEPRECATED_CODE_3=ON \ -DKokkos_ENABLE_DEPRECATED_CODE_4=ON \ -DKokkos_ENABLE_DEPRECATION_WARNINGS=OFF \ -DKokkos_ENABLE_IMPL_MDSPAN=ON \ @@ -493,7 +496,6 @@ pipeline { -DCMAKE_CXX_FLAGS=-Werror \ -DKokkos_ARCH_NATIVE=ON \ -DKokkos_ENABLE_COMPILER_WARNINGS=ON \ - -DKokkos_ENABLE_DEPRECATED_CODE_3=ON \ -DKokkos_ENABLE_DEPRECATED_CODE_4=ON \ -DKokkos_ENABLE_DEPRECATION_WARNINGS=OFF \ -DKokkos_ENABLE_TESTS=ON \ diff --git a/CHANGELOG.md b/CHANGELOG.md index 40e3c95f24f..f7b8af7695c 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,105 @@ # CHANGELOG +## [4.3.00](https://github.com/kokkos/kokkos/tree/4.3.00) (2024-03-19) +[Full Changelog](https://github.com/kokkos/kokkos/compare/4.2.01...4.3.00) + +### Features: +* Add `Experimental::sort_by_key(exec, keys, values)` algorithm [\#6801](https://github.com/kokkos/kokkos/pull/6801) + +### Backend and Architecture Enhancements: + +#### CUDA: +* Experimental multi-GPU support (from the same process) [\#6782](https://github.com/kokkos/kokkos/pull/6782) +* Link against CUDA libraries even with KOKKOS_ENABLE_COMPILE_AS_CMAKE_LANGUAGE [\#6701](https://github.com/kokkos/kokkos/pull/6701) +* 
Don't use the compiler launcher script if the CMake compile language is CUDA. [\#6704](https://github.com/kokkos/kokkos/pull/6704) +* nvcc(wrapper): adding "long" and "short" versions for all flags [\#6615](https://github.com/kokkos/kokkos/pull/6615) + +#### HIP: +* Fix compilation when using amdclang (with ROCm >= 5.7) and RDC [\#6857](https://github.com/kokkos/kokkos/pull/6857) +* Use rocthrust for sorting, when available [\#6793](https://github.com/kokkos/kokkos/pull/6793) + +#### SYCL: +* We only support the oneAPI SYCL implementation: add check during initialization + * Error out on initialization if the backend is different from `ext_oneapi_*` [\#6784](https://github.com/kokkos/kokkos/pull/6784) + * Filter GPU devices for `ext_oneapi_*` GPU devices [\#6758](https://github.com/kokkos/kokkos/pull/6758) +* Performance Improvements + * Avoid unnecessary zero-memset of the scratch flags in SYCL [\#6739](https://github.com/kokkos/kokkos/pull/6739) + * Use host-pinned memory to copy reduction/scan result [\#6500](https://github.com/kokkos/kokkos/pull/6500) +* Address deprecations after oneAPI 2023.2.0 [\#6577](https://github.com/kokkos/kokkos/pull/6577) +* Make sure to call find_dependency for oneDPL if necessary [\#6870](https://github.com/kokkos/kokkos/pull/6870) + +#### OpenMPTarget: +* Use LLVM extensions for dynamic shared memory [\#6380](https://github.com/kokkos/kokkos/pull/6380) +* Guard scratch memory usage in ParallelReduce [\#6585](https://github.com/kokkos/kokkos/pull/6585) +* Update linker flags for Intel GPUs [\#6735](https://github.com/kokkos/kokkos/pull/6735) +* Improve handling of printf on Intel GPUs [\#6652](https://github.com/kokkos/kokkos/pull/6652) + +#### OpenACC: +* Add atomics support [\#6446](https://github.com/kokkos/kokkos/pull/6446) +* Make the OpenACC backend asynchronous [\#6772](https://github.com/kokkos/kokkos/pull/6772) + +#### Threads: +* Add missing broadcast to TeamThreadRange parallel_scan [\#6601](https://github.com/kokkos/kokkos/pull/6601) + +#### OpenMP: +* Improve performance of view initializations and filling with zeros [\#6573](https://github.com/kokkos/kokkos/pull/6573) + +### General Enhancements + +* Improve performance of random number generation when using a normal distribution on GPUs [\#6556](https://github.com/kokkos/kokkos/pull/6556) +* Allocate temporary view with the user-provided execution space instance and do not initialize in `unique` algorithm [\#6598](https://github.com/kokkos/kokkos/pull/6598) +* Add deduction guide for `Kokkos::Array` [\#6373](https://github.com/kokkos/kokkos/pull/6373) +* Provide new public headers `<Kokkos_Clamp.hpp>` and `<Kokkos_MinMax.hpp>` [\#6687](https://github.com/kokkos/kokkos/pull/6687) +* Fix/improvement to `remove_if` parallel algorithm: use the provided execution space instance for temporary allocations, drop unnecessary initialization, and avoid evaluating the predicate twice during the final pass [\#6747](https://github.com/kokkos/kokkos/pull/6747) +* Add runtime function to query the number of devices and make device ID consistent with `KOKKOS_VISIBLE_DEVICES` [\#6713](https://github.com/kokkos/kokkos/pull/6713) +* simd: support `vector_aligned_tag` [\#6243](https://github.com/kokkos/kokkos/pull/6243) +* Avoid unnecessary allocation when default constructing Bitset [\#6524](https://github.com/kokkos/kokkos/pull/6524) +* Fix constness for views in std algorithms [\#6813](https://github.com/kokkos/kokkos/pull/6813) +* Improve error message on unsafe implicit conversion in MDRangePolicy [\#6855](https://github.com/kokkos/kokkos/pull/6855) +* CTAD (deduction guides) for RangePolicy (see the sketch below) [\#6850](https://github.com/kokkos/kokkos/pull/6850) +* CTAD (deduction guides) for MDRangePolicy [\#5516](https://github.com/kokkos/kokkos/pull/5516)
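To make the deduction-guide entries above concrete, here is a minimal sketch of what [\#6373](https://github.com/kokkos/kokkos/pull/6373) and [\#6850](https://github.com/kokkos/kokkos/pull/6850) enable. It is not part of this PR; the lambda body and names are illustrative only, and it assumes a Kokkos 4.3 build:

```c++
#include <Kokkos_Core.hpp>

int main(int argc, char* argv[]) {
  Kokkos::ScopeGuard guard(argc, argv);
  // Deduction guide for Kokkos::Array (#6373): deduces Kokkos::Array<double, 3>.
  Kokkos::Array a{1.0, 2.0, 3.0};
  // CTAD for RangePolicy (#6850): RangePolicy<> over the default execution
  // space is deduced, with no template arguments spelled out.
  double sum = 0.0;
  Kokkos::parallel_reduce(
      Kokkos::RangePolicy(0, 3),
      KOKKOS_LAMBDA(int i, double& partial) { partial += a[i]; }, sum);
  return 0;
}
```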
+ +### Build System Changes +* Require `Kokkos_ENABLE_ATOMICS_BYPASS` option to bypass atomic operations for Serial backend only builds [\#6692](https://github.com/kokkos/kokkos/pull/6692) +* Add support for RISC-V and the Milk-V Pioneer [\#6773](https://github.com/kokkos/kokkos/pull/6773) +* Add C++26 standard to CMake setup [\#6733](https://github.com/kokkos/kokkos/pull/6733) +* Fix Makefile when using gnu_generate_makefile.sh and make >= 4.3 [\#6606](https://github.com/kokkos/kokkos/pull/6606) +* Cuda: Fix configuring with CMake >= 3.28.4 - temporary fallback to internal CudaToolkit.cmake [\#6898](https://github.com/kokkos/kokkos/pull/6898) + +### Incompatibilities (i.e. breaking changes) +* Remove the `DEPRECATED_CODE_3` option and all code that was guarded by it [\#6523](https://github.com/kokkos/kokkos/pull/6523) +* Drop guards to accommodate external code defining `KOKKOS_ASSERT` [\#6665](https://github.com/kokkos/kokkos/pull/6665) +* `Profiling::ProfilingSection(std::string)` constructor marked explicit and nodiscard [\#6690](https://github.com/kokkos/kokkos/pull/6690) +* Add bound check preconditions for `RangePolicy` and `MDRangePolicy` [\#6617](https://github.com/kokkos/kokkos/pull/6617) [\#6726](https://github.com/kokkos/kokkos/pull/6726) +* Add checks for unsafe implicit conversions in RangePolicy [\#6754](https://github.com/kokkos/kokkos/pull/6754) +* Remove Kokkos::[b]half_t volatile overloads [\#6579](https://github.com/kokkos/kokkos/pull/6579) +* Remove KOKKOS_IMPL_DO_NOT_USE_PRINTF [\#6593](https://github.com/kokkos/kokkos/pull/6593) +* Check matching static extents in View constructor [\#5190](https://github.com/kokkos/kokkos/pull/5190) +* Tools(profiling): fix typo Kokkos_Tools_Optim[i]zationGoal [\#6642](https://github.com/kokkos/kokkos/pull/6642) +* Remove variadic range policy constructor (disallow passing multiple trailing chunk size arguments) [\#6845](https://github.com/kokkos/kokkos/pull/6845) +* Improve message on view out of bounds access and always abort [\#6861](https://github.com/kokkos/kokkos/pull/6861) +* Drop `KOKKOS_ENABLE_INTEL_MM_ALLOC` macro [\#6797](https://github.com/kokkos/kokkos/pull/6797) +* Remove `Kokkos::Experimental::LogicalMemorySpace` (without going through deprecation) [\#6557](https://github.com/kokkos/kokkos/pull/6557) +* Remove `Experimental::HBWSpace` and support for linking against memkind [\#6791](https://github.com/kokkos/kokkos/pull/6791) +* Drop librt TPL and associated `KOKKOS_ENABLE_LIBRT` macro [\#6798](https://github.com/kokkos/kokkos/pull/6798) +* Drop support for old CPU architectures (`ARCH_BGQ`, `ARCH_POWER7`, `ARCH_WSM` and associated `ARCH_SSE4` macro) [\#6806](https://github.com/kokkos/kokkos/pull/6806) +* Drop support for deprecated command-line arguments and environment variables [\#6744](https://github.com/kokkos/kokkos/pull/6744) + +### Deprecations +* Provide kokkos_swap as part of Core and deprecate Experimental::swap in Algorithms [\#6697](https://github.com/kokkos/kokkos/pull/6697) +* Deprecate {Cuda,HIP}::detect_device_count() and Cuda::[detect_]device_arch() [\#6710](https://github.com/kokkos/kokkos/pull/6710) +* Deprecate `ExecutionSpace::in_parallel()` [\#6582](https://github.com/kokkos/kokkos/pull/6582) + +### Bug Fixes +* Fix team-level MDRange reductions: 
[\#6511](https://github.com/kokkos/kokkos/pull/6511) +* Fix CUDA and SYCL small value type (16-bit) team reductions [\#5334](https://github.com/kokkos/kokkos/pull/5334) +* Enable `{transform_}exclusive_scan` in place [\#6667](https://github.com/kokkos/kokkos/pull/6667) +* `fill_random` overload that do not take an execution space instance argument should fence [\#6658](https://github.com/kokkos/kokkos/pull/6658) +* HIP,Cuda,OpenMPTarget: Fixup use provided execution space when copying host inaccessible reduction result [\#6777](https://github.com/kokkos/kokkos/pull/6777) +* Fix typo in `cuda_func_set_attribute[s]_wrapper` preventing proper setting of desired occupancy [\#6786](https://github.com/kokkos/kokkos/pull/6786) +* Avoid undefined behavior due to conversion between signed and unsigned integers in shift_{right, left}_team_impl [\#6821](https://github.com/kokkos/kokkos/pull/6821) +* Fix a bug in Makefile.kokkos when using AMD GPU architectures as `AMD_GFXYYY` [\#6892](https://github.com/kokkos/kokkos/pull/6892) + ## [4.2.01](https://github.com/kokkos/kokkos/tree/4.2.01) (2023-12-07) [Full Changelog](https://github.com/kokkos/kokkos/compare/4.2.00...4.2.01) @@ -999,95 +1099,95 @@ - Major update for OpenMPTarget: many capabilities now work. For details contact us. - Added DPC++/SYCL backend: primary capabilites are working. - Added Kokkos Graph API analogous to CUDA Graphs. -- Added parallel_scan support with TeamThreadRange [\#3536](https://github.com/kokkos/kokkos/pull/#3536) -- Added Logical Memory Spaces [\#3546](https://github.com/kokkos/kokkos/pull/#3546) -- Added initial half precision support [\#3439](https://github.com/kokkos/kokkos/pull/#3439) -- Experimental feature: control cuda occupancy [\#3379](https://github.com/kokkos/kokkos/pull/#3379) +- Added parallel_scan support with TeamThreadRange [\#3536](https://github.com/kokkos/kokkos/pull/3536) +- Added Logical Memory Spaces [\#3546](https://github.com/kokkos/kokkos/pull/3546) +- Added initial half precision support [\#3439](https://github.com/kokkos/kokkos/pull/3439) +- Experimental feature: control cuda occupancy [\#3379](https://github.com/kokkos/kokkos/pull/3379) **Implemented enhancements Backends and Archs:** -- Add a64fx and fujitsu Compiler support [\#3614](https://github.com/kokkos/kokkos/pull/#3614) -- Adding support for AMD gfx908 archictecture [\#3375](https://github.com/kokkos/kokkos/pull/#3375) -- SYCL parallel\_for MDRangePolicy [\#3583](https://github.com/kokkos/kokkos/pull/#3583) -- SYCL add parallel\_scan [\#3577](https://github.com/kokkos/kokkos/pull/#3577) -- SYCL custom reductions [\#3544](https://github.com/kokkos/kokkos/pull/#3544) -- SYCL Enable container unit tests [\#3550](https://github.com/kokkos/kokkos/pull/#3550) -- SYCL feature level 5 [\#3480](https://github.com/kokkos/kokkos/pull/#3480) -- SYCL Feature level 4 (parallel\_for) [\#3474](https://github.com/kokkos/kokkos/pull/#3474) -- SYCL feature level 3 [\#3451](https://github.com/kokkos/kokkos/pull/#3451) -- SYCL feature level 2 [\#3447](https://github.com/kokkos/kokkos/pull/#3447) -- OpenMPTarget: Hierarchial reduction for + operator on scalars [\#3504](https://github.com/kokkos/kokkos/pull/#3504) -- OpenMPTarget hierarchical [\#3411](https://github.com/kokkos/kokkos/pull/#3411) -- HIP Add Impl::atomic\_[store,load] [\#3440](https://github.com/kokkos/kokkos/pull/#3440) -- HIP enable global lock arrays [\#3418](https://github.com/kokkos/kokkos/pull/#3418) -- HIP Implement multiple occupancy paths for various HIP kernel launchers 
[\#3366](https://github.com/kokkos/kokkos/pull/#3366) +- Add a64fx and fujitsu Compiler support [\#3614](https://github.com/kokkos/kokkos/pull/3614) +- Adding support for AMD gfx908 archictecture [\#3375](https://github.com/kokkos/kokkos/pull/3375) +- SYCL parallel\_for MDRangePolicy [\#3583](https://github.com/kokkos/kokkos/pull/3583) +- SYCL add parallel\_scan [\#3577](https://github.com/kokkos/kokkos/pull/3577) +- SYCL custom reductions [\#3544](https://github.com/kokkos/kokkos/pull/3544) +- SYCL Enable container unit tests [\#3550](https://github.com/kokkos/kokkos/pull/3550) +- SYCL feature level 5 [\#3480](https://github.com/kokkos/kokkos/pull/3480) +- SYCL Feature level 4 (parallel\_for) [\#3474](https://github.com/kokkos/kokkos/pull/3474) +- SYCL feature level 3 [\#3451](https://github.com/kokkos/kokkos/pull/3451) +- SYCL feature level 2 [\#3447](https://github.com/kokkos/kokkos/pull/3447) +- OpenMPTarget: Hierarchial reduction for + operator on scalars [\#3504](https://github.com/kokkos/kokkos/pull/3504) +- OpenMPTarget hierarchical [\#3411](https://github.com/kokkos/kokkos/pull/3411) +- HIP Add Impl::atomic\_[store,load] [\#3440](https://github.com/kokkos/kokkos/pull/3440) +- HIP enable global lock arrays [\#3418](https://github.com/kokkos/kokkos/pull/3418) +- HIP Implement multiple occupancy paths for various HIP kernel launchers [\#3366](https://github.com/kokkos/kokkos/pull/3366) **Implemented enhancements Policies:** -- MDRangePolicy: Let it be semiregular [\#3494](https://github.com/kokkos/kokkos/pull/#3494) -- MDRangePolicy: Check narrowing conversion in construction [\#3527](https://github.com/kokkos/kokkos/pull/#3527) -- MDRangePolicy: CombinedReducers support [\#3395](https://github.com/kokkos/kokkos/pull/#3395) -- Kokkos Graph: Interface and Default Implementation [\#3362](https://github.com/kokkos/kokkos/pull/#3362) -- Kokkos Graph: add Cuda Graph implementation [\#3369](https://github.com/kokkos/kokkos/pull/#3369) -- TeamPolicy: implemented autotuning of team sizes and vector lengths [\#3206](https://github.com/kokkos/kokkos/pull/#3206) -- RangePolicy: Initialize all data members in default constructor [\#3509](https://github.com/kokkos/kokkos/pull/#3509) +- MDRangePolicy: Let it be semiregular [\#3494](https://github.com/kokkos/kokkos/pull/3494) +- MDRangePolicy: Check narrowing conversion in construction [\#3527](https://github.com/kokkos/kokkos/pull/3527) +- MDRangePolicy: CombinedReducers support [\#3395](https://github.com/kokkos/kokkos/pull/3395) +- Kokkos Graph: Interface and Default Implementation [\#3362](https://github.com/kokkos/kokkos/pull/3362) +- Kokkos Graph: add Cuda Graph implementation [\#3369](https://github.com/kokkos/kokkos/pull/3369) +- TeamPolicy: implemented autotuning of team sizes and vector lengths [\#3206](https://github.com/kokkos/kokkos/pull/3206) +- RangePolicy: Initialize all data members in default constructor [\#3509](https://github.com/kokkos/kokkos/pull/3509) **Implemented enhancements BuildSystem:** -- Auto-generate core test files for all backends [\#3488](https://github.com/kokkos/kokkos/pull/#3488) -- Avoid rewriting test files when calling cmake [\#3548](https://github.com/kokkos/kokkos/pull/#3548) -- RULE\_LAUNCH\_COMPILE and RULE\_LAUNCH\_LINK system for nvcc\_wrapper [\#3136](https://github.com/kokkos/kokkos/pull/#3136) -- Adding -include as a known argument to nvcc\_wrapper [\#3434](https://github.com/kokkos/kokkos/pull/#3434) -- Install hpcbind script [\#3402](https://github.com/kokkos/kokkos/pull/#3402) -- 
cmake/kokkos\_tribits.cmake: add parsing for args [\#3457](https://github.com/kokkos/kokkos/pull/#3457) +- Auto-generate core test files for all backends [\#3488](https://github.com/kokkos/kokkos/pull/3488) +- Avoid rewriting test files when calling cmake [\#3548](https://github.com/kokkos/kokkos/pull/3548) +- RULE\_LAUNCH\_COMPILE and RULE\_LAUNCH\_LINK system for nvcc\_wrapper [\#3136](https://github.com/kokkos/kokkos/pull/3136) +- Adding -include as a known argument to nvcc\_wrapper [\#3434](https://github.com/kokkos/kokkos/pull/3434) +- Install hpcbind script [\#3402](https://github.com/kokkos/kokkos/pull/3402) +- cmake/kokkos\_tribits.cmake: add parsing for args [\#3457](https://github.com/kokkos/kokkos/pull/3457) **Implemented enhancements Tools:** -- Changed namespacing of Kokkos::Tools::Impl::Impl::tune\_policy [\#3455](https://github.com/kokkos/kokkos/pull/#3455) -- Delegate to an impl allocate/deallocate method to allow specifying a SpaceHandle for MemorySpaces [\#3530](https://github.com/kokkos/kokkos/pull/#3530) -- Use the Kokkos Profiling interface rather than the Impl interface [\#3518](https://github.com/kokkos/kokkos/pull/#3518) -- Runtime option for tuning [\#3459](https://github.com/kokkos/kokkos/pull/#3459) -- Dual View Tool Events [\#3326](https://github.com/kokkos/kokkos/pull/#3326) +- Changed namespacing of Kokkos::Tools::Impl::Impl::tune\_policy [\#3455](https://github.com/kokkos/kokkos/pull/3455) +- Delegate to an impl allocate/deallocate method to allow specifying a SpaceHandle for MemorySpaces [\#3530](https://github.com/kokkos/kokkos/pull/3530) +- Use the Kokkos Profiling interface rather than the Impl interface [\#3518](https://github.com/kokkos/kokkos/pull/3518) +- Runtime option for tuning [\#3459](https://github.com/kokkos/kokkos/pull/3459) +- Dual View Tool Events [\#3326](https://github.com/kokkos/kokkos/pull/3326) **Implemented enhancements Other:** -- Abort on errors instead of just printing [\#3528](https://github.com/kokkos/kokkos/pull/#3528) -- Enable C++14 macros unconditionally [\#3449](https://github.com/kokkos/kokkos/pull/#3449) -- Make ViewMapping trivially copyable [\#3436](https://github.com/kokkos/kokkos/pull/#3436) -- Rename struct ViewMapping to class [\#3435](https://github.com/kokkos/kokkos/pull/#3435) -- Replace enums in Kokkos\_ViewMapping.hpp (removes -Wextra) [\#3422](https://github.com/kokkos/kokkos/pull/#3422) -- Use bool for enums representing bools [\#3416](https://github.com/kokkos/kokkos/pull/#3416) -- Fence active instead of default execution space instances [\#3388](https://github.com/kokkos/kokkos/pull/#3388) -- Refactor parallel\_reduce fence usage [\#3359](https://github.com/kokkos/kokkos/pull/#3359) -- Moved Space EBO helpers to Kokkos\_EBO [\#3357](https://github.com/kokkos/kokkos/pull/#3357) -- Add remove\_cvref type trait [\#3340](https://github.com/kokkos/kokkos/pull/#3340) -- Adding identity type traits and update definition of identity\_t alias [\#3339](https://github.com/kokkos/kokkos/pull/#3339) -- Add is\_specialization\_of type trait [\#3338](https://github.com/kokkos/kokkos/pull/#3338) -- Make ScratchMemorySpace semi-regular [\#3309](https://github.com/kokkos/kokkos/pull/#3309) -- Optimize min/max atomics with early exit on no-op case [\#3265](https://github.com/kokkos/kokkos/pull/#3265) -- Refactor Backend Development [\#2941](https://github.com/kokkos/kokkos/pull/#2941) +- Abort on errors instead of just printing [\#3528](https://github.com/kokkos/kokkos/pull/3528) +- Enable C++14 macros unconditionally 
[\#3449](https://github.com/kokkos/kokkos/pull/3449) +- Make ViewMapping trivially copyable [\#3436](https://github.com/kokkos/kokkos/pull/3436) +- Rename struct ViewMapping to class [\#3435](https://github.com/kokkos/kokkos/pull/3435) +- Replace enums in Kokkos\_ViewMapping.hpp (removes -Wextra) [\#3422](https://github.com/kokkos/kokkos/pull/3422) +- Use bool for enums representing bools [\#3416](https://github.com/kokkos/kokkos/pull/3416) +- Fence active instead of default execution space instances [\#3388](https://github.com/kokkos/kokkos/pull/3388) +- Refactor parallel\_reduce fence usage [\#3359](https://github.com/kokkos/kokkos/pull/3359) +- Moved Space EBO helpers to Kokkos\_EBO [\#3357](https://github.com/kokkos/kokkos/pull/3357) +- Add remove\_cvref type trait [\#3340](https://github.com/kokkos/kokkos/pull/3340) +- Adding identity type traits and update definition of identity\_t alias [\#3339](https://github.com/kokkos/kokkos/pull/3339) +- Add is\_specialization\_of type trait [\#3338](https://github.com/kokkos/kokkos/pull/3338) +- Make ScratchMemorySpace semi-regular [\#3309](https://github.com/kokkos/kokkos/pull/3309) +- Optimize min/max atomics with early exit on no-op case [\#3265](https://github.com/kokkos/kokkos/pull/3265) +- Refactor Backend Development [\#2941](https://github.com/kokkos/kokkos/pull/2941) **Fixed bugs:** -- Fixup MDRangePolicy construction from Kokkos arrays [\#3591](https://github.com/kokkos/kokkos/pull/#3591) -- Add atomic functions for unsigned long long using gcc built-in [\#3588](https://github.com/kokkos/kokkos/pull/#3588) -- Fixup silent pointless comparison with zero in checked\_narrow\_cast (compiler workaround) [\#3566](https://github.com/kokkos/kokkos/pull/#3566) -- Fixes for ROCm 3.9 [\#3565](https://github.com/kokkos/kokkos/pull/#3565) -- Fix windows build issues which crept in for the CUDA build [\#3532](https://github.com/kokkos/kokkos/pull/#3532) -- HIP Fix atomics of large data types and clean up lock arrays [\#3529](https://github.com/kokkos/kokkos/pull/#3529) -- Pthreads fix exception resulting from 0 grain size [\#3510](https://github.com/kokkos/kokkos/pull/#3510) -- Fixup do not require atomic operation to be default constructible [\#3503](https://github.com/kokkos/kokkos/pull/#3503) -- Fix race condition in HIP backend [\#3467](https://github.com/kokkos/kokkos/pull/#3467) -- Replace KOKKOS\_DEBUG with KOKKOS\_ENABLE\_DEBUG [\#3458](https://github.com/kokkos/kokkos/pull/#3458) -- Fix multi-stream team scratch space definition for HIP [\#3398](https://github.com/kokkos/kokkos/pull/#3398) -- HIP fix template deduction [\#3393](https://github.com/kokkos/kokkos/pull/#3393) -- Fix compiling with HIP and C++17 [\#3390](https://github.com/kokkos/kokkos/pull/#3390) -- Fix sigFPE in HIP blocksize deduction [\#3378](https://github.com/kokkos/kokkos/pull/#3378) -- Type alias change: replace CS with CTS to avoid conflicts with NVSHMEM [\#3348](https://github.com/kokkos/kokkos/pull/#3348) -- Clang compilation of CUDA backend on Windows [\#3345](https://github.com/kokkos/kokkos/pull/#3345) -- Fix HBW support [\#3343](https://github.com/kokkos/kokkos/pull/#3343) -- Added missing fences to unique token [\#3260](https://github.com/kokkos/kokkos/pull/#3260) +- Fixup MDRangePolicy construction from Kokkos arrays [\#3591](https://github.com/kokkos/kokkos/pull/3591) +- Add atomic functions for unsigned long long using gcc built-in [\#3588](https://github.com/kokkos/kokkos/pull/3588) +- Fixup silent pointless comparison with zero in checked\_narrow\_cast 
(compiler workaround) [\#3566](https://github.com/kokkos/kokkos/pull/3566) +- Fixes for ROCm 3.9 [\#3565](https://github.com/kokkos/kokkos/pull/3565) +- Fix windows build issues which crept in for the CUDA build [\#3532](https://github.com/kokkos/kokkos/pull/3532) +- HIP Fix atomics of large data types and clean up lock arrays [\#3529](https://github.com/kokkos/kokkos/pull/3529) +- Pthreads fix exception resulting from 0 grain size [\#3510](https://github.com/kokkos/kokkos/pull/3510) +- Fixup do not require atomic operation to be default constructible [\#3503](https://github.com/kokkos/kokkos/pull/3503) +- Fix race condition in HIP backend [\#3467](https://github.com/kokkos/kokkos/pull/3467) +- Replace KOKKOS\_DEBUG with KOKKOS\_ENABLE\_DEBUG [\#3458](https://github.com/kokkos/kokkos/pull/3458) +- Fix multi-stream team scratch space definition for HIP [\#3398](https://github.com/kokkos/kokkos/pull/3398) +- HIP fix template deduction [\#3393](https://github.com/kokkos/kokkos/pull/3393) +- Fix compiling with HIP and C++17 [\#3390](https://github.com/kokkos/kokkos/pull/3390) +- Fix sigFPE in HIP blocksize deduction [\#3378](https://github.com/kokkos/kokkos/pull/3378) +- Type alias change: replace CS with CTS to avoid conflicts with NVSHMEM [\#3348](https://github.com/kokkos/kokkos/pull/3348) +- Clang compilation of CUDA backend on Windows [\#3345](https://github.com/kokkos/kokkos/pull/3345) +- Fix HBW support [\#3343](https://github.com/kokkos/kokkos/pull/3343) +- Added missing fences to unique token [\#3260](https://github.com/kokkos/kokkos/pull/3260) **Incompatibilities:** -- Remove unused utilities (forward, move, and expand\_variadic) from Kokkos::Impl [\#3535](https://github.com/kokkos/kokkos/pull/#3535) -- Remove unused traits [\#3534](https://github.com/kokkos/kokkos/pull/#3534) -- HIP: Remove old HCC code [\#3301](https://github.com/kokkos/kokkos/pull/#3301) -- Prepare for deprecation of ViewAllocateWithoutInitializing [\#3264](https://github.com/kokkos/kokkos/pull/#3264) -- Remove ROCm backend [\#3148](https://github.com/kokkos/kokkos/pull/#3148) +- Remove unused utilities (forward, move, and expand\_variadic) from Kokkos::Impl [\#3535](https://github.com/kokkos/kokkos/pull/3535) +- Remove unused traits [\#3534](https://github.com/kokkos/kokkos/pull/3534) +- HIP: Remove old HCC code [\#3301](https://github.com/kokkos/kokkos/pull/3301) +- Prepare for deprecation of ViewAllocateWithoutInitializing [\#3264](https://github.com/kokkos/kokkos/pull/3264) +- Remove ROCm backend [\#3148](https://github.com/kokkos/kokkos/pull/3148) ## [3.2.01](https://github.com/kokkos/kokkos/tree/3.2.01) (2020-11-17) [Full Changelog](https://github.com/kokkos/kokkos/compare/3.2.00...3.2.01) diff --git a/CMakeLists.txt b/CMakeLists.txt index 4a4e7a55019..93a796f200b 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -150,8 +150,8 @@ ENDIF() set(Kokkos_VERSION_MAJOR 4) -set(Kokkos_VERSION_MINOR 2) -set(Kokkos_VERSION_PATCH 1) +set(Kokkos_VERSION_MINOR 3) +set(Kokkos_VERSION_PATCH 0) set(Kokkos_VERSION "${Kokkos_VERSION_MAJOR}.${Kokkos_VERSION_MINOR}.${Kokkos_VERSION_PATCH}") message(STATUS "Kokkos version: ${Kokkos_VERSION}") math(EXPR KOKKOS_VERSION "${Kokkos_VERSION_MAJOR} * 10000 + ${Kokkos_VERSION_MINOR} * 100 + ${Kokkos_VERSION_PATCH}") diff --git a/Makefile.kokkos b/Makefile.kokkos index 6e28d2c0cc6..2c74dd77bfb 100644 --- a/Makefile.kokkos +++ b/Makefile.kokkos @@ -1,8 +1,8 @@ # Default settings common options. 
KOKKOS_VERSION_MAJOR = 4 -KOKKOS_VERSION_MINOR = 2 -KOKKOS_VERSION_PATCH = 1 +KOKKOS_VERSION_MINOR = 3 +KOKKOS_VERSION_PATCH = 0 KOKKOS_VERSION = $(shell echo $(KOKKOS_VERSION_MAJOR)*10000+$(KOKKOS_VERSION_MINOR)*100+$(KOKKOS_VERSION_PATCH) | bc) # Options: Cuda,HIP,SYCL,OpenMPTarget,OpenMP,Threads,Serial @@ -12,14 +12,14 @@ KOKKOS_DEVICES ?= "Threads" # Intel: KNC,KNL,SNB,HSW,BDW,SKL,SKX,ICL,ICX,SPR # NVIDIA: Kepler,Kepler30,Kepler32,Kepler35,Kepler37,Maxwell,Maxwell50,Maxwell52,Maxwell53,Pascal60,Pascal61,Volta70,Volta72,Turing75,Ampere80,Ampere86,Ada89,Hopper90 # ARM: ARMv80,ARMv81,ARMv8-ThunderX,ARMv8-TX2,A64FX -# IBM: BGQ,Power7,Power8,Power9 -# AMD-GPUS: GFX906,GFX908,GFX90A,GFX940,GFX942,GFX1030,GFX1100 +# IBM: Power8,Power9 +# AMD-GPUS: AMD_GFX906,AMD_GFX908,AMD_GFX90A,AMD_GFX940,AMD_GFX942,AMD_GFX1030,AMD_GFX1100 # AMD-CPUS: AMDAVX,Zen,Zen2,Zen3 # Intel-GPUs: Gen9,Gen11,Gen12LP,DG1,XeHP,PVC KOKKOS_ARCH ?= "" # Options: yes,no KOKKOS_DEBUG ?= "no" -# Options: hwloc,librt,experimental_memkind +# Options: hwloc KOKKOS_USE_TPLS ?= "" # Options: c++17,c++1z,c++20,c++2a,c++23,c++2b KOKKOS_CXX_STANDARD ?= "c++17" @@ -46,7 +46,7 @@ uppercase_internal=$(if $1,$$(subst $(firstword $1),$(call uppercase_internal,$( uppercase=$(eval uppercase_RESULT:=$(call uppercase_internal,$(uppercase_TABLE),$1))$(uppercase_RESULT) # Return a 1 if a string contains a substring and 0 if not # Note the search string should be without '"' -# Example: $(call kokkos_has_string,"hwloc,librt",hwloc) +# Example: $(call kokkos_has_string,"hwloc,libdl",hwloc) # Will return a 1 kokkos_has_string=$(if $(findstring $(call uppercase,$2),$(call uppercase,$1)),1,0) # Returns 1 if the path exists, 0 otherwise @@ -63,11 +63,11 @@ KOKKOS_INTERNAL_ENABLE_CXX20 := $(call kokkos_has_string,$(KOKKOS_CXX_STANDARD), KOKKOS_INTERNAL_ENABLE_CXX2A := $(call kokkos_has_string,$(KOKKOS_CXX_STANDARD),c++2a) KOKKOS_INTERNAL_ENABLE_CXX23 := $(call kokkos_has_string,$(KOKKOS_CXX_STANDARD),c++23) KOKKOS_INTERNAL_ENABLE_CXX2B := $(call kokkos_has_string,$(KOKKOS_CXX_STANDARD),c++2b) +KOKKOS_INTERNAL_ENABLE_CXX26 := $(call kokkos_has_string,$(KOKKOS_CXX_STANDARD),c++26) +KOKKOS_INTERNAL_ENABLE_CXX2C := $(call kokkos_has_string,$(KOKKOS_CXX_STANDARD),c++2c) # Check for external libraries. KOKKOS_INTERNAL_USE_HWLOC := $(call kokkos_has_string,$(KOKKOS_USE_TPLS),hwloc) -KOKKOS_INTERNAL_USE_LIBRT := $(call kokkos_has_string,$(KOKKOS_USE_TPLS),librt) -KOKKOS_INTERNAL_USE_MEMKIND := $(call kokkos_has_string,$(KOKKOS_USE_TPLS),experimental_memkind) # Check for advanced settings. KOKKOS_INTERNAL_ENABLE_COMPILER_WARNINGS := $(call kokkos_has_string,$(KOKKOS_OPTIONS),compiler_warnings) @@ -308,7 +308,6 @@ endif # Intel based. KOKKOS_INTERNAL_USE_ARCH_KNC := $(call kokkos_has_string,$(KOKKOS_ARCH),KNC) -KOKKOS_INTERNAL_USE_ARCH_WSM := $(call kokkos_has_string,$(KOKKOS_ARCH),WSM) KOKKOS_INTERNAL_USE_ARCH_SNB := $(call kokkos_has_string,$(KOKKOS_ARCH),SNB) KOKKOS_INTERNAL_USE_ARCH_HSW := $(call kokkos_has_string,$(KOKKOS_ARCH),HSW) KOKKOS_INTERNAL_USE_ARCH_BDW := $(call kokkos_has_string,$(KOKKOS_ARCH),BDW) @@ -388,11 +387,9 @@ KOKKOS_INTERNAL_USE_ARCH_A64FX := $(call kokkos_has_string,$(KOKKOS_ARCH),A64FX) KOKKOS_INTERNAL_USE_ARCH_ARM := $(strip $(shell echo $(KOKKOS_INTERNAL_USE_ARCH_ARMV80)+$(KOKKOS_INTERNAL_USE_ARCH_ARMV81)+$(KOKKOS_INTERNAL_USE_ARCH_ARMV8_THUNDERX)+$(KOKKOS_INTERNAL_USE_ARCH_ARMV8_THUNDERX2)+$(KOKKOS_INTERNAL_USE_ARCH_A64FX) | bc)) # IBM based. 
-KOKKOS_INTERNAL_USE_ARCH_BGQ := $(call kokkos_has_string,$(KOKKOS_ARCH),BGQ) -KOKKOS_INTERNAL_USE_ARCH_POWER7 := $(call kokkos_has_string,$(KOKKOS_ARCH),Power7) KOKKOS_INTERNAL_USE_ARCH_POWER8 := $(call kokkos_has_string,$(KOKKOS_ARCH),Power8) KOKKOS_INTERNAL_USE_ARCH_POWER9 := $(call kokkos_has_string,$(KOKKOS_ARCH),Power9) -KOKKOS_INTERNAL_USE_ARCH_IBM := $(strip $(shell echo $(KOKKOS_INTERNAL_USE_ARCH_BGQ)+$(KOKKOS_INTERNAL_USE_ARCH_POWER7)+$(KOKKOS_INTERNAL_USE_ARCH_POWER8)+$(KOKKOS_INTERNAL_USE_ARCH_POWER9) | bc)) +KOKKOS_INTERNAL_USE_ARCH_IBM := $(strip $(shell echo $(KOKKOS_INTERNAL_USE_ARCH_POWER8)+$(KOKKOS_INTERNAL_USE_ARCH_POWER9) | bc)) # AMD based. KOKKOS_INTERNAL_USE_ARCH_AMDAVX := $(call kokkos_has_string,$(KOKKOS_ARCH),AMDAVX) @@ -403,22 +400,37 @@ ifeq ($(KOKKOS_INTERNAL_USE_ARCH_ZEN3), 0) KOKKOS_INTERNAL_USE_ARCH_ZEN := $(call kokkos_has_string,$(KOKKOS_ARCH),Zen) endif endif -KOKKOS_INTERNAL_USE_ARCH_AMD_GFX906 := $(or $(call kokkos_has_string,$(KOKKOS_ARCH),VEGA906),$(call kokkos_has_string,$(KOKKOS_ARCH),AMD_GFX906)) -KOKKOS_INTERNAL_USE_ARCH_AMD_GFX908 := $(or $(call kokkos_has_string,$(KOKKOS_ARCH),VEGA908),$(call kokkos_has_string,$(KOKKOS_ARCH),AMD_GFX908)) -KOKKOS_INTERNAL_USE_ARCH_AMD_GFX90A := $(or $(call kokkos_has_string,$(KOKKOS_ARCH),VEGA90A),$(call kokkos_has_string,$(KOKKOS_ARCH),AMD_GFX90A)) + +KOKKOS_INTERNAL_USE_ARCH_AMD_GFX906 := $(call kokkos_has_string,$(KOKKOS_ARCH),AMD_GFX906) +ifeq ($(KOKKOS_INTERNAL_USE_ARCH_AMD_GFX906), 0) + KOKKOS_INTERNAL_USE_ARCH_AMD_GFX906 := $(call kokkos_has_string,$(KOKKOS_ARCH),VEGA906) +endif +KOKKOS_INTERNAL_USE_ARCH_AMD_GFX908 := $(call kokkos_has_string,$(KOKKOS_ARCH),AMD_GFX908) +ifeq ($(KOKKOS_INTERNAL_USE_ARCH_AMD_GFX908), 0) + KOKKOS_INTERNAL_USE_ARCH_AMD_GFX908 := $(call kokkos_has_string,$(KOKKOS_ARCH),VEGA908) +endif +KOKKOS_INTERNAL_USE_ARCH_AMD_GFX90A := $(call kokkos_has_string,$(KOKKOS_ARCH),AMD_GFX90A) +ifeq ($(KOKKOS_INTERNAL_USE_ARCH_AMD_GFX90A), 0) + KOKKOS_INTERNAL_USE_ARCH_AMD_GFX90A := $(call kokkos_has_string,$(KOKKOS_ARCH),VEGA90A) +endif KOKKOS_INTERNAL_USE_ARCH_AMD_GFX940 := $(call kokkos_has_string,$(KOKKOS_ARCH),AMD_GFX940) KOKKOS_INTERNAL_USE_ARCH_AMD_GFX942 := $(call kokkos_has_string,$(KOKKOS_ARCH),AMD_GFX942) -KOKKOS_INTERNAL_USE_ARCH_AMD_GFX1030 := $(or $(call kokkos_has_string,$(KOKKOS_ARCH),NAVI1030),$(call kokkos_has_string,$(KOKKOS_ARCH),AMD_GFX1030)) -KOKKOS_INTERNAL_USE_ARCH_AMD_GFX1100 := $(or $(call kokkos_has_string,$(KOKKOS_ARCH),NAVI1100),$(call kokkos_has_string,$(KOKKOS_ARCH),AMD_GFX1100)) +KOKKOS_INTERNAL_USE_ARCH_AMD_GFX1030 := $(call kokkos_has_string,$(KOKKOS_ARCH),AMD_GFX1030) +ifeq ($(KOKKOS_INTERNAL_USE_ARCH_AMD_GFX1030), 0) + KOKKOS_INTERNAL_USE_ARCH_AMD_GFX1030 := $(call kokkos_has_string,$(KOKKOS_ARCH),NAVI1030) +endif +KOKKOS_INTERNAL_USE_ARCH_AMD_GFX1100 := $(call kokkos_has_string,$(KOKKOS_ARCH),AMD_GFX1100) +ifeq ($(KOKKOS_INTERNAL_USE_ARCH_AMD_GFX1100), 0) + KOKKOS_INTERNAL_USE_ARCH_AMD_GFX1100 := $(call kokkos_has_string,$(KOKKOS_ARCH),NAVI1100) +endif # Any AVX? 
-KOKKOS_INTERNAL_USE_ARCH_SSE42 := $(shell expr $(KOKKOS_INTERNAL_USE_ARCH_WSM)) KOKKOS_INTERNAL_USE_ARCH_AVX := $(shell expr $(KOKKOS_INTERNAL_USE_ARCH_SNB) + $(KOKKOS_INTERNAL_USE_ARCH_AMDAVX)) KOKKOS_INTERNAL_USE_ARCH_AVX2 := $(shell expr $(KOKKOS_INTERNAL_USE_ARCH_HSW) + $(KOKKOS_INTERNAL_USE_ARCH_BDW) + $(KOKKOS_INTERNAL_USE_ARCH_ZEN) + $(KOKKOS_INTERNAL_USE_ARCH_ZEN2) + $(KOKKOS_INTERNAL_USE_ARCH_ZEN3)) KOKKOS_INTERNAL_USE_ARCH_AVX512MIC := $(shell expr $(KOKKOS_INTERNAL_USE_ARCH_KNL)) # Incompatible flags? -KOKKOS_INTERNAL_USE_ARCH_MULTIHOST := $(strip $(shell echo "$(KOKKOS_INTERNAL_USE_ARCH_SSE42)+$(KOKKOS_INTERNAL_USE_ARCH_AVX)+$(KOKKOS_INTERNAL_USE_ARCH_AVX2)+$(KOKKOS_INTERNAL_USE_ARCH_AVX512MIC)+$(KOKKOS_INTERNAL_USE_ARCH_SKL)+$(KOKKOS_INTERNAL_USE_ARCH_SKX)+$(KOKKOS_INTERNAL_USE_ARCH_ICL)+$(KOKKOS_INTERNAL_USE_ARCH_ICX)+$(KOKKOS_INTERNAL_USE_ARCH_SPR)+$(KOKKOS_INTERNAL_USE_ARCH_KNC)+$(KOKKOS_INTERNAL_USE_ARCH_IBM)+$(KOKKOS_INTERNAL_USE_ARCH_ARM)>1") | bc) +KOKKOS_INTERNAL_USE_ARCH_MULTIHOST := $(strip $(shell echo "$(KOKKOS_INTERNAL_USE_ARCH_AVX)+$(KOKKOS_INTERNAL_USE_ARCH_AVX2)+$(KOKKOS_INTERNAL_USE_ARCH_AVX512MIC)+$(KOKKOS_INTERNAL_USE_ARCH_SKL)+$(KOKKOS_INTERNAL_USE_ARCH_SKX)+$(KOKKOS_INTERNAL_USE_ARCH_ICL)+$(KOKKOS_INTERNAL_USE_ARCH_ICX)+$(KOKKOS_INTERNAL_USE_ARCH_SPR)+$(KOKKOS_INTERNAL_USE_ARCH_KNC)+$(KOKKOS_INTERNAL_USE_ARCH_IBM)+$(KOKKOS_INTERNAL_USE_ARCH_ARM)>1") | bc) KOKKOS_INTERNAL_USE_ARCH_MULTIGPU := $(strip $(shell echo "$(KOKKOS_INTERNAL_USE_ARCH_NVIDIA)>1") | bc) ifeq ($(KOKKOS_INTERNAL_USE_ARCH_MULTIHOST), 1) @@ -563,6 +575,16 @@ ifeq ($(KOKKOS_INTERNAL_ENABLE_CXX2B), 1) KOKKOS_CXXFLAGS += $(KOKKOS_INTERNAL_CXX2B_FLAG) tmp := $(call kokkos_append_header,"$H""define KOKKOS_ENABLE_CXX23") endif +ifeq ($(KOKKOS_INTERNAL_ENABLE_CXX26), 1) + #I cannot make CMake add this in a good way - so add it here + KOKKOS_CXXFLAGS += $(KOKKOS_INTERNAL_CXX26_FLAG) + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ENABLE_CXX26") +endif +ifeq ($(KOKKOS_INTERNAL_ENABLE_CXX2C), 1) + #I cannot make CMake add this in a good way - so add it here + KOKKOS_CXXFLAGS += $(KOKKOS_INTERNAL_CXX2C_FLAG) + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ENABLE_CXX26") +endif ifeq ($(KOKKOS_INTERNAL_ENABLE_DEBUG), 1) ifeq ($(KOKKOS_INTERNAL_COMPILER_NVCC), 1) @@ -602,27 +624,6 @@ ifeq ($(KOKKOS_INTERNAL_USE_HWLOC), 1) tmp := $(call kokkos_append_header,"$H""define KOKKOS_ENABLE_HWLOC") endif -ifeq ($(KOKKOS_INTERNAL_USE_LIBRT), 1) - tmp := $(call kokkos_append_header,"$H""define KOKKOS_USE_LIBRT") - KOKKOS_LIBS += -lrt - KOKKOS_TPL_LIBRARY_NAMES += rt -endif - -ifeq ($(KOKKOS_INTERNAL_USE_MEMKIND), 1) - ifneq ($(KOKKOS_CMAKE), yes) - ifneq ($(MEMKIND_PATH),) - KOKKOS_CPPFLAGS += -I$(MEMKIND_PATH)/include - KOKKOS_LIBDIRS += -L$(MEMKIND_PATH)/lib - KOKKOS_CXXLDFLAGS += -L$(MEMKIND_PATH)/lib - KOKKOS_TPL_INCLUDE_DIRS += $(MEMKIND_PATH)/include - KOKKOS_TPL_LIBRARY_DIRS += $(MEMKIND_PATH)/lib - endif - KOKKOS_LIBS += -lmemkind -lnuma - KOKKOS_TPL_LIBRARY_NAMES += memkind numa - endif - tmp := $(call kokkos_append_header,"$H""define KOKKOS_ENABLE_HBWSPACE") -endif - ifeq ($(KOKKOS_INTERNAL_ENABLE_LARGE_MEM_TESTS), 1) tmp := $(call kokkos_append_header,"$H""define KOKKOS_ENABLE_LARGE_MEM_TESTS") endif @@ -689,10 +690,6 @@ ifeq ($(KOKKOS_INTERNAL_USE_CUDA), 1) endif endif - ifeq ($(KOKKOS_INTERNAL_COMPILER_CLANG), 1) - tmp := $(call kokkos_append_header,"$H""define KOKKOS_IMPL_CUDA_CLANG_WORKAROUND") - endif - ifeq ($(KOKKOS_INTERNAL_CUDA_DISABLE_MALLOC_ASYNC), 0) tmp := $(call 
kokkos_append_header,"$H""define KOKKOS_ENABLE_IMPL_CUDA_MALLOC_ASYNC") else @@ -817,20 +814,6 @@ ifeq ($(KOKKOS_INTERNAL_USE_ARCH_ARMV8_THUNDERX2), 1) endif endif -ifeq ($(KOKKOS_INTERNAL_USE_ARCH_SSE42), 1) - tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_SSE42") - - ifeq ($(KOKKOS_INTERNAL_COMPILER_INTEL), 1) - KOKKOS_CXXFLAGS += -xSSE4.2 - KOKKOS_LDFLAGS += -xSSE4.2 - else ifeq ($(KOKKOS_INTERNAL_COMPILER_CRAY), 1) - else - # Assume that this is a really a GNU compiler. - KOKKOS_CXXFLAGS += -msse4.2 - KOKKOS_LDFLAGS += -msse4.2 - endif -endif - ifeq ($(KOKKOS_INTERNAL_USE_ARCH_AVX), 1) tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_AVX") @@ -1239,7 +1222,6 @@ ifneq ($(KOKKOS_INTERNAL_NEW_CONFIG), 0) tmp := $(call kokkos_update_config_header, KOKKOS_FWD_HPP_, "KokkosCore_Config_FwdBackend.tmp", "KokkosCore_Config_FwdBackend.hpp") tmp := $(call kokkos_update_config_header, KOKKOS_SETUP_HPP_, "KokkosCore_Config_SetupBackend.tmp", "KokkosCore_Config_SetupBackend.hpp") tmp := $(call kokkos_update_config_header, KOKKOS_DECLARE_HPP_, "KokkosCore_Config_DeclareBackend.tmp", "KokkosCore_Config_DeclareBackend.hpp") - tmp := $(call kokkos_update_config_header, KOKKOS_POST_INCLUDE_HPP_, "KokkosCore_Config_PostInclude.tmp", "KokkosCore_Config_PostInclude.hpp") ifeq ($(KOKKOS_INTERNAL_USE_CUDA), 1) tmp := $(call kokkos_append_config_header,"$H""include ","KokkosCore_Config_FwdBackend.hpp") tmp := $(call kokkos_append_config_header,"$H""include ","KokkosCore_Config_DeclareBackend.hpp") @@ -1279,10 +1261,6 @@ ifneq ($(KOKKOS_INTERNAL_NEW_CONFIG), 0) tmp := $(call kokkos_append_config_header,"$H""include ","KokkosCore_Config_FwdBackend.hpp") tmp := $(call kokkos_append_config_header,"$H""include ","KokkosCore_Config_DeclareBackend.hpp") endif - ifeq ($(KOKKOS_INTERNAL_USE_MEMKIND), 1) - tmp := $(call kokkos_append_config_header,"$H""include ","KokkosCore_Config_FwdBackend.hpp") - tmp := $(call kokkos_append_config_header,"$H""include ","KokkosCore_Config_DeclareBackend.hpp") - endif endif KOKKOS_HEADERS += $(wildcard $(KOKKOS_PATH)/core/src/*.hpp) @@ -1393,11 +1371,6 @@ ifeq ($(KOKKOS_INTERNAL_USE_HPX), 1) KOKKOS_TPL_LIBRARY_NAMES += hpx endif -# Don't include Kokkos_HBWSpace.cpp if not using MEMKIND to avoid a link warning. -ifneq ($(KOKKOS_INTERNAL_USE_MEMKIND), 1) - KOKKOS_SRC := $(filter-out $(KOKKOS_PATH)/core/src/impl/Kokkos_HBWSpace.cpp,$(KOKKOS_SRC)) -endif - # With Cygwin functions such as fdopen and fileno are not defined # when strict ansi is enabled. strict ansi gets enabled with -std=c++14 # though. So we hard undefine it here. 
Not sure if that has any bad side effects @@ -1451,6 +1424,12 @@ ifeq ($(KOKKOS_INTERNAL_USE_OPENMPTARGET), 1) else tmp := $(call desul_append_header,"/* $H""undef DESUL_ATOMICS_ENABLE_OPENMP */") endif + +ifeq ($(KOKKOS_INTERNAL_USE_OPENACC), 1) + tmp := $(call desul_append_header,"$H""define DESUL_ATOMICS_ENABLE_OPENACC") +else + tmp := $(call desul_append_header,"/* $H""undef DESUL_ATOMICS_ENABLE_OPENACC */") +endif tmp := $(call desul_append_header, "") tmp := $(call desul_append_header, "$H""endif") @@ -1483,7 +1462,7 @@ include $(KOKKOS_PATH)/Makefile.targets kokkos-clean: rm -f $(KOKKOS_OBJ_LINK) $(DESUL_CONFIG_HEADER) $(DESUL_INTERNAL_CONFIG_TMP) KokkosCore_config.h KokkosCore_config.tmp libkokkos.a KokkosCore_Config_SetupBackend.hpp \ KokkosCore_Config_FwdBackend.hpp KokkosCore_Config_DeclareBackend.hpp KokkosCore_Config_DeclareBackend.tmp \ - KokkosCore_Config_FwdBackend.tmp KokkosCore_Config_PostInclude.hpp KokkosCore_Config_PostInclude.tmp KokkosCore_Config_SetupBackend.tmp + KokkosCore_Config_FwdBackend.tmp KokkosCore_Config_SetupBackend.tmp libkokkos.a: $(KOKKOS_OBJ_LINK) $(KOKKOS_SRC) $(KOKKOS_HEADERS) ar cr libkokkos.a $(KOKKOS_OBJ_LINK) diff --git a/Makefile.targets b/Makefile.targets index ec8770dd7de..e6900a822a8 100644 --- a/Makefile.targets +++ b/Makefile.targets @@ -20,8 +20,6 @@ Kokkos_TaskQueue.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_Ta $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/impl/Kokkos_TaskQueue.cpp Kokkos_HostThreadTeam.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_HostThreadTeam.cpp $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/impl/Kokkos_HostThreadTeam.cpp -Kokkos_Spinwait.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_Spinwait.cpp - $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/impl/Kokkos_Spinwait.cpp Kokkos_HostBarrier.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_HostBarrier.cpp $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/impl/Kokkos_HostBarrier.cpp Kokkos_Profiling.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_Profiling.cpp @@ -30,8 +28,6 @@ Kokkos_SharedAlloc.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_ $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/impl/Kokkos_SharedAlloc.cpp Kokkos_MemoryPool.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_MemoryPool.cpp $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/impl/Kokkos_MemoryPool.cpp -Kokkos_MemorySpace.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_MemorySpace.cpp - $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/impl/Kokkos_MemorySpace.cpp Kokkos_HostSpace_deepcopy.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_HostSpace_deepcopy.cpp $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/impl/Kokkos_HostSpace_deepcopy.cpp Kokkos_NumericTraits.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_NumericTraits.cpp @@ -82,8 +78,10 @@ Lock_Array_HIP.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/tpls/desul/src/Lock_Array endif ifeq ($(KOKKOS_INTERNAL_USE_THREADS), 1) -Kokkos_ThreadsExec.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/Threads/Kokkos_ThreadsExec.cpp - $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/Threads/Kokkos_ThreadsExec.cpp 
+Kokkos_Threads_Instance.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/Threads/Kokkos_Threads_Instance.cpp + $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/Threads/Kokkos_Threads_Instance.cpp +Kokkos_Threads_Spinwait.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/Threads/Kokkos_Threads_Spinwait.cpp + $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/Threads/Kokkos_Threads_Spinwait.cpp endif ifeq ($(KOKKOS_INTERNAL_USE_OPENMP), 1) @@ -123,6 +121,3 @@ Kokkos_OpenACC_Instance.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/OpenACC Kokkos_OpenACC_SharedAllocationRecord.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/OpenACC/Kokkos_OpenACC_SharedAllocationRecord.cpp $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/OpenACC/Kokkos_OpenACC_SharedAllocationRecord.cpp endif - -Kokkos_HBWSpace.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_HBWSpace.cpp - $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/impl/Kokkos_HBWSpace.cpp diff --git a/README.md b/README.md index 033346e956e..19793bb82d9 100644 --- a/README.md +++ b/README.md @@ -28,7 +28,7 @@ To start learning about Kokkos: - [Use cases and Examples](https://kokkos.github.io/kokkos-core-wiki/usecases.html): a series of examples ranging from how to use Kokkos with MPI to Fortran interoperability. -For questions find us on Slack: https://kokkosteam.slack.com or open a github issue. +For questions find us on Slack: https://kokkosteam.slack.com or open a GitHub issue. For non-public questions send an email to: *crtrott(at)sandia.gov* @@ -48,10 +48,10 @@ Please see the [following page](https://kokkos.github.io/kokkos-core-wiki/citati # License -[![License](https://img.shields.io/badge/License-BSD%203--Clause-blue.svg)](https://opensource.org/licenses/BSD-3-Clause) +[![License](https://img.shields.io/badge/License-Apache--2.0_WITH_LLVM--exception-blue)](https://spdx.org/licenses/LLVM-exception.html) Under the terms of Contract DE-NA0003525 with NTESS, the U.S. Government retains certain rights in this software. -The full license statement used in all headers is available [here](https://kokkos.github.io/kokkos-core-wiki/license.html) or -[here](https://github.com/kokkos/kokkos/blob/master/LICENSE). +The full license statement used in all headers is available [here](https://kokkos.org/kokkos-core-wiki/license.html) or +[here](https://github.com/kokkos/kokkos/blob/develop/LICENSE). diff --git a/SECURITY.md b/SECURITY.md new file mode 100644 index 00000000000..93cf6e3663e --- /dev/null +++ b/SECURITY.md @@ -0,0 +1,12 @@ +# Reporting Security Issues + +To report a security issue, please email +[lebrungrandt@ornl.gov](mailto:lebrungrandt@ornl.gov) +and [crtrott@sandia.gov](mailto:crtrott@sandia.gov) +with a description of the issue, the steps you took to create the issue, +affected versions, and, if known, mitigations for the issue. + +Our vulnerability management team will respond within 5 working days of your +email. If the issue is confirmed as a vulnerability, we will open a +Security Advisory and acknowledge your contributions as part of it. This project +follows a 90-day disclosure timeline. diff --git a/Spack.md b/Spack.md index 79606c259d5..06c763a64ee 100644 --- a/Spack.md +++ b/Spack.md @@ -159,7 +159,6 @@ If you don't specify a CUDA build variant in a `packages.yaml` and you build you > spack install superscience ```` you may end up just getting the default Kokkos (i.e. Serial). 
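If the bare `spack install` above picks Serial, a hedged workaround (reusing the hypothetical `superscience` package from this example; `+cuda` and `cuda_arch` are variants of the Spack `kokkos` package) is to pin the Kokkos dependency explicitly and inspect the concretized tree first:

````bash
# Resolve superscience against a CUDA-enabled Kokkos (Volta here; adjust
# cuda_arch to your GPU) and review the tree before installing.
spack spec superscience ^kokkos +cuda cuda_arch=70
spack install superscience ^kokkos +cuda cuda_arch=70
````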
-Some examples are included in the `config/yaml` folder for common platforms.
 
 Before running `spack install ` we recommend running `spack spec ` to confirm your dependency tree is correct. For example, with Kokkos Kernels:
 ````bash
diff --git a/algorithms/src/CMakeLists.txt b/algorithms/src/CMakeLists.txt
index 16957789472..b490caca628 100644
--- a/algorithms/src/CMakeLists.txt
+++ b/algorithms/src/CMakeLists.txt
@@ -30,5 +30,5 @@ KOKKOS_LIB_INCLUDE_DIRECTORIES(kokkosalgorithms
   ${CMAKE_CURRENT_SOURCE_DIR}
 )
 
-
-
+KOKKOS_LINK_TPL(kokkoscontainers PUBLIC ROCTHRUST)
+KOKKOS_LINK_TPL(kokkoscore PUBLIC ONEDPL)
diff --git a/algorithms/src/Kokkos_Random.hpp b/algorithms/src/Kokkos_Random.hpp
index 2d7d236d2fc..7df12b8518e 100644
--- a/algorithms/src/Kokkos_Random.hpp
+++ b/algorithms/src/Kokkos_Random.hpp
@@ -849,18 +849,17 @@ class Random_XorShift64 {
     return drand(end - start) + start;
  }
 
-  // Marsaglia polar method for drawing a standard normal distributed random
+  // Box-Muller method for drawing a standard normal distributed random
   // number
   KOKKOS_INLINE_FUNCTION
   double normal() {
-    double S = 2.0;
-    double U;
-    while (S >= 1.0) {
-      U = 2.0 * drand() - 1.0;
-      const double V = 2.0 * drand() - 1.0;
-      S = U * U + V * V;
-    }
-    return U * std::sqrt(-2.0 * std::log(S) / S);
+    constexpr auto two_pi = 2 * Kokkos::numbers::pi_v<double>;
+
+    const double u = drand();
+    const double v = drand();
+    const double r = Kokkos::sqrt(-2.0 * Kokkos::log(u));
+    const double theta = v * two_pi;
+    return r * Kokkos::cos(theta);
   }
 
   KOKKOS_INLINE_FUNCTION
@@ -1094,18 +1093,17 @@ class Random_XorShift1024 {
     return drand(end - start) + start;
   }
 
-  // Marsaglia polar method for drawing a standard normal distributed random
+  // Box-Muller method for drawing a standard normal distributed random
   // number
   KOKKOS_INLINE_FUNCTION
   double normal() {
-    double S = 2.0;
-    double U;
-    while (S >= 1.0) {
-      U = 2.0 * drand() - 1.0;
-      const double V = 2.0 * drand() - 1.0;
-      S = U * U + V * V;
-    }
-    return U * std::sqrt(-2.0 * std::log(S) / S);
+    constexpr auto two_pi = 2 * Kokkos::numbers::pi_v<double>;
+
+    const double u = drand();
+    const double v = drand();
+    const double r = Kokkos::sqrt(-2.0 * Kokkos::log(u));
+    const double theta = v * two_pi;
+    return r * Kokkos::cos(theta);
   }
 
   KOKKOS_INLINE_FUNCTION
@@ -1545,13 +1543,23 @@ template
 void fill_random(ViewType a, RandomPool g,
                  typename ViewType::const_value_type begin,
                  typename ViewType::const_value_type end) {
-  fill_random(typename ViewType::execution_space{}, a, g, begin, end);
+  Kokkos::fence(
+      "fill_random: fence before since no execution space instance provided");
+  typename ViewType::execution_space exec;
+  fill_random(exec, a, g, begin, end);
+  exec.fence(
+      "fill_random: fence after since no execution space instance provided");
 }
 
 template <class ViewType, class RandomPool>
 void fill_random(ViewType a, RandomPool g,
                  typename ViewType::const_value_type range) {
-  fill_random(typename ViewType::execution_space{}, a, g, 0, range);
+  Kokkos::fence(
+      "fill_random: fence before since no execution space instance provided");
+  typename ViewType::execution_space exec;
+  fill_random(exec, a, g, 0, range);
+  exec.fence(
+      "fill_random: fence after since no execution space instance provided");
 }
 
 } // namespace Kokkos
diff --git a/algorithms/src/Kokkos_Sort.hpp b/algorithms/src/Kokkos_Sort.hpp
index f77484cc555..136b4ec82dc 100644
--- a/algorithms/src/Kokkos_Sort.hpp
+++ b/algorithms/src/Kokkos_Sort.hpp
@@ -23,6 +23,7 @@
 
 #include "sorting/Kokkos_BinSortPublicAPI.hpp"
 #include "sorting/Kokkos_SortPublicAPI.hpp"
+#include
"sorting/Kokkos_SortByKeyPublicAPI.hpp" #include "sorting/Kokkos_NestedSortPublicAPI.hpp" #ifdef KOKKOS_IMPL_PUBLIC_INCLUDE_NOTDEFINED_SORT diff --git a/algorithms/src/Kokkos_StdAlgorithms.hpp b/algorithms/src/Kokkos_StdAlgorithms.hpp index 436ae0d10bf..b532a774e13 100644 --- a/algorithms/src/Kokkos_StdAlgorithms.hpp +++ b/algorithms/src/Kokkos_StdAlgorithms.hpp @@ -35,7 +35,6 @@ // following the std classification. // modifying ops -#include "std_algorithms/Kokkos_Swap.hpp" #include "std_algorithms/Kokkos_IterSwap.hpp" // non-modifying sequence diff --git a/algorithms/src/sorting/Kokkos_SortByKeyPublicAPI.hpp b/algorithms/src/sorting/Kokkos_SortByKeyPublicAPI.hpp new file mode 100644 index 00000000000..fc73eccad68 --- /dev/null +++ b/algorithms/src/sorting/Kokkos_SortByKeyPublicAPI.hpp @@ -0,0 +1,117 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOS_SORT_BY_KEY_PUBLIC_API_HPP_ +#define KOKKOS_SORT_BY_KEY_PUBLIC_API_HPP_ + +#include "./impl/Kokkos_SortByKeyImpl.hpp" +#include +#include + +namespace Kokkos::Experimental { + +// --------------------------------------------------------------- +// basic overloads +// --------------------------------------------------------------- + +template +void sort_by_key( + const ExecutionSpace& exec, + const Kokkos::View& keys, + const Kokkos::View& values) { + // constraints + using KeysType = Kokkos::View; + using ValuesType = Kokkos::View; + ::Kokkos::Impl::static_assert_is_admissible_to_kokkos_sort_by_key(keys); + ::Kokkos::Impl::static_assert_is_admissible_to_kokkos_sort_by_key(values); + + static_assert(SpaceAccessibility::accessible, + "Kokkos::sort: execution space instance is not able to access " + "the memory space of the keys View argument!"); + static_assert( + SpaceAccessibility::accessible, + "Kokkos::sort: execution space instance is not able to access " + "the memory space of the values View argument!"); + + static_assert(KeysType::static_extent(0) == 0 || + ValuesType::static_extent(0) == 0 || + KeysType::static_extent(0) == ValuesType::static_extent(0)); + if (values.size() != keys.size()) + Kokkos::abort((std::string("values and keys extents must be the same. 
The " + "values extent is ") + + std::to_string(values.size()) + ", and the keys extent is " + + std::to_string(keys.size()) + ".") + .c_str()); + + if (keys.extent(0) <= 1) { + return; + } + + ::Kokkos::Impl::sort_by_key_device_view_without_comparator(exec, keys, + values); +} + +// --------------------------------------------------------------- +// overloads supporting a custom comparator +// --------------------------------------------------------------- + +template +void sort_by_key( + const ExecutionSpace& exec, + const Kokkos::View& keys, + const Kokkos::View& values, + const ComparatorType& comparator) { + // constraints + using KeysType = Kokkos::View; + using ValuesType = Kokkos::View; + ::Kokkos::Impl::static_assert_is_admissible_to_kokkos_sort_by_key(keys); + ::Kokkos::Impl::static_assert_is_admissible_to_kokkos_sort_by_key(values); + + static_assert(SpaceAccessibility::accessible, + "Kokkos::sort: execution space instance is not able to access " + "the memory space of the keys View argument!"); + static_assert( + SpaceAccessibility::accessible, + "Kokkos::sort: execution space instance is not able to access " + "the memory space of the values View argument!"); + + static_assert(KeysType::static_extent(0) == 0 || + ValuesType::static_extent(0) == 0 || + KeysType::static_extent(0) == ValuesType::static_extent(0)); + if (values.size() != keys.size()) + Kokkos::abort((std::string("values and keys extents must be the same. The " + "values extent is ") + + std::to_string(values.size()) + ", and the keys extent is " + + std::to_string(keys.size()) + ".") + .c_str()); + + if (keys.extent(0) <= 1) { + return; + } + + ::Kokkos::Impl::sort_by_key_device_view_with_comparator(exec, keys, values, + comparator); +} + +} // namespace Kokkos::Experimental +#endif diff --git a/algorithms/src/sorting/Kokkos_SortPublicAPI.hpp b/algorithms/src/sorting/Kokkos_SortPublicAPI.hpp index a763c41e580..308e9e3a008 100644 --- a/algorithms/src/sorting/Kokkos_SortPublicAPI.hpp +++ b/algorithms/src/sorting/Kokkos_SortPublicAPI.hpp @@ -29,7 +29,7 @@ namespace Kokkos { // --------------------------------------------------------------- template -void sort([[maybe_unused]] const ExecutionSpace& exec, +void sort(const ExecutionSpace& exec, const Kokkos::View& view) { // constraints using ViewType = Kokkos::View; @@ -52,6 +52,7 @@ void sort([[maybe_unused]] const ExecutionSpace& exec, } if constexpr (Impl::better_off_calling_std_sort_v) { + exec.fence("Kokkos::sort without comparator use std::sort"); auto first = ::Kokkos::Experimental::begin(view); auto last = ::Kokkos::Experimental::end(view); std::sort(first, last); @@ -82,7 +83,7 @@ void sort(const Kokkos::View& view) { // --------------------------------------------------------------- template -void sort([[maybe_unused]] const ExecutionSpace& exec, +void sort(const ExecutionSpace& exec, const Kokkos::View& view, const ComparatorType& comparator) { // constraints @@ -105,6 +106,7 @@ void sort([[maybe_unused]] const ExecutionSpace& exec, } if constexpr (Impl::better_off_calling_std_sort_v) { + exec.fence("Kokkos::sort with comparator use std::sort"); auto first = ::Kokkos::Experimental::begin(view); auto last = ::Kokkos::Experimental::end(view); std::sort(first, last, comparator); diff --git a/algorithms/src/sorting/impl/Kokkos_NestedSortImpl.hpp b/algorithms/src/sorting/impl/Kokkos_NestedSortImpl.hpp index 50ac8233195..2fe58272d92 100644 --- a/algorithms/src/sorting/impl/Kokkos_NestedSortImpl.hpp +++ b/algorithms/src/sorting/impl/Kokkos_NestedSortImpl.hpp @@ 
-18,7 +18,6 @@ #define KOKKOS_NESTED_SORT_IMPL_HPP_ #include -#include namespace Kokkos { namespace Experimental { @@ -99,7 +98,7 @@ KOKKOS_INLINE_FUNCTION void sort_nested_impl( keyView(elem1) = key2; keyView(elem2) = key1; if constexpr (!std::is_same_v) { - Kokkos::Experimental::swap(valueView(elem1), valueView(elem2)); + Kokkos::kokkos_swap(valueView(elem1), valueView(elem2)); } } } diff --git a/algorithms/src/sorting/impl/Kokkos_SortByKeyImpl.hpp b/algorithms/src/sorting/impl/Kokkos_SortByKeyImpl.hpp new file mode 100644 index 00000000000..36deccdfb1e --- /dev/null +++ b/algorithms/src/sorting/impl/Kokkos_SortByKeyImpl.hpp @@ -0,0 +1,401 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOS_SORT_BY_KEY_FREE_FUNCS_IMPL_HPP_ +#define KOKKOS_SORT_BY_KEY_FREE_FUNCS_IMPL_HPP_ + +#include + +#if defined(KOKKOS_ENABLE_CUDA) + +// Workaround for `Instruction 'shfl' without '.sync' is not supported on +// .target sm_70 and higher from PTX ISA version 6.4`. +// Also see https://github.com/NVIDIA/cub/pull/170. +#if !defined(CUB_USE_COOPERATIVE_GROUPS) +#define CUB_USE_COOPERATIVE_GROUPS +#endif + +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wshadow" + +#if defined(KOKKOS_COMPILER_CLANG) +// Some versions of Clang fail to compile Thrust, failing with errors like +// this: +// /thrust/system/cuda/detail/core/agent_launcher.h:557:11: +// error: use of undeclared identifier 'va_printf' +// The exact combination of versions for Clang and Thrust (or CUDA) for this +// failure was not investigated, however even very recent version combination +// (Clang 10.0.0 and Cuda 10.0) demonstrated failure. +// +// Defining _CubLog here locally allows us to avoid that code path, however +// disabling some debugging diagnostics +#pragma push_macro("_CubLog") +#ifdef _CubLog +#undef _CubLog +#endif +#define _CubLog +#include +#include +#pragma pop_macro("_CubLog") +#else +#include +#include +#endif + +#pragma GCC diagnostic pop + +#endif + +#if defined(KOKKOS_ENABLE_ROCTHRUST) +#include +#include +#endif + +#if defined(KOKKOS_ENABLE_ONEDPL) && \ + (ONEDPL_VERSION_MAJOR > 2022 || \ + (ONEDPL_VERSION_MAJOR == 2022 && ONEDPL_VERSION_MINOR >= 2)) +#define KOKKOS_ONEDPL_HAS_SORT_BY_KEY +#include +#include +#endif + +namespace Kokkos::Impl { + +template +constexpr inline bool is_admissible_to_kokkos_sort_by_key = + ::Kokkos::is_view::value&& T::rank() == 1 && + (std::is_same::value || + std::is_same::value || + std::is_same::value); + +template +KOKKOS_INLINE_FUNCTION constexpr void +static_assert_is_admissible_to_kokkos_sort_by_key(const ViewType& /* view */) { + static_assert(is_admissible_to_kokkos_sort_by_key, + "Kokkos::sort_by_key only accepts 1D values View with " + "LayoutRight, LayoutLeft or LayoutStride."); +} + +// For the fallback implementation for sort_by_key using Kokkos::sort, we need +// to consider if Kokkos::sort defers to the fallback implementation that copies +// the array to the host and uses std::sort, see +// copy_to_host_run_stdsort_copy_back() in impl/Kokkos_SortImpl.hpp. 
If +// sort_on_device_v is true, we assume that std::sort doesn't copy data. +// Otherwise, we manually copy all data to the host and provide Kokkos::sort +// with a host execution space. +template +inline constexpr bool sort_on_device_v = false; + +#if defined(KOKKOS_ENABLE_CUDA) +template +inline constexpr bool sort_on_device_v = true; + +template +void sort_by_key_cudathrust( + const Kokkos::Cuda& exec, + const Kokkos::View& keys, + const Kokkos::View& values, + MaybeComparator&&... maybeComparator) { + const auto policy = thrust::cuda::par.on(exec.cuda_stream()); + auto keys_first = ::Kokkos::Experimental::begin(keys); + auto keys_last = ::Kokkos::Experimental::end(keys); + auto values_first = ::Kokkos::Experimental::begin(values); + thrust::sort_by_key(policy, keys_first, keys_last, values_first, + std::forward(maybeComparator)...); +} +#endif + +#if defined(KOKKOS_ENABLE_ROCTHRUST) +template +inline constexpr bool sort_on_device_v = true; + +template +void sort_by_key_rocthrust( + const Kokkos::HIP& exec, + const Kokkos::View& keys, + const Kokkos::View& values, + MaybeComparator&&... maybeComparator) { + const auto policy = thrust::hip::par.on(exec.hip_stream()); + auto keys_first = ::Kokkos::Experimental::begin(keys); + auto keys_last = ::Kokkos::Experimental::end(keys); + auto values_first = ::Kokkos::Experimental::begin(values); + thrust::sort_by_key(policy, keys_first, keys_last, values_first, + std::forward(maybeComparator)...); +} +#endif + +#if defined(KOKKOS_ENABLE_ONEDPL) +template +inline constexpr bool sort_on_device_v = + std::is_same_v || + std::is_same_v; + +#ifdef KOKKOS_ONEDPL_HAS_SORT_BY_KEY +template +void sort_by_key_onedpl( + const Kokkos::Experimental::SYCL& exec, + const Kokkos::View& keys, + const Kokkos::View& values, + MaybeComparator&&... maybeComparator) { + if (keys.stride(0) != 1 && values.stride(0) != 1) { + Kokkos::abort( + "SYCL sort_by_key only supports rank-1 Views with stride(0) = 1."); + } + + // Can't use Experimental::begin/end here since the oneDPL then assumes that + // the data is on the host. + auto queue = exec.sycl_queue(); + auto policy = oneapi::dpl::execution::make_device_policy(queue); + const int n = keys.extent(0); + oneapi::dpl::sort_by_key(policy, keys.data(), keys.data() + n, values.data(), + std::forward(maybeComparator)...); +} +#endif +#endif + +template +void applyPermutation(const ExecutionSpace& space, + const PermutationView& permutation, + const ViewType& view) { + static_assert(std::is_integral::value); + + auto view_copy = Kokkos::create_mirror( + Kokkos::view_alloc(space, typename ExecutionSpace::memory_space{}, + Kokkos::WithoutInitializing), + view); + Kokkos::deep_copy(space, view_copy, view); + Kokkos::parallel_for( + "Kokkos::sort_by_key_via_sort::permute_" + view.label(), + Kokkos::RangePolicy(space, 0, view.extent(0)), + KOKKOS_LAMBDA(int i) { view(i) = view_copy(permutation(i)); }); +} + +template +void sort_by_key_via_sort( + const ExecutionSpace& exec, + const Kokkos::View& keys, + const Kokkos::View& values, + MaybeComparator&&... 
maybeComparator) { + static_assert(sizeof...(MaybeComparator) <= 1); + + auto const n = keys.size(); + + Kokkos::View permute( + Kokkos::view_alloc(exec, Kokkos::WithoutInitializing, + "Kokkos::sort_by_key_via_sort::permute"), + n); + + // iota + Kokkos::parallel_for( + "Kokkos::sort_by_key_via_sort::iota", + Kokkos::RangePolicy(exec, 0, n), + KOKKOS_LAMBDA(int i) { permute(i) = i; }); + + using Layout = + typename Kokkos::View::array_layout; + if constexpr (!sort_on_device_v) { + auto host_keys = Kokkos::create_mirror_view( + Kokkos::view_alloc(Kokkos::HostSpace{}, Kokkos::WithoutInitializing), + keys); + auto host_permute = Kokkos::create_mirror_view( + Kokkos::view_alloc(Kokkos::HostSpace{}, Kokkos::WithoutInitializing), + permute); + Kokkos::deep_copy(exec, host_keys, keys); + Kokkos::deep_copy(exec, host_permute, permute); + + exec.fence("Kokkos::Impl::sort_by_key_via_sort: before host sort"); + Kokkos::DefaultHostExecutionSpace host_exec; + + if constexpr (sizeof...(MaybeComparator) == 0) { + Kokkos::sort( + host_exec, host_permute, + KOKKOS_LAMBDA(int i, int j) { return host_keys(i) < host_keys(j); }); + } else { + auto keys_comparator = + std::get<0>(std::tuple(maybeComparator...)); + Kokkos::sort( + host_exec, host_permute, KOKKOS_LAMBDA(int i, int j) { + return keys_comparator(host_keys(i), host_keys(j)); + }); + } + host_exec.fence("Kokkos::Impl::sort_by_key_via_sort: after host sort"); + Kokkos::deep_copy(exec, permute, host_permute); + } else { +#ifdef KOKKOS_ENABLE_SYCL + auto* raw_keys_in_comparator = keys.data(); + auto stride = keys.stride(0); + if constexpr (sizeof...(MaybeComparator) == 0) { + Kokkos::sort( + exec, permute, KOKKOS_LAMBDA(int i, int j) { + return raw_keys_in_comparator[i * stride] < + raw_keys_in_comparator[j * stride]; + }); + } else { + auto keys_comparator = + std::get<0>(std::tuple(maybeComparator...)); + Kokkos::sort( + exec, permute, KOKKOS_LAMBDA(int i, int j) { + return keys_comparator(raw_keys_in_comparator[i * stride], + raw_keys_in_comparator[j * stride]); + }); + } +#else + if constexpr (sizeof...(MaybeComparator) == 0) { + Kokkos::sort( + exec, permute, + KOKKOS_LAMBDA(int i, int j) { return keys(i) < keys(j); }); + } else { + auto keys_comparator = + std::get<0>(std::tuple(maybeComparator...)); + Kokkos::sort( + exec, permute, KOKKOS_LAMBDA(int i, int j) { + return keys_comparator(keys(i), keys(j)); + }); + } +#endif + } + + applyPermutation(exec, permute, keys); + applyPermutation(exec, permute, values); +} + +// ------------------------------------------------------ +// +// specialize cases for sorting by key without comparator +// +// ------------------------------------------------------ + +#if defined(KOKKOS_ENABLE_CUDA) +template +void sort_by_key_device_view_without_comparator( + const Kokkos::Cuda& exec, + const Kokkos::View& keys, + const Kokkos::View& values) { + sort_by_key_cudathrust(exec, keys, values); +} +#endif + +#if defined(KOKKOS_ENABLE_ROCTHRUST) +template +void sort_by_key_device_view_without_comparator( + const Kokkos::HIP& exec, + const Kokkos::View& keys, + const Kokkos::View& values) { + sort_by_key_rocthrust(exec, keys, values); +} +#endif + +#if defined(KOKKOS_ENABLE_ONEDPL) +template +void sort_by_key_device_view_without_comparator( + const Kokkos::Experimental::SYCL& exec, + const Kokkos::View& keys, + const Kokkos::View& values) { +#ifdef KOKKOS_ONEDPL_HAS_SORT_BY_KEY + if (keys.stride(0) == 1 && values.stride(0) == 1) + sort_by_key_onedpl(exec, keys, values); + else +#endif + sort_by_key_via_sort(exec, keys, 
values); +} +#endif + +// fallback case +template +std::enable_if_t::value> +sort_by_key_device_view_without_comparator( + const ExecutionSpace& exec, + const Kokkos::View& keys, + const Kokkos::View& values) { + sort_by_key_via_sort(exec, keys, values); +} + +// --------------------------------------------------- +// +// specialize cases for sorting by key with comparator +// +// --------------------------------------------------- + +#if defined(KOKKOS_ENABLE_CUDA) +template +void sort_by_key_device_view_with_comparator( + const Kokkos::Cuda& exec, + const Kokkos::View& keys, + const Kokkos::View& values, + const ComparatorType& comparator) { + sort_by_key_cudathrust(exec, keys, values, comparator); +} +#endif + +#if defined(KOKKOS_ENABLE_ROCTHRUST) +template +void sort_by_key_device_view_with_comparator( + const Kokkos::HIP& exec, + const Kokkos::View& keys, + const Kokkos::View& values, + const ComparatorType& comparator) { + sort_by_key_rocthrust(exec, keys, values, comparator); +} +#endif + +#if defined(KOKKOS_ENABLE_ONEDPL) +template +void sort_by_key_device_view_with_comparator( + const Kokkos::Experimental::SYCL& exec, + const Kokkos::View& keys, + const Kokkos::View& values, + const ComparatorType& comparator) { +#ifdef KOKKOS_ONEDPL_HAS_SORT_BY_KEY + if (keys.stride(0) == 1 && values.stride(0) == 1) + sort_by_key_onedpl(exec, keys, values, comparator); + else +#endif + sort_by_key_via_sort(exec, keys, values, comparator); +} +#endif + +// fallback case +template +std::enable_if_t::value> +sort_by_key_device_view_with_comparator( + const ExecutionSpace& exec, + const Kokkos::View& keys, + const Kokkos::View& values, + const ComparatorType& comparator) { + sort_by_key_via_sort(exec, keys, values, comparator); +} + +#undef KOKKOS_ONEDPL_HAS_SORT_BY_KEY + +} // namespace Kokkos::Impl +#endif diff --git a/algorithms/src/sorting/impl/Kokkos_SortImpl.hpp b/algorithms/src/sorting/impl/Kokkos_SortImpl.hpp index d87ab09e772..4c174b5fda9 100644 --- a/algorithms/src/sorting/impl/Kokkos_SortImpl.hpp +++ b/algorithms/src/sorting/impl/Kokkos_SortImpl.hpp @@ -63,6 +63,11 @@ #endif +#if defined(KOKKOS_ENABLE_ROCTHRUST) +#include +#include +#endif + #if defined(KOKKOS_ENABLE_ONEDPL) #include #include @@ -184,6 +189,26 @@ void sort_cudathrust(const Cuda& space, } #endif +#if defined(KOKKOS_ENABLE_ROCTHRUST) +template +void sort_rocthrust(const HIP& space, + const Kokkos::View& view, + MaybeComparator&&... 
maybeComparator) { + using ViewType = Kokkos::View; + static_assert(ViewType::rank == 1, + "Kokkos::sort: currently only supports rank-1 Views."); + + if (view.extent(0) <= 1) { + return; + } + const auto exec = thrust::hip::par.on(space.hip_stream()); + auto first = ::Kokkos::Experimental::begin(view); + auto last = ::Kokkos::Experimental::end(view); + thrust::sort(exec, first, last, + std::forward(maybeComparator)...); +} +#endif + #if defined(KOKKOS_ENABLE_ONEDPL) template void sort_onedpl(const Kokkos::Experimental::SYCL& space, @@ -274,6 +299,14 @@ void sort_device_view_without_comparator( } #endif +#if defined(KOKKOS_ENABLE_ROCTHRUST) +template +void sort_device_view_without_comparator( + const HIP& exec, const Kokkos::View& view) { + sort_rocthrust(exec, view); +} +#endif + #if defined(KOKKOS_ENABLE_ONEDPL) template void sort_device_view_without_comparator( @@ -320,6 +353,15 @@ void sort_device_view_with_comparator( } #endif +#if defined(KOKKOS_ENABLE_ROCTHRUST) +template +void sort_device_view_with_comparator( + const HIP& exec, const Kokkos::View& view, + const ComparatorType& comparator) { + sort_rocthrust(exec, view, comparator); +} +#endif + #if defined(KOKKOS_ENABLE_ONEDPL) template void sort_device_view_with_comparator( diff --git a/algorithms/src/std_algorithms/Kokkos_Copy.hpp b/algorithms/src/std_algorithms/Kokkos_Copy.hpp index b7ce1ba5edb..c5406c72b0d 100644 --- a/algorithms/src/std_algorithms/Kokkos_Copy.hpp +++ b/algorithms/src/std_algorithms/Kokkos_Copy.hpp @@ -50,7 +50,7 @@ template < std::enable_if_t<::Kokkos::is_execution_space_v, int> = 0> auto copy(const ExecutionSpace& ex, const ::Kokkos::View& source, - ::Kokkos::View& dest) { + const ::Kokkos::View& dest) { Impl::static_assert_is_admissible_to_kokkos_std_algorithms(source); Impl::static_assert_is_admissible_to_kokkos_std_algorithms(dest); @@ -66,7 +66,7 @@ template < std::enable_if_t<::Kokkos::is_execution_space_v, int> = 0> auto copy(const std::string& label, const ExecutionSpace& ex, const ::Kokkos::View& source, - ::Kokkos::View& dest) { + const ::Kokkos::View& dest) { Impl::static_assert_is_admissible_to_kokkos_std_algorithms(source); Impl::static_assert_is_admissible_to_kokkos_std_algorithms(dest); @@ -93,7 +93,7 @@ template & source, - ::Kokkos::View& dest) { + const ::Kokkos::View& dest) { Impl::static_assert_is_admissible_to_kokkos_std_algorithms(source); Impl::static_assert_is_admissible_to_kokkos_std_algorithms(dest); diff --git a/algorithms/src/std_algorithms/Kokkos_CopyBackward.hpp b/algorithms/src/std_algorithms/Kokkos_CopyBackward.hpp index 8f9e0f19b80..82071a9362e 100644 --- a/algorithms/src/std_algorithms/Kokkos_CopyBackward.hpp +++ b/algorithms/src/std_algorithms/Kokkos_CopyBackward.hpp @@ -50,7 +50,7 @@ template < std::enable_if_t<::Kokkos::is_execution_space_v, int> = 0> auto copy_backward(const ExecutionSpace& ex, const ::Kokkos::View& source, - ::Kokkos::View& dest) { + const ::Kokkos::View& dest) { Impl::static_assert_is_admissible_to_kokkos_std_algorithms(source); Impl::static_assert_is_admissible_to_kokkos_std_algorithms(dest); @@ -65,7 +65,7 @@ template < std::enable_if_t<::Kokkos::is_execution_space_v, int> = 0> auto copy_backward(const std::string& label, const ExecutionSpace& ex, const ::Kokkos::View& source, - ::Kokkos::View& dest) { + const ::Kokkos::View& dest) { Impl::static_assert_is_admissible_to_kokkos_std_algorithms(source); Impl::static_assert_is_admissible_to_kokkos_std_algorithms(dest); @@ -92,7 +92,7 @@ template & source, - ::Kokkos::View& dest) { + const ::Kokkos::View& 
dest) { Impl::static_assert_is_admissible_to_kokkos_std_algorithms(source); Impl::static_assert_is_admissible_to_kokkos_std_algorithms(dest); diff --git a/algorithms/src/std_algorithms/Kokkos_CopyIf.hpp b/algorithms/src/std_algorithms/Kokkos_CopyIf.hpp index ba18bc76b93..599fde5737a 100644 --- a/algorithms/src/std_algorithms/Kokkos_CopyIf.hpp +++ b/algorithms/src/std_algorithms/Kokkos_CopyIf.hpp @@ -54,7 +54,8 @@ template < std::enable_if_t<::Kokkos::is_execution_space_v, int> = 0> auto copy_if(const ExecutionSpace& ex, const ::Kokkos::View& source, - ::Kokkos::View& dest, Predicate pred) { + const ::Kokkos::View& dest, + Predicate pred) { Impl::static_assert_is_admissible_to_kokkos_std_algorithms(source); Impl::static_assert_is_admissible_to_kokkos_std_algorithms(dest); @@ -69,7 +70,8 @@ template < std::enable_if_t<::Kokkos::is_execution_space_v, int> = 0> auto copy_if(const std::string& label, const ExecutionSpace& ex, const ::Kokkos::View& source, - ::Kokkos::View& dest, Predicate pred) { + const ::Kokkos::View& dest, + Predicate pred) { Impl::static_assert_is_admissible_to_kokkos_std_algorithms(source); Impl::static_assert_is_admissible_to_kokkos_std_algorithms(dest); @@ -96,7 +98,7 @@ template & source, - ::Kokkos::View& dest, Predicate pred) { + const ::Kokkos::View& dest, Predicate pred) { Impl::static_assert_is_admissible_to_kokkos_std_algorithms(source); Impl::static_assert_is_admissible_to_kokkos_std_algorithms(dest); diff --git a/algorithms/src/std_algorithms/Kokkos_CopyN.hpp b/algorithms/src/std_algorithms/Kokkos_CopyN.hpp index 43c91204837..637d8d4cbc5 100644 --- a/algorithms/src/std_algorithms/Kokkos_CopyN.hpp +++ b/algorithms/src/std_algorithms/Kokkos_CopyN.hpp @@ -51,7 +51,7 @@ template < std::enable_if_t<::Kokkos::is_execution_space_v, int> = 0> auto copy_n(const ExecutionSpace& ex, const ::Kokkos::View& source, Size count, - ::Kokkos::View& dest) { + const ::Kokkos::View& dest) { Impl::static_assert_is_admissible_to_kokkos_std_algorithms(source); Impl::static_assert_is_admissible_to_kokkos_std_algorithms(dest); @@ -66,7 +66,7 @@ template < std::enable_if_t<::Kokkos::is_execution_space_v, int> = 0> auto copy_n(const std::string& label, const ExecutionSpace& ex, const ::Kokkos::View& source, Size count, - ::Kokkos::View& dest) { + const ::Kokkos::View& dest) { Impl::static_assert_is_admissible_to_kokkos_std_algorithms(source); Impl::static_assert_is_admissible_to_kokkos_std_algorithms(dest); @@ -93,7 +93,7 @@ template & source, Size count, - ::Kokkos::View& dest) { + const ::Kokkos::View& dest) { Impl::static_assert_is_admissible_to_kokkos_std_algorithms(source); Impl::static_assert_is_admissible_to_kokkos_std_algorithms(dest); diff --git a/algorithms/src/std_algorithms/Kokkos_Equal.hpp b/algorithms/src/std_algorithms/Kokkos_Equal.hpp index a72a49cc22b..593c42f87e1 100644 --- a/algorithms/src/std_algorithms/Kokkos_Equal.hpp +++ b/algorithms/src/std_algorithms/Kokkos_Equal.hpp @@ -80,7 +80,7 @@ template < std::enable_if_t<::Kokkos::is_execution_space_v, int> = 0> bool equal(const ExecutionSpace& ex, const ::Kokkos::View& view1, - ::Kokkos::View& view2) { + const ::Kokkos::View& view2) { Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view1); Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view2); @@ -96,7 +96,7 @@ template < std::enable_if_t<::Kokkos::is_execution_space_v, int> = 0> bool equal(const std::string& label, const ExecutionSpace& ex, const ::Kokkos::View& view1, - ::Kokkos::View& view2) { + const ::Kokkos::View& view2) { 
Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view1); Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view2); @@ -111,7 +111,7 @@ template < std::enable_if_t<::Kokkos::is_execution_space_v, int> = 0> bool equal(const ExecutionSpace& ex, const ::Kokkos::View& view1, - ::Kokkos::View& view2, + const ::Kokkos::View& view2, BinaryPredicateType predicate) { Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view1); Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view2); @@ -128,7 +128,7 @@ template < std::enable_if_t<::Kokkos::is_execution_space_v, int> = 0> bool equal(const std::string& label, const ExecutionSpace& ex, const ::Kokkos::View& view1, - ::Kokkos::View& view2, + const ::Kokkos::View& view2, BinaryPredicateType predicate) { Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view1); Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view2); @@ -227,7 +227,7 @@ template & view1, - ::Kokkos::View& view2) { + const ::Kokkos::View& view2) { Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view1); Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view2); @@ -243,7 +243,7 @@ template & view1, - ::Kokkos::View& view2, + const ::Kokkos::View& view2, BinaryPredicateType predicate) { Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view1); Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view2); diff --git a/algorithms/src/std_algorithms/Kokkos_IterSwap.hpp b/algorithms/src/std_algorithms/Kokkos_IterSwap.hpp index a796a306dda..5bb2d1039dc 100644 --- a/algorithms/src/std_algorithms/Kokkos_IterSwap.hpp +++ b/algorithms/src/std_algorithms/Kokkos_IterSwap.hpp @@ -19,7 +19,6 @@ #include #include "impl/Kokkos_Constraints.hpp" -#include "Kokkos_Swap.hpp" namespace Kokkos { namespace Experimental { @@ -33,7 +32,7 @@ struct StdIterSwapFunctor { KOKKOS_FUNCTION void operator()(int i) const { (void)i; - ::Kokkos::Experimental::swap(*m_a, *m_b); + ::Kokkos::kokkos_swap(*m_a, *m_b); } KOKKOS_FUNCTION @@ -58,6 +57,16 @@ void iter_swap(IteratorType1 a, IteratorType2 b) { Impl::iter_swap_impl(a, b); } +#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 +template +KOKKOS_DEPRECATED_WITH_COMMENT("Use Kokkos::kokkos_swap instead!") +KOKKOS_FUNCTION + void swap(T& a, T& b) noexcept(::Kokkos::kokkos_swap(std::declval(), + std::declval())) { + ::Kokkos::kokkos_swap(a, b); +} +#endif + } // namespace Experimental } // namespace Kokkos diff --git a/algorithms/src/std_algorithms/Kokkos_LexicographicalCompare.hpp b/algorithms/src/std_algorithms/Kokkos_LexicographicalCompare.hpp index 4b5c69df451..e13479c370b 100644 --- a/algorithms/src/std_algorithms/Kokkos_LexicographicalCompare.hpp +++ b/algorithms/src/std_algorithms/Kokkos_LexicographicalCompare.hpp @@ -54,7 +54,7 @@ template < bool lexicographical_compare( const ExecutionSpace& ex, const ::Kokkos::View& view1, - ::Kokkos::View& view2) { + const ::Kokkos::View& view2) { Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view1); Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view2); @@ -71,7 +71,7 @@ template < bool lexicographical_compare( const std::string& label, const ExecutionSpace& ex, const ::Kokkos::View& view1, - ::Kokkos::View& view2) { + const ::Kokkos::View& view2) { Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view1); Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view2); @@ -112,7 +112,8 @@ template < bool lexicographical_compare( const ExecutionSpace& ex, const ::Kokkos::View& view1, - ::Kokkos::View& view2, ComparatorType 
comp) { + const ::Kokkos::View& view2, + ComparatorType comp) { Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view1); Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view2); @@ -129,7 +130,8 @@ template < bool lexicographical_compare( const std::string& label, const ExecutionSpace& ex, const ::Kokkos::View& view1, - ::Kokkos::View& view2, ComparatorType comp) { + const ::Kokkos::View& view2, + ComparatorType comp) { Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view1); Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view2); @@ -161,7 +163,7 @@ template & view1, - ::Kokkos::View& view2) { + const ::Kokkos::View& view2) { Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view1); Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view2); @@ -187,7 +189,8 @@ template & view1, - ::Kokkos::View& view2, ComparatorType comp) { + const ::Kokkos::View& view2, + ComparatorType comp) { Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view1); Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view2); diff --git a/algorithms/src/std_algorithms/Kokkos_Move.hpp b/algorithms/src/std_algorithms/Kokkos_Move.hpp index f04ea12ba88..ac308ea1845 100644 --- a/algorithms/src/std_algorithms/Kokkos_Move.hpp +++ b/algorithms/src/std_algorithms/Kokkos_Move.hpp @@ -50,7 +50,7 @@ template < std::enable_if_t<::Kokkos::is_execution_space_v, int> = 0> auto move(const ExecutionSpace& ex, const ::Kokkos::View& source, - ::Kokkos::View& dest) { + const ::Kokkos::View& dest) { Impl::static_assert_is_admissible_to_kokkos_std_algorithms(source); Impl::static_assert_is_admissible_to_kokkos_std_algorithms(dest); @@ -64,7 +64,7 @@ template < std::enable_if_t<::Kokkos::is_execution_space_v, int> = 0> auto move(const std::string& label, const ExecutionSpace& ex, const ::Kokkos::View& source, - ::Kokkos::View& dest) { + const ::Kokkos::View& dest) { Impl::static_assert_is_admissible_to_kokkos_std_algorithms(source); Impl::static_assert_is_admissible_to_kokkos_std_algorithms(dest); @@ -92,7 +92,7 @@ template & source, - ::Kokkos::View& dest) { + const ::Kokkos::View& dest) { Impl::static_assert_is_admissible_to_kokkos_std_algorithms(source); Impl::static_assert_is_admissible_to_kokkos_std_algorithms(dest); diff --git a/algorithms/src/std_algorithms/Kokkos_MoveBackward.hpp b/algorithms/src/std_algorithms/Kokkos_MoveBackward.hpp index 375474ca57f..2789ab21796 100644 --- a/algorithms/src/std_algorithms/Kokkos_MoveBackward.hpp +++ b/algorithms/src/std_algorithms/Kokkos_MoveBackward.hpp @@ -41,7 +41,7 @@ template < std::enable_if_t<::Kokkos::is_execution_space_v, int> = 0> auto move_backward(const ExecutionSpace& ex, const ::Kokkos::View& source, - ::Kokkos::View& dest) { + const ::Kokkos::View& dest) { Impl::static_assert_is_admissible_to_kokkos_std_algorithms(source); Impl::static_assert_is_admissible_to_kokkos_std_algorithms(dest); @@ -65,7 +65,7 @@ template < std::enable_if_t<::Kokkos::is_execution_space_v, int> = 0> auto move_backward(const std::string& label, const ExecutionSpace& ex, const ::Kokkos::View& source, - ::Kokkos::View& dest) { + const ::Kokkos::View& dest) { Impl::static_assert_is_admissible_to_kokkos_std_algorithms(source); Impl::static_assert_is_admissible_to_kokkos_std_algorithms(dest); @@ -94,7 +94,7 @@ template & source, - ::Kokkos::View& dest) { + const ::Kokkos::View& dest) { Impl::static_assert_is_admissible_to_kokkos_std_algorithms(source); Impl::static_assert_is_admissible_to_kokkos_std_algorithms(dest); diff --git 
a/algorithms/src/std_algorithms/Kokkos_ReverseCopy.hpp b/algorithms/src/std_algorithms/Kokkos_ReverseCopy.hpp index 37336c983ab..66f39c4eaa6 100644 --- a/algorithms/src/std_algorithms/Kokkos_ReverseCopy.hpp +++ b/algorithms/src/std_algorithms/Kokkos_ReverseCopy.hpp @@ -50,7 +50,7 @@ template < std::enable_if_t<::Kokkos::is_execution_space_v, int> = 0> auto reverse_copy(const ExecutionSpace& ex, const ::Kokkos::View& source, - ::Kokkos::View& dest) { + const ::Kokkos::View& dest) { Impl::static_assert_is_admissible_to_kokkos_std_algorithms(source); Impl::static_assert_is_admissible_to_kokkos_std_algorithms(dest); @@ -65,7 +65,7 @@ template < std::enable_if_t<::Kokkos::is_execution_space_v, int> = 0> auto reverse_copy(const std::string& label, const ExecutionSpace& ex, const ::Kokkos::View& source, - ::Kokkos::View& dest) { + const ::Kokkos::View& dest) { Impl::static_assert_is_admissible_to_kokkos_std_algorithms(source); Impl::static_assert_is_admissible_to_kokkos_std_algorithms(dest); @@ -94,7 +94,7 @@ template & source, - ::Kokkos::View& dest) { + const ::Kokkos::View& dest) { Impl::static_assert_is_admissible_to_kokkos_std_algorithms(source); Impl::static_assert_is_admissible_to_kokkos_std_algorithms(dest); diff --git a/algorithms/src/std_algorithms/Kokkos_SwapRanges.hpp b/algorithms/src/std_algorithms/Kokkos_SwapRanges.hpp index 39f33b64879..d66763d304c 100644 --- a/algorithms/src/std_algorithms/Kokkos_SwapRanges.hpp +++ b/algorithms/src/std_algorithms/Kokkos_SwapRanges.hpp @@ -40,7 +40,7 @@ template , int> = 0> auto swap_ranges(const ExecutionSpace& ex, const ::Kokkos::View& source, - ::Kokkos::View& dest) { + const ::Kokkos::View& dest) { Impl::static_assert_is_admissible_to_kokkos_std_algorithms(source); Impl::static_assert_is_admissible_to_kokkos_std_algorithms(dest); @@ -64,7 +64,7 @@ template , int> = 0> auto swap_ranges(const std::string& label, const ExecutionSpace& ex, const ::Kokkos::View& source, - ::Kokkos::View& dest) { + const ::Kokkos::View& dest) { Impl::static_assert_is_admissible_to_kokkos_std_algorithms(source); Impl::static_assert_is_admissible_to_kokkos_std_algorithms(dest); @@ -94,7 +94,7 @@ template & source, - ::Kokkos::View& dest) { + const ::Kokkos::View& dest) { Impl::static_assert_is_admissible_to_kokkos_std_algorithms(source); Impl::static_assert_is_admissible_to_kokkos_std_algorithms(dest); diff --git a/algorithms/src/std_algorithms/Kokkos_Transform.hpp b/algorithms/src/std_algorithms/Kokkos_Transform.hpp index 838c9169e25..84cbed524d3 100644 --- a/algorithms/src/std_algorithms/Kokkos_Transform.hpp +++ b/algorithms/src/std_algorithms/Kokkos_Transform.hpp @@ -58,7 +58,7 @@ template , int> = 0> auto transform(const ExecutionSpace& ex, const ::Kokkos::View& source, - ::Kokkos::View& dest, + const ::Kokkos::View& dest, UnaryOperation unary_op) { Impl::static_assert_is_admissible_to_kokkos_std_algorithms(source); Impl::static_assert_is_admissible_to_kokkos_std_algorithms(dest); @@ -73,7 +73,7 @@ template , int> = 0> auto transform(const std::string& label, const ExecutionSpace& ex, const ::Kokkos::View& source, - ::Kokkos::View& dest, + const ::Kokkos::View& dest, UnaryOperation unary_op) { Impl::static_assert_is_admissible_to_kokkos_std_algorithms(source); Impl::static_assert_is_admissible_to_kokkos_std_algorithms(dest); @@ -119,7 +119,7 @@ template & source1, const ::Kokkos::View& source2, - ::Kokkos::View& dest, + const ::Kokkos::View& dest, BinaryOperation binary_op) { Impl::static_assert_is_admissible_to_kokkos_std_algorithms(source1); 
Impl::static_assert_is_admissible_to_kokkos_std_algorithms(source2); @@ -137,7 +137,7 @@ template & source1, const ::Kokkos::View& source2, - ::Kokkos::View& dest, + const ::Kokkos::View& dest, BinaryOperation binary_op) { Impl::static_assert_is_admissible_to_kokkos_std_algorithms(source1); Impl::static_assert_is_admissible_to_kokkos_std_algorithms(source2); @@ -174,7 +174,8 @@ template & source, - ::Kokkos::View& dest, UnaryOperation unary_op) { + const ::Kokkos::View& dest, + UnaryOperation unary_op) { Impl::static_assert_is_admissible_to_kokkos_std_algorithms(source); Impl::static_assert_is_admissible_to_kokkos_std_algorithms(dest); @@ -207,7 +208,7 @@ KOKKOS_FUNCTION auto transform( const TeamHandleType& teamHandle, const ::Kokkos::View& source1, const ::Kokkos::View& source2, - ::Kokkos::View& dest, + const ::Kokkos::View& dest, BinaryOperation binary_op) { Impl::static_assert_is_admissible_to_kokkos_std_algorithms(source1); Impl::static_assert_is_admissible_to_kokkos_std_algorithms(source2); diff --git a/algorithms/src/std_algorithms/impl/Kokkos_FunctorsForExclusiveScan.hpp b/algorithms/src/std_algorithms/impl/Kokkos_FunctorsForExclusiveScan.hpp index 8151ee34955..5a7fe16984a 100644 --- a/algorithms/src/std_algorithms/impl/Kokkos_FunctorsForExclusiveScan.hpp +++ b/algorithms/src/std_algorithms/impl/Kokkos_FunctorsForExclusiveScan.hpp @@ -47,8 +47,9 @@ struct ExclusiveScanDefaultFunctorForKnownNeutralElement { KOKKOS_FUNCTION void operator()(const IndexType i, ValueType& update, const bool final_pass) const { + const auto tmp = m_first_from[i]; if (final_pass) m_first_dest[i] = update + m_init_value; - update += m_first_from[i]; + update += tmp; } }; @@ -73,6 +74,7 @@ struct ExclusiveScanDefaultFunctorWithValueWrapper { KOKKOS_FUNCTION void operator()(const IndexType i, value_type& update, const bool final_pass) const { + const auto tmp = value_type{m_first_from[i], false}; if (final_pass) { if (i == 0) { m_first_dest[i] = m_init_value; @@ -81,7 +83,6 @@ struct ExclusiveScanDefaultFunctorWithValueWrapper { } } - const auto tmp = value_type{m_first_from[i], false}; this->join(update, tmp); } @@ -132,6 +133,7 @@ struct TransformExclusiveScanFunctorWithValueWrapper { KOKKOS_FUNCTION void operator()(const IndexType i, value_type& update, const bool final_pass) const { + const auto tmp = value_type{m_unary_op(m_first_from[i]), false}; if (final_pass) { if (i == 0) { // for both ExclusiveScan and TransformExclusiveScan, @@ -142,7 +144,6 @@ struct TransformExclusiveScanFunctorWithValueWrapper { } } - const auto tmp = value_type{m_unary_op(m_first_from[i]), false}; this->join(update, tmp); } @@ -190,6 +191,7 @@ struct TransformExclusiveScanFunctorWithoutValueWrapper { KOKKOS_FUNCTION void operator()(const IndexType i, ValueType& update, const bool final_pass) const { + const auto tmp = ValueType{m_unary_op(m_first_from[i])}; if (final_pass) { if (i == 0) { // for both ExclusiveScan and TransformExclusiveScan, @@ -200,7 +202,6 @@ struct TransformExclusiveScanFunctorWithoutValueWrapper { } } - const auto tmp = ValueType{m_unary_op(m_first_from[i])}; this->join(update, tmp); } diff --git a/algorithms/src/std_algorithms/impl/Kokkos_RemoveAllVariants.hpp b/algorithms/src/std_algorithms/impl/Kokkos_RemoveAllVariants.hpp index 50224c8874e..456df43aed2 100644 --- a/algorithms/src/std_algorithms/impl/Kokkos_RemoveAllVariants.hpp +++ b/algorithms/src/std_algorithms/impl/Kokkos_RemoveAllVariants.hpp @@ -46,15 +46,14 @@ struct StdRemoveIfStage1Functor { void operator()(const IndexType i, IndexType& 
update, const bool final_pass) const { auto& myval = m_first_from[i]; - if (final_pass) { - if (!m_must_remove(myval)) { + + if (!m_must_remove(myval)) { + if (final_pass) { // calling move here is ok because we are inside final pass // we are calling move assign as specified by the std m_first_dest[update] = std::move(myval); } - } - if (!m_must_remove(myval)) { update += 1; } } @@ -108,7 +107,9 @@ IteratorType remove_if_exespace_impl(const std::string& label, // create helper tmp view using value_type = typename IteratorType::value_type; using tmp_view_type = Kokkos::View; - tmp_view_type tmp_view("std_remove_if_tmp_view", keep_count); + tmp_view_type tmp_view(Kokkos::view_alloc(Kokkos::WithoutInitializing, ex, + "std_remove_if_tmp_view"), + keep_count); using tmp_readwrite_iterator_type = decltype(begin(tmp_view)); // in stage 1, *move* all elements to keep from original range to tmp diff --git a/algorithms/src/std_algorithms/impl/Kokkos_Reverse.hpp b/algorithms/src/std_algorithms/impl/Kokkos_Reverse.hpp index 428dc0d744a..b4046c7645b 100644 --- a/algorithms/src/std_algorithms/impl/Kokkos_Reverse.hpp +++ b/algorithms/src/std_algorithms/impl/Kokkos_Reverse.hpp @@ -21,7 +21,6 @@ #include "Kokkos_Constraints.hpp" #include "Kokkos_HelperPredicates.hpp" #include -#include #include namespace Kokkos { @@ -39,7 +38,7 @@ struct StdReverseFunctor { KOKKOS_FUNCTION void operator()(index_type i) const { - ::Kokkos::Experimental::swap(m_first[i], m_last[-i - 1]); + ::Kokkos::kokkos_swap(m_first[i], m_last[-i - 1]); } KOKKOS_FUNCTION diff --git a/algorithms/src/std_algorithms/impl/Kokkos_ShiftLeft.hpp b/algorithms/src/std_algorithms/impl/Kokkos_ShiftLeft.hpp index 50bc7c8d610..94147485071 100644 --- a/algorithms/src/std_algorithms/impl/Kokkos_ShiftLeft.hpp +++ b/algorithms/src/std_algorithms/impl/Kokkos_ShiftLeft.hpp @@ -126,10 +126,11 @@ KOKKOS_FUNCTION IteratorType shift_left_team_impl( // execution space impl because for this team impl we are // within a parallel region, so for now we solve serially - const std::size_t numElementsToMove = + using difference_type = typename IteratorType::difference_type; + const difference_type numElementsToMove = ::Kokkos::Experimental::distance(first + n, last); Kokkos::single(Kokkos::PerTeam(teamHandle), [=]() { - for (std::size_t i = 0; i < numElementsToMove; ++i) { + for (difference_type i = 0; i < numElementsToMove; ++i) { first[i] = std::move(first[i + n]); } }); diff --git a/algorithms/src/std_algorithms/impl/Kokkos_ShiftRight.hpp b/algorithms/src/std_algorithms/impl/Kokkos_ShiftRight.hpp index cac20bfbba6..0414e6f1c25 100644 --- a/algorithms/src/std_algorithms/impl/Kokkos_ShiftRight.hpp +++ b/algorithms/src/std_algorithms/impl/Kokkos_ShiftRight.hpp @@ -103,26 +103,6 @@ IteratorType shift_right_exespace_impl( return first + n; } -template -struct StdShiftRightTeamSingleFunctor { - Iterator m_first; - Iterator m_last; - std::size_t m_shift; - - KOKKOS_FUNCTION - void operator()() const { - // the impl function calling this functor guarantees that - // - m_shift is non-negative - // - m_first, m_last identify a valid range with m_last > m_first - // - m_shift is less than m_last - m_first - // so I can safely use std::size_t here - } - - KOKKOS_FUNCTION - StdShiftRightTeamSingleFunctor(Iterator _first, Iterator _last, std::size_t n) - : m_first(std::move(_first)), m_last(std::move(_last)), m_shift(n) {} -}; - template KOKKOS_FUNCTION IteratorType shift_right_team_impl( const TeamHandleType& teamHandle, IteratorType first, IteratorType last, @@ -145,10 +125,11 
@@ KOKKOS_FUNCTION IteratorType shift_right_team_impl( // execution space impl because for this team impl we are // within a parallel region, so for now we solve serially - const std::size_t numElementsToMove = + using difference_type = typename IteratorType::difference_type; + const difference_type numElementsToMove = ::Kokkos::Experimental::distance(first, last - n); Kokkos::single(Kokkos::PerTeam(teamHandle), [=]() { - for (std::size_t i = 0; i < numElementsToMove; ++i) { + for (difference_type i = 0; i < numElementsToMove; ++i) { last[-i - 1] = std::move(last[-n - i - 1]); } }); diff --git a/algorithms/src/std_algorithms/impl/Kokkos_SwapRanges.hpp b/algorithms/src/std_algorithms/impl/Kokkos_SwapRanges.hpp index 5bc77ed7ddc..930a14ac48c 100644 --- a/algorithms/src/std_algorithms/impl/Kokkos_SwapRanges.hpp +++ b/algorithms/src/std_algorithms/impl/Kokkos_SwapRanges.hpp @@ -21,7 +21,6 @@ #include "Kokkos_Constraints.hpp" #include "Kokkos_HelperPredicates.hpp" #include -#include #include namespace Kokkos { @@ -36,7 +35,7 @@ struct StdSwapRangesFunctor { KOKKOS_FUNCTION void operator()(index_type i) const { - ::Kokkos::Experimental::swap(m_first1[i], m_first2[i]); + ::Kokkos::kokkos_swap(m_first1[i], m_first2[i]); } KOKKOS_FUNCTION diff --git a/algorithms/src/std_algorithms/impl/Kokkos_Unique.hpp b/algorithms/src/std_algorithms/impl/Kokkos_Unique.hpp index 11afa8ed6e0..28635824585 100644 --- a/algorithms/src/std_algorithms/impl/Kokkos_Unique.hpp +++ b/algorithms/src/std_algorithms/impl/Kokkos_Unique.hpp @@ -105,7 +105,9 @@ IteratorType unique_exespace_impl(const std::string& label, // using the same algorithm used for unique_copy but we now move things using value_type = typename IteratorType::value_type; using tmp_view_type = Kokkos::View; - tmp_view_type tmp_view("std_unique_tmp_view", num_elements_to_explore); + tmp_view_type tmp_view(Kokkos::view_alloc(ex, Kokkos::WithoutInitializing, + "std_unique_tmp_view"), + num_elements_to_explore); // scan extent is: num_elements_to_explore - 1 // for same reason as the one explained in unique_copy diff --git a/algorithms/unit_tests/CMakeLists.txt b/algorithms/unit_tests/CMakeLists.txt index 419f5ec1d13..db184bc8a99 100644 --- a/algorithms/unit_tests/CMakeLists.txt +++ b/algorithms/unit_tests/CMakeLists.txt @@ -25,6 +25,7 @@ foreach(Tag Threads;Serial;OpenMP;Cuda;HPX;HIP;SYCL;OpenMPTarget) set(ALGO_SORT_SOURCES) foreach(SOURCE_Input TestSort + TestSortByKey TestSortCustomComp TestBinSortA TestBinSortB @@ -57,35 +58,37 @@ foreach(Tag Threads;Serial;OpenMP;Cuda;HPX;HIP;SYCL;OpenMPTarget) configure_file(${dir}/dummy.cpp ${file}) list(APPEND ALGO_RANDOM_SOURCES ${file}) endforeach() + endif() +endforeach() - # ------------------------------------------ - # std set A - # ------------------------------------------ - set(STDALGO_SOURCES_A) - foreach(Name +# ------------------------------------------ +# std set A +# ------------------------------------------ +set(STDALGO_SOURCES_A) +foreach(Name StdReducers StdAlgorithmsConstraints RandomAccessIterator - ) - list(APPEND STDALGO_SOURCES_A Test${Name}.cpp) - endforeach() + ) + list(APPEND STDALGO_SOURCES_A Test${Name}.cpp) +endforeach() - # ------------------------------------------ - # std set B - # ------------------------------------------ - set(STDALGO_SOURCES_B) - foreach(Name +# ------------------------------------------ +# std set B +# ------------------------------------------ +set(STDALGO_SOURCES_B) +foreach(Name StdAlgorithmsCommon StdAlgorithmsMinMaxElementOps - ) - list(APPEND STDALGO_SOURCES_B 
Test${Name}.cpp) - endforeach() + ) + list(APPEND STDALGO_SOURCES_B Test${Name}.cpp) +endforeach() - # ------------------------------------------ - # std set C - # ------------------------------------------ - set(STDALGO_SOURCES_C) - foreach(Name +# ------------------------------------------ +# std set C +# ------------------------------------------ +set(STDALGO_SOURCES_C) +foreach(Name StdAlgorithmsCommon StdAlgorithmsLexicographicalCompare StdAlgorithmsForEach @@ -100,15 +103,15 @@ foreach(Tag Threads;Serial;OpenMP;Cuda;HPX;HIP;SYCL;OpenMPTarget) StdAlgorithmsSearch_n StdAlgorithmsMismatch StdAlgorithmsMoveBackward - ) - list(APPEND STDALGO_SOURCES_C Test${Name}.cpp) - endforeach() + ) + list(APPEND STDALGO_SOURCES_C Test${Name}.cpp) +endforeach() - # ------------------------------------------ - # std set D - # ------------------------------------------ - set(STDALGO_SOURCES_D) - foreach(Name +# ------------------------------------------ +# std set D +# ------------------------------------------ +set(STDALGO_SOURCES_D) +foreach(Name StdAlgorithmsCommon StdAlgorithmsModOps StdAlgorithmsModSeqOps @@ -128,15 +131,15 @@ foreach(Tag Threads;Serial;OpenMP;Cuda;HPX;HIP;SYCL;OpenMPTarget) StdAlgorithmsReverse StdAlgorithmsShiftLeft StdAlgorithmsShiftRight - ) - list(APPEND STDALGO_SOURCES_D Test${Name}.cpp) - endforeach() + ) + list(APPEND STDALGO_SOURCES_D Test${Name}.cpp) +endforeach() - # ------------------------------------------ - # std set E - # ------------------------------------------ - set(STDALGO_SOURCES_E) - foreach(Name +# ------------------------------------------ +# std set E +# ------------------------------------------ +set(STDALGO_SOURCES_E) +foreach(Name StdAlgorithmsCommon StdAlgorithmsIsSorted StdAlgorithmsIsSortedUntil @@ -149,83 +152,83 @@ foreach(Tag Threads;Serial;OpenMP;Cuda;HPX;HIP;SYCL;OpenMPTarget) StdAlgorithmsTransformUnaryOp StdAlgorithmsTransformExclusiveScan StdAlgorithmsTransformInclusiveScan - ) - list(APPEND STDALGO_SOURCES_E Test${Name}.cpp) - endforeach() + ) + list(APPEND STDALGO_SOURCES_E Test${Name}.cpp) +endforeach() - # ------------------------------------------ - # std team Q - # ------------------------------------------ - set(STDALGO_TEAM_SOURCES_Q) - foreach(Name +# ------------------------------------------ +# std team Q +# ------------------------------------------ +set(STDALGO_TEAM_SOURCES_Q) +foreach(Name StdAlgorithmsCommon StdAlgorithmsTeamInclusiveScan StdAlgorithmsTeamTransformInclusiveScan - ) - list(APPEND STDALGO_TEAM_SOURCES_Q Test${Name}.cpp) - endforeach() + ) + list(APPEND STDALGO_TEAM_SOURCES_Q Test${Name}.cpp) +endforeach() - # ------------------------------------------ - # std team P - # ------------------------------------------ - set(STDALGO_TEAM_SOURCES_P) - foreach(Name +# ------------------------------------------ +# std team P +# ------------------------------------------ +set(STDALGO_TEAM_SOURCES_P) +foreach(Name StdAlgorithmsCommon StdAlgorithmsTeamExclusiveScan StdAlgorithmsTeamTransformExclusiveScan - ) - list(APPEND STDALGO_TEAM_SOURCES_P Test${Name}.cpp) - endforeach() + ) + list(APPEND STDALGO_TEAM_SOURCES_P Test${Name}.cpp) +endforeach() - # ------------------------------------------ - # std team M - # ------------------------------------------ - set(STDALGO_TEAM_SOURCES_M) - foreach(Name +# ------------------------------------------ +# std team M +# ------------------------------------------ +set(STDALGO_TEAM_SOURCES_M) +foreach(Name StdAlgorithmsCommon StdAlgorithmsTeamTransformUnaryOp 
StdAlgorithmsTeamTransformBinaryOp StdAlgorithmsTeamGenerate StdAlgorithmsTeamGenerate_n StdAlgorithmsTeamSwapRanges - ) - list(APPEND STDALGO_TEAM_SOURCES_M Test${Name}.cpp) - endforeach() + ) + list(APPEND STDALGO_TEAM_SOURCES_M Test${Name}.cpp) +endforeach() - # ------------------------------------------ - # std team L - # ------------------------------------------ - set(STDALGO_TEAM_SOURCES_L) - foreach(Name +# ------------------------------------------ +# std team L +# ------------------------------------------ +set(STDALGO_TEAM_SOURCES_L) +foreach(Name StdAlgorithmsCommon StdAlgorithmsTeamIsSorted StdAlgorithmsTeamIsSortedUntil StdAlgorithmsTeamIsPartitioned StdAlgorithmsTeamPartitionCopy StdAlgorithmsTeamPartitionPoint - ) - list(APPEND STDALGO_TEAM_SOURCES_L Test${Name}.cpp) - endforeach() + ) + list(APPEND STDALGO_TEAM_SOURCES_L Test${Name}.cpp) +endforeach() - # ------------------------------------------ - # std team I - # ------------------------------------------ - set(STDALGO_TEAM_SOURCES_I) - foreach(Name +# ------------------------------------------ +# std team I +# ------------------------------------------ +set(STDALGO_TEAM_SOURCES_I) +foreach(Name StdAlgorithmsCommon StdAlgorithmsTeamUnique StdAlgorithmsTeamAdjacentDifference StdAlgorithmsTeamReduce StdAlgorithmsTeamTransformReduce - ) - list(APPEND STDALGO_TEAM_SOURCES_I Test${Name}.cpp) - endforeach() + ) + list(APPEND STDALGO_TEAM_SOURCES_I Test${Name}.cpp) +endforeach() - # ------------------------------------------ - # std team H - # ------------------------------------------ - set(STDALGO_TEAM_SOURCES_H) - foreach(Name +# ------------------------------------------ +# std team H +# ------------------------------------------ +set(STDALGO_TEAM_SOURCES_H) +foreach(Name StdAlgorithmsCommon StdAlgorithmsTeamCopy StdAlgorithmsTeamCopy_n @@ -236,43 +239,43 @@ foreach(Tag Threads;Serial;OpenMP;Cuda;HPX;HIP;SYCL;OpenMPTarget) StdAlgorithmsTeamRemoveIf StdAlgorithmsTeamRemoveCopy StdAlgorithmsTeamRemoveCopyIf - ) - list(APPEND STDALGO_TEAM_SOURCES_H Test${Name}.cpp) - endforeach() + ) + list(APPEND STDALGO_TEAM_SOURCES_H Test${Name}.cpp) +endforeach() - # ------------------------------------------ - # std team G - # ------------------------------------------ - set(STDALGO_TEAM_SOURCES_G) - foreach(Name +# ------------------------------------------ +# std team G +# ------------------------------------------ +set(STDALGO_TEAM_SOURCES_G) +foreach(Name StdAlgorithmsCommon StdAlgorithmsTeamMove StdAlgorithmsTeamMoveBackward StdAlgorithmsTeamShiftLeft StdAlgorithmsTeamShiftRight - ) - list(APPEND STDALGO_TEAM_SOURCES_G Test${Name}.cpp) - endforeach() + ) + list(APPEND STDALGO_TEAM_SOURCES_G Test${Name}.cpp) +endforeach() - # ------------------------------------------ - # std team F - # ------------------------------------------ - set(STDALGO_TEAM_SOURCES_F) - foreach(Name +# ------------------------------------------ +# std team F +# ------------------------------------------ +set(STDALGO_TEAM_SOURCES_F) +foreach(Name StdAlgorithmsCommon StdAlgorithmsTeamReverse StdAlgorithmsTeamReverseCopy StdAlgorithmsTeamRotate StdAlgorithmsTeamRotateCopy - ) - list(APPEND STDALGO_TEAM_SOURCES_F Test${Name}.cpp) - endforeach() + ) + list(APPEND STDALGO_TEAM_SOURCES_F Test${Name}.cpp) +endforeach() - # ------------------------------------------ - # std team E - # ------------------------------------------ - set(STDALGO_TEAM_SOURCES_E) - foreach(Name +# ------------------------------------------ +# std team E +# 
------------------------------------------ +set(STDALGO_TEAM_SOURCES_E) +foreach(Name StdAlgorithmsCommon StdAlgorithmsTeamFill StdAlgorithmsTeamFill_n @@ -280,28 +283,28 @@ foreach(Tag Threads;Serial;OpenMP;Cuda;HPX;HIP;SYCL;OpenMPTarget) StdAlgorithmsTeamReplaceIf StdAlgorithmsTeamReplaceCopy StdAlgorithmsTeamReplaceCopyIf - ) - list(APPEND STDALGO_TEAM_SOURCES_E Test${Name}.cpp) - endforeach() + ) + list(APPEND STDALGO_TEAM_SOURCES_E Test${Name}.cpp) +endforeach() - # ------------------------------------------ - # std team D - # ------------------------------------------ - set(STDALGO_TEAM_SOURCES_D) - foreach(Name +# ------------------------------------------ +# std team D +# ------------------------------------------ +set(STDALGO_TEAM_SOURCES_D) +foreach(Name StdAlgorithmsCommon StdAlgorithmsTeamMinElement StdAlgorithmsTeamMaxElement StdAlgorithmsTeamMinMaxElement - ) - list(APPEND STDALGO_TEAM_SOURCES_D Test${Name}.cpp) - endforeach() + ) + list(APPEND STDALGO_TEAM_SOURCES_D Test${Name}.cpp) +endforeach() - # ------------------------------------------ - # std team C - # ------------------------------------------ - set(STDALGO_TEAM_SOURCES_C) - foreach(Name +# ------------------------------------------ +# std team C +# ------------------------------------------ +set(STDALGO_TEAM_SOURCES_C) +foreach(Name StdAlgorithmsCommon StdAlgorithmsTeamFind StdAlgorithmsTeamFindIf @@ -310,29 +313,29 @@ foreach(Tag Threads;Serial;OpenMP;Cuda;HPX;HIP;SYCL;OpenMPTarget) StdAlgorithmsTeamAnyOf StdAlgorithmsTeamNoneOf StdAlgorithmsTeamSearchN - ) - list(APPEND STDALGO_TEAM_SOURCES_C Test${Name}.cpp) - endforeach() + ) + list(APPEND STDALGO_TEAM_SOURCES_C Test${Name}.cpp) +endforeach() - # ------------------------------------------ - # std team B - # ------------------------------------------ - set(STDALGO_TEAM_SOURCES_B) - foreach(Name +# ------------------------------------------ +# std team B +# ------------------------------------------ +set(STDALGO_TEAM_SOURCES_B) +foreach(Name StdAlgorithmsCommon StdAlgorithmsTeamEqual StdAlgorithmsTeamSearch StdAlgorithmsTeamFindEnd StdAlgorithmsTeamFindFirstOf - ) - list(APPEND STDALGO_TEAM_SOURCES_B Test${Name}.cpp) - endforeach() + ) + list(APPEND STDALGO_TEAM_SOURCES_B Test${Name}.cpp) +endforeach() - # ------------------------------------------ - # std team A - # ------------------------------------------ - set(STDALGO_TEAM_SOURCES_A) - foreach(Name +# ------------------------------------------ +# std team A +# ------------------------------------------ +set(STDALGO_TEAM_SOURCES_A) +foreach(Name StdAlgorithmsCommon StdAlgorithmsTeamAdjacentFind StdAlgorithmsTeamCount @@ -341,11 +344,8 @@ foreach(Tag Threads;Serial;OpenMP;Cuda;HPX;HIP;SYCL;OpenMPTarget) StdAlgorithmsTeamForEachN StdAlgorithmsTeamLexicographicalCompare StdAlgorithmsTeamMismatch - ) - list(APPEND STDALGO_TEAM_SOURCES_A Test${Name}.cpp) - endforeach() - - endif() + ) + list(APPEND STDALGO_TEAM_SOURCES_A Test${Name}.cpp) endforeach() # FIXME_OPENMPTARGET - remove sort test as it leads to ICE with clang/16 and above at compile time. 
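# For context: the STDALGO_SOURCES_* / STDALGO_TEAM_SOURCES_* lists built
# above are consumed further down to define one test executable per group.
# A minimal sketch of that consumption (the macro and target names here are
# assumptions for illustration, not part of this diff):
#
#   KOKKOS_ADD_EXECUTABLE_AND_TEST(
#     AlgorithmsUnitTest_StdAlgos_Team_A
#     SOURCES TestStdAlgorithmsCommon.cpp ${STDALGO_TEAM_SOURCES_A}
#   )
#
# Splitting the Test${Name}.cpp files into fixed-size groups keeps the compile
# time and memory footprint of any single test target bounded when all
# backends in the surrounding foreach(Tag ...) loop are enabled.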
diff --git a/algorithms/unit_tests/Makefile b/algorithms/unit_tests/Makefile index 601217799a8..d3946c149ba 100644 --- a/algorithms/unit_tests/Makefile +++ b/algorithms/unit_tests/Makefile @@ -27,13 +27,13 @@ TARGETS = tmp := $(foreach device, $(KOKKOS_DEVICELIST), \ $(if $(filter Test$(device).cpp, $(shell ls Test$(device).cpp 2>/dev/null)),,\ - $(shell echo "\#include " > Test$(device).cpp); \ - $(shell echo "\#include " >> Test$(device).cpp); \ - $(shell echo "\#include " >> Test$(device).cpp); \ - $(shell echo "\#include " >> Test$(device).cpp); \ - $(shell echo "\#include " >> Test$(device).cpp); \ - $(shell echo "\#include " >> Test$(device).cpp); \ - $(shell echo "\#include " >> Test$(device).cpp); \ + $(shell echo "$(H)include " > Test$(device).cpp); \ + $(shell echo "$(H)include " >> Test$(device).cpp); \ + $(shell echo "$(H)include " >> Test$(device).cpp); \ + $(shell echo "$(H)include " >> Test$(device).cpp); \ + $(shell echo "$(H)include " >> Test$(device).cpp); \ + $(shell echo "$(H)include " >> Test$(device).cpp); \ + $(shell echo "$(H)include " >> Test$(device).cpp); \ ) \ ) diff --git a/algorithms/unit_tests/TestSortByKey.hpp b/algorithms/unit_tests/TestSortByKey.hpp new file mode 100644 index 00000000000..16f68eaaf26 --- /dev/null +++ b/algorithms/unit_tests/TestSortByKey.hpp @@ -0,0 +1,241 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOS_ALGORITHMS_UNITTESTS_TEST_SORT_BY_KEY_HPP +#define KOKKOS_ALGORITHMS_UNITTESTS_TEST_SORT_BY_KEY_HPP + +#include +#include +#include +#include + +#include // pair + +namespace Test { +namespace SortImpl { + +struct Less { + template + KOKKOS_INLINE_FUNCTION bool operator()(const ValueType &lhs, + const ValueType &rhs) const { + return lhs < rhs; + } +}; + +struct Greater { + template + KOKKOS_INLINE_FUNCTION bool operator()(const ValueType &lhs, + const ValueType &rhs) const { + return lhs > rhs; + } +}; + +template +struct is_sorted_by_key_struct { + Keys keys; + Keys keys_orig; + Permute permute; + Comparator comparator; + + is_sorted_by_key_struct(Keys keys_, Keys keys_orig_, Permute permute_, + Comparator comparator_ = Comparator{}) + : keys(keys_), + keys_orig(keys_orig_), + permute(permute_), + comparator(comparator_) {} + KOKKOS_INLINE_FUNCTION + void operator()(int i, unsigned int &count) const { + if (i < keys.extent_int(0) - 1 && comparator(keys(i + 1), keys(i))) ++count; + if (keys(i) != keys_orig(permute(i))) ++count; + } +}; + +template +void iota(ExecutionSpace const &space, ViewType const &v, + typename ViewType::value_type value = 0) { + using ValueType = typename ViewType::value_type; + Kokkos::parallel_for( + "ArborX::Algorithms::iota", + Kokkos::RangePolicy(space, 0, v.extent(0)), + KOKKOS_LAMBDA(int i) { v(i) = value + (ValueType)i; }); +} + +} // namespace SortImpl + +TEST(TEST_CATEGORY, SortByKeyEmptyView) { + using ExecutionSpace = TEST_EXECSPACE; + + // does not matter if we use int or something else + Kokkos::View keys("keys", 0); + Kokkos::View values("values", 0); + + ASSERT_NO_THROW( + Kokkos::Experimental::sort_by_key(ExecutionSpace(), keys, values)); +} + +TEST(TEST_CATEGORY, SortByKey) { + using ExecutionSpace = TEST_EXECSPACE; + using MemorySpace = typename ExecutionSpace::memory_space; + + ExecutionSpace space{}; + + for (auto keys_vector : {std::vector{36, 19, 25, 17, 3, 7, 1, 2, 9}, + std::vector{36, 19, 25, 17, 3, 9, 1, 2, 7}, + std::vector{100, 19, 36, 17, 3, 25, 1, 2, 7}, + std::vector{15, 5, 11, 3, 4, 8}}) { + auto const n = keys_vector.size(); + + auto keys = Kokkos::create_mirror_view_and_copy( + MemorySpace{}, + Kokkos::View( + keys_vector.data(), n)); + + auto keys_orig = Kokkos::create_mirror(space, keys); + Kokkos::deep_copy(space, keys_orig, keys); + + Kokkos::View permute("permute", n); + SortImpl::iota(space, permute); + + Kokkos::Experimental::sort_by_key(space, keys, permute); + + unsigned int sort_fails = 0; + Kokkos::parallel_reduce( + Kokkos::RangePolicy(space, 0, n), + SortImpl::is_sorted_by_key_struct(keys, keys_orig, + permute), + sort_fails); + + ASSERT_EQ(sort_fails, 0u); + } +} + +TEST(TEST_CATEGORY, SortByKeyWithComparator) { + using ExecutionSpace = TEST_EXECSPACE; + using MemorySpace = typename ExecutionSpace::memory_space; + + ExecutionSpace space{}; + + SortImpl::Greater comparator; + + for (auto keys_vector : {std::vector{36, 19, 25, 17, 3, 7, 1, 2, 9}, + std::vector{36, 19, 25, 17, 3, 9, 1, 2, 7}, + std::vector{100, 19, 36, 17, 3, 25, 1, 2, 7}, + std::vector{15, 5, 11, 3, 4, 8}}) { + auto const n = keys_vector.size(); + + auto keys = Kokkos::create_mirror_view_and_copy( + MemorySpace{}, + Kokkos::View( + keys_vector.data(), n)); + + auto keys_orig = Kokkos::create_mirror(space, keys); + Kokkos::deep_copy(space, keys_orig, keys); + + Kokkos::View permute("permute", n); + SortImpl::iota(space, permute); + + 
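// A gloss on the verification scheme used in these sort_by_key tests:
// sort_by_key reorders `keys` and applies the same permutation to the
// iota-filled `permute` view, so is_sorted_by_key_struct can check both
// sortedness and that keys(i) == keys_orig(permute(i)). A host-only analogue
// of that contract, using only std algorithms (all names below are local to
// this sketch):

#include <algorithm>
#include <cassert>
#include <numeric>
#include <vector>

int main() {
  std::vector<int> keys{36, 19, 25, 17, 3, 7, 1, 2, 9};
  const std::vector<int> keys_orig = keys;

  // identity permutation, reordered alongside the keys by "sort by key"
  std::vector<int> permute(keys.size());
  std::iota(permute.begin(), permute.end(), 0);
  std::sort(permute.begin(), permute.end(),
            [&](int a, int b) { return keys_orig[a] < keys_orig[b]; });
  for (std::size_t i = 0; i < keys.size(); ++i) keys[i] = keys_orig[permute[i]];

  // the two properties whose violations the test's reduction counts
  assert(std::is_sorted(keys.begin(), keys.end()));
  for (std::size_t i = 0; i < keys.size(); ++i)
    assert(keys[i] == keys_orig[permute[i]]);
}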
Kokkos::Experimental::sort_by_key(space, keys, permute, comparator); + + unsigned int sort_fails = 0; + Kokkos::parallel_reduce( + Kokkos::RangePolicy(space, 0, n), + SortImpl::is_sorted_by_key_struct( + keys, keys_orig, permute, comparator), + sort_fails); + + ASSERT_EQ(sort_fails, 0u); + } +} + +TEST(TEST_CATEGORY, SortByKeyStaticExtents) { + using ExecutionSpace = TEST_EXECSPACE; + + ExecutionSpace space{}; + + Kokkos::View keys("keys"); + + Kokkos::View values_static("values_static"); + ASSERT_NO_THROW( + Kokkos::Experimental::sort_by_key(space, keys, values_static)); + + Kokkos::View values_dynamic("values_dynamic", 10); + ASSERT_NO_THROW( + Kokkos::Experimental::sort_by_key(space, keys, values_dynamic)); +} + +template +void buildViewsForStrided(ExecutionSpace const &space, int n, Keys &keys, + Values &values) { + Kokkos::parallel_for( + "create_data", + Kokkos::MDRangePolicy, ExecutionSpace>(space, {0, 0, 0}, + {n, n, n}), + KOKKOS_LAMBDA(int i, int j, int k) { + keys(i, j, k) = n - i; + values(i, j, k) = j; + }); +} + +TEST(TEST_CATEGORY, SortByKeyWithStrides) { + using ExecutionSpace = TEST_EXECSPACE; + + ExecutionSpace space{}; + + auto const n = 10; + + Kokkos::View keys("keys", n, n, n); + Kokkos::View values("values", n, n, n); + buildViewsForStrided(space, n, keys, values); + + auto keys_sub = Kokkos::subview(keys, Kokkos::ALL(), 1, 2); + auto values_sub = Kokkos::subview(values, 4, Kokkos::ALL(), 6); + + auto keys_orig = Kokkos::create_mirror(space, keys_sub); + Kokkos::deep_copy(space, keys_orig, keys_sub); + + Kokkos::Experimental::sort_by_key(space, keys_sub, values_sub); + + unsigned int sort_fails = 0; + Kokkos::parallel_reduce( + Kokkos::RangePolicy(space, 0, n), + SortImpl::is_sorted_by_key_struct( + keys_sub, keys_orig, values_sub), + sort_fails); + + ASSERT_EQ(sort_fails, 0u); +} + +TEST(TEST_CATEGORY, SortByKeyKeysLargerThanValues) { + using ExecutionSpace = TEST_EXECSPACE; + + // does not matter if we use int or something else + Kokkos::View keys("keys", 3); + Kokkos::View values("values", 1); + + ASSERT_DEATH( + Kokkos::Experimental::sort_by_key(ExecutionSpace(), keys, values), + "values and keys extents must be the same"); + ASSERT_DEATH(Kokkos::Experimental::sort_by_key(ExecutionSpace(), keys, values, + SortImpl::Greater{}), + "values and keys extents must be the same"); +} + +} // namespace Test +#endif diff --git a/algorithms/unit_tests/TestStdAlgorithmsCommon.hpp b/algorithms/unit_tests/TestStdAlgorithmsCommon.hpp index 3eb963faf2d..67052e2f9d4 100644 --- a/algorithms/unit_tests/TestStdAlgorithmsCommon.hpp +++ b/algorithms/unit_tests/TestStdAlgorithmsCommon.hpp @@ -239,16 +239,8 @@ KOKKOS_FUNCTION bool team_members_have_matching_result( // set accum to 1 if a mismach is found const bool mismatch = memberValue != target; int accum = static_cast(mismatch); - // FIXME_OPENMPTARGET: team API does not meet the TeamHandle concept and - // ignores the reducer passed -#if defined KOKKOS_ENABLE_OPENMPTARGET - Kokkos::Sum dummyReducer(accum); - const auto result = teamHandle.team_reduce(accum, dummyReducer); - return (result == 0); -#else teamHandle.team_reduce(Kokkos::Sum(accum)); return (accum == 0); -#endif } template diff --git a/algorithms/unit_tests/TestStdAlgorithmsExclusiveScan.cpp b/algorithms/unit_tests/TestStdAlgorithmsExclusiveScan.cpp index 6ab68a1987d..b364c53a888 100644 --- a/algorithms/unit_tests/TestStdAlgorithmsExclusiveScan.cpp +++ b/algorithms/unit_tests/TestStdAlgorithmsExclusiveScan.cpp @@ -16,6 +16,7 @@ #include #include +#include namespace Test 
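// A side note on the SortByKeyWithStrides test above: rank-1 slices of a
// rank-3 view are generally *strided*, so that test exercises sort_by_key on
// non-contiguous data. Minimal sketch (assumes an installed Kokkos; names are
// local to this sketch):

#include <Kokkos_Core.hpp>

int main(int argc, char* argv[]) {
  Kokkos::ScopeGuard guard(argc, argv);
  Kokkos::View<int***> a("a", 10, 10, 10);
  // the second slice strides through memory whatever the layout of `a` is,
  // so sort_by_key cannot assume unit-stride input
  auto s0 = Kokkos::subview(a, Kokkos::ALL(), 1, 2);
  auto s1 = Kokkos::subview(a, 4, Kokkos::ALL(), 6);
  (void)s0;
  (void)s1;
}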
{ namespace stdalgos { @@ -132,47 +133,6 @@ void my_host_exclusive_scan(it1 first, it1 last, it2 dest, ValType init, } } -template -void verify_data(ViewType1 data_view, // contains data - ViewType2 test_view, // the view to test - ValueType init_value, BinaryOp bop) { - //! always careful because views might not be deep copyable - - auto data_view_dc = create_deep_copyable_compatible_clone(data_view); - auto data_view_h = - create_mirror_view_and_copy(Kokkos::HostSpace(), data_view_dc); - - using gold_view_value_type = typename ViewType2::value_type; - Kokkos::View gold_h( - "goldh", data_view.extent(0)); - my_host_exclusive_scan(KE::cbegin(data_view_h), KE::cend(data_view_h), - KE::begin(gold_h), init_value, bop); - - auto test_view_dc = create_deep_copyable_compatible_clone(test_view); - auto test_view_h = - create_mirror_view_and_copy(Kokkos::HostSpace(), test_view_dc); - if (test_view_h.extent(0) > 0) { - for (std::size_t i = 0; i < test_view_h.extent(0); ++i) { - // std::cout << i << " " << std::setprecision(15) << data_view_h(i) << " " - // << gold_h(i) << " " << test_view_h(i) << " " - // << std::abs(gold_h(i) - test_view_h(i)) << std::endl; - if (std::is_same::value) { - ASSERT_EQ(gold_h(i), test_view_h(i)); - } else { - const auto error = - std::abs(static_cast(gold_h(i) - test_view_h(i))); - if (error > 1e-10) { - std::cout << i << " " << std::setprecision(15) << data_view_h(i) - << " " << gold_h(i) << " " << test_view_h(i) << " " - << std::abs(static_cast(gold_h(i) - test_view_h(i))) - << std::endl; - } - EXPECT_LT(error, 1e-10); - } - } - } -} - template struct MultiplyFunctor { KOKKOS_INLINE_FUNCTION @@ -189,107 +149,153 @@ struct SumFunctor { } }; +struct VerifyData { + template + void operator()(ViewType1 data_view, // contains data + ViewType2 test_view, // the view to test + ValueType init_value, BinaryOp bop) { + //! always careful because views might not be deep copyable + + auto data_view_dc = create_deep_copyable_compatible_clone(data_view); + auto data_view_h = + create_mirror_view_and_copy(Kokkos::HostSpace(), data_view_dc); + + using gold_view_value_type = typename ViewType2::value_type; + Kokkos::View gold_h( + "goldh", data_view.extent(0)); + my_host_exclusive_scan(KE::cbegin(data_view_h), KE::cend(data_view_h), + KE::begin(gold_h), init_value, bop); + + auto test_view_dc = create_deep_copyable_compatible_clone(test_view); + auto test_view_h = + create_mirror_view_and_copy(Kokkos::HostSpace(), test_view_dc); + if (test_view_h.extent(0) > 0) { + for (std::size_t i = 0; i < test_view_h.extent(0); ++i) { + if (std::is_same::value) { + ASSERT_EQ(gold_h(i), test_view_h(i)); + } else { + const auto error = + std::abs(static_cast(gold_h(i) - test_view_h(i))); + ASSERT_LT(error, 1e-10) << i << " " << std::setprecision(15) << error + << static_cast(test_view_h(i)) << " " + << static_cast(gold_h(i)); + } + } + } + } + + template + void operator()(ViewType1 data_view, // contains data + ViewType2 test_view, // the view to test + ValueType init_value) { + (*this)(data_view, test_view, init_value, SumFunctor()); + } +}; + std::string value_type_to_string(int) { return "int"; } std::string value_type_to_string(double) { return "double"; } -template -void run_single_scenario_default_op(const InfoType& scenario_info, - ValueType init_value) { - using default_op = SumFunctor; +template +void run_single_scenario(const InfoType& scenario_info, ValueType init_value, + OpOrEmpty... 
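// For reference, the gold sequence that my_host_exclusive_scan/VerifyData
// compare against follows the usual exclusive-scan semantics: element i is
// init combined with the inputs before position i. Host-only illustration,
// independent of Kokkos:

#include <cassert>
#include <numeric>
#include <vector>

int main() {
  std::vector<int> in{1, 2, 3, 4};
  std::vector<int> out(in.size());
  std::exclusive_scan(in.begin(), in.end(), out.begin(), 10);  // init = 10
  assert(out == (std::vector<int>{10, 11, 13, 16}));
}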
empty_or_op) { const auto name = std::get<0>(scenario_info); const std::size_t view_ext = std::get<1>(scenario_info); - // std::cout << "exclusive_scan default op: " << name << ", " - // << view_tag_to_string(Tag{}) << ", " - // << value_type_to_string(ValueType()) << ", " - // << "init = " << init_value << std::endl; auto view_dest = create_view(Tag{}, view_ext, "exclusive_scan"); auto view_from = create_view(Tag{}, view_ext, "exclusive_scan"); fill_view(view_from, name); + // view_dest is filled with zeros before calling the algorithm every time to + // ensure the algorithm does something meaningful { fill_zero(view_dest); auto r = KE::exclusive_scan(exespace(), KE::cbegin(view_from), KE::cend(view_from), KE::begin(view_dest), - init_value); + init_value, empty_or_op...); ASSERT_EQ(r, KE::end(view_dest)); - verify_data(view_from, view_dest, init_value, default_op()); + VerifyData()(view_from, view_dest, init_value, empty_or_op...); } { fill_zero(view_dest); auto r = KE::exclusive_scan("label", exespace(), KE::cbegin(view_from), KE::cend(view_from), KE::begin(view_dest), - init_value); + init_value, empty_or_op...); ASSERT_EQ(r, KE::end(view_dest)); - verify_data(view_from, view_dest, init_value, default_op()); + VerifyData()(view_from, view_dest, init_value, empty_or_op...); } { fill_zero(view_dest); - auto r = KE::exclusive_scan(exespace(), view_from, view_dest, init_value); + auto r = KE::exclusive_scan(exespace(), view_from, view_dest, init_value, + empty_or_op...); ASSERT_EQ(r, KE::end(view_dest)); - verify_data(view_from, view_dest, init_value, default_op()); + VerifyData()(view_from, view_dest, init_value, empty_or_op...); } { fill_zero(view_dest); auto r = KE::exclusive_scan("label", exespace(), view_from, view_dest, - init_value); + init_value, empty_or_op...); ASSERT_EQ(r, KE::end(view_dest)); - verify_data(view_from, view_dest, init_value, default_op()); + VerifyData()(view_from, view_dest, init_value, empty_or_op...); } Kokkos::fence(); } -template -void run_single_scenario_custom_op(const InfoType& scenario_info, - ValueType init_value, BinaryOp bop) { +template +void run_single_scenario_inplace(const InfoType& scenario_info, + ValueType init_value, + OpOrEmpty...
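// One note before the body of run_single_scenario_inplace below: the in-place
// scenarios pass the same view as both input and destination, and that
// aliasing is well-defined for the std scans too, which is the behavior being
// mirrored. Host-only sketch:

#include <cassert>
#include <numeric>
#include <vector>

int main() {
  std::vector<int> v{1, 2, 3};
  std::inclusive_scan(v.begin(), v.end(), v.begin());  // dest aliases input
  assert(v == (std::vector<int>{1, 3, 6}));
}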
empty_or_op) { const auto name = std::get<0>(scenario_info); const std::size_t view_ext = std::get<1>(scenario_info); - // std::cout << "exclusive_scan custom op: " << name << ", " - // << view_tag_to_string(Tag{}) << ", " - // << value_type_to_string(ValueType()) << ", " - // << "init = " << init_value << std::endl; - auto view_dest = create_view(Tag{}, view_ext, "exclusive_scan"); - auto view_from = create_view(Tag{}, view_ext, "exclusive_scan"); - fill_view(view_from, name); + // since here we call the in-place operation, we need to use two views: + // view1: filled according to what the scenario asks for and is not modified + // view2: filled according to what the scenario asks for and used for the + // in-place op. Therefore, after the op is done, view2 should contain the + // result of doing an exclusive scan. NOTE: view2 is filled below every time + // because the algorithm acts in place + + auto view1 = + create_view(Tag{}, view_ext, "exclusive_scan_inplace_view1"); + fill_view(view1, name); + auto view2 = + create_view(Tag{}, view_ext, "exclusive_scan_inplace_view2"); { - fill_zero(view_dest); - auto r = KE::exclusive_scan(exespace(), KE::cbegin(view_from), - KE::cend(view_from), KE::begin(view_dest), - init_value, bop); - ASSERT_EQ(r, KE::end(view_dest)); - verify_data(view_from, view_dest, init_value, bop); + fill_view(view2, name); + auto r = KE::exclusive_scan(exespace(), KE::cbegin(view2), KE::cend(view2), + KE::begin(view2), init_value, empty_or_op...); + ASSERT_EQ(r, KE::end(view2)); + VerifyData()(view1, view2, init_value, empty_or_op...); } { - fill_zero(view_dest); - auto r = KE::exclusive_scan("label", exespace(), KE::cbegin(view_from), - KE::cend(view_from), KE::begin(view_dest), - init_value, bop); - ASSERT_EQ(r, KE::end(view_dest)); - verify_data(view_from, view_dest, init_value, bop); + fill_view(view2, name); + auto r = KE::exclusive_scan("label", exespace(), KE::cbegin(view2), + KE::cend(view2), KE::begin(view2), init_value, + empty_or_op...); + ASSERT_EQ(r, KE::end(view2)); + VerifyData()(view1, view2, init_value, empty_or_op...); } { - fill_zero(view_dest); - auto r = - KE::exclusive_scan(exespace(), view_from, view_dest, init_value, bop); - ASSERT_EQ(r, KE::end(view_dest)); - verify_data(view_from, view_dest, init_value, bop); + fill_view(view2, name); + auto r = KE::exclusive_scan(exespace(), view2, view2, init_value, + empty_or_op...); + ASSERT_EQ(r, KE::end(view2)); + VerifyData()(view1, view2, init_value, empty_or_op...); } { - fill_zero(view_dest); - auto r = KE::exclusive_scan("label", exespace(), view_from, view_dest, - init_value, bop); - ASSERT_EQ(r, KE::end(view_dest)); - verify_data(view_from, view_dest, init_value, bop); + fill_view(view2, name); + auto r = KE::exclusive_scan("label", exespace(), view2, view2, init_value, + empty_or_op...); + ASSERT_EQ(r, KE::end(view2)); + VerifyData()(view1, view2, init_value, empty_or_op...); } Kokkos::fence(); @@ -303,34 +309,39 @@ void run_exclusive_scan_all_scenarios() { {"medium", 1103}, {"large", 10513}}; for (const auto& it : scenarios) { - run_single_scenario_default_op(it, ValueType{0}); - run_single_scenario_default_op(it, ValueType{1}); - run_single_scenario_default_op(it, ValueType{-2}); - run_single_scenario_default_op(it, ValueType{3}); + run_single_scenario(it, ValueType{0}); + run_single_scenario(it, ValueType{1}); + run_single_scenario(it, ValueType{-2}); + run_single_scenario(it, ValueType{3}); + + run_single_scenario_inplace(it, ValueType{0}); + run_single_scenario_inplace(it, ValueType{-2}); #if
!defined KOKKOS_ENABLE_OPENMPTARGET // custom multiply op is only run for small views otherwise it overflows if (it.first == "small-a" || it.first == "small-b") { using custom_bop_t = MultiplyFunctor; - run_single_scenario_custom_op(it, ValueType{0}, - custom_bop_t()); - run_single_scenario_custom_op(it, ValueType{1}, - custom_bop_t()); - run_single_scenario_custom_op(it, ValueType{-2}, - custom_bop_t()); - run_single_scenario_custom_op(it, ValueType{3}, - custom_bop_t()); - } + run_single_scenario(it, ValueType{0}, custom_bop_t()); + run_single_scenario(it, ValueType{1}, custom_bop_t()); + run_single_scenario(it, ValueType{-2}, custom_bop_t()); + run_single_scenario(it, ValueType{3}, custom_bop_t()); - using custom_bop_t = SumFunctor; - run_single_scenario_custom_op(it, ValueType{0}, - custom_bop_t()); - run_single_scenario_custom_op(it, ValueType{1}, + run_single_scenario_inplace(it, ValueType{0}, custom_bop_t()); - run_single_scenario_custom_op(it, ValueType{-2}, - custom_bop_t()); - run_single_scenario_custom_op(it, ValueType{3}, + run_single_scenario_inplace(it, ValueType{-2}, custom_bop_t()); + } + + using custom_bop_t = SumFunctor; + run_single_scenario(it, ValueType{0}, custom_bop_t()); + run_single_scenario(it, ValueType{1}, custom_bop_t()); + run_single_scenario(it, ValueType{-2}, custom_bop_t()); + run_single_scenario(it, ValueType{3}, custom_bop_t()); + + run_single_scenario_inplace(it, ValueType{0}, + custom_bop_t()); + run_single_scenario_inplace(it, ValueType{-2}, + custom_bop_t()); #endif } } diff --git a/algorithms/unit_tests/TestStdAlgorithmsInclusiveScan.cpp b/algorithms/unit_tests/TestStdAlgorithmsInclusiveScan.cpp index 8e60a43e5ff..a08a7372108 100644 --- a/algorithms/unit_tests/TestStdAlgorithmsInclusiveScan.cpp +++ b/algorithms/unit_tests/TestStdAlgorithmsInclusiveScan.cpp @@ -16,6 +16,7 @@ #include #include +#include namespace Test { namespace stdalgos { @@ -143,51 +144,6 @@ void my_host_inclusive_scan(it1 first, it1 last, it2 dest, BinOp bop, } } -template -void verify_data(ViewType1 data_view, // contains data - ViewType2 test_view, // the view to test - BinaryOp bop, Args... args /* copy on purpose */) { - //! 
always careful because views might not be deep copyable - - auto data_view_dc = create_deep_copyable_compatible_clone(data_view); - auto data_view_h = - create_mirror_view_and_copy(Kokkos::HostSpace(), data_view_dc); - - using gold_view_value_type = typename ViewType2::value_type; - Kokkos::View gold_h( - "goldh", data_view.extent(0)); - my_host_inclusive_scan(KE::cbegin(data_view_h), KE::cend(data_view_h), - KE::begin(gold_h), bop, args...); - - auto test_view_dc = create_deep_copyable_compatible_clone(test_view); - auto test_view_h = - create_mirror_view_and_copy(Kokkos::HostSpace(), test_view_dc); - - const auto ext = test_view_h.extent(0); - if (ext > 0) { - for (std::size_t i = 0; i < ext; ++i) { - // std::cout << i << " " << std::setprecision(15) << data_view_h(i) << " " - // << gold_h(i) << " " << test_view_h(i) << " " - // << std::abs(gold_h(i) - test_view_h(i)) << std::endl; - - if (std::is_same::value) { - ASSERT_EQ(gold_h(i), test_view_h(i)); - } else { - const auto error = - std::abs(static_cast(gold_h(i) - test_view_h(i))); - if (error > 1e-10) { - std::cout << i << " " << std::setprecision(15) << data_view_h(i) - << " " << gold_h(i) << " " << test_view_h(i) << " " - << std::abs(static_cast(gold_h(i) - test_view_h(i))) - << std::endl; - } - EXPECT_LT(error, 1e-10); - } - } - // std::cout << " last el: " << test_view_h(ext-1) << std::endl; - } -} - template struct MultiplyFunctor { KOKKOS_INLINE_FUNCTION @@ -204,107 +160,151 @@ struct SumFunctor { } }; +struct VerifyData { + template + void operator()(ViewType1 data_view, // contains data + ViewType2 test_view, // the view to test + BinaryOp bop, Args... args /* copy on purpose */) { + //! always careful because views might not be deep copyable + + auto data_view_dc = create_deep_copyable_compatible_clone(data_view); + auto data_view_h = + create_mirror_view_and_copy(Kokkos::HostSpace(), data_view_dc); + + using gold_view_value_type = typename ViewType2::value_type; + Kokkos::View gold_h( + "goldh", data_view.extent(0)); + my_host_inclusive_scan(KE::cbegin(data_view_h), KE::cend(data_view_h), + KE::begin(gold_h), bop, args...); + + auto test_view_dc = create_deep_copyable_compatible_clone(test_view); + auto test_view_h = + create_mirror_view_and_copy(Kokkos::HostSpace(), test_view_dc); + + const auto ext = test_view_h.extent(0); + if (ext > 0) { + for (std::size_t i = 0; i < ext; ++i) { + if (std::is_same::value) { + ASSERT_EQ(gold_h(i), test_view_h(i)); + } else { + const auto error = + std::abs(static_cast(gold_h(i) - test_view_h(i))); + ASSERT_LT(error, 1e-10) << i << " " << std::setprecision(15) << error + << static_cast(test_view_h(i)) << " " + << static_cast(gold_h(i)); + } + } + } + } + + template + void operator()(ViewType1 data_view, // contains data + ViewType2 test_view) // the view to test + { + using value_type = typename ViewType1::non_const_value_type; + (*this)(data_view, test_view, SumFunctor()); + } +}; + std::string value_type_to_string(int) { return "int"; } std::string value_type_to_string(double) { return "double"; } -template -void run_single_scenario_default_op(const InfoType& scenario_info) { - using default_op = SumFunctor; +template +void run_single_scenario(const InfoType& scenario_info, + Args... 
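// As in the exclusive-scan test, the inclusive gold values computed by
// my_host_inclusive_scan follow the std semantics; note that an optional init
// value folds into every element of an inclusive scan. Host-only
// illustration, independent of Kokkos:

#include <cassert>
#include <functional>
#include <numeric>
#include <vector>

int main() {
  std::vector<int> in{1, 2, 3, 4};
  std::vector<int> out(in.size());
  std::inclusive_scan(in.begin(), in.end(), out.begin(), std::plus<>{}, 10);
  assert(out == (std::vector<int>{11, 13, 16, 20}));
}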
args /* copy on purpose */) { const auto name = std::get<0>(scenario_info); const std::size_t view_ext = std::get<1>(scenario_info); - // std::cout << "inclusive_scan default op: " << name << ", " - // << view_tag_to_string(Tag{}) << ", " - // << value_type_to_string(ValueType()) << std::endl; auto view_dest = create_view(Tag{}, view_ext, "inclusive_scan"); auto view_from = create_view(Tag{}, view_ext, "inclusive_scan"); fill_view(view_from, name); + // view_dest is filled with zeros before calling the algorithm every time to + // ensure the algorithm does something meaningful { fill_zero(view_dest); - auto r = KE::inclusive_scan(exespace(), KE::cbegin(view_from), - KE::cend(view_from), KE::begin(view_dest)); + auto r = + KE::inclusive_scan(exespace(), KE::cbegin(view_from), + KE::cend(view_from), KE::begin(view_dest), args...); ASSERT_EQ(r, KE::end(view_dest)); - verify_data(view_from, view_dest, default_op()); + VerifyData()(view_from, view_dest, args...); } { fill_zero(view_dest); - auto r = KE::inclusive_scan("label", exespace(), KE::cbegin(view_from), - KE::cend(view_from), KE::begin(view_dest)); + auto r = + KE::inclusive_scan("label", exespace(), KE::cbegin(view_from), + KE::cend(view_from), KE::begin(view_dest), args...); ASSERT_EQ(r, KE::end(view_dest)); - verify_data(view_from, view_dest, default_op()); + VerifyData()(view_from, view_dest, args...); } { fill_zero(view_dest); - auto r = KE::inclusive_scan(exespace(), view_from, view_dest); + auto r = KE::inclusive_scan(exespace(), view_from, view_dest, args...); ASSERT_EQ(r, KE::end(view_dest)); - verify_data(view_from, view_dest, default_op()); + VerifyData()(view_from, view_dest, args...); } { fill_zero(view_dest); - auto r = KE::inclusive_scan("label", exespace(), view_from, view_dest); + auto r = + KE::inclusive_scan("label", exespace(), view_from, view_dest, args...); ASSERT_EQ(r, KE::end(view_dest)); - verify_data(view_from, view_dest, default_op()); + VerifyData()(view_from, view_dest, args...); } Kokkos::fence(); } -template -void run_single_scenario_custom_op(const InfoType& scenario_info, BinaryOp bop, - Args... args /* copy on purpose */) { +template +void run_single_scenario_inplace(const InfoType& scenario_info, + Args...
args /* copy on purpose */) { const auto name = std::get<0>(scenario_info); const std::size_t view_ext = std::get<1>(scenario_info); - // if (1 == sizeof...(Args)) { - // std::cout << "inclusive_scan custom op and init value: " << name << ", " - // << view_tag_to_string(Tag{}) << ", " - // << value_type_to_string(ValueType()) << ", " << std::endl; - // } else { - // std::cout << "inclusive_scan custom op: " << name << ", " - // << view_tag_to_string(Tag{}) << ", " - // << value_type_to_string(ValueType()) << ", " << std::endl; - // } + // since here we call the in-place operation, we need to use two views: + // view1: filled according to what the scenario asks for and is not modified + // view2: filled according to what the scenario asks for and used for the + // in-place op. Therefore, after the op is done, view2 should contain the + // result of doing an inclusive scan. NOTE: view2 is filled below every time + // because the algorithm acts in place - auto view_dest = create_view(Tag{}, view_ext, "inclusive_scan"); - auto view_from = create_view(Tag{}, view_ext, "inclusive_scan"); - fill_view(view_from, name); + auto view1 = + create_view(Tag{}, view_ext, "inclusive_scan_inplace_view1"); + fill_view(view1, name); + + auto view2 = + create_view(Tag{}, view_ext, "inclusive_scan_inplace_view2"); { - fill_zero(view_dest); - auto r = KE::inclusive_scan(exespace(), KE::cbegin(view_from), - KE::cend(view_from), KE::begin(view_dest), bop, - args...); - ASSERT_EQ(r, KE::end(view_dest)); - verify_data(view_from, view_dest, bop, args...); + fill_view(view2, name); + auto r = KE::inclusive_scan(exespace(), KE::cbegin(view2), KE::cend(view2), + KE::begin(view2), args...); + ASSERT_EQ(r, KE::end(view2)); + VerifyData()(view1, view2, args...); } { - fill_zero(view_dest); - auto r = KE::inclusive_scan("label", exespace(), KE::cbegin(view_from), - KE::cend(view_from), KE::begin(view_dest), bop, - args...); - ASSERT_EQ(r, KE::end(view_dest)); - verify_data(view_from, view_dest, bop, args...); + fill_view(view2, name); + auto r = KE::inclusive_scan("label", exespace(), KE::cbegin(view2), + KE::cend(view2), KE::begin(view2), args...); + ASSERT_EQ(r, KE::end(view2)); + VerifyData()(view1, view2, args...); } { - fill_zero(view_dest); - auto r = KE::inclusive_scan(exespace(), view_from, view_dest, bop, args...); - ASSERT_EQ(r, KE::end(view_dest)); - verify_data(view_from, view_dest, bop, args...); + fill_view(view2, name); + auto r = KE::inclusive_scan(exespace(), view2, view2, args...); + ASSERT_EQ(r, KE::end(view2)); + VerifyData()(view1, view2, args...); } { - fill_zero(view_dest); - auto r = KE::inclusive_scan("label", exespace(), view_from, view_dest, bop, - args...); - ASSERT_EQ(r, KE::end(view_dest)); - verify_data(view_from, view_dest, bop, args...); + fill_view(view2, name); + auto r = KE::inclusive_scan("label", exespace(), view2, view2, args...); + ASSERT_EQ(r, KE::end(view2)); + VerifyData()(view1, view2, args...); } Kokkos::fence(); @@ -318,27 +318,35 @@ void run_inclusive_scan_all_scenarios() { {"medium-a", 313}, {"medium-b", 1103}, {"large", 10513}}; for (const auto& it : scenarios) { - run_single_scenario_default_op(it); + run_single_scenario(it); + run_single_scenario_inplace(it); #if !defined KOKKOS_ENABLE_OPENMPTARGET // the sum custom op is always run using sum_binary_op = SumFunctor; sum_binary_op sbop; - run_single_scenario_custom_op(it, sbop); - run_single_scenario_custom_op(it, sbop, ValueType{0}); - run_single_scenario_custom_op(it, sbop, ValueType{1}); - run_single_scenario_custom_op(it,
sbop, ValueType{-2}); - run_single_scenario_custom_op(it, sbop, ValueType{3}); + run_single_scenario(it, sbop); + run_single_scenario(it, sbop, ValueType{0}); + run_single_scenario(it, sbop, ValueType{1}); + run_single_scenario(it, sbop, ValueType{-2}); + run_single_scenario(it, sbop, ValueType{3}); + + run_single_scenario_inplace(it, sbop, ValueType{0}); + run_single_scenario_inplace(it, sbop, ValueType{-2}); // custom multiply only for small views to avoid overflows if (it.first == "small-a" || it.first == "small-b") { using mult_binary_op = MultiplyFunctor; mult_binary_op mbop; - run_single_scenario_custom_op(it, mbop); - run_single_scenario_custom_op(it, mbop, ValueType{0}); - run_single_scenario_custom_op(it, mbop, ValueType{1}); - run_single_scenario_custom_op(it, mbop, ValueType{-2}); - run_single_scenario_custom_op(it, mbop, ValueType{3}); + run_single_scenario(it, mbop); + run_single_scenario(it, mbop, ValueType{0}); + run_single_scenario(it, mbop, ValueType{1}); + run_single_scenario(it, mbop, ValueType{-2}); + run_single_scenario(it, mbop, ValueType{3}); + + run_single_scenario_inplace(it, mbop); + run_single_scenario_inplace(it, mbop, ValueType{0}); + run_single_scenario_inplace(it, mbop, ValueType{-2}); } #endif } diff --git a/algorithms/unit_tests/TestStdAlgorithmsIsSorted.cpp b/algorithms/unit_tests/TestStdAlgorithmsIsSorted.cpp index f31d49e06b4..75d4f0afebc 100644 --- a/algorithms/unit_tests/TestStdAlgorithmsIsSorted.cpp +++ b/algorithms/unit_tests/TestStdAlgorithmsIsSorted.cpp @@ -146,7 +146,7 @@ void run_single_scenario(const InfoType& scenario_info) { resultsA[3] = KE::is_sorted("label", exespace(), view); const auto allA = std::all_of(resultsA.cbegin(), resultsA.cend(), [=](bool v) { return v == gold; }); - EXPECT_TRUE(allA); + EXPECT_TRUE(allA) << name << ", " << view_tag_to_string(Tag{}); #if !defined KOKKOS_ENABLE_OPENMPTARGET CustomLessThanComparator comp; @@ -159,7 +159,7 @@ void run_single_scenario(const InfoType& scenario_info) { resultsB[3] = KE::is_sorted("label", exespace(), view, comp); const auto allB = std::all_of(resultsB.cbegin(), resultsB.cend(), [=](bool v) { return v == gold; }); - EXPECT_TRUE(allB); + EXPECT_TRUE(allB) << name << ", " << view_tag_to_string(Tag{}); #endif Kokkos::fence(); @@ -173,9 +173,6 @@ void run_is_sorted_all_scenarios() { {"medium-a", 1003}, {"medium-b", 1003}, {"large-a", 101513}, {"large-b", 101513}}; - std::cout << "is_sorted: " << view_tag_to_string(Tag{}) - << ", all overloads \n"; - for (const auto& it : scenarios) { run_single_scenario(it); } diff --git a/algorithms/unit_tests/TestStdAlgorithmsIsSortedUntil.cpp b/algorithms/unit_tests/TestStdAlgorithmsIsSortedUntil.cpp index dcfe8ad67e1..29ac7cc9bc1 100644 --- a/algorithms/unit_tests/TestStdAlgorithmsIsSortedUntil.cpp +++ b/algorithms/unit_tests/TestStdAlgorithmsIsSortedUntil.cpp @@ -145,10 +145,10 @@ void run_single_scenario(const InfoType& scenario_info) { KE::is_sorted_until("label", exespace(), KE::begin(view), KE::end(view)); auto r3 = KE::is_sorted_until(exespace(), view); auto r4 = KE::is_sorted_until("label", exespace(), view); - ASSERT_EQ(r1, gold); - ASSERT_EQ(r2, gold); - ASSERT_EQ(r3, gold); - ASSERT_EQ(r4, gold); + ASSERT_EQ(r1, gold) << name << ", " << view_tag_to_string(Tag{}); + ASSERT_EQ(r2, gold) << name << ", " << view_tag_to_string(Tag{}); + ASSERT_EQ(r3, gold) << name << ", " << view_tag_to_string(Tag{}); + ASSERT_EQ(r4, gold) << name << ", " << view_tag_to_string(Tag{}); #if !defined KOKKOS_ENABLE_OPENMPTARGET CustomLessThanComparator comp; @@ 
-160,10 +160,10 @@ void run_single_scenario(const InfoType& scenario_info) { auto r8 = KE::is_sorted_until("label", exespace(), view, comp); #endif - ASSERT_EQ(r1, gold); - ASSERT_EQ(r2, gold); - ASSERT_EQ(r3, gold); - ASSERT_EQ(r4, gold); + ASSERT_EQ(r1, gold) << name << ", " << view_tag_to_string(Tag{}); + ASSERT_EQ(r2, gold) << name << ", " << view_tag_to_string(Tag{}); + ASSERT_EQ(r3, gold) << name << ", " << view_tag_to_string(Tag{}); + ASSERT_EQ(r4, gold) << name << ", " << view_tag_to_string(Tag{}); Kokkos::fence(); } @@ -176,9 +176,6 @@ void run_is_sorted_until_all_scenarios() { {"medium-a", 1003}, {"medium-b", 1003}, {"large-a", 101513}, {"large-b", 101513}}; - std::cout << "is_sorted_until: " << view_tag_to_string(Tag{}) - << ", all overloads \n"; - for (const auto& it : scenarios) { run_single_scenario(it); } diff --git a/algorithms/unit_tests/TestStdAlgorithmsModOps.cpp b/algorithms/unit_tests/TestStdAlgorithmsModOps.cpp index 4604764097e..1b1a02f39c4 100644 --- a/algorithms/unit_tests/TestStdAlgorithmsModOps.cpp +++ b/algorithms/unit_tests/TestStdAlgorithmsModOps.cpp @@ -48,7 +48,7 @@ struct MyMovableType { TEST(std_algorithms_mod_ops_test, move) { MyMovableType a; using move_t = decltype(std::move(a)); - static_assert(std::is_rvalue_reference::value, ""); + static_assert(std::is_rvalue_reference::value); // move constr MyMovableType b(std::move(a)); @@ -70,7 +70,7 @@ struct StdAlgoModSeqOpsTestMove { void operator()(const int index) const { typename ViewType::value_type a{11}; using move_t = decltype(std::move(a)); - static_assert(std::is_rvalue_reference::value, ""); + static_assert(std::is_rvalue_reference::value); m_view(index) = std::move(a); } @@ -89,50 +89,6 @@ TEST(std_algorithms_mod_ops_test, move_within_parfor) { } } -// ------------ -// swap -// ------------ -TEST(std_algorithms_mod_ops_test, swap) { - { - int a = 1; - int b = 2; - KE::swap(a, b); - ASSERT_EQ(a, 2); - ASSERT_EQ(b, 1); - } - - { - double a = 3.; - double b = 1.; - KE::swap(a, b); - EXPECT_DOUBLE_EQ(a, 1.); - EXPECT_DOUBLE_EQ(b, 3.); - } -} - -template -struct StdAlgoModSeqOpsTestSwap { - ViewType m_view; - - KOKKOS_INLINE_FUNCTION - void operator()(const int index) const { - typename ViewType::value_type newval{11}; - KE::swap(m_view(index), newval); - } - - StdAlgoModSeqOpsTestSwap(ViewType aIn) : m_view(aIn) {} -}; - -TEST(std_algorithms_mod_ops_test, swap_within_parfor) { - auto a = create_view(stdalgos::DynamicTag{}, 10, "a"); - StdAlgoModSeqOpsTestSwap fnc(a); - Kokkos::parallel_for(a.extent(0), fnc); - auto a_h = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), a); - for (std::size_t i = 0; i < a.extent(0); ++i) { - EXPECT_DOUBLE_EQ(a_h(0), 11.); - } -} - // ------------ // iter_swap // ------------ diff --git a/algorithms/unit_tests/TestStdAlgorithmsPartitionCopy.cpp b/algorithms/unit_tests/TestStdAlgorithmsPartitionCopy.cpp index f169fd9ce88..a36c9db2b9e 100644 --- a/algorithms/unit_tests/TestStdAlgorithmsPartitionCopy.cpp +++ b/algorithms/unit_tests/TestStdAlgorithmsPartitionCopy.cpp @@ -110,11 +110,9 @@ void verify_data(const std::string& name, ResultType my_result, ViewTypeDestFalse view_dest_false, PredType pred) { using value_type = typename ViewTypeFrom::value_type; static_assert( - std::is_same::value, - ""); + std::is_same::value); static_assert( - std::is_same::value, - ""); + std::is_same::value); const std::size_t ext = view_from.extent(0); diff --git a/algorithms/unit_tests/TestStdAlgorithmsTeamCopyIf.cpp b/algorithms/unit_tests/TestStdAlgorithmsTeamCopyIf.cpp index 
b5aa27c7c38..7c3c465dc8d 100644 --- a/algorithms/unit_tests/TestStdAlgorithmsTeamCopyIf.cpp +++ b/algorithms/unit_tests/TestStdAlgorithmsTeamCopyIf.cpp @@ -166,6 +166,10 @@ void run_all_scenarios() { } TEST(std_algorithms_copy_if_team_test, test) { +// FIXME_OPENMPTARGET +#if defined(KOKKOS_ENABLE_OPENMPTARGET) && defined(KOKKOS_ARCH_INTEL_GPU) + GTEST_SKIP() << "the test is known to fail with OpenMPTarget on Intel GPUs"; +#endif run_all_scenarios(); run_all_scenarios(); run_all_scenarios(); diff --git a/algorithms/unit_tests/TestStdAlgorithmsTeamExclusiveScan.cpp b/algorithms/unit_tests/TestStdAlgorithmsTeamExclusiveScan.cpp index c6b2566c6cf..2c8fee02f47 100644 --- a/algorithms/unit_tests/TestStdAlgorithmsTeamExclusiveScan.cpp +++ b/algorithms/unit_tests/TestStdAlgorithmsTeamExclusiveScan.cpp @@ -121,7 +121,9 @@ struct TestFunctorA { } }; -template +struct InPlace {}; + +template void test_A(std::size_t numTeams, std::size_t numCols, int apiId) { /* description: use a rank-2 view randomly filled with values, @@ -147,9 +149,6 @@ void test_A(std::size_t numTeams, std::size_t numCols, int apiId) { using space_t = Kokkos::DefaultExecutionSpace; Kokkos::TeamPolicy policy(numTeams, Kokkos::AUTO()); - // create the destination view - Kokkos::View destView("destView", numTeams, numCols); - // exclusive_scan returns an iterator so to verify that it is correct // each team stores the distance of the returned iterator from the beginning // of the interval that team operates on and then we check that these @@ -168,12 +167,19 @@ void test_A(std::size_t numTeams, std::size_t numCols, int apiId) { rand_pool pool(lowerBound * upperBound); Kokkos::fill_random(initValuesView_h, pool, lowerBound, upperBound); - // use CTAD for functor auto initValuesView = Kokkos::create_mirror_view_and_copy(space_t(), initValuesView_h); - TestFunctorA fnc(sourceView, destView, distancesView, intraTeamSentinelView, - initValuesView, binaryOp, apiId); - Kokkos::parallel_for(policy, fnc); + + Kokkos::View destView("destView", numTeams, numCols); + if constexpr (std::is_same_v) { + TestFunctorA fnc(sourceView, sourceView, distancesView, + intraTeamSentinelView, initValuesView, binaryOp, apiId); + Kokkos::parallel_for(policy, fnc); + } else { + TestFunctorA fnc(sourceView, destView, distancesView, intraTeamSentinelView, + initValuesView, binaryOp, apiId); + Kokkos::parallel_for(policy, fnc); + } // ----------------------------------------------- // run cpp-std kernel and check @@ -223,11 +229,16 @@ void test_A(std::size_t numTeams, std::size_t numCols, int apiId) { #undef exclusive_scan } - auto dataViewAfterOp_h = create_host_space_copy(destView); - expect_equal_host_views(stdDestView, dataViewAfterOp_h); + if constexpr (std::is_same_v) { + auto dataViewAfterOp_h = create_host_space_copy(sourceView); + expect_equal_host_views(stdDestView, dataViewAfterOp_h); + } else { + auto dataViewAfterOp_h = create_host_space_copy(destView); + expect_equal_host_views(stdDestView, dataViewAfterOp_h); + } } -template +template void run_all_scenarios() { for (int numTeams : teamSizesToTest) { for (const auto& numCols : {0, 1, 2, 13, 101, 1444, 8153}) { @@ -236,16 +247,24 @@ void run_all_scenarios() { #else for (int apiId : {0, 1}) { #endif - test_A(numTeams, numCols, apiId); + test_A(numTeams, numCols, apiId); } } } } TEST(std_algorithms_exclusive_scan_team_test, test) { +// FIXME_OPENMPTARGET +#if defined(KOKKOS_ENABLE_OPENMPTARGET) && defined(KOKKOS_ARCH_INTEL_GPU) + GTEST_SKIP() << "the test is known to fail with OpenMPTarget on Intel 
GPUs"; +#endif run_all_scenarios(); run_all_scenarios(); run_all_scenarios(); + + run_all_scenarios(); + run_all_scenarios(); + run_all_scenarios(); } } // namespace TeamExclusiveScan diff --git a/algorithms/unit_tests/TestStdAlgorithmsTeamInclusiveScan.cpp b/algorithms/unit_tests/TestStdAlgorithmsTeamInclusiveScan.cpp index 0daf9dbfe82..b5f4cdd6123 100644 --- a/algorithms/unit_tests/TestStdAlgorithmsTeamInclusiveScan.cpp +++ b/algorithms/unit_tests/TestStdAlgorithmsTeamInclusiveScan.cpp @@ -139,7 +139,9 @@ struct TestFunctorA { } }; -template +struct InPlace {}; + +template void test_A(std::size_t numTeams, std::size_t numCols, int apiId) { /* description: use a rank-2 view randomly filled with values, @@ -165,9 +167,6 @@ void test_A(std::size_t numTeams, std::size_t numCols, int apiId) { using space_t = Kokkos::DefaultExecutionSpace; Kokkos::TeamPolicy policy(numTeams, Kokkos::AUTO()); - // create the destination view - Kokkos::View destView("destView", numTeams, numCols); - // inclusive_scan returns an iterator so to verify that it is correct // each team stores the distance of the returned iterator from the beginning // of the interval that team operates on and then we check that these @@ -186,12 +185,20 @@ void test_A(std::size_t numTeams, std::size_t numCols, int apiId) { rand_pool pool(lowerBound * upperBound); Kokkos::fill_random(initValuesView_h, pool, lowerBound, upperBound); - // use CTAD for functor auto initValuesView = Kokkos::create_mirror_view_and_copy(space_t(), initValuesView_h); - TestFunctorA fnc(sourceView, destView, distancesView, intraTeamSentinelView, - initValuesView, binaryOp, apiId); - Kokkos::parallel_for(policy, fnc); + + // create the destination view + Kokkos::View destView("destView", numTeams, numCols); + if constexpr (std::is_same_v) { + TestFunctorA fnc(sourceView, sourceView, distancesView, + intraTeamSentinelView, initValuesView, binaryOp, apiId); + Kokkos::parallel_for(policy, fnc); + } else { + TestFunctorA fnc(sourceView, destView, distancesView, intraTeamSentinelView, + initValuesView, binaryOp, apiId); + Kokkos::parallel_for(policy, fnc); + } // ----------------------------------------------- // run cpp-std kernel and check @@ -251,25 +258,38 @@ void test_A(std::size_t numTeams, std::size_t numCols, int apiId) { #undef inclusive_scan } - auto dataViewAfterOp_h = create_host_space_copy(destView); - expect_equal_host_views(stdDestView, dataViewAfterOp_h); + if constexpr (std::is_same_v) { + auto dataViewAfterOp_h = create_host_space_copy(sourceView); + expect_equal_host_views(stdDestView, dataViewAfterOp_h); + } else { + auto dataViewAfterOp_h = create_host_space_copy(destView); + expect_equal_host_views(stdDestView, dataViewAfterOp_h); + } } -template +template void run_all_scenarios() { for (int numTeams : teamSizesToTest) { for (const auto& numCols : {0, 1, 2, 13, 101, 1444, 8153}) { for (int apiId : {0, 1, 2, 3, 4, 5}) { - test_A(numTeams, numCols, apiId); + test_A(numTeams, numCols, apiId); } } } } TEST(std_algorithms_inclusive_scan_team_test, test) { +// FIXME_OPENMPTARGET +#if defined(KOKKOS_ENABLE_OPENMPTARGET) && defined(KOKKOS_ARCH_INTEL_GPU) + GTEST_SKIP() << "the test is known to fail with OpenMPTarget on Intel GPUs"; +#endif run_all_scenarios(); run_all_scenarios(); run_all_scenarios(); + + run_all_scenarios(); + run_all_scenarios(); + run_all_scenarios(); } } // namespace TeamInclusiveScan diff --git a/algorithms/unit_tests/TestStdAlgorithmsTeamRemoveCopy.cpp b/algorithms/unit_tests/TestStdAlgorithmsTeamRemoveCopy.cpp index 
24b840154b7..6bb0d249988 100644 --- a/algorithms/unit_tests/TestStdAlgorithmsTeamRemoveCopy.cpp +++ b/algorithms/unit_tests/TestStdAlgorithmsTeamRemoveCopy.cpp @@ -212,6 +212,10 @@ void run_all_scenarios() { } TEST(std_algorithms_remove_copy_team_test, test) { +// FIXME_OPENMPTARGET +#if defined(KOKKOS_ENABLE_OPENMPTARGET) && defined(KOKKOS_ARCH_INTEL_GPU) + GTEST_SKIP() << "the test is known to fail with OpenMPTarget on Intel GPUs"; +#endif run_all_scenarios(); run_all_scenarios(); run_all_scenarios(); diff --git a/algorithms/unit_tests/TestStdAlgorithmsTeamRemoveCopyIf.cpp b/algorithms/unit_tests/TestStdAlgorithmsTeamRemoveCopyIf.cpp index ce18eb4d319..cff9aa178a2 100644 --- a/algorithms/unit_tests/TestStdAlgorithmsTeamRemoveCopyIf.cpp +++ b/algorithms/unit_tests/TestStdAlgorithmsTeamRemoveCopyIf.cpp @@ -168,6 +168,10 @@ void run_all_scenarios() { } TEST(std_algorithms_remove_copy_if_team_test, test) { +// FIXME_OPENMPTARGET +#if defined(KOKKOS_ENABLE_OPENMPTARGET) && defined(KOKKOS_ARCH_INTEL_GPU) + GTEST_SKIP() << "the test is known to fail with OpenMPTarget on Intel GPUs"; +#endif run_all_scenarios(); run_all_scenarios(); run_all_scenarios(); diff --git a/algorithms/unit_tests/TestStdAlgorithmsTeamTransformExclusiveScan.cpp b/algorithms/unit_tests/TestStdAlgorithmsTeamTransformExclusiveScan.cpp index 9f30812d8ef..60fa369af18 100644 --- a/algorithms/unit_tests/TestStdAlgorithmsTeamTransformExclusiveScan.cpp +++ b/algorithms/unit_tests/TestStdAlgorithmsTeamTransformExclusiveScan.cpp @@ -108,7 +108,9 @@ struct TestFunctorA { } }; -template +struct InPlace {}; + +template void test_A(std::size_t numTeams, std::size_t numCols, int apiId) { /* description: use a rank-2 view randomly filled with values, @@ -134,9 +136,6 @@ void test_A(std::size_t numTeams, std::size_t numCols, int apiId) { using space_t = Kokkos::DefaultExecutionSpace; Kokkos::TeamPolicy policy(numTeams, Kokkos::AUTO()); - // create the destination view - Kokkos::View destView("destView", numTeams, numCols); - // tranform_exclusive_scan returns an iterator so to verify that it is correct // each team stores the distance of the returned iterator from the beginning // of the interval that team operates on and then we check that these @@ -156,12 +155,21 @@ void test_A(std::size_t numTeams, std::size_t numCols, int apiId) { rand_pool pool(lowerBound * upperBound); Kokkos::fill_random(initValuesView_h, pool, lowerBound, upperBound); - // use CTAD for functor auto initValuesView = Kokkos::create_mirror_view_and_copy(space_t(), initValuesView_h); - TestFunctorA fnc(sourceView, destView, distancesView, intraTeamSentinelView, - initValuesView, binaryOp, unaryOp, apiId); - Kokkos::parallel_for(policy, fnc); + + // create the destination view + Kokkos::View destView("destView", numTeams, numCols); + if constexpr (std::is_same_v) { + TestFunctorA fnc(sourceView, sourceView, distancesView, + intraTeamSentinelView, initValuesView, binaryOp, unaryOp, + apiId); + Kokkos::parallel_for(policy, fnc); + } else { + TestFunctorA fnc(sourceView, destView, distancesView, intraTeamSentinelView, + initValuesView, binaryOp, unaryOp, apiId); + Kokkos::parallel_for(policy, fnc); + } // ----------------------------------------------- // run cpp-std kernel and check @@ -200,16 +208,21 @@ void test_A(std::size_t numTeams, std::size_t numCols, int apiId) { #undef transform_exclusive_scan } - auto dataViewAfterOp_h = create_host_space_copy(destView); - expect_equal_host_views(stdDestView, dataViewAfterOp_h); + if constexpr (std::is_same_v) { + auto 
dataViewAfterOp_h = create_host_space_copy(sourceView); + expect_equal_host_views(stdDestView, dataViewAfterOp_h); + } else { + auto dataViewAfterOp_h = create_host_space_copy(destView); + expect_equal_host_views(stdDestView, dataViewAfterOp_h); + } } -template +template void run_all_scenarios() { for (int numTeams : teamSizesToTest) { for (const auto& numCols : {0, 1, 2, 13, 101, 1444, 8153}) { for (int apiId : {0, 1}) { - test_A(numTeams, numCols, apiId); + test_A(numTeams, numCols, apiId); } } } @@ -219,6 +232,10 @@ TEST(std_algorithms_transform_exclusive_scan_team_test, test) { run_all_scenarios(); run_all_scenarios(); run_all_scenarios(); + + run_all_scenarios(); + run_all_scenarios(); + run_all_scenarios(); } } // namespace TeamTransformExclusiveScan diff --git a/algorithms/unit_tests/TestStdAlgorithmsTeamTransformInclusiveScan.cpp b/algorithms/unit_tests/TestStdAlgorithmsTeamTransformInclusiveScan.cpp index 4b316602326..10454d65515 100644 --- a/algorithms/unit_tests/TestStdAlgorithmsTeamTransformInclusiveScan.cpp +++ b/algorithms/unit_tests/TestStdAlgorithmsTeamTransformInclusiveScan.cpp @@ -131,7 +131,9 @@ struct TestFunctorA { } }; -template +struct InPlace {}; + +template void test_A(std::size_t numTeams, std::size_t numCols, int apiId) { /* description: use a rank-2 view randomly filled with values, @@ -157,9 +159,6 @@ void test_A(std::size_t numTeams, std::size_t numCols, int apiId) { using space_t = Kokkos::DefaultExecutionSpace; Kokkos::TeamPolicy policy(numTeams, Kokkos::AUTO()); - // create the destination view - Kokkos::View destView("destView", numTeams, numCols); - // tranform_inclusive_scan returns an iterator so to verify that it is correct // each team stores the distance of the returned iterator from the beginning // of the interval that team operates on and then we check that these @@ -179,12 +178,21 @@ void test_A(std::size_t numTeams, std::size_t numCols, int apiId) { rand_pool pool(lowerBound * upperBound); Kokkos::fill_random(initValuesView_h, pool, lowerBound, upperBound); - // use CTAD for functor auto initValuesView = Kokkos::create_mirror_view_and_copy(space_t(), initValuesView_h); - TestFunctorA fnc(sourceView, destView, distancesView, intraTeamSentinelView, - initValuesView, binaryOp, unaryOp, apiId); - Kokkos::parallel_for(policy, fnc); + + // create the destination view + Kokkos::View destView("destView", numTeams, numCols); + if constexpr (std::is_same_v) { + TestFunctorA fnc(sourceView, sourceView, distancesView, + intraTeamSentinelView, initValuesView, binaryOp, unaryOp, + apiId); + Kokkos::parallel_for(policy, fnc); + } else { + TestFunctorA fnc(sourceView, destView, distancesView, intraTeamSentinelView, + initValuesView, binaryOp, unaryOp, apiId); + Kokkos::parallel_for(policy, fnc); + } // ----------------------------------------------- // run cpp-std kernel and check @@ -236,16 +244,21 @@ void test_A(std::size_t numTeams, std::size_t numCols, int apiId) { } #undef transform_inclusive_scan - auto dataViewAfterOp_h = create_host_space_copy(destView); - expect_equal_host_views(stdDestView, dataViewAfterOp_h); + if constexpr (std::is_same_v) { + auto dataViewAfterOp_h = create_host_space_copy(sourceView); + expect_equal_host_views(stdDestView, dataViewAfterOp_h); + } else { + auto dataViewAfterOp_h = create_host_space_copy(destView); + expect_equal_host_views(stdDestView, dataViewAfterOp_h); + } } -template +template void run_all_scenarios() { for (int numTeams : teamSizesToTest) { for (const auto& numCols : {0, 1, 2, 13, 101, 1444, 8153}) { for (int 
apiId : {0, 1, 2, 3}) { - test_A(numTeams, numCols, apiId); + test_A(numTeams, numCols, apiId); } } } @@ -255,6 +268,10 @@ TEST(std_algorithms_transform_inclusive_scan_team_test, test) { run_all_scenarios(); run_all_scenarios(); run_all_scenarios(); + + run_all_scenarios(); + run_all_scenarios(); + run_all_scenarios(); } } // namespace TeamTransformInclusiveScan diff --git a/algorithms/unit_tests/TestStdAlgorithmsTeamUniqueCopy.cpp b/algorithms/unit_tests/TestStdAlgorithmsTeamUniqueCopy.cpp index 87687b60a16..0d3289e196f 100644 --- a/algorithms/unit_tests/TestStdAlgorithmsTeamUniqueCopy.cpp +++ b/algorithms/unit_tests/TestStdAlgorithmsTeamUniqueCopy.cpp @@ -186,6 +186,10 @@ void run_all_scenarios() { } TEST(std_algorithms_unique_copy_team_test, test) { + // FIXME_OPENMPTARGET +#if defined(KOKKOS_ENABLE_OPENMPTARGET) && defined(KOKKOS_ARCH_INTEL_GPU) + GTEST_SKIP() << "the test is known to fail with OpenMPTarget on Intel GPUs"; +#endif run_all_scenarios(); run_all_scenarios(); run_all_scenarios(); diff --git a/algorithms/unit_tests/TestStdAlgorithmsTransformExclusiveScan.cpp b/algorithms/unit_tests/TestStdAlgorithmsTransformExclusiveScan.cpp index 9dac3ce75ff..fa2804256ac 100644 --- a/algorithms/unit_tests/TestStdAlgorithmsTransformExclusiveScan.cpp +++ b/algorithms/unit_tests/TestStdAlgorithmsTransformExclusiveScan.cpp @@ -16,6 +16,7 @@ #include #include +#include namespace Test { namespace stdalgos { @@ -160,24 +161,15 @@ void verify_data(ViewType1 data_view, // contains data create_mirror_view_and_copy(Kokkos::HostSpace(), test_view_dc); if (test_view_h.extent(0) > 0) { for (std::size_t i = 0; i < test_view_h.extent(0); ++i) { - // std::cout << i << " " << std::setprecision(15) << data_view_h(i) << " " - // << gold_h(i) << " " << test_view_h(i) << " " - // << std::abs(gold_h(i) - test_view_h(i)) << std::endl; - if (std::is_same::value) { ASSERT_EQ(gold_h(i), test_view_h(i)); } else { const auto error = std::abs(gold_h(i) - test_view_h(i)); - if (error > 1e-10) { - std::cout << i << " " << std::setprecision(15) << data_view_h(i) - << " " << gold_h(i) << " " << test_view_h(i) << " " - << std::abs(gold_h(i) - test_view_h(i)) << std::endl; - } - EXPECT_LT(error, 1e-10); + ASSERT_LT(error, 1e-10) << i << " " << std::setprecision(15) << error + << static_cast(test_view_h(i)) << " " + << static_cast(gold_h(i)); } } - // std::cout << " last el: " << test_view_h(test_view_h.extent(0)-1) << - // std::endl; } } @@ -205,17 +197,13 @@ void run_single_scenario(const InfoType& scenario_info, ValueType init_value, BinaryOp bop, UnaryOp uop) { const auto name = std::get<0>(scenario_info); const std::size_t view_ext = std::get<1>(scenario_info); - // std::cout << "transform_exclusive_scan custom op: " << name << ", " - // << view_tag_to_string(Tag{}) << ", " - // << value_type_to_string(ValueType()) << ", " - // << "init = " << init_value << std::endl; - - auto view_dest = - create_view(Tag{}, view_ext, "transform_exclusive_scan"); - auto view_from = - create_view(Tag{}, view_ext, "transform_exclusive_scan"); + + auto view_from = create_view(Tag{}, view_ext, + "transform_exclusive_scan_view_from"); fill_view(view_from, name); + auto view_dest = create_view(Tag{}, view_ext, + "transform_exclusive_scan_view_dest"); { fill_zero(view_dest); auto r = KE::transform_exclusive_scan( @@ -253,6 +241,65 @@ void run_single_scenario(const InfoType& scenario_info, ValueType init_value, Kokkos::fence(); } +template +void run_single_scenario_inplace(const InfoType& scenario_info, + ValueType init_value, BinaryOp bop, + 
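// For the transform variants tested below, the gold semantics are: apply the
// unary op to each input element, then exclusive-scan the transformed values
// with the binary op and the init value. The std equivalent, host-only:

#include <cassert>
#include <functional>
#include <numeric>
#include <vector>

int main() {
  std::vector<int> in{1, 2, 3};
  std::vector<int> out(in.size());
  auto square = [](int x) { return x * x; };
  // transformed inputs are {1, 4, 9}; exclusive scan with init 0 -> {0, 1, 5}
  std::transform_exclusive_scan(in.begin(), in.end(), out.begin(), 0,
                                std::plus<>{}, square);
  assert(out == (std::vector<int>{0, 1, 5}));
}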
UnaryOp uop) {
+  const auto name            = std::get<0>(scenario_info);
+  const std::size_t view_ext = std::get<1>(scenario_info);
+
+  // since here we call the in-place operation, we need to use two views:
+  // view1: filled according to what the scenario asks for and is not modified
+  // view2: filled according to what the scenario asks for and used for the
+  // in-place op. Therefore, after the op is done, view2 should contain the
+  // result of doing the exclusive scan.
+  // NOTE: view2 is refilled below every time because the algorithm acts in place
+
+  auto view1 =
+      create_view(Tag{}, view_ext, "transform_exclusive_scan_view1");
+  fill_view(view1, name);
+
+  auto view2 =
+      create_view(Tag{}, view_ext, "transform_exclusive_scan_view2");
+
+  {
+    fill_view(view2, name);
+    auto r = KE::transform_exclusive_scan(exespace(), KE::cbegin(view2),
+                                          KE::cend(view2), KE::begin(view2),
+                                          init_value, bop, uop);
+    ASSERT_EQ(r, KE::end(view2));
+    verify_data(view1, view2, init_value, bop, uop);
+  }
+
+  {
+    fill_view(view2, name);
+    auto r = KE::transform_exclusive_scan(
+        "label", exespace(), KE::cbegin(view2), KE::cend(view2),
+        KE::begin(view2), init_value, bop, uop);
+    ASSERT_EQ(r, KE::end(view2));
+    verify_data(view1, view2, init_value, bop, uop);
+  }
+
+  {
+    fill_view(view2, name);
+    auto r = KE::transform_exclusive_scan(exespace(), view2, view2, init_value,
+                                          bop, uop);
+    ASSERT_EQ(r, KE::end(view2));
+    verify_data(view1, view2, init_value, bop, uop);
+  }
+
+  {
+    fill_view(view2, name);
+    auto r = KE::transform_exclusive_scan("label", exespace(), view2, view2,
+                                          init_value, bop, uop);
+    ASSERT_EQ(r, KE::end(view2));
+    verify_data(view1, view2, init_value, bop, uop);
+  }
+
+  Kokkos::fence();
+}
+
 template
 void run_all_scenarios() {
   const std::map scenarios = {
@@ -267,6 +314,11 @@ void run_all_scenarios() {
     run_single_scenario(it, ValueType{1}, bop_t(), uop_t());
     run_single_scenario(it, ValueType{-2}, bop_t(), uop_t());
     run_single_scenario(it, ValueType{3}, bop_t(), uop_t());
+
+    run_single_scenario_inplace(it, ValueType{0}, bop_t(),
+                                uop_t());
+    run_single_scenario_inplace(it, ValueType{-2}, bop_t(),
+                                uop_t());
   }
 }
diff --git a/algorithms/unit_tests/TestStdAlgorithmsTransformInclusiveScan.cpp b/algorithms/unit_tests/TestStdAlgorithmsTransformInclusiveScan.cpp
index a90a68ca1d7..fb81ae91b04 100644
--- a/algorithms/unit_tests/TestStdAlgorithmsTransformInclusiveScan.cpp
+++ b/algorithms/unit_tests/TestStdAlgorithmsTransformInclusiveScan.cpp
@@ -16,6 +16,7 @@
 #include
 #include
+#include

 namespace Test {
 namespace stdalgos {
@@ -172,24 +173,15 @@ void verify_data(ViewType1 data_view,  // contains data
       create_mirror_view_and_copy(Kokkos::HostSpace(), test_view_dc);
   if (test_view_h.extent(0) > 0) {
     for (std::size_t i = 0; i < test_view_h.extent(0); ++i) {
-      // std::cout << i << " " << std::setprecision(15) << data_view_h(i) << " "
-      //           << gold_h(i) << " " << test_view_h(i) << " "
-      //           << std::abs(gold_h(i) - test_view_h(i)) << std::endl;
-
       if (std::is_same::value) {
         ASSERT_EQ(gold_h(i), test_view_h(i));
       } else {
         const auto error = std::abs(gold_h(i) - test_view_h(i));
-        if (error > 1e-10) {
-          std::cout << i << " " << std::setprecision(15) << data_view_h(i)
-                    << " " << gold_h(i) << " " << test_view_h(i) << " "
-                    << std::abs(gold_h(i) - test_view_h(i)) << std::endl;
-        }
-        EXPECT_LT(error, 1e-10);
+        ASSERT_LT(error, 1e-10) << i << " " << std::setprecision(15) << error
+                                << static_cast(test_view_h(i)) << " "
+                                << static_cast(gold_h(i));
       }
     }
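For reference, a minimal, self-contained sketch of the in-place call pattern the new run_single_scenario_inplace cases exercise; TimesTwo and Plus are hypothetical stand-ins for the test's uop/bop, and the argument order mirrors the calls in the diff above:

#include <Kokkos_Core.hpp>
#include <Kokkos_StdAlgorithms.hpp>

namespace KE = Kokkos::Experimental;

struct TimesTwo {
  KOKKOS_INLINE_FUNCTION int operator()(int x) const { return 2 * x; }
};
struct Plus {
  KOKKOS_INLINE_FUNCTION int operator()(int a, int b) const { return a + b; }
};

int main(int argc, char* argv[]) {
  Kokkos::initialize(argc, argv);
  {
    Kokkos::View<int*> v("v", 5);
    Kokkos::deep_copy(v, 2);
    // In place: v is both source and destination, so the input is consumed
    // and must be refilled before running another overload on it.
    KE::transform_exclusive_scan(Kokkos::DefaultExecutionSpace(),
                                 KE::cbegin(v), KE::cend(v), KE::begin(v),
                                 /*init_value=*/0, Plus{}, TimesTwo{});
    // v now holds {0, 4, 8, 12, 16}.
  }
  Kokkos::finalize();
}

-    // std::cout << " last el: " << test_view_h(test_view_h.extent(0)-1) <<
-    // 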
std::endl; } }
@@ -210,30 +202,11 @@ struct SumBinaryFunctor {
 std::string value_type_to_string(int) { return "int"; }
 std::string value_type_to_string(double) { return "double"; }

-template
-void print_scenario_details(const std::string& name, BopT bop, UopT uop) {
-  (void)bop;
-  (void)uop;
-  std::cout << "transform_inclusive_scan: " << name << ", "
-            << view_tag_to_string(Tag{}) << std::endl;
-}
-
-template
-void print_scenario_details(const std::string& name, BopT bop, UopT uop,
-                            ValueType init_value) {
-  (void)bop;
-  (void)uop;
-  std::cout << "transform_inclusive_scan: " << name << ", "
-            << view_tag_to_string(Tag{}) << ", "
-            << "init = " << init_value << std::endl;
-}
-
 template
 void run_single_scenario(const InfoType& scenario_info,
                          Args... args /* by value on purpose*/) {
   const auto name            = std::get<0>(scenario_info);
   const std::size_t view_ext = std::get<1>(scenario_info);
-  // print_scenario_details(name, args...);

   auto view_dest =
       create_view(Tag{}, view_ext, "transform_inclusive_scan");
@@ -278,6 +251,63 @@ void run_single_scenario(const InfoType& scenario_info,
   Kokkos::fence();
 }

+template
+void run_single_scenario_inplace(const InfoType& scenario_info,
+                                 Args... args /* by value on purpose*/) {
+  const auto name            = std::get<0>(scenario_info);
+  const std::size_t view_ext = std::get<1>(scenario_info);
+
+  // since here we call the in-place operation, we need to use two views:
+  // view_1: filled according to the scenario and is not modified
+  // view_2: filled according to the scenario and used for the in-place op
+  // Therefore, after the op is done, view_2 should contain the
+  // result of doing the inclusive scan.
+  // NOTE: view_2 must be refilled before every call to the algorithm
+  // because the algorithm acts in place
+
+  auto view_1 = create_view(Tag{}, view_ext,
+                            "transform_inclusive_scan_view_1");
+  fill_view(view_1, name);
+
+  auto view_2 = create_view(Tag{}, view_ext,
+                            "transform_inclusive_scan_view_2");
+
+  {
+    fill_view(view_2, name);
+    auto r = KE::transform_inclusive_scan(exespace(), KE::cbegin(view_2),
+                                          KE::cend(view_2), KE::begin(view_2),
+                                          args...);
+    ASSERT_EQ(r, KE::end(view_2));
+    verify_data(view_1, view_2, args...);
+  }
+
+  {
+    fill_view(view_2, name);
+    auto r = KE::transform_inclusive_scan("label", exespace(),
+                                          KE::cbegin(view_2), KE::cend(view_2),
+                                          KE::begin(view_2), args...);
+    ASSERT_EQ(r, KE::end(view_2));
+    verify_data(view_1, view_2, args...);
+  }
+
+  {
+    fill_view(view_2, name);
+    auto r = KE::transform_inclusive_scan(exespace(), view_2, view_2, args...);
+    ASSERT_EQ(r, KE::end(view_2));
+    verify_data(view_1, view_2, args...);
+  }
+
+  {
+    fill_view(view_2, name);
+    auto r = KE::transform_inclusive_scan("label", exespace(), view_2, view_2,
+                                          args...);
+    ASSERT_EQ(r, KE::end(view_2));
+    verify_data(view_1, view_2, args...);
+  }
+
+  Kokkos::fence();
+}
+
 template
 void run_all_scenarios() {
   const std::map scenarios = {
@@ -294,15 +324,23 @@ void run_all_scenarios() {
     run_single_scenario(it, bop_t(), uop_t(), ValueType{2});
     run_single_scenario(it, bop_t(), uop_t(), ValueType{-1});
     run_single_scenario(it, bop_t(), uop_t(), ValueType{-2});
+
+    run_single_scenario_inplace(it, bop_t(), uop_t());
+    run_single_scenario_inplace(it, bop_t(), uop_t(),
+                                ValueType{0});
+    run_single_scenario_inplace(it, bop_t(), uop_t(),
+                                ValueType{2});
+    run_single_scenario_inplace(it, bop_t(), uop_t(),
+                                ValueType{-2});
   }
 }

 #if !defined KOKKOS_ENABLE_OPENMPTARGET
 TEST(std_algorithms_numeric_ops_test, transform_inclusive_scan) {
   run_all_scenarios();
-  // run_all_scenarios();
-  // 
run_all_scenarios(); - // run_all_scenarios(); + run_all_scenarios(); + run_all_scenarios(); + run_all_scenarios(); } #endif diff --git a/algorithms/unit_tests/TestStdReducers.cpp b/algorithms/unit_tests/TestStdReducers.cpp index 3847e1e6a36..c05006a1617 100644 --- a/algorithms/unit_tests/TestStdReducers.cpp +++ b/algorithms/unit_tests/TestStdReducers.cpp @@ -83,9 +83,6 @@ auto run_min_or_max_test(ViewType view, StdReducersTestEnumOrder enValue) { static_assert(std::is_same::value, "test is only enabled for HostSpace"); - std::cout << "checking reduction with order: " << order_to_string(enValue) - << "\n"; - using view_value_type = typename ViewType::value_type; using reducer_type = std::conditional_t< (flag == 0), Kokkos::MaxFirstLoc, @@ -132,18 +129,24 @@ TEST(std_algorithms_reducers, max_first_loc) { const auto pair1 = run_min_or_max_test<0, hostspace, index_type>( view_h, StdReducersTestEnumOrder::LeftToRight); - ASSERT_EQ(pair1.first, gold_value); - ASSERT_EQ(pair1.second, gold_location); + ASSERT_EQ(pair1.first, gold_value) + << order_to_string(StdReducersTestEnumOrder::LeftToRight); + ASSERT_EQ(pair1.second, gold_location) + << order_to_string(StdReducersTestEnumOrder::LeftToRight); const auto pair2 = run_min_or_max_test<0, hostspace, index_type>( view_h, StdReducersTestEnumOrder::RightToLeft); - ASSERT_EQ(pair2.first, gold_value); - ASSERT_EQ(pair2.second, gold_location); + ASSERT_EQ(pair2.first, gold_value) + << order_to_string(StdReducersTestEnumOrder::RightToLeft); + ASSERT_EQ(pair2.second, gold_location) + << order_to_string(StdReducersTestEnumOrder::RightToLeft); const auto pair3 = run_min_or_max_test<0, hostspace, index_type>( view_h, StdReducersTestEnumOrder::Random); - ASSERT_EQ(pair3.first, gold_value); - ASSERT_EQ(pair3.second, gold_location); + ASSERT_EQ(pair3.first, gold_value) + << order_to_string(StdReducersTestEnumOrder::Random); + ASSERT_EQ(pair3.second, gold_location) + << order_to_string(StdReducersTestEnumOrder::Random); } TEST(std_algorithms_reducers, min_first_loc) { @@ -191,9 +194,6 @@ void run_min_max_test(ViewType view, StdReducersTestEnumOrder enValue, static_assert(std::is_same::value, "test is only enabled for HostSpace"); - std::cout << "checking reduction with order: " << order_to_string(enValue) - << "\n"; - using view_value_type = typename ViewType::value_type; using reducer_type = Kokkos::MinMaxFirstLastLoc; @@ -212,10 +212,10 @@ void run_min_max_test(ViewType view, StdReducersTestEnumOrder enValue, reduction_value_type{view(index), view(index), index, index}); } - ASSERT_EQ(red_result.min_val, gold_values.first); - ASSERT_EQ(red_result.max_val, gold_values.second); - ASSERT_EQ(red_result.min_loc, gold_locs.first); - ASSERT_EQ(red_result.max_loc, gold_locs.second); + ASSERT_EQ(red_result.min_val, gold_values.first) << order_to_string(enValue); + ASSERT_EQ(red_result.max_val, gold_values.second) << order_to_string(enValue); + ASSERT_EQ(red_result.min_loc, gold_locs.first) << order_to_string(enValue); + ASSERT_EQ(red_result.max_loc, gold_locs.second) << order_to_string(enValue); } TEST(std_algorithms_reducers, min_max_first_last_loc) { diff --git a/benchmarks/CMakeLists.txt b/benchmarks/CMakeLists.txt index 42279bf55db..abf50283594 100644 --- a/benchmarks/CMakeLists.txt +++ b/benchmarks/CMakeLists.txt @@ -1 +1,12 @@ +#FIXME_OPENMPTARGET - compiling in debug mode causes ICE. 
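(The TestStdReducers edits above attach the scan order to each assertion instead of printing it unconditionally. A minimal illustration of GoogleTest's streamed failure messages, with hypothetical values:)

#include <gtest/gtest.h>

TEST(streamed_messages, example) {
  const int gold   = 42;
  const int actual = 42;
  // The streamed text is emitted only when the assertion fails, so passing
  // runs stay quiet; ASSERT_* also aborts the test body on failure, which is
  // why the suite switched from EXPECT_LT to ASSERT_LT.
  ASSERT_EQ(gold, actual) << "order: LeftToRight, index: " << 7;
}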
+KOKKOS_ADD_BENCHMARK_DIRECTORIES(atomic)
+KOKKOS_ADD_BENCHMARK_DIRECTORIES(gather)
 KOKKOS_ADD_BENCHMARK_DIRECTORIES(gups)
+KOKKOS_ADD_BENCHMARK_DIRECTORIES(launch_latency)
+KOKKOS_ADD_BENCHMARK_DIRECTORIES(stream)
+
+#FIXME_OPENMPTARGET - These two benchmarks cause ICE. Disabling them for now; a deeper analysis of the cause and a possible fix will follow.
+IF(NOT Kokkos_ENABLE_OPENMPTARGET)
+  KOKKOS_ADD_BENCHMARK_DIRECTORIES(policy_performance)
+  KOKKOS_ADD_BENCHMARK_DIRECTORIES(bytes_and_flops)
+ENDIF()
diff --git a/benchmarks/atomic/CMakeLists.txt b/benchmarks/atomic/CMakeLists.txt
new file mode 100644
index 00000000000..85f7412f492
--- /dev/null
+++ b/benchmarks/atomic/CMakeLists.txt
@@ -0,0 +1,4 @@
+KOKKOS_ADD_EXECUTABLE(
+  atomic
+  SOURCES main.cpp
+)
diff --git a/benchmarks/bytes_and_flops/CMakeLists.txt b/benchmarks/bytes_and_flops/CMakeLists.txt
new file mode 100644
index 00000000000..0ce44a6f1a8
--- /dev/null
+++ b/benchmarks/bytes_and_flops/CMakeLists.txt
@@ -0,0 +1,4 @@
+KOKKOS_ADD_EXECUTABLE(
+  bytes_and_flops
+  SOURCES bench_double.cpp bench_float.cpp bench_int32_t.cpp bench_int64_t.cpp main.cpp
+)
diff --git a/benchmarks/bytes_and_flops/bench.hpp b/benchmarks/bytes_and_flops/bench.hpp
index 2589fd7309b..88830af624b 100644
--- a/benchmarks/bytes_and_flops/bench.hpp
+++ b/benchmarks/bytes_and_flops/bench.hpp
@@ -37,22 +37,22 @@ struct RunStride {
 };
 #define STRIDE 1
-#include <bench_stride.hpp>
+#include "bench_stride.hpp"
 #undef STRIDE
 #define STRIDE 2
-#include <bench_stride.hpp>
+#include "bench_stride.hpp"
 #undef STRIDE
 #define STRIDE 4
-#include <bench_stride.hpp>
+#include "bench_stride.hpp"
 #undef STRIDE
 #define STRIDE 8
-#include <bench_stride.hpp>
+#include "bench_stride.hpp"
 #undef STRIDE
 #define STRIDE 16
-#include <bench_stride.hpp>
+#include "bench_stride.hpp"
 #undef STRIDE
 #define STRIDE 32
-#include <bench_stride.hpp>
+#include "bench_stride.hpp"
 #undef STRIDE
 template
diff --git a/benchmarks/bytes_and_flops/bench_double.cpp b/benchmarks/bytes_and_flops/bench_double.cpp
index f955c996660..2fda1ae3d42 100644
--- a/benchmarks/bytes_and_flops/bench_double.cpp
+++ b/benchmarks/bytes_and_flops/bench_double.cpp
@@ -14,7 +14,7 @@
 //
 //@HEADER
-#include <bench.hpp>
+#include "bench.hpp"
 template
 void run_stride_unroll(int N, int K, int R, int D, int U, int F, int T, int S,
                        int B, int I);
diff --git a/benchmarks/bytes_and_flops/bench_float.cpp b/benchmarks/bytes_and_flops/bench_float.cpp
index 137ff67d404..3210116a9ee 100644
--- a/benchmarks/bytes_and_flops/bench_float.cpp
+++ b/benchmarks/bytes_and_flops/bench_float.cpp
@@ -14,7 +14,7 @@
 //
 //@HEADER
-#include <bench.hpp>
+#include "bench.hpp"
 template
 void run_stride_unroll(int N, int K, int R, int D, int U, int F, int T, int S,
                        int B, int I);
diff --git a/benchmarks/bytes_and_flops/bench_int32_t.cpp b/benchmarks/bytes_and_flops/bench_int32_t.cpp
index 29ccec01414..24a5dcd3899 100644
--- a/benchmarks/bytes_and_flops/bench_int32_t.cpp
+++ b/benchmarks/bytes_and_flops/bench_int32_t.cpp
@@ -14,7 +14,7 @@
 //
 //@HEADER
-#include <bench.hpp>
+#include "bench.hpp"
 template
 void run_stride_unroll(int N, int K, int R, int D, int U, int F, int T, int S,
                        int B, int I);
diff --git a/benchmarks/bytes_and_flops/bench_int64_t.cpp b/benchmarks/bytes_and_flops/bench_int64_t.cpp
index c153d5eff39..0634700c31e 100644
--- a/benchmarks/bytes_and_flops/bench_int64_t.cpp
+++ b/benchmarks/bytes_and_flops/bench_int64_t.cpp
@@ -14,7 +14,7 @@
 //
 //@HEADER
-#include <bench.hpp>
+#include "bench.hpp"
 template
 void run_stride_unroll(int N, int K, int R, int D, int U, int F, int T, int S,
                        int B, int I);
diff --git a/benchmarks/bytes_and_flops/bench_stride.hpp b/benchmarks/bytes_and_flops/bench_stride.hpp
index b63d486fc9e..80f017fbe8f 100644
--- a/benchmarks/bytes_and_flops/bench_stride.hpp
+++ b/benchmarks/bytes_and_flops/bench_stride.hpp
@@ -15,28 +15,28 @@
 //@HEADER
 #define UNROLL 1
-#include <bench_unroll_stride.hpp>
+#include "bench_unroll_stride.hpp"
 #undef UNROLL
 #define UNROLL 2
-#include <bench_unroll_stride.hpp>
+#include "bench_unroll_stride.hpp"
 #undef UNROLL
 #define UNROLL 3
-#include <bench_unroll_stride.hpp>
+#include "bench_unroll_stride.hpp"
 #undef UNROLL
 #define UNROLL 4
-#include <bench_unroll_stride.hpp>
+#include "bench_unroll_stride.hpp"
 #undef UNROLL
 #define UNROLL 5
-#include <bench_unroll_stride.hpp>
+#include "bench_unroll_stride.hpp"
 #undef UNROLL
 #define UNROLL 6
-#include <bench_unroll_stride.hpp>
+#include "bench_unroll_stride.hpp"
 #undef UNROLL
 #define UNROLL 7
-#include <bench_unroll_stride.hpp>
+#include "bench_unroll_stride.hpp"
 #undef UNROLL
 #define UNROLL 8
-#include <bench_unroll_stride.hpp>
+#include "bench_unroll_stride.hpp"
 #undef UNROLL
 template
diff --git a/benchmarks/bytes_and_flops/bench_unroll_stride.hpp b/benchmarks/bytes_and_flops/bench_unroll_stride.hpp
index 0f7a298c1bb..78cfd48effe 100644
--- a/benchmarks/bytes_and_flops/bench_unroll_stride.hpp
+++ b/benchmarks/bytes_and_flops/bench_unroll_stride.hpp
@@ -26,7 +26,7 @@ struct Run {
     Kokkos::deep_copy(C, Scalar(3.5));

     Kokkos::Timer timer;
-    for (int i = 0; i < I; ++i) {
+    for (int iter = 0; iter < I; ++iter) {
       Kokkos::parallel_for(
           "BenchmarkKernel",
           Kokkos::TeamPolicy<>(N, T).set_scratch_size(0, Kokkos::PerTeam(S)),
diff --git a/benchmarks/bytes_and_flops/main.cpp b/benchmarks/bytes_and_flops/main.cpp
index 20077757d1f..fdfcc4ea64f 100644
--- a/benchmarks/bytes_and_flops/main.cpp
+++ b/benchmarks/bytes_and_flops/main.cpp
@@ -16,7 +16,7 @@
 #include
 #include
-#include <bench.hpp>
+#include "bench.hpp"
 #include
 extern template void run_stride_unroll(int, int, int, int, int, int, int,
@@ -86,7 +86,7 @@ int main(int argc, char* argv[]) {
     printf("D must be one of 1,2,4,8,16,32\n");
     return 0;
   }
-  if ((P < 1) && (P > 2)) {
+  if ((P < 1) || (P > 4)) {
     printf("P must be one of 1,2,3,4\n");
     return 0;
   }
diff --git a/benchmarks/gather/CMakeLists.txt b/benchmarks/gather/CMakeLists.txt
new file mode 100644
index 00000000000..24c70627725
--- /dev/null
+++ b/benchmarks/gather/CMakeLists.txt
@@ -0,0 +1,4 @@
+KOKKOS_ADD_EXECUTABLE(
+  gather
+  SOURCES main.cpp
+)
diff --git a/benchmarks/gather/gather.hpp b/benchmarks/gather/gather.hpp
index d83461702c7..90b1101c1d5 100644
--- a/benchmarks/gather/gather.hpp
+++ b/benchmarks/gather/gather.hpp
@@ -20,28 +20,28 @@ struct RunGather {
 };
 #define UNROLL 1
-#include <gather_unroll.hpp>
+#include "gather_unroll.hpp"
 #undef UNROLL
 #define UNROLL 2
-#include <gather_unroll.hpp>
+#include "gather_unroll.hpp"
 #undef UNROLL
 #define UNROLL 3
-#include <gather_unroll.hpp>
+#include "gather_unroll.hpp"
 #undef UNROLL
 #define UNROLL 4
-#include <gather_unroll.hpp>
+#include "gather_unroll.hpp"
 #undef UNROLL
 #define UNROLL 5
-#include <gather_unroll.hpp>
+#include "gather_unroll.hpp"
 #undef UNROLL
 #define UNROLL 6
-#include <gather_unroll.hpp>
+#include "gather_unroll.hpp"
 #undef UNROLL
 #define UNROLL 7
-#include <gather_unroll.hpp>
+#include "gather_unroll.hpp"
 #undef UNROLL
 #define UNROLL 8
-#include <gather_unroll.hpp>
+#include "gather_unroll.hpp"
 #undef UNROLL
 template
diff --git a/benchmarks/gather/gather_unroll.hpp b/benchmarks/gather/gather_unroll.hpp
index 5ee5742a3f7..1aa73091bc5 100644
--- a/benchmarks/gather/gather_unroll.hpp
+++ b/benchmarks/gather/gather_unroll.hpp
@@ -138,7 +138,7 @@ struct RunGather {
     printf(
         "SNKDRUF: %i %i %i %i %i %i %i Time: %lfs Bandwidth: %lfGiB/s GFlop/s: "
         "%lf GGather/s: %lf\n",
-        sizeof(Scalar) / 4, N, K, D, R, UNROLL, F, seconds,
+        static_cast<int>(sizeof(Scalar) / 4), N, K, D, R, UNROLL, F, seconds,
         1.0 * bytes / seconds / 1024 / 1024 / 1024, 1.e-9 * flops / seconds,
         1.e-9 * gather_ops / seconds);
   }
 }
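The headers above rely on a repeated-#include idiom: a body header is included once per UNROLL (or STRIDE) value, stamping out one template specialization per inclusion. A single-file sketch of the equivalent stamping, with the re-included body condensed into a macro so the example compiles on its own (names hypothetical):

template <int U>
struct Run;  // primary template; one explicit specialization per U below

// In bench_stride.hpp/gather.hpp the body lives in a separate header that is
// re-included under a fresh UNROLL each time; a macro plays that role here.
#define STAMP_RUN(U)                                        \
  template <>                                               \
  struct Run<U> {                                           \
    static int unroll() { return U; } /* kernel stand-in */ \
  };

STAMP_RUN(1)
STAMP_RUN(2)
STAMP_RUN(4)
#undef STAMP_RUN

int main() {
  // Each stamped specialization is independently usable at its unroll factor.
  return Run<4>::unroll() - Run<2>::unroll() - Run<2>::unroll();  // 0
}

diff --git 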
a/benchmarks/gather/main.cpp b/benchmarks/gather/main.cpp
index 7f4fc9ede6c..07fca9fdc64 100644
--- a/benchmarks/gather/main.cpp
+++ b/benchmarks/gather/main.cpp
@@ -16,7 +16,7 @@
 #include
 #include
-#include <gather.hpp>
+#include "gather.hpp"
 #include
 int main(int argc, char* argv[]) {
diff --git a/benchmarks/launch_latency/CMakeLists.txt b/benchmarks/launch_latency/CMakeLists.txt
new file mode 100644
index 00000000000..bb14da749d1
--- /dev/null
+++ b/benchmarks/launch_latency/CMakeLists.txt
@@ -0,0 +1,4 @@
+KOKKOS_ADD_EXECUTABLE(
+  launch_latency
+  SOURCES launch_latency.cpp
+)
diff --git a/benchmarks/launch_latency/launch_latency.cpp b/benchmarks/launch_latency/launch_latency.cpp
new file mode 100644
index 00000000000..73b176ab8dd
--- /dev/null
+++ b/benchmarks/launch_latency/launch_latency.cpp
@@ -0,0 +1,283 @@
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 4.0
+//       Copyright (2022) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions.
+// See https://kokkos.org/LICENSE for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//@HEADER
+
+/*! \file launch_latency.cpp
+
+    Tests of parallel_for and parallel_reduce latency for different
+    circumstances.
+
+    Three launch kinds are tested: parallel_for, parallel_reduce into scalar,
+    and parallel_reduce into view
+
+    N controls how large the parallel loop is
+    V controls how large the functor is
+    M controls across how many launches the latency is averaged
+    K controls how large the nested loop is (no larger than V)
+
+    For each launch kind,
+    1. Avg functor dispatch latency: (time to do M launches) / M
+    2. Avg functor completion throughput: (M launches + sync) / M
+    3. 
Avg functor completion latency: (M (launch + sync)) / M
+*/
+
+#include <Kokkos_Core.hpp>
+
+template <int V>
+struct TestFunctor {
+  double values[V];
+  Kokkos::View<double*> a;
+  int K;
+  TestFunctor(Kokkos::View<double*> a_, int K_) : a(a_), K(K_) {}
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()(const int i) const {
+    for (int j = 0; j < K; j++) a(i) += 1.0 * i * values[j];
+  }
+};
+
+template <int V>
+struct TestRFunctor {
+  double values[V];
+  Kokkos::View<double*> a;
+  int K;
+  TestRFunctor(Kokkos::View<double*> a_, int K_) : a(a_), K(K_) {}
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()(const int i, double& lsum) const {
+    for (int j = 0; j < K; j++) a(i) += 1.0 * i * values[j];
+    lsum += a(i);
+  }
+};
+
+struct Opts {
+  bool par_for         = true;
+  bool par_reduce      = true;
+  bool par_reduce_view = true;
+};
+
+template <int V>
+void run(int N, int M, int K, const Opts& opts) {
+  std::string l_no_fence, l_fence, l_red_no_fence, l_red_fence,
+      l_red_view_no_fence, l_red_view_fence;
+  {
+    std::ostringstream ostream;
+    ostream << "RunNoFence_" << N << "_" << K << std::endl;
+    l_no_fence = ostream.str();
+  }
+  {
+    std::ostringstream ostream;
+    ostream << "RunFence_" << N << "_" << K << std::endl;
+    l_fence = ostream.str();
+  }
+  {
+    std::ostringstream ostream;
+    ostream << "RunReduceNoFence_" << N << "_" << K << std::endl;
+    l_red_no_fence = ostream.str();
+  }
+  {
+    std::ostringstream ostream;
+    ostream << "RunReduceFence_" << N << "_" << K << std::endl;
+    l_red_fence = ostream.str();
+  }
+  {
+    std::ostringstream ostream;
+    ostream << "RunReduceViewNoFence_" << N << "_" << K << std::endl;
+    l_red_view_no_fence = ostream.str();
+  }
+  {
+    std::ostringstream ostream;
+    ostream << "RunReduceViewFence_" << N << "_" << K << std::endl;
+    l_red_view_fence = ostream.str();
+  }
+
+  double result;
+  Kokkos::View<double*> a("A", N);
+  Kokkos::View<double> v_result("result");
+  TestFunctor<V> f(a, K);
+  TestRFunctor<V> rf(a, K);
+  Kokkos::Timer timer;
+
+  // initialize to an obviously wrong value
+  double time_no_fence        = -1;  // launch loop
+  double time_no_fence_fenced = -1;  // launch loop then fence
+  double time_fence           = -1;  // launch&fence loop
+
+  double time_red_no_fence        = -1;
+  double time_red_no_fence_fenced = -1;
+  double time_red_fence           = -1;
+
+  double time_red_view_no_fence        = -1;
+  double time_red_view_no_fence_fenced = -1;
+  double time_red_view_fence           = -1;
+
+  if (opts.par_for) {
+    // warmup
+    for (int i = 0; i < 4; ++i) {
+      Kokkos::parallel_for(l_no_fence, N, f);
+    }
+    Kokkos::fence();
+
+    timer.reset();
+    for (int i = 0; i < M; i++) {
+      Kokkos::parallel_for(l_no_fence, N, f);
+    }
+    time_no_fence = timer.seconds();
+    Kokkos::fence();
+    time_no_fence_fenced = timer.seconds();
+
+    timer.reset();
+    for (int i = 0; i < M; i++) {
+      Kokkos::parallel_for(l_fence, N, f);
+      Kokkos::fence();
+    }
+    time_fence = timer.seconds();
+  }
+
+  if (opts.par_reduce) {
+    // warmup
+    for (int i = 0; i < 4; ++i) {
+      Kokkos::parallel_reduce(l_red_no_fence, N, rf, result);
+    }
+    Kokkos::fence();
+
+    timer.reset();
+    for (int i = 0; i < M; i++) {
+      Kokkos::parallel_reduce(l_red_no_fence, N, rf, result);
+    }
+    time_red_no_fence = timer.seconds();
+    Kokkos::fence();
+    time_red_no_fence_fenced = timer.seconds();
+
+    timer.reset();
+    for (int i = 0; i < M; i++) {
+      Kokkos::parallel_reduce(l_red_fence, N, rf, result);
+      Kokkos::fence();
+    }
+    time_red_fence = timer.seconds();
+    Kokkos::fence();
+  }
+
+  if (opts.par_reduce_view) {
+    // warmup
+    for (int i = 0; i < 4; ++i) {
+      Kokkos::parallel_reduce(l_red_view_no_fence, N, rf, v_result);
+    }
+    Kokkos::fence();
+
+    timer.reset();
+    for (int i = 0; i < M; i++) {
+
Kokkos::parallel_reduce(l_red_view_no_fence, N, rf, v_result);
+    }
+    time_red_view_no_fence = timer.seconds();
+    Kokkos::fence();
+    time_red_view_no_fence_fenced = timer.seconds();
+
+    timer.reset();
+    for (int i = 0; i < M; i++) {
+      Kokkos::parallel_reduce(l_red_view_fence, N, rf, v_result);
+      Kokkos::fence();
+    }
+    time_red_view_fence = timer.seconds();
+    Kokkos::fence();
+    timer.reset();
+  }
+
+  const double x = 1.e6 / M;
+  printf("%i %i %i %i", N, V, K, M);
+  if (opts.par_for) {
+    printf(" parallel_for: %lf %lf ( %lf )", x * time_no_fence, x * time_fence,
+           x * time_no_fence_fenced);
+  }
+  if (opts.par_reduce) {
+    printf(" parallel_reduce: %lf %lf ( %lf )", x * time_red_no_fence,
+           x * time_red_fence, x * time_red_no_fence_fenced);
+  }
+  if (opts.par_reduce_view) {
+    printf(" parallel_reduce(view): %lf %lf ( %lf )",
+           x * time_red_view_no_fence, x * time_red_view_fence,
+           x * time_red_view_no_fence_fenced);
+  }
+  printf("\n");
+}
+int main(int argc, char* argv[]) {
+  Kokkos::initialize(argc, argv);
+  {
+    int N = 10000;
+    int M = 20;
+    int K = 1;
+
+    Opts opts;
+
+    printf("==========================\n");
+    printf("Kokkos Launch Latency Test\n");
+    printf("==========================\n");
+    printf("\n");
+    printf("Usage: %s ARGUMENTS [OPTIONS...]\n\n", argv[0]);
+    printf("Arguments: N M K\n");
+    printf("  N: loop length\n");
+    printf("  M: how many kernels to dispatch\n");
+    printf(
+        "  K: nested loop length (capped by the size of the functor member "
+        "array)\n\n");
+    printf("Options:\n");
+    printf("  --no-parallel-for: skip parallel_for benchmark\n");
+    printf("  --no-parallel-reduce: skip parallel_reduce benchmark\n");
+    printf(
+        "  --no-parallel-reduce-view: skip parallel_reduce into view "
+        "benchmark\n");
+    printf("\n\n");
+    printf("  Output V is the size of the functor member array\n");
+    printf("\n\n");
+
+    for (int i = 1; i < argc; ++i) {
+      const std::string_view arg(argv[i]);
+
+      // anything that doesn't start with --
+      if (arg.size() < 2 ||
+          (arg.size() >= 2 && (arg[0] != '-' || arg[1] != '-'))) {
+        if (i == 1)
+          N = atoi(arg.data());
+        else if (i == 2)
+          M = atoi(arg.data());
+        else if (i == 3)
+          K = atoi(arg.data());
+        else {
+          throw std::runtime_error("unexpected argument!");
+        }
+      } else if (arg == "--no-parallel-for") {
+        opts.par_for = false;
+      } else if (arg == "--no-parallel-reduce") {
+        opts.par_reduce = false;
+      } else if (arg == "--no-parallel-reduce-view") {
+        opts.par_reduce_view = false;
+      } else {
+        std::stringstream ss;
+        ss << "unexpected argument \"" << arg << "\" at position " << i;
+        throw std::runtime_error(ss.str());
+      }
+    }
+
+    printf("N V K M time_no_fence time_fence (time_no_fence_fenced)\n");
+
+    /* A backend may have different launch strategies for functors of different
+     * sizes: test a variety of functor sizes.*/
+    run<1>(N, M, K <= 1 ? K : 1, opts);
+    run<16>(N, M, K <= 16 ? K : 16, opts);
+    run<200>(N, M, K <= 200 ? K : 200, opts);
+    run<3000>(N, M, K <= 3000 ? K : 3000, opts);
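For orientation, a stripped-down sketch of the timing idiom the benchmark uses: time M asynchronous launches for average dispatch latency, then fence once for average completion latency. Names and sizes are illustrative only:

#include <cstdio>
#include <Kokkos_Core.hpp>

int main(int argc, char* argv[]) {
  Kokkos::initialize(argc, argv);
  {
    const int N = 10000, M = 20;
    Kokkos::View<double*> a("A", N);
    Kokkos::Timer timer;
    for (int i = 0; i < M; ++i)
      Kokkos::parallel_for(
          "bump", N, KOKKOS_LAMBDA(int j) { a(j) += 1.0; });
    const double t_dispatch = timer.seconds();  // M launches, no sync
    Kokkos::fence();
    const double t_complete = timer.seconds();  // launches + one fence
    std::printf("avg dispatch: %g us, avg completion: %g us\n",
                1.e6 * t_dispatch / M, 1.e6 * t_complete / M);
  }
  Kokkos::finalize();
}

+    run<30000>(N, M, K <= 30000 ? 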
K : 30000, opts); + } + Kokkos::finalize(); +} diff --git a/benchmarks/policy_performance/CMakeLists.txt b/benchmarks/policy_performance/CMakeLists.txt new file mode 100644 index 00000000000..929b9c97023 --- /dev/null +++ b/benchmarks/policy_performance/CMakeLists.txt @@ -0,0 +1,4 @@ +KOKKOS_ADD_EXECUTABLE( + policy_performance + SOURCES main.cpp +) diff --git a/benchmarks/policy_performance/main.cpp b/benchmarks/policy_performance/main.cpp index 28cfde552a5..0983a3d535c 100644 --- a/benchmarks/policy_performance/main.cpp +++ b/benchmarks/policy_performance/main.cpp @@ -106,8 +106,9 @@ int main(int argc, char* argv[]) { Kokkos::parallel_reduce( "parallel_reduce warmup", Kokkos::TeamPolicy<>(10, 1), - KOKKOS_LAMBDA(const Kokkos::TeamPolicy<>::member_type team, - double& lval) { lval += 1; }, + KOKKOS_LAMBDA(const Kokkos::TeamPolicy<>::member_type&, double& lval) { + lval += 1; + }, result); using view_type_1d = Kokkos::View; diff --git a/benchmarks/policy_performance/policy_perf_test.hpp b/benchmarks/policy_performance/policy_perf_test.hpp index cc2cc40257b..0e23d221f67 100644 --- a/benchmarks/policy_performance/policy_perf_test.hpp +++ b/benchmarks/policy_performance/policy_perf_test.hpp @@ -21,13 +21,13 @@ struct ParallelScanFunctor { using value_type = double; ViewType v; - ParallelScanFunctor(const ViewType& v_) : v(v_) {} + explicit ParallelScanFunctor(const ViewType& v_) : v(v_) {} KOKKOS_INLINE_FUNCTION - void operator()(const int idx, value_type& val, const bool& final) const { + void operator()(const int idx, value_type& val, const bool& is_final) const { // inclusive scan val += v(idx); - if (final) { + if (is_final) { v(idx) = val; } } @@ -109,7 +109,7 @@ void test_policy(int team_range, int thread_range, int vector_range, vector_result = 0.0; Kokkos::parallel_reduce( Kokkos::ThreadVectorRange(team, vector_range), - [&](const int vi, double& vval) { vval += 1; }, + [&](const int, double& vval) { vval += 1; }, vector_result); } v2(idx, t) = vector_result; @@ -128,7 +128,7 @@ void test_policy(int team_range, int thread_range, int vector_range, team_result = 0.0; Kokkos::parallel_reduce( Kokkos::TeamThreadRange(team, thread_range), - [&](const int t, double& lval) { lval += 1; }, team_result); + [&](const int, double& lval) { lval += 1; }, team_result); } v1(idx) = team_result; // prevent compiler optimizing loop away @@ -170,13 +170,13 @@ void test_policy(int team_range, int thread_range, int vector_range, for (int tr = 0; tr < thread_repeat; ++tr) { Kokkos::parallel_reduce( Kokkos::TeamThreadRange(team, thread_range), - [&](const int t, double& lval) { + [&](const int, double& lval) { double vector_result = 0.0; for (int vr = 0; vr < inner_repeat; ++vr) { vector_result = 0.0; Kokkos::parallel_reduce( Kokkos::ThreadVectorRange(team, vector_range), - [&](const int vi, double& vval) { vval += 1; }, + [&](const int, double& vval) { vval += 1; }, vector_result); lval += vector_result; } diff --git a/benchmarks/stream/CMakeLists.txt b/benchmarks/stream/CMakeLists.txt new file mode 100644 index 00000000000..0dded6e3a54 --- /dev/null +++ b/benchmarks/stream/CMakeLists.txt @@ -0,0 +1,4 @@ +KOKKOS_ADD_EXECUTABLE( + stream + SOURCES stream-kokkos.cpp +) diff --git a/bin/nvcc_wrapper b/bin/nvcc_wrapper index c1400872402..9b935835d5f 100755 --- a/bin/nvcc_wrapper +++ b/bin/nvcc_wrapper @@ -229,7 +229,7 @@ do fi ;; #Handle known nvcc args - 
--dryrun|--verbose|--keep|--source-in-ptx|-src-in-ptx|--keep-dir*|-G|-lineinfo|-extended-lambda|-expt-extended-lambda|-expt-relaxed-constexpr|--resource-usage|--fmad=*|--use_fast_math|--Wext-lambda-captures-this|-Wext-lambda-captures-this) + --dryrun|-dryrun|--verbose|--keep|-keep|--source-in-ptx|-src-in-ptx|--keep-dir*|-keep-dir*|-G|-lineinfo|--generate-line-info|-extended-lambda|-expt-extended-lambda|-expt-relaxed-constexpr|--resource-usage|-res-usage|-fmad=*|--use_fast_math|-use_fast_math|--Wext-lambda-captures-this|-Wext-lambda-captures-this) cuda_args="$cuda_args $1" ;; #Handle more known nvcc args diff --git a/cmake/KokkosConfig.cmake.in b/cmake/KokkosConfig.cmake.in index e26c75b3122..1b6d1b66ff5 100644 --- a/cmake/KokkosConfig.cmake.in +++ b/cmake/KokkosConfig.cmake.in @@ -39,10 +39,12 @@ IF("launch_compiler" IN_LIST Kokkos_FIND_COMPONENTS) GLOBAL CHECK_CUDA_COMPILES) -ELSEIF(@Kokkos_ENABLE_CUDA@ AND NOT "separable_compilation" IN_LIST Kokkos_FIND_COMPONENTS) +ELSEIF(@Kokkos_ENABLE_CUDA@ + AND NOT @KOKKOS_COMPILE_LANGUAGE@ STREQUAL CUDA + AND NOT "separable_compilation" IN_LIST Kokkos_FIND_COMPONENTS) # - # if CUDA was enabled, separable compilation was not specified, and current compiler - # cannot compile CUDA, then set the RULE_LAUNCH_COMPILE and RULE_LAUNCH_LINK globally and + # if CUDA was enabled, the compilation language was not set to CUDA, and separable compilation was not + # specified, then set the RULE_LAUNCH_COMPILE and RULE_LAUNCH_LINK globally and # kokkos_launch_compiler will re-direct to the compiler used to compile CUDA code during installation. # kokkos_launch_compiler will re-direct if ${CMAKE_CXX_COMPILER} and -DKOKKOS_DEPENDENCE is present, # otherwise, the original command will be executed diff --git a/cmake/KokkosCore_config.h.in b/cmake/KokkosCore_config.h.in index 9930d2abf0f..2df0f6c5205 100644 --- a/cmake/KokkosCore_config.h.in +++ b/cmake/KokkosCore_config.h.in @@ -23,8 +23,6 @@ #cmakedefine KOKKOS_ENABLE_CUDA #cmakedefine KOKKOS_ENABLE_HIP #cmakedefine KOKKOS_ENABLE_HPX -#cmakedefine KOKKOS_ENABLE_MEMKIND -#cmakedefine KOKKOS_ENABLE_LIBRT #cmakedefine KOKKOS_ENABLE_SYCL #cmakedefine KOKKOS_IMPL_SYCL_DEVICE_GLOBAL_SUPPORTED @@ -32,6 +30,7 @@ #cmakedefine KOKKOS_ENABLE_CXX17 #cmakedefine KOKKOS_ENABLE_CXX20 #cmakedefine KOKKOS_ENABLE_CXX23 +#cmakedefine KOKKOS_ENABLE_CXX26 #cmakedefine KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE #cmakedefine KOKKOS_ENABLE_CUDA_UVM @@ -45,7 +44,6 @@ #cmakedefine KOKKOS_ENABLE_DEBUG_DUALVIEW_MODIFY_CHECK #cmakedefine KOKKOS_ENABLE_DEBUG_BOUNDS_CHECK #cmakedefine KOKKOS_ENABLE_TUNING -#cmakedefine KOKKOS_ENABLE_DEPRECATED_CODE_3 #cmakedefine KOKKOS_ENABLE_DEPRECATED_CODE_4 #cmakedefine KOKKOS_ENABLE_DEPRECATION_WARNINGS #cmakedefine KOKKOS_ENABLE_LARGE_MEM_TESTS @@ -53,17 +51,15 @@ #cmakedefine KOKKOS_OPT_RANGE_AGGRESSIVE_VECTORIZATION // deprecated #cmakedefine KOKKOS_ENABLE_AGGRESSIVE_VECTORIZATION #cmakedefine KOKKOS_ENABLE_IMPL_MDSPAN +#cmakedefine KOKKOS_ENABLE_ATOMICS_BYPASS /* TPL Settings */ #cmakedefine KOKKOS_ENABLE_HWLOC -#cmakedefine KOKKOS_USE_LIBRT -#cmakedefine KOKKOS_ENABLE_HBWSPACE #cmakedefine KOKKOS_ENABLE_LIBDL #cmakedefine KOKKOS_ENABLE_LIBQUADMATH -#cmakedefine KOKKOS_IMPL_CUDA_CLANG_WORKAROUND #cmakedefine KOKKOS_ENABLE_ONEDPL +#cmakedefine KOKKOS_ENABLE_ROCTHRUST -#cmakedefine KOKKOS_ARCH_SSE42 #cmakedefine KOKKOS_ARCH_ARMV80 #cmakedefine KOKKOS_ARCH_ARMV8_THUNDERX #cmakedefine KOKKOS_ARCH_ARMV81 @@ -78,6 +74,7 @@ #cmakedefine KOKKOS_ARCH_POWER7 #cmakedefine KOKKOS_ARCH_POWER8 #cmakedefine 
KOKKOS_ARCH_POWER9
+#cmakedefine KOKKOS_ARCH_RISCV_SG2042
 #cmakedefine KOKKOS_ARCH_INTEL_GEN
 #cmakedefine KOKKOS_ARCH_INTEL_DG1
 #cmakedefine KOKKOS_ARCH_INTEL_GEN9
diff --git a/cmake/Modules/FindTPLCUDA.cmake b/cmake/Modules/FindTPLCUDA.cmake
index 792c92c07e9..5a62c530fce 100644
--- a/cmake/Modules/FindTPLCUDA.cmake
+++ b/cmake/Modules/FindTPLCUDA.cmake
@@ -7,7 +7,8 @@ IF (NOT CUDAToolkit_ROOT)
   ENDIF()
 ENDIF()

-IF(CMAKE_VERSION VERSION_GREATER_EQUAL "3.17.0")
+# FIXME CMake 3.28.4 creates more targets than we export
+IF(CMAKE_VERSION VERSION_GREATER_EQUAL "3.17.0" AND CMAKE_VERSION VERSION_LESS "3.28.4")
   find_package(CUDAToolkit)
 ELSE()
   include(${CMAKE_CURRENT_LIST_DIR}/CudaToolkit.cmake)
diff --git a/cmake/Modules/FindTPLLIBRT.cmake b/cmake/Modules/FindTPLLIBRT.cmake
deleted file mode 100644
index e75da56b5b5..00000000000
--- a/cmake/Modules/FindTPLLIBRT.cmake
+++ /dev/null
@@ -1 +0,0 @@
-KOKKOS_FIND_IMPORTED(LIBRT HEADER time.h LIBRARY rt)
diff --git a/cmake/Modules/FindTPLMEMKIND.cmake b/cmake/Modules/FindTPLMEMKIND.cmake
deleted file mode 100644
index 20aaff22955..00000000000
--- a/cmake/Modules/FindTPLMEMKIND.cmake
+++ /dev/null
@@ -1 +0,0 @@
-KOKKOS_FIND_IMPORTED(MEMKIND HEADER memkind.h LIBRARY memkind)
diff --git a/cmake/Modules/FindTPLONEDPL.cmake b/cmake/Modules/FindTPLONEDPL.cmake
index 01791cff443..603510c315e 100644
--- a/cmake/Modules/FindTPLONEDPL.cmake
+++ b/cmake/Modules/FindTPLONEDPL.cmake
@@ -43,4 +43,7 @@ ELSE()
     COMPILE_DEFINITIONS PSTL_USE_PARALLEL_POLICIES=0 _GLIBCXX_USE_TBB_PAR_BACKEND=0
   )
 ENDIF()
+
+  # Export oneDPL as a Kokkos dependency
+  KOKKOS_EXPORT_CMAKE_TPL(oneDPL)
 ENDIF()
diff --git a/cmake/Modules/FindTPLROCTHRUST.cmake b/cmake/Modules/FindTPLROCTHRUST.cmake
new file mode 100644
index 00000000000..dae7dc3c952
--- /dev/null
+++ b/cmake/Modules/FindTPLROCTHRUST.cmake
@@ -0,0 +1,15 @@
+# ROCm 5.6 and earlier set AMDGPU_TARGETS and GPU_TARGETS to all the supported
+# architectures. Therefore, we end up compiling Kokkos for all the supported
+# architectures. Starting with ROCm 5.7, AMDGPU_TARGETS and GPU_TARGETS are empty.
+# It is the user's job to set the variables. Since we are injecting the
+# architecture flag ourselves, we can leave the variables empty. To replicate the
+# behavior of ROCm 5.7 and later for earlier versions of ROCm, we set
+# AMDGPU_TARGETS and GPU_TARGETS to empty and set the values in the cache. If
+# the values are not cached, FIND_PACKAGE(rocthrust) will overwrite them.
+SET(AMDGPU_TARGETS "" CACHE STRING "AMD GPU targets to compile for") +SET(GPU_TARGETS "" CACHE STRING "GPU targets to compile for") +FIND_PACKAGE(rocthrust REQUIRED) +KOKKOS_CREATE_IMPORTED_TPL(ROCTHRUST INTERFACE LINK_LIBRARIES roc::rocthrust) + +# Export ROCTHRUST as a Kokkos dependency +KOKKOS_EXPORT_CMAKE_TPL(rocthrust) diff --git a/cmake/kokkos_arch.cmake b/cmake/kokkos_arch.cmake index 30764bde860..34e9f05986f 100644 --- a/cmake/kokkos_arch.cmake +++ b/cmake/kokkos_arch.cmake @@ -49,7 +49,6 @@ DECLARE_AND_CHECK_HOST_ARCH(ARMV81 "ARMv8.1 Compatible CPU") DECLARE_AND_CHECK_HOST_ARCH(ARMV8_THUNDERX "ARMv8 Cavium ThunderX CPU") DECLARE_AND_CHECK_HOST_ARCH(ARMV8_THUNDERX2 "ARMv8 Cavium ThunderX2 CPU") DECLARE_AND_CHECK_HOST_ARCH(A64FX "ARMv8.2 with SVE Support") -DECLARE_AND_CHECK_HOST_ARCH(WSM "Intel Westmere CPU") DECLARE_AND_CHECK_HOST_ARCH(SNB "Intel Sandy/Ivy Bridge CPUs") DECLARE_AND_CHECK_HOST_ARCH(HSW "Intel Haswell CPUs") DECLARE_AND_CHECK_HOST_ARCH(BDW "Intel Broadwell Xeon E-class CPUs") @@ -60,13 +59,12 @@ DECLARE_AND_CHECK_HOST_ARCH(SKX "Intel Skylake Xeon Server CPUs (A DECLARE_AND_CHECK_HOST_ARCH(KNC "Intel Knights Corner Xeon Phi") DECLARE_AND_CHECK_HOST_ARCH(KNL "Intel Knights Landing Xeon Phi") DECLARE_AND_CHECK_HOST_ARCH(SPR "Intel Sapphire Rapids Xeon Server CPUs (AVX512)") -DECLARE_AND_CHECK_HOST_ARCH(BGQ "IBM Blue Gene Q") -DECLARE_AND_CHECK_HOST_ARCH(POWER7 "IBM POWER7 CPUs") DECLARE_AND_CHECK_HOST_ARCH(POWER8 "IBM POWER8 CPUs") DECLARE_AND_CHECK_HOST_ARCH(POWER9 "IBM POWER9 CPUs") DECLARE_AND_CHECK_HOST_ARCH(ZEN "AMD Zen architecture") DECLARE_AND_CHECK_HOST_ARCH(ZEN2 "AMD Zen2 architecture") DECLARE_AND_CHECK_HOST_ARCH(ZEN3 "AMD Zen3 architecture") +DECLARE_AND_CHECK_HOST_ARCH(RISCV_SG2042 "SG2042 (RISC-V) CPUs") IF(Kokkos_ENABLE_CUDA OR Kokkos_ENABLE_OPENMPTARGET OR Kokkos_ENABLE_OPENACC OR Kokkos_ENABLE_SYCL) SET(KOKKOS_SHOW_CUDA_ARCHS ON) @@ -191,9 +189,6 @@ IF (KOKKOS_CXX_COMPILER_ID STREQUAL Clang) ELSEIF(CUDAToolkit_BIN_DIR) GLOBAL_APPEND(KOKKOS_CUDA_OPTIONS --cuda-path=${CUDAToolkit_BIN_DIR}/..) 
ENDIF() - IF (KOKKOS_ENABLE_CUDA) - SET(KOKKOS_IMPL_CUDA_CLANG_WORKAROUND ON CACHE BOOL "enable CUDA Clang workarounds" FORCE) - ENDIF() ELSEIF (KOKKOS_CXX_COMPILER_ID STREQUAL NVHPC) SET(CUDA_ARCH_FLAG "-gpu") GLOBAL_APPEND(KOKKOS_CUDA_OPTIONS -cuda) @@ -342,18 +337,6 @@ IF (KOKKOS_ARCH_ZEN3) SET(KOKKOS_ARCH_AVX2 ON) ENDIF() -IF (KOKKOS_ARCH_WSM) - COMPILER_SPECIFIC_FLAGS( - COMPILER_ID KOKKOS_CXX_HOST_COMPILER_ID - Cray NO-VALUE-SPECIFIED - Intel -xSSE4.2 - MSVC NO-VALUE-SPECIFIED - NVHPC -tp=px - DEFAULT -msse4.2 - ) - SET(KOKKOS_ARCH_SSE42 ON) -ENDIF() - IF (KOKKOS_ARCH_SNB OR KOKKOS_ARCH_AMDAVX) SET(KOKKOS_ARCH_AVX ON) COMPILER_SPECIFIC_FLAGS( @@ -378,6 +361,23 @@ IF (KOKKOS_ARCH_HSW) ) ENDIF() +IF (KOKKOS_ARCH_RISCV_SG2042) + IF(NOT + (KOKKOS_CXX_COMPILER_ID STREQUAL GNU + AND KOKKOS_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL 12) + OR + (KOKKOS_CXX_COMPILER_ID STREQUAL Clang + AND KOKKOS_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL 14) + ) + MESSAGE(SEND_ERROR "Only gcc >= 12 and clang >= 14 support RISC-V.") + ENDIF() + COMPILER_SPECIFIC_FLAGS( + COMPILER_ID KOKKOS_CXX_HOST_COMPILER_ID + DEFAULT -march=rv64imafdcv + ) +ENDIF() + + IF (KOKKOS_ARCH_BDW) SET(KOKKOS_ARCH_AVX2 ON) COMPILER_SPECIFIC_FLAGS( @@ -571,6 +571,11 @@ IF (KOKKOS_ENABLE_HIP) COMPILER_SPECIFIC_FLAGS( DEFAULT -fgpu-rdc ) + IF (NOT KOKKOS_CXX_COMPILER_ID STREQUAL HIPCC) + COMPILER_SPECIFIC_LINK_OPTIONS( + DEFAULT --hip-link + ) + ENDIF() ELSE() COMPILER_SPECIFIC_FLAGS( DEFAULT -fno-gpu-rdc @@ -588,32 +593,44 @@ IF (KOKKOS_ENABLE_SYCL) ENDIF() # Check support for device_global variables -# FIXME_SYCL Once the feature test macro SYCL_EXT_ONEAPI_DEVICE_GLOBAL is -# available, use that instead. -IF(KOKKOS_ENABLE_SYCL AND NOT BUILD_SHARED_LIBS) - INCLUDE(CheckCXXSourceCompiles) +# FIXME_SYCL If SYCL_EXT_ONEAPI_DEVICE_GLOBAL is defined, we can use device +# global variables with shared libraries using the "non-separable compilation" +# implementation. Otherwise, the feature is not supported when building shared +# libraries. Thus, we don't even check for support if shared libraries are +# requested and SYCL_EXT_ONEAPI_DEVICE_GLOBAL is not defined. +IF(KOKKOS_ENABLE_SYCL) STRING(REPLACE ";" " " CMAKE_REQUIRED_FLAGS "${KOKKOS_COMPILE_OPTIONS}") - CHECK_CXX_SOURCE_COMPILES(" - #include - using namespace sycl::ext::oneapi::experimental; - using namespace sycl; - - SYCL_EXTERNAL device_global Foo; - - void bar(queue q) { - q.single_task([=] { - Foo = 42; - }); - } - - int main(){ return 0; } - " - KOKKOS_IMPL_SYCL_DEVICE_GLOBAL_SUPPORTED) - - IF(KOKKOS_IMPL_SYCL_DEVICE_GLOBAL_SUPPORTED) - COMPILER_SPECIFIC_FLAGS( - DEFAULT -fsycl-device-code-split=off -DDESUL_SYCL_DEVICE_GLOBAL_SUPPORTED - ) + INCLUDE(CheckCXXSymbolExists) + CHECK_CXX_SYMBOL_EXISTS(SYCL_EXT_ONEAPI_DEVICE_GLOBAL "sycl/sycl.hpp" KOKKOS_IMPL_HAVE_SYCL_EXT_ONEAPI_DEVICE_GLOBAL) + IF (KOKKOS_IMPL_HAVE_SYCL_EXT_ONEAPI_DEVICE_GLOBAL) + SET(KOKKOS_IMPL_SYCL_DEVICE_GLOBAL_SUPPORTED ON) + # Use the non-separable compilation implementation to support shared libraries as well. 
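(For context, the feature-test macro consulted above is defined by the SYCL extension itself. A minimal C++ probe in the spirit of this check might look as follows; this is a sketch, not the exact source CMake compiles:)

#include <sycl/sycl.hpp>

// Compilation fails when the extension is absent, mirroring the
// CHECK_CXX_SYMBOL_EXISTS(SYCL_EXT_ONEAPI_DEVICE_GLOBAL ...) probe.
#ifndef SYCL_EXT_ONEAPI_DEVICE_GLOBAL
#error "sycl_ext_oneapi_device_global not supported by this toolchain"
#endif

sycl::ext::oneapi::experimental::device_global<int> Foo;

int main() { return 0; }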
+    COMPILER_SPECIFIC_FLAGS(DEFAULT -DDESUL_SYCL_DEVICE_GLOBAL_SUPPORTED)
+  ELSEIF(NOT BUILD_SHARED_LIBS)
+    INCLUDE(CheckCXXSourceCompiles)
+    CHECK_CXX_SOURCE_COMPILES("
+      #include <sycl/sycl.hpp>
+      using namespace sycl::ext::oneapi::experimental;
+      using namespace sycl;
+
+      SYCL_EXTERNAL device_global<int> Foo;
+
+      void bar(queue q) {
+        q.single_task([=] {
+          Foo = 42;
+        });
+      }
+
+      int main(){ return 0; }
+      "
+      KOKKOS_IMPL_SYCL_DEVICE_GLOBAL_SUPPORTED)
+
+    IF(KOKKOS_IMPL_SYCL_DEVICE_GLOBAL_SUPPORTED)
+      # Only the separable compilation implementation is supported.
+      COMPILER_SPECIFIC_FLAGS(
+        DEFAULT -fsycl-device-code-split=off -DDESUL_SYCL_DEVICE_GLOBAL_SUPPORTED
+      )
+    ENDIF()
   ENDIF()
 ENDIF()
@@ -767,30 +784,35 @@ IF (KOKKOS_ENABLE_OPENMPTARGET)
     COMPILER_SPECIFIC_FLAGS(
       IntelLLVM -fopenmp-targets=spir64 -D__STRICT_ANSI__
     )
-  ELSEIF(KOKKOS_ARCH_INTEL_GEN9)
-    COMPILER_SPECIFIC_FLAGS(
-      IntelLLVM -fopenmp-targets=spir64_gen -Xopenmp-target-backend "-device gen9" -D__STRICT_ANSI__
-    )
-  ELSEIF(KOKKOS_ARCH_INTEL_GEN11)
-    COMPILER_SPECIFIC_FLAGS(
-      IntelLLVM -fopenmp-targets=spir64_gen -Xopenmp-target-backend "-device gen11" -D__STRICT_ANSI__
-    )
-  ELSEIF(KOKKOS_ARCH_INTEL_GEN12LP)
-    COMPILER_SPECIFIC_FLAGS(
-      IntelLLVM -fopenmp-targets=spir64_gen -Xopenmp-target-backend "-device gen12lp" -D__STRICT_ANSI__
-    )
-  ELSEIF(KOKKOS_ARCH_INTEL_DG1)
-    COMPILER_SPECIFIC_FLAGS(
-      IntelLLVM -fopenmp-targets=spir64_gen -Xopenmp-target-backend "-device dg1" -D__STRICT_ANSI__
-    )
-  ELSEIF(KOKKOS_ARCH_INTEL_XEHP)
-    COMPILER_SPECIFIC_FLAGS(
-      IntelLLVM -fopenmp-targets=spir64_gen -Xopenmp-target-backend "-device 12.50.4" -D__STRICT_ANSI__
+  ELSE()
+    COMPILER_SPECIFIC_OPTIONS(
+      IntelLLVM -fopenmp-targets=spir64_gen -D__STRICT_ANSI__
     )
-  ELSEIF(KOKKOS_ARCH_INTEL_PVC)
-    COMPILER_SPECIFIC_FLAGS(
-      IntelLLVM -fopenmp-targets=spir64_gen -Xopenmp-target-backend "-device 12.60.7" -D__STRICT_ANSI__
+    IF(KOKKOS_ARCH_INTEL_GEN9)
+      COMPILER_SPECIFIC_LINK_OPTIONS(
+        IntelLLVM -fopenmp-targets=spir64_gen -Xopenmp-target-backend "-device gen9"
+      )
+    ELSEIF(KOKKOS_ARCH_INTEL_GEN11)
+      COMPILER_SPECIFIC_LINK_OPTIONS(
+        IntelLLVM -fopenmp-targets=spir64_gen -Xopenmp-target-backend "-device gen11"
+      )
+    ELSEIF(KOKKOS_ARCH_INTEL_GEN12LP)
+      COMPILER_SPECIFIC_LINK_OPTIONS(
+        IntelLLVM -fopenmp-targets=spir64_gen -Xopenmp-target-backend "-device gen12lp"
+      )
+    ELSEIF(KOKKOS_ARCH_INTEL_DG1)
+      COMPILER_SPECIFIC_LINK_OPTIONS(
+        IntelLLVM -fopenmp-targets=spir64_gen -Xopenmp-target-backend "-device dg1"
+      )
+    ELSEIF(KOKKOS_ARCH_INTEL_XEHP)
+      COMPILER_SPECIFIC_LINK_OPTIONS(
+        IntelLLVM -fopenmp-targets=spir64_gen -Xopenmp-target-backend "-device 12.50.4"
+      )
+    ELSEIF(KOKKOS_ARCH_INTEL_PVC)
+      COMPILER_SPECIFIC_LINK_OPTIONS(
+        IntelLLVM -fopenmp-targets=spir64_gen -Xopenmp-target-backend "-device 12.60.7"
       )
+    ENDIF()
   ENDIF()
 ENDIF()
@@ -1130,3 +1152,14 @@ MESSAGE(STATUS "Architectures:")
 FOREACH(Arch ${KOKKOS_ENABLED_ARCH_LIST})
   MESSAGE(STATUS " ${Arch}")
 ENDFOREACH()
+
+
+IF(KOKKOS_ENABLE_ATOMICS_BYPASS)
+  IF(NOT _HOST_PARALLEL STREQUAL "NoTypeDefined" OR NOT _DEVICE_PARALLEL STREQUAL "NoTypeDefined")
+    MESSAGE(FATAL_ERROR "Not allowed to disable atomics (via -DKokkos_ENABLE_ATOMICS_BYPASS=ON) if a host parallel or a device backend is enabled!")
+  ENDIF()
+  IF(NOT KOKKOS_ENABLE_SERIAL)
+    MESSAGE(FATAL_ERROR "Implementation bug")  # safeguard
+  ENDIF()
+  MESSAGE(STATUS "Atomics: **DISABLED**")
+ENDIF()
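What the bypass means semantically, as a hedged sketch rather than Kokkos's actual implementation: with no host-parallel and no device backend there is only one thread of execution, so an atomic update may legally degrade to a plain read-modify-write (the helper name below is hypothetical; KOKKOS_ENABLE_ATOMICS_BYPASS is the config macro added above):

#include <Kokkos_Core.hpp>

template <class T>
KOKKOS_INLINE_FUNCTION void add_maybe_bypassed(T* dest, T val) {
#ifdef KOKKOS_ENABLE_ATOMICS_BYPASS
  *dest += val;                   // single-threaded: no race is possible
#else
  Kokkos::atomic_add(dest, val);  // real atomic read-modify-write
#endif
}

diff --git a/cmake/kokkos_compiler_id.cmake b/cmake/kokkos_compiler_id.cmake
index 04589befc3a..9135ca2b41c 100644
--- a/cmake/kokkos_compiler_id.cmake 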
+++ b/cmake/kokkos_compiler_id.cmake @@ -152,6 +152,7 @@ ENDIF() SET(KOKKOS_MESSAGE_TEXT "Compiler not supported by Kokkos. Required compiler versions:") SET(KOKKOS_MESSAGE_TEXT "${KOKKOS_MESSAGE_TEXT}\n Clang(CPU) 8.0.0 or higher") SET(KOKKOS_MESSAGE_TEXT "${KOKKOS_MESSAGE_TEXT}\n Clang(CUDA) 10.0.0 or higher") +SET(KOKKOS_MESSAGE_TEXT "${KOKKOS_MESSAGE_TEXT}\n Clang(OpenMPTarget) 15.0.0 or higher") SET(KOKKOS_MESSAGE_TEXT "${KOKKOS_MESSAGE_TEXT}\n GCC 8.2.0 or higher") SET(KOKKOS_MESSAGE_TEXT "${KOKKOS_MESSAGE_TEXT}\n Intel 19.0.5 or higher") SET(KOKKOS_MESSAGE_TEXT "${KOKKOS_MESSAGE_TEXT}\n IntelLLVM(CPU) 2021.1.1 or higher") @@ -210,6 +211,10 @@ ELSEIF(KOKKOS_CXX_COMPILER_ID STREQUAL "MSVC") ENDIF() ELSEIF(KOKKOS_CXX_COMPILER_ID STREQUAL XL OR KOKKOS_CXX_COMPILER_ID STREQUAL XLClang) MESSAGE(FATAL_ERROR "${KOKKOS_MESSAGE_TEXT}") +ELSEIF(KOKKOS_CXX_COMPILER_ID STREQUAL Clang AND Kokkos_ENABLE_OPENMPTARGET) + IF(KOKKOS_CXX_COMPILER_VERSION VERSION_LESS 15.0.0) + MESSAGE(FATAL_ERROR "${KOKKOS_MESSAGE_TEXT}") + ENDIF() ENDIF() IF(NOT DEFINED KOKKOS_CXX_HOST_COMPILER_ID) diff --git a/cmake/kokkos_enable_options.cmake b/cmake/kokkos_enable_options.cmake index 89e23b019bd..a437f6132aa 100644 --- a/cmake/kokkos_enable_options.cmake +++ b/cmake/kokkos_enable_options.cmake @@ -48,7 +48,6 @@ KOKKOS_ENABLE_OPTION(CUDA_LAMBDA ${CUDA_LAMBDA_DEFAULT} "Whether to allow lambda # resolved but we keep the option around a bit longer to be safe. KOKKOS_ENABLE_OPTION(IMPL_CUDA_MALLOC_ASYNC ON "Whether to enable CudaMallocAsync (requires CUDA Toolkit 11.2)") KOKKOS_ENABLE_OPTION(IMPL_NVHPC_AS_DEVICE_COMPILER OFF "Whether to allow nvc++ as Cuda device compiler") -KOKKOS_ENABLE_OPTION(DEPRECATED_CODE_3 OFF "Whether code deprecated in major release 3 is available" ) KOKKOS_ENABLE_OPTION(DEPRECATED_CODE_4 ON "Whether code deprecated in major release 4 is available" ) KOKKOS_ENABLE_OPTION(DEPRECATION_WARNINGS ON "Whether to emit deprecation warnings" ) KOKKOS_ENABLE_OPTION(HIP_RELOCATABLE_DEVICE_CODE OFF "Whether to enable relocatable device code (RDC) for HIP") @@ -74,6 +73,7 @@ KOKKOS_ENABLE_OPTION(HIP_MULTIPLE_KERNEL_INSTANTIATIONS OFF "Whether multiple ke # This option will go away eventually, but allows fallback to old implementation when needed. 
KOKKOS_ENABLE_OPTION(DESUL_ATOMICS_EXTERNAL OFF "Whether to use an external desul installation") +KOKKOS_ENABLE_OPTION(ATOMICS_BYPASS OFF "**NOT RECOMMENDED** Whether to make atomics non-atomic for non-threaded MPI-only use cases") KOKKOS_ENABLE_OPTION(IMPL_MDSPAN OFF "Whether to enable experimental mdspan support") KOKKOS_ENABLE_OPTION(MDSPAN_EXTERNAL OFF BOOL "Whether to use an external version of mdspan") diff --git a/cmake/kokkos_pick_cxx_std.cmake b/cmake/kokkos_pick_cxx_std.cmake index d4eca651d42..ae14a10d531 100644 --- a/cmake/kokkos_pick_cxx_std.cmake +++ b/cmake/kokkos_pick_cxx_std.cmake @@ -7,6 +7,7 @@ KOKKOS_OPTION(CXX_STANDARD "" STRING "[[DEPRECATED - USE CMAKE_CXX_STANDARD INST SET(KOKKOS_ENABLE_CXX17 OFF) SET(KOKKOS_ENABLE_CXX20 OFF) SET(KOKKOS_ENABLE_CXX23 OFF) +SET(KOKKOS_ENABLE_CXX26 OFF) IF (KOKKOS_CXX_STANDARD) MESSAGE(FATAL_ERROR "Setting the variable Kokkos_CXX_STANDARD in configuration is deprecated - set CMAKE_CXX_STANDARD directly instead") ENDIF() diff --git a/cmake/kokkos_test_cxx_std.cmake b/cmake/kokkos_test_cxx_std.cmake index 7ad49fdd2d9..b075a3e36b5 100644 --- a/cmake/kokkos_test_cxx_std.cmake +++ b/cmake/kokkos_test_cxx_std.cmake @@ -74,6 +74,10 @@ ELSEIF(KOKKOS_CXX_STANDARD STREQUAL "23") kokkos_set_cxx_standard_feature(23) SET(KOKKOS_CXX_INTERMEDIATE_STANDARD "2B") SET(KOKKOS_ENABLE_CXX23 ON) +ELSEIF(KOKKOS_CXX_STANDARD STREQUAL "26") + kokkos_set_cxx_standard_feature(26) + SET(KOKKOS_CXX_INTERMEDIATE_STANDARD "2C") + SET(KOKKOS_ENABLE_CXX26 ON) ELSE() MESSAGE(FATAL_ERROR "Kokkos requires C++17 or newer but requested ${KOKKOS_CXX_STANDARD}!") ENDIF() diff --git a/cmake/kokkos_tpls.cmake b/cmake/kokkos_tpls.cmake index f124596a84e..6ef3b79bde2 100644 --- a/cmake/kokkos_tpls.cmake +++ b/cmake/kokkos_tpls.cmake @@ -32,19 +32,21 @@ FUNCTION(KOKKOS_TPL_OPTION PKG DEFAULT) ENDFUNCTION() KOKKOS_TPL_OPTION(HWLOC Off TRIBITS HWLOC) -KOKKOS_TPL_OPTION(MEMKIND Off) -IF(KOKKOS_ENABLE_MEMKIND) - SET(KOKKOS_ENABLE_HBWSPACE ON) -ENDIF() KOKKOS_TPL_OPTION(CUDA ${Kokkos_ENABLE_CUDA} TRIBITS CUDA) -KOKKOS_TPL_OPTION(LIBRT Off) IF(KOKKOS_ENABLE_HIP AND NOT KOKKOS_CXX_COMPILER_ID STREQUAL HIPCC AND NOT KOKKOS_HAS_TRILINOS) SET(ROCM_DEFAULT ON) ELSE() SET(ROCM_DEFAULT OFF) ENDIF() +IF(KOKKOS_ENABLE_HIP AND NOT KOKKOS_HAS_TRILINOS) + SET(ROCTHRUST_DEFAULT ON) +ELSE() + SET(ROCTHRUST_DEFAULT OFF) +ENDIF() KOKKOS_TPL_OPTION(ROCM ${ROCM_DEFAULT}) +KOKKOS_TPL_OPTION(ROCTHRUST ${ROCTHRUST_DEFAULT}) + IF(KOKKOS_ENABLE_SYCL AND NOT KOKKOS_HAS_TRILINOS) SET(ONEDPL_DEFAULT ON) ELSE() @@ -77,21 +79,18 @@ KOKKOS_TPL_OPTION(LIBQUADMATH ${LIBQUADMATH_DEFAULT} TRIBITS quadmath) #Make sure we use our local FindKokkosCuda.cmake KOKKOS_IMPORT_TPL(HPX INTERFACE) -IF (NOT KOKKOS_ENABLE_COMPILE_AS_CMAKE_LANGUAGE) - KOKKOS_IMPORT_TPL(CUDA INTERFACE) -ENDIF() +KOKKOS_IMPORT_TPL(CUDA INTERFACE) KOKKOS_IMPORT_TPL(HWLOC) -KOKKOS_IMPORT_TPL(LIBRT) KOKKOS_IMPORT_TPL(LIBDL) -KOKKOS_IMPORT_TPL(MEMKIND) IF (NOT WIN32) KOKKOS_IMPORT_TPL(THREADS INTERFACE) ENDIF() IF (NOT KOKKOS_ENABLE_COMPILE_AS_CMAKE_LANGUAGE) KOKKOS_IMPORT_TPL(ROCM INTERFACE) - KOKKOS_IMPORT_TPL(ONEDPL INTERFACE) ENDIF() +KOKKOS_IMPORT_TPL(ONEDPL INTERFACE) KOKKOS_IMPORT_TPL(LIBQUADMATH) +KOKKOS_IMPORT_TPL(ROCTHRUST) IF (Kokkos_ENABLE_DESUL_ATOMICS_EXTERNAL) find_package(desul REQUIRED COMPONENTS atomics) @@ -119,7 +118,3 @@ STRING(REPLACE ";" "\n" KOKKOS_TPL_EXPORT_TEMP "${KOKKOS_TPL_EXPORTS}") #Convert to a regular variable UNSET(KOKKOS_TPL_EXPORTS CACHE) SET(KOKKOS_TPL_EXPORTS ${KOKKOS_TPL_EXPORT_TEMP}) -IF (KOKKOS_ENABLE_MEMKIND) - 
SET(KOKKOS_ENABLE_HBWSPACE) - LIST(APPEND KOKKOS_MEMSPACE_LIST HBWSpace) -ENDIF() diff --git a/cmake/kokkos_tribits.cmake b/cmake/kokkos_tribits.cmake index b30ca70ab95..060a7a8472c 100644 --- a/cmake/kokkos_tribits.cmake +++ b/cmake/kokkos_tribits.cmake @@ -237,18 +237,10 @@ ENDMACRO() ## KOKKOS_DECLARE is the declaration set ## KOKKOS_POST_INCLUDE is included at the end of Kokkos_Core.hpp MACRO(KOKKOS_CONFIGURE_CORE) - SET(FWD_BACKEND_LIST) - FOREACH(MEMSPACE ${KOKKOS_MEMSPACE_LIST}) - LIST(APPEND FWD_BACKEND_LIST ${MEMSPACE}) - ENDFOREACH() - FOREACH(BACKEND_ ${KOKKOS_ENABLED_DEVICES}) - LIST(APPEND FWD_BACKEND_LIST ${BACKEND_}) - ENDFOREACH() - MESSAGE(STATUS "Kokkos Devices: ${KOKKOS_ENABLED_DEVICES}, Kokkos Backends: ${FWD_BACKEND_LIST}") - KOKKOS_CONFIG_HEADER( KokkosCore_Config_HeaderSet.in KokkosCore_Config_FwdBackend.hpp "KOKKOS_FWD" "fwd/Kokkos_Fwd" "${FWD_BACKEND_LIST}") + MESSAGE(STATUS "Kokkos Backends: ${KOKKOS_ENABLED_DEVICES}") + KOKKOS_CONFIG_HEADER( KokkosCore_Config_HeaderSet.in KokkosCore_Config_FwdBackend.hpp "KOKKOS_FWD" "fwd/Kokkos_Fwd" "${KOKKOS_ENABLED_DEVICES}") KOKKOS_CONFIG_HEADER( KokkosCore_Config_HeaderSet.in KokkosCore_Config_SetupBackend.hpp "KOKKOS_SETUP" "setup/Kokkos_Setup" "${DEVICE_SETUP_LIST}") - KOKKOS_CONFIG_HEADER( KokkosCore_Config_HeaderSet.in KokkosCore_Config_DeclareBackend.hpp "KOKKOS_DECLARE" "decl/Kokkos_Declare" "${FWD_BACKEND_LIST}") - KOKKOS_CONFIG_HEADER( KokkosCore_Config_HeaderSet.in KokkosCore_Config_PostInclude.hpp "KOKKOS_POST_INCLUDE" "Kokkos_Post_Include" "${KOKKOS_BACKEND_POST_INCLUDE_LIST}") + KOKKOS_CONFIG_HEADER( KokkosCore_Config_HeaderSet.in KokkosCore_Config_DeclareBackend.hpp "KOKKOS_DECLARE" "decl/Kokkos_Declare" "${KOKKOS_ENABLED_DEVICES}") SET(_DEFAULT_HOST_MEMSPACE "::Kokkos::HostSpace") KOKKOS_OPTION(DEFAULT_DEVICE_MEMORY_SPACE "" STRING "Override default device memory space") KOKKOS_OPTION(DEFAULT_HOST_MEMORY_SPACE "" STRING "Override default host memory space") @@ -309,7 +301,6 @@ MACRO(KOKKOS_INSTALL_ADDITIONAL_FILES) "${CMAKE_CURRENT_BINARY_DIR}/KokkosCore_Config_FwdBackend.hpp" "${CMAKE_CURRENT_BINARY_DIR}/KokkosCore_Config_SetupBackend.hpp" "${CMAKE_CURRENT_BINARY_DIR}/KokkosCore_Config_DeclareBackend.hpp" - "${CMAKE_CURRENT_BINARY_DIR}/KokkosCore_Config_PostInclude.hpp" DESTINATION ${KOKKOS_HEADER_DIR}) ENDMACRO() diff --git a/config/test_all_sandia b/config/test_all_sandia deleted file mode 100755 index 193a162a4e6..00000000000 --- a/config/test_all_sandia +++ /dev/null @@ -1,773 +0,0 @@ -#!/bin/bash -e - -# -# Global config -# - -set -o pipefail - -# Determine current machine. - -MACHINE="" -HOSTNAME=$(hostname) -PROCESSOR=`uname -p` - -if [[ "$HOSTNAME" =~ (white|ride).* ]]; then - MACHINE=white - module load git -fi - -if [[ "$HOSTNAME" =~ .*bowman.* ]]; then - MACHINE=bowman - module load git -fi - -if [[ "$HOSTNAME" == n* ]]; then # Warning: very generic name - if [[ "$PROCESSOR" = "aarch64" ]]; then - MACHINE=sullivan - module load git - fi -fi - -if [[ "$HOSTNAME" == node* ]]; then # Warning: very generic name - if [[ "$MACHINE" = "" ]]; then - MACHINE=shepard - module load git - fi -fi - -if [[ "$HOSTNAME" == apollo\.* ]]; then - MACHINE=apollo - module load git -fi - -if [[ "$HOSTNAME" == sullivan ]]; then - MACHINE=sullivan - module load git -fi - -if [[ "$HOSTNAME" == mayer\.* ]]; then - MACHINE=mayer -# module load git -fi -if [[ "$HOSTNAME" == cn* ]]; then # Warning: very generic name - MACHINE=mayer -fi - -if [ ! 
-z "$SEMS_MODULEFILES_ROOT" ]; then - if [[ "$MACHINE" = "" ]]; then - MACHINE=sems - module load sems-git - fi -fi - -if [[ "$MACHINE" = "" ]]; then - echo "Unrecognized machine" >&2 - exit 1 -fi - -echo "Running on machine: $MACHINE" - -GCC_BUILD_LIST="OpenMP,Pthread,Serial,OpenMP_Serial,Pthread_Serial" -IBM_BUILD_LIST="OpenMP,Serial,OpenMP_Serial" -ARM_GCC_BUILD_LIST="OpenMP,Serial,OpenMP_Serial" -INTEL_BUILD_LIST="OpenMP,Pthread,Serial,OpenMP_Serial,Pthread_Serial" -CLANG_BUILD_LIST="Pthread,Serial,Pthread_Serial" -CUDA_BUILD_LIST="Cuda_OpenMP,Cuda_Pthread,Cuda_Serial" -CUDA_IBM_BUILD_LIST="Cuda_OpenMP,Cuda_Serial" - -GCC_WARNING_FLAGS="-Wall,-Wunused-parameter,-Wshadow,-pedantic,-Werror,-Wsign-compare,-Wtype-limits,-Wignored-qualifiers,-Wempty-body,-Wclobbered,-Wuninitialized" -IBM_WARNING_FLAGS="-Wall,-Wunused-parameter,-Wshadow,-pedantic,-Werror,-Wsign-compare,-Wtype-limits,-Wuninitialized" -CLANG_WARNING_FLAGS="-Wall,-Wunused-parameter,-Wshadow,-pedantic,-Werror,-Wsign-compare,-Wtype-limits,-Wuninitialized" -INTEL_WARNING_FLAGS="-Wall,-Wunused-parameter,-Wshadow,-pedantic,-Werror,-Wsign-compare,-Wtype-limits,-Wuninitialized" -#CUDA_WARNING_FLAGS="-Wall,-Wunused-parameter,-Wshadow,-pedantic,-Werror,-Wsign-compare,-Wtype-limits,-Wuninitialized" -CUDA_WARNING_FLAGS="-Wall,-Wunused-parameter,-Wshadow,-pedantic,-Wsign-compare,-Wtype-limits,-Wuninitialized" -PGI_WARNING_FLAGS="" - -# Default. Machine specific can override. -DEBUG=False -ARGS="" -CUSTOM_BUILD_LIST="" -DRYRUN=False -BUILD_ONLY=False -declare -i NUM_JOBS_TO_RUN_IN_PARALLEL=1 -TEST_SCRIPT=False -SKIP_HWLOC=False -SPOT_CHECK=False - -PRINT_HELP=False -OPT_FLAG="" -CXX_FLAGS_EXTRA="" -LD_FLAGS_EXTRA="" -KOKKOS_OPTIONS="" - -# -# Handle arguments. -# - -while [[ $# > 0 ]] -do - key="$1" - - case $key in - --kokkos-path*) - KOKKOS_PATH="${key#*=}" - ;; - --build-list*) - CUSTOM_BUILD_LIST="${key#*=}" - ;; - --debug*) - DEBUG=True - ;; - --build-only*) - BUILD_ONLY=True - ;; - --test-script*) - TEST_SCRIPT=True - ;; - --skip-hwloc*) - SKIP_HWLOC=True - ;; - --num*) - NUM_JOBS_TO_RUN_IN_PARALLEL="${key#*=}" - ;; - --dry-run*) - DRYRUN=True - ;; - --spot-check*) - SPOT_CHECK=True - ;; - --arch*) - ARCH_FLAG="--arch=${key#*=}" - ;; - --opt-flag*) - OPT_FLAG="${key#*=}" - ;; - --with-cuda-options*) - KOKKOS_CUDA_OPTIONS="--with-cuda-options=${key#*=}" - ;; - --with-options*) - KOKKOS_OPTIONS="--with-options=enable_large_mem_tests,${key#*=}" - ;; - --cxxflags-extra*) - CXX_FLAGS_EXTRA="${key#*=}" - ;; - --ldflags-extra*) - LD_FLAGS_EXTRA="${key#*=}" - ;; - --help*) - PRINT_HELP=True - ;; - *) - # args, just append - ARGS="$ARGS $1" - ;; - esac - - shift -done - -SCRIPT_KOKKOS_ROOT=$( cd "$( dirname "$0" )" && cd .. && pwd ) - -# Set kokkos path. -if [ -z "$KOKKOS_PATH" ]; then - KOKKOS_PATH=$SCRIPT_KOKKOS_ROOT -else - # Ensure KOKKOS_PATH is abs path. - KOKKOS_PATH=$( cd $KOKKOS_PATH && pwd ) -fi - -UNCOMMITTED=`cd ${KOKKOS_PATH}; git status --porcelain 2>/dev/null` -if ! [ -z "$UNCOMMITTED" ]; then - echo "WARNING!! THE FOLLOWING CHANGES ARE UNCOMMITTED!! :" - echo "$UNCOMMITTED" - echo "" -fi - -GITSTATUS=`cd ${KOKKOS_PATH}; git log -n 1 --format=oneline` -echo "Repository Status: " ${GITSTATUS} -echo "" -echo "" - -# -# Machine specific config. 
-# - -if [ "$MACHINE" = "sems" ]; then - source /projects/sems/modulefiles/utils/sems-modules-init.sh - - BASE_MODULE_LIST="sems-env,kokkos-env,kokkos-hwloc/1.10.1/base,sems-/" - CUDA_MODULE_LIST="sems-env,kokkos-env,kokkos-/,sems-gcc/4.8.4,kokkos-hwloc/1.10.1/base" - CUDA8_MODULE_LIST="sems-env,kokkos-env,kokkos-/,sems-gcc/5.3.0,kokkos-hwloc/1.10.1/base" - - if [ -z "$ARCH_FLAG" ]; then - ARCH_FLAG="" - fi - - if [ "$SPOT_CHECK" = "True" ]; then - # Format: (compiler module-list build-list exe-name warning-flag) - COMPILERS=("gcc/5.3.0 $BASE_MODULE_LIST "OpenMP" g++ $GCC_WARNING_FLAGS" - "gcc/6.1.0 $BASE_MODULE_LIST "Serial" g++ $GCC_WARNING_FLAGS" - "intel/17.0.1 $BASE_MODULE_LIST "OpenMP" icpc $INTEL_WARNING_FLAGS" - "clang/3.9.0 $BASE_MODULE_LIST "Pthread_Serial" clang++ $CLANG_WARNING_FLAGS" - "cuda/8.0.44 $CUDA8_MODULE_LIST "Cuda_OpenMP" $KOKKOS_PATH/bin/nvcc_wrapper $CUDA_WARNING_FLAGS" - ) - else - # Format: (compiler module-list build-list exe-name warning-flag) - COMPILERS=("gcc/4.8.4 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS" - "gcc/4.9.3 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS" - "gcc/5.3.0 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS" - "gcc/6.1.0 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS" - "intel/15.0.2 $BASE_MODULE_LIST $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS" - "intel/16.0.1 $BASE_MODULE_LIST $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS" - "intel/16.0.3 $BASE_MODULE_LIST $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS" - "clang/3.6.1 $BASE_MODULE_LIST $CLANG_BUILD_LIST clang++ $CLANG_WARNING_FLAGS" - "clang/3.7.1 $BASE_MODULE_LIST $CLANG_BUILD_LIST clang++ $CLANG_WARNING_FLAGS" - "clang/3.8.1 $BASE_MODULE_LIST $CLANG_BUILD_LIST clang++ $CLANG_WARNING_FLAGS" - "clang/3.9.0 $BASE_MODULE_LIST $CLANG_BUILD_LIST clang++ $CLANG_WARNING_FLAGS" - "cuda/7.0.28 $CUDA_MODULE_LIST $CUDA_BUILD_LIST $KOKKOS_PATH/bin/nvcc_wrapper $CUDA_WARNING_FLAGS" - "cuda/7.5.18 $CUDA_MODULE_LIST $CUDA_BUILD_LIST $KOKKOS_PATH/bin/nvcc_wrapper $CUDA_WARNING_FLAGS" - "cuda/8.0.44 $CUDA8_MODULE_LIST $CUDA_BUILD_LIST $KOKKOS_PATH/bin/nvcc_wrapper $CUDA_WARNING_FLAGS" - ) - fi -elif [ "$MACHINE" = "white" ]; then - source /etc/profile.d/modules.sh - SKIP_HWLOC=True - export SLURM_TASKS_PER_NODE=32 - - BASE_MODULE_LIST="/" - IBM_MODULE_LIST="/xl/" - CUDA_MODULE_LIST="/,gcc/6.4.0,ibm/xl/16.1.0" - - # Don't do pthread on white. 
- GCC_BUILD_LIST="OpenMP,Serial,OpenMP_Serial" - - # Format: (compiler module-list build-list exe-name warning-flag) - COMPILERS=("gcc/5.4.0 $BASE_MODULE_LIST $IBM_BUILD_LIST g++ $GCC_WARNING_FLAGS" - "gcc/6.4.0 $BASE_MODULE_LIST $IBM_BUILD_LIST g++ $GCC_WARNING_FLAGS" - "ibm/16.1.0 $IBM_MODULE_LIST $IBM_BUILD_LIST xlC $IBM_WARNING_FLAGS" - "cuda/9.0.103 $CUDA_MODULE_LIST $CUDA_IBM_BUILD_LIST ${KOKKOS_PATH}/bin/nvcc_wrapper $CUDA_WARNING_FLAGS" - ) - - if [ -z "$ARCH_FLAG" ]; then - ARCH_FLAG="--arch=Power8,Kepler37" - fi - - NUM_JOBS_TO_RUN_IN_PARALLEL=1 - -elif [ "$MACHINE" = "bowman" ]; then - source /etc/profile.d/modules.sh - SKIP_HWLOC=True - export SLURM_TASKS_PER_NODE=32 - - BASE_MODULE_LIST="<COMPILER_NAME>/compilers/<COMPILER_VERSION>" - - OLD_INTEL_BUILD_LIST="Pthread,Serial,Pthread_Serial" - - # Format: (compiler module-list build-list exe-name warning-flag) - COMPILERS=("intel/16.4.258 $BASE_MODULE_LIST $OLD_INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS" - "intel/17.2.174 $BASE_MODULE_LIST $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS" - "intel/18.0.128 $BASE_MODULE_LIST $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS" - ) - - if [ -z "$ARCH_FLAG" ]; then - ARCH_FLAG="--arch=KNL" - fi - - NUM_JOBS_TO_RUN_IN_PARALLEL=1 - -elif [ "$MACHINE" = "sullivan" ]; then - source /etc/profile.d/modules.sh - SKIP_HWLOC=True - export SLURM_TASKS_PER_NODE=96 - - BASE_MODULE_LIST="<COMPILER_NAME>/<COMPILER_VERSION>" - - # Format: (compiler module-list build-list exe-name warning-flag) - COMPILERS=("gcc/6.1.0 $BASE_MODULE_LIST $ARM_GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS") - - if [ -z "$ARCH_FLAG" ]; then - ARCH_FLAG="--arch=ARMv8-ThunderX" - fi - - NUM_JOBS_TO_RUN_IN_PARALLEL=1 - -elif [ "$MACHINE" = "mayer" ]; then - SKIP_HWLOC=True - export SLURM_TASKS_PER_NODE=96 - - BASE_MODULE_LIST="<COMPILER_NAME>/<COMPILER_VERSION>" - ARM_MODULE_LIST="<COMPILER_NAME>/<COMPILER_VERSION>" - - # Format: (compiler module-list build-list exe-name warning-flag) - COMPILERS=("gcc/7.2.0 $BASE_MODULE_LIST $ARM_GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS" - "arm/1.4.0 $ARM_MODULE_LIST $ARM_GCC_BUILD_LIST armclang++ $CLANG_WARNING_FLAGS") - - if [ -z "$ARCH_FLAG" ]; then - ARCH_FLAG="--arch=ARMv8-TX2" - fi - - NUM_JOBS_TO_RUN_IN_PARALLEL=1 - -elif [ "$MACHINE" = "shepard" ]; then - source /etc/profile.d/modules.sh - SKIP_HWLOC=True - export SLURM_TASKS_PER_NODE=32 - - BASE_MODULE_LIST="<COMPILER_NAME>/<COMPILER_VERSION>" - BASE_MODULE_LIST_INTEL="<COMPILER_NAME>/compilers/<COMPILER_VERSION>" - - # Format: (compiler module-list build-list exe-name warning-flag) - COMPILERS=("intel/17.4.196 $BASE_MODULE_LIST_INTEL $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS" - "intel/18.0.128 $BASE_MODULE_LIST_INTEL $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS" - "pgi/17.10.0 $BASE_MODULE_LIST $GCC_BUILD_LIST pgc++ $PGI_WARNING_FLAGS" - ) - - if [ -z "$ARCH_FLAG" ]; then - ARCH_FLAG="--arch=HSW" - fi - NUM_JOBS_TO_RUN_IN_PARALLEL=1 - -elif [ "$MACHINE" = "apollo" ]; then - source /projects/sems/modulefiles/utils/sems-modules-init.sh - module use /home/projects/modulefiles/local/x86-64 - module load kokkos-env - - module load sems-git - module load sems-tex - module load sems-cmake/3.5.2 - module load sems-gdb - - SKIP_HWLOC=True - - BASE_MODULE_LIST="sems-env,kokkos-env,sems-<COMPILER_NAME>/<COMPILER_VERSION>,kokkos-hwloc/1.10.1/base" - CUDA_MODULE_LIST="sems-env,kokkos-env,kokkos-<COMPILER_NAME>/<COMPILER_VERSION>,sems-gcc/4.8.4,kokkos-hwloc/1.10.1/base" - CUDA8_MODULE_LIST="sems-env,kokkos-env,kokkos-<COMPILER_NAME>/<COMPILER_VERSION>,sems-gcc/5.3.0,kokkos-hwloc/1.10.1/base" - - CLANG_MODULE_LIST="sems-env,kokkos-env,sems-git,sems-cmake/3.5.2,<COMPILER_NAME>/<COMPILER_VERSION>,cuda/9.0.69" - NVCC_MODULE_LIST="sems-env,kokkos-env,sems-git,sems-cmake/3.5.2,<COMPILER_NAME>/<COMPILER_VERSION>,sems-gcc/5.3.0" - - BUILD_LIST_CUDA_NVCC="Cuda_Serial,Cuda_OpenMP" -
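# Each COMPILERS entry above and below is one whitespace-separated record,
# "compiler module-list build-list exe-name warning-flag"; the getter
# functions later split it with an unquoted array assignment. A sketch with
# sample data (hypothetical values):
example_record="gcc/5.3.0 sems-env,sems-gcc/5.3.0 OpenMP,Serial g++ -Wall,-Werror"
example_arr=($example_record)
echo "compiler=${example_arr[0]} exe=${example_arr[3]}"
echo "modules: $(echo "${example_arr[1]}" | tr , ' ')"   # comma list to words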
BUILD_LIST_CUDA_CLANG="Cuda_Serial,Cuda_Pthread" - BUILD_LIST_CLANG="Serial,Pthread,OpenMP" - - if [ "$SPOT_CHECK" = "True" ]; then - # Format: (compiler module-list build-list exe-name warning-flag) - COMPILERS=("gcc/4.8.4 $BASE_MODULE_LIST "OpenMP,Pthread" g++ $GCC_WARNING_FLAGS" - "gcc/5.3.0 $BASE_MODULE_LIST "Serial" g++ $GCC_WARNING_FLAGS" - "intel/16.0.1 $BASE_MODULE_LIST "OpenMP" icpc $INTEL_WARNING_FLAGS" - "clang/3.9.0 $BASE_MODULE_LIST "Pthread_Serial" clang++ $CLANG_WARNING_FLAGS" - "clang/6.0 $CLANG_MODULE_LIST "Cuda_Pthread,OpenMP" clang++ $CUDA_WARNING_FLAGS" - "cuda/9.1 $CUDA_MODULE_LIST "Cuda_OpenMP" $KOKKOS_PATH/bin/nvcc_wrapper $CUDA_WARNING_FLAGS" - ) - else - # Format: (compiler module-list build-list exe-name warning-flag) - COMPILERS=("cuda/9.1 $CUDA8_MODULE_LIST $BUILD_LIST_CUDA_NVCC $KOKKOS_PATH/bin/nvcc_wrapper $CUDA_WARNING_FLAGS" - "clang/6.0 $CLANG_MODULE_LIST $BUILD_LIST_CUDA_CLANG clang++ $CUDA_WARNING_FLAGS" - "clang/3.9.0 $CLANG_MODULE_LIST $BUILD_LIST_CLANG clang++ $CLANG_WARNING_FLAGS" - "gcc/4.8.4 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS" - "gcc/4.9.3 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS" - "gcc/5.3.0 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS" - "gcc/6.1.0 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS" - "intel/15.0.2 $BASE_MODULE_LIST $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS" - "intel/16.0.1 $BASE_MODULE_LIST $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS" - "intel/17.0.1 $BASE_MODULE_LIST $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS" - "clang/3.5.2 $BASE_MODULE_LIST $CLANG_BUILD_LIST clang++ $CLANG_WARNING_FLAGS" - "clang/3.6.1 $BASE_MODULE_LIST $CLANG_BUILD_LIST clang++ $CLANG_WARNING_FLAGS" - ) - fi - - if [ -z "$ARCH_FLAG" ]; then - ARCH_FLAG="--arch=SNB,Volta70" - fi - - NUM_JOBS_TO_RUN_IN_PARALLEL=1 - -else - echo "Unhandled machine $MACHINE" >&2 - exit 1 -fi - -export OMP_NUM_THREADS=4 - -declare -i NUM_RESULTS_TO_KEEP=7 - -RESULT_ROOT_PREFIX=TestAll - -if [ "$PRINT_HELP" = "True" ]; then - echo "test_all_sandia :" - echo "--kokkos-path=/Path/To/Kokkos: Path to the Kokkos root directory" - echo " Defaults to root repo containing this script" - echo "--debug: Run tests in debug. Defaults to False" - echo "--test-script: Test this script, not Kokkos" - echo "--skip-hwloc: Do not do hwloc tests" - echo "--num=N: Number of jobs to run in parallel" - echo "--spot-check: Minimal test set to issue pull request" - echo "--dry-run: Just print what would be executed" - echo "--build-only: Just do builds, don't run anything" - echo "--opt-flag=FLAG: Optimization flag (default: -O3)" - echo "--cxxflags-extra=FLAGS: Extra flags to be added to CXX_FLAGS" - echo "--ldflags-extra=FLAGS: Extra flags to be added to LD_FLAGS" - echo "--arch=ARCHITECTURE: overwrite architecture flags" - echo "--with-cuda-options=OPT: set KOKKOS_CUDA_OPTIONS" - echo "--build-list=BUILD,BUILD,BUILD..." 
- echo " Provide a comma-separated list of builds instead of running all builds" - echo " Valid items:" - echo " OpenMP, Pthread, Serial, OpenMP_Serial, Pthread_Serial" - echo " Cuda_OpenMP, Cuda_Pthread, Cuda_Serial" - echo "" - - echo "ARGS: list of expressions matching compilers to test" - echo " supported compilers sems" - for COMPILER_DATA in "${COMPILERS[@]}"; do - ARR=($COMPILER_DATA) - COMPILER=${ARR[0]} - echo " $COMPILER" - done - echo "" - - echo "Examples:" - echo " Run all tests" - echo " % test_all_sandia" - echo "" - echo " Run all gcc tests" - echo " % test_all_sandia gcc" - echo "" - echo " Run all gcc/4.8.4 and all intel tests" - echo " % test_all_sandia gcc/4.8.4 intel" - echo "" - echo " Run all tests in debug" - echo " % test_all_sandia --debug" - echo "" - echo " Run gcc/4.8.4 and only do OpenMP and OpenMP_Serial builds" - echo " % test_all_sandia gcc/4.8.4 --build-list=OpenMP,OpenMP_Serial" - echo "" - echo "If you want to kill the tests, do:" - echo " hit ctrl-z" - echo " % kill -9 %1" - echo - exit 0 -fi - -# Set build type. -if [ "$DEBUG" = "True" ]; then - BUILD_TYPE=debug -else - BUILD_TYPE=release -fi - -# If no args provided, do all compilers. -if [ -z "$ARGS" ]; then - ARGS='?' -fi - -# Process args to figure out which compilers to test. -COMPILERS_TO_TEST="" - -for ARG in $ARGS; do - for COMPILER_DATA in "${COMPILERS[@]}"; do - ARR=($COMPILER_DATA) - COMPILER=${ARR[0]} - - if [[ "$COMPILER" = $ARG* ]]; then - if [[ "$COMPILERS_TO_TEST" != *${COMPILER}* ]]; then - COMPILERS_TO_TEST="$COMPILERS_TO_TEST $COMPILER" - else - echo "Tried to add $COMPILER twice" - fi - fi - done -done - -# -# Functions. -# - -# get_compiler_name <COMPILER> -get_compiler_name() { - echo $1 | cut -d/ -f1 -} - -# get_compiler_version <COMPILER> -get_compiler_version() { - echo $1 | cut -d/ -f2 -} - -# Do not call directly. -get_compiler_data() { - local compiler=$1 - local item=$2 - local compiler_name=$(get_compiler_name $compiler) - local compiler_vers=$(get_compiler_version $compiler) - - local compiler_data - for compiler_data in "${COMPILERS[@]}" ; do - local arr=($compiler_data) - - if [ "$compiler" = "${arr[0]}" ]; then - echo "${arr[$item]}" | tr , ' ' | sed -e "s/<COMPILER_NAME>/$compiler_name/g" -e "s/<COMPILER_VERSION>/$compiler_vers/g" - return 0 - fi - done - - # Not found. - echo "Unrecognized compiler $compiler" >&2 - exit 1 -} - -# -# For all getters, usage: <GETTER> <COMPILER> -# - -get_compiler_modules() { - get_compiler_data $1 1 -} - -get_compiler_build_list() { - get_compiler_data $1 2 -} - -get_compiler_exe_name() { - get_compiler_data $1 3 -} - -get_compiler_warning_flags() { - get_compiler_data $1 4 -} - -run_cmd() { - echo "RUNNING: $*" - if [ "$DRYRUN" != "True" ]; then - eval "$* 2>&1" - fi -} - -# report_and_log_test_result <SUCCESS> <DESC> <COMMENT> -report_and_log_test_result() { - # Use sane var names. - local success=$1; local desc=$2; local comment=$3; - - if [ "$success" = "0" ]; then - echo " PASSED $desc" - echo $comment > $PASSED_DIR/$desc - else - # For failures, comment should be the name of the phase that failed. - echo " FAILED $desc" >&2 - echo $comment > $FAILED_DIR/$desc - cat ${desc}.${comment}.log - fi -} - -setup_env() { - local compiler=$1 - local compiler_modules=$(get_compiler_modules $compiler) - - module purge - - local mod - for mod in $compiler_modules; do - echo "Loading module $mod" - module load $mod 2>&1 - # It is ridiculously hard to check for the success of a loaded - # module. Module does not return error codes and piping to grep - # causes module to run in a subshell.
- module list 2>&1 | grep "$mod" >& /dev/null || return 1 - done - - return 0 -} - -# single_build_and_test -single_build_and_test() { - # Use sane var names. - local compiler=$1; local build=$2; local build_type=$3; - - # Set up env. - mkdir -p $ROOT_DIR/$compiler/"${build}-$build_type" - cd $ROOT_DIR/$compiler/"${build}-$build_type" - local desc=$(echo "${compiler}-${build}-${build_type}" | sed 's:/:-:g') - setup_env $compiler >& ${desc}.configure.log || { report_and_log_test_result 1 ${desc} configure && return 0; } - - # Set up flags. - local compiler_warning_flags=$(get_compiler_warning_flags $compiler) - local compiler_exe=$(get_compiler_exe_name $compiler) - - if [[ "$build_type" = hwloc* ]]; then - local extra_args=--with-hwloc=$(dirname $(dirname $(which hwloc-info))) - fi - - if [[ "$OPT_FLAG" = "" ]]; then - OPT_FLAG="-O3" - fi - - if [[ "$build_type" = *debug* ]]; then - local extra_args="$extra_args --debug" - local cxxflags="-g $compiler_warning_flags" - local ldflags="-g" - else - local cxxflags="$OPT_FLAG $compiler_warning_flags" - local ldflags="${OPT_FLAG}" - fi - - local cxxflags="${cxxflags} ${CXX_FLAGS_EXTRA}" - local ldflags="${ldflags} ${LD_FLAGS_EXTRA}" - - if [[ "$KOKKOS_CUDA_OPTIONS" != "" ]]; then - local extra_args="$extra_args $KOKKOS_CUDA_OPTIONS" - fi - if [[ "$KOKKOS_OPTIONS" != "" ]]; then - local extra_args="$extra_args $KOKKOS_OPTIONS" - else - local extra_args="$extra_args --with-options=enable_large_mem_tests" - fi - - echo " Starting job $desc" - - local comment="no_comment" - - if [ "$TEST_SCRIPT" = "True" ]; then - local rand=$[ 1 + $[ RANDOM % 10 ]] - sleep $rand - - if [ $rand -gt 5 ]; then - run_cmd ls fake_problem >& ${desc}.configure.log || { report_and_log_test_result 1 $desc configure && return 0; } - fi - else - run_cmd ${KOKKOS_PATH}/generate_makefile.bash --with-devices=$build $ARCH_FLAG --compiler=$(which $compiler_exe) --cxxflags=\"$cxxflags\" --ldflags=\"$ldflags\" $extra_args &>> ${desc}.configure.log || { report_and_log_test_result 1 ${desc} configure && return 0; } - local -i build_start_time=$(date +%s) - run_cmd make -j 48 build-test >& ${desc}.build.log || { report_and_log_test_result 1 ${desc} build && return 0; } - local -i build_end_time=$(date +%s) - comment="build_time=$(($build_end_time-$build_start_time))" - - if [[ "$BUILD_ONLY" == False ]]; then - run_cmd make test >& ${desc}.test.log || { report_and_log_test_result 1 ${desc} test && return 0; } - local -i run_end_time=$(date +%s) - comment="$comment run_time=$(($run_end_time-$build_end_time))" - fi - fi - - report_and_log_test_result 0 $desc "$comment" - - return 0 -} - -# wait_for_jobs -wait_for_jobs() { - local -i max_jobs=$1 - local -i num_active_jobs=$(jobs | wc -l) - while [ $num_active_jobs -ge $max_jobs ] - do - sleep 1 - num_active_jobs=$(jobs | wc -l) - jobs >& /dev/null - done -} - -# run_in_background -run_in_background() { - local compiler=$1 - - local -i num_jobs=$NUM_JOBS_TO_RUN_IN_PARALLEL - # Don't override command line input. - # if [[ "$BUILD_ONLY" == True ]]; then - # num_jobs=8 - # else - if [[ "$compiler" == cuda* ]]; then - num_jobs=1 - fi - if [[ "$compiler" == clang ]]; then - num_jobs=1 - fi - # fi - wait_for_jobs $num_jobs - - single_build_and_test $* & -} - -# build_and_test_all -build_and_test_all() { - # Get compiler data. - local compiler=$1 - if [ -z "$CUSTOM_BUILD_LIST" ]; then - local compiler_build_list=$(get_compiler_build_list $compiler) - else - local compiler_build_list=$(echo "$CUSTOM_BUILD_LIST" | tr , ' ') - fi - - # Do builds. 
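# The job-throttling pattern used by wait_for_jobs/run_in_background above,
# reduced to a self-contained sketch: poll the shell's job table and block
# until a slot frees up (the extra 'jobs' call reaps finished entries).
example_throttle() {
  local -i max_jobs=$1
  while [ "$(jobs | wc -l)" -ge "$max_jobs" ]; do
    sleep 1
    jobs >& /dev/null
  done
}
# for i in 1 2 3; do example_throttle 2; sleep 5 & done; wait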
- local build - for build in $compiler_build_list - do - run_in_background $compiler $build $BUILD_TYPE - - # If not cuda, do a hwloc test too. - if [[ "$compiler" != cuda* && "$SKIP_HWLOC" == False ]]; then - run_in_background $compiler $build "hwloc-$BUILD_TYPE" - fi - done - - return 0 -} - -get_test_root_dir() { - local existing_results=$(find . -maxdepth 1 -name "$RESULT_ROOT_PREFIX*" | sort) - local -i num_existing_results=$(echo $existing_results | tr ' ' '\n' | wc -l) - local -i num_to_delete=${num_existing_results}-${NUM_RESULTS_TO_KEEP} - - if [ $num_to_delete -gt 0 ]; then - /bin/rm -rf $(echo $existing_results | tr ' ' '\n' | head -n $num_to_delete) - fi - - echo $(pwd)/${RESULT_ROOT_PREFIX}_$(date +"%Y-%m-%d_%H.%M.%S") -} - -wait_summarize_and_exit() { - wait_for_jobs 1 - - echo "#######################################################" - echo "PASSED TESTS" - echo "#######################################################" - - local passed_test - for passed_test in $(\ls -1 $PASSED_DIR | sort) - do - echo $passed_test $(cat $PASSED_DIR/$passed_test) - done - - local -i rv=0 - if [ "$(ls -A $FAILED_DIR)" ]; then - echo "#######################################################" - echo "FAILED TESTS" - echo "#######################################################" - - local failed_test - for failed_test in $(\ls -1 $FAILED_DIR | sort) - do - echo $failed_test "("$(cat $FAILED_DIR/$failed_test)" failed)" - rv=$rv+1 - done - fi - - exit $rv -} - -# -# Main. -# - -ROOT_DIR=$(get_test_root_dir) -mkdir -p $ROOT_DIR -cd $ROOT_DIR - -PASSED_DIR=$ROOT_DIR/results/passed -FAILED_DIR=$ROOT_DIR/results/failed -mkdir -p $PASSED_DIR -mkdir -p $FAILED_DIR - -echo "Going to test compilers: " $COMPILERS_TO_TEST -for COMPILER in $COMPILERS_TO_TEST; do - echo "Testing compiler $COMPILER" - build_and_test_all $COMPILER -done - -wait_summarize_and_exit diff --git a/config/yaml/volta.yaml b/config/yaml/volta.yaml deleted file mode 100644 index f67af9c2a44..00000000000 --- a/config/yaml/volta.yaml +++ /dev/null @@ -1,4 +0,0 @@ -packages: - kokkos: - variants: +cuda +openmp +volta70 +cuda_lambda +wrapper ^cuda@10.1 - compiler: [gcc@7.2.0] diff --git a/containers/src/Kokkos_Bitset.hpp b/containers/src/Kokkos_Bitset.hpp index cd5ca4ea512..f50ab0a0f7e 100644 --- a/containers/src/Kokkos_Bitset.hpp +++ b/containers/src/Kokkos_Bitset.hpp @@ -28,24 +28,6 @@ namespace Kokkos { -namespace Impl { -//! Either append to the label if the property already exists, or set it. -template -auto with_updated_label(const ViewCtorProp& view_ctor_prop, - const std::string& label) { - using vcp_t = ViewCtorProp; - //! If the label property is already set, append. Otherwise, set label. - if constexpr (vcp_t::has_label) { - vcp_t new_ctor_props(view_ctor_prop); - static_cast&>(new_ctor_props) - .value.append(label); - return new_ctor_props; - } else { - return Impl::with_properties_if_unset(view_ctor_prop, label); - } -} -} // namespace Impl - template class Bitset; @@ -92,9 +74,10 @@ class Bitset { using block_view_type = View>; public: - /// constructor + Bitset() = default; + /// arg_size := number of bit in set - Bitset(unsigned arg_size = 0u) : Bitset(Kokkos::view_alloc(), arg_size) {} + Bitset(unsigned arg_size) : Bitset(Kokkos::view_alloc(), arg_size) {} template Bitset(const Impl::ViewCtorProp& arg_prop, unsigned arg_size) @@ -108,9 +91,8 @@ class Bitset { "Allocation properties should not contain the 'pointer' property."); //! Update 'label' property and allocate. 
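// In sketch form (hypothetical user label): with_properties_if_unset() only
// supplies the fallback, so
//   Bitset<Device> a(Kokkos::view_alloc("MyBitset"), 128);  // label "MyBitset"
//   Bitset<Device> b(128);                                  // default label "Bitset"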
- const auto prop_copy = Kokkos::Impl::with_updated_label( - Impl::with_properties_if_unset(arg_prop, std::string("Bitset")), - " - blocks"); + const auto prop_copy = + Impl::with_properties_if_unset(arg_prop, std::string("Bitset")); m_blocks = block_view_type(prop_copy, ((m_size + block_mask) >> block_shift)); @@ -310,8 +292,8 @@ class Bitset { } private: - unsigned m_size; - unsigned m_last_block_mask; + unsigned m_size = 0; + unsigned m_last_block_mask = 0; block_view_type m_blocks; private: diff --git a/containers/src/Kokkos_DualView.hpp b/containers/src/Kokkos_DualView.hpp index 84bced2cc44..e821570a8d5 100644 --- a/containers/src/Kokkos_DualView.hpp +++ b/containers/src/Kokkos_DualView.hpp @@ -292,15 +292,6 @@ class DualView : public ViewTraits { d_view(src.d_view), h_view(src.h_view) {} - //! Copy assignment operator (shallow copy assignment) - template - DualView& operator=(const DualView& src) { - modified_flags = src.modified_flags; - d_view = src.d_view; - h_view = src.h_view; - return *this; - } - //! Subview constructor template DualView(const DualView& src, const Arg0& arg0, Args... args) diff --git a/containers/src/Kokkos_DynRankView.hpp b/containers/src/Kokkos_DynRankView.hpp index 52aa86d8ee4..5fa59f1b7cd 100644 --- a/containers/src/Kokkos_DynRankView.hpp +++ b/containers/src/Kokkos_DynRankView.hpp @@ -1340,7 +1340,7 @@ class ViewMapping< template struct apply { - static_assert(Kokkos::is_memory_traits::value, ""); + static_assert(Kokkos::is_memory_traits::value); using traits_type = Kokkos::ViewTraits::type, Args...>( - v.data(), v.impl_map().layout()); + auto layout = v.impl_map().layout(); + + if constexpr (std::is_same_v || + std::is_same_v || + std::is_same_v || + is_layouttiled::value) { + for (int i = N; i < 7; ++i) + layout.dimension[i] = KOKKOS_IMPL_CTOR_DEFAULT_ARG; + } + + return View::type, Args...>(v.data(), layout); } template diff --git a/containers/src/Kokkos_OffsetView.hpp b/containers/src/Kokkos_OffsetView.hpp index 92aead28784..91a7e4a9273 100644 --- a/containers/src/Kokkos_OffsetView.hpp +++ b/containers/src/Kokkos_OffsetView.hpp @@ -124,15 +124,8 @@ KOKKOS_INLINE_FUNCTION void offsetview_verify_operator_bounds( args...); Kokkos::Impl::throw_runtime_exception(std::string(buffer));)) - KOKKOS_IF_ON_DEVICE(( - /* Check #1: is there a SharedAllocationRecord? - (we won't use it, but if it is not there then there isn't - a corresponding SharedAllocationHeader containing a label). - This check should cover the case of Views that don't - have the Unmanaged trait but were initialized by pointer. 
*/ - if (tracker.has_record()) { - Kokkos::Impl::operator_bounds_error_on_device(map); - } else { Kokkos::abort("OffsetView bounds error"); })) + KOKKOS_IF_ON_DEVICE( + (Kokkos::abort("OffsetView bounds error"); (void)tracker;)) } } diff --git a/containers/src/Kokkos_UnorderedMap.hpp b/containers/src/Kokkos_UnorderedMap.hpp index e001c062de3..78a6a238ece 100644 --- a/containers/src/Kokkos_UnorderedMap.hpp +++ b/containers/src/Kokkos_UnorderedMap.hpp @@ -243,16 +243,16 @@ class UnorderedMap { using const_map_type = UnorderedMap; - static const bool is_set = std::is_void::value; - static const bool has_const_key = - std::is_same::value; - static const bool has_const_value = - is_set || std::is_same::value; + static constexpr bool is_set = std::is_void_v; + static constexpr bool has_const_key = + std::is_same_v; + static constexpr bool has_const_value = + is_set || std::is_same_v; - static const bool is_insertable_map = + static constexpr bool is_insertable_map = !has_const_key && (is_set || !has_const_value); - static const bool is_modifiable_map = has_const_key && !has_const_value; - static const bool is_const_map = has_const_key && has_const_value; + static constexpr bool is_modifiable_map = has_const_key && !has_const_value; + static constexpr bool is_const_map = has_const_key && has_const_value; using insert_result = UnorderedMapInsertResult; @@ -337,27 +337,27 @@ class UnorderedMap { Impl::get_property(prop_copy) + " - size")); m_available_indexes = - bitset_type(Kokkos::Impl::with_updated_label(prop_copy, " - bitset"), + bitset_type(Kokkos::Impl::append_to_label(prop_copy, " - bitset"), calculate_capacity(capacity_hint)); m_hash_lists = size_type_view( - Kokkos::Impl::with_updated_label(prop_copy_noinit, " - hash list"), + Kokkos::Impl::append_to_label(prop_copy_noinit, " - hash list"), Impl::find_hash_size(capacity())); m_next_index = size_type_view( - Kokkos::Impl::with_updated_label(prop_copy_noinit, " - next index"), + Kokkos::Impl::append_to_label(prop_copy_noinit, " - next index"), capacity() + 1); // +1 so that the *_at functions can always return a // valid reference - m_keys = key_type_view( - Kokkos::Impl::with_updated_label(prop_copy, " - keys"), capacity()); + m_keys = key_type_view(Kokkos::Impl::append_to_label(prop_copy, " - keys"), + capacity()); - m_values = value_type_view( - Kokkos::Impl::with_updated_label(prop_copy, " - values"), - is_set ? 0 : capacity()); + m_values = + value_type_view(Kokkos::Impl::append_to_label(prop_copy, " - values"), + is_set ? 0 : capacity()); m_scalars = - scalars_view(Kokkos::Impl::with_updated_label(prop_copy, " - scalars")); + scalars_view(Kokkos::Impl::append_to_label(prop_copy, " - scalars")); /** * Deep copies should also be done using the space instance if given. diff --git a/containers/src/impl/Kokkos_UnorderedMap_impl.hpp b/containers/src/impl/Kokkos_UnorderedMap_impl.hpp index 8f8cd9523b7..a979ee40d8c 100644 --- a/containers/src/impl/Kokkos_UnorderedMap_impl.hpp +++ b/containers/src/impl/Kokkos_UnorderedMap_impl.hpp @@ -27,6 +27,18 @@ namespace Kokkos { namespace Impl { +//! Append to the label contained in view_ctor_prop. 
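// A usage sketch of the helper introduced below (hypothetical label "MyMap";
// assumes <Kokkos_Core.hpp> is included and Kokkos is initialized). The
// returned properties allocate with the label "MyMap - keys", matching the
// UnorderedMap constructor calls earlier in this diff.
inline auto example_append_to_label() {
  auto props = Kokkos::view_alloc("MyMap");
  return Kokkos::Impl::append_to_label(props, " - keys");
}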
+template +auto append_to_label(const ViewCtorProp& view_ctor_prop, + const std::string& label) { + using vcp_t = ViewCtorProp; + static_assert(vcp_t::has_label); + vcp_t new_ctor_props(view_ctor_prop); + static_cast&>(new_ctor_props) + .value.append(label); + return new_ctor_props; +} + uint32_t find_hash_size(uint32_t size); template diff --git a/containers/unit_tests/Makefile b/containers/unit_tests/Makefile index 2e35832cc89..18410882bca 100644 --- a/containers/unit_tests/Makefile +++ b/containers/unit_tests/Makefile @@ -35,8 +35,8 @@ TESTS = Bitset DualView DynamicView DynViewAPI_generic DynViewAPI_rank12345 DynV tmp := $(foreach device, $(KOKKOS_DEVICELIST), \ tmp2 := $(foreach test, $(TESTS), \ $(if $(filter Test$(device)_$(test).cpp, $(shell ls Test$(device)_$(test).cpp 2>/dev/null)),,\ - $(shell echo "\#include" > Test$(device)_$(test).cpp); \ - $(shell echo "\#include" >> Test$(device)_$(test).cpp); \ + $(shell echo "$(H)include" > Test$(device)_$(test).cpp); \ + $(shell echo "$(H)include" >> Test$(device)_$(test).cpp); \ )\ ) \ ) diff --git a/containers/unit_tests/TestBitset.hpp b/containers/unit_tests/TestBitset.hpp index 3ad0d2bf573..9923453f72c 100644 --- a/containers/unit_tests/TestBitset.hpp +++ b/containers/unit_tests/TestBitset.hpp @@ -23,6 +23,8 @@ #include #include +#include <../../core/unit_test/tools/include/ToolTestingUtilities.hpp> + namespace Test { namespace Impl { @@ -155,7 +157,7 @@ void test_bitset() { { unsigned ts = 100u; - bitset_type b1; + bitset_type b1(Kokkos::view_alloc("MyBitset"), 0); ASSERT_TRUE(b1.is_allocated()); b1 = bitset_type(ts); @@ -165,6 +167,9 @@ void test_bitset() { ASSERT_TRUE(b1.is_allocated()); ASSERT_TRUE(b2.is_allocated()); ASSERT_TRUE(b3.is_allocated()); + + bitset_type b4; + ASSERT_FALSE(b4.is_allocated()); } std::array test_sizes = { @@ -237,6 +242,24 @@ void test_bitset() { } TEST(TEST_CATEGORY, bitset) { test_bitset(); } + +TEST(TEST_CATEGORY, bitset_default_constructor_no_alloc) { + using namespace Kokkos::Test::Tools; + listen_tool_events(Config::DisableAll(), Config::EnableAllocs()); + + auto success = validate_absence( + [&]() { + Kokkos::Bitset bs; + EXPECT_FALSE(bs.is_allocated()); + }, + [&](AllocateDataEvent) { + return MatchDiagnostic{true, {"Found alloc event"}}; + }); + ASSERT_TRUE(success); + + listen_tool_events(Config::DisableAll()); +} + } // namespace Test #endif // KOKKOS_TEST_BITSET_HPP diff --git a/core/perf_test/CMakeLists.txt b/core/perf_test/CMakeLists.txt index 7f3916da312..e0dba03e1ec 100644 --- a/core/perf_test/CMakeLists.txt +++ b/core/perf_test/CMakeLists.txt @@ -50,8 +50,8 @@ ELSE() FetchContent_Declare( googlebenchmark DOWNLOAD_EXTRACT_TIMESTAMP FALSE - URL https://github.com/google/benchmark/archive/refs/tags/v1.6.2.tar.gz - URL_HASH MD5=14d14849e075af116143a161bc3b927b + URL https://github.com/google/benchmark/archive/refs/tags/v1.7.1.tar.gz + URL_HASH MD5=0459a6c530df9851bee6504c3e37c2e7 ) FetchContent_MakeAvailable(googlebenchmark) list(POP_BACK CMAKE_MESSAGE_INDENT) diff --git a/core/src/CMakeLists.txt b/core/src/CMakeLists.txt index 012af0a7d06..b84677e61b6 100644 --- a/core/src/CMakeLists.txt +++ b/core/src/CMakeLists.txt @@ -18,10 +18,16 @@ IF (NOT desul_FOUND) ENDIF() IF(KOKKOS_ENABLE_SYCL) SET(DESUL_ATOMICS_ENABLE_SYCL ON) + IF(KOKKOS_IMPL_SYCL_DEVICE_GLOBAL_SUPPORTED AND NOT KOKKOS_IMPL_HAVE_SYCL_EXT_ONEAPI_DEVICE_GLOBAL) + SET(DESUL_ATOMICS_ENABLE_SYCL_SEPARABLE_COMPILATION ON) + ENDIF() ENDIF() IF(KOKKOS_ENABLE_OPENMPTARGET) SET(DESUL_ATOMICS_ENABLE_OPENMP ON) # not a typo Kokkos 
OpenMPTarget -> Desul OpenMP ENDIF() + IF(KOKKOS_ENABLE_OPENACC) + SET(DESUL_ATOMICS_ENABLE_OPENACC ON) + ENDIF() CONFIGURE_FILE( ${CMAKE_CURRENT_SOURCE_DIR}/../../tpls/desul/Config.hpp.cmake.in ${CMAKE_CURRENT_BINARY_DIR}/desul/atomics/Config.hpp @@ -80,10 +86,6 @@ IF (KOKKOS_ENABLE_HPX) APPEND_GLOB(KOKKOS_CORE_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/HPX/*.hpp) ENDIF() -IF (NOT KOKKOS_ENABLE_MEMKIND) - LIST(REMOVE_ITEM KOKKOS_CORE_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/impl/Kokkos_HBWSpace.cpp) -ENDIF() - IF (KOKKOS_ENABLE_SERIAL) APPEND_GLOB(KOKKOS_CORE_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/Serial/*.cpp) APPEND_GLOB(KOKKOS_CORE_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/Serial/*.hpp) @@ -180,20 +182,15 @@ IF (Kokkos_ENABLE_IMPL_MDSPAN) ENDIF() KOKKOS_LINK_TPL(kokkoscore PUBLIC HWLOC) -KOKKOS_LINK_TPL(kokkoscore PUBLIC MEMKIND) -IF (NOT KOKKOS_ENABLE_COMPILE_AS_CMAKE_LANGUAGE) - KOKKOS_LINK_TPL(kokkoscore PUBLIC CUDA) -ENDIF() +KOKKOS_LINK_TPL(kokkoscore PUBLIC CUDA) KOKKOS_LINK_TPL(kokkoscore PUBLIC HPX) KOKKOS_LINK_TPL(kokkoscore PUBLIC LIBDL) -KOKKOS_LINK_TPL(kokkoscore PUBLIC LIBRT) # On *nix-like systems (Linux, macOS) we need pthread for C++ std::thread IF (NOT WIN32) KOKKOS_LINK_TPL(kokkoscore PUBLIC THREADS) ENDIF() IF (NOT KOKKOS_ENABLE_COMPILE_AS_CMAKE_LANGUAGE) KOKKOS_LINK_TPL(kokkoscore PUBLIC ROCM) - KOKKOS_LINK_TPL(kokkoscore PUBLIC ONEDPL) ENDIF() # FIXME: We need a proper solution to figure out whether to enable diff --git a/core/src/Cuda/Kokkos_Cuda.hpp b/core/src/Cuda/Kokkos_Cuda.hpp index 8bfaf8317b6..276d03da265 100644 --- a/core/src/Cuda/Kokkos_Cuda.hpp +++ b/core/src/Cuda/Kokkos_Cuda.hpp @@ -46,7 +46,6 @@ static_assert(false, namespace Kokkos { namespace Impl { -class CudaExec; class CudaInternal; } // namespace Impl } // namespace Kokkos @@ -129,33 +128,16 @@ class Cuda { /// \brief True if and only if this method is being called in a /// thread-parallel function. - KOKKOS_INLINE_FUNCTION static int in_parallel() { + +#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 + KOKKOS_DEPRECATED KOKKOS_INLINE_FUNCTION static int in_parallel() { #if defined(__CUDA_ARCH__) return true; #else return false; #endif } - - /** \brief Set the device in a "sleep" state. - * - * This function sets the device in a "sleep" state in which it is - * not ready for work. This may consume less resources than if the - * device were in an "awake" state, but it may also take time to - * bring the device from a sleep state to be ready for work. - * - * \return True if the device is in the "sleep" state, else false if - * the device is actively working and could not enter the "sleep" - * state. - */ - static bool sleep(); - - /// \brief Wake the device from the 'sleep' state so it is ready for work. - /// - /// \return True if the device is in the "ready" state, else "false" - /// if the device is actively working (which also means that it's - /// awake). - static bool wake(); +#endif /// \brief Wait until all dispatched functors complete. /// @@ -199,18 +181,37 @@ class Cuda { //! Initialize, telling the CUDA run-time library which device to use. static void impl_initialize(InitializationSettings const&); +#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 /// \brief Cuda device architecture of the selected device. /// /// This matches the __CUDA_ARCH__ specification. - static size_type device_arch(); + KOKKOS_DEPRECATED static size_type device_arch() { + const cudaDeviceProp& cudaProp = Cuda().cuda_device_prop(); + return cudaProp.major * 100 + cudaProp.minor; + } //! Query device count. 
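// Reduced to plain CUDA runtime calls, the deprecated inline bodies above and
// below amount to the following sketch (error checking omitted, device 0
// assumed, CUDA runtime headers available):
inline int example_device_arch() {
  cudaDeviceProp prop;
  cudaGetDeviceProperties(&prop, 0);
  return prop.major * 100 + prop.minor;  // same numbering as __CUDA_ARCH__
}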
- static size_type detect_device_count(); + KOKKOS_DEPRECATED static size_type detect_device_count() { + int count; + KOKKOS_IMPL_CUDA_SAFE_CALL(cudaGetDeviceCount(&count)); + return count; + } /** \brief Detect the available devices and their architecture * as defined by the __CUDA_ARCH__ specification. */ - static std::vector detect_device_arch(); + KOKKOS_DEPRECATED static std::vector detect_device_arch() { + int count; + KOKKOS_IMPL_CUDA_SAFE_CALL(cudaGetDeviceCount(&count)); + std::vector out; + for (int i = 0; i < count; ++i) { + cudaDeviceProp prop; + KOKKOS_IMPL_CUDA_SAFE_CALL(cudaGetDeviceProperties(&prop, i)); + out.push_back(prop.major * 100 + prop.minor); + } + return out; + } +#endif cudaStream_t cuda_stream() const; int cuda_device() const; diff --git a/core/src/Cuda/Kokkos_CudaSpace.cpp b/core/src/Cuda/Kokkos_CudaSpace.cpp index c6512f44dad..0944937e1bf 100644 --- a/core/src/Cuda/Kokkos_CudaSpace.cpp +++ b/core/src/Cuda/Kokkos_CudaSpace.cpp @@ -33,7 +33,6 @@ //#include #include -#include #include @@ -83,11 +82,11 @@ void DeepCopyAsyncCuda(void *dst, const void *src, size_t n) { KOKKOS_IMPL_CUDA_SAFE_CALL( (CudaInternal::singleton().cuda_memcpy_async_wrapper( dst, src, n, cudaMemcpyDefault, s))); - Impl::cuda_stream_synchronize( - s, + Kokkos::Tools::Experimental::Impl::profile_fence_event( + "Kokkos::Impl::DeepCopyAsyncCuda: Deep Copy Stream Sync", Kokkos::Tools::Experimental::SpecialSynchronizationCases:: DeepCopyResourceSynchronization, - "Kokkos::Impl::DeepCopyAsyncCuda: Deep Copy Stream Sync"); + [&]() { KOKKOS_IMPL_CUDA_SAFE_CALL(cudaStreamSynchronize(s)); }); } } // namespace Impl @@ -135,11 +134,23 @@ void kokkos_impl_cuda_set_pin_uvm_to_host(bool val) { namespace Kokkos { -CudaSpace::CudaSpace() : m_device(Kokkos::Cuda().cuda_device()) {} +CudaSpace::CudaSpace() + : m_device(Kokkos::Cuda().cuda_device()), + m_stream(Kokkos::Cuda().cuda_stream()) {} +CudaSpace::CudaSpace(int device_id, cudaStream_t stream) + : m_device(device_id), m_stream(stream) {} -CudaUVMSpace::CudaUVMSpace() : m_device(Kokkos::Cuda().cuda_device()) {} +CudaUVMSpace::CudaUVMSpace() + : m_device(Kokkos::Cuda().cuda_device()), + m_stream(Kokkos::Cuda().cuda_stream()) {} +CudaUVMSpace::CudaUVMSpace(int device_id, cudaStream_t stream) + : m_device(device_id), m_stream(stream) {} -CudaHostPinnedSpace::CudaHostPinnedSpace() {} +CudaHostPinnedSpace::CudaHostPinnedSpace() + : m_device(Kokkos::Cuda().cuda_device()), + m_stream(Kokkos::Cuda().cuda_stream()) {} +CudaHostPinnedSpace::CudaHostPinnedSpace(int device_id, cudaStream_t stream) + : m_device(device_id), m_stream(stream) {} size_t memory_threshold_g = 40000; // 40 kB @@ -161,52 +172,38 @@ void *CudaSpace::allocate(const char *arg_label, const size_t arg_alloc_size, } namespace { -void *impl_allocate_common(const Cuda &exec_space, const char *arg_label, - const size_t arg_alloc_size, +void *impl_allocate_common(const int device_id, + [[maybe_unused]] const cudaStream_t stream, + const char *arg_label, const size_t arg_alloc_size, const size_t arg_logical_size, const Kokkos::Tools::SpaceHandle arg_handle, - bool exec_space_provided) { + [[maybe_unused]] bool stream_sync_only) { void *ptr = nullptr; + KOKKOS_IMPL_CUDA_SAFE_CALL(cudaSetDevice(device_id)); + cudaError_t error_code = cudaSuccess; #ifndef CUDART_VERSION #error CUDART_VERSION undefined! 
#elif (defined(KOKKOS_ENABLE_IMPL_CUDA_MALLOC_ASYNC) && CUDART_VERSION >= 11020) - cudaError_t error_code; if (arg_alloc_size >= memory_threshold_g) { - if (exec_space_provided) { - error_code = - exec_space.impl_internal_space_instance()->cuda_malloc_async_wrapper( - &ptr, arg_alloc_size); - exec_space.fence("Kokkos::Cuda: backend fence after async malloc"); - } else { - error_code = Impl::CudaInternal::singleton().cuda_malloc_async_wrapper( - &ptr, arg_alloc_size); - Impl::cuda_device_synchronize( - "Kokkos::Cuda: backend fence after async malloc"); + error_code = cudaMallocAsync(&ptr, arg_alloc_size, stream); + + if (error_code == cudaSuccess) { + if (stream_sync_only) { + KOKKOS_IMPL_CUDA_SAFE_CALL(cudaStreamSynchronize(stream)); + } else { + Impl::cuda_device_synchronize( + "Kokkos::Cuda: backend fence after async malloc"); + } } - } else { - error_code = - (exec_space_provided - ? exec_space.impl_internal_space_instance()->cuda_malloc_wrapper( - &ptr, arg_alloc_size) - : Impl::CudaInternal::singleton().cuda_malloc_wrapper( - &ptr, arg_alloc_size)); - } -#else - cudaError_t error_code; - if (exec_space_provided) { - error_code = exec_space.impl_internal_space_instance()->cuda_malloc_wrapper( - &ptr, arg_alloc_size); - } else { - error_code = Impl::CudaInternal::singleton().cuda_malloc_wrapper( - &ptr, arg_alloc_size); - } + } else #endif + { error_code = cudaMalloc(&ptr, arg_alloc_size); } if (error_code != cudaSuccess) { // TODO tag as unlikely branch // This is the only way to clear the last error, which // we should do here since we're turning it into an // exception here - exec_space.impl_internal_space_instance()->cuda_get_last_error_wrapper(); + cudaGetLastError(); throw Experimental::CudaRawMemoryAllocationFailure( arg_alloc_size, error_code, Experimental::RawMemoryAllocationFailure::AllocationMechanism:: @@ -226,7 +223,7 @@ void *CudaSpace::impl_allocate( const char *arg_label, const size_t arg_alloc_size, const size_t arg_logical_size, const Kokkos::Tools::SpaceHandle arg_handle) const { - return impl_allocate_common(Kokkos::Cuda{}, arg_label, arg_alloc_size, + return impl_allocate_common(m_device, m_stream, arg_label, arg_alloc_size, arg_logical_size, arg_handle, false); } @@ -234,8 +231,9 @@ void *CudaSpace::impl_allocate( const Cuda &exec_space, const char *arg_label, const size_t arg_alloc_size, const size_t arg_logical_size, const Kokkos::Tools::SpaceHandle arg_handle) const { - return impl_allocate_common(exec_space, arg_label, arg_alloc_size, - arg_logical_size, arg_handle, true); + return impl_allocate_common( + exec_space.cuda_device(), exec_space.cuda_stream(), arg_label, + arg_alloc_size, arg_logical_size, arg_handle, true); } void *CudaUVMSpace::allocate(const size_t arg_alloc_size) const { @@ -256,28 +254,27 @@ void *CudaUVMSpace::impl_allocate( if (arg_alloc_size > 0) { Kokkos::Impl::num_uvm_allocations++; - auto error_code = - Impl::CudaInternal::singleton().cuda_malloc_managed_wrapper( - &ptr, arg_alloc_size, cudaMemAttachGlobal); - -#ifdef KOKKOS_IMPL_DEBUG_CUDA_PIN_UVM_TO_HOST - if (Kokkos::CudaUVMSpace::cuda_pin_uvm_to_host()) - KOKKOS_IMPL_CUDA_SAFE_CALL( - (Impl::CudaInternal::singleton().cuda_mem_advise_wrapper( - ptr, arg_alloc_size, cudaMemAdviseSetPreferredLocation, - cudaCpuDeviceId))); -#endif + KOKKOS_IMPL_CUDA_SAFE_CALL(cudaSetDevice(m_device)); + cudaError_t error_code = + cudaMallocManaged(&ptr, arg_alloc_size, cudaMemAttachGlobal); if (error_code != cudaSuccess) { // TODO tag as unlikely branch // This is the only way to clear the last error, 
which // we should do here since we're turning it into an // exception here - Impl::CudaInternal::singleton().cuda_get_last_error_wrapper(); + cudaGetLastError(); throw Experimental::CudaRawMemoryAllocationFailure( arg_alloc_size, error_code, Experimental::RawMemoryAllocationFailure::AllocationMechanism:: CudaMallocManaged); } + +#ifdef KOKKOS_IMPL_DEBUG_CUDA_PIN_UVM_TO_HOST + if (Kokkos::CudaUVMSpace::cuda_pin_uvm_to_host()) + KOKKOS_IMPL_CUDA_SAFE_CALL( + cudaMemAdvise(ptr, arg_alloc_size, cudaMemAdviseSetPreferredLocation, + cudaCpuDeviceId)); +#endif } Cuda::impl_static_fence( "Kokkos::CudaUVMSpace::impl_allocate: Post UVM Allocation"); @@ -302,13 +299,14 @@ void *CudaHostPinnedSpace::impl_allocate( const Kokkos::Tools::SpaceHandle arg_handle) const { void *ptr = nullptr; - auto error_code = Impl::CudaInternal::singleton().cuda_host_alloc_wrapper( - &ptr, arg_alloc_size, cudaHostAllocDefault); + KOKKOS_IMPL_CUDA_SAFE_CALL(cudaSetDevice(m_device)); + cudaError_t error_code = + cudaHostAlloc(&ptr, arg_alloc_size, cudaHostAllocDefault); if (error_code != cudaSuccess) { // TODO tag as unlikely branch // This is the only way to clear the last error, which // we should do here since we're turning it into an // exception here - Impl::CudaInternal::singleton().cuda_get_last_error_wrapper(); + cudaGetLastError(); throw Experimental::CudaRawMemoryAllocationFailure( arg_alloc_size, error_code, Experimental::RawMemoryAllocationFailure::AllocationMechanism:: @@ -350,18 +348,17 @@ void CudaSpace::impl_deallocate( if (arg_alloc_size >= memory_threshold_g) { Impl::cuda_device_synchronize( "Kokkos::Cuda: backend fence before async free"); - KOKKOS_IMPL_CUDA_SAFE_CALL( - (Impl::CudaInternal::singleton().cuda_free_async_wrapper( - arg_alloc_ptr))); + KOKKOS_IMPL_CUDA_SAFE_CALL(cudaSetDevice(m_device)); + KOKKOS_IMPL_CUDA_SAFE_CALL(cudaFreeAsync(arg_alloc_ptr, m_stream)); Impl::cuda_device_synchronize( "Kokkos::Cuda: backend fence after async free"); } else { - KOKKOS_IMPL_CUDA_SAFE_CALL( - (Impl::CudaInternal::singleton().cuda_free_wrapper(arg_alloc_ptr))); + KOKKOS_IMPL_CUDA_SAFE_CALL(cudaSetDevice(m_device)); + KOKKOS_IMPL_CUDA_SAFE_CALL(cudaFree(arg_alloc_ptr)); } #else - KOKKOS_IMPL_CUDA_SAFE_CALL( - (Impl::CudaInternal::singleton().cuda_free_wrapper(arg_alloc_ptr))); + KOKKOS_IMPL_CUDA_SAFE_CALL(cudaSetDevice(m_device)); + KOKKOS_IMPL_CUDA_SAFE_CALL(cudaFree(arg_alloc_ptr)); #endif } catch (...) { } @@ -393,8 +390,8 @@ void CudaUVMSpace::impl_deallocate( try { if (arg_alloc_ptr != nullptr) { Kokkos::Impl::num_uvm_allocations--; - KOKKOS_IMPL_CUDA_SAFE_CALL( - (Impl::CudaInternal::singleton().cuda_free_wrapper(arg_alloc_ptr))); + KOKKOS_IMPL_CUDA_SAFE_CALL(cudaSetDevice(m_device)); + KOKKOS_IMPL_CUDA_SAFE_CALL(cudaFree(arg_alloc_ptr)); } } catch (...) { } @@ -424,8 +421,8 @@ void CudaHostPinnedSpace::impl_deallocate( reported_size); } try { - KOKKOS_IMPL_CUDA_SAFE_CALL(( - Impl::CudaInternal::singleton().cuda_free_host_wrapper(arg_alloc_ptr))); + KOKKOS_IMPL_CUDA_SAFE_CALL(cudaSetDevice(m_device)); + KOKKOS_IMPL_CUDA_SAFE_CALL(cudaFreeHost(arg_alloc_ptr)); } catch (...) 
{ } } @@ -438,160 +435,6 @@ void CudaHostPinnedSpace::impl_deallocate( namespace Kokkos { namespace Impl { -#ifdef KOKKOS_ENABLE_DEBUG -SharedAllocationRecord - SharedAllocationRecord::s_root_record; - -SharedAllocationRecord - SharedAllocationRecord::s_root_record; - -SharedAllocationRecord - SharedAllocationRecord::s_root_record; -#endif - -//============================================================================== -// {{{1 - -SharedAllocationRecord::~SharedAllocationRecord() { - auto alloc_size = SharedAllocationRecord::m_alloc_size; - m_space.deallocate(m_label.c_str(), - SharedAllocationRecord::m_alloc_ptr, - alloc_size, (alloc_size - sizeof(SharedAllocationHeader))); -} - -void SharedAllocationRecord::deep_copy_header_no_exec( - void *ptr, const void *header) { - Kokkos::Cuda exec; - Kokkos::Impl::DeepCopy(exec, ptr, header, - sizeof(SharedAllocationHeader)); - exec.fence( - "SharedAllocationRecord::SharedAllocationRecord(): fence after copying header from " - "HostSpace"); -} - -SharedAllocationRecord::~SharedAllocationRecord() { - m_space.deallocate(m_label.c_str(), - SharedAllocationRecord::m_alloc_ptr, - SharedAllocationRecord::m_alloc_size, - (SharedAllocationRecord::m_alloc_size - - sizeof(SharedAllocationHeader))); -} - -SharedAllocationRecord::~SharedAllocationRecord() { - m_space.deallocate(m_label.c_str(), - SharedAllocationRecord::m_alloc_ptr, - SharedAllocationRecord::m_alloc_size, - (SharedAllocationRecord::m_alloc_size - - sizeof(SharedAllocationHeader))); -} - -// end SharedAllocationRecord destructors }}}1 -//============================================================================== - -//============================================================================== -// {{{1 - -SharedAllocationRecord::SharedAllocationRecord( - const Kokkos::CudaSpace &arg_space, const std::string &arg_label, - const size_t arg_alloc_size, - const SharedAllocationRecord::function_type arg_dealloc) - // Pass through allocated [ SharedAllocationHeader , user_memory ] - // Pass through deallocation function - : base_t( -#ifdef KOKKOS_ENABLE_DEBUG - &SharedAllocationRecord::s_root_record, -#endif - Impl::checked_allocation_with_header(arg_space, arg_label, - arg_alloc_size), - sizeof(SharedAllocationHeader) + arg_alloc_size, arg_dealloc, - arg_label), - m_space(arg_space) { - - SharedAllocationHeader header; - - this->base_t::_fill_host_accessible_header_info(header, arg_label); - - // Copy to device memory - Kokkos::Cuda exec; - Kokkos::Impl::DeepCopy( - exec, RecordBase::m_alloc_ptr, &header, sizeof(SharedAllocationHeader)); - exec.fence( - "SharedAllocationRecord::SharedAllocationRecord(): fence after copying header from " - "HostSpace"); -} - -SharedAllocationRecord::SharedAllocationRecord( - const Kokkos::Cuda &arg_exec_space, const Kokkos::CudaSpace &arg_space, - const std::string &arg_label, const size_t arg_alloc_size, - const SharedAllocationRecord::function_type arg_dealloc) - // Pass through allocated [ SharedAllocationHeader , user_memory ] - // Pass through deallocation function - : base_t( -#ifdef KOKKOS_ENABLE_DEBUG - &SharedAllocationRecord::s_root_record, -#endif - Impl::checked_allocation_with_header(arg_exec_space, arg_space, - arg_label, arg_alloc_size), - sizeof(SharedAllocationHeader) + arg_alloc_size, arg_dealloc, - arg_label), - m_space(arg_space) { - - SharedAllocationHeader header; - - this->base_t::_fill_host_accessible_header_info(header, arg_label); - - // Copy to device memory - Kokkos::Impl::DeepCopy(arg_exec_space, - RecordBase::m_alloc_ptr, &header, 
- sizeof(SharedAllocationHeader)); -} - -SharedAllocationRecord::SharedAllocationRecord( - const Kokkos::CudaUVMSpace &arg_space, const std::string &arg_label, - const size_t arg_alloc_size, - const SharedAllocationRecord::function_type arg_dealloc) - // Pass through allocated [ SharedAllocationHeader , user_memory ] - // Pass through deallocation function - : base_t( -#ifdef KOKKOS_ENABLE_DEBUG - &SharedAllocationRecord::s_root_record, -#endif - Impl::checked_allocation_with_header(arg_space, arg_label, - arg_alloc_size), - sizeof(SharedAllocationHeader) + arg_alloc_size, arg_dealloc, - arg_label), - m_space(arg_space) { - this->base_t::_fill_host_accessible_header_info(*base_t::m_alloc_ptr, - arg_label); -} - -SharedAllocationRecord:: - SharedAllocationRecord( - const Kokkos::CudaHostPinnedSpace &arg_space, - const std::string &arg_label, const size_t arg_alloc_size, - const SharedAllocationRecord::function_type arg_dealloc) - // Pass through allocated [ SharedAllocationHeader , user_memory ] - // Pass through deallocation function - : base_t( -#ifdef KOKKOS_ENABLE_DEBUG - &SharedAllocationRecord::s_root_record, -#endif - Impl::checked_allocation_with_header(arg_space, arg_label, - arg_alloc_size), - sizeof(SharedAllocationHeader) + arg_alloc_size, arg_dealloc, - arg_label), - m_space(arg_space) { - this->base_t::_fill_host_accessible_header_info(*base_t::m_alloc_ptr, - arg_label); -} - -// end SharedAllocationRecord constructors }}}1 -//============================================================================== - void cuda_prefetch_pointer(const Cuda &space, const void *ptr, size_t bytes, bool to_device) { if ((ptr == nullptr) || (bytes == 0)) return; @@ -620,19 +463,12 @@ void cuda_prefetch_pointer(const Cuda &space, const void *ptr, size_t bytes, #include -namespace Kokkos { -namespace Impl { - -// To avoid additional compilation cost for something that's (mostly?) not -// performance sensitive, we explicity instantiate these CRTP base classes here, -// where we have access to the associated *_timpl.hpp header files. 
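// The macros that replace these per-space instantiations below bundle the
// same pattern; in toy form (the DEMO_* name is hypothetical), one explicit
// instantiation definition per memory space:
//   #define DEMO_RECORD_INSTANTIATION(SPACE) \
//     template class SharedAllocationRecordCommon<SPACE>;
//   DEMO_RECORD_INSTANTIATION(Kokkos::CudaUVMSpace)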
-template class SharedAllocationRecordCommon; -template class HostInaccessibleSharedAllocationRecordCommon; -template class SharedAllocationRecordCommon; -template class SharedAllocationRecordCommon; - -} // end namespace Impl -} // end namespace Kokkos +KOKKOS_IMPL_HOST_INACCESSIBLE_SHARED_ALLOCATION_RECORD_EXPLICIT_INSTANTIATION( + Kokkos::CudaSpace); +KOKKOS_IMPL_SHARED_ALLOCATION_RECORD_EXPLICIT_INSTANTIATION( + Kokkos::CudaUVMSpace); +KOKKOS_IMPL_SHARED_ALLOCATION_RECORD_EXPLICIT_INSTANTIATION( + Kokkos::CudaHostPinnedSpace); // end Explicit instantiations of CRTP Base classes }}}1 //============================================================================== diff --git a/core/src/Cuda/Kokkos_CudaSpace.hpp b/core/src/Cuda/Kokkos_CudaSpace.hpp index b8fa335cd3b..0e20193e8b4 100644 --- a/core/src/Cuda/Kokkos_CudaSpace.hpp +++ b/core/src/Cuda/Kokkos_CudaSpace.hpp @@ -68,6 +68,11 @@ class CudaSpace { /*--------------------------------*/ CudaSpace(); + + private: + CudaSpace(int device_id, cudaStream_t stream); + + public: CudaSpace(CudaSpace&& rhs) = default; CudaSpace(const CudaSpace& rhs) = default; CudaSpace& operator=(CudaSpace&& rhs) = default; @@ -89,9 +94,11 @@ class CudaSpace { const size_t arg_alloc_size, const size_t arg_logical_size = 0) const; + static CudaSpace impl_create(int device_id, cudaStream_t stream) { + return CudaSpace(device_id, stream); + } + private: - template - friend class Kokkos::Experimental::LogicalMemorySpace; void* impl_allocate(const Cuda& exec_space, const char* arg_label, const size_t arg_alloc_size, const size_t arg_logical_size = 0, @@ -112,10 +119,10 @@ class CudaSpace { static constexpr const char* name() { return m_name; } private: - int m_device; ///< Which Cuda device + int m_device; + cudaStream_t m_stream; static constexpr const char* m_name = "Cuda"; - friend class Kokkos::Impl::SharedAllocationRecord; }; template <> @@ -149,6 +156,11 @@ class CudaUVMSpace { /*--------------------------------*/ CudaUVMSpace(); + + private: + CudaUVMSpace(int device_id, cudaStream_t stream); + + public: CudaUVMSpace(CudaUVMSpace&& rhs) = default; CudaUVMSpace(const CudaUVMSpace& rhs) = default; CudaUVMSpace& operator=(CudaUVMSpace&& rhs) = default; @@ -156,6 +168,16 @@ class CudaUVMSpace { ~CudaUVMSpace() = default; /**\brief Allocate untracked memory in the cuda space */ + template + void* allocate(const ExecutionSpace&, const size_t arg_alloc_size) const { + return allocate(arg_alloc_size); + } + template + void* allocate(const ExecutionSpace&, const char* arg_label, + const size_t arg_alloc_size, + const size_t arg_logical_size = 0) const { + return allocate(arg_label, arg_alloc_size, arg_logical_size); + } void* allocate(const size_t arg_alloc_size) const; void* allocate(const char* arg_label, const size_t arg_alloc_size, const size_t arg_logical_size = 0) const; @@ -167,8 +189,6 @@ class CudaUVMSpace { const size_t arg_logical_size = 0) const; private: - template - friend class Kokkos::Experimental::LogicalMemorySpace; void* impl_allocate(const char* arg_label, const size_t arg_alloc_size, const size_t arg_logical_size = 0, const Kokkos::Tools::SpaceHandle = @@ -189,8 +209,13 @@ class CudaUVMSpace { #endif /*--------------------------------*/ + static CudaUVMSpace impl_create(int device_id, cudaStream_t stream) { + return CudaUVMSpace(device_id, stream); + } + private: - int m_device; ///< Which Cuda device + int m_device; + cudaStream_t m_stream; #ifdef KOKKOS_IMPL_DEBUG_CUDA_PIN_UVM_TO_HOST static bool kokkos_impl_cuda_pin_uvm_to_host_v; @@ -223,6 
+248,11 @@ class CudaHostPinnedSpace { /*--------------------------------*/ CudaHostPinnedSpace(); + + private: + CudaHostPinnedSpace(int device_id, cudaStream_t stream); + + public: CudaHostPinnedSpace(CudaHostPinnedSpace&& rhs) = default; CudaHostPinnedSpace(const CudaHostPinnedSpace& rhs) = default; CudaHostPinnedSpace& operator=(CudaHostPinnedSpace&& rhs) = default; @@ -230,6 +260,16 @@ class CudaHostPinnedSpace { ~CudaHostPinnedSpace() = default; /**\brief Allocate untracked memory in the space */ + template + void* allocate(const ExecutionSpace&, const size_t arg_alloc_size) const { + return allocate(arg_alloc_size); + } + template + void* allocate(const ExecutionSpace&, const char* arg_label, + const size_t arg_alloc_size, + const size_t arg_logical_size = 0) const { + return allocate(arg_label, arg_alloc_size, arg_logical_size); + } void* allocate(const size_t arg_alloc_size) const; void* allocate(const char* arg_label, const size_t arg_alloc_size, const size_t arg_logical_size = 0) const; @@ -240,9 +280,11 @@ class CudaHostPinnedSpace { const size_t arg_alloc_size, const size_t arg_logical_size = 0) const; + static CudaHostPinnedSpace impl_create(int device_id, cudaStream_t stream) { + return CudaHostPinnedSpace(device_id, stream); + } + private: - template - friend class Kokkos::Experimental::LogicalMemorySpace; void* impl_allocate(const char* arg_label, const size_t arg_alloc_size, const size_t arg_logical_size = 0, const Kokkos::Tools::SpaceHandle = @@ -258,6 +300,9 @@ class CudaHostPinnedSpace { static constexpr const char* name() { return m_name; } private: + int m_device; + cudaStream_t m_stream; + static constexpr const char* m_name = "CudaHostPinned"; /*--------------------------------*/ @@ -280,15 +325,12 @@ const std::unique_ptr& cuda_get_deep_copy_space( bool initialize = true); static_assert(Kokkos::Impl::MemorySpaceAccess::assignable, - ""); -static_assert(Kokkos::Impl::MemorySpaceAccess::assignable, - ""); + Kokkos::CudaSpace>::assignable); +static_assert(Kokkos::Impl::MemorySpaceAccess< + Kokkos::CudaUVMSpace, Kokkos::CudaUVMSpace>::assignable); static_assert( Kokkos::Impl::MemorySpaceAccess::assignable, - ""); + Kokkos::CudaHostPinnedSpace>::assignable); //---------------------------------------- @@ -516,179 +558,10 @@ struct DeepCopy -class SharedAllocationRecord - : public HostInaccessibleSharedAllocationRecordCommon { - private: - friend class SharedAllocationRecord; - friend class SharedAllocationRecordCommon; - friend class HostInaccessibleSharedAllocationRecordCommon; - - using RecordBase = SharedAllocationRecord; - using base_t = - HostInaccessibleSharedAllocationRecordCommon; - - SharedAllocationRecord(const SharedAllocationRecord&) = delete; - SharedAllocationRecord& operator=(const SharedAllocationRecord&) = delete; - -#ifdef KOKKOS_ENABLE_DEBUG - static RecordBase s_root_record; -#endif - - const Kokkos::CudaSpace m_space; - - protected: - ~SharedAllocationRecord(); - SharedAllocationRecord() = default; - - // This constructor does not forward to the one without exec_space arg - // in order to work around https://github.com/kokkos/kokkos/issues/5258 - // This constructor is templated so I can't just put it into the cpp file - // like the other constructor. 
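// The workaround in toy form: a constructor template has to stay visible in
// the header (or be explicitly instantiated for every execution space in the
// .cpp), while non-template constructors can be defined out-of-line:
//   struct Record {
//     Record(int n);                                       // in Record.cpp
//     template <class Exec> Record(const Exec&, int n) {}  // header-only
//   };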
- template - SharedAllocationRecord( - const ExecutionSpace& /*exec_space*/, const Kokkos::CudaSpace& arg_space, - const std::string& arg_label, const size_t arg_alloc_size, - const RecordBase::function_type arg_dealloc = &base_t::deallocate) - : base_t( -#ifdef KOKKOS_ENABLE_DEBUG - &SharedAllocationRecord::s_root_record, -#endif - Impl::checked_allocation_with_header(arg_space, arg_label, - arg_alloc_size), - sizeof(SharedAllocationHeader) + arg_alloc_size, arg_dealloc, - arg_label), - m_space(arg_space) { - - SharedAllocationHeader header; - - this->base_t::_fill_host_accessible_header_info(header, arg_label); - - // Copy to device memory - // workaround for issue with NVCC and MSVC - // https://github.com/kokkos/kokkos/issues/5258 - deep_copy_header_no_exec(RecordBase::m_alloc_ptr, &header); - } - - SharedAllocationRecord( - const Kokkos::Cuda& exec_space, const Kokkos::CudaSpace& arg_space, - const std::string& arg_label, const size_t arg_alloc_size, - const RecordBase::function_type arg_dealloc = &base_t::deallocate); - - SharedAllocationRecord( - const Kokkos::CudaSpace& arg_space, const std::string& arg_label, - const size_t arg_alloc_size, - const RecordBase::function_type arg_dealloc = &base_t::deallocate); - - // helper function to work around MSVC+NVCC issue - // https://github.com/kokkos/kokkos/issues/5258 - static void deep_copy_header_no_exec(void*, const void*); -}; - -template <> -class SharedAllocationRecord - : public SharedAllocationRecordCommon { - private: - friend class SharedAllocationRecordCommon; - - using base_t = SharedAllocationRecordCommon; - using RecordBase = SharedAllocationRecord; - - SharedAllocationRecord(const SharedAllocationRecord&) = delete; - SharedAllocationRecord& operator=(const SharedAllocationRecord&) = delete; - - static RecordBase s_root_record; - - const Kokkos::CudaUVMSpace m_space; - - protected: - ~SharedAllocationRecord(); - SharedAllocationRecord() = default; - - // This constructor does not forward to the one without exec_space arg - // in order to work around https://github.com/kokkos/kokkos/issues/5258 - // This constructor is templated so I can't just put it into the cpp file - // like the other constructor. 
- template - SharedAllocationRecord( - const ExecutionSpace& /*exec_space*/, - const Kokkos::CudaUVMSpace& arg_space, const std::string& arg_label, - const size_t arg_alloc_size, - const RecordBase::function_type arg_dealloc = &base_t::deallocate) - : base_t( -#ifdef KOKKOS_ENABLE_DEBUG - &SharedAllocationRecord::s_root_record, -#endif - Impl::checked_allocation_with_header(arg_space, arg_label, - arg_alloc_size), - sizeof(SharedAllocationHeader) + arg_alloc_size, arg_dealloc, - arg_label), - m_space(arg_space) { - this->base_t::_fill_host_accessible_header_info(*base_t::m_alloc_ptr, - arg_label); - } - - SharedAllocationRecord( - const Kokkos::CudaUVMSpace& arg_space, const std::string& arg_label, - const size_t arg_alloc_size, - const RecordBase::function_type arg_dealloc = &base_t::deallocate); -}; - -template <> -class SharedAllocationRecord - : public SharedAllocationRecordCommon { - private: - friend class SharedAllocationRecordCommon; - - using RecordBase = SharedAllocationRecord; - using base_t = SharedAllocationRecordCommon; - - SharedAllocationRecord(const SharedAllocationRecord&) = delete; - SharedAllocationRecord& operator=(const SharedAllocationRecord&) = delete; - - static RecordBase s_root_record; - - const Kokkos::CudaHostPinnedSpace m_space; - - protected: - ~SharedAllocationRecord(); - SharedAllocationRecord() = default; - - // This constructor does not forward to the one without exec_space arg - // in order to work around https://github.com/kokkos/kokkos/issues/5258 - // This constructor is templated so I can't just put it into the cpp file - // like the other constructor. - template - SharedAllocationRecord( - const ExecutionSpace& /*exec_space*/, - const Kokkos::CudaHostPinnedSpace& arg_space, - const std::string& arg_label, const size_t arg_alloc_size, - const RecordBase::function_type arg_dealloc = &base_t::deallocate) - : base_t( -#ifdef KOKKOS_ENABLE_DEBUG - &SharedAllocationRecord::s_root_record, -#endif - Impl::checked_allocation_with_header(arg_space, arg_label, - arg_alloc_size), - sizeof(SharedAllocationHeader) + arg_alloc_size, arg_dealloc, - arg_label), - m_space(arg_space) { - this->base_t::_fill_host_accessible_header_info(*base_t::m_alloc_ptr, - arg_label); - } - - SharedAllocationRecord( - const Kokkos::CudaHostPinnedSpace& arg_space, - const std::string& arg_label, const size_t arg_alloc_size, - const RecordBase::function_type arg_dealloc = &base_t::deallocate); -}; - -} // namespace Impl -} // namespace Kokkos +KOKKOS_IMPL_HOST_INACCESSIBLE_SHARED_ALLOCATION_SPECIALIZATION( + Kokkos::CudaSpace); +KOKKOS_IMPL_SHARED_ALLOCATION_SPECIALIZATION(Kokkos::CudaUVMSpace); +KOKKOS_IMPL_SHARED_ALLOCATION_SPECIALIZATION(Kokkos::CudaHostPinnedSpace); //---------------------------------------------------------------------------- //---------------------------------------------------------------------------- diff --git a/core/src/Cuda/Kokkos_Cuda_Error.hpp b/core/src/Cuda/Kokkos_Cuda_Error.hpp index f68e05f7804..c4458c910ca 100644 --- a/core/src/Cuda/Kokkos_Cuda_Error.hpp +++ b/core/src/Cuda/Kokkos_Cuda_Error.hpp @@ -27,10 +27,6 @@ namespace Kokkos { namespace Impl { -void cuda_stream_synchronize( - const cudaStream_t stream, - Kokkos::Tools::Experimental::SpecialSynchronizationCases reason, - const std::string& name); void cuda_device_synchronize(const std::string& name); void cuda_stream_synchronize(const cudaStream_t stream, const std::string& name); diff --git a/core/src/Cuda/Kokkos_Cuda_GraphNodeKernel.hpp b/core/src/Cuda/Kokkos_Cuda_GraphNodeKernel.hpp index 
a4d064e544a..5a821ab64a3 100644 --- a/core/src/Cuda/Kokkos_Cuda_GraphNodeKernel.hpp +++ b/core/src/Cuda/Kokkos_Cuda_GraphNodeKernel.hpp @@ -23,8 +23,7 @@ #include -#include // GraphAccess needs to be complete -#include // SharedAllocationRecord +#include // GraphAccess needs to be complete #include #include @@ -50,10 +49,6 @@ class GraphNodeKernelImpl m_graph_ptr = nullptr; Kokkos::ObservingRawPtr m_graph_node_ptr = nullptr; - // Note: owned pointer to CudaSpace memory (used for global memory launches), - // which we're responsible for deallocating, but not responsible for calling - // its destructor. - using Record = Kokkos::Impl::SharedAllocationRecord; // Basically, we have to make this mutable for the same reasons that the // global kernel buffers in the Cuda instance are mutable... mutable Kokkos::OwningRawPtr m_driver_storage = nullptr; @@ -82,9 +77,7 @@ class GraphNodeKernelImpl allocate_driver_memory_buffer() const { KOKKOS_EXPECTS(m_driver_storage == nullptr) - - auto* record = Record::allocate( - Kokkos::CudaSpace{}, "GraphNodeKernel global memory functor storage", - sizeof(base_t)); - - Record::increment(record); - m_driver_storage = reinterpret_cast(record->data()); + m_driver_storage = static_cast(Kokkos::CudaSpace().allocate( + "GraphNodeKernel global memory functor storage", sizeof(base_t))); KOKKOS_ENSURES(m_driver_storage != nullptr) return m_driver_storage; } diff --git a/core/src/Cuda/Kokkos_Cuda_Instance.cpp b/core/src/Cuda/Kokkos_Cuda_Instance.cpp index d7f853d9910..849e8b3b30e 100644 --- a/core/src/Cuda/Kokkos_Cuda_Instance.cpp +++ b/core/src/Cuda/Kokkos_Cuda_Instance.cpp @@ -26,10 +26,10 @@ #include -#include -#include -#include -#include +//#include +//#include +//#include +//#include #include #include #include @@ -97,21 +97,21 @@ __global__ void query_cuda_kernel_arch(int *d_arch) { } /** Query what compute capability is actually launched to the device: */ -int cuda_kernel_arch() { +int cuda_kernel_arch(int device_id) { int arch = 0; int *d_arch = nullptr; - KOKKOS_IMPL_CUDA_SAFE_CALL((CudaInternal::singleton().cuda_malloc_wrapper( - reinterpret_cast(&d_arch), sizeof(int)))); - KOKKOS_IMPL_CUDA_SAFE_CALL((CudaInternal::singleton().cuda_memcpy_wrapper( - d_arch, &arch, sizeof(int), cudaMemcpyDefault))); + KOKKOS_IMPL_CUDA_SAFE_CALL(cudaSetDevice(device_id)); + KOKKOS_IMPL_CUDA_SAFE_CALL( + cudaMalloc(reinterpret_cast(&d_arch), sizeof(int))); + KOKKOS_IMPL_CUDA_SAFE_CALL( + cudaMemcpy(d_arch, &arch, sizeof(int), cudaMemcpyDefault)); query_cuda_kernel_arch<<<1, 1>>>(d_arch); - KOKKOS_IMPL_CUDA_SAFE_CALL((CudaInternal::singleton().cuda_memcpy_wrapper( - &arch, d_arch, sizeof(int), cudaMemcpyDefault))); KOKKOS_IMPL_CUDA_SAFE_CALL( - (CudaInternal::singleton().cuda_free_wrapper(d_arch))); + cudaMemcpy(&arch, d_arch, sizeof(int), cudaMemcpyDefault)); + KOKKOS_IMPL_CUDA_SAFE_CALL(cudaFree(d_arch)); return arch; } @@ -135,7 +135,6 @@ Kokkos::View cuda_global_unique_token_locks( return locks; } -// FIXME_CUDA_MULTIPLE_DEVICES void cuda_device_synchronize(const std::string &name) { Kokkos::Tools::Experimental::Impl::profile_fence_event( name, @@ -144,16 +143,16 @@ void cuda_device_synchronize(const std::string &name) { #if defined(KOKKOS_COMPILER_CLANG) // annotate with __host__ silence a clang warning about using // cudaDeviceSynchronize in device code - [] __host__() { - KOKKOS_IMPL_CUDA_SAFE_CALL( - (CudaInternal::singleton().cuda_device_synchronize_wrapper())); - }); + [] __host__() #else - []() { - KOKKOS_IMPL_CUDA_SAFE_CALL( - 
(CudaInternal::singleton().cuda_device_synchronize_wrapper())); - }); + []() #endif + { + for (int cuda_device : Kokkos::Impl::CudaInternal::cuda_devices) { + KOKKOS_IMPL_CUDA_SAFE_CALL(cudaSetDevice(cuda_device)); + KOKKOS_IMPL_CUDA_SAFE_CALL(cudaDeviceSynchronize()); + } + }); } void cuda_stream_synchronize(const cudaStream_t stream, const CudaInternal *ptr, @@ -168,25 +167,11 @@ void cuda_stream_synchronize(const cudaStream_t stream, const CudaInternal *ptr, }); } -void cuda_stream_synchronize( - const cudaStream_t stream, - Kokkos::Tools::Experimental::SpecialSynchronizationCases reason, - const std::string &name) { - Kokkos::Tools::Experimental::Impl::profile_fence_event( - name, reason, [&]() { - KOKKOS_IMPL_CUDA_SAFE_CALL( - (Impl::CudaInternal::singleton().cuda_stream_synchronize_wrapper( - stream))); - }); -} - void cuda_internal_error_throw(cudaError e, const char *name, const char *file, const int line) { std::ostringstream out; - out << name << " error( " - << CudaInternal::singleton().cuda_get_error_name_wrapper(e) - << "): " - << CudaInternal::singleton().cuda_get_error_string_wrapper(e); + out << name << " error( " << cudaGetErrorName(e) + << "): " << cudaGetErrorString(e); if (file) { out << " " << file << ":" << line; } @@ -196,10 +181,8 @@ void cuda_internal_error_throw(cudaError e, const char *name, const char *file, void cuda_internal_error_abort(cudaError e, const char *name, const char *file, const int line) { std::ostringstream out; - out << name << " error( " - << CudaInternal::singleton().cuda_get_error_name_wrapper(e) - << "): " - << CudaInternal::singleton().cuda_get_error_string_wrapper(e); + out << name << " error( " << cudaGetErrorName(e) + << "): " << cudaGetErrorString(e); if (file) { out << " " << file << ":" << line; } @@ -208,96 +191,6 @@ void cuda_internal_error_abort(cudaError e, const char *name, const char *file, host_abort(out.str().c_str()); } -//---------------------------------------------------------------------------- -// Some significant cuda device properties: -// -// cudaDeviceProp::name : Text label for device -// cudaDeviceProp::major : Device major number -// cudaDeviceProp::minor : Device minor number -// cudaDeviceProp::warpSize : number of threads per warp -// cudaDeviceProp::multiProcessorCount : number of multiprocessors -// cudaDeviceProp::sharedMemPerBlock : capacity of shared memory per block -// cudaDeviceProp::totalConstMem : capacity of constant memory -// cudaDeviceProp::totalGlobalMem : capacity of global memory -// cudaDeviceProp::maxGridSize[3] : maximum grid size - -// -// Section 4.4.2.4 of the CUDA Toolkit Reference Manual -// -// struct cudaDeviceProp { -// char name[256]; -// size_t totalGlobalMem; -// size_t sharedMemPerBlock; -// int regsPerBlock; -// int warpSize; -// size_t memPitch; -// int maxThreadsPerBlock; -// int maxThreadsDim[3]; -// int maxGridSize[3]; -// size_t totalConstMem; -// int major; -// int minor; -// int clockRate; -// size_t textureAlignment; -// int deviceOverlap; -// int multiProcessorCount; -// int kernelExecTimeoutEnabled; -// int integrated; -// int canMapHostMemory; -// int computeMode; -// int concurrentKernels; -// int ECCEnabled; -// int pciBusID; -// int pciDeviceID; -// int tccDriver; -// int asyncEngineCount; -// int unifiedAddressing; -// int memoryClockRate; -// int memoryBusWidth; -// int l2CacheSize; -// int maxThreadsPerMultiProcessor; -// }; - -namespace { - -class CudaInternalDevices { - public: - enum { MAXIMUM_DEVICE_COUNT = 64 }; - struct cudaDeviceProp 
m_cudaProp[MAXIMUM_DEVICE_COUNT]; - int m_cudaDevCount; - - CudaInternalDevices(); - - static const CudaInternalDevices &singleton(); -}; - -CudaInternalDevices::CudaInternalDevices() { - // See 'cudaSetDeviceFlags' for host-device thread interaction - // Section 4.4.2.6 of the CUDA Toolkit Reference Manual - - KOKKOS_IMPL_CUDA_SAFE_CALL( - (CudaInternal::singleton().cuda_get_device_count_wrapper( - &m_cudaDevCount))); - - if (m_cudaDevCount > MAXIMUM_DEVICE_COUNT) { - Kokkos::abort( - "Sorry, you have more GPUs per node than we thought anybody would ever " - "have. Please report this to github.com/kokkos/kokkos."); - } - for (int i = 0; i < m_cudaDevCount; ++i) { - KOKKOS_IMPL_CUDA_SAFE_CALL( - (CudaInternal::singleton().cuda_get_device_properties_wrapper( - m_cudaProp + i, i))); - } -} - -const CudaInternalDevices &CudaInternalDevices::singleton() { - static CudaInternalDevices self; - return self; -} - -} // namespace - //---------------------------------------------------------------------------- int Impl::CudaInternal::concurrency() { @@ -307,8 +200,6 @@ int Impl::CudaInternal::concurrency() { } void CudaInternal::print_configuration(std::ostream &s) const { - const CudaInternalDevices &dev_info = CudaInternalDevices::singleton(); - #if defined(KOKKOS_ENABLE_CUDA) s << "macro KOKKOS_ENABLE_CUDA : defined\n"; #endif @@ -317,22 +208,23 @@ void CudaInternal::print_configuration(std::ostream &s) const { << CUDA_VERSION / 1000 << "." << (CUDA_VERSION % 1000) / 10 << '\n'; #endif - for (int i = 0; i < dev_info.m_cudaDevCount; ++i) { - s << "Kokkos::Cuda[ " << i << " ] " << dev_info.m_cudaProp[i].name - << " capability " << dev_info.m_cudaProp[i].major << "." - << dev_info.m_cudaProp[i].minor << ", Total Global Memory: " - << human_memory_size(dev_info.m_cudaProp[i].totalGlobalMem) + for (int i : get_visible_devices()) { + cudaDeviceProp prop; + KOKKOS_IMPL_CUDA_SAFE_CALL(cudaGetDeviceProperties(&prop, i)); + s << "Kokkos::Cuda[ " << i << " ] " << prop.name << " capability " + << prop.major << "." 
<< prop.minor + << ", Total Global Memory: " << human_memory_size(prop.totalGlobalMem) << ", Shared Memory per Block: " - << human_memory_size(dev_info.m_cudaProp[i].sharedMemPerBlock); + << human_memory_size(prop.sharedMemPerBlock); if (m_cudaDev == i) s << " : Selected"; - s << std::endl; + s << '\n'; } } //---------------------------------------------------------------------------- CudaInternal::~CudaInternal() { - if (m_stream || m_scratchSpace || m_scratchFlags || m_scratchUnified) { + if (m_scratchSpace || m_scratchFlags || m_scratchUnified) { std::cerr << "Kokkos::Cuda ERROR: Failed to call Kokkos::Cuda::finalize()" << std::endl; } @@ -370,45 +262,53 @@ void CudaInternal::fence() const { fence("Kokkos::CudaInternal::fence(): Unnamed Instance Fence"); } -void CudaInternal::initialize(cudaStream_t stream, bool manage_stream) { +void CudaInternal::initialize(cudaStream_t stream) { KOKKOS_EXPECTS(!is_initialized()); if (was_finalized) Kokkos::abort("Calling Cuda::initialize after Cuda::finalize is illegal\n"); was_initialized = true; + // Check that the device associated with the stream matches cuda_device + CUcontext context; + KOKKOS_IMPL_CUDA_SAFE_CALL(cudaError_t(cuStreamGetCtx(stream, &context))); + KOKKOS_IMPL_CUDA_SAFE_CALL(cudaError_t(cuCtxPushCurrent(context))); + KOKKOS_IMPL_CUDA_SAFE_CALL(cudaError_t(cuCtxGetDevice(&m_cudaDev))); + KOKKOS_IMPL_CUDA_SAFE_CALL(cudaSetDevice(m_cudaDev)); + + m_stream = stream; + CudaInternal::cuda_devices.insert(m_cudaDev); + + // Allocate a staging buffer for constant mem in pinned host memory + // and an event to avoid overwriting driver for previous kernel launches + if (!constantMemHostStagingPerDevice[m_cudaDev]) + KOKKOS_IMPL_CUDA_SAFE_CALL((cuda_malloc_host_wrapper( + reinterpret_cast(&constantMemHostStagingPerDevice[m_cudaDev]), + CudaTraits::ConstantMemoryUsage))); + + if (!constantMemReusablePerDevice[m_cudaDev]) + KOKKOS_IMPL_CUDA_SAFE_CALL( + (cuda_event_create_wrapper(&constantMemReusablePerDevice[m_cudaDev]))); + //---------------------------------- // Multiblock reduction uses scratch flags for counters // and scratch space for partial reduction values. // Allocate some initial space. This will grow as needed. { - const unsigned reduce_block_count = - m_maxWarpCount * Impl::CudaTraits::WarpSize; + // Maximum number of warps, + // at most one warp per thread in a warp for reduction. 
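The initialize() hunk above derives the device id from the user-provided stream instead of trusting a process-wide singleton. A minimal self-contained sketch of that driver-API sequence follows; the function name is illustrative, and where the hunk leaves the acquired context pushed before calling cudaSetDevice, this sketch pops it again so it stays side-effect free:

```cpp
#include <cuda.h>
#include <cuda_runtime.h>
#include <cassert>

// Recover the device that owns a cudaStream_t, mirroring the
// cuStreamGetCtx / cuCtxGetDevice sequence in CudaInternal::initialize().
int device_of_stream(cudaStream_t stream) {
  CUcontext context;
  assert(cuStreamGetCtx(stream, &context) == CUDA_SUCCESS);  // stream -> context
  assert(cuCtxPushCurrent(context) == CUDA_SUCCESS);   // make it current
  CUdevice device;
  assert(cuCtxGetDevice(&device) == CUDA_SUCCESS);     // context -> device
  assert(cuCtxPopCurrent(&context) == CUDA_SUCCESS);   // restore prior state
  return static_cast<int>(device);
}
```

This is what lets a Cuda instance constructed from a foreign stream record the correct m_cudaDev without any global device-count bookkeeping.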
+ auto const maxWarpCount = std::min( + m_deviceProp.maxThreadsPerBlock / CudaTraits::WarpSize, + CudaTraits::WarpSize); + unsigned const reduce_block_count = + maxWarpCount * Impl::CudaTraits::WarpSize; (void)scratch_unified(16 * sizeof(size_type)); (void)scratch_flags(reduce_block_count * 2 * sizeof(size_type)); (void)scratch_space(reduce_block_count * 16 * sizeof(size_type)); } - // Init the array for used for arbitrarily sized atomics - if (this == &singleton()) { - desul::Impl::init_lock_arrays(); // FIXME - } - - // Allocate a staging buffer for constant mem in pinned host memory - // and an event to avoid overwriting driver for previous kernel launches - if (this == &singleton()) { - KOKKOS_IMPL_CUDA_SAFE_CALL((cuda_malloc_host_wrapper( - reinterpret_cast(&constantMemHostStaging), - CudaTraits::ConstantMemoryUsage))); - - KOKKOS_IMPL_CUDA_SAFE_CALL( - (cuda_event_create_wrapper(&constantMemReusable))); - } - - m_stream = stream; - m_manage_stream = manage_stream; for (int i = 0; i < m_n_team_scratch; ++i) { m_team_scratch_current_size[i] = 0; m_team_scratch_ptr[i] = nullptr; @@ -427,22 +327,23 @@ void CudaInternal::initialize(cudaStream_t stream, bool manage_stream) { Cuda::size_type *CudaInternal::scratch_flags(const std::size_t size) const { if (verify_is_initialized("scratch_flags") && m_scratchFlagsCount < scratch_count(size)) { - m_scratchFlagsCount = scratch_count(size); + auto mem_space = Kokkos::CudaSpace::impl_create(m_cudaDev, m_stream); - using Record = - Kokkos::Impl::SharedAllocationRecord; + if (m_scratchFlags) { + mem_space.deallocate(m_scratchFlags, + m_scratchFlagsCount * sizeScratchGrain); + } - if (m_scratchFlags) Record::decrement(Record::get_record(m_scratchFlags)); + m_scratchFlagsCount = scratch_count(size); std::size_t alloc_size = multiply_overflow_abort(m_scratchFlagsCount, sizeScratchGrain); - Record *const r = Record::allocate( - Kokkos::CudaSpace(), "Kokkos::InternalScratchFlags", alloc_size); - - Record::increment(r); - - m_scratchFlags = reinterpret_cast(r->data()); + m_scratchFlags = static_cast( + mem_space.allocate("Kokkos::InternalScratchFlags", alloc_size)); + // We only zero-initialize the allocation when we actually allocate. + // It's the responsibility of the features using scratch_flags, + // namely parallel_reduce and parallel_scan, to reset the used values to 0. 
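The scratch_flags() rewrite above, and the scratch_space()/scratch_unified() hunks that follow, all share one grow-only pattern. A condensed sketch of that pattern, where GrowOnlyScratch, MemSpace, and the label are illustrative stand-ins for the impl_create()-built memory spaces in the diff; note that of the three, only scratch_flags zero-fills:

```cpp
#include <cuda_runtime.h>
#include <cstddef>

// Grow-only device scratch: reallocate only when the request exceeds the
// current capacity; zero-fill only at (re)allocation time, as scratch_flags
// does. parallel_reduce/parallel_scan reset the words they actually use.
template <class MemSpace>
struct GrowOnlyScratch {
  MemSpace space;
  void* ptr = nullptr;
  std::size_t bytes = 0;

  void* get(std::size_t requested) {
    if (requested > bytes) {
      if (ptr) space.deallocate(ptr, bytes);  // release undersized buffer
      ptr = space.allocate("InternalScratch", requested);
      bytes = requested;
      cudaMemset(ptr, 0, bytes);  // zero once, on allocation only
    }
    return ptr;
  }
};
```

Keeping the zero-fill on the allocation path is what makes the comment above sufficient: consumers of scratch_flags only ever need to reset the values they used, never the whole buffer.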
KOKKOS_IMPL_CUDA_SAFE_CALL( (cuda_memset_wrapper(m_scratchFlags, 0, alloc_size))); } @@ -453,21 +354,19 @@ Cuda::size_type *CudaInternal::scratch_flags(const std::size_t size) const { Cuda::size_type *CudaInternal::scratch_space(const std::size_t size) const { if (verify_is_initialized("scratch_space") && m_scratchSpaceCount < scratch_count(size)) { - m_scratchSpaceCount = scratch_count(size); + auto mem_space = Kokkos::CudaSpace::impl_create(m_cudaDev, m_stream); - using Record = - Kokkos::Impl::SharedAllocationRecord; + if (m_scratchSpace) { + mem_space.deallocate(m_scratchSpace, + m_scratchSpaceCount * sizeScratchGrain); + } - if (m_scratchSpace) Record::decrement(Record::get_record(m_scratchSpace)); + m_scratchSpaceCount = scratch_count(size); std::size_t alloc_size = multiply_overflow_abort(m_scratchSpaceCount, sizeScratchGrain); - Record *const r = Record::allocate( - Kokkos::CudaSpace(), "Kokkos::InternalScratchSpace", alloc_size); - - Record::increment(r); - - m_scratchSpace = reinterpret_cast(r->data()); + m_scratchSpace = static_cast( + mem_space.allocate("Kokkos::InternalScratchSpace", alloc_size)); } return m_scratchSpace; @@ -476,23 +375,20 @@ Cuda::size_type *CudaInternal::scratch_space(const std::size_t size) const { Cuda::size_type *CudaInternal::scratch_unified(const std::size_t size) const { if (verify_is_initialized("scratch_unified") && m_scratchUnifiedCount < scratch_count(size)) { - m_scratchUnifiedCount = scratch_count(size); + auto mem_space = + Kokkos::CudaHostPinnedSpace::impl_create(m_cudaDev, m_stream); - using Record = - Kokkos::Impl::SharedAllocationRecord; + if (m_scratchUnified) { + mem_space.deallocate(m_scratchUnified, + m_scratchUnifiedCount * sizeScratchGrain); + } - if (m_scratchUnified) - Record::decrement(Record::get_record(m_scratchUnified)); + m_scratchUnifiedCount = scratch_count(size); std::size_t alloc_size = multiply_overflow_abort(m_scratchUnifiedCount, sizeScratchGrain); - Record *const r = - Record::allocate(Kokkos::CudaHostPinnedSpace(), - "Kokkos::InternalScratchUnified", alloc_size); - - Record::increment(r); - - m_scratchUnified = reinterpret_cast(r->data()); + m_scratchUnified = static_cast( + mem_space.allocate("Kokkos::InternalScratchUnified", alloc_size)); } return m_scratchUnified; @@ -500,21 +396,16 @@ Cuda::size_type *CudaInternal::scratch_unified(const std::size_t size) const { Cuda::size_type *CudaInternal::scratch_functor(const std::size_t size) const { if (verify_is_initialized("scratch_functor") && m_scratchFunctorSize < size) { - m_scratchFunctorSize = size; - - using Record = - Kokkos::Impl::SharedAllocationRecord; - - if (m_scratchFunctor) - Record::decrement(Record::get_record(m_scratchFunctor)); + auto mem_space = Kokkos::CudaSpace::impl_create(m_cudaDev, m_stream); - Record *const r = - Record::allocate(Kokkos::CudaSpace(), "Kokkos::InternalScratchFunctor", - m_scratchFunctorSize); + if (m_scratchFunctor) { + mem_space.deallocate(m_scratchFunctor, m_scratchFunctorSize); + } - Record::increment(r); + m_scratchFunctorSize = size; - m_scratchFunctor = reinterpret_cast(r->data()); + m_scratchFunctor = static_cast(mem_space.allocate( + "Kokkos::InternalScratchFunctor", m_scratchFunctorSize)); } return m_scratchFunctor; @@ -537,21 +428,21 @@ void *CudaInternal::resize_team_scratch_space(int scratch_pool_id, // Multiple ParallelFor/Reduce Teams can call this function at the same time // and invalidate the m_team_scratch_ptr. We use a pool to avoid any race // condition. 
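The resize_team_scratch_space() hunk just below drops kokkos_realloc in favor of an explicit deallocate-then-allocate. A short sketch of why that is the cheaper choice, assuming only the allocate/deallocate interface used throughout these hunks (the helper name is hypothetical): team scratch is per-dispatch working memory, so nothing in the old buffer needs to survive the resize.

```cpp
#include <Kokkos_Core.hpp>
#include <cstddef>

// Resize-without-copy: valid whenever the old contents are dead, as for
// team scratch between dispatches. realloc semantics would pay for a
// device-to-device copy that nobody reads.
void* resize_discard(const Kokkos::CudaSpace& space, void* old_ptr,
                     std::size_t old_bytes, std::size_t new_bytes) {
  if (old_ptr) space.deallocate(old_ptr, old_bytes);
  return space.allocate("Kokkos::CudaSpace::TeamScratchMemory", new_bytes);
}
```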
+ auto mem_space = Kokkos::CudaSpace::impl_create(m_cudaDev, m_stream); if (m_team_scratch_current_size[scratch_pool_id] == 0) { m_team_scratch_current_size[scratch_pool_id] = bytes; m_team_scratch_ptr[scratch_pool_id] = - Kokkos::kokkos_malloc( - "Kokkos::CudaSpace::TeamScratchMemory", - m_team_scratch_current_size[scratch_pool_id]); + mem_space.allocate("Kokkos::CudaSpace::TeamScratchMemory", + m_team_scratch_current_size[scratch_pool_id]); } if ((bytes > m_team_scratch_current_size[scratch_pool_id]) || ((bytes < m_team_scratch_current_size[scratch_pool_id]) && (force_shrink))) { + mem_space.deallocate(m_team_scratch_ptr[scratch_pool_id], + m_team_scratch_current_size[scratch_pool_id]); m_team_scratch_current_size[scratch_pool_id] = bytes; m_team_scratch_ptr[scratch_pool_id] = - Kokkos::kokkos_realloc( - m_team_scratch_ptr[scratch_pool_id], - m_team_scratch_current_size[scratch_pool_id]); + mem_space.allocate("Kokkos::CudaSpace::TeamScratchMemory", bytes); } return m_team_scratch_ptr[scratch_pool_id]; } @@ -568,50 +459,33 @@ void CudaInternal::finalize() { was_finalized = true; - // Only finalize this if we're the singleton - if (this == &singleton()) { - (void)Impl::cuda_global_unique_token_locks(true); - desul::Impl::finalize_lock_arrays(); // FIXME - - KOKKOS_IMPL_CUDA_SAFE_CALL( - (cuda_free_host_wrapper(constantMemHostStaging))); - KOKKOS_IMPL_CUDA_SAFE_CALL( - (cuda_event_destroy_wrapper(constantMemReusable))); - auto &deep_copy_space = - Kokkos::Impl::cuda_get_deep_copy_space(/*initialize*/ false); - if (deep_copy_space) - deep_copy_space->impl_internal_space_instance()->finalize(); - KOKKOS_IMPL_CUDA_SAFE_CALL( - (cuda_stream_destroy_wrapper(cuda_get_deep_copy_stream()))); - } - + auto cuda_mem_space = Kokkos::CudaSpace::impl_create(m_cudaDev, m_stream); if (nullptr != m_scratchSpace || nullptr != m_scratchFlags) { - using RecordCuda = Kokkos::Impl::SharedAllocationRecord; - using RecordHost = - Kokkos::Impl::SharedAllocationRecord; - - RecordCuda::decrement(RecordCuda::get_record(m_scratchFlags)); - RecordCuda::decrement(RecordCuda::get_record(m_scratchSpace)); - RecordHost::decrement(RecordHost::get_record(m_scratchUnified)); - if (m_scratchFunctorSize > 0) - RecordCuda::decrement(RecordCuda::get_record(m_scratchFunctor)); + auto host_mem_space = + Kokkos::CudaHostPinnedSpace::impl_create(m_cudaDev, m_stream); + cuda_mem_space.deallocate(m_scratchFlags, + m_scratchFlagsCount * sizeScratchGrain); + cuda_mem_space.deallocate(m_scratchSpace, + m_scratchSpaceCount * sizeScratchGrain); + host_mem_space.deallocate(m_scratchUnified, + m_scratchUnifiedCount * sizeScratchGrain); + if (m_scratchFunctorSize > 0) { + cuda_mem_space.deallocate(m_scratchFunctor, m_scratchFunctorSize); + } } for (int i = 0; i < m_n_team_scratch; ++i) { if (m_team_scratch_current_size[i] > 0) - Kokkos::kokkos_free(m_team_scratch_ptr[i]); + cuda_mem_space.deallocate(m_team_scratch_ptr[i], + m_team_scratch_current_size[i]); } - if (m_manage_stream && get_stream() != nullptr) - KOKKOS_IMPL_CUDA_SAFE_CALL((cuda_stream_destroy_wrapper(m_stream))); - m_scratchSpaceCount = 0; m_scratchFlagsCount = 0; m_scratchUnifiedCount = 0; m_scratchSpace = nullptr; m_scratchFlags = nullptr; m_scratchUnified = nullptr; - m_stream = nullptr; for (int i = 0; i < m_n_team_scratch; ++i) { m_team_scratch_current_size[i] = 0; m_team_scratch_ptr[i] = nullptr; @@ -624,30 +498,6 @@ void CudaInternal::finalize() { //---------------------------------------------------------------------------- -Cuda::size_type 
cuda_internal_multiprocessor_count() { - return CudaInternal::singleton().m_multiProcCount; -} - -CudaSpace::size_type cuda_internal_maximum_concurrent_block_count() { -#if defined(KOKKOS_ARCH_KEPLER) - // Compute capability 3.0 through 3.7 - enum : int { max_resident_blocks_per_multiprocessor = 16 }; -#else - // Compute capability 5.0 through 6.2 - enum : int { max_resident_blocks_per_multiprocessor = 32 }; -#endif - return CudaInternal::singleton().m_multiProcCount * - max_resident_blocks_per_multiprocessor; -}; - -Cuda::size_type cuda_internal_maximum_warp_count() { - return CudaInternal::singleton().m_maxWarpCount; -} - -std::array cuda_internal_maximum_grid_count() { - return CudaInternal::singleton().m_maxBlock; -} - Cuda::size_type *cuda_internal_scratch_space(const Cuda &instance, const std::size_t size) { return instance.impl_internal_space_instance()->scratch_space(size); @@ -670,10 +520,6 @@ Cuda::size_type *cuda_internal_scratch_unified(const Cuda &instance, namespace Kokkos { -Cuda::size_type Cuda::detect_device_count() { - return Impl::CudaInternalDevices::singleton().m_cudaDevCount; -} - #ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 int Cuda::concurrency() { #else @@ -687,25 +533,23 @@ int Cuda::impl_is_initialized() { } void Cuda::impl_initialize(InitializationSettings const &settings) { - const int cuda_device_id = Impl::get_gpu(settings); - const auto &dev_info = Impl::CudaInternalDevices::singleton(); - - const struct cudaDeviceProp &cudaProp = dev_info.m_cudaProp[cuda_device_id]; + const std::vector &visible_devices = Impl::get_visible_devices(); + const int cuda_device_id = + Impl::get_gpu(settings).value_or(visible_devices[0]); - Impl::CudaInternal::m_cudaDev = cuda_device_id; + cudaDeviceProp cudaProp; + KOKKOS_IMPL_CUDA_SAFE_CALL( + cudaGetDeviceProperties(&cudaProp, cuda_device_id)); Impl::CudaInternal::m_deviceProp = cudaProp; - - Kokkos::Impl::cuda_device_synchronize( - "Kokkos::CudaInternal::initialize: Fence on space initialization"); + KOKKOS_IMPL_CUDA_SAFE_CALL(cudaSetDevice(cuda_device_id)); + KOKKOS_IMPL_CUDA_SAFE_CALL(cudaDeviceSynchronize()); // Query what compute capability architecture a kernel executes: - Impl::CudaInternal::m_cudaArch = Impl::cuda_kernel_arch(); + Impl::CudaInternal::m_cudaArch = Impl::cuda_kernel_arch(cuda_device_id); if (Impl::CudaInternal::m_cudaArch == 0) { - std::stringstream ss; - ss << "Kokkos::Cuda::initialize ERROR: likely mismatch of architecture\n"; - std::string msg = ss.str(); - Kokkos::abort(msg.c_str()); + Kokkos::abort( + "Kokkos::Cuda::initialize ERROR: likely mismatch of architecture\n"); } int compiled_major = Impl::CudaInternal::m_cudaArch / 100; @@ -761,77 +605,41 @@ Kokkos::Cuda::initialize WARNING: Cuda is allocating into UVMSpace by default } #endif - //---------------------------------- - // number of multiprocessors - Impl::CudaInternal::m_multiProcCount = cudaProp.multiProcessorCount; - - //---------------------------------- - // Maximum number of warps, - // at most one warp per thread in a warp for reduction. 
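The helpers deleted here (multiprocessor count, maximum warp count, maximum grid sizes) are not lost: the surrounding hunks re-derive each of them on demand from the per-device cudaDeviceProp. A sketch of the equivalent queries, with a hypothetical struct and function name and error checking elided:

```cpp
#include <cuda_runtime.h>
#include <algorithm>

struct DeviceLimits {
  int multiprocessors;
  int max_warps_per_block;
  int max_grid[3];
};

DeviceLimits query_limits(int device) {
  cudaDeviceProp prop;
  cudaGetDeviceProperties(&prop, device);
  constexpr int warp_size = 32;  // Impl::CudaTraits::WarpSize
  return DeviceLimits{
      prop.multiProcessorCount,
      // at most WarpSize warps, and at most one warp per thread of a warp
      std::min(prop.maxThreadsPerBlock / warp_size, warp_size),
      {prop.maxGridSize[0], prop.maxGridSize[1], prop.maxGridSize[2]}};
}
```

Replacing cached static members with on-demand queries is what makes these values correct per device rather than pinned to whichever device initialized the singleton.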
- Impl::CudaInternal::m_maxWarpCount = - cudaProp.maxThreadsPerBlock / Impl::CudaTraits::WarpSize; - - if (Impl::CudaTraits::WarpSize < Impl::CudaInternal::m_maxWarpCount) { - Impl::CudaInternal::m_maxWarpCount = Impl::CudaTraits::WarpSize; - } - - //---------------------------------- - // Maximum number of blocks: - - Impl::CudaInternal::m_maxBlock[0] = cudaProp.maxGridSize[0]; - Impl::CudaInternal::m_maxBlock[1] = cudaProp.maxGridSize[1]; - Impl::CudaInternal::m_maxBlock[2] = cudaProp.maxGridSize[2]; - - Impl::CudaInternal::m_shmemPerSM = cudaProp.sharedMemPerMultiprocessor; - Impl::CudaInternal::m_maxShmemPerBlock = cudaProp.sharedMemPerBlock; - Impl::CudaInternal::m_maxBlocksPerSM = - Impl::CudaInternal::m_cudaArch < 500 - ? 16 - : (Impl::CudaInternal::m_cudaArch < 750 - ? 32 - : (Impl::CudaInternal::m_cudaArch == 750 ? 16 : 32)); - Impl::CudaInternal::m_maxThreadsPerSM = cudaProp.maxThreadsPerMultiProcessor; - Impl::CudaInternal::m_maxThreadsPerBlock = cudaProp.maxThreadsPerBlock; - //---------------------------------- cudaStream_t singleton_stream; - KOKKOS_IMPL_CUDA_SAFE_CALL( - (Impl::CudaInternal::singleton().cuda_stream_create_wrapper( - &singleton_stream))); - - auto &cuda_singleton = Impl::CudaInternal::singleton(); - cuda_singleton.initialize(singleton_stream, /*manage*/ true); -} + KOKKOS_IMPL_CUDA_SAFE_CALL(cudaSetDevice(cuda_device_id)); + KOKKOS_IMPL_CUDA_SAFE_CALL(cudaStreamCreate(&singleton_stream)); -std::vector Cuda::detect_device_arch() { - const Impl::CudaInternalDevices &s = Impl::CudaInternalDevices::singleton(); - - std::vector output(s.m_cudaDevCount); - - for (int i = 0; i < s.m_cudaDevCount; ++i) { - output[i] = s.m_cudaProp[i].major * 100 + s.m_cudaProp[i].minor; - } + // Init the array for used for arbitrarily sized atomics + desul::Impl::init_lock_arrays(); // FIXME - return output; + Impl::CudaInternal::singleton().initialize(singleton_stream); } -Cuda::size_type Cuda::device_arch() { - const int dev_id = Impl::CudaInternal::singleton().m_cudaDev; +void Cuda::impl_finalize() { + (void)Impl::cuda_global_unique_token_locks(true); + desul::Impl::finalize_lock_arrays(); // FIXME - int dev_arch = 0; - - if (0 <= dev_id) { - const struct cudaDeviceProp &cudaProp = - Impl::CudaInternalDevices::singleton().m_cudaProp[dev_id]; - - dev_arch = cudaProp.major * 100 + cudaProp.minor; + for (const auto cuda_device : Kokkos::Impl::CudaInternal::cuda_devices) { + KOKKOS_IMPL_CUDA_SAFE_CALL(cudaSetDevice(cuda_device)); + KOKKOS_IMPL_CUDA_SAFE_CALL( + cudaFreeHost(Kokkos::Impl::CudaInternal::constantMemHostStagingPerDevice + [cuda_device])); + KOKKOS_IMPL_CUDA_SAFE_CALL(cudaEventDestroy( + Kokkos::Impl::CudaInternal::constantMemReusablePerDevice[cuda_device])); } - return dev_arch; -} + auto &deep_copy_space = Impl::cuda_get_deep_copy_space(/*initialize*/ false); + if (deep_copy_space) + deep_copy_space->impl_internal_space_instance()->finalize(); + KOKKOS_IMPL_CUDA_SAFE_CALL( + cudaStreamDestroy(Impl::cuda_get_deep_copy_stream())); -void Cuda::impl_finalize() { Impl::CudaInternal::singleton().finalize(); } + Impl::CudaInternal::singleton().finalize(); + KOKKOS_IMPL_CUDA_SAFE_CALL( + cudaStreamDestroy(Impl::CudaInternal::singleton().m_stream)); +} Cuda::Cuda() : m_space_instance(&Impl::CudaInternal::singleton(), @@ -845,13 +653,17 @@ KOKKOS_DEPRECATED Cuda::Cuda(cudaStream_t stream, bool manage_stream) manage_stream ? 
Impl::ManageStream::yes : Impl::ManageStream::no) {} Cuda::Cuda(cudaStream_t stream, Impl::ManageStream manage_stream) - : m_space_instance(new Impl::CudaInternal, [](Impl::CudaInternal *ptr) { - ptr->finalize(); - delete ptr; - }) { + : m_space_instance( + new Impl::CudaInternal, [manage_stream](Impl::CudaInternal *ptr) { + ptr->finalize(); + if (static_cast(manage_stream)) { + KOKKOS_IMPL_CUDA_SAFE_CALL(cudaStreamDestroy(ptr->m_stream)); + } + delete ptr; + }) { Impl::CudaInternal::singleton().verify_is_initialized( "Cuda instance constructor"); - m_space_instance->initialize(stream, static_cast(manage_stream)); + m_space_instance->initialize(stream); } void Cuda::print_configuration(std::ostream &os, bool /*verbose*/) const { diff --git a/core/src/Cuda/Kokkos_Cuda_Instance.hpp b/core/src/Cuda/Kokkos_Cuda_Instance.hpp index a324adecfeb..24f4af31019 100644 --- a/core/src/Cuda/Kokkos_Cuda_Instance.hpp +++ b/core/src/Cuda/Kokkos_Cuda_Instance.hpp @@ -22,6 +22,10 @@ #include #include #include +#include "Kokkos_CudaSpace.hpp" + +#include +#include //---------------------------------------------------------------------------- //---------------------------------------------------------------------------- @@ -55,27 +59,10 @@ struct CudaTraits { unsigned long[ConstantMemoryUsage / sizeof(unsigned long)]; static constexpr int ConstantMemoryUseThreshold = 0x000200 /* 512 bytes */; - - KOKKOS_INLINE_FUNCTION static CudaSpace::size_type warp_count( - CudaSpace::size_type i) { - return (i + WarpIndexMask) >> WarpIndexShift; - } - - KOKKOS_INLINE_FUNCTION static CudaSpace::size_type warp_align( - CudaSpace::size_type i) { - constexpr CudaSpace::size_type Mask = ~WarpIndexMask; - return (i + WarpIndexMask) & Mask; - } }; //---------------------------------------------------------------------------- -CudaSpace::size_type cuda_internal_multiprocessor_count(); -CudaSpace::size_type cuda_internal_maximum_warp_count(); -std::array cuda_internal_maximum_grid_count(); - -CudaSpace::size_type cuda_internal_maximum_concurrent_block_count(); - CudaSpace::size_type* cuda_internal_scratch_flags(const Cuda&, const std::size_t size); CudaSpace::size_type* cuda_internal_scratch_space(const Cuda&, @@ -101,18 +88,10 @@ class CudaInternal { public: using size_type = Cuda::size_type; - inline static int m_cudaDev = -1; + int m_cudaDev = -1; // Device Properties - inline static int m_cudaArch = -1; - inline static unsigned m_multiProcCount = 0; - inline static unsigned m_maxWarpCount = 0; - inline static std::array m_maxBlock = {0, 0, 0}; - inline static int m_shmemPerSM = 0; - inline static int m_maxShmemPerBlock = 0; - inline static int m_maxBlocksPerSM = 0; - inline static int m_maxThreadsPerSM = 0; - inline static int m_maxThreadsPerBlock = 0; + inline static int m_cudaArch = -1; static int concurrency(); inline static cudaDeviceProp m_deviceProp; @@ -129,7 +108,6 @@ class CudaInternal { mutable size_type* m_scratchFunctor; cudaStream_t m_stream; uint32_t m_instance_id; - bool m_manage_stream; // Team Scratch Level 1 Space int m_n_team_scratch = 10; @@ -142,11 +120,11 @@ class CudaInternal { bool was_initialized = false; bool was_finalized = false; - // FIXME_CUDA: these want to be per-device, not per-stream... 
use of 'static' - // here will break once there are multiple devices though - inline static unsigned long* constantMemHostStaging = nullptr; - inline static cudaEvent_t constantMemReusable = nullptr; - inline static std::mutex constantMemMutex; + inline static std::set cuda_devices = {}; + inline static std::map constantMemHostStagingPerDevice = + {}; + inline static std::map constantMemReusablePerDevice = {}; + inline static std::map constantMemMutexPerDevice = {}; static CudaInternal& singleton(); @@ -156,7 +134,7 @@ class CudaInternal { return nullptr != m_scratchSpace && nullptr != m_scratchFlags; } - void initialize(cudaStream_t stream, bool manage_stream); + void initialize(cudaStream_t stream); void finalize(); void print_configuration(std::ostream&) const; @@ -247,12 +225,6 @@ class CudaInternal { return cudaDeviceSetLimit(limit, value); } - template - cudaError_t cuda_device_synchronize_wrapper() const { - if constexpr (setCudaDevice) set_cuda_device(); - return cudaDeviceSynchronize(); - } - template cudaError_t cuda_event_create_wrapper(cudaEvent_t* event) const { if constexpr (setCudaDevice) set_cuda_device(); @@ -290,37 +262,6 @@ class CudaInternal { return cudaFreeHost(ptr); } - template - cudaError_t cuda_get_device_count_wrapper(int* count) const { - if constexpr (setCudaDevice) set_cuda_device(); - return cudaGetDeviceCount(count); - } - - template - cudaError_t cuda_get_device_properties_wrapper(cudaDeviceProp* prop, - int device) const { - if constexpr (setCudaDevice) set_cuda_device(); - return cudaGetDeviceProperties(prop, device); - } - - template - const char* cuda_get_error_name_wrapper(cudaError_t error) const { - if constexpr (setCudaDevice) set_cuda_device(); - return cudaGetErrorName(error); - } - - template - const char* cuda_get_error_string_wrapper(cudaError_t error) const { - if constexpr (setCudaDevice) set_cuda_device(); - return cudaGetErrorString(error); - } - - template - cudaError_t cuda_get_last_error_wrapper() const { - if constexpr (setCudaDevice) set_cuda_device(); - return cudaGetLastError(); - } - template cudaError_t cuda_graph_add_dependencies_wrapper( cudaGraph_t graph, const cudaGraphNode_t* from, const cudaGraphNode_t* to, @@ -506,10 +447,10 @@ class CudaInternal { } template - cudaError_t cuda_func_set_attributes_wrapper(T* entry, cudaFuncAttribute attr, - int value) const { + cudaError_t cuda_func_set_attribute_wrapper(T* entry, cudaFuncAttribute attr, + int value) const { if constexpr (setCudaDevice) set_cuda_device(); - return cudaFuncSetAttributes(entry, attr, value); + return cudaFuncSetAttribute(entry, attr, value); } template diff --git a/core/src/Cuda/Kokkos_Cuda_KernelLaunch.hpp b/core/src/Cuda/Kokkos_Cuda_KernelLaunch.hpp index 82a72b69021..b0dadb45f72 100644 --- a/core/src/Cuda/Kokkos_Cuda_KernelLaunch.hpp +++ b/core/src/Cuda/Kokkos_Cuda_KernelLaunch.hpp @@ -21,7 +21,6 @@ #ifdef KOKKOS_ENABLE_CUDA #include -#include #include #include #include @@ -118,42 +117,43 @@ inline bool is_empty_launch(dim3 const& grid, dim3 const& block) { } inline void check_shmem_request(CudaInternal const* cuda_instance, int shmem) { - if (cuda_instance->m_maxShmemPerBlock < shmem) { + int const maxShmemPerBlock = cuda_instance->m_deviceProp.sharedMemPerBlock; + if (maxShmemPerBlock < shmem) { Kokkos::Impl::throw_runtime_exception( - std::string("CudaParallelLaunch (or graph node creation) FAILED: shared" - " memory request is too large")); + "CudaParallelLaunch (or graph node creation) FAILED: shared memory " + "request is too large"); } } // These 
functions need to be templated on DriverType and LaunchBounds // so that the static bool is unique for each type combo // KernelFuncPtr does not necessarily contain that type information. -// FIXME_CUDA_MULTIPLE_DEVICES template const cudaFuncAttributes& get_cuda_kernel_func_attributes( - const KernelFuncPtr& func) { + int cuda_device, const KernelFuncPtr& func) { // Only call cudaFuncGetAttributes once for each unique kernel // by leveraging static variable initialization rules - auto wrap_get_attributes = [&]() -> cudaFuncAttributes { + static std::map func_attr; + if (func_attr.find(cuda_device) == func_attr.end()) { cudaFuncAttributes attr; - KOKKOS_IMPL_CUDA_SAFE_CALL( - (CudaInternal::singleton().cuda_func_get_attributes_wrapper(&attr, - func))); - return attr; - }; - static cudaFuncAttributes func_attr = wrap_get_attributes(); - return func_attr; + KOKKOS_IMPL_CUDA_SAFE_CALL(cudaSetDevice(cuda_device)); + KOKKOS_IMPL_CUDA_SAFE_CALL(cudaFuncGetAttributes(&attr, func)); + func_attr.emplace(cuda_device, attr); + } + return func_attr[cuda_device]; } template -inline void configure_shmem_preference(const KernelFuncPtr& func, +inline void configure_shmem_preference(const int cuda_device, + const KernelFuncPtr& func, const cudaDeviceProp& device_props, const size_t block_size, int& shmem, const size_t occupancy) { #ifndef KOKKOS_ARCH_KEPLER const auto& func_attr = - get_cuda_kernel_func_attributes(func); + get_cuda_kernel_func_attributes(cuda_device, + func); // Compute limits for number of blocks due to registers/SM const size_t regs_per_sm = device_props.regsPerMultiprocessor; @@ -222,7 +222,7 @@ inline void configure_shmem_preference(const KernelFuncPtr& func, // FIXME_CUDA_MULTIPLE_DEVICES auto set_cache_config = [&] { KOKKOS_IMPL_CUDA_SAFE_CALL( - (CudaInternal::singleton().cuda_func_set_attributes_wrapper( + (CudaInternal::singleton().cuda_func_set_attribute_wrapper( func, cudaFuncAttributePreferredSharedMemoryCarveout, carveout))); return carveout; }; @@ -387,8 +387,8 @@ struct CudaParallelLaunchKernelInvoker< driver.get_policy().impl_get_desired_occupancy().value(); size_t block_size = block.x * block.y * block.z; Impl::configure_shmem_preference( - base_t::get_kernel_func(), cuda_instance->m_deviceProp, block_size, - shmem, desired_occupancy); + cuda_instance->m_cudaDev, base_t::get_kernel_func(), + cuda_instance->m_deviceProp, block_size, shmem, desired_occupancy); } void const* args[] = {&driver}; @@ -487,8 +487,8 @@ struct CudaParallelLaunchKernelInvoker< driver.get_policy().impl_get_desired_occupancy().value(); size_t block_size = block.x * block.y * block.z; Impl::configure_shmem_preference( - base_t::get_kernel_func(), cuda_instance->m_deviceProp, block_size, - shmem, desired_occupancy); + cuda_instance->m_cudaDev, base_t::get_kernel_func(), + cuda_instance->m_deviceProp, block_size, shmem, desired_occupancy); } auto* driver_ptr = Impl::allocate_driver_storage_for_kernel(driver); @@ -576,13 +576,16 @@ struct CudaParallelLaunchKernelInvoker< static void invoke_kernel(DriverType const& driver, dim3 const& grid, dim3 const& block, int shmem, CudaInternal const* cuda_instance) { + int cuda_device = cuda_instance->m_cudaDev; // Wait until the previous kernel that uses the constant buffer is done - std::lock_guard lock(CudaInternal::constantMemMutex); + std::lock_guard lock( + CudaInternal::constantMemMutexPerDevice[cuda_device]); KOKKOS_IMPL_CUDA_SAFE_CALL((cuda_instance->cuda_event_synchronize_wrapper( - CudaInternal::constantMemReusable))); + 
CudaInternal::constantMemReusablePerDevice[cuda_device]))); // Copy functor (synchronously) to staging buffer in pinned host memory - unsigned long* staging = cuda_instance->constantMemHostStaging; + unsigned long* staging = + cuda_instance->constantMemHostStagingPerDevice[cuda_device]; memcpy(staging, &driver, sizeof(DriverType)); // Copy functor asynchronously from there to constant memory on the device @@ -597,7 +600,7 @@ struct CudaParallelLaunchKernelInvoker< // Record an event that says when the constant buffer can be reused KOKKOS_IMPL_CUDA_SAFE_CALL((cuda_instance->cuda_event_record_wrapper( - CudaInternal::constantMemReusable))); + CudaInternal::constantMemReusablePerDevice[cuda_device]))); } inline static void create_parallel_launch_graph_node( @@ -665,8 +668,8 @@ struct CudaParallelLaunchImpl< Impl::configure_shmem_preference< DriverType, Kokkos::LaunchBounds>( - base_t::get_kernel_func(), cuda_instance->m_deviceProp, block_size, - shmem, desired_occupancy); + cuda_instance->m_cudaDev, base_t::get_kernel_func(), + cuda_instance->m_deviceProp, block_size, shmem, desired_occupancy); } desul::ensure_cuda_lock_arrays_on_device(); @@ -675,18 +678,17 @@ struct CudaParallelLaunchImpl< base_t::invoke_kernel(driver, grid, block, shmem, cuda_instance); #if defined(KOKKOS_ENABLE_DEBUG_BOUNDS_CHECK) - KOKKOS_IMPL_CUDA_SAFE_CALL( - (cuda_instance->cuda_get_last_error_wrapper())); + KOKKOS_IMPL_CUDA_SAFE_CALL(cudaGetLastError()); cuda_instance->fence( "Kokkos::Impl::launch_kernel: Debug Only Check for Execution Error"); #endif } } - static cudaFuncAttributes get_cuda_func_attributes() { + static cudaFuncAttributes get_cuda_func_attributes(int cuda_device) { return get_cuda_kernel_func_attributes< DriverType, Kokkos::LaunchBounds>( - base_t::get_kernel_func()); + cuda_device, base_t::get_kernel_func()); } }; diff --git a/core/src/Cuda/Kokkos_Cuda_MDRangePolicy.hpp b/core/src/Cuda/Kokkos_Cuda_MDRangePolicy.hpp index 7492ab49e56..2c7eba7a18f 100644 --- a/core/src/Cuda/Kokkos_Cuda_MDRangePolicy.hpp +++ b/core/src/Cuda/Kokkos_Cuda_MDRangePolicy.hpp @@ -40,8 +40,8 @@ template <> inline TileSizeProperties get_tile_size_properties( const Kokkos::Cuda& space) { TileSizeProperties properties; - properties.max_threads = - space.impl_internal_space_instance()->m_maxThreadsPerSM; + properties.max_threads = space.impl_internal_space_instance() + ->m_deviceProp.maxThreadsPerMultiProcessor; properties.default_largest_tile_size = 16; properties.default_tile_size = 2; properties.max_total_tile_size = 512; diff --git a/core/src/Cuda/Kokkos_Cuda_Parallel_MDRange.hpp b/core/src/Cuda/Kokkos_Cuda_Parallel_MDRange.hpp index 49d6c112e37..63038984004 100644 --- a/core/src/Cuda/Kokkos_Cuda_Parallel_MDRange.hpp +++ b/core/src/Cuda/Kokkos_Cuda_Parallel_MDRange.hpp @@ -28,7 +28,6 @@ #include #include #include -#include #include #include @@ -42,8 +41,8 @@ namespace Impl { template int max_tile_size_product_helper(const Policy& pol, const LaunchBounds&) { cudaFuncAttributes attr = - CudaParallelLaunch::get_cuda_func_attributes(); + CudaParallelLaunch::get_cuda_func_attributes( + pol.space().cuda_device()); auto const& prop = pol.space().cuda_device_prop(); // Limits due to registers/SM, MDRange doesn't have @@ -96,7 +95,7 @@ class ParallelFor, Kokkos::Cuda> { inline void execute() const { if (m_rp.m_num_tiles == 0) return; - const auto maxblocks = cuda_internal_maximum_grid_count(); + const auto maxblocks = m_rp.space().cuda_device_prop().maxGridSize; if (RP::rank == 2) { const dim3 block(m_rp.m_tile[0], m_rp.m_tile[1], 1); 
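The constant-memory launch path above serializes access to a single __constant__ buffer using one pinned staging area, one mutex, and one event per device. A compressed sketch of that handoff protocol; the buffer size matches CudaTraits::ConstantMemoryUsage (32 KB), but the names, the inline launch comment, and the omission of KOKKOS_IMPL_CUDA_SAFE_CALL checking are all illustrative:

```cpp
#include <cuda_runtime.h>
#include <cstring>
#include <mutex>

// 32 KB of __constant__ memory holding the launched functor.
__constant__ unsigned long kernel_arg_buffer[4096];

void launch_via_constant_memory(int device, const void* functor, size_t bytes,
                                cudaStream_t stream, std::mutex& mutex,
                                unsigned long* pinned_staging,
                                cudaEvent_t reusable) {
  std::lock_guard<std::mutex> lock(mutex);  // one writer per device
  cudaSetDevice(device);
  cudaEventSynchronize(reusable);  // prior launch finished reading the symbol
  std::memcpy(pinned_staging, functor, bytes);  // host -> pinned staging
  cudaMemcpyToSymbolAsync(kernel_arg_buffer, pinned_staging, bytes, 0,
                          cudaMemcpyHostToDevice, stream);
  // ... <<<grid, block, shmem, stream>>> launch reading kernel_arg_buffer ...
  cudaEventRecord(reusable, stream);  // buffer is reusable after this point
}
```

Splitting constantMemHostStaging, constantMemReusable, and constantMemMutex into per-device maps is exactly what lets two devices run this protocol concurrently without contending on one global event.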
KOKKOS_ASSERT(block.x > 0); @@ -325,19 +324,18 @@ class ParallelReduce( f, n); using closure_type = Impl::ParallelReduce, Policy, Kokkos::Cuda>; - cudaFuncAttributes attr = - CudaParallelLaunch::get_cuda_func_attributes(); + cudaFuncAttributes attr = CudaParallelLaunch:: + get_cuda_func_attributes(m_policy.space().cuda_device()); while ( - (n && - (m_policy.space().impl_internal_space_instance()->m_maxShmemPerBlock < - shmem_size)) || + (n && (maxShmemPerBlock < shmem_size)) || (n > static_cast( Kokkos::Impl::cuda_get_max_block_size( diff --git a/core/src/Cuda/Kokkos_Cuda_Parallel_Range.hpp b/core/src/Cuda/Kokkos_Cuda_Parallel_Range.hpp index 34729992812..0f052be3c30 100644 --- a/core/src/Cuda/Kokkos_Cuda_Parallel_Range.hpp +++ b/core/src/Cuda/Kokkos_Cuda_Parallel_Range.hpp @@ -28,7 +28,6 @@ #include #include #include -#include #include #include @@ -86,18 +85,18 @@ class ParallelFor, Kokkos::Cuda> { const typename Policy::index_type nwork = m_policy.end() - m_policy.begin(); cudaFuncAttributes attr = - CudaParallelLaunch::get_cuda_func_attributes(); + CudaParallelLaunch::get_cuda_func_attributes( + m_policy.space().cuda_device()); const int block_size = Kokkos::Impl::cuda_get_opt_block_size( m_policy.space().impl_internal_space_instance(), attr, m_functor, 1, 0, 0); KOKKOS_ASSERT(block_size > 0); dim3 block(1, block_size, 1); + const int maxGridSizeX = m_policy.space().cuda_device_prop().maxGridSize[0]; dim3 grid( - std::min( - typename Policy::index_type((nwork + block.y - 1) / block.y), - typename Policy::index_type(cuda_internal_maximum_grid_count()[0])), + std::min(typename Policy::index_type((nwork + block.y - 1) / block.y), + typename Policy::index_type(maxGridSizeX)), 1, 1); #ifdef KOKKOS_IMPL_DEBUG_CUDA_SERIAL_EXECUTION if (Kokkos::Impl::CudaInternal::cuda_use_serial_execution()) { @@ -244,10 +243,10 @@ class ParallelReduce, if (CudaTraits::WarpSize < word_count.value) { __syncthreads(); } else if (word_count.value > 1) { - // Inside cuda_single_inter_block_reduce_scan() above, shared[i] below - // might have been updated by a single thread within a warp without - // synchronization afterwards. Synchronize threads within warp to avoid - // potential racecondition. + // Inside cuda_single_inter_block_reduce_scan() and final() above, + // shared[i] below might have been updated by a single thread within a + // warp without synchronization afterwards. Synchronize threads within + // warp to avoid potential race condition. 
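Several local_block_size() implementations touched in these hunks share one search loop: start from a candidate thread count and halve it until the inter-block reduce/scan's shared-memory footprint fits the device limit. A distilled sketch, with shmem_needed standing in for cuda_single_inter_block_reduce_scan_shmem and the simpler of the two loop conditions shown (the reduce variants also bound n by the maximum block size):

```cpp
// Returns the largest halving of n whose shared-memory requirement fits;
// 0 signals that no feasible block size exists.
unsigned pick_block_size(unsigned n, int max_shmem_per_block,
                         unsigned (*shmem_needed)(unsigned)) {
  while (n && max_shmem_per_block < static_cast<int>(shmem_needed(n))) {
    n >>= 1;  // halve the candidate block size and retry
  }
  return n;
}
```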
__syncwarp(0xffffffff); } @@ -260,19 +259,18 @@ class ParallelReduce, // Determine block size constrained by shared memory: inline unsigned local_block_size(const FunctorType& f) { unsigned n = CudaTraits::WarpSize * 8; + const int maxShmemPerBlock = + m_policy.space().cuda_device_prop().sharedMemPerBlock; int shmem_size = cuda_single_inter_block_reduce_scan_shmem( f, n); using closure_type = Impl::ParallelReduce, Policy, Kokkos::Cuda>; - cudaFuncAttributes attr = - CudaParallelLaunch::get_cuda_func_attributes(); + cudaFuncAttributes attr = CudaParallelLaunch:: + get_cuda_func_attributes(m_policy.space().cuda_device()); while ( - (n && - (m_policy.space().impl_internal_space_instance()->m_maxShmemPerBlock < - shmem_size)) || + (n && (maxShmemPerBlock < shmem_size)) || (n > static_cast( Kokkos::Impl::cuda_get_max_block_size( @@ -615,11 +613,11 @@ class ParallelScan, Kokkos::Cuda> { // 4 warps was 10% faster than 8 warps and 20% faster than 16 warps in unit // testing + const int maxShmemPerBlock = + m_policy.space().cuda_device_prop().sharedMemPerBlock; unsigned n = CudaTraits::WarpSize * 4; while (n && - unsigned(m_policy.space() - .impl_internal_space_instance() - ->m_maxShmemPerBlock) < + unsigned(maxShmemPerBlock) < cuda_single_inter_block_reduce_scan_shmem(f, n)) { n >>= 1; @@ -939,11 +937,11 @@ class ParallelScanWithTotal, // 4 warps was 10% faster than 8 warps and 20% faster than 16 warps in unit // testing + const int maxShmemPerBlock = + m_policy.space().cuda_device_prop().sharedMemPerBlock; unsigned n = CudaTraits::WarpSize * 4; while (n && - unsigned(m_policy.space() - .impl_internal_space_instance() - ->m_maxShmemPerBlock) < + unsigned(maxShmemPerBlock) < cuda_single_inter_block_reduce_scan_shmem(f, n)) { n >>= 1; diff --git a/core/src/Cuda/Kokkos_Cuda_Parallel_Team.hpp b/core/src/Cuda/Kokkos_Cuda_Parallel_Team.hpp index b4679b4e0da..9f7be45c839 100644 --- a/core/src/Cuda/Kokkos_Cuda_Parallel_Team.hpp +++ b/core/src/Cuda/Kokkos_Cuda_Parallel_Team.hpp @@ -32,7 +32,7 @@ #include #include #include -#include +#include #include #include @@ -98,7 +98,7 @@ class TeamPolicyInternal Impl::ParallelFor>; cudaFuncAttributes attr = CudaParallelLaunch:: - get_cuda_func_attributes(); + get_cuda_func_attributes(space().cuda_device()); int block_size = Kokkos::Impl::cuda_get_max_block_size( @@ -137,7 +137,7 @@ class TeamPolicyInternal Impl::ParallelFor>; cudaFuncAttributes attr = CudaParallelLaunch:: - get_cuda_func_attributes(); + get_cuda_func_attributes(space().cuda_device()); const int block_size = Kokkos::Impl::cuda_get_opt_block_size( @@ -262,7 +262,8 @@ class TeamPolicyInternal m_tune_team(bool(team_size_request <= 0)), m_tune_vector(bool(vector_length_request <= 0)) { // Make sure league size is permissible - if (league_size_ >= int(Impl::cuda_internal_maximum_grid_count()[0])) + const int maxGridSizeX = m_space.cuda_device_prop().maxGridSize[0]; + if (league_size_ >= maxGridSizeX) Impl::throw_runtime_exception( "Requested too large league_size for TeamPolicy on Cuda execution " "space."); @@ -369,7 +370,7 @@ class TeamPolicyInternal cudaFuncAttributes attr = CudaParallelLaunch:: - get_cuda_func_attributes(); + get_cuda_func_attributes(space().cuda_device()); const int block_size = std::forward(block_size_callable)( space().impl_internal_space_instance(), attr, f, (size_t)impl_vector_length(), @@ -539,8 +540,8 @@ class ParallelFor, auto internal_space_instance = m_policy.space().impl_internal_space_instance(); cudaFuncAttributes attr = - CudaParallelLaunch::get_cuda_func_attributes(); + 
CudaParallelLaunch::get_cuda_func_attributes( + internal_space_instance->m_cudaDev); m_team_size = m_team_size >= 0 ? m_team_size @@ -575,10 +576,11 @@ class ParallelFor, static_cast(m_league_size)))); } + const int maxShmemPerBlock = + m_policy.space().cuda_device_prop().sharedMemPerBlock; const int shmem_size_total = m_shmem_begin + m_shmem_size; - if (internal_space_instance->m_maxShmemPerBlock < shmem_size_total) { - printf("%i %i\n", internal_space_instance->m_maxShmemPerBlock, - shmem_size_total); + if (maxShmemPerBlock < shmem_size_total) { + printf("%i %i\n", maxShmemPerBlock, shmem_size_total); Kokkos::Impl::throw_runtime_exception(std::string( "Kokkos::Impl::ParallelFor< Cuda > insufficient shared memory")); } @@ -623,6 +625,22 @@ class ParallelReduce 4 bytes in size, indexing into shared/global memory relies + // on the block and grid dimensions to ensure that we index at the correct + // offset rather than at every 4 byte word; such that, when the join is + // performed, we have the correct data that was copied over in chunks of 4 + // bytes. + using word_size_type = std::conditional_t< + sizeof(value_type) < sizeof(Kokkos::Cuda::size_type), + std::conditional_t, + Kokkos::Cuda::size_type>; using size_type = Cuda::size_type; using reducer_type = ReducerType; @@ -646,9 +664,11 @@ class ParallelReduce + const integral_nonzero_constant word_count(m_functor_reducer.get_reducer().value_size() / - sizeof(size_type)); + sizeof(word_size_type)); reference_type value = m_functor_reducer.get_reducer().init( - kokkos_impl_cuda_shared_memory() + + kokkos_impl_cuda_shared_memory() + threadIdx.y * word_count.value); // Iterate this block through the league @@ -721,18 +742,19 @@ class ParallelReduce( m_functor_reducer.get_reducer(), blockIdx.x, gridDim.x, - kokkos_impl_cuda_shared_memory(), m_scratch_space, + kokkos_impl_cuda_shared_memory(), m_scratch_space, m_scratch_flags); if (do_final_reduction) { // This is the final block with the final result at the final threads' // location - size_type* const shared = kokkos_impl_cuda_shared_memory() + - (blockDim.y - 1) * word_count.value; + word_size_type* const shared = + kokkos_impl_cuda_shared_memory() + + (blockDim.y - 1) * word_count.value; size_type* const global = m_result_ptr_device_accessible - ? reinterpret_cast(m_result_ptr) + ? reinterpret_cast(m_result_ptr) : (m_unified_space ? 
m_unified_space : m_scratch_space); if (threadIdx.y == 0) { @@ -787,7 +809,8 @@ class ParallelReduce(m_scratch_space), result, + m_scratch_flags, blockDim.y)) { const unsigned id = threadIdx.y * blockDim.x + threadIdx.x; if (id == 0) { m_functor_reducer.get_reducer().final(&value); @@ -808,13 +831,15 @@ class ParallelReduce(cuda_internal_scratch_space( + m_policy.space(), + m_functor_reducer.get_reducer().value_size() * block_count)); m_scratch_flags = cuda_internal_scratch_flags(m_policy.space(), sizeof(size_type)); - m_unified_space = cuda_internal_scratch_unified( - m_policy.space(), m_functor_reducer.get_reducer().value_size()); + m_unified_space = + reinterpret_cast(cuda_internal_scratch_unified( + m_policy.space(), m_functor_reducer.get_reducer().value_size())); dim3 block(m_vector_size, m_team_size, 1); dim3 grid(block_count, 1, 1); @@ -847,7 +872,8 @@ class ParallelReduce(m_result_ptr, m_scratch_space, size); + DeepCopy(m_policy.space(), m_result_ptr, + m_scratch_space, size); } } } @@ -883,9 +909,8 @@ class ParallelReduce::get_cuda_func_attributes(); + cudaFuncAttributes attr = CudaParallelLaunch:: + get_cuda_func_attributes(internal_space_instance->m_cudaDev); m_team_size = m_team_size >= 0 ? m_team_size @@ -940,6 +965,8 @@ class ParallelReduce bad team size")); } - if (internal_space_instance->m_maxShmemPerBlock < shmem_size_total) { + if (maxShmemPerBlock < shmem_size_total) { Kokkos::Impl::throw_runtime_exception( std::string("Kokkos::Impl::ParallelReduce< Cuda > requested too much " "L0 scratch memory")); diff --git a/core/src/Cuda/Kokkos_Cuda_ReduceScan.hpp b/core/src/Cuda/Kokkos_Cuda_ReduceScan.hpp index 7ccedbfe28d..3037c4ab541 100644 --- a/core/src/Cuda/Kokkos_Cuda_ReduceScan.hpp +++ b/core/src/Cuda/Kokkos_Cuda_ReduceScan.hpp @@ -103,7 +103,7 @@ template __device__ bool cuda_inter_block_reduction( typename FunctorType::reference_type value, typename FunctorType::reference_type neutral, const FunctorType& reducer, - Cuda::size_type* const m_scratch_space, + typename FunctorType::pointer_type const m_scratch_space, typename FunctorType::pointer_type const /*result*/, Cuda::size_type* const m_scratch_flags, const int max_active_thread = blockDim.y) { @@ -117,7 +117,7 @@ __device__ bool cuda_inter_block_reduction( // One thread in the block writes block result to global scratch_memory if (id == 0) { - pointer_type global = ((pointer_type)m_scratch_space) + blockIdx.x; + pointer_type global = m_scratch_space + blockIdx.x; *global = value; } @@ -140,7 +140,7 @@ __device__ bool cuda_inter_block_reduction( last_block = true; value = neutral; - pointer_type const volatile global = (pointer_type)m_scratch_space; + pointer_type const volatile global = m_scratch_space; // Reduce all global values with splitting work over threads in one warp const int step_size = @@ -702,8 +702,7 @@ inline void check_reduced_view_shmem_size(const Policy& policy, unsigned reqShmemSize = cuda_single_inter_block_reduce_scan_shmem( functor, minBlockSize); - size_t maxShmemPerBlock = - policy.space().impl_internal_space_instance()->m_maxShmemPerBlock; + size_t maxShmemPerBlock = policy.space().cuda_device_prop().sharedMemPerBlock; if (reqShmemSize > maxShmemPerBlock) { Kokkos::Impl::throw_runtime_exception( diff --git a/core/src/Cuda/Kokkos_Cuda_Task.hpp b/core/src/Cuda/Kokkos_Cuda_Task.hpp index baff7ef3f55..86d6d91bbee 100644 --- a/core/src/Cuda/Kokkos_Cuda_Task.hpp +++ b/core/src/Cuda/Kokkos_Cuda_Task.hpp @@ -84,8 +84,8 @@ class TaskQueueSpecialization> { KOKKOS_INLINE_FUNCTION static void 
iff_single_thread_recursive_execute(scheduler_type const&) {} - static int get_max_team_count(execution_space const&) { - return Kokkos::Impl::cuda_internal_multiprocessor_count() * warps_per_block; + static int get_max_team_count(execution_space const& space) { + return space.cuda_device_prop().multiProcessorCount * warps_per_block; } __device__ static void driver(scheduler_type scheduler, @@ -225,7 +225,11 @@ class TaskQueueSpecialization> { // FIXME_CUDA_MULTIPLE_DEVICES static void execute(scheduler_type const& scheduler) { const int shared_per_warp = 2048; - const dim3 grid(Kokkos::Impl::cuda_internal_multiprocessor_count(), 1, 1); + const Kokkos::Cuda& exec = scheduler.get_execution_space(); + const auto& impl_instance = exec.impl_internal_space_instance(); + const int multi_processor_count = + exec.cuda_device_prop().multiProcessorCount; + const dim3 grid(multi_processor_count, 1, 1); const dim3 block(1, Kokkos::Impl::CudaTraits::WarpSize, warps_per_block); const int shared_total = shared_per_warp * warps_per_block; const cudaStream_t stream = nullptr; @@ -245,34 +249,30 @@ class TaskQueueSpecialization> { // Query the stack size, in bytes: size_t previous_stack_size = 0; - KOKKOS_IMPL_CUDA_SAFE_CALL( - (CudaInternal::singleton().cuda_device_get_limit_wrapper( - &previous_stack_size, cudaLimitStackSize))); + KOKKOS_IMPL_CUDA_SAFE_CALL(impl_instance->cuda_device_get_limit_wrapper( + &previous_stack_size, cudaLimitStackSize)); // If not large enough then set the stack size, in bytes: const size_t larger_stack_size = 1 << 11; if (previous_stack_size < larger_stack_size) { - KOKKOS_IMPL_CUDA_SAFE_CALL( - (CudaInternal::singleton().cuda_device_set_limit_wrapper( - cudaLimitStackSize, larger_stack_size))); + KOKKOS_IMPL_CUDA_SAFE_CALL(impl_instance->cuda_device_set_limit_wrapper( + cudaLimitStackSize, larger_stack_size)); } cuda_task_queue_execute<<>>( scheduler, shared_per_warp); - KOKKOS_IMPL_CUDA_SAFE_CALL( - (CudaInternal::singleton().cuda_get_last_error_wrapper())); + KOKKOS_IMPL_CUDA_SAFE_CALL(cudaGetLastError()); Impl::cuda_device_synchronize( "Kokkos::Impl::TaskQueueSpecialization::execute: Post Task Execution"); if (previous_stack_size < larger_stack_size) { - KOKKOS_IMPL_CUDA_SAFE_CALL( - (CudaInternal::singleton().cuda_device_set_limit_wrapper( - cudaLimitStackSize, previous_stack_size))); + KOKKOS_IMPL_CUDA_SAFE_CALL(impl_instance->cuda_device_set_limit_wrapper( + cudaLimitStackSize, previous_stack_size)); } } @@ -300,8 +300,8 @@ class TaskQueueSpecialization> { set_cuda_task_base_apply_function_pointer <<<1, 1>>>(ptr_ptr, dtor_ptr); - KOKKOS_IMPL_CUDA_SAFE_CALL( - (CudaInternal::singleton().cuda_get_last_error_wrapper())); + KOKKOS_IMPL_CUDA_SAFE_CALL(cudaGetLastError()); + Impl::cuda_device_synchronize( "Kokkos::Impl::TaskQueueSpecialization::execute: Post Get Function Pointer for Tasks"); @@ -466,7 +466,13 @@ class TaskQueueSpecializationConstrained< static void execute(scheduler_type const& scheduler) { const int shared_per_warp = 2048; const int warps_per_block = 4; - const dim3 grid(Kokkos::Impl::cuda_internal_multiprocessor_count(), 1, 1); + const Kokkos::Cuda exec = Cuda(); // FIXME_CUDA_MULTIPLE_DEVICES + const auto& impl_instance = exec.impl_internal_space_instance(); + const int multi_processor_count = + // FIXME not sure why this didn't work + // exec.cuda_device_prop().multiProcessorCount; + impl_instance->m_deviceProp.multiProcessorCount; + const dim3 grid(multi_processor_count, 1, 1); // const dim3 grid( 1 , 1 , 1 ); const dim3 block(1, 
Kokkos::Impl::CudaTraits::WarpSize, warps_per_block); const int shared_total = shared_per_warp * warps_per_block; @@ -482,34 +488,30 @@ class TaskQueueSpecializationConstrained< // Query the stack size, in bytes: size_t previous_stack_size = 0; - KOKKOS_IMPL_CUDA_SAFE_CALL( - (CudaInternal::singleton().cuda_device_get_limit_wrapper( - &previous_stack_size, cudaLimitStackSize))); + KOKKOS_IMPL_CUDA_SAFE_CALL(impl_instance->cuda_device_get_limit_wrapper( + &previous_stack_size, cudaLimitStackSize)); // If not large enough then set the stack size, in bytes: const size_t larger_stack_size = 2048; if (previous_stack_size < larger_stack_size) { - KOKKOS_IMPL_CUDA_SAFE_CALL( - (CudaInternal::singleton().cuda_device_set_limit_wrapper( - cudaLimitStackSize, larger_stack_size))); + KOKKOS_IMPL_CUDA_SAFE_CALL(impl_instance->cuda_device_set_limit_wrapper( + cudaLimitStackSize, larger_stack_size)); } cuda_task_queue_execute<<>>( scheduler, shared_per_warp); - KOKKOS_IMPL_CUDA_SAFE_CALL( - (CudaInternal::singleton().cuda_get_last_error_wrapper())); + KOKKOS_IMPL_CUDA_SAFE_CALL(cudaGetLastError()); Impl::cuda_device_synchronize( "Kokkos::Impl::TaskQueueSpecializationConstrained::execute: Post Execute Task"); if (previous_stack_size < larger_stack_size) { - KOKKOS_IMPL_CUDA_SAFE_CALL( - (CudaInternal::singleton().cuda_device_set_limit_wrapper( - cudaLimitStackSize, previous_stack_size))); + KOKKOS_IMPL_CUDA_SAFE_CALL(impl_instance->cuda_device_set_limit_wrapper( + cudaLimitStackSize, previous_stack_size)); } } @@ -532,8 +534,7 @@ class TaskQueueSpecializationConstrained< set_cuda_task_base_apply_function_pointer <<<1, 1>>>(ptr_ptr, dtor_ptr); - KOKKOS_IMPL_CUDA_SAFE_CALL( - (CudaInternal::singleton().cuda_get_last_error_wrapper())); + KOKKOS_IMPL_CUDA_SAFE_CALL(cudaGetLastError()); Impl::cuda_device_synchronize( "Kokkos::Impl::TaskQueueSpecializationConstrained::get_function_pointer: Post Get Function Pointer"); diff --git a/core/src/Cuda/Kokkos_Cuda_UniqueToken.hpp b/core/src/Cuda/Kokkos_Cuda_UniqueToken.hpp index abb747e39a1..94a428493f4 100644 --- a/core/src/Cuda/Kokkos_Cuda_UniqueToken.hpp +++ b/core/src/Cuda/Kokkos_Cuda_UniqueToken.hpp @@ -22,7 +22,6 @@ #include #include -#include namespace Kokkos { diff --git a/core/src/Cuda/Kokkos_Cuda_WorkGraphPolicy.hpp b/core/src/Cuda/Kokkos_Cuda_WorkGraphPolicy.hpp index a945a716bc3..c7ea6988a5d 100644 --- a/core/src/Cuda/Kokkos_Cuda_WorkGraphPolicy.hpp +++ b/core/src/Cuda/Kokkos_Cuda_WorkGraphPolicy.hpp @@ -77,7 +77,9 @@ class ParallelFor, inline void execute() { const int warps_per_block = 4; - const dim3 grid(Kokkos::Impl::cuda_internal_multiprocessor_count(), 1, 1); + const int multi_processor_count = + m_policy.space().cuda_device_prop().multiProcessorCount; + const dim3 grid(multi_processor_count, 1, 1); const dim3 block(1, Kokkos::Impl::CudaTraits::WarpSize, warps_per_block); const int shared = 0; diff --git a/core/src/Cuda/Kokkos_Cuda_ZeroMemset.hpp b/core/src/Cuda/Kokkos_Cuda_ZeroMemset.hpp index c7f0d12d914..517c592af72 100644 --- a/core/src/Cuda/Kokkos_Cuda_ZeroMemset.hpp +++ b/core/src/Cuda/Kokkos_Cuda_ZeroMemset.hpp @@ -25,23 +25,14 @@ namespace Impl { template struct ZeroMemset> { - ZeroMemset(const Kokkos::Cuda& exec_space_instance, const View& dst, - typename View::const_value_type&) { + ZeroMemset(const Kokkos::Cuda& exec_space_instance, + const View& dst) { KOKKOS_IMPL_CUDA_SAFE_CALL( (exec_space_instance.impl_internal_space_instance() ->cuda_memset_async_wrapper( dst.data(), 0, dst.size() * sizeof(typename View::value_type)))); } - - 
ZeroMemset(const View& dst, - typename View::const_value_type&) { - // FIXME_CUDA_MULTIPLE_DEVICES - KOKKOS_IMPL_CUDA_SAFE_CALL( - (Kokkos::Impl::CudaInternal::singleton().cuda_memset_wrapper( - dst.data(), 0, - dst.size() * sizeof(typename View::value_type)))); - } }; } // namespace Impl diff --git a/core/src/HIP/Kokkos_HIP.cpp b/core/src/HIP/Kokkos_HIP.cpp index f78bfd28b2f..309e07fb3fb 100644 --- a/core/src/HIP/Kokkos_HIP.cpp +++ b/core/src/HIP/Kokkos_HIP.cpp @@ -18,6 +18,7 @@ #define KOKKOS_IMPL_PUBLIC_INCLUDE #endif +#include #include #include @@ -41,7 +42,9 @@ int HIP::impl_is_initialized() { } void HIP::impl_initialize(InitializationSettings const& settings) { - const int hip_device_id = Impl::get_gpu(settings); + const std::vector& visible_devices = Impl::get_visible_devices(); + const int hip_device_id = + Impl::get_gpu(settings).value_or(visible_devices[0]); Impl::HIPInternal::m_hipDev = hip_device_id; KOKKOS_IMPL_HIP_SAFE_CALL( @@ -89,10 +92,23 @@ void HIP::impl_initialize(InitializationSettings const& settings) { hipStream_t singleton_stream; KOKKOS_IMPL_HIP_SAFE_CALL(hipStreamCreate(&singleton_stream)); - Impl::HIPInternal::singleton().initialize(singleton_stream, /*manage*/ true); + Impl::HIPInternal::singleton().initialize(singleton_stream); } -void HIP::impl_finalize() { Impl::HIPInternal::singleton().finalize(); } +void HIP::impl_finalize() { + (void)Impl::hip_global_unique_token_locks(true); + + desul::Impl::finalize_lock_arrays(); // FIXME + + KOKKOS_IMPL_HIP_SAFE_CALL( + hipEventDestroy(Impl::HIPInternal::constantMemReusable)); + KOKKOS_IMPL_HIP_SAFE_CALL( + hipHostFree(Impl::HIPInternal::constantMemHostStaging)); + + Impl::HIPInternal::singleton().finalize(); + KOKKOS_IMPL_HIP_SAFE_CALL( + hipStreamDestroy(Impl::HIPInternal::singleton().m_stream)); +} HIP::HIP() : m_space_instance(&Impl::HIPInternal::singleton(), @@ -102,13 +118,17 @@ HIP::HIP() } HIP::HIP(hipStream_t const stream, Impl::ManageStream manage_stream) - : m_space_instance(new Impl::HIPInternal, [](Impl::HIPInternal* ptr) { - ptr->finalize(); - delete ptr; - }) { + : m_space_instance( + new Impl::HIPInternal, [manage_stream](Impl::HIPInternal* ptr) { + ptr->finalize(); + if (static_cast(manage_stream)) { + KOKKOS_IMPL_HIP_SAFE_CALL(hipStreamDestroy(ptr->m_stream)); + } + delete ptr; + }) { Impl::HIPInternal::singleton().verify_is_initialized( "HIP instance constructor"); - m_space_instance->initialize(stream, static_cast(manage_stream)); + m_space_instance->initialize(stream); } KOKKOS_DEPRECATED HIP::HIP(hipStream_t const stream, bool manage_stream) diff --git a/core/src/HIP/Kokkos_HIP.hpp b/core/src/HIP/Kokkos_HIP.hpp index 61ed346b218..3a88e97ee3d 100644 --- a/core/src/HIP/Kokkos_HIP.hpp +++ b/core/src/HIP/Kokkos_HIP.hpp @@ -57,13 +57,15 @@ class HIP { //! \name Functions that all Kokkos devices must implement. //@{ - KOKKOS_INLINE_FUNCTION static int in_parallel() { +#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 + KOKKOS_DEPRECATED KOKKOS_INLINE_FUNCTION static int in_parallel() { #if defined(__HIP_DEVICE_COMPILE__) return true; #else return false; #endif } +#endif /** \brief Wait until all dispatched functors complete. 
* @@ -94,9 +96,13 @@ class HIP { static int impl_is_initialized(); - // static size_type device_arch(); - - static size_type detect_device_count(); +#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 + KOKKOS_DEPRECATED static size_type detect_device_count() { + int count; + KOKKOS_IMPL_HIP_SAFE_CALL(hipGetDeviceCount(&count)); + return count; + } +#endif #ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 static int concurrency(); diff --git a/core/src/HIP/Kokkos_HIP_GraphNodeKernel.hpp b/core/src/HIP/Kokkos_HIP_GraphNodeKernel.hpp index 576c53426bc..5f0df72df17 100644 --- a/core/src/HIP/Kokkos_HIP_GraphNodeKernel.hpp +++ b/core/src/HIP/Kokkos_HIP_GraphNodeKernel.hpp @@ -20,13 +20,11 @@ #include #include -#include #include #include #include -#include #include namespace Kokkos { @@ -43,7 +41,6 @@ class GraphNodeKernelImpl using base_t = typename PatternImplSpecializationFromTag::type; - using Record = Kokkos::Impl::SharedAllocationRecord; // TODO use the name and executionspace template @@ -60,7 +57,7 @@ class GraphNodeKernelImpl ~GraphNodeKernelImpl() { if (m_driver_storage) { - Record::decrement(Record::get_record(m_driver_storage)); + Kokkos::HIPSpace().deallocate(m_driver_storage, sizeof(base_t)); } } @@ -78,15 +75,9 @@ class GraphNodeKernelImpl Kokkos::ObservingRawPtr allocate_driver_memory_buffer() const { KOKKOS_EXPECTS(m_driver_storage == nullptr); - - auto* record = Record::allocate( - Kokkos::HIPSpace{}, "GraphNodeKernel global memory functor storage", - sizeof(base_t)); - - Record::increment(record); - m_driver_storage = reinterpret_cast(record->data()); + m_driver_storage = static_cast(Kokkos::HIPSpace().allocate( + "GraphNodeKernel global memory functor storage", sizeof(base_t))); KOKKOS_ENSURES(m_driver_storage != nullptr); - return m_driver_storage; } diff --git a/core/src/HIP/Kokkos_HIP_Instance.cpp b/core/src/HIP/Kokkos_HIP_Instance.cpp index 7f04eb721cb..22c0db047f6 100644 --- a/core/src/HIP/Kokkos_HIP_Instance.cpp +++ b/core/src/HIP/Kokkos_HIP_Instance.cpp @@ -27,6 +27,7 @@ #include #include #include +#include #include /*--------------------------------------------------------------------------*/ @@ -89,10 +90,14 @@ void HIPInternal::print_configuration(std::ostream &s) const { << '\n'; #endif - int hipDevCount; - KOKKOS_IMPL_HIP_SAFE_CALL(hipGetDeviceCount(&hipDevCount)); + s << "macro KOKKOS_ENABLE_ROCTHRUST : " +#if defined(KOKKOS_ENABLE_ROCTHRUST) + << "defined\n"; +#else + << "undefined\n"; +#endif - for (int i = 0; i < hipDevCount; ++i) { + for (int i : get_visible_devices()) { hipDeviceProp_t hipProp; KOKKOS_IMPL_HIP_SAFE_CALL(hipGetDeviceProperties(&hipProp, i)); std::string gpu_type = hipProp.integrated == 1 ? 
"APU" : "dGPU"; @@ -159,14 +164,13 @@ void HIPInternal::fence(const std::string &name) const { [&]() { KOKKOS_IMPL_HIP_SAFE_CALL(hipStreamSynchronize(m_stream)); }); } -void HIPInternal::initialize(hipStream_t stream, bool manage_stream) { +void HIPInternal::initialize(hipStream_t stream) { KOKKOS_EXPECTS(!is_initialized()); if (was_finalized) Kokkos::abort("Calling HIP::initialize after HIP::finalize is illegal\n"); - m_stream = stream; - m_manage_stream = manage_stream; + m_stream = stream; //---------------------------------- // Multiblock reduction uses scratch flags for counters @@ -192,20 +196,19 @@ void HIPInternal::initialize(hipStream_t stream, bool manage_stream) { Kokkos::HIP::size_type *HIPInternal::scratch_space(const std::size_t size) { if (verify_is_initialized("scratch_space") && m_scratchSpaceCount < scratch_count(size)) { - m_scratchSpaceCount = scratch_count(size); + Kokkos::HIPSpace mem_space; - using Record = Kokkos::Impl::SharedAllocationRecord; + if (m_scratchSpace) { + mem_space.deallocate(m_scratchSpace, + m_scratchSpaceCount * sizeScratchGrain); + } - if (m_scratchSpace) Record::decrement(Record::get_record(m_scratchSpace)); + m_scratchSpaceCount = scratch_count(size); std::size_t alloc_size = multiply_overflow_abort(m_scratchSpaceCount, sizeScratchGrain); - Record *const r = Record::allocate( - Kokkos::HIPSpace(), "Kokkos::InternalScratchSpace", alloc_size); - - Record::increment(r); - - m_scratchSpace = reinterpret_cast(r->data()); + m_scratchSpace = static_cast( + mem_space.allocate("Kokkos::InternalScratchSpace", alloc_size)); } return m_scratchSpace; @@ -214,21 +217,23 @@ Kokkos::HIP::size_type *HIPInternal::scratch_space(const std::size_t size) { Kokkos::HIP::size_type *HIPInternal::scratch_flags(const std::size_t size) { if (verify_is_initialized("scratch_flags") && m_scratchFlagsCount < scratch_count(size)) { - m_scratchFlagsCount = scratch_count(size); + Kokkos::HIPSpace mem_space; - using Record = Kokkos::Impl::SharedAllocationRecord; + if (m_scratchFlags) { + mem_space.deallocate(m_scratchFlags, + m_scratchFlagsCount * sizeScratchGrain); + } - if (m_scratchFlags) Record::decrement(Record::get_record(m_scratchFlags)); + m_scratchFlagsCount = scratch_count(size); std::size_t alloc_size = multiply_overflow_abort(m_scratchFlagsCount, sizeScratchGrain); - Record *const r = Record::allocate( - Kokkos::HIPSpace(), "Kokkos::InternalScratchFlags", alloc_size); - - Record::increment(r); - - m_scratchFlags = reinterpret_cast(r->data()); + m_scratchFlags = static_cast( + mem_space.allocate("Kokkos::InternalScratchFlags", alloc_size)); + // We only zero-initialize the allocation when we actually allocate. + // It's the responsibility of the features using scratch_flags, + // namely parallel_reduce and parallel_scan, to reset the used values to 0. 
KOKKOS_IMPL_HIP_SAFE_CALL(hipMemset(m_scratchFlags, 0, alloc_size)); } @@ -238,29 +243,20 @@ Kokkos::HIP::size_type *HIPInternal::scratch_flags(const std::size_t size) { Kokkos::HIP::size_type *HIPInternal::stage_functor_for_execution( void const *driver, std::size_t const size) const { if (verify_is_initialized("scratch_functor") && m_scratchFunctorSize < size) { - m_scratchFunctorSize = size; - - using Record = Kokkos::Impl::SharedAllocationRecord; - using RecordHost = - Kokkos::Impl::SharedAllocationRecord; + Kokkos::HIPSpace device_mem_space; + Kokkos::HIPHostPinnedSpace host_mem_space; if (m_scratchFunctor) { - Record::decrement(Record::get_record(m_scratchFunctor)); - RecordHost::decrement(RecordHost::get_record(m_scratchFunctorHost)); + device_mem_space.deallocate(m_scratchFunctor, m_scratchFunctorSize); + host_mem_space.deallocate(m_scratchFunctorHost, m_scratchFunctorSize); } - Record *const r = - Record::allocate(Kokkos::HIPSpace(), "Kokkos::InternalScratchFunctor", - m_scratchFunctorSize); - RecordHost *const r_host = RecordHost::allocate( - Kokkos::HIPHostPinnedSpace(), "Kokkos::InternalScratchFunctorHost", - m_scratchFunctorSize); - - Record::increment(r); - RecordHost::increment(r_host); + m_scratchFunctorSize = size; - m_scratchFunctor = reinterpret_cast(r->data()); - m_scratchFunctorHost = reinterpret_cast(r_host->data()); + m_scratchFunctor = static_cast(device_mem_space.allocate( + "Kokkos::InternalScratchFunctor", m_scratchFunctorSize)); + m_scratchFunctorHost = static_cast(host_mem_space.allocate( + "Kokkos::InternalScratchFunctorHost", m_scratchFunctorSize)); } // When using HSA_XNACK=1, it is necessary to copy the driver to the host to @@ -323,23 +319,18 @@ void HIPInternal::finalize() { this->fence("Kokkos::HIPInternal::finalize: fence on finalization"); was_finalized = true; - if (this == &singleton()) { - (void)Kokkos::Impl::hip_global_unique_token_locks(true); - desul::Impl::finalize_lock_arrays(); // FIXME - - KOKKOS_IMPL_HIP_SAFE_CALL(hipHostFree(constantMemHostStaging)); - KOKKOS_IMPL_HIP_SAFE_CALL(hipEventDestroy(constantMemReusable)); - } - if (nullptr != m_scratchSpace || nullptr != m_scratchFlags) { - using RecordHIP = Kokkos::Impl::SharedAllocationRecord; + Kokkos::HIPSpace device_mem_space; - RecordHIP::decrement(RecordHIP::get_record(m_scratchFlags)); - RecordHIP::decrement(RecordHIP::get_record(m_scratchSpace)); + device_mem_space.deallocate(m_scratchFlags, + m_scratchSpaceCount * sizeScratchGrain); + device_mem_space.deallocate(m_scratchSpace, + m_scratchFlagsCount * sizeScratchGrain); if (m_scratchFunctorSize > 0) { - RecordHIP::decrement(RecordHIP::get_record(m_scratchFunctor)); - RecordHIP::decrement(RecordHIP::get_record(m_scratchFunctorHost)); + device_mem_space.deallocate(m_scratchFunctor, m_scratchFunctorSize); + Kokkos::HIPHostPinnedSpace host_mem_space; + host_mem_space.deallocate(m_scratchFunctorHost, m_scratchFunctorSize); } } @@ -348,14 +339,10 @@ void HIPInternal::finalize() { Kokkos::kokkos_free(m_team_scratch_ptr[i]); } - if (m_manage_stream && m_stream != nullptr) - KOKKOS_IMPL_HIP_SAFE_CALL(hipStreamDestroy(m_stream)); - m_scratchSpaceCount = 0; m_scratchFlagsCount = 0; m_scratchSpace = nullptr; m_scratchFlags = nullptr; - m_stream = nullptr; for (int i = 0; i < m_n_team_scratch; ++i) { m_team_scratch_current_size[i] = 0; m_team_scratch_ptr[i] = nullptr; @@ -419,13 +406,3 @@ void Kokkos::Impl::create_HIP_instances(std::vector &instances) { instances[s] = HIP(stream, ManageStream::yes); } } - 
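// ---------------------------------------------------------------------------
// Aside: with SharedAllocationRecord gone, HIPInternal has to uphold two
// invariants by hand. (1) scratch_flags is zero-filled exactly once, at
// (re)allocation; parallel_reduce / parallel_scan must return it to the
// all-zero state after use. (2) Every raw HIPSpace::allocate() must be
// matched in finalize() by a deallocate() carrying that same buffer's byte
// count -- the flags buffer with the flags count, the space buffer with the
// space count; the two counters must not be mixed up. A sketch of the
// pairing, using only the allocate/deallocate calls shown in the hunk above
// (release_scratch is a hypothetical helper):
#include <Kokkos_Core.hpp>
#include <cstddef>

inline void release_scratch(void* flags, std::size_t flags_bytes, void* space,
                            std::size_t space_bytes) {
  Kokkos::HIPSpace mem;
  // Each buffer is freed with the byte count it was allocated with.
  if (flags) mem.deallocate(flags, flags_bytes);
  if (space) mem.deallocate(space, space_bytes);
}
// ---------------------------------------------------------------------------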
-//---------------------------------------------------------------------------- - -namespace Kokkos { -HIP::size_type HIP::detect_device_count() { - int hipDevCount; - KOKKOS_IMPL_HIP_SAFE_CALL(hipGetDeviceCount(&hipDevCount)); - return hipDevCount; -} -} // namespace Kokkos diff --git a/core/src/HIP/Kokkos_HIP_Instance.hpp b/core/src/HIP/Kokkos_HIP_Instance.hpp index 63ad66686bb..142008124af 100644 --- a/core/src/HIP/Kokkos_HIP_Instance.hpp +++ b/core/src/HIP/Kokkos_HIP_Instance.hpp @@ -98,7 +98,6 @@ class HIPInternal { uint32_t m_instance_id = Kokkos::Tools::Experimental::Impl::idForInstance( reinterpret_cast(this)); - bool m_manage_stream = false; // Team Scratch Level 1 Space int m_n_team_scratch = 10; @@ -124,7 +123,7 @@ class HIPInternal { return nullptr != m_scratchSpace && nullptr != m_scratchFlags; } - void initialize(hipStream_t stream, bool manage_stream); + void initialize(hipStream_t stream); void finalize(); void print_configuration(std::ostream &) const; diff --git a/core/src/HIP/Kokkos_HIP_ParallelFor_MDRange.hpp b/core/src/HIP/Kokkos_HIP_ParallelFor_MDRange.hpp new file mode 100644 index 00000000000..db07c360b5c --- /dev/null +++ b/core/src/HIP/Kokkos_HIP_ParallelFor_MDRange.hpp @@ -0,0 +1,173 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOS_HIP_PARALLEL_FOR_MDRANGE_HPP +#define KOKKOS_HIP_PARALLEL_FOR_MDRANGE_HPP + +#include + +#include +#include +#include +#include + +namespace Kokkos { +namespace Impl { + +// ParallelFor +template +class ParallelFor, HIP> { + public: + using Policy = Kokkos::MDRangePolicy; + using functor_type = FunctorType; + + private: + using array_index_type = typename Policy::array_index_type; + using index_type = typename Policy::index_type; + using LaunchBounds = typename Policy::launch_bounds; + + const FunctorType m_functor; + const Policy m_policy; + + public: + ParallelFor() = delete; + ParallelFor(ParallelFor const&) = default; + ParallelFor& operator=(ParallelFor const&) = delete; + + inline __device__ void operator()() const { + Kokkos::Impl::DeviceIterateTile(m_policy, + m_functor) + .exec_range(); + } + + inline void execute() const { + using ClosureType = ParallelFor; + if (m_policy.m_num_tiles == 0) return; + auto const maxblocks = hip_internal_maximum_grid_count(); + if (Policy::rank == 2) { + dim3 const block(m_policy.m_tile[0], m_policy.m_tile[1], 1); + dim3 const grid( + std::min( + (m_policy.m_upper[0] - m_policy.m_lower[0] + block.x - 1) / + block.x, + maxblocks[0]), + std::min( + (m_policy.m_upper[1] - m_policy.m_lower[1] + block.y - 1) / + block.y, + maxblocks[1]), + 1); + hip_parallel_launch( + *this, grid, block, 0, + m_policy.space().impl_internal_space_instance(), false); + } else if (Policy::rank == 3) { + dim3 const block(m_policy.m_tile[0], m_policy.m_tile[1], + m_policy.m_tile[2]); + dim3 const grid( + std::min( + (m_policy.m_upper[0] - m_policy.m_lower[0] + block.x - 1) / + block.x, + maxblocks[0]), + std::min( + (m_policy.m_upper[1] - m_policy.m_lower[1] + block.y - 1) / + block.y, + maxblocks[1]), + std::min( + 
(m_policy.m_upper[2] - m_policy.m_lower[2] + block.z - 1) / + block.z, + maxblocks[2])); + hip_parallel_launch( + *this, grid, block, 0, + m_policy.space().impl_internal_space_instance(), false); + } else if (Policy::rank == 4) { + // id0,id1 encoded within threadIdx.x; id2 to threadIdx.y; id3 to + // threadIdx.z + dim3 const block(m_policy.m_tile[0] * m_policy.m_tile[1], + m_policy.m_tile[2], m_policy.m_tile[3]); + dim3 const grid( + std::min( + m_policy.m_tile_end[0] * m_policy.m_tile_end[1], maxblocks[0]), + std::min( + (m_policy.m_upper[2] - m_policy.m_lower[2] + block.y - 1) / + block.y, + maxblocks[1]), + std::min( + (m_policy.m_upper[3] - m_policy.m_lower[3] + block.z - 1) / + block.z, + maxblocks[2])); + hip_parallel_launch( + *this, grid, block, 0, + m_policy.space().impl_internal_space_instance(), false); + } else if (Policy::rank == 5) { + // id0,id1 encoded within threadIdx.x; id2,id3 to threadIdx.y; id4 + // to threadIdx.z + dim3 const block(m_policy.m_tile[0] * m_policy.m_tile[1], + m_policy.m_tile[2] * m_policy.m_tile[3], + m_policy.m_tile[4]); + dim3 const grid( + std::min( + m_policy.m_tile_end[0] * m_policy.m_tile_end[1], maxblocks[0]), + std::min( + m_policy.m_tile_end[2] * m_policy.m_tile_end[3], maxblocks[1]), + std::min( + (m_policy.m_upper[4] - m_policy.m_lower[4] + block.z - 1) / + block.z, + maxblocks[2])); + hip_parallel_launch( + *this, grid, block, 0, + m_policy.space().impl_internal_space_instance(), false); + } else if (Policy::rank == 6) { + // id0,id1 encoded within threadIdx.x; id2,id3 to threadIdx.y; + // id4,id5 to threadIdx.z + dim3 const block(m_policy.m_tile[0] * m_policy.m_tile[1], + m_policy.m_tile[2] * m_policy.m_tile[3], + m_policy.m_tile[4] * m_policy.m_tile[5]); + dim3 const grid( + std::min( + m_policy.m_tile_end[0] * m_policy.m_tile_end[1], maxblocks[0]), + std::min( + m_policy.m_tile_end[2] * m_policy.m_tile_end[3], maxblocks[1]), + std::min( + m_policy.m_tile_end[4] * m_policy.m_tile_end[5], maxblocks[2])); + hip_parallel_launch( + *this, grid, block, 0, + m_policy.space().impl_internal_space_instance(), false); + } else { + Kokkos::abort("Kokkos::MDRange Error: Exceeded rank bounds with HIP\n"); + } + + } // end execute + + ParallelFor(FunctorType const& arg_functor, Policy const& arg_policy) + : m_functor(arg_functor), m_policy(arg_policy) {} + + template + static int max_tile_size_product(const Policy&, const Functor&) { + using closure_type = + ParallelFor, HIP>; + unsigned block_size = hip_get_max_blocksize(); + if (block_size == 0) + Kokkos::Impl::throw_runtime_exception( + std::string("Kokkos::Impl::ParallelFor< HIP > could not find a valid " + "tile size.")); + return block_size; + } +}; + +} // namespace Impl +} // namespace Kokkos + +#endif diff --git a/core/src/HIP/Kokkos_HIP_ParallelFor_Range.hpp b/core/src/HIP/Kokkos_HIP_ParallelFor_Range.hpp new file mode 100644 index 00000000000..9355c1c75fb --- /dev/null +++ b/core/src/HIP/Kokkos_HIP_ParallelFor_Range.hpp @@ -0,0 +1,100 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. 
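// ---------------------------------------------------------------------------
// Aside on the new Kokkos_HIP_ParallelFor_MDRange.hpp above: every
// rank-specific branch of execute() builds its grid from the same clamped
// ceiling division, grid_i = min(ceil(extent_i / tile_i), maxblocks[i]).
// Factored out as a sketch (clamped_ceil_div is a hypothetical helper, not
// part of the patch):
#include <algorithm>

inline unsigned clamped_ceil_div(long long extent, unsigned tile,
                                 unsigned max_blocks) {
  // ceil(extent / tile), then clamp to the device's maximum grid dimension.
  const unsigned blocks = static_cast<unsigned>((extent + tile - 1) / tile);
  return std::min(blocks, max_blocks);
}
// e.g. the rank-2 case:
//   dim3 grid(clamped_ceil_div(upper[0] - lower[0], block.x, maxblocks[0]),
//             clamped_ceil_div(upper[1] - lower[1], block.y, maxblocks[1]), 1);
// ---------------------------------------------------------------------------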
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOS_HIP_PARALLEL_FOR_RANGE_HPP +#define KOKKOS_HIP_PARALLEL_FOR_RANGE_HPP + +#include + +#include +#include + +namespace Kokkos { +namespace Impl { + +template +class ParallelFor, Kokkos::HIP> { + public: + using Policy = Kokkos::RangePolicy; + + private: + using Member = typename Policy::member_type; + using WorkTag = typename Policy::work_tag; + using LaunchBounds = typename Policy::launch_bounds; + + const FunctorType m_functor; + const Policy m_policy; + + template + inline __device__ std::enable_if_t::value> exec_range( + const Member i) const { + m_functor(i); + } + + template + inline __device__ std::enable_if_t::value> exec_range( + const Member i) const { + m_functor(TagType(), i); + } + + public: + using functor_type = FunctorType; + + ParallelFor() = delete; + ParallelFor(ParallelFor const&) = default; + ParallelFor& operator=(ParallelFor const&) = delete; + + inline __device__ void operator()() const { + const Member work_stride = blockDim.y * gridDim.x; + const Member work_end = m_policy.end(); + + for (Member iwork = + m_policy.begin() + threadIdx.y + blockDim.y * blockIdx.x; + iwork < work_end; + iwork = iwork < work_end - work_stride ? iwork + work_stride + : work_end) { + this->template exec_range(iwork); + } + } + + inline void execute() const { + const typename Policy::index_type nwork = m_policy.end() - m_policy.begin(); + + using DriverType = ParallelFor; + const int block_size = + Kokkos::Impl::hip_get_preferred_blocksize(); + const dim3 block(1, block_size, 1); + const dim3 grid( + typename Policy::index_type((nwork + block.y - 1) / block.y), 1, 1); + + if (block_size == 0) { + Kokkos::Impl::throw_runtime_exception( + std::string("Kokkos::Impl::ParallelFor< HIP > could not find a " + "valid execution configuration.")); + } + Kokkos::Impl::hip_parallel_launch( + *this, grid, block, 0, m_policy.space().impl_internal_space_instance(), + false); + } + + ParallelFor(const FunctorType& arg_functor, const Policy& arg_policy) + : m_functor(arg_functor), m_policy(arg_policy) {} +}; + +} // namespace Impl +} // namespace Kokkos + +#endif diff --git a/core/src/HIP/Kokkos_HIP_ParallelFor_Team.hpp b/core/src/HIP/Kokkos_HIP_ParallelFor_Team.hpp new file mode 100644 index 00000000000..bf0c2193383 --- /dev/null +++ b/core/src/HIP/Kokkos_HIP_ParallelFor_Team.hpp @@ -0,0 +1,177 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. 
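// ---------------------------------------------------------------------------
// Aside on Kokkos_HIP_ParallelFor_Range.hpp above: operator() is a classic
// grid-stride loop, except that the increment is written as
//   iwork = (iwork < work_end - work_stride) ? iwork + work_stride : work_end
// so the index steps directly to work_end instead of evaluating an
// iwork + work_stride that could wrap for unsigned index types. Host-side
// sketch of the same traversal (strided_visit is a hypothetical name):
#include <cstdint>

template <class F>
void strided_visit(std::uint32_t begin, std::uint32_t end, std::uint32_t first,
                   std::uint32_t stride, F&& f) {
  for (std::uint32_t i = begin + first; i < end;
       i = (i < end - stride) ? i + stride : end) {
    f(i);  // visit every stride-th index owned by this "thread"
  }
}
// ---------------------------------------------------------------------------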
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOS_HIP_PARALLEL_FOR_TEAM_HPP +#define KOKKOS_HIP_PARALLEL_FOR_TEAM_HPP + +#include + +#include +#include +#include +#include + +namespace Kokkos { +namespace Impl { + +template +class ParallelFor, HIP> { + public: + using Policy = TeamPolicy; + using functor_type = FunctorType; + using size_type = HIP::size_type; + + private: + using member_type = typename Policy::member_type; + using work_tag = typename Policy::work_tag; + using launch_bounds = typename Policy::launch_bounds; + + // Algorithmic constraints: blockDim.y is a power of two AND + // blockDim.y == blockDim.z == 1 shared memory utilization: + // + // [ team reduce space ] + // [ team shared space ] + + FunctorType const m_functor; + Policy const m_policy; + size_type const m_league_size; + int m_team_size; + size_type const m_vector_size; + int m_shmem_begin; + int m_shmem_size; + void* m_scratch_ptr[2]; + size_t m_scratch_size[2]; + int m_scratch_pool_id = -1; + int32_t* m_scratch_locks; + size_t m_num_scratch_locks; + + template + __device__ inline std::enable_if_t::value> exec_team( + const member_type& member) const { + m_functor(member); + } + + template + __device__ inline std::enable_if_t::value> exec_team( + const member_type& member) const { + m_functor(TagType(), member); + } + + public: + ParallelFor() = delete; + ParallelFor(ParallelFor const&) = default; + ParallelFor& operator=(ParallelFor const&) = delete; + + __device__ inline void operator()() const { + // Iterate this block through the league + int64_t threadid = 0; + if (m_scratch_size[1] > 0) { + threadid = hip_get_scratch_index(m_league_size, m_scratch_locks, + m_num_scratch_locks); + } + + int const int_league_size = static_cast(m_league_size); + for (int league_rank = blockIdx.x; league_rank < int_league_size; + league_rank += gridDim.x) { + this->template exec_team(typename Policy::member_type( + kokkos_impl_hip_shared_memory(), m_shmem_begin, m_shmem_size, + static_cast(static_cast(m_scratch_ptr[1]) + + ptrdiff_t(threadid / (blockDim.x * blockDim.y)) * + m_scratch_size[1]), + m_scratch_size[1], league_rank, m_league_size)); + } + if (m_scratch_size[1] > 0) { + hip_release_scratch_index(m_scratch_locks, threadid); + } + } + + inline void execute() const { + int64_t const shmem_size_total = m_shmem_begin + m_shmem_size; + dim3 const grid(static_cast(m_league_size), 1, 1); + dim3 const block(static_cast(m_vector_size), + static_cast(m_team_size), 1); + + using closure_type = + ParallelFor, HIP>; + Impl::hip_parallel_launch( + *this, grid, block, shmem_size_total, + m_policy.space().impl_internal_space_instance(), + true); // copy to device and execute + } + + ParallelFor(FunctorType const& arg_functor, Policy const& arg_policy) + : m_functor(arg_functor), + m_policy(arg_policy), + m_league_size(arg_policy.league_size()), + m_team_size(arg_policy.team_size()), + m_vector_size(arg_policy.impl_vector_length()) { + auto internal_space_instance = + m_policy.space().impl_internal_space_instance(); + m_team_size = m_team_size >= 0 ? 
m_team_size + : arg_policy.team_size_recommended( + arg_functor, ParallelForTag()); + + m_shmem_begin = (sizeof(double) * (m_team_size + 2)); + m_shmem_size = + (m_policy.scratch_size(0, m_team_size) + + FunctorTeamShmemSize::value(m_functor, m_team_size)); + m_scratch_size[0] = m_policy.scratch_size(0, m_team_size); + m_scratch_size[1] = m_policy.scratch_size(1, m_team_size); + m_scratch_locks = internal_space_instance->m_scratch_locks; + m_num_scratch_locks = internal_space_instance->m_num_scratch_locks; + + // Functor's reduce memory, team scan memory, and team shared memory depend + // upon team size. + m_scratch_ptr[0] = nullptr; + if (m_team_size <= 0) { + m_scratch_ptr[1] = nullptr; + } else { + m_scratch_pool_id = internal_space_instance->acquire_team_scratch_space(); + m_scratch_ptr[1] = internal_space_instance->resize_team_scratch_space( + m_scratch_pool_id, + static_cast(m_scratch_size[1]) * + (std::min( + static_cast(HIP().concurrency() / + (m_team_size * m_vector_size)), + static_cast(m_league_size)))); + } + + int const shmem_size_total = m_shmem_begin + m_shmem_size; + if (internal_space_instance->m_maxShmemPerBlock < shmem_size_total) { + Kokkos::Impl::throw_runtime_exception(std::string( + "Kokkos::Impl::ParallelFor< HIP > insufficient shared memory")); + } + + size_t max_size = arg_policy.team_size_max(arg_functor, ParallelForTag()); + if (static_cast(m_team_size) > static_cast(max_size)) { + Kokkos::Impl::throw_runtime_exception(std::string( + "Kokkos::Impl::ParallelFor< HIP > requested too large team size.")); + } + } + + ~ParallelFor() { + if (m_scratch_pool_id >= 0) { + m_policy.space() + .impl_internal_space_instance() + ->release_team_scratch_space(m_scratch_pool_id); + } + } +}; + +} // namespace Impl +} // namespace Kokkos + +#endif diff --git a/core/src/HIP/Kokkos_HIP_Parallel_MDRange.hpp b/core/src/HIP/Kokkos_HIP_ParallelReduce_MDRange.hpp similarity index 61% rename from core/src/HIP/Kokkos_HIP_Parallel_MDRange.hpp rename to core/src/HIP/Kokkos_HIP_ParallelReduce_MDRange.hpp index 0fa325cb12c..55b6218d1c8 100644 --- a/core/src/HIP/Kokkos_HIP_Parallel_MDRange.hpp +++ b/core/src/HIP/Kokkos_HIP_ParallelReduce_MDRange.hpp @@ -14,157 +14,19 @@ // //@HEADER -#ifndef KOKKOS_HIP_PARALLEL_MDRANGE_HPP -#define KOKKOS_HIP_PARALLEL_MDRANGE_HPP +#ifndef KOKKOS_HIP_PARALLEL_REDUCE_MDRANGE_HPP +#define KOKKOS_HIP_PARALLEL_REDUCE_MDRANGE_HPP + +#include #include #include #include #include #include -#include namespace Kokkos { namespace Impl { -// ParallelFor -template -class ParallelFor, HIP> { - public: - using Policy = Kokkos::MDRangePolicy; - using functor_type = FunctorType; - - private: - using array_index_type = typename Policy::array_index_type; - using index_type = typename Policy::index_type; - using LaunchBounds = typename Policy::launch_bounds; - - const FunctorType m_functor; - const Policy m_policy; - - public: - ParallelFor() = delete; - ParallelFor(ParallelFor const&) = default; - ParallelFor& operator=(ParallelFor const&) = delete; - - inline __device__ void operator()() const { - Kokkos::Impl::DeviceIterateTile(m_policy, - m_functor) - .exec_range(); - } - - inline void execute() const { - using ClosureType = ParallelFor; - if (m_policy.m_num_tiles == 0) return; - auto const maxblocks = hip_internal_maximum_grid_count(); - if (Policy::rank == 2) { - dim3 const block(m_policy.m_tile[0], m_policy.m_tile[1], 1); - dim3 const grid( - std::min( - (m_policy.m_upper[0] - m_policy.m_lower[0] + block.x - 1) / - block.x, - maxblocks[0]), - std::min( - 
(m_policy.m_upper[1] - m_policy.m_lower[1] + block.y - 1) / - block.y, - maxblocks[1]), - 1); - hip_parallel_launch( - *this, grid, block, 0, - m_policy.space().impl_internal_space_instance(), false); - } else if (Policy::rank == 3) { - dim3 const block(m_policy.m_tile[0], m_policy.m_tile[1], - m_policy.m_tile[2]); - dim3 const grid( - std::min( - (m_policy.m_upper[0] - m_policy.m_lower[0] + block.x - 1) / - block.x, - maxblocks[0]), - std::min( - (m_policy.m_upper[1] - m_policy.m_lower[1] + block.y - 1) / - block.y, - maxblocks[1]), - std::min( - (m_policy.m_upper[2] - m_policy.m_lower[2] + block.z - 1) / - block.z, - maxblocks[2])); - hip_parallel_launch( - *this, grid, block, 0, - m_policy.space().impl_internal_space_instance(), false); - } else if (Policy::rank == 4) { - // id0,id1 encoded within threadIdx.x; id2 to threadIdx.y; id3 to - // threadIdx.z - dim3 const block(m_policy.m_tile[0] * m_policy.m_tile[1], - m_policy.m_tile[2], m_policy.m_tile[3]); - dim3 const grid( - std::min( - m_policy.m_tile_end[0] * m_policy.m_tile_end[1], maxblocks[0]), - std::min( - (m_policy.m_upper[2] - m_policy.m_lower[2] + block.y - 1) / - block.y, - maxblocks[1]), - std::min( - (m_policy.m_upper[3] - m_policy.m_lower[3] + block.z - 1) / - block.z, - maxblocks[2])); - hip_parallel_launch( - *this, grid, block, 0, - m_policy.space().impl_internal_space_instance(), false); - } else if (Policy::rank == 5) { - // id0,id1 encoded within threadIdx.x; id2,id3 to threadIdx.y; id4 - // to threadIdx.z - dim3 const block(m_policy.m_tile[0] * m_policy.m_tile[1], - m_policy.m_tile[2] * m_policy.m_tile[3], - m_policy.m_tile[4]); - dim3 const grid( - std::min( - m_policy.m_tile_end[0] * m_policy.m_tile_end[1], maxblocks[0]), - std::min( - m_policy.m_tile_end[2] * m_policy.m_tile_end[3], maxblocks[1]), - std::min( - (m_policy.m_upper[4] - m_policy.m_lower[4] + block.z - 1) / - block.z, - maxblocks[2])); - hip_parallel_launch( - *this, grid, block, 0, - m_policy.space().impl_internal_space_instance(), false); - } else if (Policy::rank == 6) { - // id0,id1 encoded within threadIdx.x; id2,id3 to threadIdx.y; - // id4,id5 to threadIdx.z - dim3 const block(m_policy.m_tile[0] * m_policy.m_tile[1], - m_policy.m_tile[2] * m_policy.m_tile[3], - m_policy.m_tile[4] * m_policy.m_tile[5]); - dim3 const grid( - std::min( - m_policy.m_tile_end[0] * m_policy.m_tile_end[1], maxblocks[0]), - std::min( - m_policy.m_tile_end[2] * m_policy.m_tile_end[3], maxblocks[1]), - std::min( - m_policy.m_tile_end[4] * m_policy.m_tile_end[5], maxblocks[2])); - hip_parallel_launch( - *this, grid, block, 0, - m_policy.space().impl_internal_space_instance(), false); - } else { - Kokkos::abort("Kokkos::MDRange Error: Exceeded rank bounds with HIP\n"); - } - - } // end execute - - ParallelFor(FunctorType const& arg_functor, Policy const& arg_policy) - : m_functor(arg_functor), m_policy(arg_policy) {} - - template - static int max_tile_size_product(const Policy&, const Functor&) { - using closure_type = - ParallelFor, HIP>; - unsigned block_size = hip_get_max_blocksize(); - if (block_size == 0) - Kokkos::Impl::throw_runtime_exception( - std::string("Kokkos::Impl::ParallelFor< HIP > could not find a valid " - "tile size.")); - return block_size; - } -}; // ParallelReduce template diff --git a/core/src/HIP/Kokkos_HIP_ParallelReduce_Range.hpp b/core/src/HIP/Kokkos_HIP_ParallelReduce_Range.hpp new file mode 100644 index 00000000000..c8981866e8a --- /dev/null +++ b/core/src/HIP/Kokkos_HIP_ParallelReduce_Range.hpp @@ -0,0 +1,329 @@ +//@HEADER +// 
************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOS_HIP_PARALLEL_REDUCE_RANGE_HPP +#define KOKKOS_HIP_PARALLEL_REDUCE_RANGE_HPP + +#include + +#include +#include +#include +#include + +namespace Kokkos { +namespace Impl { + +template +class ParallelReduce, + Kokkos::HIP> { + public: + using Policy = Kokkos::RangePolicy; + using FunctorType = typename CombinedFunctorReducerType::functor_type; + using ReducerType = typename CombinedFunctorReducerType::reducer_type; + + private: + using WorkRange = typename Policy::WorkRange; + using WorkTag = typename Policy::work_tag; + using Member = typename Policy::member_type; + using LaunchBounds = typename Policy::launch_bounds; + + public: + using pointer_type = typename ReducerType::pointer_type; + using value_type = typename ReducerType::value_type; + using reference_type = typename ReducerType::reference_type; + using functor_type = FunctorType; + using reducer_type = ReducerType; + using size_type = Kokkos::HIP::size_type; + using index_type = typename Policy::index_type; + // Conditionally set word_size_type to int16_t or int8_t if value_type is + // smaller than int32_t (Kokkos::HIP::size_type) + // word_size_type is used to determine the word count, shared memory buffer + // size, and global memory buffer size before the scan is performed. + // Within the scan, the word count is recomputed based on word_size_type + // and when calculating indexes into the shared/global memory buffers for + // performing the scan, word_size_type is used again. + // For scalars > 4 bytes in size, indexing into shared/global memory relies + // on the block and grid dimensions to ensure that we index at the correct + // offset rather than at every 4 byte word; such that, when the join is + // performed, we have the correct data that was copied over in chunks of 4 + // bytes. 
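// ---------------------------------------------------------------------------
// Aside: the alias declared below can be exercised in isolation. A sketch of
// the same word-size selection, assuming size_type is a 32-bit unsigned and
// that the inner conditional picks int16_t for 2-byte scalars and int8_t
// otherwise (scan_word_t is a hypothetical stand-in):
#include <cstdint>
#include <type_traits>

template <class ValueType, class SizeType = std::uint32_t>
using scan_word_t =
    std::conditional_t<sizeof(ValueType) < sizeof(SizeType),
                       std::conditional_t<sizeof(ValueType) == 2, std::int16_t,
                                          std::int8_t>,
                       SizeType>;

// 1-byte scalars scan in int8_t words, 2-byte scalars in int16_t words,
// everything else in full 4-byte words:
static_assert(std::is_same_v<scan_word_t<std::int8_t>, std::int8_t>);
static_assert(std::is_same_v<scan_word_t<std::int16_t>, std::int16_t>);
static_assert(std::is_same_v<scan_word_t<double>, std::uint32_t>);
// ---------------------------------------------------------------------------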
+ using word_size_type = std::conditional_t< + sizeof(value_type) < sizeof(size_type), + std::conditional_t, size_type>; + + // Algorithmic constraints: blockSize is a power of two AND blockDim.y == + // blockDim.z == 1 + + const CombinedFunctorReducerType m_functor_reducer; + const Policy m_policy; + const pointer_type m_result_ptr; + const bool m_result_ptr_device_accessible; + const bool m_result_ptr_host_accessible; + word_size_type* m_scratch_space = nullptr; + size_type* m_scratch_flags = nullptr; + + static constexpr bool UseShflReduction = false; + + private: + struct ShflReductionTag {}; + struct SHMEMReductionTag {}; + + // Make the exec_range calls call to Reduce::DeviceIterateTile + template + __device__ inline std::enable_if_t::value> exec_range( + const Member& i, reference_type update) const { + m_functor_reducer.get_functor()(i, update); + } + + template + __device__ inline std::enable_if_t::value> exec_range( + const Member& i, reference_type update) const { + m_functor_reducer.get_functor()(TagType(), i, update); + } + + public: + __device__ inline void operator()() const { + using ReductionTag = std::conditional_t; + run(ReductionTag{}); + } + + __device__ inline void run(SHMEMReductionTag) const { + const ReducerType& reducer = m_functor_reducer.get_reducer(); + const integral_nonzero_constant + word_count(reducer.value_size() / sizeof(word_size_type)); + + { + reference_type value = reducer.init(reinterpret_cast( + ::Kokkos::kokkos_impl_hip_shared_memory() + + threadIdx.y * word_count.value)); + + // Number of blocks is bounded so that the reduction can be limited to two + // passes. Each thread block is given an approximately equal amount of + // work to perform. Accumulate the values for this block. The accumulation + // ordering does not match the final pass, but is arithmetically + // equivalent. + + const WorkRange range(m_policy, blockIdx.x, gridDim.x); + + for (Member iwork = range.begin() + threadIdx.y, iwork_end = range.end(); + iwork < iwork_end; iwork += blockDim.y) { + this->template exec_range(iwork, value); + } + } + + // Reduce with final value at blockDim.y - 1 location. + // Shortcut for length zero reduction + bool do_final_reduction = m_policy.begin() == m_policy.end(); + if (!do_final_reduction) + do_final_reduction = hip_single_inter_block_reduce_scan( + reducer, blockIdx.x, gridDim.x, + ::Kokkos::kokkos_impl_hip_shared_memory(), + m_scratch_space, m_scratch_flags); + if (do_final_reduction) { + // This is the final block with the final result at the final threads' + // location + + word_size_type* const shared = + ::Kokkos::kokkos_impl_hip_shared_memory() + + (blockDim.y - 1) * word_count.value; + word_size_type* const global = + m_result_ptr_device_accessible + ? reinterpret_cast(m_result_ptr) + : m_scratch_space; + + if (threadIdx.y == 0) { + reducer.final(reinterpret_cast(shared)); + } + + if (::Kokkos::Impl::HIPTraits::WarpSize < word_count.value) { + __syncthreads(); + } + + for (unsigned i = threadIdx.y; i < word_count.value; i += blockDim.y) { + global[i] = shared[i]; + } + } + } + + __device__ inline void run(ShflReductionTag) const { + const ReducerType& reducer = m_functor_reducer.get_reducer(); + + value_type value; + reducer.init(&value); + // Number of blocks is bounded so that the reduction can be limited to two + // passes. Each thread block is given an approximately equal amount of work + // to perform. Accumulate the values for this block. 
The accumulation + // ordering does not match the final pass, but is arithmetically equivalent. + + WorkRange const range(m_policy, blockIdx.x, gridDim.x); + + for (Member iwork = range.begin() + threadIdx.y, iwork_end = range.end(); + iwork < iwork_end; iwork += blockDim.y) { + this->template exec_range(iwork, value); + } + + pointer_type const result = reinterpret_cast(m_scratch_space); + + int max_active_thread = static_cast(range.end() - range.begin()) < + static_cast(blockDim.y) + ? range.end() - range.begin() + : blockDim.y; + + max_active_thread = + (max_active_thread == 0) ? blockDim.y : max_active_thread; + + value_type init; + reducer.init(&init); + if (m_policy.begin() == m_policy.end()) { + reducer.final(&value); + pointer_type const final_result = + m_result_ptr_device_accessible ? m_result_ptr : result; + *final_result = value; + } else if (Impl::hip_inter_block_shuffle_reduction<>( + value, init, reducer, m_scratch_space, result, + m_scratch_flags, max_active_thread)) { + unsigned int const id = threadIdx.y * blockDim.x + threadIdx.x; + if (id == 0) { + reducer.final(&value); + pointer_type const final_result = + m_result_ptr_device_accessible ? m_result_ptr : result; + *final_result = value; + } + } + } + + // Determine block size constrained by shared memory: + inline unsigned local_block_size(const FunctorType& f) { + const auto& instance = m_policy.space().impl_internal_space_instance(); + auto shmem_functor = [&f](unsigned n) { + return hip_single_inter_block_reduce_scan_shmem(f, n); + }; + return Kokkos::Impl::hip_get_preferred_blocksize( + instance, shmem_functor); + } + + inline void execute() { + const ReducerType& reducer = m_functor_reducer.get_reducer(); + + const index_type nwork = m_policy.end() - m_policy.begin(); + const bool need_device_set = ReducerType::has_init_member_function() || + ReducerType::has_final_member_function() || + !m_result_ptr_host_accessible || + !std::is_same::value; + if ((nwork > 0) || need_device_set) { + const int block_size = local_block_size(m_functor_reducer.get_functor()); + if (block_size == 0) { + Kokkos::Impl::throw_runtime_exception( + std::string("Kokkos::Impl::ParallelReduce< HIP > could not find a " + "valid execution configuration.")); + } + + // REQUIRED ( 1 , N , 1 ) + dim3 block(1, block_size, 1); + // use a slightly less constrained, but still well bounded limit for + // scratch + int nblocks = (nwork + block.y - 1) / block.y; + // Heuristic deciding the value of nblocks. + // The general idea here is we want to: + // 1. Not undersubscribe the device (i.e., we want at least + // preferred_block_min blocks) + // 2. Have each thread reduce > 1 value to minimize overheads + // 3. 
Limit the total # of blocks, to avoid unbounded scratch space + constexpr int block_max = 4096; + constexpr int preferred_block_min = 1024; + + if (nblocks < preferred_block_min) { + // keep blocks as is, already have low parallelism + } else if (nblocks > block_max) { + // "large dispatch" -> already have lots of parallelism + nblocks = block_max; + } else { + // in the intermediate range, try to have each thread process multiple + // items to offset the cost of the reduction (with not enough + // parallelism to hide it) + int items_per_thread = + (nwork + nblocks * block_size - 1) / (nblocks * block_size); + if (items_per_thread < 4) { + int ratio = std::min( + (nblocks + preferred_block_min - 1) / preferred_block_min, + (4 + items_per_thread - 1) / items_per_thread); + nblocks /= ratio; + } + } + + // TODO: down casting these uses more space than required? + m_scratch_space = + (word_size_type*)::Kokkos::Impl::hip_internal_scratch_space( + m_policy.space(), reducer.value_size() * nblocks); + // Intentionally do not downcast to word_size_type since we use HIP + // atomics in Kokkos_HIP_ReduceScan.hpp + m_scratch_flags = ::Kokkos::Impl::hip_internal_scratch_flags( + m_policy.space(), sizeof(size_type)); + // Required grid.x <= block.y + dim3 grid(nblocks, 1, 1); + + if (nwork == 0) { + block = dim3(1, 1, 1); + grid = dim3(1, 1, 1); + } + const int shmem = + UseShflReduction + ? 0 + : hip_single_inter_block_reduce_scan_shmem( + m_functor_reducer.get_functor(), block.y); + + Kokkos::Impl::hip_parallel_launch( + *this, grid, block, shmem, + m_policy.space().impl_internal_space_instance(), + false); // copy to device and execute + + if (!m_result_ptr_device_accessible && m_result_ptr) { + const int size = reducer.value_size(); + DeepCopy(m_policy.space(), m_result_ptr, + m_scratch_space, size); + } + } else { + if (m_result_ptr) { + reducer.init(m_result_ptr); + } + } + } + + template + ParallelReduce(const CombinedFunctorReducerType& arg_functor_reducer, + const Policy& arg_policy, const ViewType& arg_result) + : m_functor_reducer(arg_functor_reducer), + m_policy(arg_policy), + m_result_ptr(arg_result.data()), + m_result_ptr_device_accessible( + MemorySpaceAccess::accessible), + m_result_ptr_host_accessible( + MemorySpaceAccess::accessible) {} +}; + +} // namespace Impl +} // namespace Kokkos + +#endif diff --git a/core/src/HIP/Kokkos_HIP_ParallelReduce_Team.hpp b/core/src/HIP/Kokkos_HIP_ParallelReduce_Team.hpp new file mode 100644 index 00000000000..609ba28b866 --- /dev/null +++ b/core/src/HIP/Kokkos_HIP_ParallelReduce_Team.hpp @@ -0,0 +1,394 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. 
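// ---------------------------------------------------------------------------
// Aside on the nblocks heuristic in Kokkos_HIP_ParallelReduce_Range.hpp
// above, restated as a standalone function: keep all blocks when parallelism
// is already low, cap at block_max for huge dispatches, and in the
// intermediate range shrink the grid until each thread reduces roughly four
// items. pick_nblocks is a hypothetical helper; the constants mirror the
// hunk above.
#include <algorithm>

inline int pick_nblocks(long long nwork, int block_size) {
  constexpr int block_max           = 4096;
  constexpr int preferred_block_min = 1024;
  int nblocks = static_cast<int>((nwork + block_size - 1) / block_size);
  if (nblocks < preferred_block_min) {
    // under-subscribed already: keep every block
  } else if (nblocks > block_max) {
    nblocks = block_max;  // large dispatch: plenty of parallelism
  } else {
    // intermediate range: amortize the reduction cost over >= 4 items/thread
    const long long per_grid = static_cast<long long>(nblocks) * block_size;
    const int items_per_thread =
        static_cast<int>((nwork + per_grid - 1) / per_grid);
    if (items_per_thread < 4) {
      const int ratio = std::min(
          (nblocks + preferred_block_min - 1) / preferred_block_min,
          (4 + items_per_thread - 1) / items_per_thread);
      nblocks /= ratio;
    }
  }
  return nblocks;
}
// e.g. pick_nblocks(1000000, 256) cuts ceil(1e6/256) = 3907 blocks down to
// 976, so each thread reduces about four elements.
// ---------------------------------------------------------------------------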
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOS_HIP_PARALLEL_REDUCE_TEAM_HPP +#define KOKKOS_HIP_PARALLEL_REDUCE_TEAM_HPP + +#include + +#include +#include +#include +#include + +namespace Kokkos { +namespace Impl { + +template +class ParallelReduce, HIP> { + public: + using Policy = TeamPolicyInternal; + using FunctorType = typename CombinedFunctorReducerType::functor_type; + using ReducerType = typename CombinedFunctorReducerType::reducer_type; + + private: + using member_type = typename Policy::member_type; + using work_tag = typename Policy::work_tag; + using launch_bounds = typename Policy::launch_bounds; + + using pointer_type = typename ReducerType::pointer_type; + using reference_type = typename ReducerType::reference_type; + using value_type = typename ReducerType::value_type; + + public: + using functor_type = FunctorType; + using size_type = HIP::size_type; + + // static int constexpr UseShflReduction = false; + // FIXME_HIP This should be disabled unconditionally for best performance, but + // it currently causes tests to fail. + static constexpr int UseShflReduction = + (ReducerType::static_value_size() != 0); + + private: + struct ShflReductionTag {}; + struct SHMEMReductionTag {}; + + // Algorithmic constraints: blockDim.y is a power of two AND + // blockDim.y == blockDim.z == 1 shared memory utilization: + // + // [ global reduce space ] + // [ team reduce space ] + // [ team shared space ] + // + + const CombinedFunctorReducerType m_functor_reducer; + const Policy m_policy; + const pointer_type m_result_ptr; + const bool m_result_ptr_device_accessible; + const bool m_result_ptr_host_accessible; + size_type* m_scratch_space; + size_type* m_scratch_flags; + size_type m_team_begin; + size_type m_shmem_begin; + size_type m_shmem_size; + void* m_scratch_ptr[2]; + size_t m_scratch_size[2]; + int m_scratch_pool_id = -1; + int32_t* m_scratch_locks; + size_t m_num_scratch_locks; + const size_type m_league_size; + int m_team_size; + const size_type m_vector_size; + + template + __device__ inline std::enable_if_t::value> exec_team( + member_type const& member, reference_type update) const { + m_functor_reducer.get_functor()(member, update); + } + + template + __device__ inline std::enable_if_t::value> exec_team( + member_type const& member, reference_type update) const { + m_functor_reducer.get_functor()(TagType(), member, update); + } + + __device__ inline void iterate_through_league(int const threadid, + reference_type value) const { + int const int_league_size = static_cast(m_league_size); + for (int league_rank = blockIdx.x; league_rank < int_league_size; + league_rank += gridDim.x) { + this->template exec_team( + member_type( + kokkos_impl_hip_shared_memory() + m_team_begin, + m_shmem_begin, m_shmem_size, + reinterpret_cast( + reinterpret_cast(m_scratch_ptr[1]) + + static_cast(threadid / (blockDim.x * blockDim.y)) * + m_scratch_size[1]), + m_scratch_size[1], league_rank, m_league_size), + value); + } + } + + int compute_block_count() const { + constexpr auto light_weight = + Kokkos::Experimental::WorkItemProperty::HintLightWeight; + constexpr typename Policy::work_item_property property; + // Numbers were tuned on MI210 using dot product and yAx benchmarks + constexpr int block_max = + (property & light_weight) == light_weight ? 
2097152 : 65536; + constexpr int preferred_block_min = 1024; + int block_count = m_league_size; + if (block_count < preferred_block_min) { + // keep blocks as is, already low parallelism + } else if (block_count >= block_max) { + block_count = block_max; + + } else { + int nwork = m_league_size * m_team_size; + int items_per_thread = + (nwork + block_count * m_team_size - 1) / (block_count * m_team_size); + if (items_per_thread < 4) { + int ratio = std::min( + (block_count + preferred_block_min - 1) / preferred_block_min, + (4 + items_per_thread - 1) / items_per_thread); + block_count /= ratio; + } + } + + return block_count; + } + + public: + __device__ inline void operator()() const { + int64_t threadid = 0; + if (m_scratch_size[1] > 0) { + threadid = hip_get_scratch_index(m_league_size, m_scratch_locks, + m_num_scratch_locks); + } + + using ReductionTag = std::conditional_t; + run(ReductionTag{}, threadid); + + if (m_scratch_size[1] > 0) { + hip_release_scratch_index(m_scratch_locks, threadid); + } + } + + __device__ inline void run(SHMEMReductionTag, int const threadid) const { + const ReducerType& reducer = m_functor_reducer.get_reducer(); + + integral_nonzero_constant const + word_count(reducer.value_size() / sizeof(size_type)); + + reference_type value = + reducer.init(kokkos_impl_hip_shared_memory() + + threadIdx.y * word_count.value); + // Iterate this block through the league + iterate_through_league(threadid, value); + + // Reduce with final value at blockDim.y - 1 location. + bool do_final_reduce = (m_league_size == 0); + if (!do_final_reduce) + do_final_reduce = + hip_single_inter_block_reduce_scan( + reducer, blockIdx.x, gridDim.x, + kokkos_impl_hip_shared_memory(), m_scratch_space, + m_scratch_flags); + if (do_final_reduce) { + // This is the final block with the final result at the final threads' + // location + + size_type* const shared = kokkos_impl_hip_shared_memory() + + (blockDim.y - 1) * word_count.value; + size_type* const global = m_result_ptr_device_accessible + ? reinterpret_cast(m_result_ptr) + : m_scratch_space; + + if (threadIdx.y == 0) { + reducer.final(reinterpret_cast(shared)); + } + + if (HIPTraits::WarpSize < word_count.value) { + __syncthreads(); + } + + for (unsigned i = threadIdx.y; i < word_count.value; i += blockDim.y) { + global[i] = shared[i]; + } + } + } + + __device__ inline void run(ShflReductionTag, int const threadid) const { + const ReducerType& reducer = m_functor_reducer.get_reducer(); + + value_type value; + reducer.init(&value); + + // Iterate this block through the league + iterate_through_league(threadid, value); + + pointer_type const result = + m_result_ptr_device_accessible + ? 
m_result_ptr + : reinterpret_cast(m_scratch_space); + + value_type init; + reducer.init(&init); + if (m_league_size == 0) { + reducer.final(&value); + *result = value; + } else if (Impl::hip_inter_block_shuffle_reduction( + value, init, reducer, m_scratch_space, result, + m_scratch_flags, blockDim.y)) { + unsigned int const id = threadIdx.y * blockDim.x + threadIdx.x; + if (id == 0) { + reducer.final(&value); + *result = value; + } + } + } + + inline void execute() { + const ReducerType& reducer = m_functor_reducer.get_reducer(); + + const bool is_empty_range = m_league_size == 0 || m_team_size == 0; + const bool need_device_set = ReducerType::has_init_member_function() || + ReducerType::has_final_member_function() || + !m_result_ptr_host_accessible || + Policy::is_graph_kernel::value || + !std::is_same::value; + if (!is_empty_range || need_device_set) { + int const block_count = compute_block_count(); + + m_scratch_space = hip_internal_scratch_space( + m_policy.space(), reducer.value_size() * block_count); + m_scratch_flags = + hip_internal_scratch_flags(m_policy.space(), sizeof(size_type)); + + dim3 block(m_vector_size, m_team_size, 1); + dim3 grid(block_count, 1, 1); + if (is_empty_range) { + block = dim3(1, 1, 1); + grid = dim3(1, 1, 1); + } + const int shmem_size_total = m_team_begin + m_shmem_begin + m_shmem_size; + + Impl::hip_parallel_launch( + *this, grid, block, shmem_size_total, + m_policy.space().impl_internal_space_instance(), + true); // copy to device and execute + + if (!m_result_ptr_device_accessible) { + m_policy.space().impl_internal_space_instance()->fence(); + + if (m_result_ptr) { + const int size = reducer.value_size(); + DeepCopy(m_policy.space(), m_result_ptr, + m_scratch_space, size); + } + } + } else { + if (m_result_ptr) { + reducer.init(m_result_ptr); + } + } + } + + template + ParallelReduce(CombinedFunctorReducerType const& arg_functor_reducer, + Policy const& arg_policy, ViewType const& arg_result) + : m_functor_reducer(arg_functor_reducer), + m_policy(arg_policy), + m_result_ptr(arg_result.data()), + m_result_ptr_device_accessible( + MemorySpaceAccess::accessible), + m_result_ptr_host_accessible( + MemorySpaceAccess::accessible), + m_scratch_space(nullptr), + m_scratch_flags(nullptr), + m_team_begin(0), + m_shmem_begin(0), + m_shmem_size(0), + m_scratch_ptr{nullptr, nullptr}, + m_league_size(arg_policy.league_size()), + m_team_size(arg_policy.team_size()), + m_vector_size(arg_policy.impl_vector_length()) { + auto internal_space_instance = + m_policy.space().impl_internal_space_instance(); + m_team_size = m_team_size >= 0 ? m_team_size + : arg_policy.team_size_recommended( + arg_functor_reducer.get_functor(), + arg_functor_reducer.get_reducer(), + ParallelReduceTag()); + + m_team_begin = + UseShflReduction + ? 
0 + : hip_single_inter_block_reduce_scan_shmem( + arg_functor_reducer.get_functor(), m_team_size); + m_shmem_begin = sizeof(double) * (m_team_size + 2); + m_shmem_size = m_policy.scratch_size(0, m_team_size) + + FunctorTeamShmemSize::value( + arg_functor_reducer.get_functor(), m_team_size); + m_scratch_size[0] = m_shmem_size; + m_scratch_size[1] = m_policy.scratch_size(1, m_team_size); + m_scratch_locks = internal_space_instance->m_scratch_locks; + m_num_scratch_locks = internal_space_instance->m_num_scratch_locks; + if (m_team_size <= 0) { + m_scratch_ptr[1] = nullptr; + } else { + m_scratch_pool_id = internal_space_instance->acquire_team_scratch_space(); + m_scratch_ptr[1] = internal_space_instance->resize_team_scratch_space( + m_scratch_pool_id, + static_cast(m_scratch_size[1]) * + (std::min( + static_cast(HIP().concurrency() / + (m_team_size * m_vector_size)), + static_cast(m_league_size)))); + } + + // The global parallel_reduce does not support vector_length other than 1 at + // the moment + if ((arg_policy.impl_vector_length() > 1) && !UseShflReduction) + Impl::throw_runtime_exception( + "Kokkos::parallel_reduce with a TeamPolicy using a vector length of " + "greater than 1 is not currently supported for HIP for dynamic " + "sized reduction types."); + + if ((m_team_size < HIPTraits::WarpSize) && !UseShflReduction) + Impl::throw_runtime_exception( + "Kokkos::parallel_reduce with a TeamPolicy using a team_size smaller " + "than 64 is not currently supported with HIP for dynamic sized " + "reduction types."); + + // Functor's reduce memory, team scan memory, and team shared memory depend + // upon team size. + + const int shmem_size_total = m_team_begin + m_shmem_begin + m_shmem_size; + + if (!Kokkos::Impl::is_integral_power_of_two(m_team_size) && + !UseShflReduction) { + Kokkos::Impl::throw_runtime_exception( + std::string("Kokkos::Impl::ParallelReduce< HIP > bad team size")); + } + + if (internal_space_instance->m_maxShmemPerBlock < shmem_size_total) { + Kokkos::Impl::throw_runtime_exception( + std::string("Kokkos::Impl::ParallelReduce< HIP > requested too much " + "L0 scratch memory")); + } + + size_t max_size = arg_policy.team_size_max( + arg_functor_reducer.get_functor(), arg_functor_reducer.get_reducer(), + ParallelReduceTag()); + if (static_cast(m_team_size) > static_cast(max_size)) { + Kokkos::Impl::throw_runtime_exception( + std::string("Kokkos::Impl::ParallelReduce< HIP > requested too " + "large team size.")); + } + } + + ~ParallelReduce() { + if (m_scratch_pool_id >= 0) { + m_policy.space() + .impl_internal_space_instance() + ->release_team_scratch_space(m_scratch_pool_id); + } + } +}; + +} // namespace Impl +} // namespace Kokkos + +#endif diff --git a/core/src/HIP/Kokkos_HIP_Parallel_Range.hpp b/core/src/HIP/Kokkos_HIP_ParallelScan_Range.hpp similarity index 50% rename from core/src/HIP/Kokkos_HIP_Parallel_Range.hpp rename to core/src/HIP/Kokkos_HIP_ParallelScan_Range.hpp index 26e8be4698a..41692a3291b 100644 --- a/core/src/HIP/Kokkos_HIP_Parallel_Range.hpp +++ b/core/src/HIP/Kokkos_HIP_ParallelScan_Range.hpp @@ -14,390 +14,18 @@ // //@HEADER -#ifndef KOKKO_HIP_PARALLEL_RANGE_HPP -#define KOKKO_HIP_PARALLEL_RANGE_HPP +#ifndef KOKKOS_HIP_PARALLEL_SCAN_RANGE_HPP +#define KOKKOS_HIP_PARALLEL_SCAN_RANGE_HPP #include -#if defined(__HIPCC__) - #include #include #include -#include -#include namespace Kokkos { namespace Impl { -template -class ParallelFor, Kokkos::HIP> { - public: - using Policy = Kokkos::RangePolicy; - - private: - using Member = typename Policy::member_type; - 
using WorkTag = typename Policy::work_tag; - using LaunchBounds = typename Policy::launch_bounds; - - const FunctorType m_functor; - const Policy m_policy; - - template - inline __device__ std::enable_if_t::value> exec_range( - const Member i) const { - m_functor(i); - } - - template - inline __device__ std::enable_if_t::value> exec_range( - const Member i) const { - m_functor(TagType(), i); - } - - public: - using functor_type = FunctorType; - - ParallelFor() = delete; - ParallelFor(ParallelFor const&) = default; - ParallelFor& operator=(ParallelFor const&) = delete; - - inline __device__ void operator()() const { - const Member work_stride = blockDim.y * gridDim.x; - const Member work_end = m_policy.end(); - - for (Member iwork = - m_policy.begin() + threadIdx.y + blockDim.y * blockIdx.x; - iwork < work_end; - iwork = iwork < work_end - work_stride ? iwork + work_stride - : work_end) { - this->template exec_range(iwork); - } - } - - inline void execute() const { - const typename Policy::index_type nwork = m_policy.end() - m_policy.begin(); - - using DriverType = ParallelFor; - const int block_size = - Kokkos::Impl::hip_get_preferred_blocksize(); - const dim3 block(1, block_size, 1); - const dim3 grid( - typename Policy::index_type((nwork + block.y - 1) / block.y), 1, 1); - - if (block_size == 0) { - Kokkos::Impl::throw_runtime_exception( - std::string("Kokkos::Impl::ParallelFor< HIP > could not find a " - "valid execution configuration.")); - } - Kokkos::Impl::hip_parallel_launch( - *this, grid, block, 0, m_policy.space().impl_internal_space_instance(), - false); - } - - ParallelFor(const FunctorType& arg_functor, const Policy& arg_policy) - : m_functor(arg_functor), m_policy(arg_policy) {} -}; - -//---------------------------------------------------------------------------- -//---------------------------------------------------------------------------- - -template -class ParallelReduce, - Kokkos::HIP> { - public: - using Policy = Kokkos::RangePolicy; - using FunctorType = typename CombinedFunctorReducerType::functor_type; - using ReducerType = typename CombinedFunctorReducerType::reducer_type; - - private: - using WorkRange = typename Policy::WorkRange; - using WorkTag = typename Policy::work_tag; - using Member = typename Policy::member_type; - using LaunchBounds = typename Policy::launch_bounds; - - public: - using pointer_type = typename ReducerType::pointer_type; - using value_type = typename ReducerType::value_type; - using reference_type = typename ReducerType::reference_type; - using functor_type = FunctorType; - using reducer_type = ReducerType; - using size_type = Kokkos::HIP::size_type; - using index_type = typename Policy::index_type; - // Conditionally set word_size_type to int16_t or int8_t if value_type is - // smaller than int32_t (Kokkos::HIP::size_type) - // word_size_type is used to determine the word count, shared memory buffer - // size, and global memory buffer size before the scan is performed. - // Within the scan, the word count is recomputed based on word_size_type - // and when calculating indexes into the shared/global memory buffers for - // performing the scan, word_size_type is used again. - // For scalars > 4 bytes in size, indexing into shared/global memory relies - // on the block and grid dimensions to ensure that we index at the correct - // offset rather than at every 4 byte word; such that, when the join is - // performed, we have the correct data that was copied over in chunks of 4 - // bytes. 
- using word_size_type = std::conditional_t< - sizeof(value_type) < sizeof(size_type), - std::conditional_t, size_type>; - - // Algorithmic constraints: blockSize is a power of two AND blockDim.y == - // blockDim.z == 1 - - const CombinedFunctorReducerType m_functor_reducer; - const Policy m_policy; - const pointer_type m_result_ptr; - const bool m_result_ptr_device_accessible; - const bool m_result_ptr_host_accessible; - word_size_type* m_scratch_space = nullptr; - size_type* m_scratch_flags = nullptr; - - static constexpr bool UseShflReduction = false; - - private: - struct ShflReductionTag {}; - struct SHMEMReductionTag {}; - - // Make the exec_range calls call to Reduce::DeviceIterateTile - template - __device__ inline std::enable_if_t::value> exec_range( - const Member& i, reference_type update) const { - m_functor_reducer.get_functor()(i, update); - } - - template - __device__ inline std::enable_if_t::value> exec_range( - const Member& i, reference_type update) const { - m_functor_reducer.get_functor()(TagType(), i, update); - } - - public: - __device__ inline void operator()() const { - using ReductionTag = std::conditional_t; - run(ReductionTag{}); - } - - __device__ inline void run(SHMEMReductionTag) const { - const ReducerType& reducer = m_functor_reducer.get_reducer(); - const integral_nonzero_constant - word_count(reducer.value_size() / sizeof(word_size_type)); - - { - reference_type value = reducer.init(reinterpret_cast( - ::Kokkos::kokkos_impl_hip_shared_memory() + - threadIdx.y * word_count.value)); - - // Number of blocks is bounded so that the reduction can be limited to two - // passes. Each thread block is given an approximately equal amount of - // work to perform. Accumulate the values for this block. The accumulation - // ordering does not match the final pass, but is arithmetically - // equivalent. - - const WorkRange range(m_policy, blockIdx.x, gridDim.x); - - for (Member iwork = range.begin() + threadIdx.y, iwork_end = range.end(); - iwork < iwork_end; iwork += blockDim.y) { - this->template exec_range(iwork, value); - } - } - - // Reduce with final value at blockDim.y - 1 location. - // Shortcut for length zero reduction - bool do_final_reduction = m_policy.begin() == m_policy.end(); - if (!do_final_reduction) - do_final_reduction = hip_single_inter_block_reduce_scan( - reducer, blockIdx.x, gridDim.x, - ::Kokkos::kokkos_impl_hip_shared_memory(), - m_scratch_space, m_scratch_flags); - if (do_final_reduction) { - // This is the final block with the final result at the final threads' - // location - - word_size_type* const shared = - ::Kokkos::kokkos_impl_hip_shared_memory() + - (blockDim.y - 1) * word_count.value; - word_size_type* const global = - m_result_ptr_device_accessible - ? reinterpret_cast(m_result_ptr) - : m_scratch_space; - - if (threadIdx.y == 0) { - reducer.final(reinterpret_cast(shared)); - } - - if (::Kokkos::Impl::HIPTraits::WarpSize < word_count.value) { - __syncthreads(); - } - - for (unsigned i = threadIdx.y; i < word_count.value; i += blockDim.y) { - global[i] = shared[i]; - } - } - } - - __device__ inline void run(ShflReductionTag) const { - const ReducerType& reducer = m_functor_reducer.get_reducer(); - - value_type value; - reducer.init(&value); - // Number of blocks is bounded so that the reduction can be limited to two - // passes. Each thread block is given an approximately equal amount of work - // to perform. Accumulate the values for this block. 
The accumulation - // ordering does not match the final pass, but is arithmetically equivalent. - - WorkRange const range(m_policy, blockIdx.x, gridDim.x); - - for (Member iwork = range.begin() + threadIdx.y, iwork_end = range.end(); - iwork < iwork_end; iwork += blockDim.y) { - this->template exec_range(iwork, value); - } - - pointer_type const result = reinterpret_cast(m_scratch_space); - - int max_active_thread = static_cast(range.end() - range.begin()) < - static_cast(blockDim.y) - ? range.end() - range.begin() - : blockDim.y; - - max_active_thread = - (max_active_thread == 0) ? blockDim.y : max_active_thread; - - value_type init; - reducer.init(&init); - if (m_policy.begin() == m_policy.end()) { - reducer.final(&value); - pointer_type const final_result = - m_result_ptr_device_accessible ? m_result_ptr : result; - *final_result = value; - } else if (Impl::hip_inter_block_shuffle_reduction<>( - value, init, reducer, m_scratch_space, result, - m_scratch_flags, max_active_thread)) { - unsigned int const id = threadIdx.y * blockDim.x + threadIdx.x; - if (id == 0) { - reducer.final(&value); - pointer_type const final_result = - m_result_ptr_device_accessible ? m_result_ptr : result; - *final_result = value; - } - } - } - - // Determine block size constrained by shared memory: - inline unsigned local_block_size(const FunctorType& f) { - const auto& instance = m_policy.space().impl_internal_space_instance(); - auto shmem_functor = [&f](unsigned n) { - return hip_single_inter_block_reduce_scan_shmem(f, n); - }; - return Kokkos::Impl::hip_get_preferred_blocksize( - instance, shmem_functor); - } - - inline void execute() { - const ReducerType& reducer = m_functor_reducer.get_reducer(); - - const index_type nwork = m_policy.end() - m_policy.begin(); - const bool need_device_set = ReducerType::has_init_member_function() || - ReducerType::has_final_member_function() || - !m_result_ptr_host_accessible || - !std::is_same::value; - if ((nwork > 0) || need_device_set) { - const int block_size = local_block_size(m_functor_reducer.get_functor()); - if (block_size == 0) { - Kokkos::Impl::throw_runtime_exception( - std::string("Kokkos::Impl::ParallelReduce< HIP > could not find a " - "valid execution configuration.")); - } - - // REQUIRED ( 1 , N , 1 ) - dim3 block(1, block_size, 1); - // use a slightly less constrained, but still well bounded limit for - // scratch - int nblocks = (nwork + block.y - 1) / block.y; - // Heuristic deciding the value of nblocks. - // The general idea here is we want to: - // 1. Not undersubscribe the device (i.e., we want at least - // preferred_block_min blocks) - // 2. Have each thread reduce > 1 value to minimize overheads - // 3. 
Limit the total # of blocks, to avoid unbounded scratch space - constexpr int block_max = 4096; - constexpr int preferred_block_min = 1024; - - if (nblocks < preferred_block_min) { - // keep blocks as is, already have low parallelism - } else if (nblocks > block_max) { - // "large dispatch" -> already have lots of parallelism - nblocks = block_max; - } else { - // in the intermediate range, try to have each thread process multiple - // items to offset the cost of the reduction (with not enough - // parallelism to hide it) - int items_per_thread = - (nwork + nblocks * block_size - 1) / (nblocks * block_size); - if (items_per_thread < 4) { - int ratio = std::min( - (nblocks + preferred_block_min - 1) / preferred_block_min, - (4 + items_per_thread - 1) / items_per_thread); - nblocks /= ratio; - } - } - - // TODO: down casting these uses more space than required? - m_scratch_space = - (word_size_type*)::Kokkos::Impl::hip_internal_scratch_space( - m_policy.space(), reducer.value_size() * nblocks); - // Intentionally do not downcast to word_size_type since we use HIP - // atomics in Kokkos_HIP_ReduceScan.hpp - m_scratch_flags = ::Kokkos::Impl::hip_internal_scratch_flags( - m_policy.space(), sizeof(size_type)); - // Required grid.x <= block.y - dim3 grid(nblocks, 1, 1); - - if (nwork == 0) { - block = dim3(1, 1, 1); - grid = dim3(1, 1, 1); - } - const int shmem = - UseShflReduction - ? 0 - : hip_single_inter_block_reduce_scan_shmem( - m_functor_reducer.get_functor(), block.y); - - Kokkos::Impl::hip_parallel_launch( - *this, grid, block, shmem, - m_policy.space().impl_internal_space_instance(), - false); // copy to device and execute - - if (!m_result_ptr_device_accessible && m_result_ptr) { - const int size = reducer.value_size(); - DeepCopy(m_policy.space(), m_result_ptr, - m_scratch_space, size); - } - } else { - if (m_result_ptr) { - reducer.init(m_result_ptr); - } - } - } - - template - ParallelReduce(const CombinedFunctorReducerType& arg_functor_reducer, - const Policy& arg_policy, const ViewType& arg_result) - : m_functor_reducer(arg_functor_reducer), - m_policy(arg_policy), - m_result_ptr(arg_result.data()), - m_result_ptr_device_accessible( - MemorySpaceAccess::accessible), - m_result_ptr_host_accessible( - MemorySpaceAccess::accessible) {} -}; - template class ParallelScanHIPBase { public: @@ -763,5 +391,3 @@ class ParallelScanWithTotal, } // namespace Kokkos #endif - -#endif diff --git a/core/src/HIP/Kokkos_HIP_Parallel_Team.hpp b/core/src/HIP/Kokkos_HIP_Parallel_Team.hpp deleted file mode 100644 index 3fe568ac361..00000000000 --- a/core/src/HIP/Kokkos_HIP_Parallel_Team.hpp +++ /dev/null @@ -1,936 +0,0 @@ -//@HEADER -// ************************************************************************ -// -// Kokkos v. 4.0 -// Copyright (2022) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. -// -// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. -// See https://kokkos.org/LICENSE for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//@HEADER - -#ifndef KOKKO_HIP_PARALLEL_TEAM_HPP -#define KOKKO_HIP_PARALLEL_TEAM_HPP - -#include - -#if defined(__HIPCC__) - -#include -#include -#include -#include - -namespace Kokkos { -namespace Impl { - -template -class TeamPolicyInternal - : public PolicyTraits { - public: - using execution_policy = TeamPolicyInternal; - - using traits = PolicyTraits; - - template - friend class TeamPolicyInternal; - - private: - typename traits::execution_space m_space; - int m_league_size; - int m_team_size; - int m_vector_length; - size_t m_team_scratch_size[2]; - size_t m_thread_scratch_size[2]; - int m_chunk_size; - bool m_tune_team_size; - bool m_tune_vector_length; - - public: - using execution_space = HIP; - - template - TeamPolicyInternal(TeamPolicyInternal const& p) { - m_league_size = p.m_league_size; - m_team_size = p.m_team_size; - m_vector_length = p.m_vector_length; - m_team_scratch_size[0] = p.m_team_scratch_size[0]; - m_team_scratch_size[1] = p.m_team_scratch_size[1]; - m_thread_scratch_size[0] = p.m_thread_scratch_size[0]; - m_thread_scratch_size[1] = p.m_thread_scratch_size[1]; - m_chunk_size = p.m_chunk_size; - m_space = p.m_space; - m_tune_team_size = p.m_tune_team_size; - m_tune_vector_length = p.m_tune_vector_length; - } - - template - int team_size_max(FunctorType const& f, ParallelForTag const&) const { - using closure_type = - Impl::ParallelFor>; - - return internal_team_size_common(f); - } - - template - inline int team_size_max(const FunctorType& f, - const ParallelReduceTag&) const { - using functor_analysis_type = - Impl::FunctorAnalysis; - using closure_type = Impl::ParallelReduce< - CombinedFunctorReducer, - TeamPolicy, Kokkos::HIP>; - return internal_team_size_common< - BlockType::Max, closure_type, - typename functor_analysis_type::value_type>(f); - } - - template - inline int team_size_max(const FunctorType& f, const ReducerType&, - const ParallelReduceTag&) const { - using closure_type = - Impl::ParallelReduce, - TeamPolicy, Kokkos::HIP>; - return internal_team_size_common(f); - } - - template - int team_size_recommended(FunctorType const& f, ParallelForTag const&) const { - using closure_type = - Impl::ParallelFor>; - - return internal_team_size_common( - f); - } - - template - inline int team_size_recommended(FunctorType const& f, - ParallelReduceTag const&) const { - using functor_analysis_type = - Impl::FunctorAnalysis; - using closure_type = Impl::ParallelReduce< - CombinedFunctorReducer, - TeamPolicy, Kokkos::HIP>; - return internal_team_size_common< - BlockType::Preferred, closure_type, - typename functor_analysis_type::value_type>(f); - } - - template - int team_size_recommended(FunctorType const& f, ReducerType const&, - ParallelReduceTag const&) const { - using closure_type = - Impl::ParallelReduce, - TeamPolicy, Kokkos::HIP>; - return internal_team_size_common(f); - } - - inline bool impl_auto_vector_length() const { return m_tune_vector_length; } - inline bool impl_auto_team_size() const { return m_tune_team_size; } - static int vector_length_max() { return HIPTraits::WarpSize; } - - static int verify_requested_vector_length(int requested_vector_length) { - int test_vector_length = - std::min(requested_vector_length, vector_length_max()); - - // Allow only power-of-two vector_length - if (!(is_integral_power_of_two(test_vector_length))) { - int test_pow2 = 1; - constexpr int warp_size = HIPTraits::WarpSize; - while (test_pow2 < warp_size) { - test_pow2 <<= 1; - if (test_pow2 > 
test_vector_length) { - break; - } - } - test_vector_length = test_pow2 >> 1; - } - - return test_vector_length; - } - - inline static int scratch_size_max(int level) { - // HIP Teams use (team_size + 2)*sizeof(double) shared memory for team - // reductions. They also use one int64_t in static shared memory for a - // shared ID. Furthermore, they use additional scratch memory in some - // reduction scenarios, which depend on the size of the value_type and is - // NOT captured here - constexpr size_t max_possible_team_size = 1024; - constexpr size_t max_reserved_shared_mem_per_team = - (max_possible_team_size + 2) * sizeof(double) + sizeof(int64_t); - // arbitrarily setting level 1 scratch limit to 20MB, for a - // MI250 that would give us about 4.4GB for 2 teams per CU - constexpr size_t max_l1_scratch_size = 20 * 1024 * 1024; - - size_t max_shmem = HIP().hip_device_prop().sharedMemPerBlock; - return (level == 0 ? max_shmem - max_reserved_shared_mem_per_team - : max_l1_scratch_size); - } - - inline void impl_set_vector_length(size_t size) { m_vector_length = size; } - inline void impl_set_team_size(size_t size) { m_team_size = size; } - int impl_vector_length() const { return m_vector_length; } - - int team_size() const { return m_team_size; } - - int league_size() const { return m_league_size; } - - size_t scratch_size(int level, int team_size_ = -1) const { - if (team_size_ < 0) team_size_ = m_team_size; - return m_team_scratch_size[level] + - team_size_ * m_thread_scratch_size[level]; - } - - size_t team_scratch_size(int level) const { - return m_team_scratch_size[level]; - } - - size_t thread_scratch_size(int level) const { - return m_thread_scratch_size[level]; - } - - typename traits::execution_space space() const { return m_space; } - - TeamPolicyInternal() - : m_space(typename traits::execution_space()), - m_league_size(0), - m_team_size(-1), - m_vector_length(0), - m_team_scratch_size{0, 0}, - m_thread_scratch_size{0, 0}, - m_chunk_size(HIPTraits::WarpSize), - m_tune_team_size(false), - m_tune_vector_length(false) {} - - /** \brief Specify league size, request team size */ - TeamPolicyInternal(const execution_space space_, int league_size_, - int team_size_request, int vector_length_request = 1) - : m_space(space_), - m_league_size(league_size_), - m_team_size(team_size_request), - m_vector_length( - (vector_length_request > 0) - ? verify_requested_vector_length(vector_length_request) - : (verify_requested_vector_length(1))), - m_team_scratch_size{0, 0}, - m_thread_scratch_size{0, 0}, - m_chunk_size(HIPTraits::WarpSize), - m_tune_team_size(bool(team_size_request <= 0)), - m_tune_vector_length(bool(vector_length_request <= 0)) { - // Make sure league size is permissible - if (league_size_ >= static_cast(hip_internal_maximum_grid_count()[0])) - Impl::throw_runtime_exception( - "Requested too large league_size for TeamPolicy on HIP execution " - "space."); - - // Make sure total block size is permissible - if (m_team_size * m_vector_length > HIPTraits::MaxThreadsPerBlock) { - Impl::throw_runtime_exception( - std::string("Kokkos::TeamPolicy< HIP > the team size is too large. 
" - "Team size x vector length must be smaller than 1024.")); - } - } - - /** \brief Specify league size, request team size */ - TeamPolicyInternal(const execution_space space_, int league_size_, - const Kokkos::AUTO_t& /* team_size_request */, - int vector_length_request = 1) - : TeamPolicyInternal(space_, league_size_, -1, vector_length_request) {} - // FLAG - /** \brief Specify league size and team size, request vector length*/ - TeamPolicyInternal(const execution_space space_, int league_size_, - int team_size_request, - const Kokkos::AUTO_t& /* vector_length_request */ - ) - : TeamPolicyInternal(space_, league_size_, team_size_request, -1) - - {} - - /** \brief Specify league size, request team size and vector length*/ - TeamPolicyInternal(const execution_space space_, int league_size_, - const Kokkos::AUTO_t& /* team_size_request */, - const Kokkos::AUTO_t& /* vector_length_request */ - - ) - : TeamPolicyInternal(space_, league_size_, -1, -1) - - {} - - TeamPolicyInternal(int league_size_, int team_size_request, - int vector_length_request = 1) - : TeamPolicyInternal(typename traits::execution_space(), league_size_, - team_size_request, vector_length_request) {} - - TeamPolicyInternal(int league_size_, - const Kokkos::AUTO_t& /* team_size_request */, - int vector_length_request = 1) - : TeamPolicyInternal(typename traits::execution_space(), league_size_, -1, - vector_length_request) {} - - /** \brief Specify league size and team size, request vector length*/ - TeamPolicyInternal(int league_size_, int team_size_request, - const Kokkos::AUTO_t& /* vector_length_request */ - - ) - : TeamPolicyInternal(typename traits::execution_space(), league_size_, - team_size_request, -1) - - {} - - /** \brief Specify league size, request team size and vector length*/ - TeamPolicyInternal(int league_size_, - const Kokkos::AUTO_t& /* team_size_request */, - const Kokkos::AUTO_t& /* vector_length_request */ - - ) - : TeamPolicyInternal(typename traits::execution_space(), league_size_, -1, - -1) {} - - int chunk_size() const { return m_chunk_size; } - - TeamPolicyInternal& set_chunk_size(typename traits::index_type chunk_size_) { - m_chunk_size = chunk_size_; - return *this; - } - - /** \brief set per team scratch size for a specific level of the scratch - * hierarchy */ - TeamPolicyInternal& set_scratch_size(int level, - PerTeamValue const& per_team) { - m_team_scratch_size[level] = per_team.value; - return *this; - } - - /** \brief set per thread scratch size for a specific level of the scratch - * hierarchy */ - TeamPolicyInternal& set_scratch_size(int level, - PerThreadValue const& per_thread) { - m_thread_scratch_size[level] = per_thread.value; - return *this; - } - - /** \brief set per thread and per team scratch size for a specific level of - * the scratch hierarchy */ - TeamPolicyInternal& set_scratch_size(int level, PerTeamValue const& per_team, - PerThreadValue const& per_thread) { - m_team_scratch_size[level] = per_team.value; - m_thread_scratch_size[level] = per_thread.value; - return *this; - } - - using member_type = Kokkos::Impl::HIPTeamMember; - - protected: - template - int internal_team_size_common(FunctorType const& f) const { - const unsigned shmem_block = team_scratch_size(0) + 2 * sizeof(double); - unsigned shmem_thread = thread_scratch_size(0) + sizeof(double); - using Tag = typename PatternTagFromImplSpecialization::type; - if constexpr (std::is_same_v) { - using Interface = - typename Impl::DeduceFunctorPatternInterface::type; - using Analysis = - Impl::FunctorAnalysis; - 
shmem_thread += - ((Analysis::StaticValueSize != 0) ? 0 : Analysis::value_size(f)); - } - const int vector_length = impl_vector_length(); - - const auto functor = [&f, shmem_block, shmem_thread, vector_length]( - const hipFuncAttributes& attr, int block_size) { - int functor_shmem = - ::Kokkos::Impl::FunctorTeamShmemSize::value( - f, block_size / vector_length); - return shmem_block + shmem_thread * (block_size / vector_length) + - functor_shmem + attr.sharedSizeBytes; - }; - int block_size; - if constexpr (BlockSize == BlockType::Max) { - block_size = hip_get_max_team_blocksize( - space().impl_internal_space_instance(), functor); - } else { - block_size = - hip_get_preferred_team_blocksize( - space().impl_internal_space_instance(), functor); - } - - if (block_size == 0) { - Kokkos::Impl::throw_runtime_exception(std::string( - "Kokkos::Impl::ParallelFor/Reduce< HIP > could not find a valid " - "team size.")); - } - if constexpr (std::is_same_v) { - return block_size / impl_vector_length(); - } else { - // Currently we require Power-of-2 team size for reductions. - int p2 = 1; - while (p2 <= block_size) p2 *= 2; - p2 /= 2; - return p2 / impl_vector_length(); - } - } -}; - -__device__ inline int64_t hip_get_scratch_index(HIP::size_type league_size, - int32_t* scratch_locks, - size_t num_scratch_locks) { - int64_t threadid = 0; - __shared__ int64_t base_thread_id; - if (threadIdx.x == 0 && threadIdx.y == 0) { - int64_t const wraparound_len = - Kokkos::min(int64_t(league_size), - int64_t(num_scratch_locks) / (blockDim.x * blockDim.y)); - threadid = (blockIdx.x * blockDim.z + threadIdx.z) % wraparound_len; - threadid *= blockDim.x * blockDim.y; - int done = 0; - while (!done) { - done = (0 == atomicCAS(&scratch_locks[threadid], 0, 1)); - if (!done) { - threadid += blockDim.x * blockDim.y; - if (int64_t(threadid + blockDim.x * blockDim.y) >= - wraparound_len * blockDim.x * blockDim.y) - threadid = 0; - } - } - base_thread_id = threadid; - } - __syncthreads(); - threadid = base_thread_id; - return threadid; -} - -__device__ inline void hip_release_scratch_index(int32_t* scratch_locks, - int64_t threadid) { - __syncthreads(); - if (threadIdx.x == 0 && threadIdx.y == 0) { - scratch_locks[threadid] = 0; - } -} - -template -class ParallelFor, HIP> { - public: - using Policy = TeamPolicy; - using functor_type = FunctorType; - using size_type = HIP::size_type; - - private: - using member_type = typename Policy::member_type; - using work_tag = typename Policy::work_tag; - using launch_bounds = typename Policy::launch_bounds; - - // Algorithmic constraints: blockDim.y is a power of two AND - // blockDim.y == blockDim.z == 1 shared memory utilization: - // - // [ team reduce space ] - // [ team shared space ] - - FunctorType const m_functor; - Policy const m_policy; - size_type const m_league_size; - int m_team_size; - size_type const m_vector_size; - int m_shmem_begin; - int m_shmem_size; - void* m_scratch_ptr[2]; - size_t m_scratch_size[2]; - int m_scratch_pool_id = -1; - int32_t* m_scratch_locks; - size_t m_num_scratch_locks; - - template - __device__ inline std::enable_if_t::value> exec_team( - const member_type& member) const { - m_functor(member); - } - - template - __device__ inline std::enable_if_t::value> exec_team( - const member_type& member) const { - m_functor(TagType(), member); - } - - public: - ParallelFor() = delete; - ParallelFor(ParallelFor const&) = default; - ParallelFor& operator=(ParallelFor const&) = delete; - - __device__ inline void operator()() const { - // Iterate this block 
through the league - int64_t threadid = 0; - if (m_scratch_size[1] > 0) { - threadid = hip_get_scratch_index(m_league_size, m_scratch_locks, - m_num_scratch_locks); - } - - int const int_league_size = static_cast(m_league_size); - for (int league_rank = blockIdx.x; league_rank < int_league_size; - league_rank += gridDim.x) { - this->template exec_team(typename Policy::member_type( - kokkos_impl_hip_shared_memory(), m_shmem_begin, m_shmem_size, - static_cast(static_cast(m_scratch_ptr[1]) + - ptrdiff_t(threadid / (blockDim.x * blockDim.y)) * - m_scratch_size[1]), - m_scratch_size[1], league_rank, m_league_size)); - } - if (m_scratch_size[1] > 0) { - hip_release_scratch_index(m_scratch_locks, threadid); - } - } - - inline void execute() const { - int64_t const shmem_size_total = m_shmem_begin + m_shmem_size; - dim3 const grid(static_cast(m_league_size), 1, 1); - dim3 const block(static_cast(m_vector_size), - static_cast(m_team_size), 1); - - using closure_type = - ParallelFor, HIP>; - Impl::hip_parallel_launch( - *this, grid, block, shmem_size_total, - m_policy.space().impl_internal_space_instance(), - true); // copy to device and execute - } - - ParallelFor(FunctorType const& arg_functor, Policy const& arg_policy) - : m_functor(arg_functor), - m_policy(arg_policy), - m_league_size(arg_policy.league_size()), - m_team_size(arg_policy.team_size()), - m_vector_size(arg_policy.impl_vector_length()) { - auto internal_space_instance = - m_policy.space().impl_internal_space_instance(); - m_team_size = m_team_size >= 0 ? m_team_size - : arg_policy.team_size_recommended( - arg_functor, ParallelForTag()); - - m_shmem_begin = (sizeof(double) * (m_team_size + 2)); - m_shmem_size = - (m_policy.scratch_size(0, m_team_size) + - FunctorTeamShmemSize::value(m_functor, m_team_size)); - m_scratch_size[0] = m_policy.scratch_size(0, m_team_size); - m_scratch_size[1] = m_policy.scratch_size(1, m_team_size); - m_scratch_locks = internal_space_instance->m_scratch_locks; - m_num_scratch_locks = internal_space_instance->m_num_scratch_locks; - - // Functor's reduce memory, team scan memory, and team shared memory depend - // upon team size. 
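Before the scratch pointers are set up below, it is worth spelling out the sizing rule the constructor applies to level-1 scratch: allocate one slot per team that can actually be resident at the same time. A hedged sketch of that bound as a free function (hypothetical name, not the patch's API):

#include <algorithm>
#include <cstdint>

// Cap the level-1 scratch allocation at min(resident teams, league size):
// slots beyond what can ever run concurrently would never be used.
inline std::int64_t team_scratch_bytes_sketch(
    std::int64_t per_team_bytes, std::int64_t concurrency,
    std::int64_t team_size, std::int64_t vector_size,
    std::int64_t league_size) {
  std::int64_t const resident_teams = concurrency / (team_size * vector_size);
  return per_team_bytes * std::min(resident_teams, league_size);
}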
- m_scratch_ptr[0] = nullptr; - if (m_team_size <= 0) { - m_scratch_ptr[1] = nullptr; - } else { - m_scratch_pool_id = internal_space_instance->acquire_team_scratch_space(); - m_scratch_ptr[1] = internal_space_instance->resize_team_scratch_space( - m_scratch_pool_id, - static_cast(m_scratch_size[1]) * - (std::min( - static_cast(HIP().concurrency() / - (m_team_size * m_vector_size)), - static_cast(m_league_size)))); - } - - int const shmem_size_total = m_shmem_begin + m_shmem_size; - if (internal_space_instance->m_maxShmemPerBlock < shmem_size_total) { - Kokkos::Impl::throw_runtime_exception(std::string( - "Kokkos::Impl::ParallelFor< HIP > insufficient shared memory")); - } - - size_t max_size = arg_policy.team_size_max(arg_functor, ParallelForTag()); - if (static_cast(m_team_size) > static_cast(max_size)) { - Kokkos::Impl::throw_runtime_exception(std::string( - "Kokkos::Impl::ParallelFor< HIP > requested too large team size.")); - } - } - - ~ParallelFor() { - if (m_scratch_pool_id >= 0) { - m_policy.space() - .impl_internal_space_instance() - ->release_team_scratch_space(m_scratch_pool_id); - } - } -}; - -//---------------------------------------------------------------------------- -//---------------------------------------------------------------------------- - -template -class ParallelReduce, HIP> { - public: - using Policy = TeamPolicyInternal; - using FunctorType = typename CombinedFunctorReducerType::functor_type; - using ReducerType = typename CombinedFunctorReducerType::reducer_type; - - private: - using member_type = typename Policy::member_type; - using work_tag = typename Policy::work_tag; - using launch_bounds = typename Policy::launch_bounds; - - using pointer_type = typename ReducerType::pointer_type; - using reference_type = typename ReducerType::reference_type; - using value_type = typename ReducerType::value_type; - - public: - using functor_type = FunctorType; - using size_type = HIP::size_type; - - // static int constexpr UseShflReduction = false; - // FIXME_HIP This should be disabled unconditionally for best performance, but - // it currently causes tests to fail. 
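The constant defined next selects between the two reduction paths at compile time, and the class then tag-dispatches its run() overloads on it. The pattern in miniature (illustrative names only, not the real class):

#include <type_traits>

struct ShflTagSketch {};   // warp-shuffle path: statically sized value types
struct ShmemTagSketch {};  // shared-memory path: dynamically sized types

template <bool UseShfl>
struct ReduceDispatchSketch {
  using Tag = std::conditional_t<UseShfl, ShflTagSketch, ShmemTagSketch>;
  void run(ShflTagSketch) const {}
  void run(ShmemTagSketch) const {}
  void operator()() const { run(Tag{}); }  // overload resolved at compile time
};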
- static constexpr int UseShflReduction = - (ReducerType::static_value_size() != 0); - - private: - struct ShflReductionTag {}; - struct SHMEMReductionTag {}; - - // Algorithmic constraints: blockDim.y is a power of two AND - // blockDim.y == blockDim.z == 1 shared memory utilization: - // - // [ global reduce space ] - // [ team reduce space ] - // [ team shared space ] - // - - const CombinedFunctorReducerType m_functor_reducer; - const Policy m_policy; - const pointer_type m_result_ptr; - const bool m_result_ptr_device_accessible; - const bool m_result_ptr_host_accessible; - size_type* m_scratch_space; - size_type* m_scratch_flags; - size_type m_team_begin; - size_type m_shmem_begin; - size_type m_shmem_size; - void* m_scratch_ptr[2]; - size_t m_scratch_size[2]; - int m_scratch_pool_id = -1; - int32_t* m_scratch_locks; - size_t m_num_scratch_locks; - const size_type m_league_size; - int m_team_size; - const size_type m_vector_size; - - template - __device__ inline std::enable_if_t::value> exec_team( - member_type const& member, reference_type update) const { - m_functor_reducer.get_functor()(member, update); - } - - template - __device__ inline std::enable_if_t::value> exec_team( - member_type const& member, reference_type update) const { - m_functor_reducer.get_functor()(TagType(), member, update); - } - - __device__ inline void iterate_through_league(int const threadid, - reference_type value) const { - int const int_league_size = static_cast(m_league_size); - for (int league_rank = blockIdx.x; league_rank < int_league_size; - league_rank += gridDim.x) { - this->template exec_team( - member_type( - kokkos_impl_hip_shared_memory() + m_team_begin, - m_shmem_begin, m_shmem_size, - reinterpret_cast( - reinterpret_cast(m_scratch_ptr[1]) + - static_cast(threadid / (blockDim.x * blockDim.y)) * - m_scratch_size[1]), - m_scratch_size[1], league_rank, m_league_size), - value); - } - } - - int compute_block_count() const { - constexpr auto light_weight = - Kokkos::Experimental::WorkItemProperty::HintLightWeight; - constexpr typename Policy::work_item_property property; - // Numbers were tuned on MI210 using dot product and yAx benchmarks - constexpr int block_max = - (property & light_weight) == light_weight ? 
2097152 : 65536; - constexpr int preferred_block_min = 1024; - int block_count = m_league_size; - if (block_count < preferred_block_min) { - // keep blocks as is, already low parallelism - } else if (block_count >= block_max) { - block_count = block_max; - - } else { - int nwork = m_league_size * m_team_size; - int items_per_thread = - (nwork + block_count * m_team_size - 1) / (block_count * m_team_size); - if (items_per_thread < 4) { - int ratio = std::min( - (block_count + preferred_block_min - 1) / preferred_block_min, - (4 + items_per_thread - 1) / items_per_thread); - block_count /= ratio; - } - } - - return block_count; - } - - public: - __device__ inline void operator()() const { - int64_t threadid = 0; - if (m_scratch_size[1] > 0) { - threadid = hip_get_scratch_index(m_league_size, m_scratch_locks, - m_num_scratch_locks); - } - - using ReductionTag = std::conditional_t; - run(ReductionTag{}, threadid); - - if (m_scratch_size[1] > 0) { - hip_release_scratch_index(m_scratch_locks, threadid); - } - } - - __device__ inline void run(SHMEMReductionTag, int const threadid) const { - const ReducerType& reducer = m_functor_reducer.get_reducer(); - - integral_nonzero_constant const - word_count(reducer.value_size() / sizeof(size_type)); - - reference_type value = - reducer.init(kokkos_impl_hip_shared_memory() + - threadIdx.y * word_count.value); - // Iterate this block through the league - iterate_through_league(threadid, value); - - // Reduce with final value at blockDim.y - 1 location. - bool do_final_reduce = (m_league_size == 0); - if (!do_final_reduce) - do_final_reduce = - hip_single_inter_block_reduce_scan( - reducer, blockIdx.x, gridDim.x, - kokkos_impl_hip_shared_memory(), m_scratch_space, - m_scratch_flags); - if (do_final_reduce) { - // This is the final block with the final result at the final threads' - // location - - size_type* const shared = kokkos_impl_hip_shared_memory() + - (blockDim.y - 1) * word_count.value; - size_type* const global = m_result_ptr_device_accessible - ? reinterpret_cast(m_result_ptr) - : m_scratch_space; - - if (threadIdx.y == 0) { - reducer.final(reinterpret_cast(shared)); - } - - if (HIPTraits::WarpSize < word_count.value) { - __syncthreads(); - } - - for (unsigned i = threadIdx.y; i < word_count.value; i += blockDim.y) { - global[i] = shared[i]; - } - } - } - - __device__ inline void run(ShflReductionTag, int const threadid) const { - const ReducerType& reducer = m_functor_reducer.get_reducer(); - - value_type value; - reducer.init(&value); - - // Iterate this block through the league - iterate_through_league(threadid, value); - - pointer_type const result = - m_result_ptr_device_accessible - ? 
m_result_ptr - : reinterpret_cast(m_scratch_space); - - value_type init; - reducer.init(&init); - if (m_league_size == 0) { - reducer.final(&value); - *result = value; - } else if (Impl::hip_inter_block_shuffle_reduction( - value, init, reducer, m_scratch_space, result, - m_scratch_flags, blockDim.y)) { - unsigned int const id = threadIdx.y * blockDim.x + threadIdx.x; - if (id == 0) { - reducer.final(&value); - *result = value; - } - } - } - - inline void execute() { - const ReducerType& reducer = m_functor_reducer.get_reducer(); - - const bool is_empty_range = m_league_size == 0 || m_team_size == 0; - const bool need_device_set = ReducerType::has_init_member_function() || - ReducerType::has_final_member_function() || - !m_result_ptr_host_accessible || - Policy::is_graph_kernel::value || - !std::is_same::value; - if (!is_empty_range || need_device_set) { - int const block_count = compute_block_count(); - - m_scratch_space = hip_internal_scratch_space( - m_policy.space(), reducer.value_size() * block_count); - m_scratch_flags = - hip_internal_scratch_flags(m_policy.space(), sizeof(size_type)); - - dim3 block(m_vector_size, m_team_size, 1); - dim3 grid(block_count, 1, 1); - if (is_empty_range) { - block = dim3(1, 1, 1); - grid = dim3(1, 1, 1); - } - const int shmem_size_total = m_team_begin + m_shmem_begin + m_shmem_size; - - Impl::hip_parallel_launch( - *this, grid, block, shmem_size_total, - m_policy.space().impl_internal_space_instance(), - true); // copy to device and execute - - if (!m_result_ptr_device_accessible) { - m_policy.space().impl_internal_space_instance()->fence(); - - if (m_result_ptr) { - const int size = reducer.value_size(); - DeepCopy(m_result_ptr, m_scratch_space, size); - } - } - } else { - if (m_result_ptr) { - reducer.init(m_result_ptr); - } - } - } - - template - ParallelReduce(CombinedFunctorReducerType const& arg_functor_reducer, - Policy const& arg_policy, ViewType const& arg_result) - : m_functor_reducer(arg_functor_reducer), - m_policy(arg_policy), - m_result_ptr(arg_result.data()), - m_result_ptr_device_accessible( - MemorySpaceAccess::accessible), - m_result_ptr_host_accessible( - MemorySpaceAccess::accessible), - m_scratch_space(nullptr), - m_scratch_flags(nullptr), - m_team_begin(0), - m_shmem_begin(0), - m_shmem_size(0), - m_scratch_ptr{nullptr, nullptr}, - m_league_size(arg_policy.league_size()), - m_team_size(arg_policy.team_size()), - m_vector_size(arg_policy.impl_vector_length()) { - auto internal_space_instance = - m_policy.space().impl_internal_space_instance(); - m_team_size = m_team_size >= 0 ? m_team_size - : arg_policy.team_size_recommended( - arg_functor_reducer.get_functor(), - arg_functor_reducer.get_reducer(), - ParallelReduceTag()); - - m_team_begin = - UseShflReduction - ? 
0 - : hip_single_inter_block_reduce_scan_shmem( - arg_functor_reducer.get_functor(), m_team_size); - m_shmem_begin = sizeof(double) * (m_team_size + 2); - m_shmem_size = m_policy.scratch_size(0, m_team_size) + - FunctorTeamShmemSize::value( - arg_functor_reducer.get_functor(), m_team_size); - m_scratch_size[0] = m_shmem_size; - m_scratch_size[1] = m_policy.scratch_size(1, m_team_size); - m_scratch_locks = internal_space_instance->m_scratch_locks; - m_num_scratch_locks = internal_space_instance->m_num_scratch_locks; - if (m_team_size <= 0) { - m_scratch_ptr[1] = nullptr; - } else { - m_scratch_pool_id = internal_space_instance->acquire_team_scratch_space(); - m_scratch_ptr[1] = internal_space_instance->resize_team_scratch_space( - m_scratch_pool_id, - static_cast(m_scratch_size[1]) * - (std::min( - static_cast(HIP().concurrency() / - (m_team_size * m_vector_size)), - static_cast(m_league_size)))); - } - - // The global parallel_reduce does not support vector_length other than 1 at - // the moment - if ((arg_policy.impl_vector_length() > 1) && !UseShflReduction) - Impl::throw_runtime_exception( - "Kokkos::parallel_reduce with a TeamPolicy using a vector length of " - "greater than 1 is not currently supported for HIP for dynamic " - "sized reduction types."); - - if ((m_team_size < HIPTraits::WarpSize) && !UseShflReduction) - Impl::throw_runtime_exception( - "Kokkos::parallel_reduce with a TeamPolicy using a team_size smaller " - "than 64 is not currently supported with HIP for dynamic sized " - "reduction types."); - - // Functor's reduce memory, team scan memory, and team shared memory depend - // upon team size. - - const int shmem_size_total = m_team_begin + m_shmem_begin + m_shmem_size; - - if (!Kokkos::Impl::is_integral_power_of_two(m_team_size) && - !UseShflReduction) { - Kokkos::Impl::throw_runtime_exception( - std::string("Kokkos::Impl::ParallelReduce< HIP > bad team size")); - } - - if (internal_space_instance->m_maxShmemPerBlock < shmem_size_total) { - Kokkos::Impl::throw_runtime_exception( - std::string("Kokkos::Impl::ParallelReduce< HIP > requested too much " - "L0 scratch memory")); - } - - size_t max_size = arg_policy.team_size_max( - arg_functor_reducer.get_functor(), arg_functor_reducer.get_reducer(), - ParallelReduceTag()); - if (static_cast(m_team_size) > static_cast(max_size)) { - Kokkos::Impl::throw_runtime_exception( - std::string("Kokkos::Impl::ParallelReduce< HIP > requested too " - "large team size.")); - } - } - - ~ParallelReduce() { - if (m_scratch_pool_id >= 0) { - m_policy.space() - .impl_internal_space_instance() - ->release_team_scratch_space(m_scratch_pool_id); - } - } -}; -} // namespace Impl -} // namespace Kokkos - -#endif - -#endif diff --git a/core/src/HIP/Kokkos_HIP_SharedAllocationRecord.cpp b/core/src/HIP/Kokkos_HIP_SharedAllocationRecord.cpp index ea599989e7a..ab24004f5fc 100644 --- a/core/src/HIP/Kokkos_HIP_SharedAllocationRecord.cpp +++ b/core/src/HIP/Kokkos_HIP_SharedAllocationRecord.cpp @@ -18,138 +18,14 @@ #define KOKKOS_IMPL_PUBLIC_INCLUDE #endif -#include -#include #include - -namespace Kokkos { -namespace Impl { - -#ifdef KOKKOS_ENABLE_DEBUG -SharedAllocationRecord - SharedAllocationRecord::s_root_record; - -SharedAllocationRecord - SharedAllocationRecord::s_root_record; - -SharedAllocationRecord - SharedAllocationRecord::s_root_record; -#endif - -SharedAllocationRecord::~SharedAllocationRecord() { - auto alloc_size = SharedAllocationRecord::m_alloc_size; - m_space.deallocate(m_label.c_str(), - SharedAllocationRecord::m_alloc_ptr, - 
alloc_size, (alloc_size - sizeof(SharedAllocationHeader))); -} - -SharedAllocationRecord::~SharedAllocationRecord() { - m_space.deallocate(m_label.c_str(), - SharedAllocationRecord::m_alloc_ptr, - SharedAllocationRecord::m_alloc_size); -} - -SharedAllocationRecord::~SharedAllocationRecord() { - m_space.deallocate(m_label.c_str(), - SharedAllocationRecord::m_alloc_ptr, - SharedAllocationRecord::m_alloc_size); -} - -SharedAllocationRecord::SharedAllocationRecord( - const HIPSpace& arg_space, const std::string& arg_label, - const size_t arg_alloc_size, - const SharedAllocationRecord::function_type arg_dealloc) - // Pass through allocated [ SharedAllocationHeader , user_memory ] - // Pass through deallocation function - : base_t( -#ifdef KOKKOS_ENABLE_DEBUG - &SharedAllocationRecord::s_root_record, -#endif - Kokkos::Impl::checked_allocation_with_header(arg_space, arg_label, - arg_alloc_size), - sizeof(SharedAllocationHeader) + arg_alloc_size, arg_dealloc, - arg_label), - m_space(arg_space) { - - SharedAllocationHeader header; - - this->base_t::_fill_host_accessible_header_info(header, arg_label); - - // Copy to device memory - HIP exec; - Kokkos::Impl::DeepCopy( - exec, RecordBase::m_alloc_ptr, &header, sizeof(SharedAllocationHeader)); - exec.fence( - "SharedAllocationRecord::SharedAllocationRecord(): fence after copying header from " - "HostSpace"); -} - -SharedAllocationRecord::SharedAllocationRecord( - const HIP& arg_exec_space, const HIPSpace& arg_space, - const std::string& arg_label, const size_t arg_alloc_size, - const SharedAllocationRecord::function_type arg_dealloc) - // Pass through allocated [ SharedAllocationHeader , user_memory ] - // Pass through deallocation function - : base_t( -#ifdef KOKKOS_ENABLE_DEBUG - &SharedAllocationRecord::s_root_record, -#endif - Kokkos::Impl::checked_allocation_with_header(arg_space, arg_label, - arg_alloc_size), - sizeof(SharedAllocationHeader) + arg_alloc_size, arg_dealloc, - arg_label), - m_space(arg_space) { - - SharedAllocationHeader header; - - this->base_t::_fill_host_accessible_header_info(header, arg_label); - - // Copy to device memory - Kokkos::Impl::DeepCopy(arg_exec_space, - RecordBase::m_alloc_ptr, &header, - sizeof(SharedAllocationHeader)); -} - -SharedAllocationRecord::SharedAllocationRecord( - const HIPHostPinnedSpace& arg_space, const std::string& arg_label, - const size_t arg_alloc_size, - const SharedAllocationRecord::function_type arg_dealloc) - // Pass through allocated [ SharedAllocationHeader , user_memory ] - // Pass through deallocation function - : base_t( -#ifdef KOKKOS_ENABLE_DEBUG - &SharedAllocationRecord::s_root_record, -#endif - Kokkos::Impl::checked_allocation_with_header(arg_space, arg_label, - arg_alloc_size), - sizeof(SharedAllocationHeader) + arg_alloc_size, arg_dealloc, - arg_label), - m_space(arg_space) { - // Fill in the Header information, directly accessible via host pinned memory - this->base_t::_fill_host_accessible_header_info(*RecordBase::m_alloc_ptr, - arg_label); -} - -SharedAllocationRecord::SharedAllocationRecord( - const HIPManagedSpace& arg_space, const std::string& arg_label, - const size_t arg_alloc_size, - const SharedAllocationRecord::function_type arg_dealloc) - // Pass through allocated [ SharedAllocationHeader , user_memory ] - // Pass through deallocation function - : base_t( -#ifdef KOKKOS_ENABLE_DEBUG - &SharedAllocationRecord::s_root_record, -#endif - Kokkos::Impl::checked_allocation_with_header(arg_space, arg_label, - arg_alloc_size), - sizeof(SharedAllocationHeader) + arg_alloc_size, 
arg_dealloc, - arg_label), - m_space(arg_space) { - // Fill in the Header information, directly accessible via managed memory - this->base_t::_fill_host_accessible_header_info(*RecordBase::m_alloc_ptr, - arg_label); -} - -} // namespace Impl -} // namespace Kokkos +#include +#include +#include + +KOKKOS_IMPL_HOST_INACCESSIBLE_SHARED_ALLOCATION_RECORD_EXPLICIT_INSTANTIATION( + Kokkos::HIPSpace); +KOKKOS_IMPL_SHARED_ALLOCATION_RECORD_EXPLICIT_INSTANTIATION( + Kokkos::HIPHostPinnedSpace); +KOKKOS_IMPL_SHARED_ALLOCATION_RECORD_EXPLICIT_INSTANTIATION( + Kokkos::HIPManagedSpace); diff --git a/core/src/HIP/Kokkos_HIP_SharedAllocationRecord.hpp b/core/src/HIP/Kokkos_HIP_SharedAllocationRecord.hpp index e68bad97230..fbae5188344 100644 --- a/core/src/HIP/Kokkos_HIP_SharedAllocationRecord.hpp +++ b/core/src/HIP/Kokkos_HIP_SharedAllocationRecord.hpp @@ -18,120 +18,11 @@ #define KOKKOS_HIP_SHARED_ALLOCATION_RECORD_HPP #include +#include -namespace Kokkos { -namespace Impl { - -template <> -class SharedAllocationRecord - : public HostInaccessibleSharedAllocationRecordCommon { - private: - friend class SharedAllocationRecordCommon; - friend class HostInaccessibleSharedAllocationRecordCommon; - using base_t = HostInaccessibleSharedAllocationRecordCommon; - using RecordBase = SharedAllocationRecord; - - SharedAllocationRecord(const SharedAllocationRecord&) = delete; - SharedAllocationRecord& operator=(const SharedAllocationRecord&) = delete; - -#ifdef KOKKOS_ENABLE_DEBUG - static RecordBase s_root_record; -#endif - - const HIPSpace m_space; - - protected: - ~SharedAllocationRecord(); - - template - SharedAllocationRecord( - const ExecutionSpace& /*exec*/, const HIPSpace& arg_space, - const std::string& arg_label, const size_t arg_alloc_size, - const RecordBase::function_type arg_dealloc = &base_t::deallocate) - : SharedAllocationRecord(arg_space, arg_label, arg_alloc_size, - arg_dealloc) {} - - SharedAllocationRecord( - const HIP& exec_space, const HIPSpace& arg_space, - const std::string& arg_label, const size_t arg_alloc_size, - const RecordBase::function_type arg_dealloc = &base_t::deallocate); - - SharedAllocationRecord( - const HIPSpace& arg_space, const std::string& arg_label, - const size_t arg_alloc_size, - const RecordBase::function_type arg_dealloc = &base_t::deallocate); -}; - -template <> -class SharedAllocationRecord - : public SharedAllocationRecordCommon { - private: - friend class SharedAllocationRecordCommon; - using base_t = SharedAllocationRecordCommon; - using RecordBase = SharedAllocationRecord; - - SharedAllocationRecord(const SharedAllocationRecord&) = delete; - SharedAllocationRecord& operator=(const SharedAllocationRecord&) = delete; - -#ifdef KOKKOS_ENABLE_DEBUG - static RecordBase s_root_record; -#endif - - const HIPHostPinnedSpace m_space; - - protected: - ~SharedAllocationRecord(); - SharedAllocationRecord() = default; - - template - SharedAllocationRecord( - const ExecutionSpace& /*exec_space*/, const HIPHostPinnedSpace& arg_space, - const std::string& arg_label, const size_t arg_alloc_size, - const RecordBase::function_type arg_dealloc = &base_t::deallocate) - : SharedAllocationRecord(arg_space, arg_label, arg_alloc_size, - arg_dealloc) {} - - SharedAllocationRecord( - const HIPHostPinnedSpace& arg_space, const std::string& arg_label, - const size_t arg_alloc_size, - const RecordBase::function_type arg_dealloc = &base_t::deallocate); -}; - -template <> -class SharedAllocationRecord - : public SharedAllocationRecordCommon { - private: - friend class 
SharedAllocationRecordCommon<Kokkos::HIPManagedSpace>;
-  using base_t     = SharedAllocationRecordCommon<Kokkos::HIPManagedSpace>;
-  using RecordBase = SharedAllocationRecord<void, void>;
-
-  SharedAllocationRecord(const SharedAllocationRecord&) = delete;
-  SharedAllocationRecord& operator=(const SharedAllocationRecord&) = delete;
-
-#ifdef KOKKOS_ENABLE_DEBUG
-  static RecordBase s_root_record;
-#endif
-
-  const HIPManagedSpace m_space;
-
- protected:
-  ~SharedAllocationRecord();
-  SharedAllocationRecord() = default;
-
-  template <typename ExecutionSpace>
-  SharedAllocationRecord(
-      const ExecutionSpace& /*exec_space*/, const HIPManagedSpace& arg_space,
-      const std::string& arg_label, const size_t arg_alloc_size,
-      const RecordBase::function_type arg_dealloc = &base_t::deallocate)
-      : SharedAllocationRecord(arg_space, arg_label, arg_alloc_size,
-                               arg_dealloc) {}
-
-  SharedAllocationRecord(
-      const HIPManagedSpace& arg_space, const std::string& arg_label,
-      const size_t arg_alloc_size,
-      const RecordBase::function_type arg_dealloc = &base_t::deallocate);
-};
-}  // namespace Impl
-}  // namespace Kokkos
+KOKKOS_IMPL_HOST_INACCESSIBLE_SHARED_ALLOCATION_SPECIALIZATION(
+    Kokkos::HIPSpace);
+KOKKOS_IMPL_SHARED_ALLOCATION_SPECIALIZATION(Kokkos::HIPHostPinnedSpace);
+KOKKOS_IMPL_SHARED_ALLOCATION_SPECIALIZATION(Kokkos::HIPManagedSpace);
 
 #endif
diff --git a/core/src/HIP/Kokkos_HIP_Space.cpp b/core/src/HIP/Kokkos_HIP_Space.cpp
index 7f6aa0d8e82..e8bdfca66fe 100644
--- a/core/src/HIP/Kokkos_HIP_Space.cpp
+++ b/core/src/HIP/Kokkos_HIP_Space.cpp
@@ -24,10 +24,8 @@
 #include
 #include
-#include
 #include
-#include
 #include
 #include
@@ -287,22 +285,3 @@ void HIPManagedSpace::impl_deallocate(
 }
 
 }  // namespace Kokkos
-
-/*--------------------------------------------------------------------------*/
-/*--------------------------------------------------------------------------*/
-
-#include <impl/Kokkos_SharedAlloc_timpl.hpp>
-
-namespace Kokkos {
-namespace Impl {
-
-// To avoid additional compilation cost for something that's (mostly?) not
-// performance sensitive, we explicitly instantiate these CRTP base classes
-// here, where we have access to the associated *_timpl.hpp header files.
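The instantiations this comment introduces follow the classic explicit-instantiation recipe; reduced to a self-contained miniature with purely illustrative names (the patch replaces the hand-written form below with the KOKKOS_IMPL_*_EXPLICIT_INSTANTIATION macros seen earlier in this diff):

struct SpaceStandIn {};  // stand-in for a memory space type

// In the header: declaration only, plus an extern template declaration that
// suppresses implicit instantiation in every other translation unit.
template <class MemorySpace>
class RecordCommonSketch {
 public:
  void deallocate_all();
};
extern template class RecordCommonSketch<SpaceStandIn>;

// In the *_timpl-style header, included by exactly one .cpp: the definition...
template <class MemorySpace>
void RecordCommonSketch<MemorySpace>::deallocate_all() {}

// ...and the explicit instantiation that pays the compile cost exactly once.
template class RecordCommonSketch<SpaceStandIn>;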
-template class HostInaccessibleSharedAllocationRecordCommon; -template class SharedAllocationRecordCommon; -template class SharedAllocationRecordCommon; -template class SharedAllocationRecordCommon; - -} // end namespace Impl -} // end namespace Kokkos diff --git a/core/src/HIP/Kokkos_HIP_Space.hpp b/core/src/HIP/Kokkos_HIP_Space.hpp index f3e5adf87e5..7f2004e5cbc 100644 --- a/core/src/HIP/Kokkos_HIP_Space.hpp +++ b/core/src/HIP/Kokkos_HIP_Space.hpp @@ -65,6 +65,15 @@ class HIPSpace { ~HIPSpace() = default; /**\brief Allocate untracked memory in the hip space */ + // FIXME_HIP Use execution space instance + void* allocate(const HIP&, const size_t arg_alloc_size) const { + return allocate(arg_alloc_size); + } + // FIXME_HIP Use execution space instance + void* allocate(const HIP&, const char* arg_label, const size_t arg_alloc_size, + const size_t arg_logical_size = 0) const { + return allocate(arg_label, arg_alloc_size, arg_logical_size); + } void* allocate(const size_t arg_alloc_size) const; void* allocate(const char* arg_label, const size_t arg_alloc_size, const size_t arg_logical_size = 0) const; @@ -76,8 +85,6 @@ class HIPSpace { const size_t arg_logical_size = 0) const; private: - template - friend class LogicalMemorySpace; void* impl_allocate(const char* arg_label, const size_t arg_alloc_size, const size_t arg_logical_size = 0, const Kokkos::Tools::SpaceHandle = @@ -94,8 +101,6 @@ class HIPSpace { private: int m_device; ///< Which HIP device - - friend class Kokkos::Impl::SharedAllocationRecord; }; template <> @@ -129,6 +134,16 @@ class HIPHostPinnedSpace { ~HIPHostPinnedSpace() = default; /**\brief Allocate untracked memory in the space */ + template + void* allocate(const ExecutionSpace&, const size_t arg_alloc_size) const { + return allocate(arg_alloc_size); + } + template + void* allocate(const ExecutionSpace&, const char* arg_label, + const size_t arg_alloc_size, + const size_t arg_logical_size = 0) const { + return allocate(arg_label, arg_alloc_size, arg_logical_size); + } void* allocate(const size_t arg_alloc_size) const; void* allocate(const char* arg_label, const size_t arg_alloc_size, const size_t arg_logical_size = 0) const; @@ -140,8 +155,6 @@ class HIPHostPinnedSpace { const size_t arg_logical_size = 0) const; private: - template - friend class LogicalMemorySpace; void* impl_allocate(const char* arg_label, const size_t arg_alloc_size, const size_t arg_logical_size = 0, const Kokkos::Tools::SpaceHandle = @@ -194,6 +207,16 @@ class HIPManagedSpace { ~HIPManagedSpace() = default; /**\brief Allocate untracked memory in the space */ + template + void* allocate(const ExecutionSpace&, const size_t arg_alloc_size) const { + return allocate(arg_alloc_size); + } + template + void* allocate(const ExecutionSpace&, const char* arg_label, + const size_t arg_alloc_size, + const size_t arg_logical_size = 0) const { + return allocate(arg_label, arg_alloc_size, arg_logical_size); + } void* allocate(const size_t arg_alloc_size) const; void* allocate(const char* arg_label, const size_t arg_alloc_size, const size_t arg_logical_size = 0) const; @@ -209,8 +232,6 @@ class HIPManagedSpace { private: int m_device; ///< Which HIP device - template - friend class LogicalMemorySpace; void* impl_allocate(const char* arg_label, const size_t arg_alloc_size, const size_t arg_logical_size = 0, const Kokkos::Tools::SpaceHandle = @@ -239,8 +260,7 @@ struct Impl::is_hip_type_space : public std::true_type {}; namespace Kokkos { namespace Impl { -static_assert(Kokkos::Impl::MemorySpaceAccess::assignable, - 
""); +static_assert(Kokkos::Impl::MemorySpaceAccess::assignable); //---------------------------------------- diff --git a/core/src/HIP/Kokkos_HIP_TeamPolicyInternal.hpp b/core/src/HIP/Kokkos_HIP_TeamPolicyInternal.hpp new file mode 100644 index 00000000000..67e1181125c --- /dev/null +++ b/core/src/HIP/Kokkos_HIP_TeamPolicyInternal.hpp @@ -0,0 +1,421 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOS_HIP_TEAM_POLICY_INTERNAL_HPP +#define KOKKOS_HIP_TEAM_POLICY_INTERNAL_HPP + +#include + +namespace Kokkos { +namespace Impl { + +template +class TeamPolicyInternal + : public PolicyTraits { + public: + using execution_policy = TeamPolicyInternal; + + using traits = PolicyTraits; + + template + friend class TeamPolicyInternal; + + private: + typename traits::execution_space m_space; + int m_league_size; + int m_team_size; + int m_vector_length; + size_t m_team_scratch_size[2]; + size_t m_thread_scratch_size[2]; + int m_chunk_size; + bool m_tune_team_size; + bool m_tune_vector_length; + + public: + using execution_space = HIP; + + template + TeamPolicyInternal(TeamPolicyInternal const& p) { + m_league_size = p.m_league_size; + m_team_size = p.m_team_size; + m_vector_length = p.m_vector_length; + m_team_scratch_size[0] = p.m_team_scratch_size[0]; + m_team_scratch_size[1] = p.m_team_scratch_size[1]; + m_thread_scratch_size[0] = p.m_thread_scratch_size[0]; + m_thread_scratch_size[1] = p.m_thread_scratch_size[1]; + m_chunk_size = p.m_chunk_size; + m_space = p.m_space; + m_tune_team_size = p.m_tune_team_size; + m_tune_vector_length = p.m_tune_vector_length; + } + + template + int team_size_max(FunctorType const& f, ParallelForTag const&) const { + using closure_type = + Impl::ParallelFor>; + + return internal_team_size_common(f); + } + + template + inline int team_size_max(const FunctorType& f, + const ParallelReduceTag&) const { + using functor_analysis_type = + Impl::FunctorAnalysis; + using closure_type = Impl::ParallelReduce< + CombinedFunctorReducer, + TeamPolicy, Kokkos::HIP>; + return internal_team_size_common< + BlockType::Max, closure_type, + typename functor_analysis_type::value_type>(f); + } + + template + inline int team_size_max(const FunctorType& f, const ReducerType&, + const ParallelReduceTag&) const { + using closure_type = + Impl::ParallelReduce, + TeamPolicy, Kokkos::HIP>; + return internal_team_size_common(f); + } + + template + int team_size_recommended(FunctorType const& f, ParallelForTag const&) const { + using closure_type = + Impl::ParallelFor>; + + return internal_team_size_common( + f); + } + + template + inline int team_size_recommended(FunctorType const& f, + ParallelReduceTag const&) const { + using functor_analysis_type = + Impl::FunctorAnalysis; + using closure_type = Impl::ParallelReduce< + CombinedFunctorReducer, + TeamPolicy, Kokkos::HIP>; + return internal_team_size_common< + BlockType::Preferred, closure_type, + typename functor_analysis_type::value_type>(f); + } + + template + int team_size_recommended(FunctorType const& f, ReducerType const&, + ParallelReduceTag 
const&) const { + using closure_type = + Impl::ParallelReduce, + TeamPolicy, Kokkos::HIP>; + return internal_team_size_common(f); + } + + inline bool impl_auto_vector_length() const { return m_tune_vector_length; } + inline bool impl_auto_team_size() const { return m_tune_team_size; } + static int vector_length_max() { return HIPTraits::WarpSize; } + + static int verify_requested_vector_length(int requested_vector_length) { + int test_vector_length = + std::min(requested_vector_length, vector_length_max()); + + // Allow only power-of-two vector_length + if (!(is_integral_power_of_two(test_vector_length))) { + int test_pow2 = 1; + constexpr int warp_size = HIPTraits::WarpSize; + while (test_pow2 < warp_size) { + test_pow2 <<= 1; + if (test_pow2 > test_vector_length) { + break; + } + } + test_vector_length = test_pow2 >> 1; + } + + return test_vector_length; + } + + inline static int scratch_size_max(int level) { + // HIP Teams use (team_size + 2)*sizeof(double) shared memory for team + // reductions. They also use one int64_t in static shared memory for a + // shared ID. Furthermore, they use additional scratch memory in some + // reduction scenarios, which depend on the size of the value_type and is + // NOT captured here + constexpr size_t max_possible_team_size = 1024; + constexpr size_t max_reserved_shared_mem_per_team = + (max_possible_team_size + 2) * sizeof(double) + sizeof(int64_t); + // arbitrarily setting level 1 scratch limit to 20MB, for a + // MI250 that would give us about 4.4GB for 2 teams per CU + constexpr size_t max_l1_scratch_size = 20 * 1024 * 1024; + + size_t max_shmem = HIP().hip_device_prop().sharedMemPerBlock; + return (level == 0 ? max_shmem - max_reserved_shared_mem_per_team + : max_l1_scratch_size); + } + + inline void impl_set_vector_length(size_t size) { m_vector_length = size; } + inline void impl_set_team_size(size_t size) { m_team_size = size; } + int impl_vector_length() const { return m_vector_length; } + + int team_size() const { return m_team_size; } + + int league_size() const { return m_league_size; } + + size_t scratch_size(int level, int team_size_ = -1) const { + if (team_size_ < 0) team_size_ = m_team_size; + return m_team_scratch_size[level] + + team_size_ * m_thread_scratch_size[level]; + } + + size_t team_scratch_size(int level) const { + return m_team_scratch_size[level]; + } + + size_t thread_scratch_size(int level) const { + return m_thread_scratch_size[level]; + } + + typename traits::execution_space space() const { return m_space; } + + TeamPolicyInternal() + : m_space(typename traits::execution_space()), + m_league_size(0), + m_team_size(-1), + m_vector_length(0), + m_team_scratch_size{0, 0}, + m_thread_scratch_size{0, 0}, + m_chunk_size(HIPTraits::WarpSize), + m_tune_team_size(false), + m_tune_vector_length(false) {} + + /** \brief Specify league size, request team size */ + TeamPolicyInternal(const execution_space space_, int league_size_, + int team_size_request, int vector_length_request = 1) + : m_space(space_), + m_league_size(league_size_), + m_team_size(team_size_request), + m_vector_length( + (vector_length_request > 0) + ? 
verify_requested_vector_length(vector_length_request) + : (verify_requested_vector_length(1))), + m_team_scratch_size{0, 0}, + m_thread_scratch_size{0, 0}, + m_chunk_size(HIPTraits::WarpSize), + m_tune_team_size(bool(team_size_request <= 0)), + m_tune_vector_length(bool(vector_length_request <= 0)) { + // Make sure league size is permissible + if (league_size_ >= static_cast(hip_internal_maximum_grid_count()[0])) + Impl::throw_runtime_exception( + "Requested too large league_size for TeamPolicy on HIP execution " + "space."); + + // Make sure total block size is permissible + if (m_team_size * m_vector_length > HIPTraits::MaxThreadsPerBlock) { + Impl::throw_runtime_exception( + std::string("Kokkos::TeamPolicy< HIP > the team size is too large. " + "Team size x vector length must be smaller than 1024.")); + } + } + + /** \brief Specify league size, request team size */ + TeamPolicyInternal(const execution_space space_, int league_size_, + const Kokkos::AUTO_t& /* team_size_request */, + int vector_length_request = 1) + : TeamPolicyInternal(space_, league_size_, -1, vector_length_request) {} + // FLAG + /** \brief Specify league size and team size, request vector length*/ + TeamPolicyInternal(const execution_space space_, int league_size_, + int team_size_request, + const Kokkos::AUTO_t& /* vector_length_request */ + ) + : TeamPolicyInternal(space_, league_size_, team_size_request, -1) + + {} + + /** \brief Specify league size, request team size and vector length*/ + TeamPolicyInternal(const execution_space space_, int league_size_, + const Kokkos::AUTO_t& /* team_size_request */, + const Kokkos::AUTO_t& /* vector_length_request */ + + ) + : TeamPolicyInternal(space_, league_size_, -1, -1) + + {} + + TeamPolicyInternal(int league_size_, int team_size_request, + int vector_length_request = 1) + : TeamPolicyInternal(typename traits::execution_space(), league_size_, + team_size_request, vector_length_request) {} + + TeamPolicyInternal(int league_size_, + const Kokkos::AUTO_t& /* team_size_request */, + int vector_length_request = 1) + : TeamPolicyInternal(typename traits::execution_space(), league_size_, -1, + vector_length_request) {} + + /** \brief Specify league size and team size, request vector length*/ + TeamPolicyInternal(int league_size_, int team_size_request, + const Kokkos::AUTO_t& /* vector_length_request */ + + ) + : TeamPolicyInternal(typename traits::execution_space(), league_size_, + team_size_request, -1) + + {} + + /** \brief Specify league size, request team size and vector length*/ + TeamPolicyInternal(int league_size_, + const Kokkos::AUTO_t& /* team_size_request */, + const Kokkos::AUTO_t& /* vector_length_request */ + + ) + : TeamPolicyInternal(typename traits::execution_space(), league_size_, -1, + -1) {} + + int chunk_size() const { return m_chunk_size; } + + TeamPolicyInternal& set_chunk_size(typename traits::index_type chunk_size_) { + m_chunk_size = chunk_size_; + return *this; + } + + /** \brief set per team scratch size for a specific level of the scratch + * hierarchy */ + TeamPolicyInternal& set_scratch_size(int level, + PerTeamValue const& per_team) { + m_team_scratch_size[level] = per_team.value; + return *this; + } + + /** \brief set per thread scratch size for a specific level of the scratch + * hierarchy */ + TeamPolicyInternal& set_scratch_size(int level, + PerThreadValue const& per_thread) { + m_thread_scratch_size[level] = per_thread.value; + return *this; + } + + /** \brief set per thread and per team scratch size for a specific level of + * the scratch 
hierarchy */ + TeamPolicyInternal& set_scratch_size(int level, PerTeamValue const& per_team, + PerThreadValue const& per_thread) { + m_team_scratch_size[level] = per_team.value; + m_thread_scratch_size[level] = per_thread.value; + return *this; + } + + using member_type = Kokkos::Impl::HIPTeamMember; + + protected: + template + int internal_team_size_common(FunctorType const& f) const { + const unsigned shmem_block = team_scratch_size(0) + 2 * sizeof(double); + unsigned shmem_thread = thread_scratch_size(0) + sizeof(double); + using Tag = typename PatternTagFromImplSpecialization::type; + if constexpr (std::is_same_v) { + using Interface = + typename Impl::DeduceFunctorPatternInterface::type; + using Analysis = + Impl::FunctorAnalysis; + shmem_thread += + ((Analysis::StaticValueSize != 0) ? 0 : Analysis::value_size(f)); + } + const int vector_length = impl_vector_length(); + + const auto functor = [&f, shmem_block, shmem_thread, vector_length]( + const hipFuncAttributes& attr, int block_size) { + int functor_shmem = + ::Kokkos::Impl::FunctorTeamShmemSize::value( + f, block_size / vector_length); + return shmem_block + shmem_thread * (block_size / vector_length) + + functor_shmem + attr.sharedSizeBytes; + }; + int block_size; + if constexpr (BlockSize == BlockType::Max) { + block_size = hip_get_max_team_blocksize( + space().impl_internal_space_instance(), functor); + } else { + block_size = + hip_get_preferred_team_blocksize( + space().impl_internal_space_instance(), functor); + } + + if (block_size == 0) { + Kokkos::Impl::throw_runtime_exception(std::string( + "Kokkos::Impl::ParallelFor/Reduce< HIP > could not find a valid " + "team size.")); + } + if constexpr (std::is_same_v) { + return block_size / impl_vector_length(); + } else { + // Currently we require Power-of-2 team size for reductions. 
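// For example, with block_size = 96 the loop below grows p2 through
// 2, 4, 8, ..., 128 and the final halving yields 64, i.e. the largest
// power of two not exceeding block_size; dividing by the vector length
// then gives the power-of-two team size handed back for reductions.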
+ int p2 = 1; + while (p2 <= block_size) p2 *= 2; + p2 /= 2; + return p2 / impl_vector_length(); + } + } +}; + +__device__ inline int64_t hip_get_scratch_index(HIP::size_type league_size, + int32_t* scratch_locks, + size_t num_scratch_locks) { + int64_t threadid = 0; + __shared__ int64_t base_thread_id; + if (threadIdx.x == 0 && threadIdx.y == 0) { + int64_t const wraparound_len = + Kokkos::min(int64_t(league_size), + int64_t(num_scratch_locks) / (blockDim.x * blockDim.y)); + threadid = (blockIdx.x * blockDim.z + threadIdx.z) % wraparound_len; + threadid *= blockDim.x * blockDim.y; + int done = 0; + while (!done) { + done = (0 == atomicCAS(&scratch_locks[threadid], 0, 1)); + if (!done) { + threadid += blockDim.x * blockDim.y; + if (int64_t(threadid + blockDim.x * blockDim.y) >= + wraparound_len * blockDim.x * blockDim.y) + threadid = 0; + } + } + base_thread_id = threadid; + } + __syncthreads(); + threadid = base_thread_id; + return threadid; +} + +__device__ inline void hip_release_scratch_index(int32_t* scratch_locks, + int64_t threadid) { + __syncthreads(); + if (threadIdx.x == 0 && threadIdx.y == 0) { + scratch_locks[threadid] = 0; + } +} + +} // namespace Impl +} // namespace Kokkos + +#endif diff --git a/core/src/HIP/Kokkos_HIP_UniqueToken.hpp b/core/src/HIP/Kokkos_HIP_UniqueToken.hpp index 313e5f52172..3d70b596463 100644 --- a/core/src/HIP/Kokkos_HIP_UniqueToken.hpp +++ b/core/src/HIP/Kokkos_HIP_UniqueToken.hpp @@ -19,7 +19,6 @@ #include #include -#include namespace Kokkos { diff --git a/core/src/HIP/Kokkos_HIP_ZeroMemset.hpp b/core/src/HIP/Kokkos_HIP_ZeroMemset.hpp index 5c40d0fbc8d..4bca29868f7 100644 --- a/core/src/HIP/Kokkos_HIP_ZeroMemset.hpp +++ b/core/src/HIP/Kokkos_HIP_ZeroMemset.hpp @@ -25,19 +25,11 @@ namespace Impl { template struct ZeroMemset> { - ZeroMemset(const HIP& exec_space, const View& dst, - typename View::const_value_type&) { + ZeroMemset(const HIP& exec_space, const View& dst) { KOKKOS_IMPL_HIP_SAFE_CALL(hipMemsetAsync( dst.data(), 0, dst.size() * sizeof(typename View::value_type), exec_space.hip_stream())); } - - ZeroMemset(const View& dst, - typename View::const_value_type&) { - KOKKOS_IMPL_HIP_SAFE_CALL( - hipMemset(dst.data(), 0, - dst.size() * sizeof(typename View::value_type))); - } }; } // namespace Impl diff --git a/core/src/HPX/Kokkos_HPX.cpp b/core/src/HPX/Kokkos_HPX.cpp index 4a40ffcaa4f..6d541a64148 100644 --- a/core/src/HPX/Kokkos_HPX.cpp +++ b/core/src/HPX/Kokkos_HPX.cpp @@ -103,6 +103,7 @@ void HPX::print_configuration(std::ostream &os, const bool) const { os << hpx::configuration_string() << '\n'; } +#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 bool &HPX::impl_get_in_parallel() noexcept { static thread_local bool in_parallel = false; return in_parallel; @@ -127,6 +128,7 @@ HPX::impl_not_in_parallel_scope::~impl_not_in_parallel_scope() noexcept { KOKKOS_EXPECTS(!impl_get_in_parallel()); impl_get_in_parallel() = true; } +#endif void HPX::impl_decrement_active_parallel_region_count() { std::unique_lock l(m_active_parallel_region_count_mutex); diff --git a/core/src/HPX/Kokkos_HPX.hpp b/core/src/HPX/Kokkos_HPX.hpp index 1dfc5b40646..26181a7c05d 100644 --- a/core/src/HPX/Kokkos_HPX.hpp +++ b/core/src/HPX/Kokkos_HPX.hpp @@ -27,14 +27,6 @@ static_assert(false, #include -#include -#include -#include - -#ifdef KOKKOS_ENABLE_HBWSPACE -#include -#endif - #include #include #include @@ -59,6 +51,7 @@ static_assert(false, #include +#include #include #include #include @@ -201,6 +194,7 @@ class HPX { return impl_get_instance_data().m_instance_id; } +#ifdef 
KOKKOS_ENABLE_DEPRECATED_CODE_4 static bool &impl_get_in_parallel() noexcept; struct impl_in_parallel_scope { @@ -223,9 +217,10 @@ class HPX { delete; }; - static bool in_parallel(HPX const & = HPX()) noexcept { + KOKKOS_DEPRECATED static bool in_parallel(HPX const & = HPX()) noexcept { return impl_get_in_parallel(); } +#endif static void impl_decrement_active_parallel_region_count(); static void impl_increment_active_parallel_region_count(); @@ -248,18 +243,6 @@ class HPX { #endif } -#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_3 - template - KOKKOS_DEPRECATED static void partition_master( - F const &, int requested_num_partitions = 0, int = 0) { - if (requested_num_partitions > 1) { - Kokkos::abort( - "Kokkos::Experimental::HPX::partition_master: can't partition an " - "HPX instance\n"); - } - } -#endif - #ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 static int concurrency(); #else @@ -355,7 +338,9 @@ class HPX { hpx::threads::thread_stacksize::default_) const { impl_bulk_plain_erased(force_synchronous, is_light_weight_policy, {[functor](Index i) { +#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 impl_in_parallel_scope p; +#endif functor.execute_range(i); }}, n, stacksize); @@ -417,15 +402,21 @@ class HPX { hpx::threads::thread_stacksize::default_) const { impl_bulk_setup_finalize_erased(force_synchronous, is_light_weight_policy, {[functor](Index i) { +#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 impl_in_parallel_scope p; +#endif functor.execute_range(i); }}, {[functor]() { +#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 impl_in_parallel_scope p; +#endif functor.setup(); }}, {[functor]() { +#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 impl_in_parallel_scope p; +#endif functor.finalize(); }}, n, stacksize); @@ -1292,6 +1283,7 @@ class ParallelScan, const WorkRange range(m_policy, t, num_worker_threads); execute_chunk(range.begin(), range.end(), update_sum, false); +#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 { // Since arrive_and_wait may yield and resume on another worker thread we // set in_parallel = false on the current thread before suspending and set @@ -1299,6 +1291,9 @@ class ParallelScan, Kokkos::Experimental::HPX::impl_not_in_parallel_scope p; barrier.arrive_and_wait(); } +#else + barrier.arrive_and_wait(); +#endif if (t == 0) { final_reducer.init(reinterpret_cast( @@ -1320,6 +1315,7 @@ class ParallelScan, } } +#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 { // Since arrive_and_wait may yield and resume on another worker thread we // set in_parallel = false on the current thread before suspending and set @@ -1327,6 +1323,9 @@ class ParallelScan, Kokkos::Experimental::HPX::impl_not_in_parallel_scope p; barrier.arrive_and_wait(); } +#else + barrier.arrive_and_wait(); +#endif reference_type update_base = Analysis::Reducer::reference(reinterpret_cast( @@ -1407,6 +1406,7 @@ class ParallelScanWithTotal, const WorkRange range(m_policy, t, num_worker_threads); execute_chunk(range.begin(), range.end(), update_sum, false); +#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 { // Since arrive_and_wait may yield and resume on another worker thread we // set in_parallel = false on the current thread before suspending and set @@ -1414,6 +1414,9 @@ class ParallelScanWithTotal, Kokkos::Experimental::HPX::impl_not_in_parallel_scope p; barrier.arrive_and_wait(); } +#else + barrier.arrive_and_wait(); +#endif if (t == 0) { final_reducer.init(reinterpret_cast( @@ -1435,6 +1438,7 @@ class ParallelScanWithTotal, } } +#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 { // Since arrive_and_wait may yield and resume on another worker thread we // set in_parallel = false on the 
current thread before suspending and set @@ -1442,6 +1446,9 @@ class ParallelScanWithTotal, Kokkos::Experimental::HPX::impl_not_in_parallel_scope p; barrier.arrive_and_wait(); } +#else + barrier.arrive_and_wait(); +#endif reference_type update_base = Analysis::Reducer::reference(reinterpret_cast( diff --git a/core/src/KokkosExp_MDRangePolicy.hpp b/core/src/KokkosExp_MDRangePolicy.hpp index c9080db01ca..297b1fadee9 100644 --- a/core/src/KokkosExp_MDRangePolicy.hpp +++ b/core/src/KokkosExp_MDRangePolicy.hpp @@ -73,7 +73,7 @@ is_less_than_value_initialized_variable(T arg) { // Checked narrowing conversion that calls abort if the cast changes the value template -constexpr To checked_narrow_cast(From arg) { +constexpr To checked_narrow_cast(From arg, std::size_t idx) { constexpr const bool is_different_signedness = (std::is_signed::value != std::is_signed::value); auto const ret = static_cast(arg); @@ -81,7 +81,12 @@ constexpr To checked_narrow_cast(From arg) { (is_different_signedness && is_less_than_value_initialized_variable(arg) != is_less_than_value_initialized_variable(ret))) { - Kokkos::abort("unsafe narrowing conversion"); + auto msg = + "Kokkos::MDRangePolicy bound type error: an unsafe implicit conversion " + "is performed on a bound (" + + std::to_string(arg) + ") in dimension (" + std::to_string(idx) + + "), which may not preserve its original value.\n"; + Kokkos::abort(msg.c_str()); } return ret; } @@ -96,15 +101,15 @@ constexpr Array to_array_potentially_narrowing(const U (&init)[M]) { using T = typename Array::value_type; Array a{}; constexpr std::size_t N = a.size(); - static_assert(M <= N, ""); + static_assert(M <= N); auto* ptr = a.data(); // NOTE equivalent to // std::transform(std::begin(init), std::end(init), a.data(), // [](U x) { return static_cast(x); }); // except that std::transform is not constexpr. - for (auto x : init) { - *ptr++ = checked_narrow_cast(x); - (void)checked_narrow_cast(x); // see note above + for (std::size_t i = 0; i < M; ++i) { + *ptr++ = checked_narrow_cast(init[i], i); + (void)checked_narrow_cast(init[i], i); // see note above } return a; } @@ -120,10 +125,10 @@ constexpr NVCC_WONT_LET_ME_CALL_YOU_Array to_array_potentially_narrowing( using T = typename NVCC_WONT_LET_ME_CALL_YOU_Array::value_type; NVCC_WONT_LET_ME_CALL_YOU_Array a{}; constexpr std::size_t N = a.size(); - static_assert(M <= N, ""); + static_assert(M <= N); for (std::size_t i = 0; i < M; ++i) { - a[i] = checked_narrow_cast(other[i]); - (void)checked_narrow_cast(other[i]); // see note above + a[i] = checked_narrow_cast(other[i], i); + (void)checked_narrow_cast(other[i], i); // see note above } return a; } @@ -150,9 +155,20 @@ TileSizeProperties get_tile_size_properties(const ExecutionSpace&) { // multi-dimensional iteration pattern template -struct MDRangePolicy : public Kokkos::Impl::PolicyTraits { - using traits = Kokkos::Impl::PolicyTraits; - using range_policy = RangePolicy; +struct MDRangePolicy; + +// Note: If MDRangePolicy has a primary template, implicit CTAD (deduction +// guides) are generated -> MDRangePolicy<> by some compilers, which is +// incorrect. By making it a template specialization instead, no implicit CTAD +// is generated. This works because there has to be at least one property +// specified (which is Rank<...>); otherwise, we'd get the static_assert +// "Kokkos::Error: MD iteration pattern not defined". This template +// specialization uses in all places for correctness. 
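A minimal, self-contained sketch of the CTAD behavior this note describes (the types Defined and SpecializedOnly are hypothetical stand-ins, not part of Kokkos): a primary template that is defined gets implicit deduction guides from its constructors, so class template argument deduction can silently pick an empty parameter pack, whereas a primary template that is only declared, with the definition in a partial specialization, generates no implicit guides.

#include <type_traits>

// Defined primary template: implicit CTAD guides are generated from its
// constructors, so "Defined d(42);" deduces the empty pack Defined<>.
template <class... Ts>
struct Defined {
  Defined(int) {}
};

// Declared-only primary template; the definition requires at least one
// template argument. No implicit guides exist, so CTAD from a plain int
// cannot succeed unless an explicit deduction guide is provided.
template <class... Ts>
struct SpecializedOnly;

template <class T0, class... Ts>
struct SpecializedOnly<T0, Ts...> {
  SpecializedOnly(int) {}
};

int main() {
  Defined d(42);
  static_assert(std::is_same_v<decltype(d), Defined<>>);
  // SpecializedOnly s(42);  // error: no viable deduction
}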
+template +struct MDRangePolicy + : public Kokkos::Impl::PolicyTraits { + using traits = Kokkos::Impl::PolicyTraits; + using range_policy = RangePolicy; typename traits::execution_space m_space; @@ -161,8 +177,8 @@ struct MDRangePolicy : public Kokkos::Impl::PolicyTraits { typename traits::schedule_type, typename traits::index_type>; using execution_policy = - MDRangePolicy; // needed for is_execution_space - // interrogation + MDRangePolicy; // needed for is_execution_policy + // interrogation template friend struct MDRangePolicy; @@ -327,6 +343,20 @@ struct MDRangePolicy : public Kokkos::Impl::PolicyTraits { } for (int i = rank_start; i != rank_end; i += increment) { const index_type length = m_upper[i] - m_lower[i]; + + if (m_upper[i] < m_lower[i]) { + std::string msg = + "Kokkos::MDRangePolicy bounds error: The lower bound (" + + std::to_string(m_lower[i]) + ") is greater than its upper bound (" + + std::to_string(m_upper[i]) + ") in dimension " + std::to_string(i) + + ".\n"; +#if !defined(KOKKOS_ENABLE_DEPRECATED_CODE_4) + Kokkos::abort(msg.c_str()); +#elif defined(KOKKOS_ENABLE_DEPRECATION_WARNINGS) + Kokkos::Impl::log_warning(msg); +#endif + } + if (m_tile[i] <= 0) { m_tune_tile_size = true; if ((inner_direction == Iterate::Right && (i < rank - 1)) || @@ -358,6 +388,60 @@ struct MDRangePolicy : public Kokkos::Impl::PolicyTraits { } }; +template +MDRangePolicy(const LT (&)[N], const UT (&)[N])->MDRangePolicy>; + +template +MDRangePolicy(const LT (&)[N], const UT (&)[N], const TT (&)[TN]) + ->MDRangePolicy>; + +template +MDRangePolicy(DefaultExecutionSpace const&, const LT (&)[N], const UT (&)[N]) + ->MDRangePolicy>; + +template +MDRangePolicy(DefaultExecutionSpace const&, const LT (&)[N], const UT (&)[N], + const TT (&)[TN]) + ->MDRangePolicy>; + +template >> +MDRangePolicy(ES const&, const LT (&)[N], const UT (&)[N]) + ->MDRangePolicy>; + +template >> +MDRangePolicy(ES const&, const LT (&)[N], const UT (&)[N], const TT (&)[TN]) + ->MDRangePolicy>; + +template +MDRangePolicy(Array const&, Array const&)->MDRangePolicy>; + +template +MDRangePolicy(Array const&, Array const&, Array const&) + ->MDRangePolicy>; + +template +MDRangePolicy(DefaultExecutionSpace const&, Array const&, + Array const&) + ->MDRangePolicy>; + +template +MDRangePolicy(DefaultExecutionSpace const&, Array const&, + Array const&, Array const&) + ->MDRangePolicy>; + +template >> +MDRangePolicy(ES const&, Array const&, Array const&) + ->MDRangePolicy>; + +template >> +MDRangePolicy(ES const&, Array const&, Array const&, + Array const&) + ->MDRangePolicy>; + } // namespace Kokkos #endif // KOKKOS_CORE_EXP_MD_RANGE_POLICY_HPP diff --git a/core/src/Kokkos_Array.hpp b/core/src/Kokkos_Array.hpp index 82ceaaec218..ba1626bb72e 100644 --- a/core/src/Kokkos_Array.hpp +++ b/core/src/Kokkos_Array.hpp @@ -22,6 +22,7 @@ #endif #include +#include #include #include @@ -320,6 +321,9 @@ struct Array::strided> { : m_elem(arg_ptr), m_size(arg_size), m_stride(arg_stride) {} }; +template +Array(T, Us...)->Array; + } // namespace Kokkos // diff --git a/core/src/Kokkos_Assert.hpp b/core/src/Kokkos_Assert.hpp index c3b9004734a..6fea286005e 100644 --- a/core/src/Kokkos_Assert.hpp +++ b/core/src/Kokkos_Assert.hpp @@ -44,9 +44,6 @@ __LINE__) " \n"); \ } \ } -// some projects already define this for themselves, so don't mess -// them up -#ifndef KOKKOS_ASSERT #define KOKKOS_ASSERT(...) 
\ { \ if (!bool(__VA_ARGS__)) { \ @@ -58,8 +55,7 @@ __LINE__) " \n"); \ } \ } -#endif // ifndef KOKKOS_ASSERT -#else // not debug mode +#else // not debug mode #define KOKKOS_EXPECTS(...) #define KOKKOS_ENSURES(...) #ifndef KOKKOS_ASSERT diff --git a/core/src/Kokkos_Atomics_Desul_Volatile_Wrapper.hpp b/core/src/Kokkos_Atomics_Desul_Volatile_Wrapper.hpp index 1c434746321..9acacef901a 100644 --- a/core/src/Kokkos_Atomics_Desul_Volatile_Wrapper.hpp +++ b/core/src/Kokkos_Atomics_Desul_Volatile_Wrapper.hpp @@ -25,7 +25,7 @@ static_assert(false, #include #include -#ifdef KOKKOS_INTERNAL_NOT_PARALLEL +#ifdef KOKKOS_ENABLE_ATOMICS_BYPASS #define KOKKOS_DESUL_MEM_SCOPE desul::MemoryScopeCaller() #else #define KOKKOS_DESUL_MEM_SCOPE desul::MemoryScopeDevice() diff --git a/core/src/Kokkos_Atomics_Desul_Wrapper.hpp b/core/src/Kokkos_Atomics_Desul_Wrapper.hpp index bda37839805..eebdd20f15d 100644 --- a/core/src/Kokkos_Atomics_Desul_Wrapper.hpp +++ b/core/src/Kokkos_Atomics_Desul_Wrapper.hpp @@ -49,7 +49,7 @@ inline const char* atomic_query_version() { return "KOKKOS_DESUL_ATOMICS"; } #endif // ============================================================ -#ifdef KOKKOS_INTERNAL_NOT_PARALLEL +#ifdef KOKKOS_ENABLE_ATOMICS_BYPASS #define KOKKOS_DESUL_MEM_SCOPE desul::MemoryScopeCaller() #else #define KOKKOS_DESUL_MEM_SCOPE desul::MemoryScopeDevice() diff --git a/core/src/Kokkos_Clamp.hpp b/core/src/Kokkos_Clamp.hpp new file mode 100644 index 00000000000..033cde9ab84 --- /dev/null +++ b/core/src/Kokkos_Clamp.hpp @@ -0,0 +1,41 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOS_CLAMP_HPP +#define KOKKOS_CLAMP_HPP + +#include + +namespace Kokkos { + +template +constexpr KOKKOS_INLINE_FUNCTION const T& clamp(const T& value, const T& lo, + const T& hi) { + KOKKOS_EXPECTS(!(hi < lo)); + return (value < lo) ? lo : (hi < value) ? hi : value; +} + +template +constexpr KOKKOS_INLINE_FUNCTION const T& clamp(const T& value, const T& lo, + const T& hi, + ComparatorType comp) { + KOKKOS_EXPECTS(!comp(hi, lo)); + return comp(value, lo) ? lo : comp(hi, value) ? 
hi : value; +} + +} // namespace Kokkos + +#endif diff --git a/core/src/Kokkos_CopyViews.hpp b/core/src/Kokkos_CopyViews.hpp index a0ca55be704..08f6ba8d696 100644 --- a/core/src/Kokkos_CopyViews.hpp +++ b/core/src/Kokkos_CopyViews.hpp @@ -22,6 +22,7 @@ static_assert(false, #ifndef KOKKOS_COPYVIEWS_HPP_ #define KOKKOS_COPYVIEWS_HPP_ #include +#include #include #include #include @@ -612,12 +613,17 @@ void view_copy(const DstType& dst, const SrcType& src) { }; if (!DstExecCanAccessSrc && !SrcExecCanAccessDst) { - std::string message( - "Error: Kokkos::deep_copy with no available copy mechanism: "); - message += src.label(); - message += " to "; - message += dst.label(); - Kokkos::Impl::throw_runtime_exception(message); + std::ostringstream ss; + ss << "Error: Kokkos::deep_copy with no available copy mechanism: " + << "from source view (\"" << src.label() << "\") to destination view (\"" + << dst.label() << "\").\n" + << "There is no common execution space that can access both source's " + "space\n" + << "(" << src_memory_space().name() << ") and destination's space (" + << dst_memory_space().name() << "), " + << "so source and destination\n" + << "must be contiguous and have the same layout.\n"; + Kokkos::Impl::throw_runtime_exception(ss.str()); } // Figure out iteration order in case we need it @@ -1330,13 +1336,12 @@ inline void contiguous_fill( // Default implementation for execution spaces that don't provide a definition template struct ZeroMemset { - ZeroMemset(const ExecutionSpace& exec_space, const ViewType& dst, - typename ViewType::const_value_type& value) { - contiguous_fill(exec_space, dst, value); - } - - ZeroMemset(const ViewType& dst, typename ViewType::const_value_type& value) { - contiguous_fill(ExecutionSpace(), dst, value); + ZeroMemset(const ExecutionSpace& exec_space, const ViewType& dst) { + using ValueType = typename ViewType::value_type; + alignas(alignof(ValueType)) unsigned char + zero_initialized_storage[sizeof(ValueType)] = {}; + contiguous_fill(exec_space, dst, + *reinterpret_cast(zero_initialized_storage)); } }; @@ -1348,13 +1353,18 @@ inline std::enable_if_t< contiguous_fill_or_memset( const ExecutionSpace& exec_space, const View& dst, typename ViewTraits::const_value_type& value) { -// On A64FX memset seems to do the wrong thing with regards to first touch -// leading to the significant performance issues -#ifndef KOKKOS_ARCH_A64FX - if (Impl::is_zero_byte(value)) - ZeroMemset>(exec_space, dst, value); - else + // With OpenMP, using memset has significant performance issues. 
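For reference, the Impl::is_zero_byte test that gates this branch can be pictured as a byte-wise comparison of the object representation; the following stand-in is a minimal sketch under that assumption, not the actual Kokkos implementation:

#include <cstring>

// True when every byte of value's object representation is zero -- the
// precondition for replacing an element-wise fill with a zero memset.
template <class T>
bool is_zero_byte_sketch(const T& value) {
  const unsigned char zeros[sizeof(T)] = {};
  return std::memcmp(&value, zeros, sizeof(T)) == 0;
}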
+ if (Impl::is_zero_byte(value) +#ifdef KOKKOS_ENABLE_OPENMP + && !std::is_same_v #endif + ) + // FIXME intel/19 icpc fails to deduce template parameters here, + // resulting in compilation errors; explicitly passing the template + // parameters to ZeroMemset helps workaround the issue + // See https://github.com/kokkos/kokkos/issues/6775 + ZeroMemset>(exec_space, dst); + else contiguous_fill(exec_space, dst, value); } @@ -1379,15 +1389,20 @@ contiguous_fill_or_memset( typename ViewTraits::const_value_type& value) { using ViewType = View; using exec_space_type = typename ViewType::execution_space; + exec_space_type exec; // On A64FX memset seems to do the wrong thing with regards to first touch // leading to the significant performance issues #ifndef KOKKOS_ARCH_A64FX if (Impl::is_zero_byte(value)) - ZeroMemset>(dst, value); + // FIXME intel/19 icpc fails to deduce template parameters here, + // resulting in compilation errors; explicitly passing the template + // parameters to ZeroMemset helps workaround the issue + // See https://github.com/kokkos/kokkos/issues/6775 + ZeroMemset(exec, dst); else #endif - contiguous_fill(exec_space_type(), dst, value); + contiguous_fill(exec, dst, value); } template diff --git a/core/src/Kokkos_Core.hpp b/core/src/Kokkos_Core.hpp index 805411a699e..1f146563be2 100644 --- a/core/src/Kokkos_Core.hpp +++ b/core/src/Kokkos_Core.hpp @@ -46,14 +46,15 @@ #include #include -#include #include -#include +#include +#include #include #include #include #include #include +#include #include #include #include @@ -101,6 +102,7 @@ void declare_configuration_metadata(const std::string& category, [[nodiscard]] bool is_finalized() noexcept; [[nodiscard]] int device_id() noexcept; +[[nodiscard]] int num_devices() noexcept; [[nodiscard]] int num_threads() noexcept; bool show_warnings() noexcept; @@ -300,9 +302,6 @@ std::vector partition_space(ExecSpace const& space, // implementation of the RAII wrapper is using Kokkos::single. #include -// Specializations required after core definitions -#include - //---------------------------------------------------------------------------- // Redefinition of the macros min and max if we pushed them at entry of // Kokkos_Core.hpp diff --git a/core/src/Kokkos_Core_fwd.hpp b/core/src/Kokkos_Core_fwd.hpp index 44f1c5b42f4..7edb35f00eb 100644 --- a/core/src/Kokkos_Core_fwd.hpp +++ b/core/src/Kokkos_Core_fwd.hpp @@ -30,10 +30,6 @@ #include #include -#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_3 -#include -#endif - //---------------------------------------------------------------------------- // Have assumed a 64-bit build (8-byte pointers) throughout the code base. // 32-bit build allowed but unsupported. @@ -75,9 +71,6 @@ template struct Device; // forward declare here so that backend initializer calls can use it. 
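As a usage note for the num_devices() declaration added to Kokkos_Core.hpp above: paired with the existing device_id(), it can be queried after initialization. This is a sketch only, assuming the natural reading of the new query:

#include <Kokkos_Core.hpp>
#include <cstdio>

int main(int argc, char* argv[]) {
  Kokkos::initialize(argc, argv);
  {
    // device_id() reports the device this process uses; num_devices() is
    // the companion query declared next to it in this patch.
    std::printf("device %d of %d\n", Kokkos::device_id(),
                Kokkos::num_devices());
  }
  Kokkos::finalize();
  return 0;
}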
-#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_3 -struct InitArguments; -#endif class InitializationSettings; } // namespace Kokkos @@ -262,12 +255,6 @@ KOKKOS_FUNCTION void runtime_check_memory_access_violation( } } // namespace Impl - -namespace Experimental { -template -class LogicalMemorySpace; -} - } // namespace Kokkos //---------------------------------------------------------------------------- diff --git a/core/src/Kokkos_ExecPolicy.hpp b/core/src/Kokkos_ExecPolicy.hpp index ae1585a4989..5f251eeb26a 100644 --- a/core/src/Kokkos_ExecPolicy.hpp +++ b/core/src/Kokkos_ExecPolicy.hpp @@ -28,6 +28,7 @@ static_assert(false, #include #include #include +#include //---------------------------------------------------------------------------- @@ -114,62 +115,67 @@ class RangePolicy : public Impl::PolicyTraits { m_granularity_mask(0) {} /** \brief Total range */ + template && + std::is_convertible_v), + bool> = false> + inline RangePolicy(const IndexType1 work_begin, const IndexType2 work_end) + : RangePolicy(typename traits::execution_space(), work_begin, work_end) {} + + /** \brief Total range */ + template && + std::is_convertible_v), + bool> = false> inline RangePolicy(const typename traits::execution_space& work_space, - const member_type work_begin, const member_type work_end) + const IndexType1 work_begin, const IndexType2 work_end) : m_space(work_space), - m_begin(work_begin < work_end ? work_begin : 0), - m_end(work_begin < work_end ? work_end : 0), + m_begin(work_begin), + m_end(work_end), m_granularity(0), m_granularity_mask(0) { + check_conversion_safety(work_begin); + check_conversion_safety(work_end); + check_bounds_validity(); set_auto_chunk_size(); } - /** \brief Total range */ - inline RangePolicy(const member_type work_begin, const member_type work_end) - : RangePolicy(typename traits::execution_space(), work_begin, work_end) { - set_auto_chunk_size(); - } - - /** \brief Total range */ - template - inline RangePolicy(const typename traits::execution_space& work_space, - const member_type work_begin, const member_type work_end, - Args... args) + template && + std::is_convertible_v), + bool> = false> + RangePolicy(const typename traits::execution_space& work_space, + const IndexType1 work_begin, const IndexType2 work_end, + const ChunkSize chunk_size) : m_space(work_space), - m_begin(work_begin < work_end ? work_begin : 0), - m_end(work_begin < work_end ? work_end : 0), + m_begin(work_begin), + m_end(work_end), m_granularity(0), m_granularity_mask(0) { - set_auto_chunk_size(); - set(args...); + check_conversion_safety(work_begin); + check_conversion_safety(work_end); + check_bounds_validity(); + set_chunk_size(chunk_size.value); } /** \brief Total range */ - template - inline RangePolicy(const member_type work_begin, const member_type work_end, - Args... args) - : RangePolicy(typename traits::execution_space(), work_begin, work_end) { - set_auto_chunk_size(); - set(args...); - } - - private: - inline void set() {} + template && + std::is_convertible_v), + bool> = false> + RangePolicy(const IndexType1 work_begin, const IndexType2 work_end, + const ChunkSize chunk_size) + : RangePolicy(typename traits::execution_space(), work_begin, work_end, + chunk_size) {} public: - template - inline void set(Args...) { - static_assert( - 0 == sizeof...(Args), - "Kokkos::RangePolicy: unhandled constructor arguments encountered."); - } - - template - inline void set(const ChunkSize& chunksize, Args... 
args) { +#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 + KOKKOS_DEPRECATED_WITH_COMMENT("Use set_chunk_size instead") + inline void set(ChunkSize chunksize) { m_granularity = chunksize.value; m_granularity_mask = m_granularity - 1; - set(args...); } +#endif public: /** \brief return chunk_size */ @@ -218,6 +224,67 @@ class RangePolicy : public Impl::PolicyTraits { m_granularity_mask = m_granularity - 1; } + void check_bounds_validity() { + if (m_end < m_begin) { + std::string msg = "Kokkos::RangePolicy bounds error: The lower bound (" + + std::to_string(m_begin) + + ") is greater than the upper bound (" + + std::to_string(m_end) + ").\n"; +#ifndef KOKKOS_ENABLE_DEPRECATED_CODE_4 + Kokkos::abort(msg.c_str()); +#endif + m_begin = 0; + m_end = 0; +#ifdef KOKKOS_ENABLE_DEPRECATION_WARNINGS + Kokkos::Impl::log_warning(msg); +#endif + } + } + + // To be replaced with std::in_range (c++20) + template + static void check_conversion_safety(const IndexType bound) { +#if !defined(KOKKOS_ENABLE_DEPRECATED_CODE_4) || \ + defined(KOKKOS_ENABLE_DEPRECATION_WARNINGS) + + std::string msg = + "Kokkos::RangePolicy bound type error: an unsafe implicit conversion " + "is performed on a bound (" + + std::to_string(bound) + + "), which may " + "not preserve its original value.\n"; + bool warn = false; + + if constexpr (std::is_signed_v != + std::is_signed_v) { + // check signed to unsigned + if constexpr (std::is_signed_v) + warn |= (bound < static_cast( + std::numeric_limits::min())); + + // check unsigned to signed + if constexpr (std::is_signed_v) + warn |= (bound > static_cast( + std::numeric_limits::max())); + } + + // check narrowing + warn |= (static_cast(static_cast(bound)) != bound); + + if (warn) { +#ifndef KOKKOS_ENABLE_DEPRECATED_CODE_4 + Kokkos::abort(msg.c_str()); +#endif + +#ifdef KOKKOS_ENABLE_DEPRECATION_WARNINGS + Kokkos::Impl::log_warning(msg); +#endif + } +#else + (void)bound; +#endif + } + public: /** \brief Subrange for a partition's rank and size. 
* @@ -261,6 +328,21 @@ class RangePolicy : public Impl::PolicyTraits { }; }; +RangePolicy()->RangePolicy<>; + +RangePolicy(int64_t, int64_t)->RangePolicy<>; +RangePolicy(int64_t, int64_t, ChunkSize const&)->RangePolicy<>; + +RangePolicy(DefaultExecutionSpace const&, int64_t, int64_t)->RangePolicy<>; +RangePolicy(DefaultExecutionSpace const&, int64_t, int64_t, ChunkSize const&) + ->RangePolicy<>; + +template >> +RangePolicy(ES const&, int64_t, int64_t)->RangePolicy; + +template >> +RangePolicy(ES const&, int64_t, int64_t, ChunkSize const&)->RangePolicy; + } // namespace Kokkos //---------------------------------------------------------------------------- @@ -983,7 +1065,16 @@ template const& policy, Lambda const& lambda, ReducerValueType& val) { + static_assert(/*!Kokkos::is_view_v &&*/ + !std::is_array_v && + !std::is_pointer_v && + !Kokkos::is_reducer_v, + "Only scalar return types are allowed!"); + + val = ReducerValueType{}; Impl::md_parallel_impl(policy, lambda, val); + policy.team.team_reduce( + Kokkos::Sum{val}); } template @@ -997,7 +1088,29 @@ template const& policy, Lambda const& lambda, ReducerValueType& val) { + static_assert(/*!Kokkos::is_view_v &&*/ + !std::is_array_v && + !std::is_pointer_v && + !Kokkos::is_reducer_v, + "Only a scalar return types are allowed!"); + + val = ReducerValueType{}; Impl::md_parallel_impl(policy, lambda, val); + if constexpr (false +#ifdef KOKKOS_ENABLE_CUDA + || std::is_same_v +#elif defined(KOKKOS_ENABLE_HIP) + || std::is_same_v +#elif defined(KOKKOS_ENABLE_SYCL) + || std::is_same_v +#endif + ) + policy.team.vector_reduce( + Kokkos::Sum{ + val}); } template @@ -1011,7 +1124,31 @@ template const& policy, Lambda const& lambda, ReducerValueType& val) { + static_assert(/*!Kokkos::is_view_v &&*/ + !std::is_array_v && + !std::is_pointer_v && + !Kokkos::is_reducer_v, + "Only a scalar return types are allowed!"); + + val = ReducerValueType{}; Impl::md_parallel_impl(policy, lambda, val); + if constexpr (false +#ifdef KOKKOS_ENABLE_CUDA + || std::is_same_v +#elif defined(KOKKOS_ENABLE_HIP) + || std::is_same_v +#elif defined(KOKKOS_ENABLE_SYCL) + || std::is_same_v +#endif + ) + policy.team.vector_reduce( + Kokkos::Sum{ + val}); + policy.team.team_reduce( + Kokkos::Sum{val}); } template diff --git a/core/src/Kokkos_HBWSpace.hpp b/core/src/Kokkos_HBWSpace.hpp deleted file mode 100644 index 369b7bafb7b..00000000000 --- a/core/src/Kokkos_HBWSpace.hpp +++ /dev/null @@ -1,308 +0,0 @@ -//@HEADER -// ************************************************************************ -// -// Kokkos v. 4.0 -// Copyright (2022) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. -// -// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. -// See https://kokkos.org/LICENSE for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//@HEADER - -#ifndef KOKKOS_IMPL_PUBLIC_INCLUDE -#include -static_assert(false, - "Including non-public Kokkos header files is not allowed."); -#endif -#ifndef KOKKOS_HBWSPACE_HPP -#define KOKKOS_HBWSPACE_HPP - -#include -#ifdef KOKKOS_ENABLE_HBWSPACE - -#include - -namespace Kokkos { - -namespace Experimental { - -/// \class HBWSpace -/// \brief Memory management for host memory. -/// -/// HBWSpace is a memory space that governs host memory. "Host" -/// memory means the usual CPU-accessible memory. -class HBWSpace { - public: - //! 
Tag this class as a kokkos memory space - using memory_space = HBWSpace; - using size_type = size_t; - - /// \typedef execution_space - /// \brief Default execution space for this memory space. - /// - /// Every memory space has a default execution space. This is - /// useful for things like initializing a View (which happens in - /// parallel using the View's default execution space). - using execution_space = Kokkos::DefaultHostExecutionSpace; - - //! This memory space preferred device_type - using device_type = Kokkos::Device; - - /**\brief Default memory space instance */ - HBWSpace(); - HBWSpace(const HBWSpace& rhs) = default; - HBWSpace& operator=(const HBWSpace&) = default; - ~HBWSpace() = default; - - /**\brief Non-default memory space instance to choose allocation mechansim, - * if available */ - - enum AllocationMechanism { - STD_MALLOC, - POSIX_MEMALIGN, - POSIX_MMAP, - INTEL_MM_ALLOC - }; - - explicit HBWSpace(const AllocationMechanism&); - - /**\brief Allocate untracked memory in the space */ - void* allocate(const size_t arg_alloc_size) const; - void* allocate(const char* arg_label, const size_t arg_alloc_size, - const size_t arg_logical_size = 0) const; - - /**\brief Deallocate untracked memory in the space */ - void deallocate(void* const arg_alloc_ptr, const size_t arg_alloc_size) const; - void deallocate(const char* arg_label, void* const arg_alloc_ptr, - const size_t arg_alloc_size, - const size_t arg_logical_size = 0) const; - - private: - template - friend class LogicalMemorySpace; - - void* impl_allocate(const char* arg_label, const size_t arg_alloc_size, - const size_t arg_logical_size = 0, - const Kokkos::Tools::SpaceHandle = - Kokkos::Tools::make_space_handle(name())) const; - void impl_deallocate(const char* arg_label, void* const arg_alloc_ptr, - const size_t arg_alloc_size, - const size_t arg_logical_size = 0, - const Kokkos::Tools::SpaceHandle = - Kokkos::Tools::make_space_handle(name())) const; - - public: - /**\brief Return Name of the MemorySpace */ - static constexpr const char* name() { return "HBW"; } - - private: - AllocationMechanism m_alloc_mech; - friend class Kokkos::Impl::SharedAllocationRecord< - Kokkos::Experimental::HBWSpace, void>; -}; - -} // namespace Experimental - -} // namespace Kokkos - -//---------------------------------------------------------------------------- - -namespace Kokkos { - -namespace Impl { - -template <> -class SharedAllocationRecord - : public SharedAllocationRecord { - private: - friend Kokkos::Experimental::HBWSpace; - - using RecordBase = SharedAllocationRecord; - - SharedAllocationRecord(const SharedAllocationRecord&) = delete; - SharedAllocationRecord& operator=(const SharedAllocationRecord&) = delete; - - static void deallocate(RecordBase*); - -#ifdef KOKKOS_ENABLE_DEBUG - /**\brief Root record for tracked allocations from this HBWSpace instance */ - static RecordBase s_root_record; -#endif - - const Kokkos::Experimental::HBWSpace m_space; - - protected: - ~SharedAllocationRecord(); - SharedAllocationRecord() = default; - - SharedAllocationRecord( - const Kokkos::Experimental::HBWSpace& arg_space, - const std::string& arg_label, const size_t arg_alloc_size, - const RecordBase::function_type arg_dealloc = &deallocate); - - public: - inline std::string get_label() const { - return std::string(RecordBase::head()->m_label); - } - - KOKKOS_INLINE_FUNCTION static SharedAllocationRecord* allocate( - const Kokkos::Experimental::HBWSpace& arg_space, - const std::string& arg_label, const size_t arg_alloc_size) { - 
KOKKOS_IF_ON_HOST((return new SharedAllocationRecord(arg_space, arg_label, - arg_alloc_size);)) - KOKKOS_IF_ON_DEVICE(((void)arg_space; (void)arg_label; (void)arg_alloc_size; - return nullptr;)) - } - - /**\brief Allocate tracked memory in the space */ - static void* allocate_tracked(const Kokkos::Experimental::HBWSpace& arg_space, - const std::string& arg_label, - const size_t arg_alloc_size); - - /**\brief Reallocate tracked memory in the space */ - static void* reallocate_tracked(void* const arg_alloc_ptr, - const size_t arg_alloc_size); - - /**\brief Deallocate tracked memory in the space */ - static void deallocate_tracked(void* const arg_alloc_ptr); - - static SharedAllocationRecord* get_record(void* arg_alloc_ptr); - - static void print_records(std::ostream&, - const Kokkos::Experimental::HBWSpace&, - bool detail = false); -}; - -} // namespace Impl - -} // namespace Kokkos - -//---------------------------------------------------------------------------- - -namespace Kokkos { - -namespace Impl { - -static_assert( - Kokkos::Impl::MemorySpaceAccess::assignable, - ""); - -template <> -struct MemorySpaceAccess { - enum : bool { assignable = true }; - enum : bool { accessible = true }; - enum : bool { deepcopy = true }; -}; - -template <> -struct MemorySpaceAccess { - enum : bool { assignable = false }; - enum : bool { accessible = true }; - enum : bool { deepcopy = true }; -}; - -} // namespace Impl - -} // namespace Kokkos - -//---------------------------------------------------------------------------- - -namespace Kokkos { - -namespace Impl { - -template <> -struct DeepCopy { - DeepCopy(void* dst, const void* src, size_t n) { - hostspace_parallel_deepcopy(dst, src, n); - } - - DeepCopy(const DefaultHostExecutionSpace& exec, void* dst, const void* src, - size_t n) { - hostspace_parallel_deepcopy(exec, dst, src, n); - } -}; - -template -struct DeepCopy { - DeepCopy(void* dst, const void* src, size_t n) { - hostspace_parallel_deepcopy(dst, src, n); - } - - DeepCopy(const ExecutionSpace& exec, void* dst, const void* src, size_t n) { - exec.fence( - "Kokkos::Impl::DeepCopy -struct DeepCopy { - DeepCopy(void* dst, const void* src, size_t n) { - hostspace_parallel_deepcopy(dst, src, n); - } - - DeepCopy(const DefaultHostExecutionSpace& exec, void* dst, const void* src, - size_t n) { - hostspace_parallel_deepcopy(exec, dst, src, n); - } -}; - -template -struct DeepCopy { - DeepCopy(void* dst, const void* src, size_t n) { - hostspace_parallel_deepcopy(dst, src, n); - } - - DeepCopy(const ExecutionSpace& exec, void* dst, const void* src, size_t n) { - exec.fence( - "Kokkos::Impl::DeepCopy::DeepCopy: fence before copy"); - hostspace_parallel_deepcopy_async(copy_space, dst, src, n); - } -}; - -template <> -struct DeepCopy { - DeepCopy(void* dst, const void* src, size_t n) { - hostspace_parallel_deepcopy(dst, src, n); - } - - DeepCopy(const DefaultHostExecutionSpace& exec, void* dst, const void* src, - size_t n) { - hostspace_parallel_deepcopy(exec, dst, src, n); - } -}; - -template -struct DeepCopy { - DeepCopy(void* dst, const void* src, size_t n) { - hostspace_parallel_deepcopy(dst, src, n); - } - - DeepCopy(const ExecutionSpace& exec, void* dst, const void* src, size_t n) { - exec.fence( - "Kokkos::Impl::DeepCopy::DeepCopy: fence before copy"); - hostspace_parallel_deepcopy_async(dst, src, n); - } -}; - -} // namespace Impl - -} // namespace Kokkos - -#endif -#endif // #define KOKKOS_HBWSPACE_HPP diff --git a/core/src/Kokkos_HostSpace.hpp b/core/src/Kokkos_HostSpace.hpp index 
252aabd949f..a1fb0f5a677 100644 --- a/core/src/Kokkos_HostSpace.hpp +++ b/core/src/Kokkos_HostSpace.hpp @@ -37,7 +37,6 @@ static_assert(false, #include #include "impl/Kokkos_HostSpace_deepcopy.hpp" -#include /*--------------------------------------------------------------------------*/ @@ -94,6 +93,16 @@ class HostSpace { #endif /**\brief Allocate untracked memory in the space */ + template + void* allocate(const ExecutionSpace&, const size_t arg_alloc_size) const { + return allocate(arg_alloc_size); + } + template + void* allocate(const ExecutionSpace&, const char* arg_label, + const size_t arg_alloc_size, + const size_t arg_logical_size = 0) const { + return allocate(arg_label, arg_alloc_size, arg_logical_size); + } void* allocate(const size_t arg_alloc_size) const; void* allocate(const char* arg_label, const size_t arg_alloc_size, const size_t arg_logical_size = 0) const; @@ -105,9 +114,6 @@ class HostSpace { const size_t arg_logical_size = 0) const; private: - template - friend class Kokkos::Experimental::LogicalMemorySpace; - void* impl_allocate(const char* arg_label, const size_t arg_alloc_size, const size_t arg_logical_size = 0, const Kokkos::Tools::SpaceHandle = @@ -124,7 +130,6 @@ class HostSpace { private: static constexpr const char* m_name = "Host"; - friend class Kokkos::Impl::SharedAllocationRecord; }; } // namespace Kokkos @@ -136,8 +141,7 @@ namespace Kokkos { namespace Impl { static_assert(Kokkos::Impl::MemorySpaceAccess::assignable, - ""); + Kokkos::HostSpace>::assignable); template struct HostMirror { @@ -173,75 +177,7 @@ struct HostMirror { //---------------------------------------------------------------------------- -namespace Kokkos { - -namespace Impl { - -template <> -class SharedAllocationRecord - : public SharedAllocationRecordCommon { - private: - friend Kokkos::HostSpace; - friend class SharedAllocationRecordCommon; - - using base_t = SharedAllocationRecordCommon; - using RecordBase = SharedAllocationRecord; - - SharedAllocationRecord(const SharedAllocationRecord&) = delete; - SharedAllocationRecord& operator=(const SharedAllocationRecord&) = delete; - -#ifdef KOKKOS_ENABLE_DEBUG - /**\brief Root record for tracked allocations from this HostSpace instance */ - static RecordBase s_root_record; -#endif - - Kokkos::HostSpace m_space; - - protected: - ~SharedAllocationRecord(); - SharedAllocationRecord() = default; - - // This constructor does not forward to the one without exec_space arg - // in order to work around https://github.com/kokkos/kokkos/issues/5258 - // This constructor is templated so I can't just put it into the cpp file - // like the other constructor. 
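Earlier in this HostSpace hunk, allocate gains overloads that take an execution space and simply forward to the untracked allocate; a minimal usage sketch follows (the label and size are illustrative, and the memory is untracked, so it must be released with deallocate):

#include <Kokkos_Core.hpp>

int main(int argc, char* argv[]) {
  Kokkos::initialize(argc, argv);
  {
    Kokkos::HostSpace space;
    Kokkos::DefaultHostExecutionSpace exec;
    // New overload: execution space first, then the usual label/size
    // arguments; it forwards to the plain allocate shown in the diff.
    void* p = space.allocate(exec, "demo_buffer", 1024);
    space.deallocate("demo_buffer", p, 1024);
  }
  Kokkos::finalize();
  return 0;
}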
- template - SharedAllocationRecord( - const ExecutionSpace& /* exec_space*/, const Kokkos::HostSpace& arg_space, - const std::string& arg_label, const size_t arg_alloc_size, - const RecordBase::function_type arg_dealloc = &deallocate) - : base_t( -#ifdef KOKKOS_ENABLE_DEBUG - &SharedAllocationRecord::s_root_record, -#endif - Impl::checked_allocation_with_header(arg_space, arg_label, - arg_alloc_size), - sizeof(SharedAllocationHeader) + arg_alloc_size, arg_dealloc, - arg_label), - m_space(arg_space) { - this->base_t::_fill_host_accessible_header_info(*RecordBase::m_alloc_ptr, - arg_label); - } - - SharedAllocationRecord( - const Kokkos::HostSpace& arg_space, const std::string& arg_label, - const size_t arg_alloc_size, - const RecordBase::function_type arg_dealloc = &deallocate); - - public: - KOKKOS_INLINE_FUNCTION static SharedAllocationRecord* allocate( - const Kokkos::HostSpace& arg_space, const std::string& arg_label, - const size_t arg_alloc_size) { - KOKKOS_IF_ON_HOST((return new SharedAllocationRecord(arg_space, arg_label, - arg_alloc_size);)) - KOKKOS_IF_ON_DEVICE(((void)arg_space; (void)arg_label; (void)arg_alloc_size; - return nullptr;)) - } -}; - -} // namespace Impl - -} // namespace Kokkos +KOKKOS_IMPL_SHARED_ALLOCATION_SPECIALIZATION(Kokkos::HostSpace); //---------------------------------------------------------------------------- diff --git a/core/src/Kokkos_LogicalSpaces.hpp b/core/src/Kokkos_LogicalSpaces.hpp deleted file mode 100644 index 1ee1d2c81fe..00000000000 --- a/core/src/Kokkos_LogicalSpaces.hpp +++ /dev/null @@ -1,413 +0,0 @@ -//@HEADER -// ************************************************************************ -// -// Kokkos v. 4.0 -// Copyright (2022) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. -// -// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. -// See https://kokkos.org/LICENSE for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//@HEADER - -#ifndef KOKKOS_IMPL_PUBLIC_INCLUDE -#include -static_assert(false, - "Including non-public Kokkos header files is not allowed."); -#endif -#ifndef KOKKOS_LOGICALSPACES_HPP -#define KOKKOS_LOGICALSPACES_HPP - -#include -#include -#include -#include -#include -#include -#include -#include -namespace Kokkos { -namespace Experimental { -struct DefaultMemorySpaceNamer { - static constexpr const char* get_name() { - return "DefaultLogicalMemorySpaceName"; - } -}; - -struct LogicalSpaceSharesAccess { - struct shared_access {}; - struct no_shared_access {}; -}; - -/// \class LogicalMemorySpace -/// \brief -/// -/// LogicalMemorySpace is a space that is identical to another space, -/// but differentiable by name and template argument -template -class LogicalMemorySpace { -#ifdef KOKKOS_ENABLE_OPENMPTARGET - // [DZP] For some reason I don't yet know, using LogicalMemorySpaces - // inside an OpenMPTarget build causes errors in the - // SharedAllocationRecords of other types. This is my way of erroring - // a build if we instantiate a LogicalMemSpace in an OMPTarget build - static_assert(!std::is_same::value, - "Can't use LogicalMemorySpaces in an OpenMPTarget build, we're " - "debugging memory issues"); -#endif - public: - //! 
Tag this class as a kokkos memory space - using memory_space = LogicalMemorySpace; - using size_type = typename BaseSpace::size_type; - - /// \typedef execution_space - /// \brief Default execution space for this memory space. - /// - /// Every memory space has a default execution space. This is - /// useful for things like initializing a View (which happens in - /// parallel using the View's default execution space). - - using execution_space = - std::conditional_t::value, - typename BaseSpace::execution_space, - DefaultBaseExecutionSpace>; - - using device_type = Kokkos::Device; - - LogicalMemorySpace() = default; - - template - LogicalMemorySpace(Args&&... args) : underlying_space((Args &&) args...) {} - - /**\brief Allocate untracked memory in the space */ - void* allocate(const size_t arg_alloc_size) const { - return allocate("[unlabeled]", arg_alloc_size); - } - void* allocate(const char* arg_label, const size_t arg_alloc_size, - const size_t arg_logical_size = 0) const { - return impl_allocate(arg_label, arg_alloc_size, arg_logical_size); - } - - /**\brief Deallocate untracked memory in the space */ - void deallocate(void* const arg_alloc_ptr, - const size_t arg_alloc_size) const { - deallocate("[unlabeled]", arg_alloc_ptr, arg_alloc_size); - } - void deallocate(const char* arg_label, void* const arg_alloc_ptr, - const size_t arg_alloc_size, - const size_t arg_logical_size = 0) const { - impl_deallocate(arg_label, arg_alloc_ptr, arg_alloc_size, arg_logical_size); - } - - /**\brief Return Name of the MemorySpace */ - constexpr static const char* name() { return Namer::get_name(); } - - private: - BaseSpace underlying_space; - template - friend class LogicalMemorySpace; - friend class Kokkos::Impl::SharedAllocationRecord; - - void* impl_allocate(const char* arg_label, const size_t arg_alloc_size, - const size_t arg_logical_size = 0, - Kokkos::Tools::SpaceHandle arg_handle = - Kokkos::Tools::make_space_handle(name())) const { - return underlying_space.impl_allocate(arg_label, arg_alloc_size, - arg_logical_size, arg_handle); - } - void impl_deallocate(const char* arg_label, void* const arg_alloc_ptr, - const size_t arg_alloc_size, - const size_t arg_logical_size = 0, - const Kokkos::Tools::SpaceHandle arg_handle = - Kokkos::Tools::make_space_handle(name())) const { - underlying_space.impl_deallocate(arg_label, arg_alloc_ptr, arg_alloc_size, - arg_logical_size, arg_handle); - } -}; -} // namespace Experimental -} // namespace Kokkos - -//---------------------------------------------------------------------------- - -namespace Kokkos { - -namespace Impl { - -template -struct MemorySpaceAccess< - Kokkos::Experimental::LogicalMemorySpace< - BaseSpace, DefaultBaseExecutionSpace, Namer, - Kokkos::Experimental::LogicalSpaceSharesAccess::shared_access>, - OtherSpace> { - enum { assignable = MemorySpaceAccess::assignable }; - enum { accessible = MemorySpaceAccess::accessible }; - enum { deepcopy = MemorySpaceAccess::deepcopy }; -}; - -template -struct MemorySpaceAccess< - OtherSpace, - Kokkos::Experimental::LogicalMemorySpace< - BaseSpace, DefaultBaseExecutionSpace, Namer, - Kokkos::Experimental::LogicalSpaceSharesAccess::shared_access>> { - enum { assignable = MemorySpaceAccess::assignable }; - enum { accessible = MemorySpaceAccess::accessible }; - enum { deepcopy = MemorySpaceAccess::deepcopy }; -}; - -template -struct MemorySpaceAccess< - Kokkos::Experimental::LogicalMemorySpace< - BaseSpace, DefaultBaseExecutionSpace, Namer, - Kokkos::Experimental::LogicalSpaceSharesAccess::shared_access>, - 
Kokkos::Experimental::LogicalMemorySpace< - BaseSpace, DefaultBaseExecutionSpace, Namer, - Kokkos::Experimental::LogicalSpaceSharesAccess::shared_access>> { - enum { assignable = true }; - enum { accessible = true }; - enum { deepcopy = true }; -}; - -} // namespace Impl - -} // namespace Kokkos - -//---------------------------------------------------------------------------- - -namespace Kokkos { - -namespace Impl { -template -class SharedAllocationRecord, - void> : public SharedAllocationRecord { - private: - using SpaceType = - Kokkos::Experimental::LogicalMemorySpace; - using RecordBase = SharedAllocationRecord; - - SharedAllocationRecord(const SharedAllocationRecord&) = delete; - SharedAllocationRecord& operator=(const SharedAllocationRecord&) = delete; - - static void deallocate(RecordBase* arg_rec) { - delete static_cast(arg_rec); - } - -#ifdef KOKKOS_ENABLE_DEBUG - /**\brief Root record for tracked allocations from this - * LogicalMemorySpace instance */ - static RecordBase s_root_record; -#endif - - const SpaceType m_space; - - protected: - ~SharedAllocationRecord() { - m_space.deallocate(RecordBase::m_alloc_ptr->m_label, - SharedAllocationRecord::m_alloc_ptr, - SharedAllocationRecord::m_alloc_size, - (SharedAllocationRecord::m_alloc_size - - sizeof(SharedAllocationHeader))); - } - SharedAllocationRecord() = default; - - template - SharedAllocationRecord( - const ExecutionSpace& /*exec_space*/, const SpaceType& arg_space, - const std::string& arg_label, const size_t arg_alloc_size, - const RecordBase::function_type arg_dealloc = &deallocate) - : SharedAllocationRecord(arg_space, arg_label, arg_alloc_size, - arg_dealloc) {} - - SharedAllocationRecord( - const SpaceType& arg_space, const std::string& arg_label, - const size_t arg_alloc_size, - const RecordBase::function_type arg_dealloc = &deallocate) - : SharedAllocationRecord( -#ifdef KOKKOS_ENABLE_DEBUG - &SharedAllocationRecord::s_root_record, -#endif - Impl::checked_allocation_with_header(arg_space, arg_label, - arg_alloc_size), - sizeof(SharedAllocationHeader) + arg_alloc_size, arg_dealloc, - arg_label), - m_space(arg_space) { - // Fill in the Header information - RecordBase::m_alloc_ptr->m_record = - static_cast*>(this); - - strncpy(RecordBase::m_alloc_ptr->m_label, arg_label.c_str(), - SharedAllocationHeader::maximum_label_length - 1); - // Set last element zero, in case c_str is too long - RecordBase::m_alloc_ptr - ->m_label[SharedAllocationHeader::maximum_label_length - 1] = '\0'; - } - - public: - inline std::string get_label() const { - return std::string(RecordBase::head()->m_label); - } - KOKKOS_INLINE_FUNCTION static SharedAllocationRecord* allocate( - const SpaceType& arg_space, const std::string& arg_label, - const size_t arg_alloc_size) { - KOKKOS_IF_ON_HOST((return new SharedAllocationRecord(arg_space, arg_label, - arg_alloc_size);)) - KOKKOS_IF_ON_DEVICE(((void)arg_space; (void)arg_label; (void)arg_alloc_size; - return nullptr;)) - } - - /**\brief Allocate tracked memory in the space */ - static void* allocate_tracked(const SpaceType& arg_space, - const std::string& arg_label, - const size_t arg_alloc_size) { - if (!arg_alloc_size) return (void*)nullptr; - - SharedAllocationRecord* const r = - allocate(arg_space, arg_label, arg_alloc_size); - - RecordBase::increment(r); - - return r->data(); - } - - /**\brief Reallocate tracked memory in the space */ - static void* reallocate_tracked(void* const arg_alloc_ptr, - const size_t arg_alloc_size) { - SharedAllocationRecord* const r_old = get_record(arg_alloc_ptr); - 
SharedAllocationRecord* const r_new = - allocate(r_old->m_space, r_old->get_label(), arg_alloc_size); - - Kokkos::Impl::DeepCopy( - r_new->data(), r_old->data(), std::min(r_old->size(), r_new->size())); - Kokkos::fence( - "SharedAllocationRecord::reallocate_tracked: fence after copying data"); - - RecordBase::increment(r_new); - RecordBase::decrement(r_old); - - return r_new->data(); - } - /**\brief Deallocate tracked memory in the space */ - static void deallocate_tracked(void* const arg_alloc_ptr) { - if (arg_alloc_ptr != nullptr) { - SharedAllocationRecord* const r = get_record(arg_alloc_ptr); - - RecordBase::decrement(r); - } - } - - static SharedAllocationRecord* get_record(void* alloc_ptr) { - using Header = SharedAllocationHeader; - using RecordHost = SharedAllocationRecord; - - SharedAllocationHeader const* const head = - alloc_ptr ? Header::get_header(alloc_ptr) - : (SharedAllocationHeader*)nullptr; - RecordHost* const record = - head ? static_cast(head->m_record) : (RecordHost*)nullptr; - - if (!alloc_ptr || record->m_alloc_ptr != head) { - Kokkos::Impl::throw_runtime_exception(std::string( - "Kokkos::Impl::SharedAllocationRecord< LogicalMemorySpace<> , " - "void >::get_record ERROR")); - } - - return record; - } -#ifdef KOKKOS_ENABLE_DEBUG - static void print_records(std::ostream& s, const SpaceType&, - bool detail = false) { - SharedAllocationRecord::print_host_accessible_records( - s, "HostSpace", &s_root_record, detail); - } -#else - static void print_records(std::ostream&, const SpaceType&, - bool detail = false) { - (void)detail; - throw_runtime_exception( - "SharedAllocationRecord::print_records only works " - "with KOKKOS_ENABLE_DEBUG enabled"); - } -#endif -}; -#ifdef KOKKOS_ENABLE_DEBUG -/**\brief Root record for tracked allocations from this LogicalSpace - * instance */ -template -SharedAllocationRecord - SharedAllocationRecord, - void>::s_root_record; -#endif - -} // namespace Impl - -} // namespace Kokkos - -//---------------------------------------------------------------------------- - -namespace Kokkos { - -namespace Impl { - -template -struct DeepCopy, - Kokkos::Experimental::LogicalMemorySpace< - BaseSpace, DefaultBaseExecutionSpace, Namer, SharesAccess>, - ExecutionSpace> { - DeepCopy(void* dst, void* src, size_t n) { - DeepCopy(dst, src, n); - } - DeepCopy(const ExecutionSpace& exec, void* dst, void* src, size_t n) { - DeepCopy(exec, dst, src, n); - } -}; - -template -struct DeepCopy, - ExecutionSpace> { - DeepCopy(void* dst, void* src, size_t n) { - DeepCopy(dst, src, n); - } - DeepCopy(const ExecutionSpace& exec, void* dst, void* src, size_t n) { - DeepCopy(exec, dst, src, n); - } -}; - -template -struct DeepCopy, - DestinationSpace, ExecutionSpace> { - DeepCopy(void* dst, void* src, size_t n) { - DeepCopy(dst, src, n); - } - DeepCopy(const ExecutionSpace& exec, void* dst, void* src, size_t n) { - DeepCopy(exec, dst, src, n); - } -}; -} // namespace Impl - -} // namespace Kokkos -#endif // KOKKOS_LOGICALSPACES_HPP diff --git a/core/src/Kokkos_Macros.hpp b/core/src/Kokkos_Macros.hpp index 3cf7ac4fa24..b255d2a5195 100644 --- a/core/src/Kokkos_Macros.hpp +++ b/core/src/Kokkos_Macros.hpp @@ -84,11 +84,12 @@ //---------------------------------------------------------------------------- -#if !defined(KOKKOS_ENABLE_THREADS) && !defined(KOKKOS_ENABLE_CUDA) && \ - !defined(KOKKOS_ENABLE_OPENMP) && !defined(KOKKOS_ENABLE_HPX) && \ - !defined(KOKKOS_ENABLE_OPENMPTARGET) && !defined(KOKKOS_ENABLE_HIP) && \ - !defined(KOKKOS_ENABLE_SYCL) -#define 
KOKKOS_INTERNAL_NOT_PARALLEL +#if defined(KOKKOS_ENABLE_ATOMICS_BYPASS) && \ + (defined(KOKKOS_ENABLE_THREADS) || defined(KOKKOS_ENABLE_CUDA) || \ + defined(KOKKOS_ENABLE_OPENMP) || defined(KOKKOS_ENABLE_HPX) || \ + defined(KOKKOS_ENABLE_OPENMPTARGET) || defined(KOKKOS_ENABLE_HIP) || \ + defined(KOKKOS_ENABLE_SYCL) || defined(KOKKOS_ENABLE_OPENACC)) +#error Atomics may only be disabled if neither a host parallel nor a device backend is enabled #endif #define KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA @@ -339,12 +340,6 @@ #define KOKKOS_IMPL_DEVICE_FUNCTION #endif -// Temporary solution for SYCL not supporting printf in kernels. -// Might disappear at any point once we have found another solution. -#if !defined(KOKKOS_IMPL_DO_NOT_USE_PRINTF) -#define KOKKOS_IMPL_DO_NOT_USE_PRINTF(...) ::printf(__VA_ARGS__) -#endif - //---------------------------------------------------------------------------- // Define final version of functions. This is so that clang tidy can find these // macros more easily @@ -433,22 +428,6 @@ #define KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_SERIAL #endif -//---------------------------------------------------------------------------- -// Determine for what space the code is being compiled: -#if defined(KOKKOS_ENABLE_DEPRECATED_CODE_3) - -#if defined(__CUDACC__) && defined(__CUDA_ARCH__) && defined(KOKKOS_ENABLE_CUDA) -#define KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_CUDA -#elif defined(__SYCL_DEVICE_ONLY__) && defined(KOKKOS_ENABLE_SYCL) -#define KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL -#elif defined(__HIPCC__) && defined(__HIP_DEVICE_COMPILE__) && \ - defined(KOKKOS_ENABLE_HIP) -#define KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HIP_GPU -#else -#define KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST -#endif - -#endif //---------------------------------------------------------------------------- // Remove surrounding parentheses if present diff --git a/core/src/Kokkos_MasterLock.hpp b/core/src/Kokkos_MasterLock.hpp deleted file mode 100644 index 1d09617371a..00000000000 --- a/core/src/Kokkos_MasterLock.hpp +++ /dev/null @@ -1,56 +0,0 @@ -//@HEADER -// ************************************************************************ -// -// Kokkos v. 4.0 -// Copyright (2022) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. -// -// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. -// See https://kokkos.org/LICENSE for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//@HEADER - -#ifndef KOKKOS_IMPL_PUBLIC_INCLUDE -#include -static_assert(false, - "Including non-public Kokkos header files is not allowed."); -#endif -#ifndef KOKKOS_MASTER_LOCK_HPP -#define KOKKOS_MASTER_LOCK_HPP - -#include - -#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_3 - -namespace Kokkos { -namespace Experimental { - -// my be used to coordinate work between master instances -// SHOULD NOT be used within a parallel algorithm -// -// This lock should be used with with a scoped lock guard -// i.e. 
std::unique_lock, std::lock_guard -// -// cannot be copied or moved -// has the following functions available -// -// Lock() -// ~Lock() -// -// void lock() -// void unlock() -// bool try_lock() -// -template -class MasterLock; - -} // namespace Experimental -} // namespace Kokkos - -#endif - -#endif // KOKKOS_MASTER_LOCK_HPP diff --git a/core/src/Kokkos_MathematicalConstants.hpp b/core/src/Kokkos_MathematicalConstants.hpp index 51a50d347de..1a77f373fd8 100644 --- a/core/src/Kokkos_MathematicalConstants.hpp +++ b/core/src/Kokkos_MathematicalConstants.hpp @@ -51,24 +51,6 @@ KOKKOS_IMPL_MATH_CONSTANT(phi, 1.618033988749894848204586834365638118L); } // namespace Kokkos::numbers -#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_3 -namespace Kokkos::Experimental { -using Kokkos::numbers::e_v; -using Kokkos::numbers::egamma_v; -using Kokkos::numbers::inv_pi_v; -using Kokkos::numbers::inv_sqrt3_v; -using Kokkos::numbers::inv_sqrtpi_v; -using Kokkos::numbers::ln10_v; -using Kokkos::numbers::ln2_v; -using Kokkos::numbers::log10e_v; -using Kokkos::numbers::log2e_v; -using Kokkos::numbers::phi_v; -using Kokkos::numbers::pi_v; -using Kokkos::numbers::sqrt2_v; -using Kokkos::numbers::sqrt3_v; -} // namespace Kokkos::Experimental -#endif - #ifdef KOKKOS_IMPL_PUBLIC_INCLUDE_NOTDEFINED_MATHCONSTANTS #undef KOKKOS_IMPL_PUBLIC_INCLUDE #undef KOKKOS_IMPL_PUBLIC_INCLUDE_NOTDEFINED_MATHCONSTANTS diff --git a/core/src/Kokkos_MathematicalFunctions.hpp b/core/src/Kokkos_MathematicalFunctions.hpp index ee64c67b93b..3fead8dd293 100644 --- a/core/src/Kokkos_MathematicalFunctions.hpp +++ b/core/src/Kokkos_MathematicalFunctions.hpp @@ -92,16 +92,6 @@ using promote_3_t = typename promote_3::type; #endif #endif -#if defined(KOKKOS_ENABLE_DEPRECATED_CODE_3) -#define KOKKOS_IMPL_MATH_FUNCTIONS_DEFINED_IF_DEPRECATED_CODE_ENABLED( \ - USING_DECLARATIONS_IN_EXPERIMENTAL_NAMESPACE) \ - USING_DECLARATIONS_IN_EXPERIMENTAL_NAMESPACE -#else -#define KOKKOS_IMPL_MATH_FUNCTIONS_DEFINED_IF_DEPRECATED_CODE_ENABLED( \ - USING_DECLARATIONS_IN_EXPERIMENTAL_NAMESPACE) \ - /* nothing */ -#endif - #define KOKKOS_IMPL_MATH_UNARY_FUNCTION(FUNC) \ KOKKOS_INLINE_FUNCTION float FUNC(float x) { \ using KOKKOS_IMPL_MATH_FUNCTIONS_NAMESPACE::FUNC; \ @@ -128,13 +118,7 @@ using promote_3_t = typename promote_3::type; T x) { \ using KOKKOS_IMPL_MATH_FUNCTIONS_NAMESPACE::FUNC; \ return FUNC(static_cast(x)); \ - } \ - KOKKOS_IMPL_MATH_FUNCTIONS_DEFINED_IF_DEPRECATED_CODE_ENABLED( \ - namespace Experimental { \ - using ::Kokkos::FUNC; \ - using ::Kokkos::FUNC##f; \ - using ::Kokkos::FUNC##l; \ - }) + } // isinf, isnan, and isinfinite do not work on Windows with CUDA with std:: // getting warnings about calling host function in device function then @@ -151,9 +135,7 @@ using promote_3_t = typename promote_3::type; KOKKOS_INLINE_FUNCTION std::enable_if_t, bool> FUNC( \ T x) { \ return ::FUNC(static_cast(x)); \ - } \ - KOKKOS_IMPL_MATH_FUNCTIONS_DEFINED_IF_DEPRECATED_CODE_ENABLED( \ - namespace Experimental { using ::Kokkos::FUNC; }) + } #else #define KOKKOS_IMPL_MATH_UNARY_PREDICATE(FUNC) \ KOKKOS_INLINE_FUNCTION bool FUNC(float x) { \ @@ -173,9 +155,7 @@ using promote_3_t = typename promote_3::type; T x) { \ using KOKKOS_IMPL_MATH_FUNCTIONS_NAMESPACE::FUNC; \ return FUNC(static_cast(x)); \ - } \ - KOKKOS_IMPL_MATH_FUNCTIONS_DEFINED_IF_DEPRECATED_CODE_ENABLED( \ - namespace Experimental { using ::Kokkos::FUNC; }) + } #endif #define KOKKOS_IMPL_MATH_BINARY_FUNCTION(FUNC) \ @@ -218,16 +198,10 @@ using promote_3_t = typename promote_3::type; long double> \ FUNC(T1 x, T2 
y) { \ using Promoted = Kokkos::Impl::promote_2_t; \ - static_assert(std::is_same_v, ""); \ + static_assert(std::is_same_v); \ using std::FUNC; \ return FUNC(static_cast(x), static_cast(y)); \ - } \ - KOKKOS_IMPL_MATH_FUNCTIONS_DEFINED_IF_DEPRECATED_CODE_ENABLED( \ - namespace Experimental { \ - using ::Kokkos::FUNC; \ - using ::Kokkos::FUNC##f; \ - using ::Kokkos::FUNC##l; \ - }) + } #define KOKKOS_IMPL_MATH_TERNARY_FUNCTION(FUNC) \ KOKKOS_INLINE_FUNCTION float FUNC(float x, float y, float z) { \ @@ -314,8 +288,6 @@ inline long double abs(long double x) { using std::abs; return abs(x); } -KOKKOS_IMPL_MATH_FUNCTIONS_DEFINED_IF_DEPRECATED_CODE_ENABLED( - namespace Experimental { using ::Kokkos::abs; }) KOKKOS_IMPL_MATH_UNARY_FUNCTION(fabs) KOKKOS_IMPL_MATH_BINARY_FUNCTION(fmod) KOKKOS_IMPL_MATH_BINARY_FUNCTION(remainder) @@ -336,12 +308,6 @@ KOKKOS_INLINE_FUNCTION float nanf(char const*) { return sycl::nan(0u); } KOKKOS_INLINE_FUNCTION double nan(char const*) { return sycl::nan(0ul); } #endif inline long double nanl(char const* arg) { return ::nanl(arg); } -KOKKOS_IMPL_MATH_FUNCTIONS_DEFINED_IF_DEPRECATED_CODE_ENABLED( - namespace Experimental { - using ::Kokkos::nan; - using ::Kokkos::nanf; - using ::Kokkos::nanl; - }) // Exponential functions KOKKOS_IMPL_MATH_UNARY_FUNCTION(exp) // FIXME_NVHPC nvc++ has issues with exp2 @@ -478,7 +444,6 @@ KOKKOS_IMPL_MATH_UNARY_PREDICATE(signbit) // islessgreater // isunordered -#undef KOKKOS_IMPL_MATH_FUNCTIONS_DEFINED_IF_DEPRECATED_CODE_ENABLED #undef KOKKOS_IMPL_MATH_FUNCTIONS_NAMESPACE #undef KOKKOS_IMPL_MATH_UNARY_FUNCTION #undef KOKKOS_IMPL_MATH_UNARY_PREDICATE diff --git a/core/src/Kokkos_MinMaxClamp.hpp b/core/src/Kokkos_MinMax.hpp similarity index 83% rename from core/src/Kokkos_MinMaxClamp.hpp rename to core/src/Kokkos_MinMax.hpp index 37a28a80b68..5c60a88bfb1 100644 --- a/core/src/Kokkos_MinMaxClamp.hpp +++ b/core/src/Kokkos_MinMax.hpp @@ -14,13 +14,8 @@ // //@HEADER -#ifndef KOKKOS_IMPL_PUBLIC_INCLUDE -#include -static_assert(false, - "Including non-public Kokkos header files is not allowed."); -#endif -#ifndef KOKKOS_MIN_MAX_CLAMP_HPP -#define KOKKOS_MIN_MAX_CLAMP_HPP +#ifndef KOKKOS_MIN_MAX_HPP +#define KOKKOS_MIN_MAX_HPP #include #include @@ -29,22 +24,6 @@ static_assert(false, namespace Kokkos { -// clamp -template -constexpr KOKKOS_INLINE_FUNCTION const T& clamp(const T& value, const T& lo, - const T& hi) { - KOKKOS_EXPECTS(!(hi < lo)); - return (value < lo) ? lo : (hi < value) ? hi : value; -} - -template -constexpr KOKKOS_INLINE_FUNCTION const T& clamp(const T& value, const T& lo, - const T& hi, - ComparatorType comp) { - KOKKOS_EXPECTS(!comp(hi, lo)); - return comp(value, lo) ? lo : comp(hi, value) ? 
hi : value; -} - // max template constexpr KOKKOS_INLINE_FUNCTION const T& max(const T& a, const T& b) { @@ -199,15 +178,6 @@ KOKKOS_INLINE_FUNCTION constexpr Kokkos::pair minmax( return result; } -#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_3 -namespace Experimental { -using ::Kokkos::clamp; -using ::Kokkos::max; -using ::Kokkos::min; -using ::Kokkos::minmax; -} // namespace Experimental -#endif - } // namespace Kokkos #endif diff --git a/core/src/Kokkos_Pair.hpp b/core/src/Kokkos_Pair.hpp index 7127c78280e..9be8d8d7aa1 100644 --- a/core/src/Kokkos_Pair.hpp +++ b/core/src/Kokkos_Pair.hpp @@ -28,6 +28,7 @@ #endif #include +#include #include namespace Kokkos { @@ -484,7 +485,6 @@ KOKKOS_FORCEINLINE_FUNCTION constexpr bool operator>=( } namespace Impl { - template struct is_pair_like : std::false_type {}; template diff --git a/core/src/Kokkos_Printf.hpp b/core/src/Kokkos_Printf.hpp index 39f95825c38..63a4cce2aeb 100644 --- a/core/src/Kokkos_Printf.hpp +++ b/core/src/Kokkos_Printf.hpp @@ -30,8 +30,11 @@ namespace Kokkos { // In contrast to std::printf, return void to get a consistent behavior across // backends. The GPU backends always return 1 and NVHPC only compiles if we // don't ask for the return value. +#if defined(KOKKOS_ENABLE_OPENMPTARGET) && defined(KOKKOS_ARCH_INTEL_GPU) +using ::printf; +#else template -KOKKOS_FUNCTION void printf(const char* format, Args... args) { +KOKKOS_FORCEINLINE_FUNCTION void printf(const char* format, Args... args) { #ifdef KOKKOS_ENABLE_SYCL // Some compilers warn if "args" is empty and format is not a string literal if constexpr (sizeof...(Args) == 0) @@ -39,15 +42,13 @@ KOKKOS_FUNCTION void printf(const char* format, Args... args) { else sycl::ext::oneapi::experimental::printf(format, args...); #else - if constexpr (sizeof...(Args) == 0) ::printf("%s", format); - // FIXME_OPENMPTARGET non-string-literal argument used in printf is not - // supported for spir64 -#if !(defined(KOKKOS_ENABLE_OPENMPTARGET) && defined(KOKKOS_ARCH_INTEL_GPU)) + if constexpr (sizeof...(Args) == 0) + ::printf("%s", format); else ::printf(format, args...); #endif -#endif } +#endif } // namespace Kokkos diff --git a/core/src/Kokkos_Profiling_ProfileSection.hpp b/core/src/Kokkos_Profiling_ProfileSection.hpp index 29a04ac3b07..e7a9ba0c7ed 100644 --- a/core/src/Kokkos_Profiling_ProfileSection.hpp +++ b/core/src/Kokkos_Profiling_ProfileSection.hpp @@ -22,49 +22,34 @@ #endif #include -#include #include #include -namespace Kokkos { -namespace Profiling { +namespace Kokkos::Profiling { + +class [[nodiscard]] ProfilingSection { + uint32_t sectionID; -class ProfilingSection { public: ProfilingSection(ProfilingSection const&) = delete; ProfilingSection& operator=(ProfilingSection const&) = delete; - ProfilingSection(const std::string& sectionName) { - if (Kokkos::Profiling::profileLibraryLoaded()) { - Kokkos::Profiling::createProfileSection(sectionName, &secID); - } - } - - void start() { - if (Kokkos::Profiling::profileLibraryLoaded()) { - Kokkos::Profiling::startSection(secID); - } +#if defined(__has_cpp_attribute) && __has_cpp_attribute(nodiscard) >= 201907 + [[nodiscard]] +#endif + explicit ProfilingSection(const std::string& sectionName) { + Kokkos::Profiling::createProfileSection(sectionName, §ionID); } - void stop() { - if (Kokkos::Profiling::profileLibraryLoaded()) { - Kokkos::Profiling::stopSection(secID); - } - } + void start() { Kokkos::Profiling::startSection(sectionID); } - ~ProfilingSection() { - if (Kokkos::Profiling::profileLibraryLoaded()) { - 
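      // (Note the contrast with the rewrite below: the pre-change class
      // guarded every tooling call behind profileLibraryLoaded(), while the
      // new one calls the Kokkos::Profiling hooks unconditionally and keeps
      // only the sectionID member.)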
Kokkos::Profiling::destroyProfileSection(secID); - } - } + void stop() { Kokkos::Profiling::stopSection(sectionID); } - protected: - uint32_t secID; + ~ProfilingSection() { Kokkos::Profiling::destroyProfileSection(sectionID); } }; -} // namespace Profiling -} // namespace Kokkos +} // namespace Kokkos::Profiling #ifdef KOKKOS_IMPL_PUBLIC_INCLUDE_NOTDEFINED_CORE #undef KOKKOS_IMPL_PUBLIC_INCLUDE diff --git a/core/src/Kokkos_Swap.hpp b/core/src/Kokkos_Swap.hpp new file mode 100644 index 00000000000..2f849a13ab6 --- /dev/null +++ b/core/src/Kokkos_Swap.hpp @@ -0,0 +1,68 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOS_SWAP_HPP +#define KOKKOS_SWAP_HPP + +#include + +#include +#include +#include + +namespace Kokkos { + +template +KOKKOS_FUNCTION constexpr std::enable_if_t && + std::is_move_assignable_v> +kokkos_swap(T& a, T& b) noexcept(std::is_nothrow_move_constructible_v&& + std::is_nothrow_move_assignable_v) { + T t(std::move(a)); + a = std::move(b); + b = std::move(t); +} + +namespace Impl { + +template +struct is_swappable { + template + static decltype(kokkos_swap(std::declval(), std::declval())) + test_swap(int); + struct Nope; + template + static Nope test_swap(long); + static constexpr bool value = + !std::is_same_v(0)), Nope>; +}; + +template +inline constexpr bool is_nothrow_swappable_v = + noexcept(kokkos_swap(std::declval(), std::declval())); + +} // namespace Impl + +template +KOKKOS_FUNCTION constexpr std::enable_if_t::value> +kokkos_swap(T (&a)[N], T (&b)[N]) noexcept(Impl::is_nothrow_swappable_v) { + for (std::size_t i = 0; i < N; ++i) { + kokkos_swap(a[i], b[i]); + } +} + +} // namespace Kokkos + +#endif diff --git a/core/src/Kokkos_Tuners.hpp b/core/src/Kokkos_Tuners.hpp index 618401654e7..f5ffc66af5b 100644 --- a/core/src/Kokkos_Tuners.hpp +++ b/core/src/Kokkos_Tuners.hpp @@ -256,13 +256,14 @@ auto get_point_helper(const PointType& in, const ArrayType& indices, template struct GetPoint; -template -struct GetPoint> { +template +struct GetPoint< + PointType, + std::array> { using index_set_type = - std::array; + std::array; static auto build(const PointType& in, const index_set_type& indices) { - return get_point_helper(in, indices, std::make_index_sequence{}); + return get_point_helper(in, indices, std::make_index_sequence{}); } }; diff --git a/core/src/Kokkos_View.hpp b/core/src/Kokkos_View.hpp index bcbb28014cd..484a0e6f62e 100644 --- a/core/src/Kokkos_View.hpp +++ b/core/src/Kokkos_View.hpp @@ -39,7 +39,7 @@ static_assert(false, #ifdef KOKKOS_ENABLE_IMPL_MDSPAN #include #endif -#include +#include //---------------------------------------------------------------------------- //---------------------------------------------------------------------------- @@ -75,25 +75,59 @@ constexpr KOKKOS_INLINE_FUNCTION std::size_t count_valid_integers( (i6 != KOKKOS_INVALID_INDEX) + (i7 != KOKKOS_INVALID_INDEX); } -KOKKOS_INLINE_FUNCTION -void runtime_check_rank(const size_t rank, const size_t dyn_rank, - const bool is_void_spec, const size_t i0, - const size_t i1, const size_t i2, 
const size_t i3, - const size_t i4, const size_t i5, const size_t i6, - const size_t i7, const std::string& label) { +// FIXME Ideally, we would not instantiate this function for every possible View +// type. We should be able to only pass "extent" when we use mdspan. +template +KOKKOS_INLINE_FUNCTION void runtime_check_rank( + const View&, const bool is_void_spec, const size_t i0, const size_t i1, + const size_t i2, const size_t i3, const size_t i4, const size_t i5, + const size_t i6, const size_t i7, const char* label) { (void)(label); if (is_void_spec) { const size_t num_passed_args = count_valid_integers(i0, i1, i2, i3, i4, i5, i6, i7); + // We either allow to pass as many extents as the dynamic rank is, or + // as many extents as the total rank is. In the latter case, the given + // extents for the static dimensions must match the + // compile-time extents. + constexpr int rank = View::rank(); + constexpr int dyn_rank = View::rank_dynamic(); + const bool n_args_is_dyn_rank = num_passed_args == dyn_rank; + const bool n_args_is_rank = num_passed_args == rank; + + if constexpr (rank != dyn_rank) { + if (n_args_is_rank) { + size_t new_extents[8] = {i0, i1, i2, i3, i4, i5, i6, i7}; + for (int i = dyn_rank; i < rank; ++i) + if (new_extents[i] != View::static_extent(i)) { + KOKKOS_IF_ON_HOST( + const std::string message = + "The specified run-time extent for Kokkos::View '" + + std::string(label) + + "' does not match the compile-time extent in dimension " + + std::to_string(i) + ". The given extent is " + + std::to_string(new_extents[i]) + " but should be " + + std::to_string(View::static_extent(i)) + ".\n"; + Kokkos::abort(message.c_str());) + KOKKOS_IF_ON_DEVICE( + Kokkos::abort( + "The specified run-time extents for a Kokkos::View " + "do not match the compile-time extents.");) + } + } + } - if (num_passed_args != dyn_rank && num_passed_args != rank) { + if (!n_args_is_dyn_rank && !n_args_is_rank) { KOKKOS_IF_ON_HOST( const std::string message = - "Constructor for Kokkos View '" + label + - "' has mismatched number of arguments. Number of arguments = " + + "Constructor for Kokkos::View '" + std::string(label) + + "' has mismatched number of arguments. The number " + "of arguments = " + std::to_string(num_passed_args) + - " but dynamic rank = " + std::to_string(dyn_rank) + " \n"; + " neither matches the dynamic rank = " + + std::to_string(dyn_rank) + + " nor the total rank = " + std::to_string(rank) + "\n"; Kokkos::abort(message.c_str());) KOKKOS_IF_ON_DEVICE(Kokkos::abort("Constructor for Kokkos View has " "mismatched number of arguments.");) @@ -814,15 +848,15 @@ class View : public ViewTraits { template static KOKKOS_FUNCTION void check_access_member_function_valid_args(Is...) { - static_assert(rank <= sizeof...(Is), ""); - static_assert(sizeof...(Is) <= 8, ""); - static_assert(Kokkos::Impl::are_integral::value, ""); + static_assert(rank <= sizeof...(Is)); + static_assert(sizeof...(Is) <= 8); + static_assert(Kokkos::Impl::are_integral::value); } template static KOKKOS_FUNCTION void check_operator_parens_valid_args(Is...) 
{ - static_assert(rank == sizeof...(Is), ""); - static_assert(Kokkos::Impl::are_integral::value, ""); + static_assert(rank == sizeof...(Is)); + static_assert(Kokkos::Impl::are_integral::value); } public: @@ -1402,21 +1436,30 @@ class View : public ViewTraits { "execution space"); } - size_t i0 = arg_layout.dimension[0]; - size_t i1 = arg_layout.dimension[1]; - size_t i2 = arg_layout.dimension[2]; - size_t i3 = arg_layout.dimension[3]; - size_t i4 = arg_layout.dimension[4]; - size_t i5 = arg_layout.dimension[5]; - size_t i6 = arg_layout.dimension[6]; - size_t i7 = arg_layout.dimension[7]; - - const std::string& alloc_name = - Impl::get_property(prop_copy); - Impl::runtime_check_rank( - rank, rank_dynamic, - std::is_same::value, i0, i1, i2, i3, - i4, i5, i6, i7, alloc_name); +#ifdef KOKKOS_ENABLE_DEBUG_BOUNDS_CHECK + if constexpr (std::is_same_v || + std::is_same_v || + std::is_same_v || + is_layouttiled::value) { + size_t i0 = arg_layout.dimension[0]; + size_t i1 = arg_layout.dimension[1]; + size_t i2 = arg_layout.dimension[2]; + size_t i3 = arg_layout.dimension[3]; + size_t i4 = arg_layout.dimension[4]; + size_t i5 = arg_layout.dimension[5]; + size_t i6 = arg_layout.dimension[6]; + size_t i7 = arg_layout.dimension[7]; + + const std::string& alloc_name = + Impl::get_property(prop_copy); + Impl::runtime_check_rank( + *this, std::is_same::value, i0, i1, + i2, i3, i4, i5, i6, i7, alloc_name.c_str()); + } +#endif Kokkos::Impl::SharedAllocationRecord<>* record = m_map.allocate_shared( prop_copy, arg_layout, Impl::ViewCtorProp::has_execution_space); @@ -1445,6 +1488,29 @@ class View : public ViewTraits { typename Impl::ViewCtorProp::pointer_type>::value, "Constructing View to wrap user memory must supply matching pointer " "type"); + +#ifdef KOKKOS_ENABLE_DEBUG_BOUNDS_CHECK + if constexpr (std::is_same_v || + std::is_same_v || + std::is_same_v || + is_layouttiled::value) { + size_t i0 = arg_layout.dimension[0]; + size_t i1 = arg_layout.dimension[1]; + size_t i2 = arg_layout.dimension[2]; + size_t i3 = arg_layout.dimension[3]; + size_t i4 = arg_layout.dimension[4]; + size_t i5 = arg_layout.dimension[5]; + size_t i6 = arg_layout.dimension[6]; + size_t i7 = arg_layout.dimension[7]; + + Impl::runtime_check_rank( + *this, std::is_same::value, i0, i1, + i2, i3, i4, i5, i6, i7, "UNMANAGED"); + } +#endif } // Simple dimension-only layout diff --git a/core/src/OpenACC/Kokkos_OpenACC.cpp b/core/src/OpenACC/Kokkos_OpenACC.cpp index f54c44d66f0..99daf379b6f 100644 --- a/core/src/OpenACC/Kokkos_OpenACC.cpp +++ b/core/src/OpenACC/Kokkos_OpenACC.cpp @@ -58,8 +58,10 @@ void Kokkos::Experimental::OpenACC::impl_initialize( Impl::OpenACCInternal::m_acc_device_num = acc_get_device_num(acc_device_host); } else { + using Kokkos::Impl::get_visible_devices; + std::vector const& visible_devices = get_visible_devices(); using Kokkos::Impl::get_gpu; - int const dev_num = get_gpu(settings); + int const dev_num = get_gpu(settings).value_or(visible_devices[0]); acc_set_device_num(dev_num, Impl::OpenACC_Traits::dev_type); Impl::OpenACCInternal::m_acc_device_num = dev_num; } diff --git a/core/src/OpenACC/Kokkos_OpenACC.hpp b/core/src/OpenACC/Kokkos_OpenACC.hpp index b012f6a42a4..5155bee33dc 100644 --- a/core/src/OpenACC/Kokkos_OpenACC.hpp +++ b/core/src/OpenACC/Kokkos_OpenACC.hpp @@ -91,7 +91,11 @@ class OpenACC { #else int concurrency() const { return 256000; } // FIXME_OPENACC #endif - static bool in_parallel() { return acc_on_device(acc_device_not_host); } +#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 + KOKKOS_DEPRECATED static 
bool in_parallel() { + return acc_on_device(acc_device_not_host); + } +#endif uint32_t impl_instance_id() const noexcept; Impl::OpenACCInternal* impl_internal_space_instance() const { return m_space_instance.get(); diff --git a/core/src/OpenACC/Kokkos_OpenACCSpace.cpp b/core/src/OpenACC/Kokkos_OpenACCSpace.cpp index 141ec77fd1f..acc0dcd3c6e 100644 --- a/core/src/OpenACC/Kokkos_OpenACCSpace.cpp +++ b/core/src/OpenACC/Kokkos_OpenACCSpace.cpp @@ -19,8 +19,8 @@ #include #include #include -#include #include +#include #include @@ -66,6 +66,19 @@ void *Kokkos::Experimental::OpenACCSpace::impl_allocate( ptr = acc_malloc(arg_alloc_size); + if (!ptr) { + size_t alignment = 1; // OpenACC does not handle alignment + using Kokkos::Experimental::RawMemoryAllocationFailure; + auto failure_mode = + arg_alloc_size > 0 + ? RawMemoryAllocationFailure::FailureMode::OutOfMemoryError + : RawMemoryAllocationFailure::FailureMode::InvalidAllocationSize; + auto alloc_mechanism = + RawMemoryAllocationFailure::AllocationMechanism::OpenACCMalloc; + throw RawMemoryAllocationFailure(arg_alloc_size, alignment, failure_mode, + alloc_mechanism); + } + if (Kokkos::Profiling::profileLibraryLoaded()) { const size_t reported_size = (arg_logical_size > 0) ? arg_logical_size : arg_alloc_size; diff --git a/core/src/OpenACC/Kokkos_OpenACC_DeepCopy.hpp b/core/src/OpenACC/Kokkos_OpenACC_DeepCopy.hpp index 4aed7e00f76..ca022192b0b 100644 --- a/core/src/OpenACC/Kokkos_OpenACC_DeepCopy.hpp +++ b/core/src/OpenACC/Kokkos_OpenACC_DeepCopy.hpp @@ -34,7 +34,7 @@ struct Kokkos::Impl::DeepCopy 0) { - acc_memcpy_device(dst, const_cast(src), n); + acc_memcpy_device_async(dst, const_cast(src), n, acc_async_noval); } } DeepCopy(const Kokkos::Experimental::OpenACC& exec, void* dst, @@ -52,7 +52,7 @@ struct Kokkos::Impl::DeepCopy { DeepCopy(void* dst, const void* src, size_t n) { if (n > 0) { - acc_memcpy_device(dst, const_cast(src), n); + acc_memcpy_device_async(dst, const_cast(src), n, acc_async_noval); } } DeepCopy(const ExecutionSpace& exec, void* dst, const void* src, size_t n) { @@ -60,7 +60,7 @@ struct Kokkos::Impl::DeepCopy::DeepCopy: fence before copy"); if (n > 0) { - acc_memcpy_device(dst, const_cast(src), n); + acc_memcpy_device_async(dst, const_cast(src), n, acc_async_noval); } } }; @@ -70,7 +70,9 @@ struct Kokkos::Impl::DeepCopy { DeepCopy(void* dst, const void* src, size_t n) { - if (n > 0) acc_memcpy_to_device(dst, const_cast(src), n); + if (n > 0) + acc_memcpy_to_device_async(dst, const_cast(src), n, + acc_async_noval); } DeepCopy(const Kokkos::Experimental::OpenACC& exec, void* dst, const void* src, size_t n) { @@ -85,7 +87,8 @@ struct Kokkos::Impl::DeepCopy { DeepCopy(void* dst, const void* src, size_t n) { if (n > 0) { - acc_memcpy_to_device(dst, const_cast(src), n); + acc_memcpy_to_device_async(dst, const_cast(src), n, + acc_async_noval); } } DeepCopy(const ExecutionSpace& exec, void* dst, const void* src, size_t n) { @@ -93,7 +96,8 @@ struct Kokkos::Impl::DeepCopy::DeepCopy: fence before copy"); if (n > 0) { - acc_memcpy_to_device(dst, const_cast(src), n); + acc_memcpy_to_device_async(dst, const_cast(src), n, + acc_async_noval); } } }; @@ -104,7 +108,8 @@ struct Kokkos::Impl::DeepCopy { DeepCopy(void* dst, const void* src, size_t n) { if (n > 0) { - acc_memcpy_from_device(dst, const_cast(src), n); + acc_memcpy_from_device_async(dst, const_cast(src), n, + acc_async_noval); } } DeepCopy(const Kokkos::Experimental::OpenACC& exec, void* dst, @@ -120,14 +125,17 @@ template struct Kokkos::Impl::DeepCopy< Kokkos::HostSpace, 
Kokkos::Experimental::OpenACCSpace, ExecutionSpace> { DeepCopy(void* dst, const void* src, size_t n) { - if (n > 0) acc_memcpy_from_device(dst, const_cast(src), n); + if (n > 0) + acc_memcpy_from_device_async(dst, const_cast(src), n, + acc_async_noval); } DeepCopy(const ExecutionSpace& exec, void* dst, const void* src, size_t n) { exec.fence( "Kokkos::Impl::DeepCopy::DeepCopy: fence before copy"); if (n > 0) { - acc_memcpy_from_device(dst, const_cast(src), n); + acc_memcpy_from_device_async(dst, const_cast(src), n, + acc_async_noval); } } }; diff --git a/core/src/OpenACC/Kokkos_OpenACC_Instance.hpp b/core/src/OpenACC/Kokkos_OpenACC_Instance.hpp index 6645616ba51..c3d72368727 100644 --- a/core/src/OpenACC/Kokkos_OpenACC_Instance.hpp +++ b/core/src/OpenACC/Kokkos_OpenACC_Instance.hpp @@ -35,7 +35,7 @@ class OpenACCInternal { public: static int m_acc_device_num; - int m_async_arg = acc_async_sync; + int m_async_arg = acc_async_noval; OpenACCInternal() = default; @@ -43,7 +43,7 @@ class OpenACCInternal { bool verify_is_initialized(const char* const label) const; - void initialize(int async_arg = acc_async_sync); + void initialize(int async_arg = acc_async_noval); void finalize(); bool is_initialized() const; diff --git a/core/src/OpenACC/Kokkos_OpenACC_ParallelReduce_MDRange.hpp b/core/src/OpenACC/Kokkos_OpenACC_ParallelReduce_MDRange.hpp index 2c7793dc116..5afb5e75d39 100644 --- a/core/src/OpenACC/Kokkos_OpenACC_ParallelReduce_MDRange.hpp +++ b/core/src/OpenACC/Kokkos_OpenACC_ParallelReduce_MDRange.hpp @@ -31,7 +31,7 @@ template ::value, + static_assert(Kokkos::Impl::always_false::value, "not implemented"); } }; @@ -136,6 +136,7 @@ class Kokkos::Impl::ParallelReduce> struct OpenACCParallelReduceHelper { OpenACCParallelReduceHelper(Functor const&, Reducer const&, Policy const&) { - static_assert(!Kokkos::Impl::always_true::value, + static_assert(Kokkos::Impl::always_false::value, "not implemented"); } }; @@ -140,6 +140,7 @@ class Kokkos::Impl::ParallelReduce::value, + static_assert(Kokkos::Impl::always_false::value, "not implemented"); } }; @@ -129,7 +129,7 @@ KOKKOS_INLINE_FUNCTION void parallel_reduce( const Impl::TeamThreadRangeBoundariesStruct& loop_boundaries, const Lambda& lambda, const JoinType& join, ValueType& init_result) { - static_assert(!Kokkos::Impl::always_true::value, + static_assert(Kokkos::Impl::always_false::value, "custom reduction is not implemented"); } @@ -140,7 +140,7 @@ KOKKOS_INLINE_FUNCTION void parallel_reduce( const Impl::ThreadVectorRangeBoundariesStruct< iType, Impl::OpenACCTeamMember>& loop_boundaries, const Lambda& lambda, const JoinType& join, ValueType& init_result) { - static_assert(!Kokkos::Impl::always_true::value, + static_assert(Kokkos::Impl::always_false::value, "custom reduction is not implemented"); } @@ -394,6 +394,7 @@ KOKKOS_INLINE_FUNCTION void parallel_reduce( vector_length); \ functor(team, val); \ } \ + acc_wait(async_arg); \ aval = val; \ } \ } // namespace Kokkos::Experimental::Impl diff --git a/core/src/OpenACC/Kokkos_OpenACC_SharedAllocationRecord.cpp b/core/src/OpenACC/Kokkos_OpenACC_SharedAllocationRecord.cpp index 91faa64f733..76e1514476a 100644 --- a/core/src/OpenACC/Kokkos_OpenACC_SharedAllocationRecord.cpp +++ b/core/src/OpenACC/Kokkos_OpenACC_SharedAllocationRecord.cpp @@ -16,92 +16,11 @@ #define KOKKOS_IMPL_PUBLIC_INCLUDE -#include +#include #include -#include -#include - -#ifdef KOKKOS_ENABLE_DEBUG -Kokkos::Impl::SharedAllocationRecord SharedAllocationRecord< - Kokkos::Experimental::OpenACCSpace, void>::s_root_record; -#endif - 
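// [Editorial aside, not part of the patch] The DeepCopy specializations
// above now enqueue acc_memcpy_*_async on the acc_async_noval queue rather
// than blocking, matching the new default of OpenACCInternal::m_async_arg.
// A minimal sketch of the two public idioms this affects; the function and
// view names are hypothetical:

#include <Kokkos_Core.hpp>

void copy_back_example(int n) {
  Kokkos::View<double*, Kokkos::Experimental::OpenACCSpace> dev("dev", n);
  auto host = Kokkos::create_mirror_view(dev);

  // Blocking form: fences internally, so the data is ready on return.
  Kokkos::deep_copy(host, dev);

  // Instance form: the copy is ordered on acc's queue and may still be in
  // flight until that instance is fenced.
  Kokkos::Experimental::OpenACC acc;
  Kokkos::deep_copy(acc, host, dev);
  acc.fence("wait for asynchronous copy-back");
}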
-Kokkos::Impl::SharedAllocationRecord<Kokkos::Experimental::OpenACCSpace,
-                                     void>::~SharedAllocationRecord() {
-  m_space.deallocate(m_label.c_str(),
-                     SharedAllocationRecord<void, void>::m_alloc_ptr,
-                     (SharedAllocationRecord<void, void>::m_alloc_size -
-                      sizeof(SharedAllocationHeader)));
-}
-
-Kokkos::Impl::SharedAllocationRecord<Kokkos::Experimental::OpenACCSpace, void>::
-    SharedAllocationRecord(
-        const Kokkos::Experimental::OpenACCSpace &arg_space,
-        const std::string &arg_label, const size_t arg_alloc_size,
-        const SharedAllocationRecord<void, void>::function_type arg_dealloc)
-    // Pass through allocated [ SharedAllocationHeader , user_memory ]
-    // Pass through deallocation function
-    : base_t(
-#ifdef KOKKOS_ENABLE_DEBUG
-          &SharedAllocationRecord<Kokkos::Experimental::OpenACCSpace,
-                                  void>::s_root_record,
-#endif
-          Impl::checked_allocation_with_header(arg_space, arg_label,
-                                               arg_alloc_size),
-          sizeof(SharedAllocationHeader) + arg_alloc_size, arg_dealloc,
-          arg_label),
-      m_space(arg_space) {
-  SharedAllocationHeader header;
-
-  this->base_t::_fill_host_accessible_header_info(header, arg_label);
-
-  Kokkos::Impl::DeepCopy<Kokkos::Experimental::OpenACCSpace, HostSpace>(
-      RecordBase::m_alloc_ptr, &header, sizeof(SharedAllocationHeader));
-  Kokkos::fence(
-      "SharedAllocationRecord<Kokkos::Experimental::OpenACCSpace, "
-      "void>::SharedAllocationRecord(): fence after copying header from "
-      "HostSpace");
-}
-
-Kokkos::Impl::SharedAllocationRecord<Kokkos::Experimental::OpenACCSpace, void>::
-    SharedAllocationRecord(
-        const Kokkos::Experimental::OpenACC &arg_exec_space,
-        const Kokkos::Experimental::OpenACCSpace &arg_space,
-        const std::string &arg_label, const size_t arg_alloc_size,
-        const SharedAllocationRecord<void, void>::function_type arg_dealloc)
-    // Pass through allocated [ SharedAllocationHeader , user_memory ]
-    // Pass through deallocation function
-    : base_t(
-#ifdef KOKKOS_ENABLE_DEBUG
-          &SharedAllocationRecord<Kokkos::Experimental::OpenACCSpace,
-                                  void>::s_root_record,
-#endif
-          Impl::checked_allocation_with_header(arg_exec_space, arg_space,
-                                               arg_label, arg_alloc_size),
-          sizeof(SharedAllocationHeader) + arg_alloc_size, arg_dealloc,
-          arg_label),
-      m_space(arg_space) {
-  SharedAllocationHeader header;
-
-  this->base_t::_fill_host_accessible_header_info(header, arg_label);
-
-  Kokkos::Impl::DeepCopy<Kokkos::Experimental::OpenACCSpace, HostSpace>(
-      arg_exec_space, RecordBase::m_alloc_ptr, &header,
-      sizeof(SharedAllocationHeader));
-}
-
-//==============================================================================
-// <editor-fold desc="Explicit instantiations of CRTP Base classes"> {{{1
+#include
 
 #include
 
-// To avoid additional compilation cost for something that's (mostly?) not
-// performance sensitive, we explicitly instantiate these CRTP base classes
-// here, where we have access to the associated *_timpl.hpp header files.
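// [Editorial aside] The explicit-instantiation macro invoked below
// (KOKKOS_IMPL_HOST_INACCESSIBLE_SHARED_ALLOCATION_RECORD_EXPLICIT_INSTANTIATION)
// replaces exactly the pair of instantiations being deleted. A sketch of
// the pattern (illustrative only; the actual Kokkos definition may differ):

#define EXAMPLE_EXPLICIT_INSTANTIATION(SPACE)              \
  template class Kokkos::Impl::                            \
      HostInaccessibleSharedAllocationRecordCommon<SPACE>; \
  template class Kokkos::Impl::SharedAllocationRecordCommon<SPACE>

// Example usage, mirroring the call sites in this patch:
// EXAMPLE_EXPLICIT_INSTANTIATION(Kokkos::Experimental::OpenACCSpace);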
-template class Kokkos::Impl::HostInaccessibleSharedAllocationRecordCommon<
-    Kokkos::Experimental::OpenACCSpace>;
-template class Kokkos::Impl::SharedAllocationRecordCommon<
-    Kokkos::Experimental::OpenACCSpace>;
-
-// end Explicit instantiations of CRTP Base classes }}}1
-//==============================================================================
+KOKKOS_IMPL_HOST_INACCESSIBLE_SHARED_ALLOCATION_RECORD_EXPLICIT_INSTANTIATION(
+    Kokkos::Experimental::OpenACCSpace);
diff --git a/core/src/OpenACC/Kokkos_OpenACC_SharedAllocationRecord.hpp b/core/src/OpenACC/Kokkos_OpenACC_SharedAllocationRecord.hpp
index cf83a5b27bc..cde5ecdcb77 100644
--- a/core/src/OpenACC/Kokkos_OpenACC_SharedAllocationRecord.hpp
+++ b/core/src/OpenACC/Kokkos_OpenACC_SharedAllocationRecord.hpp
@@ -20,55 +20,7 @@
 #include
 #include
-#include
-
-template <>
-class Kokkos::Impl::SharedAllocationRecord<Kokkos::Experimental::OpenACCSpace,
-                                           void>
-    : public HostInaccessibleSharedAllocationRecordCommon<
-          Kokkos::Experimental::OpenACCSpace> {
- private:
-  friend class HostInaccessibleSharedAllocationRecordCommon<
-      Kokkos::Experimental::OpenACCSpace>;
-  friend class SharedAllocationRecordCommon<
-      Kokkos::Experimental::OpenACCSpace>;
-  friend Kokkos::Experimental::OpenACCSpace;
-
-  using base_t = HostInaccessibleSharedAllocationRecordCommon<
-      Kokkos::Experimental::OpenACCSpace>;
-  using RecordBase = SharedAllocationRecord<void, void>;
-
-  SharedAllocationRecord(const SharedAllocationRecord&) = delete;
-  SharedAllocationRecord& operator=(const SharedAllocationRecord&) = delete;
-
-  /**\brief Root record for tracked allocations from this OpenACCSpace
-   * instance */
-  static RecordBase s_root_record;
-
-  const Kokkos::Experimental::OpenACCSpace m_space;
-
- protected:
-  ~SharedAllocationRecord();
-  SharedAllocationRecord() = default;
-
-  template <class ExecutionSpace>
-  SharedAllocationRecord(
-      const ExecutionSpace& /*exec_space*/,
-      const Kokkos::Experimental::OpenACCSpace& arg_space,
-      const std::string& arg_label, const size_t arg_alloc_size,
-      const RecordBase::function_type arg_dealloc = &deallocate)
-      : SharedAllocationRecord(arg_space, arg_label, arg_alloc_size,
-                               arg_dealloc) {}
-
-  SharedAllocationRecord(
-      const Kokkos::Experimental::OpenACC& exec_space,
-      const Kokkos::Experimental::OpenACCSpace& arg_space,
-      const std::string& arg_label, const size_t arg_alloc_size,
-      const RecordBase::function_type arg_dealloc = &deallocate);
-
-  SharedAllocationRecord(
-      const Kokkos::Experimental::OpenACCSpace& arg_space,
-      const std::string& arg_label, const size_t arg_alloc_size,
-      const RecordBase::function_type arg_dealloc = &deallocate);
-};
+KOKKOS_IMPL_HOST_INACCESSIBLE_SHARED_ALLOCATION_SPECIALIZATION(
+    Kokkos::Experimental::OpenACCSpace);
 
 #endif
diff --git a/core/src/OpenACC/Kokkos_OpenACC_Team.hpp b/core/src/OpenACC/Kokkos_OpenACC_Team.hpp
index 4ec71f56ef6..20ea392452b 100644
--- a/core/src/OpenACC/Kokkos_OpenACC_Team.hpp
+++ b/core/src/OpenACC/Kokkos_OpenACC_Team.hpp
@@ -82,7 +82,7 @@ class OpenACCTeamMember {
   // FIXME_OPENACC: team_broadcast() is not implemented.
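// [Editorial aside] The static_assert changes below swap
// !Kokkos::Impl::always_true<...> for Kokkos::Impl::always_false<...>.
// Either way, the point is a *dependent* false: under C++17 a plain
// static_assert(false, ...) fires even when the template is never
// instantiated. A self-contained sketch of the idiom with a mock trait:

#include <type_traits>

template <class...>
struct mock_always_false : std::false_type {};

template <class T>
void not_implemented(T) {
  // Fails to compile only if this template is actually instantiated.
  static_assert(mock_always_false<T>::value, "not implemented");
}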
template KOKKOS_FUNCTION void team_broadcast(ValueType& value, int thread_id) const { - static_assert(!Kokkos::Impl::always_true::value, + static_assert(Kokkos::Impl::always_false::value, "Kokkos Error: team_broadcast() is not implemented for the " "OpenACC backend"); return ValueType(); @@ -99,7 +99,7 @@ class OpenACCTeamMember { template KOKKOS_FUNCTION ValueType team_reduce(const ValueType& value, const JoinOp& op_in) const { - static_assert(!Kokkos::Impl::always_true::value, + static_assert(Kokkos::Impl::always_false::value, "Kokkos Error: team_reduce() is not implemented for the " "OpenACC backend"); return ValueType(); @@ -110,7 +110,7 @@ class OpenACCTeamMember { KOKKOS_FUNCTION ArgType team_scan(const ArgType& /*value*/, ArgType* const /*global_accum*/) const { static_assert( - !Kokkos::Impl::always_true::value, + Kokkos::Impl::always_false::value, "Kokkos Error: team_scan() is not implemented for the OpenACC backend"); return ArgType(); } diff --git a/core/src/OpenMP/Kokkos_OpenMP.cpp b/core/src/OpenMP/Kokkos_OpenMP.cpp index 9a169a435c7..81f2c5c3056 100644 --- a/core/src/OpenMP/Kokkos_OpenMP.cpp +++ b/core/src/OpenMP/Kokkos_OpenMP.cpp @@ -81,29 +81,16 @@ bool OpenMP::impl_is_initialized() noexcept { return Impl::OpenMPInternal::singleton().is_initialized(); } -bool OpenMP::in_parallel(OpenMP const &exec_space) noexcept { -#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_3 - return ( - (exec_space.impl_internal_space_instance()->m_level < omp_get_level()) && - (!Impl::t_openmp_instance || - Impl::t_openmp_instance->m_level < omp_get_level())); -#else +#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 +KOKKOS_DEPRECATED bool OpenMP::in_parallel(OpenMP const &exec_space) noexcept { return exec_space.impl_internal_space_instance()->m_level < omp_get_level(); -#endif } +#endif int OpenMP::impl_thread_pool_size() const noexcept { -#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_3 - return OpenMP::in_parallel(*this) - ? omp_get_num_threads() - : (Impl::t_openmp_instance - ? Impl::t_openmp_instance->m_pool_size - : impl_internal_space_instance()->m_pool_size); -#else - return OpenMP::in_parallel(*this) + return (impl_internal_space_instance()->get_level() < omp_get_level()) ? omp_get_num_threads() : impl_internal_space_instance()->m_pool_size; -#endif } int OpenMP::impl_max_hardware_threads() noexcept { diff --git a/core/src/OpenMP/Kokkos_OpenMP.hpp b/core/src/OpenMP/Kokkos_OpenMP.hpp index 594f40d5245..11292af84ad 100644 --- a/core/src/OpenMP/Kokkos_OpenMP.hpp +++ b/core/src/OpenMP/Kokkos_OpenMP.hpp @@ -27,14 +27,7 @@ static_assert(false, #include -#include -#include #include - -#ifdef KOKKOS_ENABLE_HBWSPACE -#include -#endif - #include #include #include @@ -45,6 +38,8 @@ static_assert(false, #include +#include +#include #include /*--------------------------------------------------------------------------*/ @@ -53,11 +48,6 @@ namespace Kokkos { namespace Impl { class OpenMPInternal; - -#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_3 -// FIXME_OPENMP we can remove this after we remove partition_master -inline thread_local OpenMPInternal* t_openmp_instance = nullptr; -#endif } // namespace Impl /// \class OpenMP @@ -67,12 +57,7 @@ class OpenMP { //! Tag this class as a kokkos execution space using execution_space = OpenMP; - using memory_space = -#ifdef KOKKOS_ENABLE_HBWSPACE - Experimental::HBWSpace; -#else - HostSpace; -#endif + using memory_space = HostSpace; //! 
This execution space preferred device_type using device_type = Kokkos::Device; @@ -87,8 +72,10 @@ class OpenMP { /// \brief Print configuration information to the given output stream. void print_configuration(std::ostream& os, bool verbose = false) const; +#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 /// \brief is the instance running a parallel algorithm - static bool in_parallel(OpenMP const& = OpenMP()) noexcept; + KOKKOS_DEPRECATED static bool in_parallel(OpenMP const& = OpenMP()) noexcept; +#endif /// \brief Wait until all dispatched functors complete on the given instance /// @@ -104,18 +91,6 @@ class OpenMP { /// This always returns false on OpenMP inline static bool is_asynchronous(OpenMP const& = OpenMP()) noexcept; -#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_3 - /// \brief Partition the default instance and call 'f' on each new 'master' - /// thread - /// - /// Func is a functor with the following signiture - /// void( int partition_id, int num_partitions ) - template - KOKKOS_DEPRECATED static void partition_master( - F const& f, int requested_num_partitions = 0, - int requested_partition_size = 0); -#endif - #ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 static int concurrency(OpenMP const& = OpenMP()); #else @@ -166,14 +141,7 @@ class OpenMP { }; inline int OpenMP::impl_thread_pool_rank() noexcept { - // FIXME_OPENMP Can we remove this when removing partition_master? It's only - // used in one partition_master test -#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_3 - KOKKOS_IF_ON_HOST( - (return Impl::t_openmp_instance ? 0 : omp_get_thread_num();)) -#else KOKKOS_IF_ON_HOST((return omp_get_thread_num();)) -#endif KOKKOS_IF_ON_DEVICE((return -1;)) } diff --git a/core/src/OpenMP/Kokkos_OpenMP_Instance.cpp b/core/src/OpenMP/Kokkos_OpenMP_Instance.cpp index 12bf3b71f7c..32172fbc6c7 100644 --- a/core/src/OpenMP/Kokkos_OpenMP_Instance.cpp +++ b/core/src/OpenMP/Kokkos_OpenMP_Instance.cpp @@ -47,61 +47,6 @@ void OpenMPInternal::release_lock() { desul::MemoryScopeDevice()); } -#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_3 -void OpenMPInternal::validate_partition_impl(const int nthreads, - int &num_partitions, - int &partition_size) { - if (nthreads == 1) { - num_partitions = 1; - partition_size = 1; - } else if (num_partitions < 1 && partition_size < 1) { - int idle = nthreads; - for (int np = 2; np <= nthreads; ++np) { - for (int ps = 1; ps <= nthreads / np; ++ps) { - if (nthreads - np * ps < idle) { - idle = nthreads - np * ps; - num_partitions = np; - partition_size = ps; - } - if (idle == 0) { - break; - } - } - } - } else if (num_partitions < 1 && partition_size > 0) { - if (partition_size <= nthreads) { - num_partitions = nthreads / partition_size; - } else { - num_partitions = 1; - partition_size = nthreads; - } - } else if (num_partitions > 0 && partition_size < 1) { - if (num_partitions <= nthreads) { - partition_size = nthreads / num_partitions; - } else { - num_partitions = nthreads; - partition_size = 1; - } - } else if (num_partitions * partition_size > nthreads) { - int idle = nthreads; - const int NP = num_partitions; - const int PS = partition_size; - for (int np = NP; np > 0; --np) { - for (int ps = PS; ps > 0; --ps) { - if ((np * ps <= nthreads) && (nthreads - np * ps < idle)) { - idle = nthreads - np * ps; - num_partitions = np; - partition_size = ps; - } - if (idle == 0) { - break; - } - } - } - } -} -#endif - void OpenMPInternal::clear_thread_data() { const size_t member_bytes = sizeof(int64_t) * diff --git a/core/src/OpenMP/Kokkos_OpenMP_Instance.hpp b/core/src/OpenMP/Kokkos_OpenMP_Instance.hpp index 
03f5fff395a..35b9aa93ba7 100644 --- a/core/src/OpenMP/Kokkos_OpenMP_Instance.hpp +++ b/core/src/OpenMP/Kokkos_OpenMP_Instance.hpp @@ -41,16 +41,6 @@ #include /*--------------------------------------------------------------------------*/ -namespace Kokkos { -namespace Impl { - -inline bool execute_in_serial(OpenMP const& space = OpenMP()) { - return (OpenMP::in_parallel(space) && - !(omp_get_nested() && (omp_get_level() == 1))); -} - -} // namespace Impl -} // namespace Kokkos namespace Kokkos { namespace Impl { @@ -99,11 +89,6 @@ class OpenMPInternal { // Release lock used to protect access to m_pool void release_lock(); -#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_3 - static void validate_partition_impl(const int nthreads, int& num_partitions, - int& partition_size); -#endif - void resize_thread_data(size_t pool_reduce_bytes, size_t team_reduce_bytes, size_t team_shared_bytes, size_t thread_local_bytes); @@ -115,6 +100,8 @@ class OpenMPInternal { return m_pool[i]; } + int get_level() const { return m_level; } + bool is_initialized() const { return m_initialized; } bool verify_is_initialized(const char* const label) const; @@ -122,32 +109,20 @@ class OpenMPInternal { void print_configuration(std::ostream& s) const; }; -} // namespace Impl - -namespace Experimental { - -#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_3 -template <> -class MasterLock { - public: - void lock() { omp_set_lock(&m_lock); } - void unlock() { omp_unset_lock(&m_lock); } - bool try_lock() { return static_cast(omp_test_lock(&m_lock)); } - - KOKKOS_DEPRECATED MasterLock() { omp_init_lock(&m_lock); } - ~MasterLock() { omp_destroy_lock(&m_lock); } - - MasterLock(MasterLock const&) = delete; - MasterLock(MasterLock&&) = delete; - MasterLock& operator=(MasterLock const&) = delete; - MasterLock& operator=(MasterLock&&) = delete; - - private: - omp_lock_t m_lock; -}; +inline bool execute_in_serial(OpenMP const& space = OpenMP()) { +// The default value returned by `omp_get_max_active_levels` with gcc version +// lower than 11.1.0 is 2147483647 instead of 1. 
+#if (!defined(KOKKOS_COMPILER_GNU) || KOKKOS_COMPILER_GNU >= 1110) && \ + _OPENMP >= 201511 + bool is_nested = omp_get_max_active_levels() > 1; +#else + bool is_nested = static_cast(omp_get_nested()); #endif + return (space.impl_internal_space_instance()->get_level() < omp_get_level() && + !(is_nested && (omp_get_level() == 1))); +} -} // namespace Experimental +} // namespace Impl namespace Experimental { namespace Impl { @@ -202,50 +177,6 @@ std::vector partition_space(OpenMP const& main_instance, return Impl::create_OpenMP_instances(main_instance, weights); } } // namespace Experimental - -#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_3 -template -KOKKOS_DEPRECATED void OpenMP::partition_master(F const& f, int num_partitions, - int partition_size) { -#if _OPENMP >= 201511 - if (omp_get_max_active_levels() > 1) { -#else - if (omp_get_nested()) { -#endif - using Exec = Impl::OpenMPInternal; - - Exec* prev_instance = &Impl::OpenMPInternal::singleton(); - - Exec::validate_partition_impl(prev_instance->m_pool_size, num_partitions, - partition_size); - - OpenMP::memory_space space; - -#pragma omp parallel num_threads(num_partitions) - { - Exec thread_local_instance(partition_size); - Impl::t_openmp_instance = &thread_local_instance; - - size_t pool_reduce_bytes = 32 * partition_size; - size_t team_reduce_bytes = 32 * partition_size; - size_t team_shared_bytes = 1024 * partition_size; - size_t thread_local_bytes = 1024; - - thread_local_instance.resize_thread_data( - pool_reduce_bytes, team_reduce_bytes, team_shared_bytes, - thread_local_bytes); - - omp_set_num_threads(partition_size); - f(omp_get_thread_num(), omp_get_num_threads()); - Impl::t_openmp_instance = nullptr; - } - } else { - // nested openmp not enabled - f(0, 1); - } -} -#endif - } // namespace Kokkos #endif diff --git a/core/src/OpenMP/Kokkos_OpenMP_Parallel_For.hpp b/core/src/OpenMP/Kokkos_OpenMP_Parallel_For.hpp index 96dc664eb79..823a7e668e5 100644 --- a/core/src/OpenMP/Kokkos_OpenMP_Parallel_For.hpp +++ b/core/src/OpenMP/Kokkos_OpenMP_Parallel_For.hpp @@ -147,15 +147,7 @@ class ParallelFor, Kokkos::OpenMP> { inline ParallelFor(const FunctorType& arg_functor, Policy arg_policy) : m_instance(nullptr), m_functor(arg_functor), m_policy(arg_policy) { -#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_3 - if (t_openmp_instance) { - m_instance = t_openmp_instance; - } else { - m_instance = arg_policy.space().impl_internal_space_instance(); - } -#else m_instance = arg_policy.space().impl_internal_space_instance(); -#endif } }; @@ -251,16 +243,9 @@ class ParallelFor, inline ParallelFor(const FunctorType& arg_functor, MDRangePolicy arg_policy) : m_instance(nullptr), m_iter(arg_policy, arg_functor) { -#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_3 - if (t_openmp_instance) { - m_instance = t_openmp_instance; - } else { - m_instance = arg_policy.space().impl_internal_space_instance(); - } -#else m_instance = arg_policy.space().impl_internal_space_instance(); -#endif } + template static int max_tile_size_product(const Policy&, const Functor&) { /** @@ -409,15 +394,7 @@ class ParallelFor, m_shmem_size(arg_policy.scratch_size(0) + arg_policy.scratch_size(1) + FunctorTeamShmemSize::value( arg_functor, arg_policy.team_size())) { -#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_3 - if (t_openmp_instance) { - m_instance = t_openmp_instance; - } else { - m_instance = arg_policy.space().impl_internal_space_instance(); - } -#else m_instance = arg_policy.space().impl_internal_space_instance(); -#endif } }; diff --git a/core/src/OpenMP/Kokkos_OpenMP_Parallel_Reduce.hpp 
b/core/src/OpenMP/Kokkos_OpenMP_Parallel_Reduce.hpp index 52cdef18e65..05fd1c9dce3 100644 --- a/core/src/OpenMP/Kokkos_OpenMP_Parallel_Reduce.hpp +++ b/core/src/OpenMP/Kokkos_OpenMP_Parallel_Reduce.hpp @@ -170,15 +170,7 @@ class ParallelReduce, m_functor_reducer(arg_functor_reducer), m_policy(arg_policy), m_result_ptr(arg_view.data()) { -#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_3 - if (t_openmp_instance) { - m_instance = t_openmp_instance; - } else { - m_instance = arg_policy.space().impl_internal_space_instance(); - } -#else m_instance = arg_policy.space().impl_internal_space_instance(); -#endif static_assert( Kokkos::Impl::MemorySpaceAccess::accessible, @@ -319,15 +311,7 @@ class ParallelReduce::accessible, @@ -543,15 +527,7 @@ class ParallelReduce::value( arg_functor_reducer.get_functor(), arg_policy.team_size())) { -#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_3 - if (t_openmp_instance) { - m_instance = t_openmp_instance; - } else { - m_instance = arg_policy.space().impl_internal_space_instance(); - } -#else m_instance = arg_policy.space().impl_internal_space_instance(); -#endif static_assert( Kokkos::Impl::MemorySpaceAccess, inline ParallelScan(const FunctorType& arg_functor, const Policy& arg_policy) : m_instance(nullptr), m_functor(arg_functor), m_policy(arg_policy) { -#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_3 - if (t_openmp_instance) { - m_instance = t_openmp_instance; - } else { - m_instance = arg_policy.space().impl_internal_space_instance(); - } -#else m_instance = arg_policy.space().impl_internal_space_instance(); -#endif } }; @@ -292,15 +284,7 @@ class ParallelScanWithTotal, Kokkos::Impl::MemorySpaceAccess::accessible, "Kokkos::OpenMP parallel_scan result must be host-accessible!"); -#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_3 - if (t_openmp_instance) { - m_instance = t_openmp_instance; - } else { - m_instance = arg_policy.space().impl_internal_space_instance(); - } -#else m_instance = arg_policy.space().impl_internal_space_instance(); -#endif } //---------------------------------------- diff --git a/core/src/OpenMPTarget/Kokkos_OpenMPTarget.hpp b/core/src/OpenMPTarget/Kokkos_OpenMPTarget.hpp index adf972dd081..ea4e7f6baba 100644 --- a/core/src/OpenMPTarget/Kokkos_OpenMPTarget.hpp +++ b/core/src/OpenMPTarget/Kokkos_OpenMPTarget.hpp @@ -65,7 +65,11 @@ class OpenMPTarget { using scratch_memory_space = ScratchMemorySpace; - inline static bool in_parallel() { return omp_in_parallel(); } +#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 + KOKKOS_DEPRECATED inline static bool in_parallel() { + return omp_in_parallel(); + } +#endif static void fence(const std::string& name = "Kokkos::OpenMPTarget::fence: Unnamed Instance Fence"); diff --git a/core/src/OpenMPTarget/Kokkos_OpenMPTargetSpace.cpp b/core/src/OpenMPTarget/Kokkos_OpenMPTargetSpace.cpp index 81fbc56de00..a414b34d7c6 100644 --- a/core/src/OpenMPTarget/Kokkos_OpenMPTargetSpace.cpp +++ b/core/src/OpenMPTarget/Kokkos_OpenMPTargetSpace.cpp @@ -37,7 +37,6 @@ #include #include #include -#include //---------------------------------------------------------------------------- //---------------------------------------------------------------------------- @@ -110,79 +109,13 @@ void OpenMPTargetSpace::deallocate(const char* arg_label, } // namespace Experimental } // namespace Kokkos -//---------------------------------------------------------------------------- -//---------------------------------------------------------------------------- - -namespace Kokkos { -namespace Impl { - -#ifdef KOKKOS_ENABLE_DEBUG -SharedAllocationRecord SharedAllocationRecord< - 
Kokkos::Experimental::OpenMPTargetSpace, void>::s_root_record; -#endif - -SharedAllocationRecord::~SharedAllocationRecord() { - auto alloc_size = SharedAllocationRecord::m_alloc_size; - m_space.deallocate(m_label.c_str(), - SharedAllocationRecord::m_alloc_ptr, - alloc_size, (alloc_size - sizeof(SharedAllocationHeader))); -} - -SharedAllocationRecord:: - SharedAllocationRecord( - const Kokkos::Experimental::OpenMPTargetSpace& arg_space, - const std::string& arg_label, const size_t arg_alloc_size, - const SharedAllocationRecord::function_type arg_dealloc) - // Pass through allocated [ SharedAllocationHeader , user_memory ] - // Pass through deallocation function - : base_t( -#ifdef KOKKOS_ENABLE_DEBUG - &SharedAllocationRecord::s_root_record, -#endif - Kokkos::Impl::checked_allocation_with_header(arg_space, arg_label, - arg_alloc_size), - sizeof(SharedAllocationHeader) + arg_alloc_size, arg_dealloc, - arg_label), - m_space(arg_space) { - SharedAllocationHeader header; - - this->base_t::_fill_host_accessible_header_info(header, arg_label); - - // TODO DeepCopy - // DeepCopy - Kokkos::Impl::DeepCopy( - RecordBase::m_alloc_ptr, &header, sizeof(SharedAllocationHeader)); - Kokkos::fence( - "SharedAllocationRecord::SharedAllocationRecord(): fence after copying header from " - "HostSpace"); -} - -//---------------------------------------------------------------------------- - -} // namespace Impl -} // namespace Kokkos - //============================================================================== // {{{1 #include -namespace Kokkos { -namespace Impl { - -// To avoid additional compilation cost for something that's (mostly?) not -// performance sensitive, we explicity instantiate these CRTP base classes here, -// where we have access to the associated *_timpl.hpp header files. 
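// [Editorial aside] The constructor deleted above shows the pattern the
// shared macros centralize for host-inaccessible spaces: the header cannot
// be written in place on the device, so it is staged in a host-side
// SharedAllocationHeader and deep-copied to the front of the allocation.
// Schematic of the resulting layout (names are illustrative, not Kokkos's):

#include <cstddef>

struct ExampleHeader {
  void* record;    // back-pointer to the owning allocation record
  char label[64];  // filled on the host, then copied into device memory
};

// User data begins just past the header, which is how get_record() can
// recover the record from a raw pointer:
inline void* example_payload(ExampleHeader* h) { return h + 1; }
inline ExampleHeader* example_header(void* payload) {
  return static_cast<ExampleHeader*>(payload) - 1;
}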
-template class HostInaccessibleSharedAllocationRecordCommon< - Kokkos::Experimental::OpenMPTargetSpace>; -template class SharedAllocationRecordCommon< - Kokkos::Experimental::OpenMPTargetSpace>; - -} // end namespace Impl -} // end namespace Kokkos +KOKKOS_IMPL_HOST_INACCESSIBLE_SHARED_ALLOCATION_RECORD_EXPLICIT_INSTANTIATION( + Kokkos::Experimental::OpenMPTargetSpace); // end Explicit instantiations of CRTP Base classes }}}1 //============================================================================== diff --git a/core/src/OpenMPTarget/Kokkos_OpenMPTargetSpace.hpp b/core/src/OpenMPTarget/Kokkos_OpenMPTargetSpace.hpp index e5b33d0982f..ed625cfcc82 100644 --- a/core/src/OpenMPTarget/Kokkos_OpenMPTargetSpace.hpp +++ b/core/src/OpenMPTarget/Kokkos_OpenMPTargetSpace.hpp @@ -98,6 +98,16 @@ class OpenMPTargetSpace { ~OpenMPTargetSpace() = default; /**\brief Allocate untracked memory in the space */ + // FIXME_OPENMPTARGET Use execution space instance + void* allocate(const OpenMPTarget&, const size_t arg_alloc_size) const { + return allocate(arg_alloc_size); + } + // FIXME_OPENMPTARGET Use execution space instance + void* allocate(const OpenMPTarget&, const char* arg_label, + const size_t arg_alloc_size, + const size_t arg_logical_size = 0) const { + return allocate(arg_label, arg_alloc_size, arg_logical_size); + } void* allocate(const size_t arg_alloc_size) const; void* allocate(const char* arg_label, const size_t arg_alloc_size, const size_t arg_logical_size = 0) const; @@ -121,9 +131,6 @@ class OpenMPTargetSpace { const size_t arg_logical_size = 0, const Kokkos::Tools::SpaceHandle = Kokkos::Tools::make_space_handle(name())) const; - - friend class Kokkos::Impl::SharedAllocationRecord< - Kokkos::Experimental::OpenMPTargetSpace, void>; }; } // namespace Experimental } // namespace Kokkos @@ -131,64 +138,8 @@ class OpenMPTargetSpace { //---------------------------------------------------------------------------- //---------------------------------------------------------------------------- -namespace Kokkos { -namespace Impl { - -template <> -class SharedAllocationRecord - : public HostInaccessibleSharedAllocationRecordCommon< - Kokkos::Experimental::OpenMPTargetSpace> { - private: - friend class HostInaccessibleSharedAllocationRecordCommon< - Kokkos::Experimental::OpenMPTargetSpace>; - friend class SharedAllocationRecordCommon< - Kokkos::Experimental::OpenMPTargetSpace>; - friend Kokkos::Experimental::OpenMPTargetSpace; - - using base_t = HostInaccessibleSharedAllocationRecordCommon< - Kokkos::Experimental::OpenMPTargetSpace>; - using RecordBase = SharedAllocationRecord; - - SharedAllocationRecord(const SharedAllocationRecord&) = delete; - SharedAllocationRecord& operator=(const SharedAllocationRecord&) = delete; - - /**\brief Root record for tracked allocations from this OpenMPTargetSpace - * instance */ - static RecordBase s_root_record; - - const Kokkos::Experimental::OpenMPTargetSpace m_space; - - protected: - ~SharedAllocationRecord(); - SharedAllocationRecord() = default; - - template - SharedAllocationRecord( - const ExecutionSpace& /*exec_space*/, - const Kokkos::Experimental::OpenMPTargetSpace& arg_space, - const std::string& arg_label, const size_t arg_alloc_size, - const RecordBase::function_type arg_dealloc = &deallocate) - : SharedAllocationRecord(arg_space, arg_label, arg_alloc_size, - arg_dealloc) {} - - SharedAllocationRecord( - const Kokkos::Experimental::OpenMPTargetSpace& arg_space, - const std::string& arg_label, const size_t arg_alloc_size, - const 
RecordBase::function_type arg_dealloc = &deallocate); - - public: - KOKKOS_INLINE_FUNCTION static SharedAllocationRecord* allocate( - const Kokkos::Experimental::OpenMPTargetSpace& arg_space, - const std::string& arg_label, const size_t arg_alloc) { - KOKKOS_IF_ON_HOST( - (return new SharedAllocationRecord(arg_space, arg_label, arg_alloc);)) - KOKKOS_IF_ON_DEVICE( - ((void)arg_space; (void)arg_label; (void)arg_alloc; return nullptr;)) - } -}; - -} // namespace Impl -} // namespace Kokkos +KOKKOS_IMPL_HOST_INACCESSIBLE_SHARED_ALLOCATION_SPECIALIZATION( + Kokkos::Experimental::OpenMPTargetSpace); //---------------------------------------------------------------------------- //---------------------------------------------------------------------------- diff --git a/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Exec.cpp b/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Exec.cpp index 1902c38409a..b39f5aca353 100644 --- a/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Exec.cpp +++ b/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Exec.cpp @@ -75,6 +75,7 @@ int* OpenMPTargetExec::m_lock_array = nullptr; uint64_t OpenMPTargetExec::m_lock_size = 0; uint32_t* OpenMPTargetExec::m_uniquetoken_ptr = nullptr; int OpenMPTargetExec::MAX_ACTIVE_THREADS = 0; +std::mutex OpenMPTargetExec::m_mutex_scratch_ptr; void OpenMPTargetExec::clear_scratch() { Kokkos::Experimental::OpenMPTargetSpace space; @@ -98,6 +99,11 @@ void OpenMPTargetExec::resize_scratch(int64_t team_size, int64_t shmem_size_L0, int64_t shmem_size_L1, int64_t league_size) { Kokkos::Experimental::OpenMPTargetSpace space; + // Level-0 scratch when using clang/17 and higher comes from their OpenMP + // extension, `ompx_dyn_cgroup_mem`. +#if defined(KOKKOS_IMPL_OPENMPTARGET_LLVM_EXTENSIONS) + shmem_size_L0 = 0; +#endif const int64_t shmem_size = shmem_size_L0 + shmem_size_L1; // L0 + L1 scratch memory per team. const int64_t padding = shmem_size * 10 / 100; // Padding per team. diff --git a/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Instance.cpp b/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Instance.cpp index 9e8844a6f20..3387108da39 100644 --- a/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Instance.cpp +++ b/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Instance.cpp @@ -178,8 +178,10 @@ void OpenMPTarget::impl_static_fence(const std::string& name) { } void OpenMPTarget::impl_initialize(InitializationSettings const& settings) { + using Kokkos::Impl::get_visible_devices; + std::vector const& visible_devices = get_visible_devices(); using Kokkos::Impl::get_gpu; - const int device_num = get_gpu(settings); + const int device_num = get_gpu(settings).value_or(visible_devices[0]); omp_set_default_device(device_num); Impl::OpenMPTargetInternal::impl_singleton()->impl_initialize(); diff --git a/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Macros.hpp b/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Macros.hpp new file mode 100644 index 00000000000..2bd672f4d06 --- /dev/null +++ b/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Macros.hpp @@ -0,0 +1,46 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//@HEADER
+
+#ifndef KOKKOS_OPENMPTARGET_MACROS_HPP
+#define KOKKOS_OPENMPTARGET_MACROS_HPP
+
+// Intel architectures prefer the classical hierarchical parallelism that relies
+// on OpenMP.
+#if defined(KOKKOS_ARCH_INTEL_GPU)
+#define KOKKOS_IMPL_OPENMPTARGET_HIERARCHICAL_INTEL_GPU
+#endif
+
+// Define a macro for the LLVM compiler at version 17 or newer on NVIDIA and
+// AMD GPUs. This is useful in cases where LLVM extensions beyond the OpenMP
+// standard can be used.
+#if defined(KOKKOS_COMPILER_CLANG) && (KOKKOS_COMPILER_CLANG >= 1700) && \
+    (defined(KOKKOS_ARCH_AMD_GPU) || defined(KOKKOS_IMPL_ARCH_NVIDIA_GPU))
+#define KOKKOS_IMPL_OPENMPTARGET_LLVM_EXTENSIONS
+#endif
+
+#define KOKKOS_IMPL_OPENMPTARGET_PRAGMA_HELPER(x) _Pragma(#x)
+#define KOKKOS_IMPL_OMPTARGET_PRAGMA(x) \
+  KOKKOS_IMPL_OPENMPTARGET_PRAGMA_HELPER(omp target x)
+
+// Use scratch memory extensions to request dynamic shared memory for the
+// right compiler/architecture combination.
+#ifdef KOKKOS_IMPL_OPENMPTARGET_LLVM_EXTENSIONS
+#define KOKKOS_IMPL_OMPX_DYN_CGROUP_MEM(N) ompx_dyn_cgroup_mem(N)
+#else
+#define KOKKOS_IMPL_OMPX_DYN_CGROUP_MEM(N)
+#endif
+
+#endif  // KOKKOS_OPENMPTARGET_MACROS_HPP
diff --git a/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Parallel.hpp b/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Parallel.hpp
index 9767d8e53ef..dcc509d2faf 100644
--- a/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Parallel.hpp
+++ b/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Parallel.hpp
@@ -21,16 +21,10 @@
 #include
 #include
 #include
-#include
 #include
 #include "Kokkos_OpenMPTarget_Abort.hpp"
-
-// Intel architectures prefer the classical hierarchical parallelism that relies
-// on OpenMP.
-#if defined(KOKKOS_ARCH_INTEL_GPU)
-#define KOKKOS_IMPL_OPENMPTARGET_HIERARCHICAL_INTEL_GPU
-#endif
+#include

 //----------------------------------------------------------------------------
 //----------------------------------------------------------------------------
@@ -113,14 +107,20 @@ class OpenMPTargetExecTeamMember {
     team_broadcast(value, thread_id);
   }

-  // FIXME_OPENMPTARGET this function has the wrong interface and currently
-  // ignores the reducer passed.
-  template <typename ValueType, typename JoinOp>
-  KOKKOS_INLINE_FUNCTION ValueType team_reduce(const ValueType& value,
-                                               const JoinOp&) const {
+  template <typename ReducerType>
+  KOKKOS_INLINE_FUNCTION std::enable_if_t<is_reducer<ReducerType>::value>
+  team_reduce(ReducerType const& reducer) const noexcept {
+    team_reduce(reducer, reducer.reference());
+  }
+
+  // FIXME_OPENMPTARGET this function currently ignores the reducer passed.
+  template <typename ReducerType>
+  KOKKOS_INLINE_FUNCTION std::enable_if_t<is_reducer<ReducerType>::value>
+  team_reduce(ReducerType const&, typename ReducerType::value_type& value) const
+      noexcept {
 #pragma omp barrier
-    using value_type = ValueType;
+    using value_type = typename ReducerType::value_type;
     // const JoinLambdaAdapter op(op_in);

     // Make sure there is enough scratch space:
@@ -149,8 +149,9 @@ class OpenMPTargetExecTeamMember {
       }
 #pragma omp barrier
     }
-    return team_scratch[0];
+    value = team_scratch[0];
   }
+
   /** \brief Intra-team exclusive prefix sum with team_rank() ordering
    *   with intra-team non-deterministic ordering accumulation.
    *
@@ -249,15 +250,37 @@ class OpenMPTargetExecTeamMember {
     // and L1 shmem size. TEAM_REDUCE_SIZE = 512 bytes saved per team for
     // hierarchical reduction. There is an additional 10% of the requested
     // scratch memory allocated per team as padding. Hence the product with 0.1.
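The comment above describes the per-team partitioning of the global scratch buffer; the hunk that follows computes the matching offsets. A toy model of that arithmetic, with hypothetical names, and integer division standing in for the `* 0.1` padding in the real code:

    #include <cstdint>

    // Each team's slice of the HBM buffer is laid out as
    //   [ TEAM_REDUCE_SIZE | L0 scratch | L1 scratch ] plus ~10% padding;
    // when the LLVM extension supplies L0 on-chip, the L0 part drops out.
    constexpr std::int64_t team_reduce_size = 512;

    std::int64_t team_slice_begin(std::int64_t shmem_L0, std::int64_t shmem_L1,
                                  std::int64_t team, bool l0_from_llvm_extension) {
      const std::int64_t shmem =
          l0_from_llvm_extension ? shmem_L1 : shmem_L0 + shmem_L1;
      const std::int64_t total_shmem = shmem + shmem / 10;  // + padding
      return team * (total_shmem + team_reduce_size);
    }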
+  //
+  // Use llvm extensions for dynamic shared memory with compiler/architecture
+  // combinations where they are supported.
+  //
+  // The size allocated in HBM now changes based on whether we use llvm
+  // extensions.
+#if defined(KOKKOS_IMPL_OPENMPTARGET_LLVM_EXTENSIONS)
+    const int total_shmem = shmem_size_L1 + shmem_size_L1 * 0.1;
+#else
+    const int total_shmem =
+        shmem_size_L0 + shmem_size_L1 + (shmem_size_L0 + shmem_size_L1) * 0.1;
+#endif
+
+    // Per team offset for buffer in HBM.
     const int reduce_offset =
-        m_shmem_block_index *
-        (shmem_size_L0 + shmem_size_L1 +
-         ((shmem_size_L0 + shmem_size_L1) * 0.1) + TEAM_REDUCE_SIZE);
+        m_shmem_block_index * (total_shmem + TEAM_REDUCE_SIZE);
+
+#if defined(KOKKOS_IMPL_OPENMPTARGET_LLVM_EXTENSIONS)
+    const int l1_offset = reduce_offset + TEAM_REDUCE_SIZE;
+    char* l0_scratch =
+        static_cast<char*>(llvm_omp_target_dynamic_shared_alloc());
+    m_team_shared = scratch_memory_space(
+        l0_scratch, shmem_size_L0, static_cast<char*>(glb_scratch) + l1_offset,
+        shmem_size_L1);
+#else
     const int l0_offset = reduce_offset + TEAM_REDUCE_SIZE;
     const int l1_offset = l0_offset + shmem_size_L0;
     m_team_shared = scratch_memory_space(
         (static_cast<char*>(glb_scratch) + l0_offset), shmem_size_L0,
         static_cast<char*>(glb_scratch) + l1_offset, shmem_size_L1);
+#endif
     m_reduce_scratch = static_cast<char*>(glb_scratch) + reduce_offset;
     m_league_rank = league_rank;
     m_team_rank = omp_tid;
@@ -751,6 +774,7 @@ class OpenMPTargetExec {
                               int64_t thread_local_bytes, int64_t league_size);

   static void* m_scratch_ptr;
+  static std::mutex m_mutex_scratch_ptr;
   static int64_t m_scratch_size;
   static int* m_lock_array;
   static uint64_t m_lock_size;
diff --git a/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelFor_Team.hpp b/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelFor_Team.hpp
index 1abc925caed..26085f11400 100644
--- a/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelFor_Team.hpp
+++ b/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelFor_Team.hpp
@@ -19,6 +19,7 @@
 #include
 #include
+#include
 #include
 #include
@@ -140,8 +141,10 @@ class ParallelFor,
   // guarantees that the number of teams specified in the `num_teams` clause is
   // always less than or equal to the maximum concurrently running teams.
 #if !defined(KOKKOS_IMPL_OPENMPTARGET_HIERARCHICAL_INTEL_GPU)
-#pragma omp target teams thread_limit(team_size) firstprivate(a_functor) \
-    num_teams(max_active_teams) is_device_ptr(scratch_ptr)
+  KOKKOS_IMPL_OMPTARGET_PRAGMA(
+      teams thread_limit(team_size) firstprivate(a_functor)
+          num_teams(max_active_teams) is_device_ptr(scratch_ptr)
+              KOKKOS_IMPL_OMPX_DYN_CGROUP_MEM(shmem_size_L0))
 #pragma omp parallel
   {
     if (omp_get_num_teams() > max_active_teams)
diff --git a/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelReduce_Range.hpp b/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelReduce_Range.hpp
index 4452af3846d..caa568a8925 100644
--- a/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelReduce_Range.hpp
+++ b/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelReduce_Range.hpp
@@ -55,6 +55,9 @@ class ParallelReduce,
   const pointer_type m_result_ptr;
   bool m_result_ptr_on_device;
   const int m_result_ptr_num_elems;
+  // Only let one ParallelReduce instance at a time use the scratch memory.
+  // The constructor acquires the mutex which is released in the destructor.
+ std::scoped_lock m_scratch_memory_lock; using TagType = typename Policy::work_tag; public: @@ -105,7 +108,8 @@ class ParallelReduce, m_result_ptr_on_device( MemorySpaceAccess::accessible), - m_result_ptr_num_elems(arg_result_view.size()) {} + m_result_ptr_num_elems(arg_result_view.size()), + m_scratch_memory_lock(OpenMPTargetExec::m_mutex_scratch_ptr) {} }; } // namespace Impl diff --git a/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelReduce_Team.hpp b/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelReduce_Team.hpp index a302fa71511..8abffa47a43 100644 --- a/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelReduce_Team.hpp +++ b/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelReduce_Team.hpp @@ -470,6 +470,10 @@ class ParallelReduce m_scratch_memory_lock; + public: void execute() const { const FunctorType& functor = m_functor_reducer.get_functor(); @@ -517,7 +521,8 @@ class ParallelReduce::value( - arg_functor_reducer.get_functor(), arg_policy.team_size())) {} + arg_functor_reducer.get_functor(), arg_policy.team_size())), + m_scratch_memory_lock(OpenMPTargetExec::m_mutex_scratch_ptr) {} }; } // namespace Impl diff --git a/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelScan_Range.hpp b/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelScan_Range.hpp index 1d6677a1df6..c1f7851f413 100644 --- a/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelScan_Range.hpp +++ b/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelScan_Range.hpp @@ -238,8 +238,10 @@ class ParallelScanWithTotal, if (!base_t::m_result_ptr_device_accessible) { const int size = base_t::m_functor_reducer.get_reducer().value_size(); - DeepCopy( - base_t::m_result_ptr, chunk_values.data() + (n_chunks - 1), size); + DeepCopy( + base_t::m_policy.space(), base_t::m_result_ptr, + chunk_values.data() + (n_chunks - 1), size); } } else if (!base_t::m_result_ptr_device_accessible) { base_t::m_functor_reducer.get_reducer().init(base_t::m_result_ptr); diff --git a/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Parallel_Common.hpp b/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Parallel_Common.hpp index fb75f05f270..eb3dc3773c4 100644 --- a/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Parallel_Common.hpp +++ b/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Parallel_Common.hpp @@ -21,6 +21,7 @@ #include #include #include +#include namespace Kokkos { namespace Impl { @@ -394,9 +395,11 @@ struct ParallelReduceSpecialize, initializer(OpenMPTargetReducerWrapper ::init(omp_priv)) #if !defined(KOKKOS_IMPL_OPENMPTARGET_HIERARCHICAL_INTEL_GPU) -#pragma omp target teams num_teams(max_active_teams) thread_limit(team_size) \ - firstprivate(f) is_device_ptr(scratch_ptr) reduction(custom \ - : result) + KOKKOS_IMPL_OMPTARGET_PRAGMA( + teams num_teams(max_active_teams) thread_limit(team_size) + firstprivate(f) is_device_ptr(scratch_ptr) reduction(custom + : result) + KOKKOS_IMPL_OMPX_DYN_CGROUP_MEM(shmem_size_L0)) #pragma omp parallel reduction(custom : result) { if (omp_get_num_teams() > max_active_teams) @@ -482,9 +485,11 @@ struct ParallelReduceSpecialize, // Case where reduction is on a native data type. if constexpr (std::is_arithmetic::value) { -#pragma omp target teams num_teams(max_active_teams) thread_limit(team_size) map(to \ - : f) \ - is_device_ptr(scratch_ptr) reduction(+: result) + // Use scratch memory extensions to request dynamic shared memory for + // the right compiler/architecture combination. 
+ KOKKOS_IMPL_OMPTARGET_PRAGMA(teams num_teams(max_active_teams) thread_limit(team_size) map(to: f) \ + is_device_ptr(scratch_ptr) reduction(+: result) \ + KOKKOS_IMPL_OMPX_DYN_CGROUP_MEM(shmem_size_L0)) #pragma omp parallel reduction(+ : result) { if (omp_get_num_teams() > max_active_teams) @@ -636,11 +641,13 @@ struct ParallelReduceSpecialize, return; } - -#pragma omp target teams num_teams(nteams) thread_limit(team_size) map(to \ - : f) \ - is_device_ptr(scratch_ptr) - { + // Use scratch memory extensions to request dynamic shared memory for the + // right compiler/architecture combination. + KOKKOS_IMPL_OMPTARGET_PRAGMA( + teams num_teams(nteams) thread_limit(team_size) map(to + : f) + is_device_ptr(scratch_ptr) + KOKKOS_IMPL_OMPX_DYN_CGROUP_MEM(shmem_size_L0)) { #pragma omp parallel { const int team_num = omp_get_team_num(); @@ -665,9 +672,8 @@ struct ParallelReduceSpecialize, int tree_neighbor_offset = 1; do { -#pragma omp target teams distribute parallel for simd map(to \ - : final_reducer) \ - is_device_ptr(scratch_ptr) +#pragma omp target teams distribute parallel for simd firstprivate( \ + final_reducer) is_device_ptr(scratch_ptr) for (int i = 0; i < nteams - tree_neighbor_offset; i += 2 * tree_neighbor_offset) { ValueType* team_scratch = static_cast(scratch_ptr); diff --git a/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Parallel_MDRange.hpp b/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Parallel_MDRange.hpp index 41e62ce6e6b..6878531730d 100644 --- a/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Parallel_MDRange.hpp +++ b/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Parallel_MDRange.hpp @@ -438,6 +438,10 @@ class ParallelReduce m_scratch_memory_lock; + public: inline void execute() const { execute_tile( @@ -452,7 +456,8 @@ class ParallelReduce::accessible) {} + typename ViewType::memory_space>::accessible), + m_scratch_memory_lock(OpenMPTargetExec::m_mutex_scratch_ptr) {} template inline std::enable_if_t execute_tile(const FunctorType& functor, diff --git a/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Reducer.hpp b/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Reducer.hpp index 672271ed6b9..9b578aca112 100644 --- a/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Reducer.hpp +++ b/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Reducer.hpp @@ -18,7 +18,6 @@ #define KOKKOS_OPENMPTARGETREDUCER_HPP #include -#include #include #include "Kokkos_OpenMPTarget_Abort.hpp" diff --git a/core/src/SYCL/Kokkos_SYCL.cpp b/core/src/SYCL/Kokkos_SYCL.cpp index 7fa935f693a..9a246f7642f 100644 --- a/core/src/SYCL/Kokkos_SYCL.cpp +++ b/core/src/SYCL/Kokkos_SYCL.cpp @@ -88,26 +88,57 @@ bool SYCL::impl_is_initialized() { void SYCL::impl_finalize() { Impl::SYCLInternal::singleton().finalize(); } void SYCL::print_configuration(std::ostream& os, bool verbose) const { - os << "Devices:\n"; - os << " KOKKOS_ENABLE_SYCL: yes\n"; - os << "\nRuntime Configuration:\n"; - os << "macro KOKKOS_ENABLE_SYCL : defined\n"; +#ifdef KOKKOS_ENABLE_ONEDPL + os << "macro KOKKOS_ENABLE_ONEDPL : defined\n"; +#else + os << "macro KOKKOS_ENABLE_ONEDPL : undefined\n"; +#endif #ifdef KOKKOS_IMPL_SYCL_DEVICE_GLOBAL_SUPPORTED os << "macro KOKKOS_IMPL_SYCL_DEVICE_GLOBAL_SUPPORTED : defined\n"; #else os << "macro KOKKOS_IMPL_SYCL_DEVICE_GLOBAL_SUPPORTED : undefined\n"; #endif - +#ifdef SYCL_EXT_ONEAPI_DEVICE_GLOBAL + os << "macro SYCL_EXT_ONEAPI_DEVICE_GLOBAL : defined\n"; +#else + os << "macro SYCL_EXT_ONEAPI_DEVICE_GLOBAL : undefined\n"; +#endif #ifdef KOKKOS_IMPL_SYCL_USE_IN_ORDER_QUEUES os << "macro KOKKOS_IMPL_SYCL_USE_IN_ORDER_QUEUES : defined\n"; #else os 
<< "macro KOKKOS_IMPL_SYCL_USE_IN_ORDER_QUEUES : undefined\n"; #endif - if (verbose) + int counter = 0; + int active_device = Kokkos::device_id(); + std::cout << "\nAvailable devices: \n"; + std::vector devices = Impl::get_sycl_devices(); + for (const auto& device : devices) { + std::string device_type; + switch (device.get_info()) { + case sycl::info::device_type::cpu: device_type = "cpu"; break; + case sycl::info::device_type::gpu: device_type = "gpu"; break; + case sycl::info::device_type::accelerator: + device_type = "accelerator"; + break; + case sycl::info::device_type::custom: device_type = "custom"; break; + case sycl::info::device_type::automatic: device_type = "automatic"; break; + case sycl::info::device_type::host: device_type = "host"; break; + case sycl::info::device_type::all: device_type = "all"; break; + } + os << "[" << device.get_backend() << "]:" << device_type << ':' << counter + << "] " << device.get_info(); + if (counter == active_device) os << " : Selected"; + os << '\n'; + ++counter; + } + + if (verbose) { + os << '\n'; SYCL::impl_sycl_info(os, m_space_instance->m_queue->get_device()); + } } void SYCL::fence(const std::string& name) const { @@ -137,20 +168,11 @@ void SYCL::impl_static_fence(const std::string& name) { } void SYCL::impl_initialize(InitializationSettings const& settings) { - std::vector gpu_devices = - sycl::device::get_devices(sycl::info::device_type::gpu); - // If the device id is not specified and there are no GPUs, sidestep Kokkos - // device selection and use whatever is available (if no GPU architecture is - // specified). -#if !defined(KOKKOS_ARCH_INTEL_GPU) && !defined(KOKKOS_IMPL_ARCH_NVIDIA_GPU) - if (!settings.has_device_id() && gpu_devices.empty()) { - Impl::SYCLInternal::singleton().initialize(sycl::device()); - Impl::SYCLInternal::m_syclDev = 0; - return; - } -#endif - const auto id = ::Kokkos::Impl::get_gpu(settings); - Impl::SYCLInternal::singleton().initialize(gpu_devices[id]); + const auto& visible_devices = ::Kokkos::Impl::get_visible_devices(); + const auto id = + ::Kokkos::Impl::get_gpu(settings).value_or(visible_devices[0]); + std::vector sycl_devices = Impl::get_sycl_devices(); + Impl::SYCLInternal::singleton().initialize(sycl_devices[id]); Impl::SYCLInternal::m_syclDev = id; } @@ -243,9 +265,32 @@ std::ostream& SYCL::impl_sycl_info(std::ostream& os, namespace Impl { +std::vector get_sycl_devices() { +#if defined(KOKKOS_ARCH_INTEL_GPU) || defined(KOKKOS_IMPL_ARCH_NVIDIA_GPU) || \ + defined(KOKKOS_ARCH_AMD_GPU) + std::vector devices = + sycl::device::get_devices(sycl::info::device_type::gpu); +#if defined(KOKKOS_ARCH_INTEL_GPU) + sycl::backend backend = sycl::backend::ext_oneapi_level_zero; +#elif defined(KOKKOS_IMPL_ARCH_NVIDIA_GPU) + sycl::backend backend = sycl::backend::ext_oneapi_cuda; +#elif defined(KOKKOS_ARCH_AMD_GPU) + sycl::backend backend = sycl::backend::ext_oneapi_hip; +#endif + devices.erase(std::remove_if(devices.begin(), devices.end(), + [backend](const sycl::device& d) { + return d.get_backend() != backend; + }), + devices.end()); +#else + std::vector devices = sycl::device::get_devices(); +#endif + return devices; +} + int g_sycl_space_factory_initialized = Kokkos::Impl::initialize_space_factory("170_SYCL"); -} +} // namespace Impl } // namespace Experimental } // namespace Kokkos diff --git a/core/src/SYCL/Kokkos_SYCL.hpp b/core/src/SYCL/Kokkos_SYCL.hpp index be6b4b89302..0f3d1f0994d 100644 --- a/core/src/SYCL/Kokkos_SYCL.hpp +++ b/core/src/SYCL/Kokkos_SYCL.hpp @@ -78,19 +78,15 @@ class SYCL { //! 
\name Functions that all Kokkos devices must implement. //@{ - KOKKOS_INLINE_FUNCTION static int in_parallel() { +#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 + KOKKOS_DEPRECATED KOKKOS_INLINE_FUNCTION static int in_parallel() { #if defined(__SYCL_DEVICE_ONLY__) return true; #else return false; #endif } - - /** \brief Set the device in a "sleep" state. */ - static bool sleep(); - - /** \brief Wake the device from the 'sleep' state. A noop for OpenMP. */ - static bool wake(); +#endif /** \brief Wait until all dispatched functors complete. A noop for OpenMP. */ static void impl_static_fence(const std::string& name); @@ -188,6 +184,10 @@ std::vector partition_space(const SYCL& sycl_space, sycl::queue(context, device, sycl::property::queue::in_order())); return instances; } + +namespace Impl { +std::vector get_sycl_devices(); +} // namespace Impl } // namespace Experimental } // namespace Kokkos diff --git a/core/src/SYCL/Kokkos_SYCL_Instance.cpp b/core/src/SYCL/Kokkos_SYCL_Instance.cpp index 080369770d7..0e67adb5787 100644 --- a/core/src/SYCL/Kokkos_SYCL_Instance.cpp +++ b/core/src/SYCL/Kokkos_SYCL_Instance.cpp @@ -54,7 +54,7 @@ Kokkos::View sycl_global_unique_token_locks( } SYCLInternal::~SYCLInternal() { - if (!was_finalized || m_scratchSpace || m_scratchFlags) { + if (!was_finalized || m_scratchSpace || m_scratchHost || m_scratchFlags) { std::cerr << "Kokkos::Experimental::SYCL ERROR: Failed to call " "Kokkos::Experimental::SYCL::finalize()" << std::endl; @@ -102,6 +102,23 @@ void SYCLInternal::initialize(const sycl::device& d) { void SYCLInternal::initialize(const sycl::queue& q) { KOKKOS_EXPECTS(!is_initialized()); +#define KOKKOS_IMPL_CHECK_SYCL_BACKEND_SUPPORT(BACKEND, REQUIRED) \ + if (BACKEND != REQUIRED) \ + Kokkos::abort( \ + "The SYCL execution space instance was initialized with an " \ + "unsupported backend type! 
For this GPU architecture, only " #REQUIRED \ + " is supported.") +#if defined(KOKKOS_ARCH_INTEL_GPU) + KOKKOS_IMPL_CHECK_SYCL_BACKEND_SUPPORT(q.get_backend(), + sycl::backend::ext_oneapi_level_zero); +#elif defined(KOKKOS_IMPL_ARCH_NVIDIA_GPU) + KOKKOS_IMPL_CHECK_SYCL_BACKEND_SUPPORT(q.get_backend(), + sycl::backend::ext_oneapi_cuda); +#elif defined(KOKKOS_ARCH_AMD_GPU) + KOKKOS_IMPL_CHECK_SYCL_BACKEND_SUPPORT(q.get_backend(), + sycl::backend::ext_oneapi_hip); +#endif + if (was_finalized) Kokkos::abort("Calling SYCL::initialize after SYCL::finalize is illegal\n"); @@ -196,14 +213,22 @@ void SYCLInternal::finalize() { #endif } - using RecordSYCL = Kokkos::Impl::SharedAllocationRecord; + auto device_mem_space = SYCLDeviceUSMSpace(*m_queue); + auto host_mem_space = SYCLHostUSMSpace(*m_queue); if (nullptr != m_scratchSpace) - RecordSYCL::decrement(RecordSYCL::get_record(m_scratchSpace)); + device_mem_space.deallocate(m_scratchSpace, + m_scratchSpaceCount * sizeScratchGrain); + if (nullptr != m_scratchHost) + host_mem_space.deallocate(m_scratchHost, + m_scratchHostCount * sizeScratchGrain); if (nullptr != m_scratchFlags) - RecordSYCL::decrement(RecordSYCL::get_record(m_scratchFlags)); + device_mem_space.deallocate(m_scratchFlags, + m_scratchFlagsCount * sizeScratchGrain); m_syclDev = -1; m_scratchSpaceCount = 0; m_scratchSpace = nullptr; + m_scratchHostCount = 0; + m_scratchHost = nullptr; m_scratchFlagsCount = 0; m_scratchFlags = nullptr; @@ -228,54 +253,68 @@ void SYCLInternal::finalize() { sycl::device_ptr SYCLInternal::scratch_space(const std::size_t size) { if (verify_is_initialized("scratch_space") && m_scratchSpaceCount < scratch_count(size)) { - m_scratchSpaceCount = scratch_count(size); - - using Record = Kokkos::Impl::SharedAllocationRecord< - Kokkos::Experimental::SYCLDeviceUSMSpace, void>; + auto mem_space = Kokkos::Experimental::SYCLDeviceUSMSpace(*m_queue); if (nullptr != m_scratchSpace) - Record::decrement(Record::get_record(m_scratchSpace)); + mem_space.deallocate(m_scratchSpace, + m_scratchSpaceCount * sizeScratchGrain); + + m_scratchSpaceCount = scratch_count(size); std::size_t alloc_size = Kokkos::Impl::multiply_overflow_abort( m_scratchSpaceCount, sizeScratchGrain); - Record* const r = Record::allocate( - Kokkos::Experimental::SYCLDeviceUSMSpace(*m_queue), - "Kokkos::Experimental::SYCL::InternalScratchSpace", alloc_size); + m_scratchSpace = static_cast(mem_space.allocate( + "Kokkos::Experimental::SYCL::InternalScratchSpace", alloc_size)); + } + + return m_scratchSpace; +} + +sycl::host_ptr SYCLInternal::scratch_host(const std::size_t size) { + if (verify_is_initialized("scratch_unified") && + m_scratchHostCount < scratch_count(size)) { + auto mem_space = Kokkos::Experimental::SYCLHostUSMSpace(*m_queue); - Record::increment(r); + if (nullptr != m_scratchHost) + mem_space.deallocate(m_scratchHost, + m_scratchHostCount * sizeScratchGrain); - m_scratchSpace = reinterpret_cast(r->data()); + m_scratchHostCount = scratch_count(size); + + std::size_t alloc_size = Kokkos::Impl::multiply_overflow_abort( + m_scratchHostCount, sizeScratchGrain); + m_scratchHost = static_cast(mem_space.allocate( + "Kokkos::Experimental::SYCL::InternalScratchHost", alloc_size)); } - return m_scratchSpace; + return m_scratchHost; } sycl::device_ptr SYCLInternal::scratch_flags(const std::size_t size) { if (verify_is_initialized("scratch_flags") && m_scratchFlagsCount < scratch_count(size)) { - m_scratchFlagsCount = scratch_count(size); - - using Record = Kokkos::Impl::SharedAllocationRecord< - 
Kokkos::Experimental::SYCLDeviceUSMSpace, void>; + auto mem_space = Kokkos::Experimental::SYCLDeviceUSMSpace(*m_queue); if (nullptr != m_scratchFlags) - Record::decrement(Record::get_record(m_scratchFlags)); + mem_space.deallocate(m_scratchFlags, + m_scratchFlagsCount * sizeScratchGrain); + + m_scratchFlagsCount = scratch_count(size); std::size_t alloc_size = Kokkos::Impl::multiply_overflow_abort( m_scratchFlagsCount, sizeScratchGrain); - Record* const r = Record::allocate( - Kokkos::Experimental::SYCLDeviceUSMSpace(*m_queue), - "Kokkos::Experimental::SYCL::InternalScratchFlags", alloc_size); - - Record::increment(r); - - m_scratchFlags = reinterpret_cast(r->data()); - } - auto memset_event = m_queue->memset(m_scratchFlags, 0, - m_scratchFlagsCount * sizeScratchGrain); + m_scratchFlags = static_cast(mem_space.allocate( + "Kokkos::Experimental::SYCL::InternalScratchFlags", alloc_size)); + + // We only zero-initialize the allocation when we actually allocate. + // It's the responsibility of the features using scratch_flags, + // namely parallel_reduce and parallel_scan, to reset the used values to 0. + auto memset_event = m_queue->memset(m_scratchFlags, 0, + m_scratchFlagsCount * sizeScratchGrain); #ifndef KOKKOS_IMPL_SYCL_USE_IN_ORDER_QUEUES - m_queue->ext_oneapi_submit_barrier(std::vector{memset_event}); + m_queue->ext_oneapi_submit_barrier(std::vector{memset_event}); #endif + } return m_scratchFlags; } @@ -318,15 +357,12 @@ size_t SYCLInternal::USMObjectMem::reserve(size_t n) { assert(m_q); if (m_capacity < n) { - using Record = Kokkos::Impl::SharedAllocationRecord; - // First free what we have (in case malloc can reuse it) - if (m_data) Record::decrement(Record::get_record(m_data)); + AllocationSpace alloc_space(*m_q); + if (m_data) alloc_space.deallocate(m_data, m_capacity); - Record* const r = Record::allocate( - AllocationSpace(*m_q), "Kokkos::Experimental::SYCL::USMObjectMem", n); - Record::increment(r); + m_data = + alloc_space.allocate("Kokkos::Experimental::SYCL::USMObjectMem", n); - m_data = r->data(); if constexpr (sycl::usm::alloc::device == Kind) m_staging.reset(new char[n]); m_capacity = n; @@ -340,8 +376,8 @@ void SYCLInternal::USMObjectMem::reset() { if (m_data) { // This implies a fence since this class is not copyable // and deallocating implies a fence across all registered queues. 
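The scratch_space/scratch_host/scratch_flags rewrites above all follow the same grow-only idiom: keep the current buffer until a larger request arrives, then deallocate and reallocate directly through the memory space instead of going through SharedAllocationRecord. A minimal sketch under hypothetical names, where MemorySpace stands in for SYCLDeviceUSMSpace or SYCLHostUSMSpace:

    #include <cstddef>

    template <class MemorySpace>
    void* grow_scratch(const MemorySpace& space, void*& ptr,
                       std::size_t& capacity, std::size_t requested,
                       const char* label) {
      if (requested > capacity) {
        // Free the old buffer first so the allocator can reuse the memory.
        if (ptr) space.deallocate(ptr, capacity);
        ptr      = space.allocate(label, requested);
        capacity = requested;
      }
      return ptr;
    }

Note that with this change scratch_flags is zero-initialized only when a new buffer is actually allocated; as the added comment says, parallel_reduce and parallel_scan become responsible for resetting the used values to 0, which is why the kernels later in this diff gain `if (local_id == 0) *scratch_flags = 0;`.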
- using Record = Kokkos::Impl::SharedAllocationRecord; - Record::decrement(Record::get_record(m_data)); + AllocationSpace alloc_space(*m_q); + alloc_space.deallocate(m_data, m_capacity); m_capacity = 0; m_data = nullptr; diff --git a/core/src/SYCL/Kokkos_SYCL_Instance.hpp b/core/src/SYCL/Kokkos_SYCL_Instance.hpp index 51a617054d6..ab7e8ce71e0 100644 --- a/core/src/SYCL/Kokkos_SYCL_Instance.hpp +++ b/core/src/SYCL/Kokkos_SYCL_Instance.hpp @@ -45,6 +45,7 @@ class SYCLInternal { sycl::device_ptr scratch_space(const std::size_t size); sycl::device_ptr scratch_flags(const std::size_t size); + sycl::host_ptr scratch_host(const std::size_t size); int acquire_team_scratch_space(); sycl::device_ptr resize_team_scratch_space(int scratch_pool_id, std::int64_t bytes, @@ -60,6 +61,8 @@ class SYCLInternal { std::size_t m_scratchSpaceCount = 0; sycl::device_ptr m_scratchSpace = nullptr; + std::size_t m_scratchHostCount = 0; + sycl::host_ptr m_scratchHost = nullptr; std::size_t m_scratchFlagsCount = 0; sycl::device_ptr m_scratchFlags = nullptr; // mutex to access shared memory @@ -330,8 +333,8 @@ struct sycl::is_device_copyable< Kokkos::Experimental::Impl::SYCLFunctionWrapper> : std::true_type {}; -// FIXME_SYCL Remove when this specialization when specializations for -// sycl::device_copyable also apply to const-qualified types. +#if (defined(__INTEL_LLVM_COMPILER) && __INTEL_LLVM_COMPILER < 20240000) || \ + (defined(__LIBSYCL_MAJOR_VERSION) && __LIBSYCL_MAJOR_VERSION < 7) template struct NonTriviallyCopyableAndDeviceCopyable { NonTriviallyCopyableAndDeviceCopyable( @@ -356,3 +359,4 @@ struct sycl::is_device_copyable< : std::true_type {}; #endif #endif +#endif diff --git a/core/src/SYCL/Kokkos_SYCL_ParallelFor_MDRange.hpp b/core/src/SYCL/Kokkos_SYCL_ParallelFor_MDRange.hpp index f4fada570b0..7fbf5420f83 100644 --- a/core/src/SYCL/Kokkos_SYCL_ParallelFor_MDRange.hpp +++ b/core/src/SYCL/Kokkos_SYCL_ParallelFor_MDRange.hpp @@ -118,6 +118,8 @@ class Kokkos::Impl::ParallelFor, const BarePolicy bare_policy(m_policy); + desul::ensure_sycl_lock_arrays_on_device(q); + auto parallel_for_event = q.submit([&](sycl::handler& cgh) { const auto range = compute_ranges(); const sycl::range<3> global_range = range.get_global_range(); diff --git a/core/src/SYCL/Kokkos_SYCL_ParallelFor_Range.hpp b/core/src/SYCL/Kokkos_SYCL_ParallelFor_Range.hpp index 9c5767d209f..b4de7eb89ff 100644 --- a/core/src/SYCL/Kokkos_SYCL_ParallelFor_Range.hpp +++ b/core/src/SYCL/Kokkos_SYCL_ParallelFor_Range.hpp @@ -81,6 +81,8 @@ class Kokkos::Impl::ParallelFor, const Kokkos::Experimental::SYCL& space = policy.space(); sycl::queue& q = space.sycl_queue(); + desul::ensure_sycl_lock_arrays_on_device(q); + auto parallel_for_event = q.submit([&](sycl::handler& cgh) { #ifndef KOKKOS_IMPL_SYCL_USE_IN_ORDER_QUEUES cgh.depends_on(memcpy_event); diff --git a/core/src/SYCL/Kokkos_SYCL_ParallelFor_Team.hpp b/core/src/SYCL/Kokkos_SYCL_ParallelFor_Team.hpp index 4fc5818ce9b..ecb4a863da2 100644 --- a/core/src/SYCL/Kokkos_SYCL_ParallelFor_Team.hpp +++ b/core/src/SYCL/Kokkos_SYCL_ParallelFor_Team.hpp @@ -46,9 +46,9 @@ class Kokkos::Impl::ParallelFor, int m_shmem_size; sycl::device_ptr m_global_scratch_ptr; size_t m_scratch_size[2]; - // Only let one ParallelFor/Reduce modify the team scratch memory. The - // constructor acquires the mutex which is released in the destructor. - std::scoped_lock m_scratch_lock; + // Only let one ParallelFor instance at a time use the team scratch memory. + // The constructor acquires the mutex which is released in the destructor. 
+ std::scoped_lock m_scratch_buffers_lock; int m_scratch_pool_id = -1; template @@ -59,6 +59,8 @@ class Kokkos::Impl::ParallelFor, const Kokkos::Experimental::SYCL& space = policy.space(); sycl::queue& q = space.sycl_queue(); + desul::ensure_sycl_lock_arrays_on_device(q); + auto parallel_for_event = q.submit([&](sycl::handler& cgh) { // FIXME_SYCL accessors seem to need a size greater than zero at least for // host queues @@ -74,7 +76,8 @@ class Kokkos::Impl::ParallelFor, auto lambda = [=](sycl::nd_item<2> item) { const member_type team_member( - team_scratch_memory_L0.get_pointer(), shmem_begin, scratch_size[0], + KOKKOS_IMPL_SYCL_GET_MULTI_PTR(team_scratch_memory_L0), shmem_begin, + scratch_size[0], global_scratch_ptr + item.get_group(1) * scratch_size[1], scratch_size[1], item, item.get_group_linear_id(), item.get_group_range(1)); @@ -141,9 +144,9 @@ class Kokkos::Impl::ParallelFor, m_league_size(arg_policy.league_size()), m_team_size(arg_policy.team_size()), m_vector_size(arg_policy.impl_vector_length()), - m_scratch_lock(arg_policy.space() - .impl_internal_space_instance() - ->m_team_scratch_mutex) { + m_scratch_buffers_lock(arg_policy.space() + .impl_internal_space_instance() + ->m_team_scratch_mutex) { // FIXME_SYCL optimize if (m_team_size < 0) m_team_size = diff --git a/core/src/SYCL/Kokkos_SYCL_ParallelReduce_MDRange.hpp b/core/src/SYCL/Kokkos_SYCL_ParallelReduce_MDRange.hpp index 6964c2dbcf0..f55280e22e3 100644 --- a/core/src/SYCL/Kokkos_SYCL_ParallelReduce_MDRange.hpp +++ b/core/src/SYCL/Kokkos_SYCL_ParallelReduce_MDRange.hpp @@ -78,7 +78,7 @@ class Kokkos::Impl::ParallelReduce::accessible), - m_shared_memory_lock( + m_scratch_buffers_lock( m_space.impl_internal_space_instance()->m_mutexScratchSpace) {} private: @@ -95,9 +95,16 @@ class Kokkos::Impl::ParallelReduce results_ptr; + auto host_result_ptr = + (m_result_ptr && !m_result_ptr_device_accessible) + ? static_cast>( + instance.scratch_host(sizeof(value_type) * value_count)) + : nullptr; sycl::event last_reduction_event; + desul::ensure_sycl_lock_arrays_on_device(q); + // If n_tiles==0 we only call init() and final() working with the global // scratch memory but don't copy back to m_result_ptr yet. if (n_tiles == 0) { @@ -109,8 +116,10 @@ class Kokkos::Impl::ParallelReduce>( instance.scratch_space(sizeof(value_type) * value_count)); - sycl::global_ptr device_accessible_result_ptr = - m_result_ptr_device_accessible ? m_result_ptr : nullptr; + auto device_accessible_result_ptr = + m_result_ptr_device_accessible + ? static_cast>(m_result_ptr) + : static_cast>(host_result_ptr); cgh.single_task([=]() { const CombinedFunctorReducerType& functor_reducer = functor_reducer_wrapper.get_functor(); @@ -148,8 +157,10 @@ class Kokkos::Impl::ParallelReduce>( instance.scratch_space(sizeof(value_type) * value_count * n_wgroups)); - sycl::global_ptr device_accessible_result_ptr = - m_result_ptr_device_accessible ? m_result_ptr : nullptr; + auto device_accessible_result_ptr = + m_result_ptr_device_accessible + ? 
static_cast>(m_result_ptr) + : static_cast>(host_result_ptr); auto scratch_flags = static_cast>( instance.scratch_flags(sizeof(unsigned int))); @@ -223,6 +234,7 @@ class Kokkos::Impl::ParallelReduce= static_cast(n_wgroups)) reducer.init(&local_mem[local_id * value_count]); else { @@ -268,6 +280,7 @@ class Kokkos::Impl::ParallelReduce= static_cast(n_wgroups)) reducer.init(&local_value); else { @@ -296,11 +309,13 @@ class Kokkos::Impl::ParallelReduce( - m_space, m_result_ptr, results_ptr, - sizeof(*m_result_ptr) * value_count); + // Using DeepCopy instead of fence+memcpy turned out to be up to 2x slower. + if (host_result_ptr) { + m_space.fence( + "Kokkos::Impl::ParallelReduce::execute: result " + "not device-accessible"); + std::memcpy(m_result_ptr, host_result_ptr, + sizeof(value_type) * value_count); } return last_reduction_event; @@ -335,9 +350,9 @@ class Kokkos::Impl::ParallelReduce m_shared_memory_lock; + // Only let one ParallelReduce instance at a time use the host scratch memory. + // The constructor acquires the mutex which is released in the destructor. + std::scoped_lock m_scratch_buffers_lock; }; #endif /* KOKKOS_SYCL_PARALLEL_REDUCE_MDRANGE_HPP */ diff --git a/core/src/SYCL/Kokkos_SYCL_ParallelReduce_Range.hpp b/core/src/SYCL/Kokkos_SYCL_ParallelReduce_Range.hpp index 8c900cfa428..5333e3c8a83 100644 --- a/core/src/SYCL/Kokkos_SYCL_ParallelReduce_Range.hpp +++ b/core/src/SYCL/Kokkos_SYCL_ParallelReduce_Range.hpp @@ -51,7 +51,7 @@ class Kokkos::Impl::ParallelReduce::accessible), - m_shared_memory_lock( + m_scratch_buffers_lock( p.space().impl_internal_space_instance()->m_mutexScratchSpace) {} private: @@ -70,11 +70,20 @@ class Kokkos::Impl::ParallelReduce results_ptr = nullptr; - sycl::global_ptr device_accessible_result_ptr = - m_result_ptr_device_accessible ? m_result_ptr : nullptr; + auto host_result_ptr = + (m_result_ptr && !m_result_ptr_device_accessible) + ? static_cast>( + instance.scratch_host(sizeof(value_type) * value_count)) + : nullptr; + auto device_accessible_result_ptr = + m_result_ptr_device_accessible + ? static_cast>(m_result_ptr) + : static_cast>(host_result_ptr); sycl::event last_reduction_event; + desul::ensure_sycl_lock_arrays_on_device(q); + // If size<=1 we only call init(), the functor and possibly final once // working with the global scratch memory but don't copy back to // m_result_ptr yet. @@ -168,6 +177,7 @@ class Kokkos::Impl::ParallelReduce= n_wgroups) reducer.init(&local_mem[local_id * value_count]); else { @@ -210,6 +220,7 @@ class Kokkos::Impl::ParallelReduce= n_wgroups) reducer.init(&local_value); else { @@ -320,11 +331,13 @@ class Kokkos::Impl::ParallelReduce( - space, m_result_ptr, results_ptr, - sizeof(*m_result_ptr) * value_count); + // Using DeepCopy instead of fence+memcpy turned out to be up to 2x slower. + if (host_result_ptr) { + space.fence( + "Kokkos::Impl::ParallelReduce::execute: result " + "not device-accessible"); + std::memcpy(m_result_ptr, host_result_ptr, + sizeof(*m_result_ptr) * value_count); } return last_reduction_event; @@ -354,9 +367,9 @@ class Kokkos::Impl::ParallelReduce m_shared_memory_lock; + // Only let one ParallelReduce instance at a time use the host scratch memory. + // The constructor acquires the mutex which is released in the destructor. 
+ std::scoped_lock m_scratch_buffers_lock; }; #endif /* KOKKOS_SYCL_PARALLEL_REDUCE_RANGE_HPP */ diff --git a/core/src/SYCL/Kokkos_SYCL_ParallelReduce_Team.hpp b/core/src/SYCL/Kokkos_SYCL_ParallelReduce_Team.hpp index 07145b0fb93..27165c59e3a 100644 --- a/core/src/SYCL/Kokkos_SYCL_ParallelReduce_Team.hpp +++ b/core/src/SYCL/Kokkos_SYCL_ParallelReduce_Team.hpp @@ -59,9 +59,10 @@ class Kokkos::Impl::ParallelReduce m_scratch_lock; + // Only let one ParallelReduce instance at a time use the team scratch memory + // and the host scratch memory. The constructor acquires the mutex which is + // released in the destructor. + std::scoped_lock m_scratch_buffers_lock; int m_scratch_pool_id = -1; template @@ -79,9 +80,16 @@ class Kokkos::Impl::ParallelReduce>( + instance.scratch_host(sizeof(value_type) * value_count)) + : nullptr; sycl::event last_reduction_event; + desul::ensure_sycl_lock_arrays_on_device(q); + // If size<=1 we only call init(), the functor and possibly final once // working with the global scratch memory but don't copy back to // m_result_ptr yet. @@ -89,8 +97,10 @@ class Kokkos::Impl::ParallelReduce>(instance.scratch_space( sizeof(value_type) * std::max(value_count, 1u))); - sycl::global_ptr device_accessible_result_ptr = - m_result_ptr_device_accessible ? m_result_ptr : nullptr; + auto device_accessible_result_ptr = + m_result_ptr_device_accessible + ? static_cast>(m_result_ptr) + : static_cast>(host_result_ptr); auto parallel_reduce_event = q.submit([&](sycl::handler& cgh) { // FIXME_SYCL accessors seem to need a size greater than zero at least @@ -121,9 +131,10 @@ class Kokkos::Impl::ParallelReduce) functor(team_member, update); else @@ -160,12 +171,16 @@ class Kokkos::Impl::ParallelReduce const global_scratch_ptr = m_global_scratch_ptr; + sycl::local_accessor num_teams_done(1, cgh); auto team_reduction_factory = [&](sycl::local_accessor local_mem, sycl::device_ptr results_ptr) { - sycl::global_ptr device_accessible_result_ptr = - m_result_ptr_device_accessible ? m_result_ptr : nullptr; + auto device_accessible_result_ptr = + m_result_ptr_device_accessible + ? 
static_cast>(m_result_ptr) + : static_cast>( + host_result_ptr); auto lambda = [=](sycl::nd_item<2> item) { auto n_wgroups = item.get_group_range()[1]; int wgroup_size = @@ -173,8 +188,6 @@ class Kokkos::Impl::ParallelReduce( - local_mem[wgroup_size * std::max(value_count, 1u)]); const auto local_id = item.get_local_linear_id(); const CombinedFunctorReducerType& functor_reducer = functor_reducer_wrapper.get_functor(); @@ -188,8 +201,8 @@ class Kokkos::Impl::ParallelReduce scratch_flags_ref(*scratch_flags); - num_teams_done = ++scratch_flags_ref; + num_teams_done[0] = ++scratch_flags_ref; } sycl::group_barrier(item.get_group()); - if (num_teams_done == n_wgroups) { + if (num_teams_done[0] == n_wgroups) { + if (local_id == 0) *scratch_flags = 0; if (local_id >= n_wgroups) reducer.init(&local_mem[local_id * value_count]); else { @@ -241,8 +255,8 @@ class Kokkos::Impl::ParallelReduce scratch_flags_ref(*scratch_flags); - num_teams_done = ++scratch_flags_ref; + num_teams_done[0] = ++scratch_flags_ref; } item.barrier(sycl::access::fence_space::local_space); - if (num_teams_done == n_wgroups) { + if (num_teams_done[0] == n_wgroups) { + if (local_id == 0) *scratch_flags = 0; if (local_id >= n_wgroups) reducer.init(&local_value); else { @@ -311,10 +326,7 @@ class Kokkos::Impl::ParallelReduce local_mem( - sycl::range<1>(wgroup_size) * std::max(value_count, 1u) + - (sizeof(unsigned int) + sizeof(value_type) - 1) / - sizeof(value_type), - cgh); + sycl::range<1>(wgroup_size) * std::max(value_count, 1u), cgh); const auto init_size = std::max((size + wgroup_size - 1) / wgroup_size, 1); @@ -358,11 +370,13 @@ class Kokkos::Impl::ParallelReduce( - space, m_result_ptr, results_ptr, - sizeof(*m_result_ptr) * value_count); + // Using DeepCopy instead of fence+memcpy turned out to be up to 2x slower. + if (host_result_ptr) { + space.fence( + "Kokkos::Impl::ParallelReduce::execute: result not " + "device-accessible"); + std::memcpy(m_result_ptr, host_result_ptr, + sizeof(*m_result_ptr) * value_count); } return last_reduction_event; @@ -448,9 +462,9 @@ class Kokkos::Impl::ParallelReducem_team_scratch_mutex) { + m_scratch_buffers_lock(arg_policy.space() + .impl_internal_space_instance() + ->m_team_scratch_mutex) { initialize(); } }; diff --git a/core/src/SYCL/Kokkos_SYCL_ParallelScan_Range.hpp b/core/src/SYCL/Kokkos_SYCL_ParallelScan_Range.hpp index 04425723e19..977b69bc9eb 100644 --- a/core/src/SYCL/Kokkos_SYCL_ParallelScan_Range.hpp +++ b/core/src/SYCL/Kokkos_SYCL_ParallelScan_Range.hpp @@ -14,8 +14,8 @@ // //@HEADER -#ifndef KOKKO_SYCL_PARALLEL_SCAN_RANGE_HPP -#define KOKKO_SYCL_PARALLEL_SCAN_RANGE_HPP +#ifndef KOKKOS_SYCL_PARALLEL_SCAN_RANGE_HPP +#define KOKKOS_SYCL_PARALLEL_SCAN_RANGE_HPP #include #include @@ -111,13 +111,13 @@ class ParallelScanSYCLBase { const CombinedFunctorReducer m_functor_reducer; const Policy m_policy; - pointer_type m_scratch_space = nullptr; - const pointer_type m_result_ptr; + sycl::host_ptr m_scratch_host = nullptr; + pointer_type m_result_ptr; const bool m_result_ptr_device_accessible; - // Only let one Parallel/Scan modify the shared memory. The - // constructor acquires the mutex which is released in the destructor. - std::scoped_lock m_shared_memory_lock; + // Only let one ParallelScan instance at a time use the host scratch memory. + // The constructor acquires the mutex which is released in the destructor. 
+ std::scoped_lock m_scratch_buffers_lock; private: template @@ -187,6 +187,7 @@ class ParallelScanSYCLBase { } item.barrier(sycl::access::fence_space::global_space); if (num_teams_done[0] == n_wgroups) { + if (local_id == 0) *scratch_flags = 0; value_type total; reducer.init(&total); @@ -220,6 +221,8 @@ class ParallelScanSYCLBase { sycl::device_ptr global_mem; sycl::device_ptr group_results; + desul::ensure_sycl_lock_arrays_on_device(q); + auto perform_work_group_scans = q.submit([&](sycl::handler& cgh) { sycl::local_accessor num_teams_done(1, cgh); @@ -253,7 +256,8 @@ class ParallelScanSYCLBase { global_mem = static_cast>(instance.scratch_space( n_wgroups * (wgroup_size + 1) * sizeof(value_type))); - m_scratch_space = global_mem; + m_scratch_host = static_cast>( + instance.scratch_host(sizeof(value_type))); group_results = global_mem + n_wgroups * wgroup_size; @@ -281,10 +285,11 @@ class ParallelScanSYCLBase { // Write results to global memory auto update_global_results = q.submit([&](sycl::handler& cgh) { - auto result_ptr_device_accessible = m_result_ptr_device_accessible; // The compiler failed with CL_INVALID_ARG_VALUE if using m_result_ptr // directly. - auto result_ptr = m_result_ptr_device_accessible ? m_result_ptr : nullptr; + pointer_type result_ptr = m_result_ptr_device_accessible + ? m_result_ptr + : static_cast(m_scratch_host); #ifndef KOKKOS_IMPL_SYCL_USE_IN_ORDER_QUEUES cgh.depends_on(perform_work_group_scans); @@ -293,7 +298,6 @@ class ParallelScanSYCLBase { cgh.parallel_for( sycl::nd_range<1>(n_wgroups * wgroup_size, wgroup_size), [=](sycl::nd_item<1> item) { - auto global_mem_copy = global_mem; const index_type global_id = item.get_global_linear_id(); const CombinedFunctorReducer< FunctorType, typename Analysis::Reducer>& functor_reducer = @@ -312,9 +316,7 @@ class ParallelScanSYCLBase { else functor(WorkTag(), global_id + begin, update, true); - global_mem_copy[global_id] = update; - if (global_id == size - 1 && result_ptr_device_accessible) - *result_ptr = update; + if (global_id == size - 1) *result_ptr = update; } }); }); @@ -351,9 +353,9 @@ class ParallelScanSYCLBase { m_policy(arg_policy), m_result_ptr(arg_result_ptr), m_result_ptr_device_accessible(arg_result_ptr_device_accessible), - m_shared_memory_lock(m_policy.space() - .impl_internal_space_instance() - ->m_mutexScratchSpace) {} + m_scratch_buffers_lock(m_policy.space() + .impl_internal_space_instance() + ->m_mutexScratchSpace) {} }; } // namespace Kokkos::Impl @@ -390,11 +392,13 @@ class Kokkos::Impl::ParallelScanWithTotal< Base::impl_execute([&]() { const long long nwork = Base::m_policy.end() - Base::m_policy.begin(); if (nwork > 0 && !Base::m_result_ptr_device_accessible) { + // Using DeepCopy instead of fence+memcpy turned out to be up to 2x + // slower. 
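The comment just above, repeated across these reduction and scan hunks, motivates the new result path: instead of DeepCopy out of device scratch, the kernel writes its final value into USM host memory, and the host then fences and does a plain memcpy. A hedged sketch of that pattern, with hypothetical names:

    #include <cstddef>
    #include <cstring>

    template <class ExecSpace, class ValueType>
    void copy_result_to_host(const ExecSpace& exec, ValueType* result,
                             const ValueType* host_staging, std::size_t count) {
      // The fence orders the kernel's write to the USM host allocation ...
      exec.fence("example: result not device-accessible");
      // ... after which a plain memcpy suffices (and, per the comment above,
      // is up to 2x faster than DeepCopy for these small copies).
      std::memcpy(result, host_staging, sizeof(ValueType) * count);
    }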
+ m_exec.fence( + "Kokkos::Impl::ParallelReduce::execute: " + "result not device-accessible"); const int size = Base::m_functor_reducer.get_reducer().value_size(); - DeepCopy(m_exec, Base::m_result_ptr, - Base::m_scratch_space + nwork - 1, - size); + std::memcpy(Base::m_result_ptr, Base::m_scratch_host, size); } }); } diff --git a/core/src/SYCL/Kokkos_SYCL_Space.cpp b/core/src/SYCL/Kokkos_SYCL_Space.cpp index 64b7f56796a..9cc8008cdf3 100644 --- a/core/src/SYCL/Kokkos_SYCL_Space.cpp +++ b/core/src/SYCL/Kokkos_SYCL_Space.cpp @@ -25,7 +25,6 @@ #include #include #include -#include #include /*--------------------------------------------------------------------------*/ @@ -243,226 +242,17 @@ void SYCLHostUSMSpace::deallocate(const char* arg_label, } // namespace Experimental } // namespace Kokkos -namespace Kokkos { -namespace Impl { - -#ifdef KOKKOS_ENABLE_DEBUG -SharedAllocationRecord SharedAllocationRecord< - Kokkos::Experimental::SYCLDeviceUSMSpace, void>::s_root_record; - -SharedAllocationRecord SharedAllocationRecord< - Kokkos::Experimental::SYCLSharedUSMSpace, void>::s_root_record; - -SharedAllocationRecord SharedAllocationRecord< - Kokkos::Experimental::SYCLHostUSMSpace, void>::s_root_record; -#endif - -SharedAllocationRecord:: - SharedAllocationRecord( - const Kokkos::Experimental::SYCLDeviceUSMSpace& space, - const std::string& label, const size_t size, - const SharedAllocationRecord::function_type dealloc) - // Pass through allocated [ SharedAllocationHeader , user_memory ] - // Pass through deallocation function - : base_t( -#ifdef KOKKOS_ENABLE_DEBUG - &SharedAllocationRecord::s_root_record, -#endif - Kokkos::Impl::checked_allocation_with_header(space, label, size), - sizeof(SharedAllocationHeader) + size, dealloc, label), - m_space(space) { - SharedAllocationHeader header; - - this->base_t::_fill_host_accessible_header_info(header, label); - - // Copy to device memory - Kokkos::Experimental::SYCL exec; - Kokkos::Impl::DeepCopy( - exec, RecordBase::m_alloc_ptr, &header, sizeof(SharedAllocationHeader)); - exec.fence( - "SharedAllocationRecord::SharedAllocationRecord(): fence after copying header from " - "HostSpace"); -} - -SharedAllocationRecord:: - SharedAllocationRecord( - const Kokkos::Experimental::SYCL& arg_exec_space, - const Kokkos::Experimental::SYCLDeviceUSMSpace& space, - const std::string& label, const size_t size, - const SharedAllocationRecord::function_type dealloc) - // Pass through allocated [ SharedAllocationHeader , user_memory ] - // Pass through deallocation function - : base_t( -#ifdef KOKKOS_ENABLE_DEBUG - &SharedAllocationRecord::s_root_record, -#endif - Kokkos::Impl::checked_allocation_with_header(arg_exec_space, space, - label, size), - sizeof(SharedAllocationHeader) + size, dealloc, label), - m_space(space) { - SharedAllocationHeader header; - - this->base_t::_fill_host_accessible_header_info(header, label); - - // Copy to device memory - Kokkos::Impl::DeepCopy( - arg_exec_space, RecordBase::m_alloc_ptr, &header, - sizeof(SharedAllocationHeader)); -} - -SharedAllocationRecord:: - SharedAllocationRecord( - const Kokkos::Experimental::SYCL& exec_space, - const Kokkos::Experimental::SYCLSharedUSMSpace& arg_space, - const std::string& arg_label, const size_t arg_alloc_size, - const SharedAllocationRecord::function_type arg_dealloc) - // Pass through allocated [ SharedAllocationHeader , user_memory ] - // Pass through deallocation function - : base_t( -#ifdef KOKKOS_ENABLE_DEBUG - &SharedAllocationRecord::s_root_record, -#endif - 
Impl::checked_allocation_with_header(exec_space, arg_space, arg_label, - arg_alloc_size), - sizeof(SharedAllocationHeader) + arg_alloc_size, arg_dealloc, - arg_label), - m_space(arg_space) { - - this->base_t::_fill_host_accessible_header_info(*base_t::m_alloc_ptr, - arg_label); -} - -SharedAllocationRecord:: - SharedAllocationRecord( - const Kokkos::Experimental::SYCLSharedUSMSpace& arg_space, - const std::string& arg_label, const size_t arg_alloc_size, - const SharedAllocationRecord::function_type arg_dealloc) - // Pass through allocated [ SharedAllocationHeader , user_memory ] - // Pass through deallocation function - : base_t( -#ifdef KOKKOS_ENABLE_DEBUG - &SharedAllocationRecord::s_root_record, -#endif - Impl::checked_allocation_with_header(arg_space, arg_label, - arg_alloc_size), - sizeof(SharedAllocationHeader) + arg_alloc_size, arg_dealloc, - arg_label), - m_space(arg_space) { - - this->base_t::_fill_host_accessible_header_info(*base_t::m_alloc_ptr, - arg_label); -} - -SharedAllocationRecord:: - SharedAllocationRecord( - const Kokkos::Experimental::SYCL& exec_space, - const Kokkos::Experimental::SYCLHostUSMSpace& arg_space, - const std::string& arg_label, const size_t arg_alloc_size, - const SharedAllocationRecord::function_type arg_dealloc) - // Pass through allocated [ SharedAllocationHeader , user_memory ] - // Pass through deallocation function - : base_t( -#ifdef KOKKOS_ENABLE_DEBUG - &SharedAllocationRecord::s_root_record, -#endif - Impl::checked_allocation_with_header(exec_space, arg_space, arg_label, - arg_alloc_size), - sizeof(SharedAllocationHeader) + arg_alloc_size, arg_dealloc, - arg_label), - m_space(arg_space) { - - this->base_t::_fill_host_accessible_header_info(*base_t::m_alloc_ptr, - arg_label); -} - -SharedAllocationRecord:: - SharedAllocationRecord( - const Kokkos::Experimental::SYCLHostUSMSpace& arg_space, - const std::string& arg_label, const size_t arg_alloc_size, - const SharedAllocationRecord::function_type arg_dealloc) - // Pass through allocated [ SharedAllocationHeader , user_memory ] - // Pass through deallocation function - : base_t( -#ifdef KOKKOS_ENABLE_DEBUG - &SharedAllocationRecord::s_root_record, -#endif - Impl::checked_allocation_with_header(arg_space, arg_label, - arg_alloc_size), - sizeof(SharedAllocationHeader) + arg_alloc_size, arg_dealloc, - arg_label), - m_space(arg_space) { - - this->base_t::_fill_host_accessible_header_info(*base_t::m_alloc_ptr, - arg_label); -} - -} // namespace Impl -} // namespace Kokkos - -//---------------------------------------------------------------------------- -//---------------------------------------------------------------------------- - -namespace Kokkos { -namespace Impl { - -SharedAllocationRecord::~SharedAllocationRecord() { - const auto alloc_size = SharedAllocationRecord::m_alloc_size; - m_space.deallocate(m_label.c_str(), - SharedAllocationRecord::m_alloc_ptr, - alloc_size, alloc_size - sizeof(SharedAllocationHeader)); -} - -SharedAllocationRecord::~SharedAllocationRecord() { - const auto alloc_size = SharedAllocationRecord::m_alloc_size; - m_space.deallocate(m_label.c_str(), - SharedAllocationRecord::m_alloc_ptr, - alloc_size, alloc_size - sizeof(SharedAllocationHeader)); -} - -SharedAllocationRecord::~SharedAllocationRecord() { - const auto alloc_size = SharedAllocationRecord::m_alloc_size; - m_space.deallocate(m_label.c_str(), - SharedAllocationRecord::m_alloc_ptr, - alloc_size, alloc_size - sizeof(SharedAllocationHeader)); -} - 
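For context on the records being deleted here: each one managed a single block laid out as [ SharedAllocationHeader | user data ], which is why the destructors above pass `alloc_size - sizeof(SharedAllocationHeader)` as the logical size. A host-side toy of that layout (illustrative struct and names, not the Kokkos definitions):

    #include <cstddef>
    #include <cstdlib>
    #include <new>

    struct ExampleHeader {
      char label[64];  // stand-in for the real header fields
    };

    void* example_allocate_with_header(std::size_t user_size) {
      // One block holds the header followed by the user's payload; the
      // pointer handed to the user starts just past the header.
      char* base = static_cast<char*>(
          std::malloc(sizeof(ExampleHeader) + user_size));
      new (base) ExampleHeader{};
      return base + sizeof(ExampleHeader);
    }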
-//---------------------------------------------------------------------------- - -} // namespace Impl -} // namespace Kokkos - //============================================================================== // {{{1 #include -namespace Kokkos { -namespace Impl { - -// To avoid additional compilation cost for something that's (mostly?) not -// performance sensitive, we explicity instantiate these CRTP base classes here, -// where we have access to the associated *_timpl.hpp header files. -template class HostInaccessibleSharedAllocationRecordCommon< - Kokkos::Experimental::SYCLDeviceUSMSpace>; -template class SharedAllocationRecordCommon< - Kokkos::Experimental::SYCLDeviceUSMSpace>; -template class SharedAllocationRecordCommon< - Kokkos::Experimental::SYCLSharedUSMSpace>; -template class SharedAllocationRecordCommon< - Kokkos::Experimental::SYCLHostUSMSpace>; - -} // namespace Impl -} // namespace Kokkos +KOKKOS_IMPL_HOST_INACCESSIBLE_SHARED_ALLOCATION_RECORD_EXPLICIT_INSTANTIATION( + Kokkos::Experimental::SYCLDeviceUSMSpace); +KOKKOS_IMPL_SHARED_ALLOCATION_RECORD_EXPLICIT_INSTANTIATION( + Kokkos::Experimental::SYCLSharedUSMSpace); +KOKKOS_IMPL_SHARED_ALLOCATION_RECORD_EXPLICIT_INSTANTIATION( + Kokkos::Experimental::SYCLHostUSMSpace); // end Explicit instantiations of CRTP Base classes }}}1 //============================================================================== diff --git a/core/src/SYCL/Kokkos_SYCL_Space.hpp b/core/src/SYCL/Kokkos_SYCL_Space.hpp index 239c6e3ce0b..b86cfca413c 100644 --- a/core/src/SYCL/Kokkos_SYCL_Space.hpp +++ b/core/src/SYCL/Kokkos_SYCL_Space.hpp @@ -66,11 +66,6 @@ class SYCLDeviceUSMSpace { const size_t arg_alloc_size, const size_t arg_logical_size = 0) const; - private: - template - friend class LogicalMemorySpace; - - public: static constexpr const char* name() { return "SYCLDeviceUSM"; }; private: @@ -87,6 +82,16 @@ class SYCLSharedUSMSpace { SYCLSharedUSMSpace(); explicit SYCLSharedUSMSpace(sycl::queue queue); + template + void* allocate(const ExecutionSpace&, const size_t arg_alloc_size) const { + return allocate(arg_alloc_size); + } + template + void* allocate(const ExecutionSpace&, const char* arg_label, + const size_t arg_alloc_size, + const size_t arg_logical_size = 0) const { + return allocate(arg_label, arg_alloc_size, arg_logical_size); + } void* allocate(const SYCL& exec_space, const std::size_t arg_alloc_size) const; void* allocate(const SYCL& exec_space, const char* arg_label, @@ -102,11 +107,6 @@ class SYCLSharedUSMSpace { const size_t arg_alloc_size, const size_t arg_logical_size = 0) const; - private: - template - friend class LogicalMemorySpace; - - public: static constexpr const char* name() { return "SYCLSharedUSM"; }; private: @@ -123,6 +123,16 @@ class SYCLHostUSMSpace { SYCLHostUSMSpace(); explicit SYCLHostUSMSpace(sycl::queue queue); + template + void* allocate(const ExecutionSpace&, const size_t arg_alloc_size) const { + return allocate(arg_alloc_size); + } + template + void* allocate(const ExecutionSpace&, const char* arg_label, + const size_t arg_alloc_size, + const size_t arg_logical_size = 0) const { + return allocate(arg_label, arg_alloc_size, arg_logical_size); + } void* allocate(const SYCL& exec_space, const std::size_t arg_alloc_size) const; void* allocate(const SYCL& exec_space, const char* arg_label, @@ -138,11 +148,6 @@ class SYCLHostUSMSpace { const size_t arg_alloc_size, const size_t arg_logical_size = 0) const; - private: - template - friend class LogicalMemorySpace; - - public: static constexpr const char* name() { 
return "SYCLHostUSM"; }; private: @@ -166,19 +171,16 @@ struct is_sycl_type_space : public std::true_type {}; static_assert(Kokkos::Impl::MemorySpaceAccess< - Kokkos::Experimental::SYCLDeviceUSMSpace, - Kokkos::Experimental::SYCLDeviceUSMSpace>::assignable, - ""); + Kokkos::Experimental::SYCLDeviceUSMSpace, + Kokkos::Experimental::SYCLDeviceUSMSpace>::assignable); static_assert(Kokkos::Impl::MemorySpaceAccess< - Kokkos::Experimental::SYCLSharedUSMSpace, - Kokkos::Experimental::SYCLSharedUSMSpace>::assignable, - ""); + Kokkos::Experimental::SYCLSharedUSMSpace, + Kokkos::Experimental::SYCLSharedUSMSpace>::assignable); static_assert(Kokkos::Impl::MemorySpaceAccess< - Kokkos::Experimental::SYCLDeviceUSMSpace, - Kokkos::Experimental::SYCLDeviceUSMSpace>::assignable, - ""); + Kokkos::Experimental::SYCLDeviceUSMSpace, + Kokkos::Experimental::SYCLDeviceUSMSpace>::assignable); template <> struct MemorySpaceAccess -class SharedAllocationRecord - : public HostInaccessibleSharedAllocationRecordCommon< - Kokkos::Experimental::SYCLDeviceUSMSpace> { - private: - friend class SharedAllocationRecordCommon< - Kokkos::Experimental::SYCLDeviceUSMSpace>; - friend class HostInaccessibleSharedAllocationRecordCommon< - Kokkos::Experimental::SYCLDeviceUSMSpace>; - using base_t = HostInaccessibleSharedAllocationRecordCommon< - Kokkos::Experimental::SYCLDeviceUSMSpace>; - using RecordBase = SharedAllocationRecord; - - SharedAllocationRecord(const SharedAllocationRecord&) = delete; - SharedAllocationRecord(SharedAllocationRecord&&) = delete; - SharedAllocationRecord& operator=(const SharedAllocationRecord&) = delete; - SharedAllocationRecord& operator=(SharedAllocationRecord&&) = delete; - -#ifdef KOKKOS_ENABLE_DEBUG - static RecordBase s_root_record; -#endif - - const Kokkos::Experimental::SYCLDeviceUSMSpace m_space; - - protected: - ~SharedAllocationRecord(); - - template - SharedAllocationRecord( - const ExecutionSpace& /*exec_space*/, - const Kokkos::Experimental::SYCLDeviceUSMSpace& arg_space, - const std::string& arg_label, const size_t arg_alloc_size, - const RecordBase::function_type arg_dealloc = &base_t::deallocate) - : SharedAllocationRecord(arg_space, arg_label, arg_alloc_size, - arg_dealloc) {} - - SharedAllocationRecord( - const Kokkos::Experimental::SYCL& exec_space, - const Kokkos::Experimental::SYCLDeviceUSMSpace& arg_space, - const std::string& arg_label, const size_t arg_alloc_size, - const RecordBase::function_type arg_dealloc = &base_t::deallocate); - - SharedAllocationRecord( - const Kokkos::Experimental::SYCLDeviceUSMSpace& arg_space, - const std::string& arg_label, const size_t arg_alloc_size, - const RecordBase::function_type arg_dealloc = &base_t::deallocate); -}; - -template <> -class SharedAllocationRecord - : public SharedAllocationRecordCommon< - Kokkos::Experimental::SYCLSharedUSMSpace> { - private: - friend class SharedAllocationRecordCommon< - Kokkos::Experimental::SYCLSharedUSMSpace>; - using base_t = - SharedAllocationRecordCommon; - using RecordBase = SharedAllocationRecord; - - SharedAllocationRecord(const SharedAllocationRecord&) = delete; - SharedAllocationRecord(SharedAllocationRecord&&) = delete; - SharedAllocationRecord& operator=(const SharedAllocationRecord&) = delete; - SharedAllocationRecord& operator=(SharedAllocationRecord&&) = delete; - - static RecordBase s_root_record; - - const Kokkos::Experimental::SYCLSharedUSMSpace m_space; - - protected: - ~SharedAllocationRecord(); - - SharedAllocationRecord() = default; - - template - SharedAllocationRecord( - const 
ExecutionSpace& /*exec_space*/, - const Kokkos::Experimental::SYCLSharedUSMSpace& arg_space, - const std::string& arg_label, const size_t arg_alloc_size, - const RecordBase::function_type arg_dealloc = &base_t::deallocate) - : SharedAllocationRecord(arg_space, arg_label, arg_alloc_size, - arg_dealloc) {} - - SharedAllocationRecord( - const Kokkos::Experimental::SYCL& exec_space, - const Kokkos::Experimental::SYCLSharedUSMSpace& arg_space, - const std::string& arg_label, const size_t arg_alloc_size, - const RecordBase::function_type arg_dealloc = &base_t::deallocate); - - SharedAllocationRecord( - const Kokkos::Experimental::SYCLSharedUSMSpace& arg_space, - const std::string& arg_label, const size_t arg_alloc_size, - const RecordBase::function_type arg_dealloc = &base_t::deallocate); -}; - -template <> -class SharedAllocationRecord - : public SharedAllocationRecordCommon< - Kokkos::Experimental::SYCLHostUSMSpace> { - private: - friend class SharedAllocationRecordCommon< - Kokkos::Experimental::SYCLHostUSMSpace>; - using base_t = - SharedAllocationRecordCommon; - using RecordBase = SharedAllocationRecord; - - SharedAllocationRecord(const SharedAllocationRecord&) = delete; - SharedAllocationRecord(SharedAllocationRecord&&) = delete; - SharedAllocationRecord& operator=(const SharedAllocationRecord&) = delete; - SharedAllocationRecord& operator=(SharedAllocationRecord&&) = delete; - - static RecordBase s_root_record; - - const Kokkos::Experimental::SYCLHostUSMSpace m_space; - - protected: - ~SharedAllocationRecord(); - - SharedAllocationRecord() = default; - - template - SharedAllocationRecord( - const ExecutionSpace& /*exec_space*/, - const Kokkos::Experimental::SYCLHostUSMSpace& arg_space, - const std::string& arg_label, const size_t arg_alloc_size, - const RecordBase::function_type arg_dealloc = &base_t::deallocate) - : SharedAllocationRecord(arg_space, arg_label, arg_alloc_size, - arg_dealloc) {} - - SharedAllocationRecord( - const Kokkos::Experimental::SYCL& exec_space, - const Kokkos::Experimental::SYCLHostUSMSpace& arg_space, - const std::string& arg_label, const size_t arg_alloc_size, - const RecordBase::function_type arg_dealloc = &base_t::deallocate); - - SharedAllocationRecord( - const Kokkos::Experimental::SYCLHostUSMSpace& arg_space, - const std::string& arg_label, const size_t arg_alloc_size, - const RecordBase::function_type arg_dealloc = &base_t::deallocate); -}; - -} // namespace Impl - } // namespace Kokkos +KOKKOS_IMPL_HOST_INACCESSIBLE_SHARED_ALLOCATION_SPECIALIZATION( + Kokkos::Experimental::SYCLDeviceUSMSpace); +KOKKOS_IMPL_SHARED_ALLOCATION_SPECIALIZATION( + Kokkos::Experimental::SYCLSharedUSMSpace); +KOKKOS_IMPL_SHARED_ALLOCATION_SPECIALIZATION( + Kokkos::Experimental::SYCLHostUSMSpace); + #endif #endif diff --git a/core/src/SYCL/Kokkos_SYCL_Team.hpp b/core/src/SYCL/Kokkos_SYCL_Team.hpp index 89c09c3195f..dbba3827581 100644 --- a/core/src/SYCL/Kokkos_SYCL_Team.hpp +++ b/core/src/SYCL/Kokkos_SYCL_Team.hpp @@ -140,9 +140,14 @@ class SYCLTeamMember { } value = sg.shuffle(value, 0); + const auto n_subgroups = sg.get_group_range()[0]; + if (n_subgroups == 1) { + reducer.reference() = value; + return; + } + // We need to chunk up the whole reduction because we might not have // allocated enough memory. 
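//----------------------------------------------------------------------------
// The early return added above is the interesting part: after the subgroup
// shuffle, `value` already holds the subgroup's full result, so a team that
// spans a single subgroup never needs the chunked cross-subgroup pass. A
// stand-alone sketch under the same names (reducer.reference() is assumed to
// expose the destination, as Kokkos reducers do):

template <class Reducer, class ValueType>
bool finish_if_single_subgroup(Reducer& reducer, ValueType value,
                               unsigned n_subgroups) {
  if (n_subgroups == 1) {
    reducer.reference() = value;  // shuffle already produced the team result
    return true;                  // skip the chunked reduction entirely
  }
  return false;  // more than one subgroup: fall through to chunking below
}
//----------------------------------------------------------------------------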
- const auto n_subgroups = sg.get_group_range()[0]; const unsigned int maximum_work_range = std::min(m_team_reduce_size / sizeof(value_type), n_subgroups); diff --git a/core/src/SYCL/Kokkos_SYCL_ZeroMemset.hpp b/core/src/SYCL/Kokkos_SYCL_ZeroMemset.hpp index 9548f211d9e..61db6b34aac 100644 --- a/core/src/SYCL/Kokkos_SYCL_ZeroMemset.hpp +++ b/core/src/SYCL/Kokkos_SYCL_ZeroMemset.hpp @@ -26,8 +26,7 @@ namespace Impl { template struct ZeroMemset> { ZeroMemset(const Kokkos::Experimental::SYCL& exec_space, - const View& dst, - typename View::const_value_type&) { + const View& dst) { auto event = exec_space.impl_internal_space_instance()->m_queue->memset( dst.data(), 0, dst.size() * sizeof(typename View::value_type)); #ifndef KOKKOS_IMPL_SYCL_USE_IN_ORDER_QUEUES @@ -35,12 +34,6 @@ struct ZeroMemset> { ->m_queue->ext_oneapi_submit_barrier(std::vector{event}); #endif } - - ZeroMemset(const View& dst, - typename View::const_value_type&) { - Experimental::Impl::SYCLInternal::singleton().m_queue->memset( - dst.data(), 0, dst.size() * sizeof(typename View::value_type)); - } }; } // namespace Impl diff --git a/core/src/Serial/Kokkos_Serial.cpp b/core/src/Serial/Kokkos_Serial.cpp index 071ecdbc4fa..39b201976b5 100644 --- a/core/src/Serial/Kokkos_Serial.cpp +++ b/core/src/Serial/Kokkos_Serial.cpp @@ -153,7 +153,7 @@ void Serial::print_configuration(std::ostream& os, bool /*verbose*/) const { os << "Host Serial Execution Space:\n"; os << " KOKKOS_ENABLE_SERIAL: yes\n"; -#ifdef KOKKOS_INTERNAL_NOT_PARALLEL +#ifdef KOKKOS_ENABLE_ATOMICS_BYPASS os << "Kokkos atomics disabled\n"; #endif diff --git a/core/src/Serial/Kokkos_Serial.hpp b/core/src/Serial/Kokkos_Serial.hpp index 67119cac164..43eb4992ed7 100644 --- a/core/src/Serial/Kokkos_Serial.hpp +++ b/core/src/Serial/Kokkos_Serial.hpp @@ -121,7 +121,10 @@ class Serial { /// For the Serial device, this method always returns false, /// because parallel_for or parallel_reduce with the Serial device /// always execute sequentially. - inline static int in_parallel() { return false; } + +#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 + KOKKOS_DEPRECATED inline static int in_parallel() { return false; } +#endif /// \brief Wait until all dispatched functors complete. 
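//----------------------------------------------------------------------------
// The guard-plus-attribute pattern used for in_parallel() above recurs for
// Threads later in this patch: the member only exists when deprecated APIs
// are compiled in, and is then marked so every caller gets a warning. A
// condensed sketch, assuming KOKKOS_DEPRECATED expands to [[deprecated]]:

struct ExampleExecutionSpace {
#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4
  // Serial work never nests inside a parallel region, hence the constant.
  [[deprecated]] static int in_parallel() { return 0; }
#endif
};
//----------------------------------------------------------------------------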
/// diff --git a/core/src/Serial/Kokkos_Serial_Parallel_MDRange.hpp b/core/src/Serial/Kokkos_Serial_Parallel_MDRange.hpp index 69787aa5001..67978aa3e9f 100644 --- a/core/src/Serial/Kokkos_Serial_Parallel_MDRange.hpp +++ b/core/src/Serial/Kokkos_Serial_Parallel_MDRange.hpp @@ -14,8 +14,8 @@ // //@HEADER -#ifndef KOKKO_SERIAL_PARALLEL_MDRANGE_HPP -#define KOKKO_SERIAL_PARALLEL_MDRANGE_HPP +#ifndef KOKKOS_SERIAL_PARALLEL_MDRANGE_HPP +#define KOKKOS_SERIAL_PARALLEL_MDRANGE_HPP #include #include diff --git a/core/src/Serial/Kokkos_Serial_Parallel_Range.hpp b/core/src/Serial/Kokkos_Serial_Parallel_Range.hpp index 56894716dbd..91b4c567113 100644 --- a/core/src/Serial/Kokkos_Serial_Parallel_Range.hpp +++ b/core/src/Serial/Kokkos_Serial_Parallel_Range.hpp @@ -14,8 +14,8 @@ // //@HEADER -#ifndef KOKKO_SERIAL_PARALLEL_RANGE_HPP -#define KOKKO_SERIAL_PARALLEL_RANGE_HPP +#ifndef KOKKOS_SERIAL_PARALLEL_RANGE_HPP +#define KOKKOS_SERIAL_PARALLEL_RANGE_HPP #include diff --git a/core/src/Serial/Kokkos_Serial_Parallel_Team.hpp b/core/src/Serial/Kokkos_Serial_Parallel_Team.hpp index 0876f1af229..f34a7daaca0 100644 --- a/core/src/Serial/Kokkos_Serial_Parallel_Team.hpp +++ b/core/src/Serial/Kokkos_Serial_Parallel_Team.hpp @@ -14,8 +14,8 @@ // //@HEADER -#ifndef KOKKO_SERIAL_PARALLEL_TEAM_HPP -#define KOKKO_SERIAL_PARALLEL_TEAM_HPP +#ifndef KOKKOS_SERIAL_PARALLEL_TEAM_HPP +#define KOKKOS_SERIAL_PARALLEL_TEAM_HPP #include diff --git a/core/src/Serial/Kokkos_Serial_Task.hpp b/core/src/Serial/Kokkos_Serial_Task.hpp index f9c86f55ce0..5905d6d32e1 100644 --- a/core/src/Serial/Kokkos_Serial_Task.hpp +++ b/core/src/Serial/Kokkos_Serial_Task.hpp @@ -121,7 +121,7 @@ class TaskQueueSpecializationConstrained< using task_base_type = TaskBase; using queue_type = typename scheduler_type::queue_type; - task_base_type* const end = (task_base_type*)task_base_type::EndTag; + auto* const end = reinterpret_cast(task_base_type::EndTag); execution_space serial_execution_space; auto& data = serial_execution_space.impl_internal_space_instance() @@ -157,7 +157,7 @@ class TaskQueueSpecializationConstrained< using task_base_type = TaskBase; using queue_type = typename scheduler_type::queue_type; - task_base_type* const end = (task_base_type*)task_base_type::EndTag; + auto* const end = reinterpret_cast(task_base_type::EndTag); execution_space serial_execution_space; diff --git a/core/src/Serial/Kokkos_Serial_ZeroMemset.hpp b/core/src/Serial/Kokkos_Serial_ZeroMemset.hpp index 3ec2dfbcfa0..6ad6aabc5a7 100644 --- a/core/src/Serial/Kokkos_Serial_ZeroMemset.hpp +++ b/core/src/Serial/Kokkos_Serial_ZeroMemset.hpp @@ -22,6 +22,7 @@ #include #include +#include namespace Kokkos { namespace Impl { @@ -34,14 +35,11 @@ template struct ZeroMemset< std::conditional_t::value, Serial, DummyExecutionSpace>, - View> - : public ZeroMemset> { - using Base = ZeroMemset>; - using Base::Base; - - ZeroMemset(const Serial&, const View& dst, - typename View::const_value_type& value) - : Base(dst, value) {} + View> { + ZeroMemset(const Serial&, const View& dst) { + using ValueType = typename View::value_type; + std::memset(dst.data(), 0, sizeof(ValueType) * dst.size()); + } }; } // namespace Impl diff --git a/core/src/Threads/Kokkos_Threads.hpp b/core/src/Threads/Kokkos_Threads.hpp index c0d70c03ecb..31653c46cac 100644 --- a/core/src/Threads/Kokkos_Threads.hpp +++ b/core/src/Threads/Kokkos_Threads.hpp @@ -38,15 +38,6 @@ static_assert(false, /*--------------------------------------------------------------------------*/ -namespace Kokkos { -namespace Impl { -class 
ThreadsExec; -enum class fence_is_static { yes, no }; -} // namespace Impl -} // namespace Kokkos - -/*--------------------------------------------------------------------------*/ - namespace Kokkos { /** \brief Execution space for a pool of C++11 threads on a CPU. */ @@ -73,7 +64,9 @@ class Threads { /// \brief True if and only if this method is being called in a /// thread-parallel function. - static int in_parallel(); +#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 + KOKKOS_DEPRECATED static int in_parallel(); +#endif /// \brief Print configuration information to the given output stream. void print_configuration(std::ostream& os, bool verbose = false) const; diff --git a/core/src/Threads/Kokkos_ThreadsExec.cpp b/core/src/Threads/Kokkos_Threads_Instance.cpp similarity index 56% rename from core/src/Threads/Kokkos_ThreadsExec.cpp rename to core/src/Threads/Kokkos_Threads_Instance.cpp index 801a1ac82e9..3842966cd77 100644 --- a/core/src/Threads/Kokkos_ThreadsExec.cpp +++ b/core/src/Threads/Kokkos_Threads_Instance.cpp @@ -16,17 +16,15 @@ #ifndef KOKKOS_IMPL_PUBLIC_INCLUDE #define KOKKOS_IMPL_PUBLIC_INCLUDE +#include "Threads/Kokkos_Threads_Instance.hpp" #endif #include -#include -#include #include #include #include #include -#include #include @@ -41,7 +39,6 @@ namespace Kokkos { namespace Impl { namespace { -std::mutex host_internal_cppthread_mutex; // std::thread compatible driver. // Recovery from an exception would require constant intra-thread health @@ -49,7 +46,7 @@ std::mutex host_internal_cppthread_mutex; // abort the process. void internal_cppthread_driver() { try { - ThreadsExec::driver(); + ThreadsInternal::driver(); } catch (const std::exception &x) { std::cerr << "Exception thrown from worker thread: " << x.what() << std::endl; @@ -62,32 +59,17 @@ void internal_cppthread_driver() { } } -ThreadsExec s_threads_process; -ThreadsExec *s_threads_exec[ThreadsExec::MAX_THREAD_COUNT] = {nullptr}; -std::thread::id s_threads_pid[ThreadsExec::MAX_THREAD_COUNT]; -std::pair s_threads_coord[ThreadsExec::MAX_THREAD_COUNT]; +ThreadsInternal s_threads_process; +ThreadsInternal *s_threads_exec[ThreadsInternal::MAX_THREAD_COUNT] = {nullptr}; +std::thread::id s_threads_pid[ThreadsInternal::MAX_THREAD_COUNT]; +std::pair + s_threads_coord[ThreadsInternal::MAX_THREAD_COUNT]; int s_thread_pool_size[3] = {0, 0, 0}; -unsigned s_current_reduce_size = 0; -unsigned s_current_shared_size = 0; - -void (*volatile s_current_function)(ThreadsExec &, const void *); +void (*volatile s_current_function)(ThreadsInternal &, const void *); const void *volatile s_current_function_arg = nullptr; -struct Sentinel { - ~Sentinel() { - if (s_thread_pool_size[0] || s_thread_pool_size[1] || - s_thread_pool_size[2] || s_current_reduce_size || - s_current_shared_size || s_current_function || s_current_function_arg || - s_threads_exec[0]) { - std::cerr << "ERROR : Process exiting while Kokkos::Threads is still " - "initialized" - << std::endl; - } - } -}; - inline unsigned fan_size(const unsigned rank, const unsigned size) { const unsigned rank_rev = size - (rank + 1); unsigned count = 0; @@ -97,6 +79,12 @@ inline unsigned fan_size(const unsigned rank, const unsigned size) { return count; } +void wait_yield(volatile ThreadState &flag, const ThreadState value) { + while (value == flag) { + std::this_thread::yield(); + } +} + } // namespace } // namespace Impl } // namespace Kokkos @@ -107,66 +95,44 @@ inline unsigned fan_size(const unsigned rank, const unsigned size) { namespace Kokkos { namespace Impl { 
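//----------------------------------------------------------------------------
// As the driver comment above spells out, a worker that throws cannot be
// recovered; any escaped exception is deliberately turned into a process
// abort. A condensed model of that policy (driver_body stands in for
// ThreadsInternal::driver):

#include <cstdlib>
#include <exception>
#include <iostream>

void worker_main(void (*driver_body)()) {
  try {
    driver_body();
  } catch (const std::exception &x) {
    std::cerr << "Exception thrown from worker thread: " << x.what()
              << std::endl;
    std::abort();  // no realistic recovery once a worker dies
  } catch (...) {
    std::cerr << "Exception thrown from worker thread" << std::endl;
    std::abort();
  }
}
//----------------------------------------------------------------------------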
-//---------------------------------------------------------------------------- -// Spawn a thread - -void ThreadsExec::spawn() { - std::thread t(internal_cppthread_driver); - t.detach(); -} - -//---------------------------------------------------------------------------- - -bool ThreadsExec::is_process() { +bool ThreadsInternal::is_process() { static const std::thread::id master_pid = std::this_thread::get_id(); return master_pid == std::this_thread::get_id(); } -void ThreadsExec::global_lock() { host_internal_cppthread_mutex.lock(); } - -void ThreadsExec::global_unlock() { host_internal_cppthread_mutex.unlock(); } - //---------------------------------------------------------------------------- -void ThreadsExec::wait_yield(volatile int &flag, const int value) { - while (value == flag) { - std::this_thread::yield(); - } -} - -void execute_function_noop(ThreadsExec &, const void *) {} +void execute_function_noop(ThreadsInternal &, const void *) {} -void ThreadsExec::driver() { +void ThreadsInternal::driver() { SharedAllocationRecord::tracking_enable(); - ThreadsExec this_thread; + ThreadsInternal this_thread; - while (ThreadsExec::Active == this_thread.m_pool_state) { + while (this_thread.m_pool_state == ThreadState::Active) { (*s_current_function)(this_thread, s_current_function_arg); // Deactivate thread and wait for reactivation - this_thread.m_pool_state = ThreadsExec::Inactive; + this_thread.m_pool_state = ThreadState::Inactive; - wait_yield(this_thread.m_pool_state, ThreadsExec::Inactive); + wait_yield(this_thread.m_pool_state, ThreadState::Inactive); } } -ThreadsExec::ThreadsExec() +ThreadsInternal::ThreadsInternal() : m_pool_base(nullptr), m_scratch(nullptr), m_scratch_reduce_end(0), m_scratch_thread_end(0), - m_numa_rank(0), - m_numa_core_rank(0), m_pool_rank(0), m_pool_size(0), m_pool_fan_size(0), - m_pool_state(ThreadsExec::Terminating) { + m_pool_state(ThreadState::Terminating) { if (&s_threads_process != this) { - // A spawned thread - - ThreadsExec *const nil = nullptr; + // The code in the if is executed by a spawned thread not by the root + // thread + ThreadsInternal *const nil = nullptr; // Which entry in 's_threads_exec', possibly determined from hwloc binding const int entry = reinterpret_cast(s_current_function_arg) < @@ -178,80 +144,66 @@ ThreadsExec::ThreadsExec() // Given a good entry set this thread in the 's_threads_exec' array if (entry < s_thread_pool_size[0] && nil == atomic_compare_exchange(s_threads_exec + entry, nil, this)) { - const std::pair coord = - Kokkos::hwloc::get_this_thread_coordinate(); - - m_numa_rank = coord.first; - m_numa_core_rank = coord.second; - m_pool_base = s_threads_exec; - m_pool_rank = s_thread_pool_size[0] - (entry + 1); - m_pool_rank_rev = s_thread_pool_size[0] - (pool_rank() + 1); - m_pool_size = s_thread_pool_size[0]; - m_pool_fan_size = fan_size(m_pool_rank, m_pool_size); - m_pool_state = ThreadsExec::Active; + m_pool_base = s_threads_exec; + m_pool_rank = s_thread_pool_size[0] - (entry + 1); + m_pool_rank_rev = s_thread_pool_size[0] - (pool_rank() + 1); + m_pool_size = s_thread_pool_size[0]; + m_pool_fan_size = fan_size(m_pool_rank, m_pool_size); + m_pool_state = ThreadState::Active; s_threads_pid[m_pool_rank] = std::this_thread::get_id(); // Inform spawning process that the threads_exec entry has been set. - s_threads_process.m_pool_state = ThreadsExec::Active; + s_threads_process.m_pool_state = ThreadState::Active; } else { // Inform spawning process that the threads_exec entry could not be set. 
- s_threads_process.m_pool_state = ThreadsExec::Terminating; + s_threads_process.m_pool_state = ThreadState::Terminating; } } else { // Enables 'parallel_for' to execute on uninitialized Threads device m_pool_rank = 0; m_pool_size = 1; - m_pool_state = ThreadsExec::Inactive; + m_pool_state = ThreadState::Inactive; s_threads_pid[m_pool_rank] = std::this_thread::get_id(); } } -ThreadsExec::~ThreadsExec() { +ThreadsInternal::~ThreadsInternal() { const unsigned entry = m_pool_size - (m_pool_rank + 1); - using Record = Kokkos::Impl::SharedAllocationRecord<Kokkos::HostSpace, void>; - if (m_scratch) { - Record *const r = Record::get_record(m_scratch); - + Kokkos::kokkos_free(m_scratch); m_scratch = nullptr; - - Record::decrement(r); } m_pool_base = nullptr; m_scratch_reduce_end = 0; m_scratch_thread_end = 0; - m_numa_rank = 0; - m_numa_core_rank = 0; m_pool_rank = 0; m_pool_size = 0; m_pool_fan_size = 0; - m_pool_state = ThreadsExec::Terminating; + m_pool_state = ThreadState::Terminating; if (&s_threads_process != this && entry < MAX_THREAD_COUNT) { - ThreadsExec *const nil = nullptr; + ThreadsInternal *const nil = nullptr; atomic_compare_exchange(s_threads_exec + entry, this, nil); - s_threads_process.m_pool_state = ThreadsExec::Terminating; + s_threads_process.m_pool_state = ThreadState::Terminating; } } -int ThreadsExec::get_thread_count() { return s_thread_pool_size[0]; } - -ThreadsExec *ThreadsExec::get_thread(const int init_thread_rank) { - ThreadsExec *const th = +ThreadsInternal *ThreadsInternal::get_thread(const int init_thread_rank) { + ThreadsInternal *const th = init_thread_rank < s_thread_pool_size[0] ? s_threads_exec[s_thread_pool_size[0] - (init_thread_rank + 1)] : nullptr; if (nullptr == th || th->m_pool_rank != init_thread_rank) { std::ostringstream msg; - msg << "Kokkos::Impl::ThreadsExec::get_thread ERROR : " + msg << "Kokkos::Impl::ThreadsInternal::get_thread ERROR : " << "thread " << init_thread_rank << " of " << s_thread_pool_size[0]; if (nullptr == th) { msg << " does not exist"; @@ -264,24 +216,6 @@ ThreadsExec *ThreadsExec::get_thread(const int init_thread_rank) { return th; } -//---------------------------------------------------------------------------- - -void ThreadsExec::execute_sleep(ThreadsExec &exec, const void *) { - ThreadsExec::global_lock(); - ThreadsExec::global_unlock(); - - const int n = exec.m_pool_fan_size; - const int rank_rev = exec.m_pool_size - (exec.m_pool_rank + 1); - - for (int i = 0; i < n; ++i) { - Impl::spinwait_while_equal( - exec.m_pool_base[rank_rev + (1 << i)]->m_pool_state, - ThreadsExec::Active); - } - - exec.m_pool_state = ThreadsExec::Inactive; -} - } // namespace Impl } // namespace Kokkos @@ -290,8 +224,8 @@ void ThreadsExec::execute_sleep(ThreadsExec &exec, const void *) { namespace Kokkos { namespace Impl { -void ThreadsExec::verify_is_process(const std::string &name, - const bool initialized) { +void ThreadsInternal::verify_is_process(const std::string &name, + const bool initialized) { if (!is_process()) { std::string msg(name); msg.append( @@ -307,63 +241,48 @@ void ThreadsExec::verify_is_process(const std::string &name, } } -int ThreadsExec::in_parallel() { +#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 +KOKKOS_DEPRECATED int ThreadsInternal::in_parallel() { // A thread function is in execution and // the function argument is not the special threads process argument and // the master process is a worker or is not the master process.
return s_current_function && (&s_threads_process != s_current_function_arg) && (s_threads_process.m_pool_base || !is_process()); } -void ThreadsExec::fence() { internal_fence(Impl::fence_is_static::yes); } -void ThreadsExec::fence(const std::string &name) { - internal_fence(name, Impl::fence_is_static::yes); +#endif +void ThreadsInternal::fence() { + fence("Kokkos::ThreadsInternal::fence: Unnamed Instance Fence"); } - -void ThreadsExec::internal_fence(Impl::fence_is_static is_static) { - internal_fence((is_static == Impl::fence_is_static::no) - ? "Kokkos::ThreadsExec::fence: Unnamed Instance Fence" - : "Kokkos::ThreadsExec::fence: Unnamed Static Fence", - is_static); +void ThreadsInternal::fence(const std::string &name) { + Kokkos::Tools::Experimental::Impl::profile_fence_event( + name, Kokkos::Tools::Experimental::Impl::DirectFenceIDHandle{1}, + internal_fence); } // Wait for root thread to become inactive -void ThreadsExec::internal_fence(const std::string &name, - Impl::fence_is_static is_static) { - const auto &fence_lam = [&]() { - if (s_thread_pool_size[0]) { - // Wait for the root thread to complete: - Impl::spinwait_while_equal(s_threads_exec[0]->m_pool_state, - ThreadsExec::Active); - } +void ThreadsInternal::internal_fence() { + if (s_thread_pool_size[0]) { + // Wait for the root thread to complete: + Impl::spinwait_while_equal(s_threads_exec[0]->m_pool_state, + ThreadState::Active); + } - s_current_function = nullptr; - s_current_function_arg = nullptr; + s_current_function = nullptr; + s_current_function_arg = nullptr; - // Make sure function and arguments are cleared before - // potentially re-activating threads with a subsequent launch. - memory_fence(); - }; - if (is_static == Impl::fence_is_static::yes) { - Kokkos::Tools::Experimental::Impl::profile_fence_event( - name, - Kokkos::Tools::Experimental::SpecialSynchronizationCases:: - GlobalDeviceSynchronization, - fence_lam); - } else { - Kokkos::Tools::Experimental::Impl::profile_fence_event( - name, Kokkos::Tools::Experimental::Impl::DirectFenceIDHandle{1}, - fence_lam); - } + // Make sure function and arguments are cleared before - // potentially re-activating threads with a subsequent launch. + memory_fence(); } /** \brief Begin execution of the asynchronous functor */ -void ThreadsExec::start(void (*func)(ThreadsExec &, const void *), - const void *arg) { - verify_is_process("ThreadsExec::start", true); +void ThreadsInternal::start(void (*func)(ThreadsInternal &, const void *), + const void *arg) { + verify_is_process("ThreadsInternal::start", true); if (s_current_function || s_current_function_arg) { Kokkos::Impl::throw_runtime_exception( - std::string("ThreadsExec::start() FAILED : already executing")); + std::string("ThreadsInternal::start() FAILED : already executing")); } s_current_function = func; @@ -372,68 +291,29 @@ void ThreadsExec::start(void (*func)(ThreadsExec &, const void *), // Make sure function and arguments are written before activating threads. memory_fence(); - // Activate threads: + // Activate threads. The spawned threads will start working on + // s_current_function. The root thread is only set to active; we still need to + // call s_current_function.
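//----------------------------------------------------------------------------
// Usage of start()/fence() as seen in the ParallelFor/ParallelReduce
// specializations further down: start() wakes the pool and runs the functor
// on the root thread as well, fence() waits for the root thread and clears
// s_current_function. A hedged sketch of that call sequence (Driver stands in
// for a ParallelFor-like type with a static exec trampoline):

template <class Driver>
void dispatch_and_wait(const Driver &driver) {
  ThreadsInternal::start(&Driver::exec, &driver);  // activate the pool
  ThreadsInternal::fence();                        // drain before returning
}
//----------------------------------------------------------------------------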
for (int i = s_thread_pool_size[0]; 0 < i--;) { - s_threads_exec[i]->m_pool_state = ThreadsExec::Active; + s_threads_exec[i]->m_pool_state = ThreadState::Active; } if (s_threads_process.m_pool_size) { // Master process is the root thread, run it: (*func)(s_threads_process, arg); - s_threads_process.m_pool_state = ThreadsExec::Inactive; + s_threads_process.m_pool_state = ThreadState::Inactive; } } //---------------------------------------------------------------------------- -bool ThreadsExec::sleep() { - verify_is_process("ThreadsExec::sleep", true); - - if (&execute_sleep == s_current_function) return false; - - fence(); - - ThreadsExec::global_lock(); - - s_current_function = &execute_sleep; - - // Activate threads: - for (unsigned i = s_thread_pool_size[0]; 0 < i;) { - s_threads_exec[--i]->m_pool_state = ThreadsExec::Active; - } - - return true; -} - -bool ThreadsExec::wake() { - verify_is_process("ThreadsExec::wake", true); - - if (&execute_sleep != s_current_function) return false; - - ThreadsExec::global_unlock(); - - if (s_threads_process.m_pool_base) { - execute_sleep(s_threads_process, nullptr); - s_threads_process.m_pool_state = ThreadsExec::Inactive; - } - - fence(); - - return true; -} - -//---------------------------------------------------------------------------- - -void ThreadsExec::execute_resize_scratch_in_serial() { +void ThreadsInternal::execute_resize_scratch_in_serial() { const unsigned begin = s_threads_process.m_pool_base ? 1 : 0; - auto deallocate_scratch_memory = [](ThreadsExec &exec) { + auto deallocate_scratch_memory = [](ThreadsInternal &exec) { if (exec.m_scratch) { - using Record = - Kokkos::Impl::SharedAllocationRecord; - Record *const r = Record::get_record(exec.m_scratch); - exec.m_scratch = nullptr; - Record::decrement(r); + Kokkos::kokkos_free(exec.m_scratch); + exec.m_scratch = nullptr; } }; if (s_threads_process.m_pool_base) { @@ -449,18 +329,18 @@ void ThreadsExec::execute_resize_scratch_in_serial() { memory_fence(); for (unsigned i = s_thread_pool_size[0]; begin < i;) { - ThreadsExec &th = *s_threads_exec[--i]; + ThreadsInternal &th = *s_threads_exec[--i]; - th.m_pool_state = ThreadsExec::Active; + th.m_pool_state = ThreadState::Active; - wait_yield(th.m_pool_state, ThreadsExec::Active); + wait_yield(th.m_pool_state, ThreadState::Active); } if (s_threads_process.m_pool_base) { deallocate_scratch_memory(s_threads_process); - s_threads_process.m_pool_state = ThreadsExec::Active; + s_threads_process.m_pool_state = ThreadState::Active; first_touch_allocate_thread_private_scratch(s_threads_process, nullptr); - s_threads_process.m_pool_state = ThreadsExec::Inactive; + s_threads_process.m_pool_state = ThreadState::Inactive; } s_current_function_arg = nullptr; @@ -472,27 +352,20 @@ void ThreadsExec::execute_resize_scratch_in_serial() { //---------------------------------------------------------------------------- -void *ThreadsExec::root_reduce_scratch() { +void *ThreadsInternal::root_reduce_scratch() { return s_threads_process.reduce_memory(); } -void ThreadsExec::first_touch_allocate_thread_private_scratch(ThreadsExec &exec, - const void *) { +void ThreadsInternal::first_touch_allocate_thread_private_scratch( + ThreadsInternal &exec, const void *) { exec.m_scratch_reduce_end = s_threads_process.m_scratch_reduce_end; exec.m_scratch_thread_end = s_threads_process.m_scratch_thread_end; if (s_threads_process.m_scratch_thread_end) { // Allocate tracked memory: { - using Record = - Kokkos::Impl::SharedAllocationRecord; - Record *const r = - 
Record::allocate(Kokkos::HostSpace(), "Kokkos::thread_scratch", - s_threads_process.m_scratch_thread_end); - - Record::increment(r); - - exec.m_scratch = r->data(); + exec.m_scratch = Kokkos::kokkos_malloc( + "Kokkos::thread_scratch", s_threads_process.m_scratch_thread_end); } unsigned *ptr = reinterpret_cast(exec.m_scratch); @@ -505,7 +378,7 @@ void ThreadsExec::first_touch_allocate_thread_private_scratch(ThreadsExec &exec, } } -void *ThreadsExec::resize_scratch(size_t reduce_size, size_t thread_size) { +void *ThreadsInternal::resize_scratch(size_t reduce_size, size_t thread_size) { enum { ALIGN_MASK = Kokkos::Impl::MEMORY_ALIGNMENT - 1 }; fence(); @@ -522,7 +395,7 @@ void *ThreadsExec::resize_scratch(size_t reduce_size, size_t thread_size) { if ((old_reduce_size < reduce_size) || (old_thread_size < thread_size) || ((reduce_size == 0 && thread_size == 0) && (old_reduce_size != 0 || old_thread_size != 0))) { - verify_is_process("ThreadsExec::resize_scratch", true); + verify_is_process("ThreadsInternal::resize_scratch", true); s_threads_process.m_scratch_reduce_end = reduce_size; s_threads_process.m_scratch_thread_end = reduce_size + thread_size; @@ -537,27 +410,22 @@ void *ThreadsExec::resize_scratch(size_t reduce_size, size_t thread_size) { //---------------------------------------------------------------------------- -void ThreadsExec::print_configuration(std::ostream &s, const bool detail) { - verify_is_process("ThreadsExec::print_configuration", false); +void ThreadsInternal::print_configuration(std::ostream &s, const bool detail) { + verify_is_process("ThreadsInternal::print_configuration", false); fence(); - const unsigned numa_count = Kokkos::hwloc::get_available_numa_count(); - const unsigned cores_per_numa = Kokkos::hwloc::get_available_cores_per_numa(); - const unsigned threads_per_core = - Kokkos::hwloc::get_available_threads_per_core(); - - // Forestall compiler warnings for unused variables. - (void)numa_count; - (void)cores_per_numa; - (void)threads_per_core; - s << "Kokkos::Threads"; #if defined(KOKKOS_ENABLE_THREADS) s << " KOKKOS_ENABLE_THREADS"; #endif #if defined(KOKKOS_ENABLE_HWLOC) + const unsigned numa_count = Kokkos::hwloc::get_available_numa_count(); + const unsigned cores_per_numa = Kokkos::hwloc::get_available_cores_per_numa(); + const unsigned threads_per_core = + Kokkos::hwloc::get_available_threads_per_core(); + s << " hwloc[" << numa_count << "x" << cores_per_numa << "x" << threads_per_core << "]"; #endif @@ -569,25 +437,21 @@ void ThreadsExec::print_configuration(std::ostream &s, const bool detail) { if (nullptr == s_threads_process.m_pool_base) { s << " Asynchronous"; } - s << " ReduceScratch[" << s_current_reduce_size << "]" - << " SharedScratch[" << s_current_shared_size << "]"; s << std::endl; if (detail) { for (int i = 0; i < s_thread_pool_size[0]; ++i) { - ThreadsExec *const th = s_threads_exec[i]; + ThreadsInternal *const th = s_threads_exec[i]; if (th) { const int rank_rev = th->m_pool_size - (th->m_pool_rank + 1); - s << " Thread[ " << th->m_pool_rank << " : " << th->m_numa_rank << "." - << th->m_numa_core_rank << " ]"; + s << " Thread[ " << th->m_pool_rank << " ]"; s << " Fan{"; for (int j = 0; j < th->m_pool_fan_size; ++j) { - ThreadsExec *const thfan = th->m_pool_base[rank_rev + (1 << j)]; - s << " [ " << thfan->m_pool_rank << " : " << thfan->m_numa_rank - << "." 
<< thfan->m_numa_core_rank << " ]"; + ThreadsInternal *const thfan = th->m_pool_base[rank_rev + (1 << j)]; + s << " [ " << thfan->m_pool_rank << " ]"; } s << " }"; @@ -605,29 +469,21 @@ void ThreadsExec::print_configuration(std::ostream &s, const bool detail) { //---------------------------------------------------------------------------- -int ThreadsExec::is_initialized() { return nullptr != s_threads_exec[0]; } +int ThreadsInternal::is_initialized() { return nullptr != s_threads_exec[0]; } -void ThreadsExec::initialize(int thread_count_arg) { - // legacy arguments - unsigned thread_count = thread_count_arg == -1 ? 0 : thread_count_arg; - unsigned use_numa_count = 0; - unsigned use_cores_per_numa = 0; - bool allow_asynchronous_threadpool = false; - // need to provide an initializer for Intel compilers - static const Sentinel sentinel = {}; +void ThreadsInternal::initialize(int thread_count_arg) { + unsigned thread_count = thread_count_arg == -1 ? 0 : thread_count_arg; const bool is_initialized = 0 != s_thread_pool_size[0]; unsigned thread_spawn_failed = 0; - for (int i = 0; i < ThreadsExec::MAX_THREAD_COUNT; i++) + for (int i = 0; i < ThreadsInternal::MAX_THREAD_COUNT; i++) s_threads_exec[i] = nullptr; if (!is_initialized) { - // If thread_count, use_numa_count, or use_cores_per_numa are zero - // then they will be given default values based upon hwloc detection - // and allowed asynchronous execution. - + // If thread_count is zero then it will be given default values based upon + // hwloc detection. const bool hwloc_avail = Kokkos::hwloc::available(); const bool hwloc_can_bind = hwloc_avail && Kokkos::hwloc::can_bind_threads(); @@ -640,17 +496,18 @@ void ThreadsExec::initialize(int thread_count_arg) { : 1; } - const unsigned thread_spawn_begin = hwloc::thread_mapping( - "Kokkos::Threads::initialize", allow_asynchronous_threadpool, - thread_count, use_numa_count, use_cores_per_numa, s_threads_coord); + const bool allow_asynchronous_threadpool = false; + unsigned use_numa_count = 0; + unsigned use_cores_per_numa = 0; + hwloc::thread_mapping("Kokkos::Threads::initialize", + allow_asynchronous_threadpool, thread_count, + use_numa_count, use_cores_per_numa, s_threads_coord); const std::pair proc_coord = s_threads_coord[0]; - if (thread_spawn_begin) { - // Synchronous with s_threads_coord[0] as the process core - // Claim entry #0 for binding the process core. - s_threads_coord[0] = std::pair(~0u, ~0u); - } + // Synchronous with s_threads_coord[0] as the process core + // Claim entry #0 for binding the process core. + s_threads_coord[0] = std::pair(~0u, ~0u); s_thread_pool_size[0] = thread_count; s_thread_pool_size[1] = s_thread_pool_size[0] / use_numa_count; @@ -658,8 +515,8 @@ void ThreadsExec::initialize(int thread_count_arg) { s_current_function = &execute_function_noop; // Initialization work function - for (unsigned ith = thread_spawn_begin; ith < thread_count; ++ith) { - s_threads_process.m_pool_state = ThreadsExec::Inactive; + for (unsigned ith = 1; ith < thread_count; ++ith) { + s_threads_process.m_pool_state = ThreadState::Inactive; // If hwloc available then spawned thread will // choose its own entry in 's_threads_coord' @@ -675,18 +532,20 @@ void ThreadsExec::initialize(int thread_count_arg) { // Wait until spawned thread has attempted to initialize. // If spawning and initialization is successful then // an entry in 's_threads_exec' will be assigned. 
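//----------------------------------------------------------------------------
// The spawn handshake below, isolated: the parent arms m_pool_state, detaches
// a worker, and spins until the worker either registers itself (Active) or
// gives up (Terminating). A sketch reusing wait_yield and ThreadState from
// this file; `driver` stands in for internal_cppthread_driver:

inline bool spawn_one(volatile ThreadState &process_state, void (*driver)()) {
  process_state = ThreadState::Inactive;  // arm the handshake
  std::thread t(driver);
  t.detach();  // the worker flips the state once it has claimed an entry
  wait_yield(process_state, ThreadState::Inactive);
  return process_state != ThreadState::Terminating;  // false: spawn failed
}
//----------------------------------------------------------------------------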
- ThreadsExec::spawn(); - wait_yield(s_threads_process.m_pool_state, ThreadsExec::Inactive); - if (s_threads_process.m_pool_state == ThreadsExec::Terminating) break; + std::thread t(internal_cppthread_driver); + t.detach(); + wait_yield(s_threads_process.m_pool_state, ThreadState::Inactive); + if (s_threads_process.m_pool_state == ThreadState::Terminating) break; } // Wait for all spawned threads to deactivate before zeroing the function. - for (unsigned ith = thread_spawn_begin; ith < thread_count; ++ith) { + for (unsigned ith = 1; ith < thread_count; ++ith) { // Try to protect against cache coherency failure by casting to volatile. - ThreadsExec *const th = ((ThreadsExec * volatile *)s_threads_exec)[ith]; + ThreadsInternal *const th = + ((ThreadsInternal * volatile *)s_threads_exec)[ith]; if (th) { - wait_yield(th->m_pool_state, ThreadsExec::Active); + wait_yield(th->m_pool_state, ThreadState::Active); } else { ++thread_spawn_failed; } @@ -694,7 +553,7 @@ void ThreadsExec::initialize(int thread_count_arg) { s_current_function = nullptr; s_current_function_arg = nullptr; - s_threads_process.m_pool_state = ThreadsExec::Inactive; + s_threads_process.m_pool_state = ThreadState::Inactive; memory_fence(); @@ -705,30 +564,17 @@ void ThreadsExec::initialize(int thread_count_arg) { Kokkos::hwloc::bind_this_thread(proc_coord); } - if (thread_spawn_begin) { // Include process in pool. - const std::pair coord = - Kokkos::hwloc::get_this_thread_coordinate(); - - s_threads_exec[0] = &s_threads_process; - s_threads_process.m_numa_rank = coord.first; - s_threads_process.m_numa_core_rank = coord.second; - s_threads_process.m_pool_base = s_threads_exec; - s_threads_process.m_pool_rank = - thread_count - 1; // Reversed for scan-compatible reductions - s_threads_process.m_pool_size = thread_count; - s_threads_process.m_pool_fan_size = fan_size( - s_threads_process.m_pool_rank, s_threads_process.m_pool_size); - s_threads_pid[s_threads_process.m_pool_rank] = - std::this_thread::get_id(); - } else { - s_threads_process.m_pool_base = nullptr; - s_threads_process.m_pool_rank = 0; - s_threads_process.m_pool_size = 0; - s_threads_process.m_pool_fan_size = 0; - } + s_threads_exec[0] = &s_threads_process; + s_threads_process.m_pool_base = s_threads_exec; + s_threads_process.m_pool_rank = + thread_count - 1; // Reversed for scan-compatible reductions + s_threads_process.m_pool_size = thread_count; + s_threads_process.m_pool_fan_size = fan_size( + s_threads_process.m_pool_rank, s_threads_process.m_pool_size); + s_threads_pid[s_threads_process.m_pool_rank] = std::this_thread::get_id(); // Initial allocations: - ThreadsExec::resize_scratch(1024, 1024); + ThreadsInternal::resize_scratch(1024, 1024); } else { s_thread_pool_size[0] = 0; s_thread_pool_size[1] = 0; @@ -773,8 +619,8 @@ void ThreadsExec::initialize(int thread_count_arg) { //---------------------------------------------------------------------------- -void ThreadsExec::finalize() { - verify_is_process("ThreadsExec::finalize", false); +void ThreadsInternal::finalize() { + verify_is_process("ThreadsInternal::finalize", false); fence(); @@ -784,18 +630,18 @@ void ThreadsExec::finalize() { for (unsigned i = s_thread_pool_size[0]; begin < i--;) { if (s_threads_exec[i]) { - s_threads_exec[i]->m_pool_state = ThreadsExec::Terminating; + s_threads_exec[i]->m_pool_state = ThreadState::Terminating; - wait_yield(s_threads_process.m_pool_state, ThreadsExec::Inactive); + wait_yield(s_threads_process.m_pool_state, ThreadState::Inactive); - s_threads_process.m_pool_state = 
ThreadsExec::Inactive; + s_threads_process.m_pool_state = ThreadState::Inactive; } s_threads_pid[i] = std::thread::id(); } if (s_threads_process.m_pool_base) { - (&s_threads_process)->~ThreadsExec(); + (&s_threads_process)->~ThreadsInternal(); s_threads_exec[0] = nullptr; } @@ -808,13 +654,11 @@ void ThreadsExec::finalize() { s_thread_pool_size[2] = 0; // Reset master thread to run solo. - s_threads_process.m_numa_rank = 0; - s_threads_process.m_numa_core_rank = 0; - s_threads_process.m_pool_base = nullptr; - s_threads_process.m_pool_rank = 0; - s_threads_process.m_pool_size = 1; - s_threads_process.m_pool_fan_size = 0; - s_threads_process.m_pool_state = ThreadsExec::Inactive; + s_threads_process.m_pool_base = nullptr; + s_threads_process.m_pool_rank = 0; + s_threads_process.m_pool_size = 1; + s_threads_process.m_pool_fan_size = 0; + s_threads_process.m_pool_state = ThreadState::Inactive; } //---------------------------------------------------------------------------- @@ -834,7 +678,7 @@ int Threads::concurrency() const { return impl_thread_pool_size(0); } #endif void Threads::fence(const std::string &name) const { - Impl::ThreadsExec::internal_fence(name, Impl::fence_is_static::no); + Impl::ThreadsInternal::fence(name); } Threads &Threads::impl_instance(int) { diff --git a/core/src/Threads/Kokkos_ThreadsExec.hpp b/core/src/Threads/Kokkos_Threads_Instance.hpp similarity index 76% rename from core/src/Threads/Kokkos_ThreadsExec.hpp rename to core/src/Threads/Kokkos_Threads_Instance.hpp index 377e096bfbe..a5eb231cb01 100644 --- a/core/src/Threads/Kokkos_ThreadsExec.hpp +++ b/core/src/Threads/Kokkos_Threads_Instance.hpp @@ -14,8 +14,8 @@ // //@HEADER -#ifndef KOKKOS_THREADSEXEC_HPP -#define KOKKOS_THREADSEXEC_HPP +#ifndef KOKKOS_THREADS_INSTANCE_HPP +#define KOKKOS_THREADS_INSTANCE_HPP #include @@ -23,41 +23,25 @@ #include #include -#include - #include #include #include #include +#include +#include //---------------------------------------------------------------------------- namespace Kokkos { namespace Impl { -class ThreadsExec { +class ThreadsInternal { public: // Fan array has log_2(NT) reduction threads plus 2 scan threads // Currently limited to 16k threads. - enum { MAX_FAN_COUNT = 16 }; - enum { MAX_THREAD_COUNT = 1 << (MAX_FAN_COUNT - 2) }; - enum { VECTOR_LENGTH = 8 }; - - /** \brief States of a worker thread */ - enum { - Terminating ///< Termination in progress - , - Inactive ///< Exists, waiting for work - , - Active ///< Exists, performing work - , - Rendezvous ///< Exists, waiting in a barrier or reduce - - , - ScanCompleted, - ScanAvailable, - ReductionAvailable - }; + static constexpr int MAX_FAN_COUNT = 16; + static constexpr int MAX_THREAD_COUNT = 1 << (MAX_FAN_COUNT - 2); + static constexpr int VECTOR_LENGTH = 8; private: friend class Kokkos::Threads; @@ -67,18 +51,16 @@ class ThreadsExec { // the threads that need them. // For a simple reduction the thread location is arbitrary. 
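//----------------------------------------------------------------------------
// Spelled out, the constants above encode the "16k threads" limit: the fan
// array holds log2(NT) reduction slots plus 2 scan slots, so a MAX_FAN_COUNT
// of 16 caps the pool at 1 << (16 - 2) = 16384 threads:

static_assert((1 << (16 - 2)) == 16384, "MAX_THREAD_COUNT is 16k");
//----------------------------------------------------------------------------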
- ThreadsExec *const *m_pool_base; ///< Base for pool fan-in + ThreadsInternal *const *m_pool_base; ///< Base for pool fan-in void *m_scratch; int m_scratch_reduce_end; size_t m_scratch_thread_end; - int m_numa_rank; - int m_numa_core_rank; int m_pool_rank; int m_pool_rank_rev; int m_pool_size; int m_pool_fan_size; - int volatile m_pool_state; ///< State for global synchronizations + ThreadState volatile m_pool_state; ///< State for global synchronizations // Members for dynamic scheduling // Which thread am I stealing from currently @@ -93,41 +75,36 @@ class ThreadsExec { static void global_lock(); static void global_unlock(); - static void spawn(); - static void first_touch_allocate_thread_private_scratch(ThreadsExec &, + static void first_touch_allocate_thread_private_scratch(ThreadsInternal &, const void *); - static void execute_sleep(ThreadsExec &, const void *); - ThreadsExec(const ThreadsExec &); - ThreadsExec &operator=(const ThreadsExec &); + ThreadsInternal(const ThreadsInternal &); + ThreadsInternal &operator=(const ThreadsInternal &); static void execute_resize_scratch_in_serial(); public: KOKKOS_INLINE_FUNCTION int pool_size() const { return m_pool_size; } KOKKOS_INLINE_FUNCTION int pool_rank() const { return m_pool_rank; } - KOKKOS_INLINE_FUNCTION int numa_rank() const { return m_numa_rank; } - KOKKOS_INLINE_FUNCTION int numa_core_rank() const { return m_numa_core_rank; } inline long team_work_index() const { return m_team_work_index; } - static int get_thread_count(); - static ThreadsExec *get_thread(const int init_thread_rank); + static ThreadsInternal *get_thread(const int init_thread_rank); inline void *reduce_memory() const { return m_scratch; } KOKKOS_INLINE_FUNCTION void *scratch_memory() const { return reinterpret_cast(m_scratch) + m_scratch_reduce_end; } - KOKKOS_INLINE_FUNCTION int volatile &state() { return m_pool_state; } - KOKKOS_INLINE_FUNCTION ThreadsExec *const *pool_base() const { + KOKKOS_INLINE_FUNCTION ThreadState volatile &state() { return m_pool_state; } + KOKKOS_INLINE_FUNCTION ThreadsInternal *const *pool_base() const { return m_pool_base; } static void driver(void); - ~ThreadsExec(); - ThreadsExec(); + ~ThreadsInternal(); + ThreadsInternal(); static void *resize_scratch(size_t reduce_size, size_t thread_size); @@ -143,15 +120,8 @@ class ThreadsExec { static void finalize(); - /* Given a requested team size, return valid team size */ - static unsigned team_size_valid(unsigned); - static void print_configuration(std::ostream &, const bool detail = false); - //------------------------------------ - - static void wait_yield(volatile int &, const int); - //------------------------------------ // All-thread functions: @@ -166,14 +136,14 @@ class ThreadsExec { // Fan-in reduction with highest ranking thread as the root for (int i = 0; i < m_pool_fan_size; ++i) { // Wait: Active -> Rendezvous - Impl::spinwait_while_equal( - m_pool_base[rev_rank + (1 << i)]->m_pool_state, ThreadsExec::Active); + spinwait_while_equal(m_pool_base[rev_rank + (1 << i)]->m_pool_state, + ThreadState::Active); } if (rev_rank) { - m_pool_state = ThreadsExec::Rendezvous; + m_pool_state = ThreadState::Rendezvous; // Wait: Rendezvous -> Active - Impl::spinwait_while_equal(m_pool_state, ThreadsExec::Rendezvous); + spinwait_while_equal(m_pool_state, ThreadState::Rendezvous); } else { // Root thread does the reduction and broadcast @@ -191,7 +161,7 @@ class ThreadsExec { memory_fence(); for (int rank = 0; rank < m_pool_size; ++rank) { - get_thread(rank)->m_pool_state = ThreadsExec::Active; 
+ get_thread(rank)->m_pool_state = ThreadState::Active; } } @@ -207,21 +177,21 @@ class ThreadsExec { // Fan-in reduction with highest ranking thread as the root for (int i = 0; i < m_pool_fan_size; ++i) { // Wait: Active -> Rendezvous - Impl::spinwait_while_equal( - m_pool_base[rev_rank + (1 << i)]->m_pool_state, ThreadsExec::Active); + spinwait_while_equal(m_pool_base[rev_rank + (1 << i)]->m_pool_state, + ThreadState::Active); } if (rev_rank) { - m_pool_state = ThreadsExec::Rendezvous; + m_pool_state = ThreadState::Rendezvous; // Wait: Rendezvous -> Active - Impl::spinwait_while_equal(m_pool_state, ThreadsExec::Rendezvous); + spinwait_while_equal(m_pool_state, ThreadState::Rendezvous); } else { // Root thread does the reduction and broadcast memory_fence(); for (int rank = 0; rank < m_pool_size; ++rank) { - get_thread(rank)->m_pool_state = ThreadsExec::Active; + get_thread(rank)->m_pool_state = ThreadState::Active; } } } @@ -234,9 +204,9 @@ class ThreadsExec { const int rev_rank = m_pool_size - (m_pool_rank + 1); for (int i = 0; i < m_pool_fan_size; ++i) { - ThreadsExec &fan = *m_pool_base[rev_rank + (1 << i)]; + ThreadsInternal &fan = *m_pool_base[rev_rank + (1 << i)]; - Impl::spinwait_while_equal(fan.m_pool_state, ThreadsExec::Active); + spinwait_while_equal(fan.m_pool_state, ThreadState::Active); f.join( reinterpret_cast(reduce_memory()), @@ -265,8 +235,8 @@ class ThreadsExec { const int rev_rank = m_pool_size - (m_pool_rank + 1); for (int i = 0; i < m_pool_fan_size; ++i) { - Impl::spinwait_while_equal( - m_pool_base[rev_rank + (1 << i)]->m_pool_state, ThreadsExec::Active); + spinwait_while_equal(m_pool_base[rev_rank + (1 << i)]->m_pool_state, + ThreadState::Active); } } @@ -289,10 +259,10 @@ class ThreadsExec { //-------------------------------- // Fan-in reduction with highest ranking thread as the root for (int i = 0; i < m_pool_fan_size; ++i) { - ThreadsExec &fan = *m_pool_base[rev_rank + (1 << i)]; + ThreadsInternal &fan = *m_pool_base[rev_rank + (1 << i)]; // Wait: Active -> ReductionAvailable (or ScanAvailable) - Impl::spinwait_while_equal(fan.m_pool_state, ThreadsExec::Active); + spinwait_while_equal(fan.m_pool_state, ThreadState::Active); f.join(work_value, fan.reduce_memory()); } @@ -303,39 +273,37 @@ class ThreadsExec { if (rev_rank) { // Set: Active -> ReductionAvailable - m_pool_state = ThreadsExec::ReductionAvailable; + m_pool_state = ThreadState::ReductionAvailable; // Wait for contributing threads' scan value to be available. 
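//----------------------------------------------------------------------------
// ThreadState itself is defined in a header included above (its name is
// elided in this diff); judging from the anonymous enum it replaces, it is an
// enum class along these lines:

enum class ThreadState {
  Terminating,        // termination in progress
  Inactive,           // exists, waiting for work
  Active,             // exists, performing work
  Rendezvous,         // exists, waiting in a barrier or reduce
  ScanCompleted,
  ScanAvailable,
  ReductionAvailable
};

// Over one scan a worker walks roughly Active -> ReductionAvailable ->
// ScanAvailable -> Rendezvous -> ScanCompleted -> Active, the sequence the
// spinwaits in this hunk key on.
//----------------------------------------------------------------------------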
if ((1 << m_pool_fan_size) < (m_pool_rank + 1)) { - ThreadsExec &th = *m_pool_base[rev_rank + (1 << m_pool_fan_size)]; + ThreadsInternal &th = *m_pool_base[rev_rank + (1 << m_pool_fan_size)]; // Wait: Active -> ReductionAvailable // Wait: ReductionAvailable -> ScanAvailable - Impl::spinwait_while_equal(th.m_pool_state, ThreadsExec::Active); - Impl::spinwait_while_equal(th.m_pool_state, - ThreadsExec::ReductionAvailable); + spinwait_while_equal(th.m_pool_state, ThreadState::Active); + spinwait_while_equal(th.m_pool_state, ThreadState::ReductionAvailable); f.join(work_value + count, ((scalar_type *)th.reduce_memory()) + count); } // This thread has completed inclusive scan // Set: ReductionAvailable -> ScanAvailable - m_pool_state = ThreadsExec::ScanAvailable; + m_pool_state = ThreadState::ScanAvailable; // Wait for all threads to complete inclusive scan // Wait: ScanAvailable -> Rendezvous - Impl::spinwait_while_equal(m_pool_state, ThreadsExec::ScanAvailable); + spinwait_while_equal(m_pool_state, ThreadState::ScanAvailable); } //-------------------------------- for (int i = 0; i < m_pool_fan_size; ++i) { - ThreadsExec &fan = *m_pool_base[rev_rank + (1 << i)]; + ThreadsInternal &fan = *m_pool_base[rev_rank + (1 << i)]; // Wait: ReductionAvailable -> ScanAvailable - Impl::spinwait_while_equal(fan.m_pool_state, - ThreadsExec::ReductionAvailable); + spinwait_while_equal(fan.m_pool_state, ThreadState::ReductionAvailable); // Set: ScanAvailable -> Rendezvous - fan.m_pool_state = ThreadsExec::Rendezvous; + fan.m_pool_state = ThreadState::Rendezvous; } // All threads have completed the inclusive scan. @@ -346,7 +314,7 @@ class ThreadsExec { if ((rev_rank + 1) < m_pool_size) { // Exclusive scan: copy the previous thread's inclusive scan value - ThreadsExec &th = *m_pool_base[rev_rank + 1]; // Not the root thread + ThreadsInternal &th = *m_pool_base[rev_rank + 1]; // Not the root thread const scalar_type *const src_value = ((scalar_type *)th.reduce_memory()) + count; @@ -362,19 +330,18 @@ class ThreadsExec { // Wait for all threads to copy previous thread's inclusive scan value // Wait for all threads: Rendezvous -> ScanCompleted for (int i = 0; i < m_pool_fan_size; ++i) { - Impl::spinwait_while_equal( - m_pool_base[rev_rank + (1 << i)]->m_pool_state, - ThreadsExec::Rendezvous); + spinwait_while_equal(m_pool_base[rev_rank + (1 << i)]->m_pool_state, + ThreadState::Rendezvous); } if (rev_rank) { // Set: ScanAvailable -> ScanCompleted - m_pool_state = ThreadsExec::ScanCompleted; + m_pool_state = ThreadState::ScanCompleted; // Wait: ScanCompleted -> Active - Impl::spinwait_while_equal(m_pool_state, ThreadsExec::ScanCompleted); + spinwait_while_equal(m_pool_state, ThreadState::ScanCompleted); } // Set: ScanCompleted -> Active for (int i = 0; i < m_pool_fan_size; ++i) { - m_pool_base[rev_rank + (1 << i)]->m_pool_state = ThreadsExec::Active; + m_pool_base[rev_rank + (1 << i)]->m_pool_state = ThreadState::Active; } } @@ -391,8 +358,8 @@ class ThreadsExec { // Fan-in reduction with highest ranking thread as the root for (int i = 0; i < m_pool_fan_size; ++i) { // Wait: Active -> Rendezvous - Impl::spinwait_while_equal( - m_pool_base[rev_rank + (1 << i)]->m_pool_state, ThreadsExec::Active); + spinwait_while_equal(m_pool_base[rev_rank + (1 << i)]->m_pool_state, + ThreadState::Active); } for (unsigned i = 0; i < count; ++i) { @@ -400,9 +367,9 @@ class ThreadsExec { } if (rev_rank) { - m_pool_state = ThreadsExec::Rendezvous; + m_pool_state = ThreadState::Rendezvous; // Wait: Rendezvous -> Active - 
Impl::spinwait_while_equal(m_pool_state, ThreadsExec::Rendezvous); + spinwait_while_equal(m_pool_state, ThreadState::Rendezvous); } else { // Root thread does the thread-scan before releasing threads @@ -424,7 +391,7 @@ class ThreadsExec { } for (int i = 0; i < m_pool_fan_size; ++i) { - m_pool_base[rev_rank + (1 << i)]->m_pool_state = ThreadsExec::Active; + m_pool_base[rev_rank + (1 << i)]->m_pool_state = ThreadState::Active; } } @@ -433,18 +400,14 @@ class ThreadsExec { * complete and release the Threads device. * Acquire the Threads device and start this functor. */ - static void start(void (*)(ThreadsExec &, const void *), const void *); + static void start(void (*)(ThreadsInternal &, const void *), const void *); - static int in_parallel(); +#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 + KOKKOS_DEPRECATED static int in_parallel(); +#endif static void fence(); static void fence(const std::string &); - static void internal_fence( - Impl::fence_is_static is_static = Impl::fence_is_static::yes); - static void internal_fence( - const std::string &, - Impl::fence_is_static is_static = Impl::fence_is_static::yes); - static bool sleep(); - static bool wake(); + static void internal_fence(); /* Dynamic Scheduling related functionality */ // Initialize the work range for this thread @@ -583,30 +546,38 @@ class ThreadsExec { namespace Kokkos { -inline int Threads::in_parallel() { return Impl::ThreadsExec::in_parallel(); } +#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 +KOKKOS_DEPRECATED inline int Threads::in_parallel() { + return Impl::ThreadsInternal::in_parallel(); +} +#endif inline int Threads::impl_is_initialized() { - return Impl::ThreadsExec::is_initialized(); + return Impl::ThreadsInternal::is_initialized(); } inline void Threads::impl_initialize(InitializationSettings const &settings) { - Impl::ThreadsExec::initialize( + Impl::ThreadsInternal::initialize( settings.has_num_threads() ? 
settings.get_num_threads() : -1); } -inline void Threads::impl_finalize() { Impl::ThreadsExec::finalize(); } +inline void Threads::impl_finalize() { Impl::ThreadsInternal::finalize(); } inline void Threads::print_configuration(std::ostream &os, bool verbose) const { os << "Host Parallel Execution Space:\n"; os << " KOKKOS_ENABLE_THREADS: yes\n"; os << "\nThreads Runtime Configuration:\n"; - Impl::ThreadsExec::print_configuration(os, verbose); + Impl::ThreadsInternal::print_configuration(os, verbose); } inline void Threads::impl_static_fence(const std::string &name) { - Impl::ThreadsExec::internal_fence(name, Impl::fence_is_static::yes); + Kokkos::Tools::Experimental::Impl::profile_fence_event( + name, + Kokkos::Tools::Experimental::SpecialSynchronizationCases:: + GlobalDeviceSynchronization, + Impl::ThreadsInternal::internal_fence); } } /* namespace Kokkos */ -#endif /* #define KOKKOS_THREADSEXEC_HPP */ +#endif diff --git a/core/src/Threads/Kokkos_Threads_ParallelFor_MDRange.hpp b/core/src/Threads/Kokkos_Threads_ParallelFor_MDRange.hpp index 0828f262993..59577609ab7 100644 --- a/core/src/Threads/Kokkos_Threads_ParallelFor_MDRange.hpp +++ b/core/src/Threads/Kokkos_Threads_ParallelFor_MDRange.hpp @@ -46,54 +46,54 @@ class ParallelFor, } } - static void exec(ThreadsExec &exec, const void *arg) { - exec_schedule(exec, arg); + static void exec(ThreadsInternal &instance, const void *arg) { + exec_schedule(instance, arg); } template static std::enable_if_t::value> - exec_schedule(ThreadsExec &exec, const void *arg) { + exec_schedule(ThreadsInternal &instance, const void *arg) { const ParallelFor &self = *((const ParallelFor *)arg); auto const num_tiles = self.m_iter.m_rp.m_num_tiles; - WorkRange range(Policy(0, num_tiles).set_chunk_size(1), exec.pool_rank(), - exec.pool_size()); + WorkRange range(Policy(0, num_tiles).set_chunk_size(1), + instance.pool_rank(), instance.pool_size()); self.exec_range(range.begin(), range.end()); - exec.fan_in(); + instance.fan_in(); } template static std::enable_if_t::value> - exec_schedule(ThreadsExec &exec, const void *arg) { + exec_schedule(ThreadsInternal &instance, const void *arg) { const ParallelFor &self = *((const ParallelFor *)arg); auto const num_tiles = self.m_iter.m_rp.m_num_tiles; - WorkRange range(Policy(0, num_tiles).set_chunk_size(1), exec.pool_rank(), - exec.pool_size()); + WorkRange range(Policy(0, num_tiles).set_chunk_size(1), + instance.pool_rank(), instance.pool_size()); - exec.set_work_range(range.begin(), range.end(), 1); - exec.reset_steal_target(); - exec.barrier(); + instance.set_work_range(range.begin(), range.end(), 1); + instance.reset_steal_target(); + instance.barrier(); - long work_index = exec.get_work_index(); + long work_index = instance.get_work_index(); while (work_index != -1) { const Member begin = static_cast(work_index); const Member end = begin + 1 < num_tiles ? 
begin + 1 : num_tiles; self.exec_range(begin, end); - work_index = exec.get_work_index(); + work_index = instance.get_work_index(); } - exec.fan_in(); + instance.fan_in(); } public: inline void execute() const { - ThreadsExec::start(&ParallelFor::exec, this); - ThreadsExec::fence(); + ThreadsInternal::start(&ParallelFor::exec, this); + ThreadsInternal::fence(); } ParallelFor(const FunctorType &arg_functor, const MDRangePolicy &arg_policy) diff --git a/core/src/Threads/Kokkos_Threads_ParallelFor_Range.hpp b/core/src/Threads/Kokkos_Threads_ParallelFor_Range.hpp index 3698416ef18..4a89c4fad82 100644 --- a/core/src/Threads/Kokkos_Threads_ParallelFor_Range.hpp +++ b/core/src/Threads/Kokkos_Threads_ParallelFor_Range.hpp @@ -59,37 +59,37 @@ class ParallelFor, } } - static void exec(ThreadsExec &exec, const void *arg) { - exec_schedule(exec, arg); + static void exec(ThreadsInternal &instance, const void *arg) { + exec_schedule(instance, arg); } template static std::enable_if_t::value> - exec_schedule(ThreadsExec &exec, const void *arg) { + exec_schedule(ThreadsInternal &instance, const void *arg) { const ParallelFor &self = *((const ParallelFor *)arg); - WorkRange range(self.m_policy, exec.pool_rank(), exec.pool_size()); + WorkRange range(self.m_policy, instance.pool_rank(), instance.pool_size()); ParallelFor::template exec_range(self.m_functor, range.begin(), range.end()); - exec.fan_in(); + instance.fan_in(); } template static std::enable_if_t::value> - exec_schedule(ThreadsExec &exec, const void *arg) { + exec_schedule(ThreadsInternal &instance, const void *arg) { const ParallelFor &self = *((const ParallelFor *)arg); - WorkRange range(self.m_policy, exec.pool_rank(), exec.pool_size()); + WorkRange range(self.m_policy, instance.pool_rank(), instance.pool_size()); - exec.set_work_range(range.begin() - self.m_policy.begin(), - range.end() - self.m_policy.begin(), - self.m_policy.chunk_size()); - exec.reset_steal_target(); - exec.barrier(); + instance.set_work_range(range.begin() - self.m_policy.begin(), + range.end() - self.m_policy.begin(), + self.m_policy.chunk_size()); + instance.reset_steal_target(); + instance.barrier(); - long work_index = exec.get_work_index(); + long work_index = instance.get_work_index(); while (work_index != -1) { const Member begin = @@ -100,16 +100,16 @@ class ParallelFor, ? 
begin + self.m_policy.chunk_size() : self.m_policy.end(); ParallelFor::template exec_range(self.m_functor, begin, end); - work_index = exec.get_work_index(); + work_index = instance.get_work_index(); } - exec.fan_in(); + instance.fan_in(); } public: inline void execute() const { - ThreadsExec::start(&ParallelFor::exec, this); - ThreadsExec::fence(); + ThreadsInternal::start(&ParallelFor::exec, this); + ThreadsInternal::fence(); } ParallelFor(const FunctorType &arg_functor, const Policy &arg_policy) diff --git a/core/src/Threads/Kokkos_Threads_ParallelFor_Team.hpp b/core/src/Threads/Kokkos_Threads_ParallelFor_Team.hpp index 36404857a22..f927d7c6a67 100644 --- a/core/src/Threads/Kokkos_Threads_ParallelFor_Team.hpp +++ b/core/src/Threads/Kokkos_Threads_ParallelFor_Team.hpp @@ -73,14 +73,14 @@ class ParallelFor, } } - static void exec(ThreadsExec &exec, const void *arg) { + static void exec(ThreadsInternal &instance, const void *arg) { const ParallelFor &self = *((const ParallelFor *)arg); ParallelFor::exec_team( - self.m_functor, Member(&exec, self.m_policy, self.m_shared)); + self.m_functor, Member(&instance, self.m_policy, self.m_shared)); - exec.barrier(); - exec.fan_in(); + instance.barrier(); + instance.fan_in(); } template Policy fix_policy(Policy policy) { @@ -96,12 +96,12 @@ class ParallelFor, public: inline void execute() const { - ThreadsExec::resize_scratch( + ThreadsInternal::resize_scratch( 0, Policy::member_type::team_reduce_size() + m_shared); - ThreadsExec::start(&ParallelFor::exec, this); + ThreadsInternal::start(&ParallelFor::exec, this); - ThreadsExec::fence(); + ThreadsInternal::fence(); } ParallelFor(const FunctorType &arg_functor, const Policy &arg_policy) diff --git a/core/src/Threads/Kokkos_Threads_ParallelReduce_MDRange.hpp b/core/src/Threads/Kokkos_Threads_ParallelReduce_MDRange.hpp index 3d06379480f..fa63215a9e5 100644 --- a/core/src/Threads/Kokkos_Threads_ParallelReduce_MDRange.hpp +++ b/core/src/Threads/Kokkos_Threads_ParallelReduce_MDRange.hpp @@ -54,67 +54,67 @@ class ParallelReduce(exec, arg); + static void exec(ThreadsInternal &instance, const void *arg) { + exec_schedule(instance, arg); } template static std::enable_if_t::value> - exec_schedule(ThreadsExec &exec, const void *arg) { + exec_schedule(ThreadsInternal &instance, const void *arg) { const ParallelReduce &self = *((const ParallelReduce *)arg); const auto num_tiles = self.m_iter.m_rp.m_num_tiles; const WorkRange range(Policy(0, num_tiles).set_chunk_size(1), - exec.pool_rank(), exec.pool_size()); + instance.pool_rank(), instance.pool_size()); const ReducerType &reducer = self.m_iter.m_func.get_reducer(); self.exec_range( range.begin(), range.end(), - reducer.init(static_cast(exec.reduce_memory()))); + reducer.init(static_cast(instance.reduce_memory()))); - exec.fan_in_reduce(reducer); + instance.fan_in_reduce(reducer); } template static std::enable_if_t::value> - exec_schedule(ThreadsExec &exec, const void *arg) { + exec_schedule(ThreadsInternal &instance, const void *arg) { const ParallelReduce &self = *((const ParallelReduce *)arg); const auto num_tiles = self.m_iter.m_rp.m_num_tiles; const WorkRange range(Policy(0, num_tiles).set_chunk_size(1), - exec.pool_rank(), exec.pool_size()); + instance.pool_rank(), instance.pool_size()); - exec.set_work_range(range.begin(), range.end(), 1); - exec.reset_steal_target(); - exec.barrier(); + instance.set_work_range(range.begin(), range.end(), 1); + instance.reset_steal_target(); + instance.barrier(); - long work_index = exec.get_work_index(); + long work_index 
= instance.get_work_index(); const ReducerType &reducer = self.m_iter.m_func.get_reducer(); - reference_type update = - self.m_reducer.init(static_cast(exec.reduce_memory())); + reference_type update = self.m_reducer.init( + static_cast(instance.reduce_memory())); while (work_index != -1) { const Member begin = static_cast(work_index); const Member end = begin + 1 < num_tiles ? begin + 1 : num_tiles; self.exec_range(begin, end, update); - work_index = exec.get_work_index(); + work_index = instance.get_work_index(); } - exec.fan_in_reduce(self.m_reducer); + instance.fan_in_reduce(self.m_reducer); } public: inline void execute() const { const ReducerType &reducer = m_iter.m_func.get_reducer(); - ThreadsExec::resize_scratch(reducer.value_size(), 0); + ThreadsInternal::resize_scratch(reducer.value_size(), 0); - ThreadsExec::start(&ParallelReduce::exec, this); + ThreadsInternal::start(&ParallelReduce::exec, this); - ThreadsExec::fence(); + ThreadsInternal::fence(); if (m_result_ptr) { const pointer_type data = - (pointer_type)ThreadsExec::root_reduce_scratch(); + (pointer_type)ThreadsInternal::root_reduce_scratch(); const unsigned n = reducer.value_count(); for (unsigned i = 0; i < n; ++i) { diff --git a/core/src/Threads/Kokkos_Threads_ParallelReduce_Range.hpp b/core/src/Threads/Kokkos_Threads_ParallelReduce_Range.hpp index 5fa97b403c4..bf4c2a532a1 100644 --- a/core/src/Threads/Kokkos_Threads_ParallelReduce_Range.hpp +++ b/core/src/Threads/Kokkos_Threads_ParallelReduce_Range.hpp @@ -68,42 +68,44 @@ class ParallelReduce, } } - static void exec(ThreadsExec &exec, const void *arg) { - exec_schedule(exec, arg); + static void exec(ThreadsInternal &instance, const void *arg) { + exec_schedule(instance, arg); } template static std::enable_if_t::value> - exec_schedule(ThreadsExec &exec, const void *arg) { + exec_schedule(ThreadsInternal &instance, const void *arg) { const ParallelReduce &self = *((const ParallelReduce *)arg); - const WorkRange range(self.m_policy, exec.pool_rank(), exec.pool_size()); + const WorkRange range(self.m_policy, instance.pool_rank(), + instance.pool_size()); const ReducerType &reducer = self.m_functor_reducer.get_reducer(); ParallelReduce::template exec_range( self.m_functor_reducer.get_functor(), range.begin(), range.end(), - reducer.init(static_cast(exec.reduce_memory()))); + reducer.init(static_cast(instance.reduce_memory()))); - exec.fan_in_reduce(reducer); + instance.fan_in_reduce(reducer); } template static std::enable_if_t::value> - exec_schedule(ThreadsExec &exec, const void *arg) { + exec_schedule(ThreadsInternal &instance, const void *arg) { const ParallelReduce &self = *((const ParallelReduce *)arg); - const WorkRange range(self.m_policy, exec.pool_rank(), exec.pool_size()); + const WorkRange range(self.m_policy, instance.pool_rank(), + instance.pool_size()); - exec.set_work_range(range.begin() - self.m_policy.begin(), - range.end() - self.m_policy.begin(), - self.m_policy.chunk_size()); - exec.reset_steal_target(); - exec.barrier(); + instance.set_work_range(range.begin() - self.m_policy.begin(), + range.end() - self.m_policy.begin(), + self.m_policy.chunk_size()); + instance.reset_steal_target(); + instance.barrier(); - long work_index = exec.get_work_index(); + long work_index = instance.get_work_index(); const ReducerType &reducer = self.m_functor_reducer.get_reducer(); reference_type update = - reducer.init(static_cast(exec.reduce_memory())); + reducer.init(static_cast(instance.reduce_memory())); while (work_index != -1) { const Member begin = 
static_cast(work_index) * self.m_policy.chunk_size() + @@ -114,10 +116,10 @@ class ParallelReduce, : self.m_policy.end(); ParallelReduce::template exec_range( self.m_functor_reducer.get_functor(), begin, end, update); - work_index = exec.get_work_index(); + work_index = instance.get_work_index(); } - exec.fan_in_reduce(reducer); + instance.fan_in_reduce(reducer); } public: @@ -130,15 +132,15 @@ class ParallelReduce, reducer.final(m_result_ptr); } } else { - ThreadsExec::resize_scratch(reducer.value_size(), 0); + ThreadsInternal::resize_scratch(reducer.value_size(), 0); - ThreadsExec::start(&ParallelReduce::exec, this); + ThreadsInternal::start(&ParallelReduce::exec, this); - ThreadsExec::fence(); + ThreadsInternal::fence(); if (m_result_ptr) { const pointer_type data = - (pointer_type)ThreadsExec::root_reduce_scratch(); + (pointer_type)ThreadsInternal::root_reduce_scratch(); const unsigned n = reducer.value_count(); for (unsigned i = 0; i < n; ++i) { diff --git a/core/src/Threads/Kokkos_Threads_ParallelReduce_Team.hpp b/core/src/Threads/Kokkos_Threads_ParallelReduce_Team.hpp index c4b6100a9df..4db310701f9 100644 --- a/core/src/Threads/Kokkos_Threads_ParallelReduce_Team.hpp +++ b/core/src/Threads/Kokkos_Threads_ParallelReduce_Team.hpp @@ -58,16 +58,16 @@ class ParallelReduce( self.m_functor_reducer.get_functor(), - Member(&exec, self.m_policy, self.m_shared), + Member(&instance, self.m_policy, self.m_shared), self.m_functor_reducer.get_reducer().init( - static_cast(exec.reduce_memory()))); + static_cast(instance.reduce_memory()))); - exec.fan_in_reduce(self.m_functor_reducer.get_reducer()); + instance.fan_in_reduce(self.m_functor_reducer.get_reducer()); } public: @@ -80,17 +80,17 @@ class ParallelReduce, } } - static void exec(ThreadsExec &exec, const void *arg) { + static void exec(ThreadsInternal &instance, const void *arg) { const ParallelScan &self = *((const ParallelScan *)arg); - const WorkRange range(self.m_policy, exec.pool_rank(), exec.pool_size()); + const WorkRange range(self.m_policy, instance.pool_rank(), + instance.pool_size()); typename Analysis::Reducer final_reducer(self.m_functor); reference_type update = - final_reducer.init(static_cast(exec.reduce_memory())); + final_reducer.init(static_cast(instance.reduce_memory())); ParallelScan::template exec_range(self.m_functor, range.begin(), range.end(), update, false); - // exec.template scan_large( final_reducer ); - exec.scan_small(final_reducer); + instance.scan_small(final_reducer); ParallelScan::template exec_range(self.m_functor, range.begin(), range.end(), update, true); - exec.fan_in(); + instance.fan_in(); } public: inline void execute() const { - ThreadsExec::resize_scratch(2 * Analysis::value_size(m_functor), 0); - ThreadsExec::start(&ParallelScan::exec, this); - ThreadsExec::fence(); + ThreadsInternal::resize_scratch(2 * Analysis::value_size(m_functor), 0); + ThreadsInternal::start(&ParallelScan::exec, this); + ThreadsInternal::fence(); } ParallelScan(const FunctorType &arg_functor, const Policy &arg_policy) @@ -145,37 +145,37 @@ class ParallelScanWithTotal, } } - static void exec(ThreadsExec &exec, const void *arg) { + static void exec(ThreadsInternal &instance, const void *arg) { const ParallelScanWithTotal &self = *((const ParallelScanWithTotal *)arg); - const WorkRange range(self.m_policy, exec.pool_rank(), exec.pool_size()); + const WorkRange range(self.m_policy, instance.pool_rank(), + instance.pool_size()); typename Analysis::Reducer final_reducer(self.m_functor); reference_type update = - 
final_reducer.init(static_cast<pointer_type>(exec.reduce_memory())); + final_reducer.init(static_cast<pointer_type>(instance.reduce_memory())); ParallelScanWithTotal::template exec_range<WorkTag>( self.m_functor, range.begin(), range.end(), update, false); - // exec.template scan_large(final_reducer); - exec.scan_small(final_reducer); + instance.scan_small(final_reducer); ParallelScanWithTotal::template exec_range<WorkTag>( self.m_functor, range.begin(), range.end(), update, true); - exec.fan_in(); + instance.fan_in(); - if (exec.pool_rank() == exec.pool_size() - 1) { + if (instance.pool_rank() == instance.pool_size() - 1) { *self.m_result_ptr = update; } } public: inline void execute() const { - ThreadsExec::resize_scratch(2 * Analysis::value_size(m_functor), 0); - ThreadsExec::start(&ParallelScanWithTotal::exec, this); - ThreadsExec::fence(); + ThreadsInternal::resize_scratch(2 * Analysis::value_size(m_functor), 0); + ThreadsInternal::start(&ParallelScanWithTotal::exec, this); + ThreadsInternal::fence(); } template diff --git a/core/src/impl/Kokkos_Spinwait.cpp b/core/src/Threads/Kokkos_Threads_Spinwait.cpp similarity index 90% rename from core/src/impl/Kokkos_Spinwait.cpp rename to core/src/Threads/Kokkos_Threads_Spinwait.cpp index 0a7eda29bcf..3df9dc07bf4 100644 --- a/core/src/impl/Kokkos_Spinwait.cpp +++ b/core/src/Threads/Kokkos_Threads_Spinwait.cpp @@ -21,7 +21,7 @@ #include #include -#include <impl/Kokkos_Spinwait.hpp> +#include <Threads/Kokkos_Threads_Spinwait.hpp> #include #include @@ -108,5 +108,15 @@ void host_thread_yield(const uint32_t i, const WaitMode mode) { #endif /* defined( KOKKOS_ENABLE_ASM ) */ } +void spinwait_while_equal(ThreadState const volatile& flag, + ThreadState const value) { + Kokkos::store_fence(); + uint32_t i = 0; + while (value == flag) { + host_thread_yield(++i, WaitMode::ACTIVE); + } + Kokkos::load_fence(); +} + } // namespace Impl } // namespace Kokkos diff --git a/algorithms/src/std_algorithms/Kokkos_Swap.hpp b/core/src/Threads/Kokkos_Threads_Spinwait.hpp similarity index 52% rename from algorithms/src/std_algorithms/Kokkos_Swap.hpp rename to core/src/Threads/Kokkos_Threads_Spinwait.hpp index acd2a572c8c..b98b6dbb73b 100644 --- a/algorithms/src/std_algorithms/Kokkos_Swap.hpp +++ b/core/src/Threads/Kokkos_Threads_Spinwait.hpp @@ -14,28 +14,30 @@ // //@HEADER -#ifndef KOKKOS_STD_ALGORITHMS_SWAP_HPP -#define KOKKOS_STD_ALGORITHMS_SWAP_HPP +#ifndef KOKKOS_THREADS_SPINWAIT_HPP +#define KOKKOS_THREADS_SPINWAIT_HPP -#include <Kokkos_Macros.hpp> +#include <Threads/Kokkos_Threads_State.hpp> + +#include <cstdint> namespace Kokkos { -namespace Experimental { - -// swap -template <class T> -KOKKOS_INLINE_FUNCTION void swap(T& a, T& b) noexcept { - static_assert( - std::is_move_assignable<T>::value && std::is_move_constructible<T>::value, - "Kokkos::Experimental::swap arguments must be move assignable " - "and move constructible"); - - T tmp = std::move(a); - a = std::move(b); - b = std::move(tmp); -} - -} // namespace Experimental +namespace Impl { + +enum class WaitMode : int { + ACTIVE // Used for tight loops to keep threads active longest + , + PASSIVE // Used to quickly yield the thread to quiet down the system + , + ROOT // Never sleep or yield the root thread +}; + +void host_thread_yield(const uint32_t i, const WaitMode mode); + +void spinwait_while_equal(ThreadState const volatile& flag, + ThreadState const value); + +} // namespace Impl } // namespace Kokkos #endif diff --git a/core/src/fwd/Kokkos_Fwd_HBWSpace.hpp b/core/src/Threads/Kokkos_Threads_State.hpp similarity index 59% rename from core/src/fwd/Kokkos_Fwd_HBWSpace.hpp rename to core/src/Threads/Kokkos_Threads_State.hpp index 21ba7fad01c..148e9aa4e05 100644 ---
a/core/src/fwd/Kokkos_Fwd_HBWSpace.hpp +++ b/core/src/Threads/Kokkos_Threads_State.hpp @@ -14,16 +14,26 @@ // //@HEADER -#ifndef KOKKOS_HBWSPACE_FWD_HPP_ -#define KOKKOS_HBWSPACE_FWD_HPP_ +#ifndef KOKKOS_THREADS_STATE_HPP +#define KOKKOS_THREADS_STATE_HPP -#ifdef KOKKOS_ENABLE_HBWSPACE namespace Kokkos { - -namespace Experimental { -class HBWSpace; /// Memory space for hbw_malloc from memkind (e.g. for KNL - /// processor) -} // namespace Experimental +namespace Impl { +/** \brief States of a worker thread */ +enum class ThreadState { + Terminating ///< Termination in progress + , + Inactive ///< Exists, waiting for work + , + Active ///< Exists, performing work + , + Rendezvous ///< Exists, waiting in a barrier or reduce + , + ScanCompleted, + ScanAvailable, + ReductionAvailable +}; +} // namespace Impl } // namespace Kokkos -#endif + #endif diff --git a/core/src/Threads/Kokkos_ThreadsTeam.hpp b/core/src/Threads/Kokkos_Threads_Team.hpp similarity index 95% rename from core/src/Threads/Kokkos_ThreadsTeam.hpp rename to core/src/Threads/Kokkos_Threads_Team.hpp index b1cadc7c485..fd0f221365b 100644 --- a/core/src/Threads/Kokkos_ThreadsTeam.hpp +++ b/core/src/Threads/Kokkos_Threads_Team.hpp @@ -22,10 +22,11 @@ #include #include -#include #include #include +#include +#include //---------------------------------------------------------------------------- @@ -50,8 +51,8 @@ class ThreadsExecTeamMember { private: using space = execution_space::scratch_memory_space; - ThreadsExec* const m_exec; - ThreadsExec* const* m_team_base; ///< Base for team fan-in + ThreadsInternal* const m_instance; + ThreadsInternal* const* m_team_base; ///< Base for team fan-in space m_team_shared; size_t m_team_shared_size; int m_team_size; @@ -84,14 +85,13 @@ class ThreadsExecTeamMember { for (n = 1; (!(m_team_rank_rev & n)) && ((j = m_team_rank_rev + n) < m_team_size); n <<= 1) { - Impl::spinwait_while_equal(m_team_base[j]->state(), - ThreadsExec::Active); + spinwait_while_equal(m_team_base[j]->state(), ThreadState::Active); } // If not root then wait for release if (m_team_rank_rev) { - m_exec->state() = ThreadsExec::Rendezvous; - Impl::spinwait_while_equal(m_exec->state(), ThreadsExec::Rendezvous); + m_instance->state() = ThreadState::Rendezvous; + spinwait_while_equal(m_instance->state(), ThreadState::Rendezvous); } return !m_team_rank_rev; @@ -102,7 +102,7 @@ class ThreadsExecTeamMember { for (n = 1; (!(m_team_rank_rev & n)) && ((j = m_team_rank_rev + n) < m_team_size); n <<= 1) { - m_team_base[j]->state() = ThreadsExec::Active; + m_team_base[j]->state() = ThreadState::Active; } } @@ -188,10 +188,10 @@ class ThreadsExecTeamMember { using type = typename if_c::type; - if (nullptr == m_exec) return value; + if (m_instance == nullptr) return value; if (team_rank() != team_size() - 1) * - ((volatile type*)m_exec->scratch_memory()) = value; + ((volatile type*)m_instance->scratch_memory()) = value; memory_fence(); @@ -229,9 +229,9 @@ class ThreadsExecTeamMember { using type = typename if_c::type; - if (nullptr == m_exec) return; + if (m_instance == nullptr) return; - type* const local_value = ((type*)m_exec->scratch_memory()); + type* const local_value = ((type*)m_instance->scratch_memory()); // Set this thread's contribution if (team_rank() != team_size() - 1) { *local_value = contribution; } @@ -285,9 +285,9 @@ class ThreadsExecTeamMember { using type = typename if_c::type; - if (nullptr == m_exec) return type(0); + if (m_instance == nullptr) return type(0); - volatile type* const work_value = 
((type*)m_exec->scratch_memory()); + volatile type* const work_value = ((type*)m_instance->scratch_memory()); *work_value = value; @@ -342,10 +342,10 @@ class ThreadsExecTeamMember { template ThreadsExecTeamMember( - Impl::ThreadsExec* exec, + Impl::ThreadsInternal* instance, const TeamPolicyInternal& team, const size_t shared_size) - : m_exec(exec), + : m_instance(instance), m_team_base(nullptr), m_team_shared(nullptr, 0), m_team_shared_size(shared_size), @@ -361,9 +361,11 @@ class ThreadsExecTeamMember { if (team.league_size()) { // Execution is using device-team interface: - const int pool_rank_rev = m_exec->pool_size() - (m_exec->pool_rank() + 1); + const int pool_rank_rev = + m_instance->pool_size() - (m_instance->pool_rank() + 1); const int team_rank_rev = pool_rank_rev % team.team_alloc(); - const size_t pool_league_size = m_exec->pool_size() / team.team_alloc(); + const size_t pool_league_size = + m_instance->pool_size() / team.team_alloc(); const size_t pool_league_rank_rev = pool_rank_rev / team.team_alloc(); if (pool_league_rank_rev >= pool_league_size) { m_invalid_thread = 1; @@ -372,7 +374,7 @@ class ThreadsExecTeamMember { const size_t pool_league_rank = pool_league_size - (pool_league_rank_rev + 1); - const int pool_num_teams = m_exec->pool_size() / team.team_alloc(); + const int pool_num_teams = m_instance->pool_size() / team.team_alloc(); const int chunk_size = team.chunk_size() > 0 ? team.chunk_size() : team.team_iter(); const int chunks_per_team = @@ -387,8 +389,8 @@ class ThreadsExecTeamMember { if ((team.team_alloc() > size_t(m_team_size)) ? (team_rank_rev >= m_team_size) - : (m_exec->pool_size() - pool_num_teams * m_team_size > - m_exec->pool_rank())) + : (m_instance->pool_size() - pool_num_teams * m_team_size > + m_instance->pool_rank())) m_invalid_thread = 1; else m_invalid_thread = 0; @@ -398,7 +400,7 @@ class ThreadsExecTeamMember { if (team_rank_rev < team.team_size() && !m_invalid_thread) { m_team_base = - m_exec->pool_base() + team.team_alloc() * pool_league_rank_rev; + m_instance->pool_base() + team.team_alloc() * pool_league_rank_rev; m_team_size = team.team_size(); m_team_rank = team.team_size() - (team_rank_rev + 1); m_team_rank_rev = team_rank_rev; @@ -413,13 +415,13 @@ class ThreadsExecTeamMember { } if ((m_team_rank_rev == 0) && (m_invalid_thread == 0)) { - m_exec->set_work_range(m_league_rank, m_league_end, m_chunk_size); - m_exec->reset_steal_target(m_team_size); + m_instance->set_work_range(m_league_rank, m_league_end, m_chunk_size); + m_instance->reset_steal_target(m_team_size); } if (std::is_same::schedule_type::type, Kokkos::Dynamic>::value) { - m_exec->barrier(); + m_instance->barrier(); } } else { m_invalid_thread = 1; @@ -427,7 +429,7 @@ class ThreadsExecTeamMember { } ThreadsExecTeamMember() - : m_exec(nullptr), + : m_instance(nullptr), m_team_base(nullptr), m_team_shared(nullptr, 0), m_team_shared_size(0), @@ -442,8 +444,8 @@ class ThreadsExecTeamMember { m_invalid_thread(0), m_team_alloc(0) {} - inline ThreadsExec& threads_exec_team_base() const { - return m_team_base ? **m_team_base : *m_exec; + inline ThreadsInternal& threads_exec_team_base() const { + return m_team_base ? 
**m_team_base : *m_instance; } bool valid_static() const { return m_league_rank < m_league_end; } @@ -999,8 +1001,10 @@ KOKKOS_INLINE_FUNCTION void parallel_scan( lambda(i, scan_val, false); } + auto& team_member = loop_bounds.thread; + // 'scan_val' output is the exclusive prefix sum - scan_val = loop_bounds.thread.team_scan(scan_val); + scan_val = team_member.team_scan(scan_val); #ifdef KOKKOS_ENABLE_PRAGMA_IVDEP #pragma ivdep @@ -1010,6 +1014,8 @@ KOKKOS_INLINE_FUNCTION void parallel_scan( lambda(i, scan_val, true); } + team_member.team_broadcast(scan_val, team_member.team_size() - 1); + return_val = scan_val; } diff --git a/core/src/Threads/Kokkos_Threads_WorkGraphPolicy.hpp b/core/src/Threads/Kokkos_Threads_WorkGraphPolicy.hpp index d4ce697548f..c88d66db5f9 100644 --- a/core/src/Threads/Kokkos_Threads_WorkGraphPolicy.hpp +++ b/core/src/Threads/Kokkos_Threads_WorkGraphPolicy.hpp @@ -18,7 +18,7 @@ #define KOKKOS_THREADS_WORKGRAPHPOLICY_HPP #include -#include +#include namespace Kokkos { namespace Impl { @@ -61,16 +61,17 @@ class ParallelFor, } } - static inline void thread_main(ThreadsExec& exec, const void* arg) noexcept { + static inline void thread_main(ThreadsInternal& instance, + const void* arg) noexcept { const Self& self = *(static_cast(arg)); self.exec_one_thread(); - exec.fan_in(); + instance.fan_in(); } public: inline void execute() { - ThreadsExec::start(&Self::thread_main, this); - ThreadsExec::fence(); + ThreadsInternal::start(&Self::thread_main, this); + ThreadsInternal::fence(); } inline ParallelFor(const FunctorType& arg_functor, const Policy& arg_policy) diff --git a/core/src/decl/Kokkos_Declare_HBWSpace.hpp b/core/src/decl/Kokkos_Declare_HBWSpace.hpp deleted file mode 100644 index 1328c931352..00000000000 --- a/core/src/decl/Kokkos_Declare_HBWSpace.hpp +++ /dev/null @@ -1,24 +0,0 @@ -//@HEADER -// ************************************************************************ -// -// Kokkos v. 4.0 -// Copyright (2022) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. -// -// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. -// See https://kokkos.org/LICENSE for license information. 
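The parallel_scan change in Kokkos_Threads_Team.hpp above is a semantic fix, not just a rename: team_scan computes an exclusive prefix sum, so after the scan only the last team rank holds the grand total, yet every rank returns return_val. Broadcasting from rank team_size() - 1 makes all ranks agree. A minimal stand-alone sketch of that contract (plain C++ over a vector standing in for the team; names are illustrative, not Kokkos API):

#include <vector>

// Exclusive scan: slot i ends up with the sum of slots 0..i-1. The grand
// total is only known after the last slot -- the analogue of the last team
// rank -- so it is returned separately, mirroring the broadcast above.
int exclusive_scan_with_total(std::vector<int>& vals) {
  int running = 0;
  for (int& v : vals) {
    const int mine = v;
    v = running;      // exclusive prefix for this "rank"
    running += mine;  // accumulates toward the grand total
  }
  return running;  // every "rank" should observe this same value
}

Without the broadcast, ranks other than the last would return whatever happened to be in their local scan_val, which is exactly the inconsistency the added team_broadcast call removes.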
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//@HEADER - -#ifndef KOKKOS_DECLARE_HBWSPACE_HPP -#define KOKKOS_DECLARE_HBWSPACE_HPP - -#ifdef KOKKOS_ENABLE_HBWSPACE -#include -#endif - -#endif diff --git a/core/src/decl/Kokkos_Declare_HIP.hpp b/core/src/decl/Kokkos_Declare_HIP.hpp index e115f7051f3..cf405e57b8f 100644 --- a/core/src/decl/Kokkos_Declare_HIP.hpp +++ b/core/src/decl/Kokkos_Declare_HIP.hpp @@ -25,9 +25,13 @@ #include #include #include -#include -#include -#include +#include +#include +#include +#include +#include +#include +#include #include #include #include diff --git a/core/src/decl/Kokkos_Declare_THREADS.hpp b/core/src/decl/Kokkos_Declare_THREADS.hpp index f5cbc0c1d1d..4d7caec6f5f 100644 --- a/core/src/decl/Kokkos_Declare_THREADS.hpp +++ b/core/src/decl/Kokkos_Declare_THREADS.hpp @@ -19,7 +19,7 @@ #if defined(KOKKOS_ENABLE_THREADS) #include -#include +#include #include #include #include @@ -28,7 +28,7 @@ #include #include #include -#include +#include #include #endif diff --git a/core/src/impl/Kokkos_Core.cpp b/core/src/impl/Kokkos_Core.cpp index 5c182db5663..4a696526161 100644 --- a/core/src/impl/Kokkos_Core.cpp +++ b/core/src/impl/Kokkos_Core.cpp @@ -90,8 +90,6 @@ void combine(Kokkos::InitializationSettings& out, KOKKOS_IMPL_COMBINE_SETTING(num_threads); KOKKOS_IMPL_COMBINE_SETTING(map_device_id_by); KOKKOS_IMPL_COMBINE_SETTING(device_id); - KOKKOS_IMPL_COMBINE_SETTING(num_devices); - KOKKOS_IMPL_COMBINE_SETTING(skip_device); KOKKOS_IMPL_COMBINE_SETTING(disable_warnings); KOKKOS_IMPL_COMBINE_SETTING(tune_internals); KOKKOS_IMPL_COMBINE_SETTING(tools_help); @@ -131,11 +129,15 @@ void combine(Kokkos::Tools::InitArguments& out, int get_device_count() { #if defined(KOKKOS_ENABLE_CUDA) - return Kokkos::Cuda::detect_device_count(); + int count; + KOKKOS_IMPL_CUDA_SAFE_CALL(cudaGetDeviceCount(&count)); + return count; #elif defined(KOKKOS_ENABLE_HIP) - return Kokkos::HIP::detect_device_count(); + int count; + KOKKOS_IMPL_HIP_SAFE_CALL(hipGetDeviceCount(&count)); + return count; #elif defined(KOKKOS_ENABLE_SYCL) - return sycl::device::get_devices(sycl::info::device_type::gpu).size(); + return Kokkos::Experimental::Impl::get_sycl_devices().size(); #elif defined(KOKKOS_ENABLE_OPENACC) return acc_get_num_devices( Kokkos::Experimental::Impl::OpenACC_Traits::dev_type); @@ -165,20 +167,43 @@ bool is_valid_map_device_id_by(std::string const& x) { } // namespace +std::vector const& Kokkos::Impl::get_visible_devices() { + static auto devices = get_visible_devices(get_device_count()); + return devices; +} + [[nodiscard]] int Kokkos::device_id() noexcept { #if defined(KOKKOS_ENABLE_CUDA) - return Cuda().cuda_device(); + int device = Cuda().cuda_device(); #elif defined(KOKKOS_ENABLE_HIP) - return HIP().hip_device(); + int device = HIP().hip_device(); #elif defined(KOKKOS_ENABLE_OPENACC) - return Experimental::OpenACC().acc_device_number(); + int device = Experimental::OpenACC().acc_device_number(); #elif defined(KOKKOS_ENABLE_OPENMPTARGET) - return omp_get_default_device(); // FIXME_OPENMPTARGET + int device = omp_get_default_device(); // FIXME_OPENMPTARGET #elif defined(KOKKOS_ENABLE_SYCL) - return Experimental::Impl::SYCLInternal::m_syclDev; + int device = Experimental::Impl::SYCLInternal::m_syclDev; #else - return -1; + int device = -1; + return device; #endif + auto const& visible_devices = Impl::get_visible_devices(); + for (std::size_t i = 0; i < visible_devices.size(); ++i) { + if (visible_devices[i] == device) { + return i; + } + } + Kokkos::abort("Unexpected 
error: cannot determine device id"); + return -1; +} + +[[nodiscard]] int Kokkos::num_devices() noexcept { + if constexpr (std::is_same_v<DefaultExecutionSpace, DefaultHostExecutionSpace>) { + return -1; // no GPU backend enabled + } else { + return Impl::get_visible_devices().size(); + } } [[nodiscard]] int Kokkos::num_threads() noexcept { @@ -313,8 +338,7 @@ int Kokkos::Impl::get_ctest_gpu(int local_rank) { return std::stoi(id.c_str()); } -std::vector<int> Kokkos::Impl::get_visible_devices( - Kokkos::InitializationSettings const& settings, int device_count) { +std::vector<int> Kokkos::Impl::get_visible_devices(int device_count) { std::vector<int> visible_devices; char* env_visible_devices = std::getenv("KOKKOS_VISIBLE_DEVICES"); if (env_visible_devices) { @@ -341,30 +365,9 @@ std::vector<int> Kokkos::Impl::get_visible_devices( } } } else { - int num_devices = - settings.has_num_devices() ? settings.get_num_devices() : device_count; - if (num_devices > device_count) { - std::stringstream ss; - ss << "Error: Specified number of devices '" << num_devices - << "' exceeds the actual number of GPUs available for execution '" - << device_count << "'." - << " Raised by Kokkos::initialize().\n"; - Kokkos::abort(ss.str().c_str()); - } - for (int i = 0; i < num_devices; ++i) { + for (int i = 0; i < device_count; ++i) { visible_devices.push_back(i); } - if (settings.has_skip_device()) { - if (visible_devices.size() == 1 && settings.get_skip_device() == 0) { - Kokkos::abort( - "Error: skipping the only GPU available for execution.\n" - " Raised by Kokkos::initialize().\n"); - } - visible_devices.erase( - std::remove(visible_devices.begin(), visible_devices.end(), - settings.get_skip_device()), - visible_devices.end()); - } } if (visible_devices.empty()) { Kokkos::abort( @@ -374,10 +377,10 @@ std::vector<int> Kokkos::Impl::get_visible_devices( } return visible_devices; } -int Kokkos::Impl::get_gpu(const InitializationSettings& settings) { - std::vector<int> visible_devices = - get_visible_devices(settings, get_device_count()); - int const num_devices = visible_devices.size(); +std::optional<int> Kokkos::Impl::get_gpu( + const InitializationSettings& settings) { + std::vector<int> visible_devices = get_visible_devices(get_device_count()); + int const num_devices = visible_devices.size(); // device_id is provided if (settings.has_device_id()) { int const id = settings.get_device_id(); @@ -423,14 +426,15 @@ int Kokkos::Impl::get_gpu(const InitializationSettings& settings) { int const mpi_local_rank = mpi_local_rank_on_node(); - // use first GPU available for execution if unable to detect local MPI rank + // if unable to detect local MPI rank return nullopt to delegate device + // selection to the backend if (mpi_local_rank < 0) { if (settings.has_map_device_id_by()) { std::cerr << "Warning: unable to detect local MPI rank." << " Falling back to the first GPU available for execution." << " Raised by Kokkos::initialize()." << std::endl; } - return visible_devices[0]; + return std::nullopt; } // use device assigned by CTest when resource allocation is activated @@ -445,13 +449,6 @@ int Kokkos::Impl::get_gpu(const InitializationSettings& settings) { namespace { void initialize_backends(const Kokkos::InitializationSettings& settings) { -// This is an experimental setting -// For KNL in Flat mode this variable should be set, so that -// memkind allocates high bandwidth memory correctly.
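The Kokkos_Core.cpp rework above routes all device selection through one visibility list: KOKKOS_VISIBLE_DEVICES (a comma-separated list of ids) is parsed once, and device_id() then reports the position of the backend's raw handle inside that list. A self-contained sketch of the parsing step, assuming the environment variable carries plain integers (hypothetical helper, not the library function):

#include <cstdlib>
#include <sstream>
#include <string>
#include <vector>

std::vector<int> parse_visible_devices(int device_count) {
  std::vector<int> ids;
  if (const char* env = std::getenv("KOKKOS_VISIBLE_DEVICES")) {
    std::stringstream ss(env);
    for (std::string token; std::getline(ss, token, ',');) {
      ids.push_back(std::stoi(token));  // user-chosen subset, user order
    }
  } else {
    for (int i = 0; i < device_count; ++i) ids.push_back(i);  // everything
  }
  return ids;
}

With that list in hand, the linear search in device_id() above is all that is needed to translate a raw backend handle into the logical id users see.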
-#ifdef KOKKOS_ENABLE_HBWSPACE - setenv("MEMKIND_HBW_NODES", "1", 0); -#endif - Kokkos::Impl::ExecSpaceManager::get_instance().initialize_spaces(settings); } @@ -571,19 +568,6 @@ void pre_initialize_internal(const Kokkos::InitializationSettings& settings) { "no"); #endif -#ifdef KOKKOS_ENABLE_HBWSPACE - declare_configuration_metadata("memory", "KOKKOS_ENABLE_HBWSPACE", "yes"); -#else - declare_configuration_metadata("memory", "KOKKOS_ENABLE_HBWSPACE", "no"); -#endif -#ifdef KOKKOS_ENABLE_INTEL_MM_ALLOC - declare_configuration_metadata("memory", "KOKKOS_ENABLE_INTEL_MM_ALLOC", - "yes"); -#else - declare_configuration_metadata("memory", "KOKKOS_ENABLE_INTEL_MM_ALLOC", - "no"); -#endif - #ifdef KOKKOS_ENABLE_ASM declare_configuration_metadata("options", "KOKKOS_ENABLE_ASM", "yes"); #else @@ -604,6 +588,11 @@ void pre_initialize_internal(const Kokkos::InitializationSettings& settings) { #else declare_configuration_metadata("options", "KOKKOS_ENABLE_CXX23", "no"); #endif +#ifdef KOKKOS_ENABLE_CXX26 + declare_configuration_metadata("options", "KOKKOS_ENABLE_CXX26", "yes"); +#else + declare_configuration_metadata("options", "KOKKOS_ENABLE_CXX26", "no"); +#endif #ifdef KOKKOS_ENABLE_DEBUG_BOUNDS_CHECK declare_configuration_metadata("options", "KOKKOS_ENABLE_DEBUG_BOUNDS_CHECK", "yes"); @@ -616,11 +605,6 @@ void pre_initialize_internal(const Kokkos::InitializationSettings& settings) { #else declare_configuration_metadata("options", "KOKKOS_ENABLE_HWLOC", "no"); #endif -#ifdef KOKKOS_ENABLE_LIBRT - declare_configuration_metadata("options", "KOKKOS_ENABLE_LIBRT", "yes"); -#else - declare_configuration_metadata("options", "KOKKOS_ENABLE_LIBRT", "no"); -#endif #ifdef KOKKOS_ENABLE_LIBDL declare_configuration_metadata("options", "KOKKOS_ENABLE_LIBDL", "yes"); #else @@ -645,8 +629,6 @@ void pre_initialize_internal(const Kokkos::InitializationSettings& settings) { "ARMV8_THUNDERX2"); #elif defined(KOKKOS_ARCH_BDW) declare_configuration_metadata("architecture", "CPU architecture", "BDW"); -#elif defined(KOKKOS_ARCH_BGQ) - declare_configuration_metadata("architecture", "CPU architecture", "BGQ"); #elif defined(KOKKOS_ARCH_HSW) declare_configuration_metadata("architecture", "CPU architecture", "HSW"); #elif defined(KOKKOS_ARCH_ICL) @@ -659,8 +641,6 @@ void pre_initialize_internal(const Kokkos::InitializationSettings& settings) { declare_configuration_metadata("architecture", "CPU architecture", "KNL"); #elif defined(KOKKOS_ARCH_NATIVE) declare_configuration_metadata("architecture", "CPU architecture", "NATIVE"); -#elif defined(KOKKOS_ARCH_POWER7) - declare_configuration_metadata("architecture", "CPU architecture", "POWER7"); #elif defined(KOKKOS_ARCH_POWER8) declare_configuration_metadata("architecture", "CPU architecture", "POWER8"); #elif defined(KOKKOS_ARCH_POWER9) @@ -673,8 +653,6 @@ void pre_initialize_internal(const Kokkos::InitializationSettings& settings) { declare_configuration_metadata("architecture", "CPU architecture", "SNB"); #elif defined(KOKKOS_ARCH_SPR) declare_configuration_metadata("architecture", "CPU architecture", "SPR"); -#elif defined(KOKKOS_ARCH_WSM) - declare_configuration_metadata("architecture", "CPU architecture", "WSM"); #elif defined(KOKKOS_ARCH_AMD_ZEN) declare_configuration_metadata("architecture", "CPU architecture", "AMD_ZEN"); #elif defined(KOKKOS_ARCH_AMD_ZEN2) @@ -683,6 +661,9 @@ void pre_initialize_internal(const Kokkos::InitializationSettings& settings) { #elif defined(KOKKOS_ARCH_AMD_ZEN3) declare_configuration_metadata("architecture", "CPU architecture", "AMD_ZEN3"); 
+#elif defined(KOKKOS_ARCH_RISCV_SG2042) + declare_configuration_metadata("architecture", "CPU architecture", + "SG2042 (RISC-V)"); #else declare_configuration_metadata("architecture", "CPU architecture", "none"); #endif @@ -752,8 +733,8 @@ void pre_initialize_internal(const Kokkos::InitializationSettings& settings) { #elif defined(KOKKOS_ARCH_ADA89) declare_configuration_metadata("architecture", "GPU architecture", "ADA89"); #elif defined(KOKKOS_ARCH_HOPPER90) - declare_configuration_metadata("architecture", "GPU architecture", - "HOPPER90"); + declare_configuration_metadata("architecture", "GPU architecture", + "HOPPER90"); #elif defined(KOKKOS_ARCH_AMD_GFX906) declare_configuration_metadata("architecture", "GPU architecture", "AMD_GFX906"); @@ -911,36 +892,18 @@ void Kokkos::Impl::parse_command_line_arguments( int num_threads; int device_id; - int num_devices; // deprecated - int skip_device; // deprecated std::string map_device_id_by; bool disable_warnings; bool print_configuration; bool tune_internals; - auto get_flag = [](std::string s) -> std::string { - return s.erase(s.find('=')); - }; - bool help_flag = false; int iarg = 0; while (iarg < argc) { bool remove_flag = false; - if (check_arg(argv[iarg], "--kokkos-numa") || - check_arg(argv[iarg], "--numa")) { - warn_deprecated_command_line_argument(get_flag(argv[iarg])); - // remove flag if prefixed with '--kokkos-' - remove_flag = std::string(argv[iarg]).find("--kokkos-") == 0; - } else if (check_arg_int(argv[iarg], "--kokkos-num-threads", num_threads) || - check_arg_int(argv[iarg], "--num-threads", num_threads) || - check_arg_int(argv[iarg], "--kokkos-threads", num_threads) || - check_arg_int(argv[iarg], "--threads", num_threads)) { - if (get_flag(argv[iarg]) != "--kokkos-num-threads") { - warn_deprecated_command_line_argument(get_flag(argv[iarg]), - "--kokkos-num-threads"); - } + if (check_arg_int(argv[iarg], "--kokkos-num-threads", num_threads)) { if (!is_valid_num_threads(num_threads)) { std::stringstream ss; ss << "Error: command line argument '" << argv[iarg] << "' is invalid." @@ -949,15 +912,8 @@ void Kokkos::Impl::parse_command_line_arguments( Kokkos::abort(ss.str().c_str()); } settings.set_num_threads(num_threads); - remove_flag = std::string(argv[iarg]).find("--kokkos-") == 0; - } else if (check_arg_int(argv[iarg], "--kokkos-device-id", device_id) || - check_arg_int(argv[iarg], "--device-id", device_id) || - check_arg_int(argv[iarg], "--kokkos-device", device_id) || - check_arg_int(argv[iarg], "--device", device_id)) { - if (get_flag(argv[iarg]) != "--kokkos-device-id") { - warn_deprecated_command_line_argument(get_flag(argv[iarg]), - "--kokkos-device-id"); - } + remove_flag = true; + } else if (check_arg_int(argv[iarg], "--kokkos-device-id", device_id)) { if (!is_valid_device_id(device_id)) { std::stringstream ss; ss << "Error: command line argument '" << argv[iarg] << "' is invalid."
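Only the canonical --kokkos-* spellings survive the cleanup above; every recognized flag now sets remove_flag unconditionally so it is stripped from argv before the application parses its own options. For illustration, a parser matching the apparent contract of check_arg_int (the real helper is internal to Kokkos, so this is an assumed reimplementation):

#include <cstring>
#include <string>

// Accepts exactly "<flag>=<integer>"; on success stores the value and
// returns true, otherwise leaves 'out' untouched and returns false.
bool check_arg_int_sketch(const char* arg, const char* flag, int& out) {
  const std::size_t n = std::strlen(flag);
  if (std::strncmp(arg, flag, n) != 0 || arg[n] != '=') return false;
  try {
    out = std::stoi(std::string(arg + n + 1));
  } catch (...) {
    return false;  // not an integer; reject rather than guess
  }
  return true;
}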
@@ -966,70 +922,7 @@ void Kokkos::Impl::parse_command_line_arguments( Kokkos::abort(ss.str().c_str()); } settings.set_device_id(device_id); - remove_flag = std::string(argv[iarg]).find("--kokkos-") == 0; - } else if (check_arg(argv[iarg], "--kokkos-num-devices") || - check_arg(argv[iarg], "--num-devices") || - check_arg(argv[iarg], "--kokkos-ndevices") || - check_arg(argv[iarg], "--ndevices")) { - if (check_arg(argv[iarg], "--num-devices")) { - warn_deprecated_command_line_argument("--num-devices", - "--kokkos-num-devices"); - } - if (check_arg(argv[iarg], "--ndevices")) { - warn_deprecated_command_line_argument("--ndevices", - "--kokkos-num-devices"); - } - if (check_arg(argv[iarg], "--kokkos-ndevices")) { - warn_deprecated_command_line_argument("--kokkos-ndevices", - "--kokkos-num-devices"); - } - warn_deprecated_command_line_argument( - "--kokkos-num-devices", "--kokkos-map-device-id-by=mpi_rank"); - // Find the number of device (expecting --device=XX) - if (!((strncmp(argv[iarg], "--kokkos-num-devices=", 21) == 0) || - (strncmp(argv[iarg], "--num-devices=", 14) == 0) || - (strncmp(argv[iarg], "--kokkos-ndevices=", 18) == 0) || - (strncmp(argv[iarg], "--ndevices=", 11) == 0))) - throw_runtime_exception( - "Error: expecting an '=INT[,INT]' after command line argument " - "'--kokkos-num-devices'." - " Raised by Kokkos::initialize()."); - - char* num1 = strchr(argv[iarg], '=') + 1; - char* num2 = strpbrk(num1, ","); - int num1_len = num2 == nullptr ? strlen(num1) : num2 - num1; - char* num1_only = new char[num1_len + 1]; - strncpy(num1_only, num1, num1_len); - num1_only[num1_len] = '\0'; - - if (!is_unsigned_int(num1_only) || (strlen(num1_only) == 0)) { - throw_runtime_exception( - "Error: expecting an integer number after command line argument " - "'--kokkos-num-devices'." - " Raised by Kokkos::initialize()."); - } - if (check_arg(argv[iarg], "--kokkos-num-devices") || - check_arg(argv[iarg], "--kokkos-ndevices")) { - num_devices = std::stoi(num1_only); - settings.set_num_devices(num_devices); - settings.set_map_device_id_by("mpi_rank"); - } - delete[] num1_only; - - if (num2 != nullptr) { - if ((!is_unsigned_int(num2 + 1)) || (strlen(num2) == 1)) - throw_runtime_exception( - "Error: expecting an integer number after command line argument " - "'--kokkos-num-devices=XX,'." - " Raised by Kokkos::initialize()."); - - if (check_arg(argv[iarg], "--kokkos-num-devices") || - check_arg(argv[iarg], "--kokkos-ndevices")) { - skip_device = std::stoi(num2 + 1); - settings.set_skip_device(skip_device); - } - } - remove_flag = std::string(argv[iarg]).find("--kokkos-") == 0; + remove_flag = true; } else if (check_arg_bool(argv[iarg], "--kokkos-disable-warnings", disable_warnings)) { settings.set_disable_warnings(disable_warnings); @@ -1098,9 +991,6 @@ void Kokkos::Impl::parse_environment_variables( } combine(settings, tools_init_arguments); - if (std::getenv("KOKKOS_NUMA")) { - warn_deprecated_environment_variable("KOKKOS_NUMA"); - } int num_threads; if (check_env_int("KOKKOS_NUM_THREADS", num_threads)) { if (!is_valid_num_threads(num_threads)) { @@ -1125,34 +1015,6 @@ void Kokkos::Impl::parse_environment_variables( } settings.set_device_id(device_id); } - int num_devices; - int rand_devices; - bool has_num_devices = check_env_int("KOKKOS_NUM_DEVICES", num_devices); - bool has_rand_devices = check_env_int("KOKKOS_RAND_DEVICES", rand_devices); - if (has_rand_devices && has_num_devices) { - Impl::throw_runtime_exception( - "Error: cannot specify both KOKKOS_NUM_DEVICES and " - "KOKKOS_RAND_DEVICES." 
- " Raised by Kokkos::initialize()."); - } - if (has_num_devices) { - warn_deprecated_environment_variable("KOKKOS_NUM_DEVICES", - "KOKKOS_MAP_DEVICE_ID_BY=mpi_rank"); - settings.set_map_device_id_by("mpi_rank"); - settings.set_num_devices(num_devices); - } - if (has_rand_devices) { - warn_deprecated_environment_variable("KOKKOS_RAND_DEVICES", - "KOKKOS_MAP_DEVICE_ID_BY=random"); - settings.set_map_device_id_by("random"); - settings.set_num_devices(rand_devices); - } - if (has_num_devices || has_rand_devices) { - int skip_device; - if (check_env_int("KOKKOS_SKIP_DEVICE", skip_device)) { - settings.set_skip_device(skip_device); - } - } bool disable_warnings; if (check_env_bool("KOKKOS_DISABLE_WARNINGS", disable_warnings)) { settings.set_disable_warnings(disable_warnings); diff --git a/core/src/impl/Kokkos_DeviceManagement.hpp b/core/src/impl/Kokkos_DeviceManagement.hpp index bd89c8b19ca..70dca5d8fad 100644 --- a/core/src/impl/Kokkos_DeviceManagement.hpp +++ b/core/src/impl/Kokkos_DeviceManagement.hpp @@ -17,17 +17,17 @@ #ifndef KOKKOS_DEVICE_MANAGEMENT_HPP #define KOKKOS_DEVICE_MANAGEMENT_HPP +#include #include namespace Kokkos { class InitializationSettings; namespace Impl { -int get_gpu(const Kokkos::InitializationSettings& settings); +std::optional get_gpu(const Kokkos::InitializationSettings& settings); // This declaration is provided for testing purposes only int get_ctest_gpu(int local_rank); -// ditto -std::vector get_visible_devices( - Kokkos::InitializationSettings const& settings, int device_count); +std::vector get_visible_devices(int device_count); // test-only +std::vector const& get_visible_devices(); // use this instead } // namespace Impl } // namespace Kokkos diff --git a/core/src/impl/Kokkos_Error.cpp b/core/src/impl/Kokkos_Error.cpp index 4babe2d72bd..de6e83ed1f2 100644 --- a/core/src/impl/Kokkos_Error.cpp +++ b/core/src/impl/Kokkos_Error.cpp @@ -21,10 +21,11 @@ #include #include -#include +#include #include #include #include +#include // show_warnings #include #include @@ -38,6 +39,12 @@ void throw_runtime_exception(const std::string &msg) { throw std::runtime_error(msg); } +void log_warning(const std::string &msg) { + if (show_warnings()) { + std::cerr << msg << std::flush; + } +} + std::string human_memory_size(size_t arg_bytes) { double bytes = arg_bytes; const double K = 1024; @@ -64,7 +71,8 @@ std::string human_memory_size(size_t arg_bytes) { void Experimental::RawMemoryAllocationFailure::print_error_message( std::ostream &o) const { - o << "Allocation of size " << Impl::human_memory_size(m_attempted_size); + o << "Allocation of size " + << ::Kokkos::Impl::human_memory_size(m_attempted_size); o << " failed"; switch (m_failure_mode) { case FailureMode::OutOfMemoryError: diff --git a/core/src/impl/Kokkos_Error.hpp b/core/src/impl/Kokkos_Error.hpp index 3d0b1d3274c..1058fd98dbf 100644 --- a/core/src/impl/Kokkos_Error.hpp +++ b/core/src/impl/Kokkos_Error.hpp @@ -28,6 +28,8 @@ namespace Impl { [[noreturn]] void throw_runtime_exception(const std::string &msg); +void log_warning(const std::string &msg); + std::string human_memory_size(size_t arg_bytes); } // namespace Impl @@ -58,7 +60,8 @@ class RawMemoryAllocationFailure : public std::bad_alloc { HIPMallocManaged, SYCLMallocDevice, SYCLMallocShared, - SYCLMallocHost + SYCLMallocHost, + OpenACCMalloc, }; private: diff --git a/core/src/impl/Kokkos_HBWSpace.cpp b/core/src/impl/Kokkos_HBWSpace.cpp deleted file mode 100644 index cd640b88cb9..00000000000 --- a/core/src/impl/Kokkos_HBWSpace.cpp +++ /dev/null @@ -1,313 +0,0 @@ 
-//@HEADER -// ************************************************************************ -// -// Kokkos v. 4.0 -// Copyright (2022) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. -// -// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. -// See https://kokkos.org/LICENSE for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//@HEADER - -#ifndef KOKKOS_IMPL_PUBLIC_INCLUDE -#define KOKKOS_IMPL_PUBLIC_INCLUDE -#endif - -#include - -#include -#include -#include -#include - -#include -#include -#include -#include - -#include -#include -#include -#include -#ifdef KOKKOS_ENABLE_HBWSPACE -#include -#endif - -#include - -//---------------------------------------------------------------------------- -//---------------------------------------------------------------------------- -#ifdef KOKKOS_ENABLE_HBWSPACE -#define MEMKIND_TYPE MEMKIND_HBW // hbw_get_kind(HBW_PAGESIZE_4KB) - -/*--------------------------------------------------------------------------*/ - -namespace Kokkos { -namespace Experimental { - -/* Default allocation mechanism */ -HBWSpace::HBWSpace() : m_alloc_mech(HBWSpace::STD_MALLOC) { - printf("Init\n"); - setenv("MEMKIND_HBW_NODES", "1", 0); -} - -/* Default allocation mechanism */ -HBWSpace::HBWSpace(const HBWSpace::AllocationMechanism &arg_alloc_mech) - : m_alloc_mech(HBWSpace::STD_MALLOC) { - printf("Init2\n"); - setenv("MEMKIND_HBW_NODES", "1", 0); - if (arg_alloc_mech == STD_MALLOC) { - m_alloc_mech = HBWSpace::STD_MALLOC; - } -} - -void *HBWSpace::allocate(const size_t arg_alloc_size) const { - return allocate("[unlabeled]", arg_alloc_size); -} -void *HBWSpace::allocate(const char *arg_label, const size_t arg_alloc_size, - const size_t arg_logical_size) const { - return impl_allocate(arg_label, arg_alloc_size, arg_logical_size); -} -void *HBWSpace::impl_allocate( - const char *arg_label, const size_t arg_alloc_size, - const size_t arg_logical_size, - const Kokkos::Tools::SpaceHandle arg_handle) const { - static_assert(sizeof(void *) == sizeof(uintptr_t), - "Error sizeof(void*) != sizeof(uintptr_t)"); - - static_assert( - Kokkos::Impl::power_of_two::value, - "Memory alignment must be power of two"); - - constexpr uintptr_t alignment = Kokkos::Impl::MEMORY_ALIGNMENT; - constexpr uintptr_t alignment_mask = alignment - 1; - - void *ptr = nullptr; - - if (arg_alloc_size) { - if (m_alloc_mech == STD_MALLOC) { - // Over-allocate to and round up to guarantee proper alignment. - size_t size_padded = arg_alloc_size + sizeof(void *) + alignment; - - void *alloc_ptr = memkind_malloc(MEMKIND_TYPE, size_padded); - - if (alloc_ptr) { - uintptr_t address = reinterpret_cast(alloc_ptr); - - // offset enough to record the alloc_ptr - address += sizeof(void *); - uintptr_t rem = address % alignment; - uintptr_t offset = rem ? 
(alignment - rem) : 0u; - address += offset; - ptr = reinterpret_cast(address); - // record the alloc'd pointer - address -= sizeof(void *); - *reinterpret_cast(address) = alloc_ptr; - } - } - } - - if ((ptr == nullptr) || (reinterpret_cast(ptr) == ~uintptr_t(0)) || - (reinterpret_cast(ptr) & alignment_mask)) { - std::ostringstream msg; - msg << "Kokkos::Experimental::HBWSpace::allocate[ "; - switch (m_alloc_mech) { - case STD_MALLOC: msg << "STD_MALLOC"; break; - case POSIX_MEMALIGN: msg << "POSIX_MEMALIGN"; break; - case POSIX_MMAP: msg << "POSIX_MMAP"; break; - case INTEL_MM_ALLOC: msg << "INTEL_MM_ALLOC"; break; - } - msg << " ]( " << arg_alloc_size << " ) FAILED"; - if (ptr == nullptr) { - msg << " nullptr"; - } else { - msg << " NOT ALIGNED " << ptr; - } - - std::cerr << msg.str() << std::endl; - std::cerr.flush(); - - Kokkos::Impl::throw_runtime_exception(msg.str()); - } - if (Kokkos::Profiling::profileLibraryLoaded()) { - const size_t reported_size = - (arg_logical_size > 0) ? arg_logical_size : arg_alloc_size; - Kokkos::Profiling::allocateData(arg_handle, arg_label, ptr, reported_size); - } - - return ptr; -} - -void HBWSpace::deallocate(void *const arg_alloc_ptr, - const size_t arg_alloc_size) const { - deallocate("[unlabeled]", arg_alloc_ptr, arg_alloc_size); -} -void HBWSpace::deallocate(const char *arg_label, void *const arg_alloc_ptr, - const size_t arg_alloc_size, - const size_t arg_logical_size) const { - impl_deallocate(arg_label, arg_alloc_ptr, arg_alloc_size, arg_logical_size); -} -void HBWSpace::impl_deallocate( - const char *arg_label, void *const arg_alloc_ptr, - const size_t arg_alloc_size, const size_t arg_logical_size, - const Kokkos::Tools::SpaceHandle arg_handle) const { - if (arg_alloc_ptr) { - if (Kokkos::Profiling::profileLibraryLoaded()) { - const size_t reported_size = - (arg_logical_size > 0) ? 
arg_logical_size : arg_alloc_size; - Kokkos::Profiling::deallocateData(arg_handle, arg_label, arg_alloc_ptr, - reported_size); - } - - if (m_alloc_mech == STD_MALLOC) { - void *alloc_ptr = *(reinterpret_cast(arg_alloc_ptr) - 1); - memkind_free(MEMKIND_TYPE, alloc_ptr); - } - } -} - -} // namespace Experimental -} // namespace Kokkos - -//---------------------------------------------------------------------------- -//---------------------------------------------------------------------------- - -namespace Kokkos { -namespace Impl { - -#ifdef KOKKOS_ENABLE_DEBUG -SharedAllocationRecord - SharedAllocationRecord::s_root_record; -#endif - -void SharedAllocationRecord::deallocate( - SharedAllocationRecord *arg_rec) { - delete static_cast(arg_rec); -} - -SharedAllocationRecord::~SharedAllocationRecord() { - m_space.deallocate(m_label.c_str(), - SharedAllocationRecord::m_alloc_ptr, - SharedAllocationRecord::m_alloc_size, - (SharedAllocationRecord::m_alloc_size - - sizeof(SharedAllocationHeader))); -} - -SharedAllocationRecord:: - SharedAllocationRecord( - const Kokkos::Experimental::HBWSpace &arg_space, - const std::string &arg_label, const size_t arg_alloc_size, - const SharedAllocationRecord::function_type arg_dealloc) - // Pass through allocated [ SharedAllocationHeader , user_memory ] - // Pass through deallocation function - : SharedAllocationRecord( -#ifdef KOKKOS_ENABLE_DEBUG - &SharedAllocationRecord::s_root_record, -#endif - Impl::checked_allocation_with_header(arg_space, arg_label, - arg_alloc_size), - sizeof(SharedAllocationHeader) + arg_alloc_size, arg_dealloc, - arg_label), - m_space(arg_space) { - // Fill in the Header information - RecordBase::m_alloc_ptr->m_record = - static_cast *>(this); - - strncpy(RecordBase::m_alloc_ptr->m_label, arg_label.c_str(), - SharedAllocationHeader::maximum_label_length - 1); - // Set last element zero, in case c_str is too long - RecordBase::m_alloc_ptr - ->m_label[SharedAllocationHeader::maximum_label_length - 1] = '\0'; -} - -//---------------------------------------------------------------------------- - -void * -SharedAllocationRecord::allocate_tracked( - const Kokkos::Experimental::HBWSpace &arg_space, - const std::string &arg_alloc_label, const size_t arg_alloc_size) { - if (!arg_alloc_size) return nullptr; - - SharedAllocationRecord *const r = - allocate(arg_space, arg_alloc_label, arg_alloc_size); - - RecordBase::increment(r); - - return r->data(); -} - -void SharedAllocationRecord::deallocate_tracked(void *const - arg_alloc_ptr) { - if (arg_alloc_ptr != nullptr) { - SharedAllocationRecord *const r = get_record(arg_alloc_ptr); - - RecordBase::decrement(r); - } -} - -void *SharedAllocationRecord:: - reallocate_tracked(void *const arg_alloc_ptr, const size_t arg_alloc_size) { - SharedAllocationRecord *const r_old = get_record(arg_alloc_ptr); - SharedAllocationRecord *const r_new = - allocate(r_old->m_space, r_old->get_label(), arg_alloc_size); - - Kokkos::Impl::DeepCopy( - r_new->data(), r_old->data(), std::min(r_old->size(), r_new->size())); - Kokkos::fence( - "SharedAllocationRecord::reallocate_tracked(): fence after copying data"); - - RecordBase::increment(r_new); - RecordBase::decrement(r_old); - - return r_new->data(); -} - -SharedAllocationRecord - *SharedAllocationRecord::get_record( - void *alloc_ptr) { - using Header = SharedAllocationHeader; - using RecordHost = - SharedAllocationRecord; - - SharedAllocationHeader const *const head = - alloc_ptr ? Header::get_header(alloc_ptr) : nullptr; - RecordHost *const record = - head ? 
static_cast(head->m_record) : nullptr; - - if (!alloc_ptr || record->m_alloc_ptr != head) { - Kokkos::Impl::throw_runtime_exception(std::string( - "Kokkos::Impl::SharedAllocationRecord< Kokkos::Experimental::HBWSpace " - ", void >::get_record ERROR")); - } - - return record; -} - -// Iterate records to print orphaned memory ... -void SharedAllocationRecord:: - print_records(std::ostream &s, const Kokkos::Experimental::HBWSpace &space, - bool detail) { -#ifdef KOKKOS_ENABLE_DEBUG - SharedAllocationRecord::print_host_accessible_records( - s, "HBWSpace", &s_root_record, detail); -#else - throw_runtime_exception( - "SharedAllocationRecord::print_records" - " only works with KOKKOS_ENABLE_DEBUG enabled"); -#endif -} - -} // namespace Impl -} // namespace Kokkos - -#endif diff --git a/core/src/impl/Kokkos_Half_FloatingPointWrapper.hpp b/core/src/impl/Kokkos_Half_FloatingPointWrapper.hpp index 4a22898d168..bcce013b00e 100644 --- a/core/src/impl/Kokkos_Half_FloatingPointWrapper.hpp +++ b/core/src/impl/Kokkos_Half_FloatingPointWrapper.hpp @@ -196,12 +196,12 @@ KOKKOS_INLINE_FUNCTION template static KOKKOS_INLINE_FUNCTION Kokkos::Experimental::half_t cast_to_wrapper( - T x, const volatile Kokkos::Impl::half_impl_t::type&); + T x, const Kokkos::Impl::half_impl_t::type&); #ifdef KOKKOS_IMPL_BHALF_TYPE_DEFINED template static KOKKOS_INLINE_FUNCTION Kokkos::Experimental::bhalf_t cast_to_wrapper( - T x, const volatile Kokkos::Impl::bhalf_impl_t::type&); + T x, const Kokkos::Impl::bhalf_impl_t::type&); #endif // KOKKOS_IMPL_BHALF_TYPE_DEFINED template @@ -283,13 +283,6 @@ class alignas(FloatType) floating_point_wrapper { private: impl_type val; - using fixed_width_integer_type = std::conditional_t< - sizeof(impl_type) == 2, uint16_t, - std::conditional_t< - sizeof(impl_type) == 4, uint32_t, - std::conditional_t>>; - static_assert(!std::is_void::value, - "Invalid impl_type"); public: // In-class initialization and defaulted default constructors not used @@ -318,18 +311,6 @@ class alignas(FloatType) floating_point_wrapper { default; #endif - KOKKOS_INLINE_FUNCTION - floating_point_wrapper(const volatile floating_point_wrapper& rhs) { -#if defined(KOKKOS_HALF_IS_FULL_TYPE_ON_ARCH) && !defined(KOKKOS_ENABLE_SYCL) - val = rhs.val; -#else - const volatile fixed_width_integer_type* rv_ptr = - reinterpret_cast(&rhs.val); - const fixed_width_integer_type rv_val = *rv_ptr; - val = reinterpret_cast(rv_val); -#endif // KOKKOS_HALF_IS_FULL_TYPE_ON_ARCH - } - KOKKOS_FUNCTION floating_point_wrapper(bit_comparison_type rhs) { val = Kokkos::bit_cast(rhs); @@ -492,15 +473,6 @@ class alignas(FloatType) floating_point_wrapper { return *this; } - template - KOKKOS_FUNCTION void operator=(T rhs) volatile { - impl_type new_val = cast_to_wrapper(rhs, val).val; - volatile fixed_width_integer_type* val_ptr = - reinterpret_cast( - const_cast(&val)); - *val_ptr = reinterpret_cast(new_val); - } - // Compound operators KOKKOS_FUNCTION floating_point_wrapper& operator+=(floating_point_wrapper rhs) { @@ -515,15 +487,6 @@ class alignas(FloatType) floating_point_wrapper { return *this; } - KOKKOS_FUNCTION - void operator+=(const volatile floating_point_wrapper& rhs) volatile { - floating_point_wrapper tmp_rhs = rhs; - floating_point_wrapper tmp_lhs = *this; - - tmp_lhs += tmp_rhs; - *this = tmp_lhs; - } - // Compound operators: upcast overloads for += template KOKKOS_FUNCTION friend std::enable_if_t< @@ -560,15 +523,6 @@ class alignas(FloatType) floating_point_wrapper { return *this; } - KOKKOS_FUNCTION - void operator-=(const volatile 
floating_point_wrapper& rhs) volatile { - floating_point_wrapper tmp_rhs = rhs; - floating_point_wrapper tmp_lhs = *this; - - tmp_lhs -= tmp_rhs; - *this = tmp_lhs; - } - // Compund operators: upcast overloads for -= template KOKKOS_FUNCTION friend std::enable_if_t< @@ -605,15 +559,6 @@ class alignas(FloatType) floating_point_wrapper { return *this; } - KOKKOS_FUNCTION - void operator*=(const volatile floating_point_wrapper& rhs) volatile { - floating_point_wrapper tmp_rhs = rhs; - floating_point_wrapper tmp_lhs = *this; - - tmp_lhs *= tmp_rhs; - *this = tmp_lhs; - } - // Compund operators: upcast overloads for *= template KOKKOS_FUNCTION friend std::enable_if_t< @@ -650,15 +595,6 @@ class alignas(FloatType) floating_point_wrapper { return *this; } - KOKKOS_FUNCTION - void operator/=(const volatile floating_point_wrapper& rhs) volatile { - floating_point_wrapper tmp_rhs = rhs; - floating_point_wrapper tmp_lhs = *this; - - tmp_lhs /= tmp_rhs; - *this = tmp_lhs; - } - // Compund operators: upcast overloads for /= template KOKKOS_FUNCTION friend std::enable_if_t< @@ -884,27 +820,6 @@ class alignas(FloatType) floating_point_wrapper { #endif } - KOKKOS_FUNCTION - friend bool operator==(const volatile floating_point_wrapper& lhs, - const volatile floating_point_wrapper& rhs) { - floating_point_wrapper tmp_lhs = lhs, tmp_rhs = rhs; - return tmp_lhs == tmp_rhs; - } - - KOKKOS_FUNCTION - friend bool operator!=(const volatile floating_point_wrapper& lhs, - const volatile floating_point_wrapper& rhs) { - floating_point_wrapper tmp_lhs = lhs, tmp_rhs = rhs; - return tmp_lhs != tmp_rhs; - } - - KOKKOS_FUNCTION - friend bool operator<(const volatile floating_point_wrapper& lhs, - const volatile floating_point_wrapper& rhs) { - floating_point_wrapper tmp_lhs = lhs, tmp_rhs = rhs; - return tmp_lhs < tmp_rhs; - } - template KOKKOS_FUNCTION friend std::enable_if_t && (std::is_same_v || @@ -923,13 +838,6 @@ class alignas(FloatType) floating_point_wrapper { return lhs < static_cast(rhs); } - KOKKOS_FUNCTION - friend bool operator>(const volatile floating_point_wrapper& lhs, - const volatile floating_point_wrapper& rhs) { - floating_point_wrapper tmp_lhs = lhs, tmp_rhs = rhs; - return tmp_lhs > tmp_rhs; - } - template KOKKOS_FUNCTION friend std::enable_if_t && (std::is_same_v || @@ -948,13 +856,6 @@ class alignas(FloatType) floating_point_wrapper { return lhs > static_cast(rhs); } - KOKKOS_FUNCTION - friend bool operator<=(const volatile floating_point_wrapper& lhs, - const volatile floating_point_wrapper& rhs) { - floating_point_wrapper tmp_lhs = lhs, tmp_rhs = rhs; - return tmp_lhs <= tmp_rhs; - } - template KOKKOS_FUNCTION friend std::enable_if_t && (std::is_same_v || @@ -973,13 +874,6 @@ class alignas(FloatType) floating_point_wrapper { return lhs <= static_cast(rhs); } - KOKKOS_FUNCTION - friend bool operator>=(const volatile floating_point_wrapper& lhs, - const volatile floating_point_wrapper& rhs) { - floating_point_wrapper tmp_lhs = lhs, tmp_rhs = rhs; - return tmp_lhs >= tmp_rhs; - } - template KOKKOS_FUNCTION friend std::enable_if_t && (std::is_same_v || @@ -1018,14 +912,14 @@ class alignas(FloatType) floating_point_wrapper { // Declare wrapper overloads now that floating_point_wrapper is declared template static KOKKOS_INLINE_FUNCTION Kokkos::Experimental::half_t cast_to_wrapper( - T x, const volatile Kokkos::Impl::half_impl_t::type&) { + T x, const Kokkos::Impl::half_impl_t::type&) { return Kokkos::Experimental::cast_to_half(x); } #ifdef KOKKOS_IMPL_BHALF_TYPE_DEFINED template static 
KOKKOS_INLINE_FUNCTION Kokkos::Experimental::bhalf_t cast_to_wrapper( - T x, const volatile Kokkos::Impl::bhalf_impl_t::type&) { + T x, const Kokkos::Impl::bhalf_impl_t::type&) { return Kokkos::Experimental::cast_to_bhalf(x); } #endif // KOKKOS_IMPL_BHALF_TYPE_DEFINED diff --git a/core/src/impl/Kokkos_HostSpace.cpp b/core/src/impl/Kokkos_HostSpace.cpp index a9d72160593..1047b773d77 100644 --- a/core/src/impl/Kokkos_HostSpace.cpp +++ b/core/src/impl/Kokkos_HostSpace.cpp @@ -20,23 +20,11 @@ #include +#include +#include #include -#include #include -/*--------------------------------------------------------------------------*/ - -#if (defined(KOKKOS_COMPILER_INTEL) || defined(KOKKOS_COMPILER_INTEL_LLVM)) && \ - !defined(KOKKOS_ENABLE_CUDA) - -// Intel specialized allocator does not interoperate with CUDA memory allocation - -#define KOKKOS_ENABLE_INTEL_MM_ALLOC - -#endif - -/*--------------------------------------------------------------------------*/ - #include #include #include @@ -50,10 +38,6 @@ #include #endif -#include -#include -#include - //---------------------------------------------------------------------------- //---------------------------------------------------------------------------- @@ -150,84 +134,6 @@ void HostSpace::impl_deallocate( } // namespace Kokkos -//---------------------------------------------------------------------------- -//---------------------------------------------------------------------------- - -namespace Kokkos { -namespace Impl { - -#ifdef KOKKOS_ENABLE_DEBUG -SharedAllocationRecord - SharedAllocationRecord::s_root_record; -#endif - -SharedAllocationRecord::~SharedAllocationRecord() { - m_space.deallocate(m_label.c_str(), - SharedAllocationRecord::m_alloc_ptr, - SharedAllocationRecord::m_alloc_size, - (SharedAllocationRecord::m_alloc_size - - sizeof(SharedAllocationHeader))); -} - -SharedAllocationHeader *_do_allocation(Kokkos::HostSpace const &space, - std::string const &label, - size_t alloc_size) { - try { - return reinterpret_cast( - space.allocate(alloc_size)); - } catch (Experimental::RawMemoryAllocationFailure const &failure) { - if (failure.failure_mode() == Experimental::RawMemoryAllocationFailure:: - FailureMode::AllocationNotAligned) { - // TODO: delete the misaligned memory - } - - std::cerr << "Kokkos failed to allocate memory for label \"" << label - << "\". 
Allocation using MemorySpace named \"" << space.name() - << " failed with the following error: "; - failure.print_error_message(std::cerr); - std::cerr.flush(); - Kokkos::Impl::throw_runtime_exception("Memory allocation failure"); - } - return nullptr; // unreachable -} - -SharedAllocationRecord::SharedAllocationRecord( - const Kokkos::HostSpace &arg_space, const std::string &arg_label, - const size_t arg_alloc_size, - const SharedAllocationRecord::function_type arg_dealloc) - // Pass through allocated [ SharedAllocationHeader , user_memory ] - // Pass through deallocation function - : base_t( -#ifdef KOKKOS_ENABLE_DEBUG - &SharedAllocationRecord::s_root_record, -#endif - Impl::checked_allocation_with_header(arg_space, arg_label, - arg_alloc_size), - sizeof(SharedAllocationHeader) + arg_alloc_size, arg_dealloc, - arg_label), - m_space(arg_space) { - this->base_t::_fill_host_accessible_header_info(*RecordBase::m_alloc_ptr, - arg_label); -} - -} // namespace Impl -} // namespace Kokkos - -//============================================================================== -// {{{1 - #include -namespace Kokkos { -namespace Impl { - -// To avoid additional compilation cost for something that's (mostly?) not -// performance sensitive, we explicity instantiate these CRTP base classes here, -// where we have access to the associated *_timpl.hpp header files. -template class SharedAllocationRecordCommon; - -} // end namespace Impl -} // end namespace Kokkos - -// end Explicit instantiations of CRTP Base classes }}}1 -//============================================================================== +KOKKOS_IMPL_SHARED_ALLOCATION_RECORD_EXPLICIT_INSTANTIATION(Kokkos::HostSpace); diff --git a/core/src/impl/Kokkos_HostSpace_ZeroMemset.hpp b/core/src/impl/Kokkos_HostSpace_ZeroMemset.hpp index f740c408fb8..3072e2ce825 100644 --- a/core/src/impl/Kokkos_HostSpace_ZeroMemset.hpp +++ b/core/src/impl/Kokkos_HostSpace_ZeroMemset.hpp @@ -26,8 +26,7 @@ namespace Impl { template struct ZeroMemset> { - ZeroMemset(const HostSpace::execution_space& exec, const View& dst, - typename View::const_value_type&) { + ZeroMemset(const HostSpace::execution_space& exec, const View& dst) { // Host spaces, except for HPX, are synchronous and we need to fence for HPX // since we can't properly enqueue a std::memset otherwise. 
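// (This change drops the unused zero-value parameter and, just below,
// deletes the overload that took no execution space: zero-initialization
// now always routes through an execution space instance. A hypothetical
// call site would look roughly like
//   Kokkos::HostSpace::execution_space exec;
//   Kokkos::Impl::ZeroMemset<decltype(exec), decltype(v)>(exec, v);
// for a host View v, following the specialization pattern above.)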
// We can't use exec.fence() directly since we don't have a full definition @@ -36,12 +35,6 @@ struct ZeroMemset> { using ValueType = typename View::value_type; std::memset(dst.data(), 0, sizeof(ValueType) * dst.size()); } - - ZeroMemset(const View& dst, - typename View::const_value_type&) { - using ValueType = typename View::value_type; - std::memset(dst.data(), 0, sizeof(ValueType) * dst.size()); - } }; } // end namespace Impl diff --git a/core/src/impl/Kokkos_HostThreadTeam.cpp b/core/src/impl/Kokkos_HostThreadTeam.cpp index bfe5902bf7f..11bf701b57a 100644 --- a/core/src/impl/Kokkos_HostThreadTeam.cpp +++ b/core/src/impl/Kokkos_HostThreadTeam.cpp @@ -22,7 +22,6 @@ #include #include #include -#include //---------------------------------------------------------------------------- //---------------------------------------------------------------------------- diff --git a/core/src/impl/Kokkos_HostThreadTeam.hpp b/core/src/impl/Kokkos_HostThreadTeam.hpp index 51f25a8b60f..25f09b82865 100644 --- a/core/src/impl/Kokkos_HostThreadTeam.hpp +++ b/core/src/impl/Kokkos_HostThreadTeam.hpp @@ -885,7 +885,7 @@ KOKKOS_INLINE_FUNCTION closure(i, accum, false); } - auto team_member = loop_boundaries.thread; + auto& team_member = loop_boundaries.thread; // 'accum' output is the exclusive prefix sum accum = team_member.team_scan(accum); diff --git a/core/src/impl/Kokkos_InitializationSettings.hpp b/core/src/impl/Kokkos_InitializationSettings.hpp index ab4350f3a7a..11a93c6bb56 100644 --- a/core/src/impl/Kokkos_InitializationSettings.hpp +++ b/core/src/impl/Kokkos_InitializationSettings.hpp @@ -24,32 +24,6 @@ namespace Kokkos { -#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_3 -struct InitArguments { - int num_threads; - int num_numa; - int device_id; - int ndevices; - int skip_device; - bool disable_warnings; - bool tune_internals; - bool tool_help = false; - std::string tool_lib = {}; - std::string tool_args = {}; - - KOKKOS_DEPRECATED_WITH_COMMENT("Use InitializationSettings instead!") - InitArguments(int nt = -1, int nn = -1, int dv = -1, bool dw = false, - bool ti = false) - : num_threads{nt}, - num_numa{nn}, - device_id{dv}, - ndevices{-1}, - skip_device{9999}, - disable_warnings{dw}, - tune_internals{ti} {} -}; -#endif - class InitializationSettings { #define KOKKOS_IMPL_DECLARE(TYPE, NAME) \ private: \ @@ -64,12 +38,32 @@ class InitializationSettings { TYPE get_##NAME() const noexcept { return *m_##NAME; } \ static_assert(true, "no-op to require trailing semicolon") +#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 +#define KOKKOS_IMPL_DECLARE_DEPRECATED(TYPE, NAME) \ + private: \ + std::optional m_##NAME; \ + \ + public: \ + KOKKOS_DEPRECATED InitializationSettings& set_##NAME(TYPE NAME) { \ + m_##NAME = NAME; \ + return *this; \ + } \ + KOKKOS_DEPRECATED bool has_##NAME() const noexcept { \ + return static_cast(m_##NAME); \ + } \ + KOKKOS_DEPRECATED TYPE get_##NAME() const noexcept { return *m_##NAME; } \ + static_assert(true, "no-op to require trailing semicolon") +#else +#define KOKKOS_IMPL_DECLARE_DEPRECATED(TYPE, NAME) \ + static_assert(true, "no-op to require trailing semicolon") +#endif + public: KOKKOS_IMPL_DECLARE(int, num_threads); KOKKOS_IMPL_DECLARE(int, device_id); KOKKOS_IMPL_DECLARE(std::string, map_device_id_by); - KOKKOS_IMPL_DECLARE(int, num_devices); // deprecated - KOKKOS_IMPL_DECLARE(int, skip_device); // deprecated + KOKKOS_IMPL_DECLARE_DEPRECATED(int, num_devices); + KOKKOS_IMPL_DECLARE_DEPRECATED(int, skip_device); KOKKOS_IMPL_DECLARE(bool, disable_warnings); KOKKOS_IMPL_DECLARE(bool, 
print_configuration); KOKKOS_IMPL_DECLARE(bool, tune_internals); @@ -80,41 +74,6 @@ class InitializationSettings { #undef KOKKOS_IMPL_INIT_ARGS_DATA_MEMBER_TYPE #undef KOKKOS_IMPL_INIT_ARGS_DATA_MEMBER #undef KOKKOS_IMPL_DECLARE - -#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_3 - public: - InitializationSettings() = default; - - InitializationSettings(InitArguments const& old) { - if (old.num_threads != -1) { - set_num_threads(old.num_threads); - } - if (old.device_id != -1) { - set_device_id(old.device_id); - } - if (old.ndevices != -1) { - set_num_devices(old.ndevices); - } - if (old.skip_device != 9999) { - set_skip_device(old.skip_device); - } - if (old.disable_warnings) { - set_disable_warnings(true); - } - if (old.tune_internals) { - set_tune_internals(true); - } - if (old.tool_help) { - set_tools_help(true); - } - if (!old.tool_lib.empty()) { - set_tools_libs(old.tool_lib); - } - if (!old.tool_args.empty()) { - set_tools_args(old.tool_args); - } - } -#endif }; } // namespace Kokkos diff --git a/core/src/impl/Kokkos_MemorySpace.cpp b/core/src/impl/Kokkos_MemorySpace.cpp deleted file mode 100644 index 2f0e01c5b28..00000000000 --- a/core/src/impl/Kokkos_MemorySpace.cpp +++ /dev/null @@ -1,72 +0,0 @@ -//@HEADER -// ************************************************************************ -// -// Kokkos v. 4.0 -// Copyright (2022) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. -// -// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. -// See https://kokkos.org/LICENSE for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//@HEADER - -/** @file Kokkos_MemorySpace.cpp - * - * Operations common to memory space instances, or at least default - * implementations thereof. - */ - -#ifndef KOKKOS_IMPL_PUBLIC_INCLUDE -#define KOKKOS_IMPL_PUBLIC_INCLUDE -#endif - -#include - -#include -#include -#include - -namespace Kokkos { -namespace Impl { - -void safe_throw_allocation_with_header_failure( - std::string const& space_name, std::string const& label, - Kokkos::Experimental::RawMemoryAllocationFailure const& failure) { - auto generate_failure_message = [&](std::ostream& o) { - o << "Kokkos failed to allocate memory for label \"" << label - << "\". Allocation using MemorySpace named \"" << space_name - << "\" failed with the following error: "; - failure.print_error_message(o); - if (failure.failure_mode() == - Kokkos::Experimental::RawMemoryAllocationFailure::FailureMode:: - AllocationNotAligned) { - // TODO: delete the misaligned memory? - o << "Warning: Allocation failed due to misalignment; memory may " - "be leaked.\n"; - } - o.flush(); - }; - try { - std::ostringstream sstr; - generate_failure_message(sstr); - Kokkos::Impl::throw_runtime_exception(sstr.str()); - } catch (std::bad_alloc const&) { - // Probably failed to allocate the string because we're so close to out - // of memory. Try printing to std::cerr instead - try { - generate_failure_message(std::cerr); - } catch (std::bad_alloc const&) { - // oh well, we tried... 
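// (Both attempts to format the detailed message can themselves throw
// std::bad_alloc when the process is this close to out of memory; control
// then falls through to the unconditional throw below, which reports that
// the error message itself could not be created. This helper is deleted
// here and re-added verbatim to core/src/impl/Kokkos_SharedAlloc.cpp later
// in this diff.)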
- } - Kokkos::Impl::throw_runtime_exception( - "Kokkos encountered an allocation failure, then another allocation " - "failure while trying to create the error message."); - } -} - -} // end namespace Impl -} // end namespace Kokkos diff --git a/core/src/impl/Kokkos_MemorySpace.hpp b/core/src/impl/Kokkos_MemorySpace.hpp deleted file mode 100644 index 44956dd7c5d..00000000000 --- a/core/src/impl/Kokkos_MemorySpace.hpp +++ /dev/null @@ -1,71 +0,0 @@ -//@HEADER -// ************************************************************************ -// -// Kokkos v. 4.0 -// Copyright (2022) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. -// -// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. -// See https://kokkos.org/LICENSE for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//@HEADER - -/** @file Kokkos_MemorySpace.hpp - * - * Operations common to memory space instances, or at least default - * implementations thereof. - */ - -#ifndef KOKKOS_IMPL_MEMORYSPACE_HPP -#define KOKKOS_IMPL_MEMORYSPACE_HPP - -#include -#include -#include - -#include - -namespace Kokkos { -namespace Impl { - -// Defined in implementation file to avoid having to include iostream -void safe_throw_allocation_with_header_failure( - std::string const &space_name, std::string const &label, - Kokkos::Experimental::RawMemoryAllocationFailure const &failure); - -template -SharedAllocationHeader *checked_allocation_with_header(MemorySpace const &space, - std::string const &label, - size_t alloc_size) { - try { - return reinterpret_cast(space.allocate( - label.c_str(), alloc_size + sizeof(SharedAllocationHeader), - alloc_size)); - } catch (Kokkos::Experimental::RawMemoryAllocationFailure const &failure) { - safe_throw_allocation_with_header_failure(space.name(), label, failure); - } - return nullptr; // unreachable -} - -template -SharedAllocationHeader *checked_allocation_with_header( - ExecutionSpace const &exec_space, MemorySpace const &space, - std::string const &label, size_t alloc_size) { - try { - return reinterpret_cast(space.allocate( - exec_space, label.c_str(), alloc_size + sizeof(SharedAllocationHeader), - alloc_size)); - } catch (Kokkos::Experimental::RawMemoryAllocationFailure const &failure) { - safe_throw_allocation_with_header_failure(space.name(), label, failure); - } - return nullptr; // unreachable -} - -} // end namespace Impl -} // end namespace Kokkos - -#endif // KOKKOS_IMPL_MEMORYSPACE_HPP diff --git a/core/src/impl/Kokkos_Memory_Fence.hpp b/core/src/impl/Kokkos_Memory_Fence.hpp deleted file mode 100644 index 42a53b04fb2..00000000000 --- a/core/src/impl/Kokkos_Memory_Fence.hpp +++ /dev/null @@ -1,54 +0,0 @@ -//@HEADER -// ************************************************************************ -// -// Kokkos v. 4.0 -// Copyright (2022) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. -// -// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. -// See https://kokkos.org/LICENSE for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//@HEADER - -#include -#if defined(KOKKOS_ATOMIC_HPP) && !defined(KOKKOS_MEMORY_FENCE_HPP) -#define KOKKOS_MEMORY_FENCE_HPP -namespace Kokkos { - -////////////////////////////////////////////////////// -// store_fence() -// -// If possible use a store fence on the architecture, if not run a full memory -// fence - -KOKKOS_FORCEINLINE_FUNCTION -void store_fence() { -#if defined(KOKKOS_ENABLE_ASM) && defined(KOKKOS_ENABLE_ISA_X86_64) - asm volatile("sfence" ::: "memory"); -#else - memory_fence(); -#endif -} - -////////////////////////////////////////////////////// -// load_fence() -// -// If possible use a load fence on the architecture, if not run a full memory -// fence - -KOKKOS_FORCEINLINE_FUNCTION -void load_fence() { -#if defined(KOKKOS_ENABLE_ASM) && defined(KOKKOS_ENABLE_ISA_X86_64) - asm volatile("lfence" ::: "memory"); -#else - memory_fence(); -#endif -} - -} // namespace Kokkos - -#endif diff --git a/core/src/impl/Kokkos_Profiling_C_Interface.h b/core/src/impl/Kokkos_Profiling_C_Interface.h index 731a11e917a..15c466b27ed 100644 --- a/core/src/impl/Kokkos_Profiling_C_Interface.h +++ b/core/src/impl/Kokkos_Profiling_C_Interface.h @@ -154,7 +154,7 @@ enum Kokkos_Tools_OptimizationType { Kokkos_Tools_Maximize }; -struct Kokkos_Tools_OptimzationGoal { +struct Kokkos_Tools_OptimizationGoal { size_t type_id; enum Kokkos_Tools_OptimizationType goal; }; @@ -220,7 +220,7 @@ typedef void (*Kokkos_Tools_contextBeginFunction)(const size_t); typedef void (*Kokkos_Tools_contextEndFunction)( const size_t, struct Kokkos_Tools_VariableValue); typedef void (*Kokkos_Tools_optimizationGoalDeclarationFunction)( - const size_t, const struct Kokkos_Tools_OptimzationGoal goal); + const size_t, const struct Kokkos_Tools_OptimizationGoal goal); struct Kokkos_Profiling_EventSet { Kokkos_Profiling_initFunction init; diff --git a/core/src/impl/Kokkos_Profiling_Interface.hpp b/core/src/impl/Kokkos_Profiling_Interface.hpp index af71932e47b..b66886d9f7e 100644 --- a/core/src/impl/Kokkos_Profiling_Interface.hpp +++ b/core/src/impl/Kokkos_Profiling_Interface.hpp @@ -226,7 +226,7 @@ using ValueType = Kokkos_Tools_VariableInfo_ValueType; using CandidateValueType = Kokkos_Tools_VariableInfo_CandidateValueType; using SetOrRange = Kokkos_Tools_VariableInfo_SetOrRange; using VariableInfo = Kokkos_Tools_VariableInfo; -using OptimizationGoal = Kokkos_Tools_OptimzationGoal; +using OptimizationGoal = Kokkos_Tools_OptimizationGoal; using TuningString = Kokkos_Tools_Tuning_String; using VariableValue = Kokkos_Tools_VariableValue; diff --git a/core/src/impl/Kokkos_SharedAlloc.cpp b/core/src/impl/Kokkos_SharedAlloc.cpp index 255f5125f4a..0bc3814b3a1 100644 --- a/core/src/impl/Kokkos_SharedAlloc.cpp +++ b/core/src/impl/Kokkos_SharedAlloc.cpp @@ -20,6 +20,8 @@ #include #include +#include +#include namespace Kokkos { namespace Impl { @@ -321,5 +323,53 @@ void SharedAllocationRecord::print_host_accessible_records( } #endif +void safe_throw_allocation_with_header_failure( + std::string const& space_name, std::string const& label, + Kokkos::Experimental::RawMemoryAllocationFailure const& failure) { + auto generate_failure_message = [&](std::ostream& o) { + o << "Kokkos failed to allocate memory for label \"" << label + << "\". 
Allocation using MemorySpace named \"" << space_name + << "\" failed with the following error: "; + failure.print_error_message(o); + if (failure.failure_mode() == + Kokkos::Experimental::RawMemoryAllocationFailure::FailureMode:: + AllocationNotAligned) { + // TODO: delete the misaligned memory? + o << "Warning: Allocation failed due to misalignment; memory may " + "be leaked.\n"; + } + o.flush(); + }; + try { + std::ostringstream sstr; + generate_failure_message(sstr); + Kokkos::Impl::throw_runtime_exception(sstr.str()); + } catch (std::bad_alloc const&) { + // Probably failed to allocate the string because we're so close to out + // of memory. Try printing to std::cerr instead + try { + generate_failure_message(std::cerr); + } catch (std::bad_alloc const&) { + // oh well, we tried... + } + Kokkos::Impl::throw_runtime_exception( + "Kokkos encountered an allocation failure, then another allocation " + "failure while trying to create the error message."); + } +} + +void fill_host_accessible_header_info( + SharedAllocationRecord* arg_record, + SharedAllocationHeader& arg_header, std::string const& arg_label) { + // Fill in the Header information, directly accessible on the host + + arg_header.m_record = arg_record; + + strncpy(arg_header.m_label, arg_label.c_str(), + SharedAllocationHeader::maximum_label_length); + // Set last element zero, in case c_str is too long + arg_header.m_label[SharedAllocationHeader::maximum_label_length - 1] = '\0'; +} + } /* namespace Impl */ } /* namespace Kokkos */ diff --git a/core/src/impl/Kokkos_SharedAlloc.hpp b/core/src/impl/Kokkos_SharedAlloc.hpp index 043505a158e..99ab660213f 100644 --- a/core/src/impl/Kokkos_SharedAlloc.hpp +++ b/core/src/impl/Kokkos_SharedAlloc.hpp @@ -51,6 +51,9 @@ class SharedAllocationHeader { friend class SharedAllocationRecordCommon; template friend class HostInaccessibleSharedAllocationRecordCommon; + friend void fill_host_accessible_header_info( + SharedAllocationRecord*, SharedAllocationHeader&, + std::string const&); Record* m_record; char m_label[maximum_label_length]; @@ -145,25 +148,23 @@ class SharedAllocationRecord { SharedAllocationRecord() : m_alloc_ptr(nullptr), m_alloc_size(0), - m_dealloc(nullptr) + m_dealloc(nullptr), #ifdef KOKKOS_ENABLE_DEBUG - , m_root(this), m_prev(this), - m_next(this) + m_next(this), #endif - , m_count(0) { } static constexpr unsigned maximum_label_length = SharedAllocationHeader::maximum_label_length; - KOKKOS_INLINE_FUNCTION + KOKKOS_FUNCTION const SharedAllocationHeader* head() const { return m_alloc_ptr; } /* User's memory begins at the end of the header */ - KOKKOS_INLINE_FUNCTION + KOKKOS_FUNCTION void* data() const { return static_cast(m_alloc_ptr + 1); } /* User's memory begins at the end of the header */ @@ -195,23 +196,79 @@ class SharedAllocationRecord { const SharedAllocationRecord* const root, const bool detail); }; +void safe_throw_allocation_with_header_failure( + std::string const& space_name, std::string const& label, + Kokkos::Experimental::RawMemoryAllocationFailure const& failure); + +template +SharedAllocationHeader* checked_allocation_with_header(MemorySpace const& space, + std::string const& label, + size_t alloc_size) { + try { + return reinterpret_cast(space.allocate( + label.c_str(), alloc_size + sizeof(SharedAllocationHeader), + alloc_size)); + } catch (Kokkos::Experimental::RawMemoryAllocationFailure const& failure) { + safe_throw_allocation_with_header_failure(space.name(), label, failure); + } + return nullptr; // unreachable +} + +template 
+SharedAllocationHeader* checked_allocation_with_header( + ExecutionSpace const& exec_space, MemorySpace const& space, + std::string const& label, size_t alloc_size) { + try { + return reinterpret_cast(space.allocate( + exec_space, label.c_str(), alloc_size + sizeof(SharedAllocationHeader), + alloc_size)); + } catch (Kokkos::Experimental::RawMemoryAllocationFailure const& failure) { + safe_throw_allocation_with_header_failure(space.name(), label, failure); + } + return nullptr; // unreachable +} + +void fill_host_accessible_header_info(SharedAllocationHeader& arg_header, + std::string const& arg_label); + template class SharedAllocationRecordCommon : public SharedAllocationRecord { private: using derived_t = SharedAllocationRecord; using record_base_t = SharedAllocationRecord; - derived_t& self() { return *static_cast(this); } - derived_t const& self() const { return *static_cast(this); } protected: using record_base_t::record_base_t; - void _fill_host_accessible_header_info(SharedAllocationHeader& arg_header, - std::string const& arg_label); + MemorySpace m_space; + +#ifdef KOKKOS_ENABLE_DEBUG + static record_base_t s_root_record; +#endif static void deallocate(record_base_t* arg_rec); public: + ~SharedAllocationRecordCommon(); + template + SharedAllocationRecordCommon( + ExecutionSpace const& exec, MemorySpace const& space, + std::string const& label, std::size_t alloc_size, + record_base_t::function_type dealloc = &deallocate) + : SharedAllocationRecord( +#ifdef KOKKOS_ENABLE_DEBUG + &s_root_record, +#endif + checked_allocation_with_header(exec, space, label, alloc_size), + sizeof(SharedAllocationHeader) + alloc_size, dealloc, label), + m_space(space) { + auto& header = *SharedAllocationRecord::m_alloc_ptr; + fill_host_accessible_header_info(this, header, label); + } + SharedAllocationRecordCommon( + MemorySpace const& space, std::string const& label, std::size_t size, + record_base_t::function_type dealloc = &deallocate); + static auto allocate(MemorySpace const& arg_space, std::string const& arg_label, size_t arg_alloc_size) -> derived_t*; @@ -231,22 +288,103 @@ class SharedAllocationRecordCommon : public SharedAllocationRecord { template class HostInaccessibleSharedAllocationRecordCommon - : public SharedAllocationRecordCommon { + : public SharedAllocationRecord { private: - using base_t = SharedAllocationRecordCommon; using derived_t = SharedAllocationRecord; using record_base_t = SharedAllocationRecord; protected: - using base_t::base_t; + using record_base_t::record_base_t; + + MemorySpace m_space; + +#ifdef KOKKOS_ENABLE_DEBUG + static record_base_t s_root_record; +#endif + + static void deallocate(record_base_t* arg_rec); public: + ~HostInaccessibleSharedAllocationRecordCommon(); + template + HostInaccessibleSharedAllocationRecordCommon( + ExecutionSpace const& exec, MemorySpace const& space, + std::string const& label, std::size_t alloc_size, + record_base_t::function_type dealloc = &deallocate) + : SharedAllocationRecord( +#ifdef KOKKOS_ENABLE_DEBUG + &s_root_record, +#endif + checked_allocation_with_header(exec, space, label, alloc_size), + sizeof(SharedAllocationHeader) + alloc_size, dealloc, label), + m_space(space) { + SharedAllocationHeader header; + + fill_host_accessible_header_info(this, header, label); + + Kokkos::Impl::DeepCopy( + exec, SharedAllocationRecord::m_alloc_ptr, &header, + sizeof(SharedAllocationHeader)); + } + HostInaccessibleSharedAllocationRecordCommon( + MemorySpace const& space, std::string const& label, std::size_t size, + 
record_base_t::function_type dealloc = &deallocate); + + static auto allocate(MemorySpace const& arg_space, + std::string const& arg_label, size_t arg_alloc_size) + -> derived_t*; + /**\brief Allocate tracked memory in the space */ + static void* allocate_tracked(MemorySpace const& arg_space, + std::string const& arg_alloc_label, + size_t arg_alloc_size); + /**\brief Reallocate tracked memory in the space */ + static void deallocate_tracked(void* arg_alloc_ptr); + /**\brief Deallocate tracked memory in the space */ + static void* reallocate_tracked(void* arg_alloc_ptr, size_t arg_alloc_size); + static void print_records(std::ostream& s, MemorySpace const&, bool detail = false); static auto get_record(void* alloc_ptr) -> derived_t*; std::string get_label() const; }; +#ifdef KOKKOS_ENABLE_DEBUG +template +SharedAllocationRecord + SharedAllocationRecordCommon::s_root_record; + +template +SharedAllocationRecord + HostInaccessibleSharedAllocationRecordCommon::s_root_record; +#endif + +#define KOKKOS_IMPL_SHARED_ALLOCATION_SPECIALIZATION(MEMORY_SPACE) \ + template <> \ + class Kokkos::Impl::SharedAllocationRecord \ + : public Kokkos::Impl::SharedAllocationRecordCommon { \ + using SharedAllocationRecordCommon< \ + MEMORY_SPACE>::SharedAllocationRecordCommon; \ + } + +#define KOKKOS_IMPL_HOST_INACCESSIBLE_SHARED_ALLOCATION_SPECIALIZATION( \ + MEMORY_SPACE) \ + template <> \ + class Kokkos::Impl::SharedAllocationRecord \ + : public Kokkos::Impl::HostInaccessibleSharedAllocationRecordCommon< \ + MEMORY_SPACE> { \ + using HostInaccessibleSharedAllocationRecordCommon< \ + MEMORY_SPACE>::HostInaccessibleSharedAllocationRecordCommon; \ + } + +#define KOKKOS_IMPL_SHARED_ALLOCATION_RECORD_EXPLICIT_INSTANTIATION( \ + MEMORY_SPACE) \ + template class Kokkos::Impl::SharedAllocationRecordCommon + +#define KOKKOS_IMPL_HOST_INACCESSIBLE_SHARED_ALLOCATION_RECORD_EXPLICIT_INSTANTIATION( \ + MEMORY_SPACE) \ + template class Kokkos::Impl::HostInaccessibleSharedAllocationRecordCommon< \ + MEMORY_SPACE> + namespace { /* Taking the address of this function so make sure it is unique */ diff --git a/core/src/impl/Kokkos_SharedAlloc_timpl.hpp b/core/src/impl/Kokkos_SharedAlloc_timpl.hpp index d403ef9db06..41036ab0678 100644 --- a/core/src/impl/Kokkos_SharedAlloc_timpl.hpp +++ b/core/src/impl/Kokkos_SharedAlloc_timpl.hpp @@ -31,6 +31,66 @@ namespace Kokkos { namespace Impl { +template +SharedAllocationRecordCommon::~SharedAllocationRecordCommon() { + auto alloc_ptr = SharedAllocationRecord::m_alloc_ptr; + auto alloc_size = SharedAllocationRecord::m_alloc_size; + auto label = SharedAllocationRecord::m_label; + m_space.deallocate(label.c_str(), alloc_ptr, alloc_size, + alloc_size - sizeof(SharedAllocationHeader)); +} +template +HostInaccessibleSharedAllocationRecordCommon< + MemorySpace>::~HostInaccessibleSharedAllocationRecordCommon() { + auto alloc_ptr = SharedAllocationRecord::m_alloc_ptr; + auto alloc_size = SharedAllocationRecord::m_alloc_size; + auto label = SharedAllocationRecord::m_label; + m_space.deallocate(label.c_str(), alloc_ptr, alloc_size, + alloc_size - sizeof(SharedAllocationHeader)); +} + +template +SharedAllocationRecordCommon::SharedAllocationRecordCommon( + MemorySpace const& space, std::string const& label, std::size_t alloc_size, + SharedAllocationRecord::function_type dealloc) + : SharedAllocationRecord( +#ifdef KOKKOS_ENABLE_DEBUG + &s_root_record, +#endif + checked_allocation_with_header(space, label, alloc_size), + sizeof(SharedAllocationHeader) + alloc_size, dealloc, label), + m_space(space) { + 
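+  // Layout produced by checked_allocation_with_header():
+  //
+  //   m_alloc_ptr --> [ SharedAllocationHeader | alloc_size bytes of user data ]
+  //
+  // data() is m_alloc_ptr + 1, and the destructor reports
+  // m_alloc_size - sizeof(SharedAllocationHeader) as the logical size.
+  // This space is host accessible, so the header is filled in place: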
auto& header = *SharedAllocationRecord::m_alloc_ptr; + fill_host_accessible_header_info(this, header, label); +} + +template +HostInaccessibleSharedAllocationRecordCommon:: + HostInaccessibleSharedAllocationRecordCommon( + MemorySpace const& space, std::string const& label, + std::size_t alloc_size, + SharedAllocationRecord::function_type dealloc) + : SharedAllocationRecord( +#ifdef KOKKOS_ENABLE_DEBUG + &s_root_record, +#endif + checked_allocation_with_header(space, label, alloc_size), + sizeof(SharedAllocationHeader) + alloc_size, dealloc, label), + m_space(space) { + SharedAllocationHeader header; + + fill_host_accessible_header_info(this, header, label); + + typename MemorySpace::execution_space exec; + Kokkos::Impl::DeepCopy( + exec, SharedAllocationRecord::m_alloc_ptr, &header, + sizeof(SharedAllocationHeader)); + exec.fence(std::string("SharedAllocationRecord::SharedAllocationRecord(): " + "fence after copying header from HostSpace"); +} + template auto SharedAllocationRecordCommon::allocate( MemorySpace const& arg_space, std::string const& arg_label, @@ -76,9 +136,64 @@ void* SharedAllocationRecordCommon::reallocate_tracked( Kokkos::Impl::DeepCopy( r_new->data(), r_old->data(), std::min(r_old->size(), r_new->size())); - Kokkos::fence( - "SharedAllocationRecord::reallocate_tracked(): fence after copying data"); + Kokkos::fence(std::string("SharedAllocationRecord<") + MemorySpace::name() + + ", void>::reallocate_tracked(): fence after copying data"); + + record_base_t::increment(r_new); + record_base_t::decrement(r_old); + + return r_new->data(); +} + +template +auto HostInaccessibleSharedAllocationRecordCommon::allocate( + MemorySpace const& arg_space, std::string const& arg_label, + size_t arg_alloc_size) -> derived_t* { + return new derived_t(arg_space, arg_label, arg_alloc_size); +} + +template +void* HostInaccessibleSharedAllocationRecordCommon< + MemorySpace>::allocate_tracked(const MemorySpace& arg_space, + const std::string& arg_alloc_label, + size_t arg_alloc_size) { + if (!arg_alloc_size) return nullptr; + + SharedAllocationRecord* const r = + allocate(arg_space, arg_alloc_label, arg_alloc_size); + + record_base_t::increment(r); + + return r->data(); +} + +template +void HostInaccessibleSharedAllocationRecordCommon::deallocate( + HostInaccessibleSharedAllocationRecordCommon::record_base_t* arg_rec) { + delete static_cast(arg_rec); +} + +template +void HostInaccessibleSharedAllocationRecordCommon< + MemorySpace>::deallocate_tracked(void* arg_alloc_ptr) { + if (arg_alloc_ptr != nullptr) { + SharedAllocationRecord* const r = derived_t::get_record(arg_alloc_ptr); + record_base_t::decrement(r); + } +} + +template +void* HostInaccessibleSharedAllocationRecordCommon< + MemorySpace>::reallocate_tracked(void* arg_alloc_ptr, + size_t arg_alloc_size) { + derived_t* const r_old = derived_t::get_record(arg_alloc_ptr); + derived_t* const r_new = + allocate(r_old->m_space, r_old->get_label(), arg_alloc_size); + + Kokkos::Impl::DeepCopy( + r_new->data(), r_old->data(), std::min(r_old->size(), r_new->size())); + Kokkos::fence(std::string("SharedAllocationRecord<") + MemorySpace::name() + + ", void>::reallocate_tracked(): fence after copying data"); record_base_t::increment(r_new); record_base_t::decrement(r_old); @@ -108,20 +223,6 @@ std::string SharedAllocationRecordCommon::get_label() const { return record_base_t::m_label; } -template -void SharedAllocationRecordCommon:: - _fill_host_accessible_header_info(SharedAllocationHeader& arg_header, - std::string const& arg_label) { - // Fill in 
the Header information, directly accessible on the host - - arg_header.m_record = &self(); - - strncpy(arg_header.m_label, arg_label.c_str(), - SharedAllocationHeader::maximum_label_length); - // Set last element zero, in case c_str is too long - arg_header.m_label[SharedAllocationHeader::maximum_label_length - 1] = '\0'; -} - template void SharedAllocationRecordCommon::print_records( std::ostream& s, const MemorySpace&, bool detail) { diff --git a/core/src/impl/Kokkos_Spinwait.hpp b/core/src/impl/Kokkos_Spinwait.hpp deleted file mode 100644 index c57b17d646a..00000000000 --- a/core/src/impl/Kokkos_Spinwait.hpp +++ /dev/null @@ -1,109 +0,0 @@ -//@HEADER -// ************************************************************************ -// -// Kokkos v. 4.0 -// Copyright (2022) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. -// -// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. -// See https://kokkos.org/LICENSE for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//@HEADER - -#ifndef KOKKOS_SPINWAIT_HPP -#define KOKKOS_SPINWAIT_HPP - -#include -#include - -#include - -#include - -namespace Kokkos { -namespace Impl { - -enum class WaitMode : int { - ACTIVE // Used for tight loops to keep threads active longest - , - PASSIVE // Used to quickly yield the thread to quite down the system - , - ROOT // Never sleep or yield the root thread -}; - -void host_thread_yield(const uint32_t i, const WaitMode mode); - -template -std::enable_if_t::value, void> root_spinwait_while_equal( - T const volatile& flag, const T value) { - Kokkos::store_fence(); - uint32_t i = 0; - while (value == flag) { - host_thread_yield(++i, WaitMode::ROOT); - } - Kokkos::load_fence(); -} - -template -std::enable_if_t::value, void> root_spinwait_until_equal( - T const volatile& flag, const T value) { - Kokkos::store_fence(); - uint32_t i = 0; - while (value != flag) { - host_thread_yield(++i, WaitMode::ROOT); - } - Kokkos::load_fence(); -} - -template -std::enable_if_t::value, void> spinwait_while_equal( - T const volatile& flag, const T value) { - Kokkos::store_fence(); - uint32_t i = 0; - while (value == flag) { - host_thread_yield(++i, WaitMode::ACTIVE); - } - Kokkos::load_fence(); -} - -template -std::enable_if_t::value, void> yield_while_equal( - T const volatile& flag, const T value) { - Kokkos::store_fence(); - uint32_t i = 0; - while (value == flag) { - host_thread_yield(++i, WaitMode::PASSIVE); - } - Kokkos::load_fence(); -} - -template -std::enable_if_t::value, void> spinwait_until_equal( - T const volatile& flag, const T value) { - Kokkos::store_fence(); - uint32_t i = 0; - while (value != flag) { - host_thread_yield(++i, WaitMode::ACTIVE); - } - Kokkos::load_fence(); -} - -template -std::enable_if_t::value, void> yield_until_equal( - T const volatile& flag, const T value) { - Kokkos::store_fence(); - uint32_t i = 0; - while (value != flag) { - host_thread_yield(++i, WaitMode::PASSIVE); - } - Kokkos::load_fence(); -} - -} /* namespace Impl */ -} /* namespace Kokkos */ - -#endif /* #ifndef KOKKOS_SPINWAIT_HPP */ diff --git a/core/src/impl/Kokkos_Utilities.hpp b/core/src/impl/Kokkos_Utilities.hpp index 7e2f130564f..cadeed1a6d8 100644 --- a/core/src/impl/Kokkos_Utilities.hpp +++ b/core/src/impl/Kokkos_Utilities.hpp @@ -49,6 +49,11 @@ struct integral_constant { template struct always_true : std::true_type {}; +// 
type-dependent expression that is always false intended for use in +// static_assert to check "we should never get there" +template +struct always_false : std::false_type {}; + //============================================================================== #if defined(__cpp_lib_type_identity) diff --git a/core/src/impl/Kokkos_ViewArray.hpp b/core/src/impl/Kokkos_ViewArray.hpp index 725ba5de092..fe43b630184 100644 --- a/core/src/impl/Kokkos_ViewArray.hpp +++ b/core/src/impl/Kokkos_ViewArray.hpp @@ -27,10 +27,9 @@ struct ViewDataAnalysis> { private: using array_analysis = ViewArrayAnalysis; - static_assert(std::is_void
<typename array_analysis::specialize>::value, ""); + static_assert(std::is_void<typename array_analysis::specialize>
::value); static_assert(std::is_same>::value, - ""); + Kokkos::Array>::value); static_assert(std::is_scalar::value, "View of Array type must be of a scalar type"); @@ -130,6 +129,12 @@ class ViewMapping> { return m_impl_offset.m_dim.extent(r); } + static KOKKOS_INLINE_FUNCTION constexpr size_t static_extent( + const unsigned r) noexcept { + using dim_type = typename offset_type::dimension_type; + return dim_type::static_extent(r); + } + KOKKOS_INLINE_FUNCTION constexpr typename Traits::array_layout layout() const { return m_impl_offset.layout(); @@ -507,7 +512,7 @@ class ViewMapping< Kokkos::LayoutStride>::value))>, SrcTraits, Args...> { private: - static_assert(SrcTraits::rank == sizeof...(Args), ""); + static_assert(SrcTraits::rank == sizeof...(Args)); enum : bool { R0 = is_integral_extent<0, Args...>::value, diff --git a/core/src/impl/Kokkos_ViewDataAnalysis.hpp b/core/src/impl/Kokkos_ViewDataAnalysis.hpp new file mode 100644 index 00000000000..04c0c9aeede --- /dev/null +++ b/core/src/impl/Kokkos_ViewDataAnalysis.hpp @@ -0,0 +1,402 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOS_IMPL_PUBLIC_INCLUDE +static_assert(false, + "Including non-public Kokkos header files is not allowed."); +#endif + +#ifndef KOKKOS_VIEW_DATA_ANALYSIS_HPP +#define KOKKOS_VIEW_DATA_ANALYSIS_HPP + +#include + +namespace Kokkos::Impl { + +template +struct variadic_size_t { + enum : size_t { value = KOKKOS_INVALID_INDEX }; +}; + +template +struct variadic_size_t<0, Val, Args...> { + enum : size_t { value = Val }; +}; + +template +struct variadic_size_t { + enum : size_t { value = variadic_size_t::value }; +}; + +template +struct rank_dynamic; + +template <> +struct rank_dynamic<> { + enum : unsigned { value = 0 }; +}; + +template +struct rank_dynamic { + enum : unsigned { value = (Val == 0 ? 1 : 0) + rank_dynamic::value }; +}; + +#define KOKKOS_IMPL_VIEW_DIMENSION(R) \ + template \ + struct ViewDimension##R { \ + static constexpr size_t ArgN##R = (V != KOKKOS_INVALID_INDEX ? V : 1); \ + static constexpr size_t N##R = (V != KOKKOS_INVALID_INDEX ? 
V : 1); \ + KOKKOS_INLINE_FUNCTION explicit ViewDimension##R(size_t) {} \ + ViewDimension##R() = default; \ + ViewDimension##R(const ViewDimension##R&) = default; \ + ViewDimension##R& operator=(const ViewDimension##R&) = default; \ + }; \ + template \ + constexpr size_t ViewDimension##R::ArgN##R; \ + template \ + constexpr size_t ViewDimension##R::N##R; \ + template \ + struct ViewDimension##R<0u, RD> { \ + static constexpr size_t ArgN##R = 0; \ + std::conditional_t<(RD < 3), size_t, unsigned> N##R; \ + ViewDimension##R() = default; \ + ViewDimension##R(const ViewDimension##R&) = default; \ + ViewDimension##R& operator=(const ViewDimension##R&) = default; \ + KOKKOS_INLINE_FUNCTION explicit ViewDimension##R(size_t V) : N##R(V) {} \ + }; \ + template \ + constexpr size_t ViewDimension##R<0u, RD>::ArgN##R; + +KOKKOS_IMPL_VIEW_DIMENSION(0) +KOKKOS_IMPL_VIEW_DIMENSION(1) +KOKKOS_IMPL_VIEW_DIMENSION(2) +KOKKOS_IMPL_VIEW_DIMENSION(3) +KOKKOS_IMPL_VIEW_DIMENSION(4) +KOKKOS_IMPL_VIEW_DIMENSION(5) +KOKKOS_IMPL_VIEW_DIMENSION(6) +KOKKOS_IMPL_VIEW_DIMENSION(7) + +#undef KOKKOS_IMPL_VIEW_DIMENSION + +// MSVC does not do empty base class optimization by default. +// Per standard it is required for standard layout types +template +struct KOKKOS_IMPL_ENFORCE_EMPTY_BASE_OPTIMIZATION ViewDimension + : public ViewDimension0::value, + rank_dynamic::value>, + public ViewDimension1::value, + rank_dynamic::value>, + public ViewDimension2::value, + rank_dynamic::value>, + public ViewDimension3::value, + rank_dynamic::value>, + public ViewDimension4::value, + rank_dynamic::value>, + public ViewDimension5::value, + rank_dynamic::value>, + public ViewDimension6::value, + rank_dynamic::value>, + public ViewDimension7::value, + rank_dynamic::value> { + using D0 = ViewDimension0::value, + rank_dynamic::value>; + using D1 = ViewDimension1::value, + rank_dynamic::value>; + using D2 = ViewDimension2::value, + rank_dynamic::value>; + using D3 = ViewDimension3::value, + rank_dynamic::value>; + using D4 = ViewDimension4::value, + rank_dynamic::value>; + using D5 = ViewDimension5::value, + rank_dynamic::value>; + using D6 = ViewDimension6::value, + rank_dynamic::value>; + using D7 = ViewDimension7::value, + rank_dynamic::value>; + + using D0::ArgN0; + using D1::ArgN1; + using D2::ArgN2; + using D3::ArgN3; + using D4::ArgN4; + using D5::ArgN5; + using D6::ArgN6; + using D7::ArgN7; + + using D0::N0; + using D1::N1; + using D2::N2; + using D3::N3; + using D4::N4; + using D5::N5; + using D6::N6; + using D7::N7; + + static constexpr unsigned rank = sizeof...(Vals); + static constexpr unsigned rank_dynamic = Impl::rank_dynamic::value; + + ViewDimension() = default; + ViewDimension(const ViewDimension&) = default; + ViewDimension& operator=(const ViewDimension&) = default; + + KOKKOS_INLINE_FUNCTION + constexpr ViewDimension(size_t n0, size_t n1, size_t n2, size_t n3, size_t n4, + size_t n5, size_t n6, size_t n7) + : D0(n0 == KOKKOS_INVALID_INDEX ? 1 : n0), + D1(n1 == KOKKOS_INVALID_INDEX ? 1 : n1), + D2(n2 == KOKKOS_INVALID_INDEX ? 1 : n2), + D3(n3 == KOKKOS_INVALID_INDEX ? 1 : n3), + D4(n4 == KOKKOS_INVALID_INDEX ? 1 : n4), + D5(n5 == KOKKOS_INVALID_INDEX ? 1 : n5), + D6(n6 == KOKKOS_INVALID_INDEX ? 1 : n6), + D7(n7 == KOKKOS_INVALID_INDEX ? 1 : n7) {} + + KOKKOS_INLINE_FUNCTION + constexpr size_t extent(const unsigned r) const noexcept { + return r == 0 + ? N0 + : (r == 1 + ? N1 + : (r == 2 + ? N2 + : (r == 3 + ? N3 + : (r == 4 + ? N4 + : (r == 5 + ? N5 + : (r == 6 + ? N6 + : (r == 7 ? 
N7 + : 0))))))); + } + + static KOKKOS_INLINE_FUNCTION constexpr size_t static_extent( + const unsigned r) noexcept { + return r == 0 + ? ArgN0 + : (r == 1 + ? ArgN1 + : (r == 2 + ? ArgN2 + : (r == 3 + ? ArgN3 + : (r == 4 + ? ArgN4 + : (r == 5 + ? ArgN5 + : (r == 6 + ? ArgN6 + : (r == 7 ? ArgN7 + : 0))))))); + } + + template + struct prepend { + using type = ViewDimension; + }; + + template + struct append { + using type = ViewDimension; + }; +}; + +template +struct ViewDimensionJoin; + +template +struct ViewDimensionJoin, ViewDimension> { + using type = ViewDimension; +}; + +//---------------------------------------------------------------------------- + +template +struct ViewDimensionAssignable; + +template +struct ViewDimensionAssignable, + ViewDimension> { + using dst = ViewDimension; + using src = ViewDimension; + + enum { + value = unsigned(dst::rank) == unsigned(src::rank) && + ( + // Compile time check that potential static dimensions match + ((1 > dst::rank_dynamic && 1 > src::rank_dynamic) + ? (size_t(dst::ArgN0) == size_t(src::ArgN0)) + : true) && + ((2 > dst::rank_dynamic && 2 > src::rank_dynamic) + ? (size_t(dst::ArgN1) == size_t(src::ArgN1)) + : true) && + ((3 > dst::rank_dynamic && 3 > src::rank_dynamic) + ? (size_t(dst::ArgN2) == size_t(src::ArgN2)) + : true) && + ((4 > dst::rank_dynamic && 4 > src::rank_dynamic) + ? (size_t(dst::ArgN3) == size_t(src::ArgN3)) + : true) && + ((5 > dst::rank_dynamic && 5 > src::rank_dynamic) + ? (size_t(dst::ArgN4) == size_t(src::ArgN4)) + : true) && + ((6 > dst::rank_dynamic && 6 > src::rank_dynamic) + ? (size_t(dst::ArgN5) == size_t(src::ArgN5)) + : true) && + ((7 > dst::rank_dynamic && 7 > src::rank_dynamic) + ? (size_t(dst::ArgN6) == size_t(src::ArgN6)) + : true) && + ((8 > dst::rank_dynamic && 8 > src::rank_dynamic) + ? (size_t(dst::ArgN7) == size_t(src::ArgN7)) + : true)) + }; +}; + +/** \brief Given a value type and dimension generate the View data type */ +template +struct ViewDataType; + +template +struct ViewDataType> { + using type = T; +}; + +template +struct ViewDataType> { + using type = typename ViewDataType>::type; +}; + +template +struct ViewDataType> { + using type = typename ViewDataType>::type[N]; +}; + +/**\brief Analysis of View data type. + * + * Data type conforms to one of the following patterns : + * {const} value_type [][#][#][#] + * {const} value_type ***[#][#][#] + * Where the sum of counts of '*' and '[#]' is at most ten. + * + * Provide alias for ViewDimension<...> and value_type. 
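+ *
+ * For example, for the data type  double*[3]  the recursion below yields
+ *   value_type        = double
+ *   dynamic_dimension = ViewDimension<0>    (one runtime '*' extent)
+ *   static_dimension  = ViewDimension<3>    (one compile-time '[#]' extent)
+ *   dimension         = ViewDimension<0, 3>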
+ */ +template +struct ViewArrayAnalysis { + using value_type = T; + using const_value_type = std::add_const_t; + using non_const_value_type = std::remove_const_t; + using static_dimension = ViewDimension<>; + using dynamic_dimension = ViewDimension<>; + using dimension = ViewDimension<>; +}; + +template +struct ViewArrayAnalysis { + private: + using nested = ViewArrayAnalysis; + + public: + using value_type = typename nested::value_type; + using const_value_type = typename nested::const_value_type; + using non_const_value_type = typename nested::non_const_value_type; + + using static_dimension = + typename nested::static_dimension::template prepend::type; + + using dynamic_dimension = typename nested::dynamic_dimension; + + using dimension = + typename ViewDimensionJoin::type; +}; + +template +struct ViewArrayAnalysis { + private: + using nested = ViewArrayAnalysis; + using nested_dimension = typename nested::dimension; + + public: + using value_type = typename nested::value_type; + using const_value_type = typename nested::const_value_type; + using non_const_value_type = typename nested::non_const_value_type; + + using dynamic_dimension = + typename nested::dynamic_dimension::template prepend<0>::type; + + using static_dimension = typename nested::static_dimension; + + using dimension = + typename ViewDimensionJoin::type; +}; + +template +struct ViewArrayAnalysis { + private: + using nested = ViewArrayAnalysis; + + public: + using value_type = typename nested::value_type; + using const_value_type = typename nested::const_value_type; + using non_const_value_type = typename nested::non_const_value_type; + + using dynamic_dimension = + typename nested::dynamic_dimension::template prepend<0>::type; + + using static_dimension = typename nested::static_dimension; + + using dimension = + typename ViewDimensionJoin::type; +}; + +template +struct ViewDataAnalysis { + private: + using array_analysis = ViewArrayAnalysis; + + // ValueType is opportunity for partial specialization. + // Must match array analysis when this default template is used. + static_assert( + std::is_same::value); + + public: + using specialize = void; // No specialization + + using dimension = typename array_analysis::dimension; + using value_type = typename array_analysis::value_type; + using const_value_type = typename array_analysis::const_value_type; + using non_const_value_type = typename array_analysis::non_const_value_type; + + // Generate analogous multidimensional array specification type. + using type = typename ViewDataType::type; + using const_type = typename ViewDataType::type; + using non_const_type = + typename ViewDataType::type; + + // Generate "flattened" multidimensional array specification type. 
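+  // (Trivially so for this unspecialized analysis: the value type is
+  // already a scalar, so the flattened aliases coincide with `type` and
+  // friends above. Partial specializations, e.g. the Kokkos::Array
+  // analysis in Kokkos_ViewArray.hpp, diverge here by folding the inner
+  // array extent into an extra trailing dimension.)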
+ using scalar_array_type = type; + using const_scalar_array_type = const_type; + using non_const_scalar_array_type = non_const_type; +}; + +template +struct ViewOffset { + using is_mapping_plugin = std::false_type; +}; +} // namespace Kokkos::Impl + +#endif // KOKKOS_VIEW_DATA_ANALYSIS_HPP diff --git a/core/src/impl/Kokkos_ViewMapping.hpp b/core/src/impl/Kokkos_ViewMapping.hpp index 01d0dc4f681..3217c76e380 100644 --- a/core/src/impl/Kokkos_ViewMapping.hpp +++ b/core/src/impl/Kokkos_ViewMapping.hpp @@ -33,255 +33,7 @@ #include #include #include - -//---------------------------------------------------------------------------- -//---------------------------------------------------------------------------- - -namespace Kokkos { -namespace Impl { - -template -struct variadic_size_t { - enum : size_t { value = KOKKOS_INVALID_INDEX }; -}; - -template -struct variadic_size_t<0, Val, Args...> { - enum : size_t { value = Val }; -}; - -template -struct variadic_size_t { - enum : size_t { value = variadic_size_t::value }; -}; - -template -struct rank_dynamic; - -template <> -struct rank_dynamic<> { - enum : unsigned { value = 0 }; -}; - -template -struct rank_dynamic { - enum : unsigned { value = (Val == 0 ? 1 : 0) + rank_dynamic::value }; -}; - -#define KOKKOS_IMPL_VIEW_DIMENSION(R) \ - template \ - struct ViewDimension##R { \ - static constexpr size_t ArgN##R = (V != KOKKOS_INVALID_INDEX ? V : 1); \ - static constexpr size_t N##R = (V != KOKKOS_INVALID_INDEX ? V : 1); \ - KOKKOS_INLINE_FUNCTION explicit ViewDimension##R(size_t) {} \ - ViewDimension##R() = default; \ - ViewDimension##R(const ViewDimension##R&) = default; \ - ViewDimension##R& operator=(const ViewDimension##R&) = default; \ - }; \ - template \ - constexpr size_t ViewDimension##R::ArgN##R; \ - template \ - constexpr size_t ViewDimension##R::N##R; \ - template \ - struct ViewDimension##R<0u, RD> { \ - static constexpr size_t ArgN##R = 0; \ - std::conditional_t<(RD < 3), size_t, unsigned> N##R; \ - ViewDimension##R() = default; \ - ViewDimension##R(const ViewDimension##R&) = default; \ - ViewDimension##R& operator=(const ViewDimension##R&) = default; \ - KOKKOS_INLINE_FUNCTION explicit ViewDimension##R(size_t V) : N##R(V) {} \ - }; \ - template \ - constexpr size_t ViewDimension##R<0u, RD>::ArgN##R; - -KOKKOS_IMPL_VIEW_DIMENSION(0) -KOKKOS_IMPL_VIEW_DIMENSION(1) -KOKKOS_IMPL_VIEW_DIMENSION(2) -KOKKOS_IMPL_VIEW_DIMENSION(3) -KOKKOS_IMPL_VIEW_DIMENSION(4) -KOKKOS_IMPL_VIEW_DIMENSION(5) -KOKKOS_IMPL_VIEW_DIMENSION(6) -KOKKOS_IMPL_VIEW_DIMENSION(7) - -#undef KOKKOS_IMPL_VIEW_DIMENSION - -// MSVC does not do empty base class optimization by default. 
-// Per standard it is required for standard layout types -template -struct KOKKOS_IMPL_ENFORCE_EMPTY_BASE_OPTIMIZATION ViewDimension - : public ViewDimension0::value, - rank_dynamic::value>, - public ViewDimension1::value, - rank_dynamic::value>, - public ViewDimension2::value, - rank_dynamic::value>, - public ViewDimension3::value, - rank_dynamic::value>, - public ViewDimension4::value, - rank_dynamic::value>, - public ViewDimension5::value, - rank_dynamic::value>, - public ViewDimension6::value, - rank_dynamic::value>, - public ViewDimension7::value, - rank_dynamic::value> { - using D0 = ViewDimension0::value, - rank_dynamic::value>; - using D1 = ViewDimension1::value, - rank_dynamic::value>; - using D2 = ViewDimension2::value, - rank_dynamic::value>; - using D3 = ViewDimension3::value, - rank_dynamic::value>; - using D4 = ViewDimension4::value, - rank_dynamic::value>; - using D5 = ViewDimension5::value, - rank_dynamic::value>; - using D6 = ViewDimension6::value, - rank_dynamic::value>; - using D7 = ViewDimension7::value, - rank_dynamic::value>; - - using D0::ArgN0; - using D1::ArgN1; - using D2::ArgN2; - using D3::ArgN3; - using D4::ArgN4; - using D5::ArgN5; - using D6::ArgN6; - using D7::ArgN7; - - using D0::N0; - using D1::N1; - using D2::N2; - using D3::N3; - using D4::N4; - using D5::N5; - using D6::N6; - using D7::N7; - - static constexpr unsigned rank = sizeof...(Vals); - static constexpr unsigned rank_dynamic = Impl::rank_dynamic::value; - - ViewDimension() = default; - ViewDimension(const ViewDimension&) = default; - ViewDimension& operator=(const ViewDimension&) = default; - - KOKKOS_INLINE_FUNCTION - constexpr ViewDimension(size_t n0, size_t n1, size_t n2, size_t n3, size_t n4, - size_t n5, size_t n6, size_t n7) - : D0(n0 == KOKKOS_INVALID_INDEX ? 1 : n0), - D1(n1 == KOKKOS_INVALID_INDEX ? 1 : n1), - D2(n2 == KOKKOS_INVALID_INDEX ? 1 : n2), - D3(n3 == KOKKOS_INVALID_INDEX ? 1 : n3), - D4(n4 == KOKKOS_INVALID_INDEX ? 1 : n4), - D5(n5 == KOKKOS_INVALID_INDEX ? 1 : n5), - D6(n6 == KOKKOS_INVALID_INDEX ? 1 : n6), - D7(n7 == KOKKOS_INVALID_INDEX ? 1 : n7) {} - - KOKKOS_INLINE_FUNCTION - constexpr size_t extent(const unsigned r) const noexcept { - return r == 0 - ? N0 - : (r == 1 - ? N1 - : (r == 2 - ? N2 - : (r == 3 - ? N3 - : (r == 4 - ? N4 - : (r == 5 - ? N5 - : (r == 6 - ? N6 - : (r == 7 ? N7 - : 0))))))); - } - - static KOKKOS_INLINE_FUNCTION constexpr size_t static_extent( - const unsigned r) noexcept { - return r == 0 - ? ArgN0 - : (r == 1 - ? ArgN1 - : (r == 2 - ? ArgN2 - : (r == 3 - ? ArgN3 - : (r == 4 - ? ArgN4 - : (r == 5 - ? ArgN5 - : (r == 6 - ? ArgN6 - : (r == 7 ? ArgN7 - : 0))))))); - } - - template - struct prepend { - using type = ViewDimension; - }; - - template - struct append { - using type = ViewDimension; - }; -}; - -template -struct ViewDimensionJoin; - -template -struct ViewDimensionJoin, ViewDimension> { - using type = ViewDimension; -}; - -//---------------------------------------------------------------------------- - -template -struct ViewDimensionAssignable; - -template -struct ViewDimensionAssignable, - ViewDimension> { - using dst = ViewDimension; - using src = ViewDimension; - - enum { - value = unsigned(dst::rank) == unsigned(src::rank) && - ( - // Compile time check that potential static dimensions match - ((1 > dst::rank_dynamic && 1 > src::rank_dynamic) - ? (size_t(dst::ArgN0) == size_t(src::ArgN0)) - : true) && - ((2 > dst::rank_dynamic && 2 > src::rank_dynamic) - ? 
(size_t(dst::ArgN1) == size_t(src::ArgN1)) - : true) && - ((3 > dst::rank_dynamic && 3 > src::rank_dynamic) - ? (size_t(dst::ArgN2) == size_t(src::ArgN2)) - : true) && - ((4 > dst::rank_dynamic && 4 > src::rank_dynamic) - ? (size_t(dst::ArgN3) == size_t(src::ArgN3)) - : true) && - ((5 > dst::rank_dynamic && 5 > src::rank_dynamic) - ? (size_t(dst::ArgN4) == size_t(src::ArgN4)) - : true) && - ((6 > dst::rank_dynamic && 6 > src::rank_dynamic) - ? (size_t(dst::ArgN5) == size_t(src::ArgN5)) - : true) && - ((7 > dst::rank_dynamic && 7 > src::rank_dynamic) - ? (size_t(dst::ArgN6) == size_t(src::ArgN6)) - : true) && - ((8 > dst::rank_dynamic && 8 > src::rank_dynamic) - ? (size_t(dst::ArgN7) == size_t(src::ArgN7)) - : true)) - }; -}; - -} // namespace Impl -} // namespace Kokkos +#include //---------------------------------------------------------------------------- //---------------------------------------------------------------------------- @@ -657,21 +409,20 @@ struct SubviewExtents { template KOKKOS_INLINE_FUNCTION SubviewExtents(const ViewDimension& dim, Args... args) { - static_assert(DomainRank == sizeof...(DimArgs), ""); - static_assert(DomainRank == sizeof...(Args), ""); + static_assert(DomainRank == sizeof...(DimArgs)); + static_assert(DomainRank == sizeof...(Args)); // Verifies that all arguments, up to 8, are integral types, // integral extents, or don't exist. - static_assert( - RangeRank == unsigned(is_integral_extent<0, Args...>::value) + - unsigned(is_integral_extent<1, Args...>::value) + - unsigned(is_integral_extent<2, Args...>::value) + - unsigned(is_integral_extent<3, Args...>::value) + - unsigned(is_integral_extent<4, Args...>::value) + - unsigned(is_integral_extent<5, Args...>::value) + - unsigned(is_integral_extent<6, Args...>::value) + - unsigned(is_integral_extent<7, Args...>::value), - ""); + static_assert(RangeRank == + unsigned(is_integral_extent<0, Args...>::value) + + unsigned(is_integral_extent<1, Args...>::value) + + unsigned(is_integral_extent<2, Args...>::value) + + unsigned(is_integral_extent<3, Args...>::value) + + unsigned(is_integral_extent<4, Args...>::value) + + unsigned(is_integral_extent<5, Args...>::value) + + unsigned(is_integral_extent<6, Args...>::value) + + unsigned(is_integral_extent<7, Args...>::value)); if (RangeRank == 0) { m_length[0] = 0; @@ -708,149 +459,6 @@ struct SubviewExtents { namespace Kokkos { namespace Impl { - -/** \brief Given a value type and dimension generate the View data type */ -template -struct ViewDataType; - -template -struct ViewDataType> { - using type = T; -}; - -template -struct ViewDataType> { - using type = typename ViewDataType>::type; -}; - -template -struct ViewDataType> { - using type = typename ViewDataType>::type[N]; -}; - -/**\brief Analysis of View data type. - * - * Data type conforms to one of the following patterns : - * {const} value_type [][#][#][#] - * {const} value_type ***[#][#][#] - * Where the sum of counts of '*' and '[#]' is at most ten. - * - * Provide alias for ViewDimension<...> and value_type. 
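The dimension machinery removed above is not deleted outright; it moves into the new Kokkos_ViewDataAnalysis.hpp header, which the `+#include` line pulls in instead. For readers following the refactor, here is a minimal standalone sketch (not the Kokkos implementation itself) of the rank_dynamic idea: a compile-time extent of 0 marks a runtime-sized dimension, and rank_dynamic counts those zeros.

    #include <cstddef>

    // Sketch: count the dynamic (runtime-sized) dimensions in an extent
    // list, where an extent of 0 stands for a '*' in the View data type.
    template <std::size_t... Vals>
    struct rank_dynamic_sketch;

    template <>
    struct rank_dynamic_sketch<> {
      static constexpr unsigned value = 0;
    };

    template <std::size_t Val, std::size_t... Vals>
    struct rank_dynamic_sketch<Val, Vals...> {
      static constexpr unsigned value =
          (Val == 0 ? 1 : 0) + rank_dynamic_sketch<Vals...>::value;
    };

    // double**[3] has extents (0, 0, 3): two dynamic dimensions.
    static_assert(rank_dynamic_sketch<0, 0, 3>::value == 2);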
- */ -template -struct ViewArrayAnalysis { - using value_type = T; - using const_value_type = std::add_const_t; - using non_const_value_type = std::remove_const_t; - using static_dimension = ViewDimension<>; - using dynamic_dimension = ViewDimension<>; - using dimension = ViewDimension<>; -}; - -template -struct ViewArrayAnalysis { - private: - using nested = ViewArrayAnalysis; - - public: - using value_type = typename nested::value_type; - using const_value_type = typename nested::const_value_type; - using non_const_value_type = typename nested::non_const_value_type; - - using static_dimension = - typename nested::static_dimension::template prepend::type; - - using dynamic_dimension = typename nested::dynamic_dimension; - - using dimension = - typename ViewDimensionJoin::type; -}; - -template -struct ViewArrayAnalysis { - private: - using nested = ViewArrayAnalysis; - using nested_dimension = typename nested::dimension; - - public: - using value_type = typename nested::value_type; - using const_value_type = typename nested::const_value_type; - using non_const_value_type = typename nested::non_const_value_type; - - using dynamic_dimension = - typename nested::dynamic_dimension::template prepend<0>::type; - - using static_dimension = typename nested::static_dimension; - - using dimension = - typename ViewDimensionJoin::type; -}; - -template -struct ViewArrayAnalysis { - private: - using nested = ViewArrayAnalysis; - - public: - using value_type = typename nested::value_type; - using const_value_type = typename nested::const_value_type; - using non_const_value_type = typename nested::non_const_value_type; - - using dynamic_dimension = - typename nested::dynamic_dimension::template prepend<0>::type; - - using static_dimension = typename nested::static_dimension; - - using dimension = - typename ViewDimensionJoin::type; -}; - -template -struct ViewDataAnalysis { - private: - using array_analysis = ViewArrayAnalysis; - - // ValueType is opportunity for partial specialization. - // Must match array analysis when this default template is used. - static_assert( - std::is_same::value, - ""); - - public: - using specialize = void; // No specialization - - using dimension = typename array_analysis::dimension; - using value_type = typename array_analysis::value_type; - using const_value_type = typename array_analysis::const_value_type; - using non_const_value_type = typename array_analysis::non_const_value_type; - - // Generate analogous multidimensional array specification type. - using type = typename ViewDataType::type; - using const_type = typename ViewDataType::type; - using non_const_type = - typename ViewDataType::type; - - // Generate "flattened" multidimensional array specification type. 
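The ViewDataType/ViewArrayAnalysis block removed above rebuilds a multidimensional data type such as double**[3] from a value type plus an extent list. A hedged sketch of that mapping with hypothetical names (the real code threads a ViewDimension<...> through instead of a raw extent pack):

    #include <cstddef>
    #include <type_traits>

    // Sketch: extent 0 prepends a pointer (runtime dimension); a nonzero
    // extent appends a static array bound.
    template <class T, std::size_t... Ns>
    struct view_data_type_sketch {
      using type = T;
    };

    template <class T, std::size_t... Ns>
    struct view_data_type_sketch<T, 0, Ns...> {
      using type = typename view_data_type_sketch<T*, Ns...>::type;
    };

    template <class T, std::size_t N, std::size_t... Ns>
    struct view_data_type_sketch<T, N, Ns...> {
      using type = typename view_data_type_sketch<T, Ns...>::type[N];
    };

    static_assert(
        std::is_same_v<view_data_type_sketch<double, 0, 0, 3>::type,
                       double**[3]>);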
- using scalar_array_type = type; - using const_scalar_array_type = const_type; - using non_const_scalar_array_type = non_const_type; -}; - -} // namespace Impl -} // namespace Kokkos - -//---------------------------------------------------------------------------- -//---------------------------------------------------------------------------- - -namespace Kokkos { -namespace Impl { - -template -struct ViewOffset { - using is_mapping_plugin = std::false_type; -}; - //---------------------------------------------------------------------------- // LayoutLeft AND ( 1 >= rank OR 0 == rank_dynamic ) : no padding / striding template @@ -2919,13 +2527,9 @@ struct ViewValueFunctor { "Kokkos::View::initialization [" + name + "] via memset", Kokkos::Profiling::Experimental::device_id(space), &kpID); } - (void)ZeroMemset< - ExecSpace, Kokkos::View>>( - space, - Kokkos::View>(ptr, n), - value); + (void)ZeroMemset( + space, Kokkos::View>(ptr, n)); if (Kokkos::Profiling::profileLibraryLoaded()) { Kokkos::Profiling::endParallelFor(kpID); @@ -2949,37 +2553,33 @@ struct ViewValueFunctor { template void parallel_for_implementation() { - if (!space.in_parallel()) { - using PolicyType = - Kokkos::RangePolicy, Tag>; - PolicyType policy(space, 0, n); - uint64_t kpID = 0; - if (Kokkos::Profiling::profileLibraryLoaded()) { - const std::string functor_name = - (std::is_same_v - ? "Kokkos::View::destruction [" + name + "]" - : "Kokkos::View::initialization [" + name + "]"); - Kokkos::Profiling::beginParallelFor( - functor_name, Kokkos::Profiling::Experimental::device_id(space), - &kpID); - } + using PolicyType = + Kokkos::RangePolicy, Tag>; + PolicyType policy(space, 0, n); + uint64_t kpID = 0; + if (Kokkos::Profiling::profileLibraryLoaded()) { + const std::string functor_name = + (std::is_same_v + ? 
"Kokkos::View::destruction [" + name + "]" + : "Kokkos::View::initialization [" + name + "]"); + Kokkos::Profiling::beginParallelFor( + functor_name, Kokkos::Profiling::Experimental::device_id(space), + &kpID); + } #ifdef KOKKOS_ENABLE_CUDA - if (std::is_same::value) { - Kokkos::Impl::cuda_prefetch_pointer(space, ptr, sizeof(ValueType) * n, - true); - } + if (std::is_same::value) { + Kokkos::Impl::cuda_prefetch_pointer(space, ptr, sizeof(ValueType) * n, + true); + } #endif - const Kokkos::Impl::ParallelFor closure( - *this, policy); - closure.execute(); - if (default_exec_space || std::is_same_v) - space.fence("Kokkos::Impl::ViewValueFunctor: View init/destroy fence"); - if (Kokkos::Profiling::profileLibraryLoaded()) { - Kokkos::Profiling::endParallelFor(kpID); - } - } else { - for (size_t i = 0; i < n; ++i) operator()(Tag{}, i); + const Kokkos::Impl::ParallelFor closure( + *this, policy); + closure.execute(); + if (default_exec_space || std::is_same_v) + space.fence("Kokkos::Impl::ViewValueFunctor: View init/destroy fence"); + if (Kokkos::Profiling::profileLibraryLoaded()) { + Kokkos::Profiling::endParallelFor(kpID); } } @@ -3057,13 +2657,9 @@ struct ViewValueFunctor { Kokkos::Profiling::Experimental::device_id(space), &kpID); } - (void)ZeroMemset< - ExecSpace, Kokkos::View>>( - space, - Kokkos::View>(ptr, n), - value); + (void)ZeroMemset( + space, Kokkos::View>(ptr, n)); if (Kokkos::Profiling::profileLibraryLoaded()) { Kokkos::Profiling::endParallelFor(kpID); @@ -3086,32 +2682,28 @@ struct ViewValueFunctor { } void parallel_for_implementation() { - if (!space.in_parallel()) { - PolicyType policy(0, n); - uint64_t kpID = 0; - if (Kokkos::Profiling::profileLibraryLoaded()) { - Kokkos::Profiling::beginParallelFor( - "Kokkos::View::initialization [" + name + "]", - Kokkos::Profiling::Experimental::device_id(space), &kpID); - } + PolicyType policy(0, n); + uint64_t kpID = 0; + if (Kokkos::Profiling::profileLibraryLoaded()) { + Kokkos::Profiling::beginParallelFor( + "Kokkos::View::initialization [" + name + "]", + Kokkos::Profiling::Experimental::device_id(space), &kpID); + } #ifdef KOKKOS_ENABLE_CUDA - if (std::is_same::value) { - Kokkos::Impl::cuda_prefetch_pointer(space, ptr, sizeof(ValueType) * n, - true); - } + if (std::is_same::value) { + Kokkos::Impl::cuda_prefetch_pointer(space, ptr, sizeof(ValueType) * n, + true); + } #endif - const Kokkos::Impl::ParallelFor closure( - *this, PolicyType(0, n)); - closure.execute(); - if (default_exec_space) - space.fence( - "Kokkos::Impl::ViewValueFunctor: Fence after setting values in " - "view"); - if (Kokkos::Profiling::profileLibraryLoaded()) { - Kokkos::Profiling::endParallelFor(kpID); - } - } else { - for (size_t i = 0; i < n; ++i) operator()(i); + const Kokkos::Impl::ParallelFor closure( + *this, PolicyType(0, n)); + closure.execute(); + if (default_exec_space) + space.fence( + "Kokkos::Impl::ViewValueFunctor: Fence after setting values in " + "view"); + if (Kokkos::Profiling::profileLibraryLoaded()) { + Kokkos::Profiling::endParallelFor(kpID); } } @@ -3896,7 +3488,7 @@ class ViewMapping< template struct apply { - static_assert(Kokkos::is_memory_traits::value, ""); + static_assert(Kokkos::is_memory_traits::value); using traits_type = Kokkos::ViewTraits -KOKKOS_INLINE_FUNCTION bool view_verify_operator_bounds(const MapType&) { - return true; +template +KOKKOS_FUNCTION bool within_range(Map const& map, + std::index_sequence, + Indices... 
indices) { + return (((std::size_t)indices < map.extent(Enumerate)) && ...); } -template -KOKKOS_INLINE_FUNCTION bool view_verify_operator_bounds(const MapType& map, - const iType& i, - Args... args) { - return (size_t(i) < map.extent(R)) && - view_verify_operator_bounds(map, args...); +template +KOKKOS_FUNCTION constexpr char* append_formatted_multidimensional_index( + char* dest, Indices... indices) { + char* d = dest; + strcat(d, "["); + ( + [&] { + d += strlen(d); + to_chars_i(d, + d + 20, // 20 digits ought to be enough + indices); + strcat(d, ","); + }(), + ...); + d[strlen(d) - 1] = ']'; // overwrite trailing comma + return dest; } -template -inline void view_error_operator_bounds(char*, int, const MapType&) {} - -template -inline void view_error_operator_bounds(char* buf, int len, const MapType& map, - const iType& i, Args... args) { - const int n = snprintf( - buf, len, " %ld < %ld %c", static_cast(i), - static_cast(map.extent(R)), (sizeof...(Args) ? ',' : ')')); - view_error_operator_bounds(buf + n, len - n, map, args...); +template +KOKKOS_FUNCTION void print_extents(char* dest, Map const& map, + std::index_sequence) { + append_formatted_multidimensional_index(dest, map.extent(Enumerate)...); } -/* Check #3: is the View managed as determined by the MemoryTraits? */ -template -struct OperatorBoundsErrorOnDevice; - -template -struct OperatorBoundsErrorOnDevice { - KOKKOS_INLINE_FUNCTION - static void run(MapType const&) { Kokkos::abort("View bounds error"); } -}; - -template -struct OperatorBoundsErrorOnDevice { - KOKKOS_INLINE_FUNCTION - static void run(MapType const& map) { - SharedAllocationHeader const* const header = - SharedAllocationHeader::get_header( - static_cast(map.data())); - char const* const label = header->label(); - enum { LEN = 128 }; - char msg[LEN]; - char const* const first_part = "View bounds error of view "; - char* p = msg; - char* const end = msg + LEN - 1; - for (char const* p2 = first_part; (*p2 != '\0') && (p < end); ++p, ++p2) { - *p = *p2; - } - for (char const* p2 = label; (*p2 != '\0') && (p < end); ++p, ++p2) { - *p = *p2; - } - *p = '\0'; - Kokkos::abort(msg); - } -}; - -/* Check #2: does the ViewMapping have the printable_label_typedef defined? - See above that only the non-specialized standard-layout ViewMapping has - this defined by default. - The existence of this alias indicates the existence of MapType::is_managed - */ template using printable_label_typedef_t = typename T::printable_label_typedef; -template -KOKKOS_FUNCTION - std::enable_if_t::value> - operator_bounds_error_on_device(Map const&) { - Kokkos::abort("View bounds error"); -} - -template -KOKKOS_FUNCTION - std::enable_if_t::value> - operator_bounds_error_on_device(Map const& map) { - OperatorBoundsErrorOnDevice::run(map); -} - template KOKKOS_INLINE_FUNCTION void view_verify_operator_bounds( Kokkos::Impl::ViewTracker const& tracker, const MapType& map, Args... 
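The new append_formatted_multidimensional_index above renders the offending indices as "[i,j,k]" using only freestanding-friendly calls (strcat plus Kokkos' to_chars_i), because the device path cannot use std::string or snprintf. A host-only sketch of the same formatting with standard-library calls, illustrative names only:

    #include <cstdio>
    #include <cstring>

    template <class... Indices>
    char* format_index_sketch(char* dest, Indices... indices) {
      std::strcat(dest, "[");
      // Append "<value>," for each index, then patch the trailing comma.
      (
          [&] {
            char tmp[24];
            std::snprintf(tmp, sizeof tmp, "%lld,",
                          static_cast<long long>(indices));
            std::strcat(dest, tmp);
          }(),
          ...);
      dest[std::strlen(dest) - 1] = ']';  // overwrite trailing comma
      return dest;
    }

    // Usage: char buf[64] = ""; format_index_sketch(buf, 1, 2, 3); -> "[1,2,3]"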
args) { - if (!view_verify_operator_bounds<0>(map, args...)) { + if (!within_range(map, std::make_index_sequence(), + args...)) { + char err[256] = ""; + strcat(err, "Kokkos::View ERROR: out of bounds access"); + strcat(err, " label=(\""); KOKKOS_IF_ON_HOST( - (enum {LEN = 1024}; char buffer[LEN]; - const std::string label = - tracker.m_tracker.template get_label(); - int n = snprintf(buffer, LEN, "View bounds error of view %s (", - label.c_str()); - view_error_operator_bounds<0>(buffer + n, LEN - n, map, args...); - Kokkos::Impl::throw_runtime_exception(std::string(buffer));)) - - KOKKOS_IF_ON_DEVICE(( - /* Check #1: is there a SharedAllocationRecord? - (we won't use it, but if its not there then there isn't - a corresponding SharedAllocationHeader containing a label). - This check should cover the case of Views that don't - have the Unmanaged trait but were initialized by pointer. */ if (tracker.m_tracker.has_record()) { - operator_bounds_error_on_device(map); - } else { Kokkos::abort("View bounds error"); })) + strncat(err, tracker.m_tracker.template get_label().c_str(), + 128); + } else { strcat(err, "**UNMANAGED**"); }) + KOKKOS_IF_ON_DEVICE([&] { + // Check #1: is there a SharedAllocationRecord? (we won't use it, but + // if its not there then there isn't a corresponding + // SharedAllocationHeader containing a label). This check should cover + // the case of Views that don't have the Unmanaged trait but were + // initialized by pointer. + if (!tracker.m_tracker.has_record()) { + strcat(err, "**UNMANAGED**"); + return; + } + // Check #2: does the ViewMapping have the printable_label_typedef + // defined? See above that only the non-specialized standard-layout + // ViewMapping has this defined by default. The existence of this + // alias indicates the existence of MapType::is_managed + if constexpr (is_detected_v) { + // Check #3: is the View managed as determined by the MemoryTraits? + if constexpr (MapType::is_managed != 0) { + SharedAllocationHeader const* const header = + SharedAllocationHeader::get_header( + static_cast(map.data())); + char const* const label = header->label(); + strcat(err, label); + return; + } + strcat(err, "**UNAVAILABLE**"); + } + }();) + strcat(err, "\") with indices "); + append_formatted_multidimensional_index(err, args...); + strcat(err, " but extents "); + print_extents(err, map, std::make_index_sequence()); + Kokkos::abort(err); } } diff --git a/core/src/setup/Kokkos_Setup_SYCL.hpp b/core/src/setup/Kokkos_Setup_SYCL.hpp index 7f7957bc61f..30f6fa2ad23 100644 --- a/core/src/setup/Kokkos_Setup_SYCL.hpp +++ b/core/src/setup/Kokkos_Setup_SYCL.hpp @@ -38,12 +38,11 @@ #include #endif -#ifdef __SYCL_DEVICE_ONLY__ -#define KOKKOS_IMPL_DO_NOT_USE_PRINTF(format, ...) \ - do { \ - const __attribute__((opencl_constant)) char fmt[] = (format); \ - sycl::ext::oneapi::experimental::printf(fmt, ##__VA_ARGS__); \ - } while (0) +#if defined(__INTEL_LLVM_COMPILER) && __INTEL_LLVM_COMPILER >= 20230200 +#define KOKKOS_IMPL_SYCL_GET_MULTI_PTR(accessor) \ + accessor.get_multi_ptr() +#else +#define KOKKOS_IMPL_SYCL_GET_MULTI_PTR(accessor) accessor.get_pointer() #endif #endif diff --git a/core/src/traits/Kokkos_IndexTypeTrait.hpp b/core/src/traits/Kokkos_IndexTypeTrait.hpp index 91820fbccac..e43535451c3 100644 --- a/core/src/traits/Kokkos_IndexTypeTrait.hpp +++ b/core/src/traits/Kokkos_IndexTypeTrait.hpp @@ -83,7 +83,7 @@ struct IndexTypePolicyMixin : AnalyzeNextTrait { "Kokkos Error: More than one index type given. 
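The rewritten bounds check above replaces the old recursive view_verify_operator_bounds<R> helpers with a single fold expression, and assembles the whole error message in a stack buffer so one code path serves host and device. A standalone sketch of the fold-based range test, using a hypothetical extents holder in place of the Kokkos view map:

    #include <cstddef>
    #include <utility>

    struct extents3 {  // stand-in for the view map
      std::size_t e[3];
      std::size_t extent(std::size_t r) const { return e[r]; }
    };

    template <class Map, std::size_t... R, class... Indices>
    bool within_range_sketch(Map const& map, std::index_sequence<R...>,
                             Indices... indices) {
      // Pairs each index with its rank via the index_sequence and requires
      // every index to be below the matching extent.
      return ((static_cast<std::size_t>(indices) < map.extent(R)) && ...);
    }

    int main() {
      extents3 m{{4, 5, 6}};
      bool ok  = within_range_sketch(m, std::make_index_sequence<3>(), 1, 2, 3);
      bool bad = within_range_sketch(m, std::make_index_sequence<3>(), 1, 9, 3);
      return (ok && !bad) ? 0 : 1;
    }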
Search " "compiler output for 'show_extra_index_type' to see the " "type of the errant tag."); - static_assert(std::is_integral::value, ""); + static_assert(std::is_integral::value); static constexpr bool index_type_is_defaulted = false; using index_type = Kokkos::IndexType; }; diff --git a/core/src/traits/Kokkos_OccupancyControlTrait.hpp b/core/src/traits/Kokkos_OccupancyControlTrait.hpp index dadf582c372..c2ca5a341f1 100644 --- a/core/src/traits/Kokkos_OccupancyControlTrait.hpp +++ b/core/src/traits/Kokkos_OccupancyControlTrait.hpp @@ -163,7 +163,7 @@ auto prefer(Policy const& p, DesiredOccupancy occ) { template constexpr auto prefer(Policy const& p, MaximizeOccupancy) { - static_assert(Kokkos::is_execution_policy::value, ""); + static_assert(Kokkos::is_execution_policy::value); using new_policy_t = Kokkos::Impl::OccupancyControlTrait::policy_with_trait; diff --git a/core/src/traits/Kokkos_PolicyTraitAdaptor.hpp b/core/src/traits/Kokkos_PolicyTraitAdaptor.hpp index 578e9e762ad..98ad1d7ebbb 100644 --- a/core/src/traits/Kokkos_PolicyTraitAdaptor.hpp +++ b/core/src/traits/Kokkos_PolicyTraitAdaptor.hpp @@ -68,7 +68,7 @@ struct PolicyTraitAdaptorImpl< TraitSpec, PolicyTemplate, type_list, type_list, NewTrait, std::enable_if_t::value>> { - static_assert(PolicyTraitMatcher::value, ""); + static_assert(PolicyTraitMatcher::value); using type = PolicyTemplate; }; @@ -92,7 +92,7 @@ template class PolicyTemplate, struct PolicyTraitAdaptorImpl, type_list<>, NewTrait> { - static_assert(PolicyTraitMatcher::value, ""); + static_assert(PolicyTraitMatcher::value); using type = PolicyTemplate; }; diff --git a/core/src/traits/Kokkos_ScheduleTrait.hpp b/core/src/traits/Kokkos_ScheduleTrait.hpp index 86130025530..4e91d89f0f9 100644 --- a/core/src/traits/Kokkos_ScheduleTrait.hpp +++ b/core/src/traits/Kokkos_ScheduleTrait.hpp @@ -78,7 +78,7 @@ namespace Experimental { template constexpr auto require(Policy const& p, Kokkos::Schedule) { - static_assert(Kokkos::is_execution_policy::value, ""); + static_assert(Kokkos::is_execution_policy::value); using new_policy_t = Kokkos::Impl::ScheduleTrait::policy_with_trait< Policy, Kokkos::Schedule>; return new_policy_t{p}; diff --git a/core/src/traits/Kokkos_WorkItemPropertyTrait.hpp b/core/src/traits/Kokkos_WorkItemPropertyTrait.hpp index 8f95385c851..ae7aa6e534f 100644 --- a/core/src/traits/Kokkos_WorkItemPropertyTrait.hpp +++ b/core/src/traits/Kokkos_WorkItemPropertyTrait.hpp @@ -57,7 +57,7 @@ namespace Experimental { template constexpr auto require(const Policy p, WorkItemProperty::ImplWorkItemProperty) { - static_assert(Kokkos::is_execution_policy::value, ""); + static_assert(Kokkos::is_execution_policy::value); using new_policy_t = Kokkos::Impl::WorkItemPropertyTrait::policy_with_trait< Policy, WorkItemProperty::ImplWorkItemProperty>; return new_policy_t{p}; diff --git a/core/unit_test/CMakeLists.txt b/core/unit_test/CMakeLists.txt index b71c72c3c9f..6dfb7505c5d 100644 --- a/core/unit_test/CMakeLists.txt +++ b/core/unit_test/CMakeLists.txt @@ -65,7 +65,7 @@ SET(KOKKOS_THREADS_NAME Threads) IF(KOKKOS_CXX_COMPILER_ID STREQUAL Clang) SET(KOKKOS_OPENACC_FEATURE_LEVEL 9) ELSE() - SET(KOKKOS_OPENACC_FEATURE_LEVEL 16) + SET(KOKKOS_OPENACC_FEATURE_LEVEL 17) ENDIF() SET(KOKKOS_OPENACC_NAME Experimental::OpenACC) @@ -86,11 +86,13 @@ SET(COMPILE_ONLY_SOURCES TestDetectionIdiom.cpp TestBitManipulation.cpp TestInterOp.cpp + TestRangePolicyCTAD.cpp TestStringManipulation.cpp TestVersionMacros.cpp TestViewRank.cpp TestViewTypeTraits.cpp TestTypeList.cpp + TestMDRangePolicyCTAD.cpp 
view/TestExtentsDatatypeConversion.cpp ) @@ -184,6 +186,7 @@ foreach(Tag Threads;Serial;OpenMP;Cuda;HPX;OpenMPTarget;OpenACC;HIP;SYCL) MDSpan MinMaxClamp NumericTraits + OccupancyControlTrait Other ParallelScanRangePolicy Printf @@ -200,6 +203,7 @@ foreach(Tag Threads;Serial;OpenMP;Cuda;HPX;OpenMPTarget;OpenACC;HIP;SYCL) Reductions Reductions_DeviceView SharedAlloc + Swap ) set(file ${dir}/Test${Tag}_${Name}.cpp) # Write to a temporary intermediate file and call configure_file to avoid @@ -233,6 +237,7 @@ foreach(Tag Threads;Serial;OpenMP;Cuda;HPX;OpenMPTarget;OpenACC;HIP;SYCL) ViewCopy_a ViewCopy_b ViewCtorDimMatch + ViewEmptyRuntimeUnmanaged ViewHooks ViewLayoutStrideAssignment ViewMapping_a @@ -240,6 +245,7 @@ foreach(Tag Threads;Serial;OpenMP;Cuda;HPX;OpenMPTarget;OpenACC;HIP;SYCL) ViewMapping_subview ViewMemoryAccessViolation ViewOfClass + ViewOutOfBoundsAccess ViewResize WorkGraph WithoutInitializing @@ -372,20 +378,21 @@ if(Kokkos_ENABLE_OPENMPTARGET) ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_TeamCombinedReducers.cpp ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_TeamReductionScan.cpp ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_WorkGraph.cpp - IF (KOKKOS_CXX_COMPILER_ID STREQUAL "Clang" AND KOKKOS_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL 16.0.0) - ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_SubView_c01.cpp - ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_SubView_c02.cpp - ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_SubView_c03.cpp - ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_Reducers_d.cpp - endif() IF (KOKKOS_CXX_COMPILER_ID STREQUAL "Clang" AND KOKKOS_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL 16.0.0) ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_AtomicOperations_shared.cpp ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_MinMaxClamp.cpp - ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_TeamVectorRange.cpp ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_LocalDeepCopy.cpp + ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_Reducers_d.cpp + ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_TeamVectorRange.cpp + ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_ViewAPI_e.cpp ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_TeamScan.cpp ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_TeamBasic.cpp - ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_ViewAPI_e.cpp + IF (KOKKOS_CXX_COMPILER_VERSION VERSION_LESS 17.0.3) + ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_SubView_c01.cpp + ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_SubView_c02.cpp + ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_SubView_c03.cpp + ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_Reducers_d.cpp + endif() endif() # FIXME_OPENMPTARGET_CRAY: The following tests fail at compile time when the OpenMPTarget backend is enabled with the Cray compiler. # Atomic compare/exchange is used in these tests which can be one of the reasons for the compilation failures. 
@@ -522,17 +529,7 @@ IF(KOKKOS_ENABLE_OPENACC AND KOKKOS_CXX_COMPILER_ID STREQUAL NVHPC) list(REMOVE_ITEM OpenACC_SOURCES ${CMAKE_CURRENT_SOURCE_DIR}/default/TestDefaultDeviceType_a1.cpp ${CMAKE_CURRENT_SOURCE_DIR}/default/TestDefaultDeviceType_b1.cpp - ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_AtomicOperations.cpp - ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_AtomicOperations_double.cpp - ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_AtomicOperations_float.cpp - ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_AtomicOperations_int.cpp - ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_AtomicOperations_longint.cpp - ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_AtomicOperations_longlongint.cpp ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_AtomicOperations_shared.cpp - ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_AtomicOperations_unsignedint.cpp - ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_AtomicOperations_unsignedlongint.cpp - ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_Atomics.cpp - ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_AtomicViews.cpp ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_BlockSizeDeduction.cpp ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_DeepCopyAlignment.cpp ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_HostSharedPtr.cpp @@ -549,17 +546,10 @@ IF(KOKKOS_ENABLE_OPENACC AND KOKKOS_CXX_COMPILER_ID STREQUAL NVHPC) ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_Reducers_d.cpp ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_Reductions.cpp ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_Reductions_DeviceView.cpp - ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_SubView_b.cpp - ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_SubView_c02.cpp - ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_SubView_c03.cpp - ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_SubView_c05.cpp - ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_SubView_c08.cpp - ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_SubView_c11.cpp ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_TeamBasic.cpp ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_TeamScratch.cpp ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_TeamTeamSize.cpp ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_UniqueToken.cpp - ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_ViewMapping_b.cpp ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_ViewResize.cpp ) endif() @@ -677,7 +667,6 @@ endif() if (Kokkos_ENABLE_OPENMP) set(OpenMP_EXTRA_SOURCES openmp/TestOpenMP_Task.cpp - openmp/TestOpenMP_PartitionMaster.cpp ) KOKKOS_ADD_EXECUTABLE_AND_TEST( CoreUnitTest_OpenMP @@ -724,12 +713,14 @@ if(Kokkos_ENABLE_HPX) hpx/TestHPX_IndependentInstancesRefCounting.cpp hpx/TestHPX_IndependentInstancesSynchronization.cpp ) +if(Kokkos_ENABLE_DEPRECATED_CODE_4) KOKKOS_ADD_EXECUTABLE_AND_TEST( CoreUnitTest_HPX_InParallel SOURCES UnitTestMainInit.cpp hpx/TestHPX_InParallel.cpp ) + endif() endif() if(Kokkos_ENABLE_OPENMPTARGET) @@ -797,6 +788,12 @@ if(Kokkos_ENABLE_CUDA) UnitTestMain.cpp cuda/TestCuda_InterOp_Streams.cpp ) + KOKKOS_ADD_EXECUTABLE_AND_TEST( + CoreUnitTest_CudaInterOpStreamsMultiGPU + SOURCES + UnitTestMainInit.cpp + cuda/TestCuda_InterOp_StreamsMultiGPU.cpp + ) KOKKOS_ADD_EXECUTABLE_AND_TEST( CoreUnitTest_CudaGraph SOURCES @@ -1039,13 +1036,7 @@ KOKKOS_ADD_ADVANCED_TEST( CoreUnitTest_PushFinalizeHook_terminate tools/TestCategoricalTuner.cpp ) endif() - if((NOT Kokkos_ENABLE_OPENMPTARGET) AND (NOT Kokkos_ENABLE_OPENACC)) - KOKKOS_ADD_EXECUTABLE_AND_TEST( - CoreUnitTest_LogicalSpaces - SOURCES - tools/TestLogicalSpaces.cpp - ) - 
endif() + SET(KOKKOSP_SOURCES UnitTestMainInit.cpp tools/TestEventCorrectness.cpp @@ -1167,15 +1158,6 @@ KOKKOS_ADD_TEST( NAME CoreUnitTest_StackTraceTest ) endif() -if(Kokkos_ENABLE_DEPRECATED_CODE_3) - foreach(INITTESTS_NUM RANGE 1 18) - KOKKOS_ADD_EXECUTABLE_AND_TEST( - CoreUnitTest_DefaultInit_${INITTESTS_NUM} - SOURCES UnitTestMain.cpp default/TestDefaultDeviceTypeInit_${INITTESTS_NUM}.cpp - ) - endforeach(INITTESTS_NUM) -endif() - if (KOKKOS_ENABLE_HWLOC) KOKKOS_ADD_EXECUTABLE_AND_TEST( CoreUnitTest_HWLOC @@ -1259,12 +1241,10 @@ if (NOT KOKKOS_HAS_TRILINOS) INPUT TestDeviceAndThreads.py ${USE_SOURCE_PERMISSIONS_WHEN_SUPPORTED} ) - if(NOT Kokkos_ENABLE_OPENMPTARGET) # FIXME_OPENMPTARGET does not select the right device - add_test( - NAME Kokkos_CoreUnitTest_DeviceAndThreads - COMMAND ${Python3_EXECUTABLE} -m unittest -v $/TestDeviceAndThreads.py - ) - endif() + add_test( + NAME Kokkos_CoreUnitTest_DeviceAndThreads + COMMAND ${Python3_EXECUTABLE} -m unittest -v $/TestDeviceAndThreads.py + ) endif() endif() diff --git a/core/unit_test/Makefile b/core/unit_test/Makefile index 33a84b61f92..202809d3fc9 100644 --- a/core/unit_test/Makefile +++ b/core/unit_test/Makefile @@ -67,8 +67,8 @@ TESTS = AtomicOperations_int AtomicOperations_unsignedint AtomicOperations_longi tmp := $(foreach device, $(KOKKOS_DEVICELIST), \ tmp2 := $(foreach test, $(TESTS), \ $(if $(filter Test$(device)_$(test).cpp, $(shell ls Test$(device)_$(test).cpp 2>/dev/null)),,\ - $(shell echo "\#include " > Test$(device)_$(test).cpp); \ - $(shell echo "\#include " >> Test$(device)_$(test).cpp); \ + $(shell echo "$(H)include " > Test$(device)_$(test).cpp); \ + $(shell echo "$(H)include " >> Test$(device)_$(test).cpp); \ ) \ ) \ ) @@ -82,8 +82,8 @@ KOKKOS_SUBVIEW_DEVICELIST := $(filter-out Cuda, $(KOKKOS_DEVICELIST)) tmp := $(foreach device, $(KOKKOS_SUBVIEW_DEVICELIST), \ tmp2 := $(foreach test, $(SUBVIEW_TESTS), \ $(if $(filter Test$(device)_$(test).cpp, $(shell ls Test$(device)_$(test).cpp 2>/dev/null)),, \ - $(shell echo "\#include " > Test$(device)_$(test).cpp); \ - $(shell echo "\#include " >> Test$(device)_$(test).cpp); \ + $(shell echo "$(H)include " > Test$(device)_$(test).cpp); \ + $(shell echo "$(H)include " >> Test$(device)_$(test).cpp); \ ) \ )\ ) @@ -91,8 +91,8 @@ tmp := $(foreach device, $(KOKKOS_SUBVIEW_DEVICELIST), \ ifeq ($(KOKKOS_INTERNAL_USE_CUDA), 1) tmp2 := $(foreach test, $(SUBVIEW_TESTS), \ $(if $(filter TestCuda_$(test).cpp, $(shell ls TestCuda_$(test).cpp 2>/dev/null)),,\ - $(shell echo "\#include " > TestCuda_$(test).cpp); \ - $(shell echo "\#include " >> TestCuda_$(test).cpp); \ + $(shell echo "$(H)include " > TestCuda_$(test).cpp); \ + $(shell echo "$(H)include " >> TestCuda_$(test).cpp); \ )\ ) @@ -100,8 +100,8 @@ ifeq ($(KOKKOS_INTERNAL_USE_CUDA), 1) tmp := $(foreach space, $(GPU_SPACES), \ tmp2 := $(foreach test, $(GPU_SPACE_TESTS), \ $(if $(filter Test$(space)_$(test).cpp, $(shell ls Test$(space)_$(test).cpp 2>/dev/null)),,\ - $(shell echo "\#include " > Test$(space)_$(test).cpp); \ - $(shell echo "\#include " >> Test$(space)_$(test).cpp); \ + $(shell echo "$(H)include " > Test$(space)_$(test).cpp); \ + $(shell echo "$(H)include " >> Test$(space)_$(test).cpp); \ )\ )\ ) @@ -277,8 +277,8 @@ ifeq ($(KOKKOS_INTERNAL_USE_HIP), 1) tmp := $(foreach space, $(GPU_SPACES), \ tmp2 := $(foreach test, $(GPU_SPACE_TESTS), \ $(if $(filter Test$(space)_$(test).cpp, $(shell ls Test$(space)_$(test).cpp 2>/dev/null)),,\ - $(shell echo "\#include " > Test$(space)_$(test).cpp); \ - $(shell echo "\#include " >> 
Test$(space)_$(test).cpp); \ + $(shell echo "$(H)include " > Test$(space)_$(test).cpp); \ + $(shell echo "$(H)include " >> Test$(space)_$(test).cpp); \ )\ )\ ) diff --git a/core/unit_test/TestAggregate.hpp b/core/unit_test/TestAggregate.hpp index 4f67b2eddce..f1316a7426a 100644 --- a/core/unit_test/TestAggregate.hpp +++ b/core/unit_test/TestAggregate.hpp @@ -29,35 +29,31 @@ void TestViewAggregate() { value_type>; static_assert( - std::is_same >::value, - ""); + std::is_same >::value); using a32_traits = Kokkos::ViewTraits; using flat_traits = Kokkos::ViewTraits; static_assert( - std::is_same >::value, - ""); + std::is_same >::value); static_assert( - std::is_same::value, ""); - static_assert(a32_traits::rank == 2, ""); - static_assert(a32_traits::rank_dynamic == 2, ""); + std::is_same::value); + static_assert(a32_traits::rank == 2); + static_assert(a32_traits::rank_dynamic == 2); - static_assert(std::is_void::value, ""); - static_assert(flat_traits::rank == 3, ""); - static_assert(flat_traits::rank_dynamic == 2, ""); - static_assert(flat_traits::dimension::N2 == 32, ""); + static_assert(std::is_void::value); + static_assert(flat_traits::rank == 3); + static_assert(flat_traits::rank_dynamic == 2); + static_assert(flat_traits::dimension::N2 == 32); using a32_type = Kokkos::View **, DeviceType>; using a32_flat_type = typename a32_type::array_type; - static_assert(std::is_same::value, - ""); - static_assert(std::is_same::value, - ""); - static_assert(a32_type::rank == 2, ""); - static_assert(a32_flat_type::rank == 3, ""); + static_assert(std::is_same::value); + static_assert(std::is_same::value); + static_assert(a32_type::rank == 2); + static_assert(a32_flat_type::rank == 3); a32_type x("test", 4, 5); a32_flat_type y(x); diff --git a/core/unit_test/TestArray.cpp b/core/unit_test/TestArray.cpp index d3bdc4f93f7..673d0036b71 100644 --- a/core/unit_test/TestArray.cpp +++ b/core/unit_test/TestArray.cpp @@ -49,4 +49,28 @@ KOKKOS_FUNCTION constexpr bool test_array_structured_binding_support() { static_assert(test_array_structured_binding_support()); +template +KOKKOS_FUNCTION constexpr bool is_equal(L const& l, R const& r) { + if (std::size(l) != std::size(r)) return false; + + for (size_t i = 0; i != std::size(l); ++i) { + if (l[i] != r[i]) return false; + } + + return true; +} + +// Disable ctad test for intel versions < 2021, see issue #6702 +#if !defined(KOKKOS_COMPILER_INTEL) || KOKKOS_COMPILER_INTEL >= 2021 +KOKKOS_FUNCTION constexpr bool test_array_ctad() { + constexpr int x = 10; + constexpr Kokkos::Array a{1, 2, 3, 5, x}; + constexpr Kokkos::Array b{1, 2, 3, 5, x}; + + return std::is_same_v && is_equal(a, b); +} + +static_assert(test_array_ctad()); +#endif + } // namespace diff --git a/core/unit_test/TestAtomicOperations.hpp b/core/unit_test/TestAtomicOperations.hpp index a5aebed4138..cd7ba47aa1e 100644 --- a/core/unit_test/TestAtomicOperations.hpp +++ b/core/unit_test/TestAtomicOperations.hpp @@ -368,6 +368,63 @@ bool atomic_op_test(T old_val, T update) { return result == 0; } +template +constexpr T relative_error_threshold = T(1.0e-15); + +template +bool atomic_op_test_rel(T old_val, T update) { + Kokkos::View op_data("op_data"); + Kokkos::deep_copy(op_data, old_val); + int result = 0; + Kokkos::parallel_reduce( + Kokkos::RangePolicy(0, 1), + KOKKOS_LAMBDA(int, int& local_result) { + auto fetch_result = + Op::atomic_op(&op_data(0), &op_data(1), &op_data(2), update); + T expected_val = Op::op(old_val, update); + Kokkos::memory_fence(); + if (expected_val == T(0)) { + if (fabs(op_data(0)) > 
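The new TestArray.cpp case above exercises class template argument deduction for Kokkos::Array: the element type and extent are deduced from the initializer, mirroring std::array. A minimal compile-time sketch of what the test checks:

    #include <Kokkos_Core.hpp>
    #include <type_traits>

    // CTAD deduces Kokkos::Array<int, 3> from the initializer list.
    constexpr Kokkos::Array a{1, 2, 3};
    static_assert(a.size() == 3);
    static_assert(std::is_same_v<std::remove_cv_t<decltype(a)>,
                                 Kokkos::Array<int, 3>>);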
relative_error_threshold) local_result += 1; + if (fabs(op_data(1)) > relative_error_threshold) local_result += 2; + if (fabs(op_data(2)) > relative_error_threshold) local_result += 4; + if (fetch_result.first != old_val) local_result += 8; + if (fabs(fetch_result.second) > relative_error_threshold) + local_result += 16; + } else { + if (fabs((op_data(0) - expected_val) / expected_val) > + relative_error_threshold) + local_result += 1; + if (fabs((op_data(1) - expected_val) / expected_val) > + relative_error_threshold) + local_result += 2; + if (fabs((op_data(2) - expected_val) / expected_val) > + relative_error_threshold) + local_result += 4; + if (fetch_result.first != old_val) local_result += 8; + if (fabs((fetch_result.second - expected_val) / expected_val) > + relative_error_threshold) + local_result += 16; + } + }, + result); + if ((result & 1) != 0) + printf("atomic_%s failed with type %s\n", Op::name(), typeid(T).name()); + if ((result & 2) != 0) + printf("atomic_fetch_%s failed with type %s\n", Op::name(), + typeid(T).name()); + if ((result & 4) != 0) + printf("atomic_%s_fetch failed with type %s\n", Op::name(), + typeid(T).name()); + if ((result & 8) != 0) + printf("atomic_fetch_%s did not return old value with type %s\n", + Op::name(), typeid(T).name()); + if ((result & 16) != 0) + printf("atomic_%s_fetch did not return updated value with type %s\n", + Op::name(), typeid(T).name()); + + return result == 0; +} + //--------------------------------------------------- //--------------atomic_test_control------------------ //--------------------------------------------------- @@ -395,6 +452,12 @@ bool AtomicOperationsTestIntegralType(int old_val_in, int update_in, int test) { case 9: return atomic_op_test(old_val, update); case 10: return atomic_op_test(old_val, update); +#if defined(KOKKOS_ENABLE_OPENACC) && defined(KOKKOS_COMPILER_NVHPC) + // FIXME_NVHPC: atomic-fetch-shift operation fails due to NVHPC OpenACC + // compiler bugs, which are reported to NVIDIA. + case 11: return true; + case 12: return true; +#else case 11: return update_in >= 0 ? atomic_op_test( old_val, update) @@ -403,6 +466,7 @@ bool AtomicOperationsTestIntegralType(int old_val_in, int update_in, int test) { return update_in >= 0 ? atomic_op_test( old_val, update) : true; +#endif case 13: return atomic_op_test(old_val, update); case 14: @@ -440,10 +504,20 @@ bool AtomicOperationsTestNonIntegralType(int old_val_in, int update_in, case 2: return atomic_op_test(old_val, update); case 3: return atomic_op_test(old_val, update); case 4: return atomic_op_test(old_val, update); +#if defined(KOKKOS_ENABLE_OPENACC) && defined(KOKKOS_COMPILER_NVHPC) + // NVHPC may use different internal precisions for the device and host + // atomic operations. Therefore, relative errors are used to compare the + // host results and device results. + case 5: + return update != 0 ? atomic_op_test_rel( + old_val, update) + : true; +#else case 5: return update != 0 ? atomic_op_test(old_val, update) : true; +#endif case 6: return atomic_op_test(old_val, update); } diff --git a/core/unit_test/TestAtomics.hpp b/core/unit_test/TestAtomics.hpp index 2b40f12d0a4..5f48e8c9746 100644 --- a/core/unit_test/TestAtomics.hpp +++ b/core/unit_test/TestAtomics.hpp @@ -498,7 +498,9 @@ TEST(TEST_CATEGORY, atomics) { ASSERT_TRUE((TestAtomic::Loop(100, 2))); ASSERT_TRUE((TestAtomic::Loop(100, 3))); -#ifndef KOKKOS_ENABLE_OPENMPTARGET + // FIXME_OPENMPTARGET + // FIXME_OPENACC: atomic operations on composite types are not supported. 
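The new atomic_op_test_rel above compares host and device results with a relative tolerance because NVHPC may evaluate the two sides at different internal precisions. The criterion it applies, factored into a standalone helper for clarity (names are illustrative, not from the patch):

    #include <cmath>

    template <class T>
    bool approx_equal(T actual, T expected, T rel_tol = T(1.0e-15)) {
      // Fall back to an absolute test when the expected value is zero,
      // since a relative error is undefined there.
      if (expected == T(0)) return std::fabs(actual) <= rel_tol;
      return std::fabs((actual - expected) / expected) <= rel_tol;
    }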
+#if !defined(KOKKOS_ENABLE_OPENMPTARGET) && !defined(KOKKOS_ENABLE_OPENACC) ASSERT_TRUE((TestAtomic::Loop, TEST_EXECSPACE>(1, 1))); ASSERT_TRUE((TestAtomic::Loop, TEST_EXECSPACE>(1, 2))); ASSERT_TRUE((TestAtomic::Loop, TEST_EXECSPACE>(1, 3))); diff --git a/core/unit_test/TestBitManipulationBuiltins.hpp b/core/unit_test/TestBitManipulationBuiltins.hpp index 092e7cff618..2f3bcfe817d 100644 --- a/core/unit_test/TestBitManipulationBuiltins.hpp +++ b/core/unit_test/TestBitManipulationBuiltins.hpp @@ -804,26 +804,26 @@ struct TestBitCastFunction { using Kokkos::bit_cast; if (bit_cast(123) != 123) { ++e; - KOKKOS_IMPL_DO_NOT_USE_PRINTF("failed check #1\n"); + Kokkos::printf("failed check #1\n"); } if (bit_cast(123u) != 123) { ++e; - KOKKOS_IMPL_DO_NOT_USE_PRINTF("failed check #2\n"); + Kokkos::printf("failed check #2\n"); } if (bit_cast(~0u) != ~0) { ++e; - KOKKOS_IMPL_DO_NOT_USE_PRINTF("failed check #3\n"); + Kokkos::printf("failed check #3\n"); } if constexpr (sizeof(int) == sizeof(float)) { if (!check(12.34f)) { ++e; - KOKKOS_IMPL_DO_NOT_USE_PRINTF("failed check #4\n"); + Kokkos::printf("failed check #4\n"); } } if constexpr (sizeof(unsigned long long) == sizeof(double)) { if (!check(123.456)) { ++e; - KOKKOS_IMPL_DO_NOT_USE_PRINTF("failed check #5\n"); + Kokkos::printf("failed check #5\n"); } } @@ -848,11 +848,11 @@ struct TestBitCastFunction { } if (!(bit_cast(arr) == arr)) { ++e; - KOKKOS_IMPL_DO_NOT_USE_PRINTF("failed check #6\n"); + Kokkos::printf("failed check #6\n"); } if (!(bit_cast(arr2) == arr2)) { ++e; - KOKKOS_IMPL_DO_NOT_USE_PRINTF("failed check #7\n"); + Kokkos::printf("failed check #7\n"); } } }; diff --git a/core/unit_test/TestComplex.hpp b/core/unit_test/TestComplex.hpp index bcae2e1d816..5501a35b7f0 100644 --- a/core/unit_test/TestComplex.hpp +++ b/core/unit_test/TestComplex.hpp @@ -451,17 +451,15 @@ TEST(TEST_CATEGORY, complex_issue_3867) { ASSERT_FLOAT_EQ(x.real(), y.real()); ASSERT_FLOAT_EQ(x.imag(), y.imag()); -#define CHECK_POW_COMPLEX_PROMOTION(ARGTYPE1, ARGTYPE2, RETURNTYPE) \ - static_assert( \ - std::is_same(), \ - std::declval()))>::value, \ - ""); \ - static_assert( \ - std::is_same(), \ - std::declval()))>::value, \ - ""); +#define CHECK_POW_COMPLEX_PROMOTION(ARGTYPE1, ARGTYPE2, RETURNTYPE) \ + static_assert( \ + std::is_same(), \ + std::declval()))>::value); \ + static_assert( \ + std::is_same(), \ + std::declval()))>::value); CHECK_POW_COMPLEX_PROMOTION(Kokkos::complex, long double, Kokkos::complex); diff --git a/core/unit_test/TestConcepts.hpp b/core/unit_test/TestConcepts.hpp index 476a8848325..b85867bf63a 100644 --- a/core/unit_test/TestConcepts.hpp +++ b/core/unit_test/TestConcepts.hpp @@ -22,42 +22,42 @@ using ExecutionSpace = TEST_EXECSPACE; using MemorySpace = typename ExecutionSpace::memory_space; using DeviceType = typename ExecutionSpace::device_type; -static_assert(Kokkos::is_execution_space{}, ""); -static_assert(Kokkos::is_execution_space{}, ""); -static_assert(!Kokkos::is_execution_space{}, ""); -static_assert(!Kokkos::is_execution_space{}, ""); - -static_assert(Kokkos::is_memory_space{}, ""); -static_assert(Kokkos::is_memory_space{}, ""); -static_assert(!Kokkos::is_memory_space{}, ""); -static_assert(!Kokkos::is_memory_space{}, ""); - -static_assert(Kokkos::is_device{}, ""); -static_assert(Kokkos::is_device{}, ""); -static_assert(!Kokkos::is_device{}, ""); -static_assert(!Kokkos::is_device{}, ""); - -static_assert(!Kokkos::is_device{}, ""); -static_assert(!Kokkos::is_device{}, ""); - -static_assert(Kokkos::is_space{}, ""); 
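The TestBitManipulationBuiltins hunk above swaps the internal KOKKOS_IMPL_DO_NOT_USE_PRINTF macro, whose SYCL definition was removed earlier in Kokkos_Setup_SYCL.hpp, for the public Kokkos::printf, which is callable from host code and device kernels alike. A minimal usage sketch:

    #include <Kokkos_Core.hpp>

    void print_from_kernel() {
      Kokkos::parallel_for(
          "print", 4,
          KOKKOS_LAMBDA(int i) { Kokkos::printf("iteration %d\n", i); });
    }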
-static_assert(Kokkos::is_space{}, ""); -static_assert(Kokkos::is_space{}, ""); -static_assert(Kokkos::is_space{}, ""); -static_assert(Kokkos::is_space{}, ""); -static_assert(Kokkos::is_space{}, ""); -static_assert(!Kokkos::is_space{}, ""); -static_assert(!Kokkos::is_space{}, ""); -static_assert(!Kokkos::is_space{}, ""); - -static_assert(Kokkos::is_execution_space_v, ""); -static_assert(!Kokkos::is_execution_space_v, ""); +static_assert(Kokkos::is_execution_space{}); +static_assert(Kokkos::is_execution_space{}); +static_assert(!Kokkos::is_execution_space{}); +static_assert(!Kokkos::is_execution_space{}); + +static_assert(Kokkos::is_memory_space{}); +static_assert(Kokkos::is_memory_space{}); +static_assert(!Kokkos::is_memory_space{}); +static_assert(!Kokkos::is_memory_space{}); + +static_assert(Kokkos::is_device{}); +static_assert(Kokkos::is_device{}); +static_assert(!Kokkos::is_device{}); +static_assert(!Kokkos::is_device{}); + +static_assert(!Kokkos::is_device{}); +static_assert(!Kokkos::is_device{}); + +static_assert(Kokkos::is_space{}); +static_assert(Kokkos::is_space{}); +static_assert(Kokkos::is_space{}); +static_assert(Kokkos::is_space{}); +static_assert(Kokkos::is_space{}); +static_assert(Kokkos::is_space{}); +static_assert(!Kokkos::is_space{}); +static_assert(!Kokkos::is_space{}); +static_assert(!Kokkos::is_space{}); + +static_assert(Kokkos::is_execution_space_v); +static_assert(!Kokkos::is_execution_space_v); static_assert( - std::is_same>{}, ""); -static_assert(std::is_same>{}, ""); -static_assert(std::is_same>{}, ""); -static_assert(std::is_same>{}, ""); + std::is_same>{}); +static_assert(std::is_same>{}); +static_assert(std::is_same>{}); +static_assert(std::is_same>{}); /*------------------------------------------------- begin test for team_handle concept diff --git a/core/unit_test/TestDefaultDeviceTypeInit.hpp b/core/unit_test/TestDefaultDeviceTypeInit.hpp deleted file mode 100644 index 929c91db4e0..00000000000 --- a/core/unit_test/TestDefaultDeviceTypeInit.hpp +++ /dev/null @@ -1,491 +0,0 @@ -//@HEADER -// ************************************************************************ -// -// Kokkos v. 4.0 -// Copyright (2022) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. -// -// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. -// See https://kokkos.org/LICENSE for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//@HEADER - -#include - -#include - -#ifdef KOKKOS_ENABLE_OPENMP -#include -#endif -#include -#if !defined(KOKKOS_ENABLE_CUDA) || defined(__CUDACC__) - -namespace Test { - -namespace Impl { - -std::set delete_these; -void cleanup_memory() { - for (auto x : delete_these) { - delete[] x; - } -} - -char** init_kokkos_args(bool do_threads, bool do_numa, bool do_device, - bool do_other, bool do_tune, int& nargs, - Kokkos::InitArguments& init_args) { - nargs = (do_threads ? 1 : 0) + (do_numa ? 1 : 0) + (do_device ? 1 : 0) + - (do_other ? 4 : 0) + (do_tune ? 1 : 0); - - char** args_kokkos = new char*[nargs]; - const int max_args_size = 45; - for (int i = 0; i < nargs; i++) { - args_kokkos[i] = new char[max_args_size]; - delete_these.insert(args_kokkos[i]); - } - - int threads_idx = do_other ? 1 : 0; - int numa_idx = (do_other ? 3 : 0) + (do_threads ? 1 : 0); - int device_idx = - (do_other ? 3 : 0) + (do_threads ? 1 : 0) + (do_numa ? 1 : 0); - int tune_idx = (do_other ? 
4 : 0) + (do_threads ? 1 : 0) + (do_numa ? 1 : 0) + - (do_device ? 1 : 0); - - if (do_threads) { - int nthreads = 3; - -#ifdef KOKKOS_ENABLE_OPENMP - if (omp_get_max_threads() < nthreads) { - nthreads = omp_get_max_threads(); - } -#elif defined(KOKKOS_ENABLE_HPX) - const int concurrency = std::thread::hardware_concurrency(); - if (concurrency < nthreads) { - nthreads = concurrency; - } -#endif - - if (Kokkos::hwloc::available()) { - if (Kokkos::hwloc::get_available_threads_per_core() < - static_cast(nthreads)) - nthreads = Kokkos::hwloc::get_available_threads_per_core() * - Kokkos::hwloc::get_available_numa_count(); - } - -#ifdef KOKKOS_ENABLE_SERIAL - if (std::is_same::value || - std::is_same::value) { - nthreads = 1; - } -#endif - - init_args.num_threads = nthreads; - snprintf(args_kokkos[threads_idx], max_args_size, "--threads=%i", nthreads); - } - - if (do_numa) { - int numa = 1; - if (Kokkos::hwloc::available()) { - numa = Kokkos::hwloc::get_available_numa_count(); - } - -#ifdef KOKKOS_ENABLE_SERIAL - if (std::is_same::value || - std::is_same::value) { - numa = 1; - } -#endif - - init_args.num_numa = numa; - snprintf(args_kokkos[numa_idx], max_args_size, "--numa=%i", numa); - } - - if (do_device) { - init_args.device_id = 0; - snprintf(args_kokkos[device_idx], max_args_size, "--device-id=%i", 0); - } - - if (do_other) { - snprintf(args_kokkos[0], max_args_size, "--dummyarg=1"); - snprintf(args_kokkos[threads_idx + (do_threads ? 1 : 0)], max_args_size, - "--dummy2arg"); - snprintf(args_kokkos[threads_idx + (do_threads ? 1 : 0) + 1], max_args_size, - "dummy3arg"); - snprintf(args_kokkos[device_idx + (do_device ? 1 : 0)], max_args_size, - "dummy4arg=1"); - } - - if (do_tune) { - init_args.tune_internals = true; - snprintf(args_kokkos[tune_idx], max_args_size, "--kokkos-tune-internals"); - } - - return args_kokkos; -} - -Kokkos::InitArguments init_initstruct(bool do_threads, bool do_numa, - bool do_device, bool do_tune) { - Kokkos::InitArguments args; - - if (do_threads) { - int nthreads = 3; - -#ifdef KOKKOS_ENABLE_OPENMP - if (omp_get_max_threads() < nthreads) { - nthreads = omp_get_max_threads(); - } -#elif defined(KOKKOS_ENABLE_HPX) - const int concurrency = std::thread::hardware_concurrency(); - if (concurrency < nthreads) { - nthreads = concurrency; - } -#endif - - if (Kokkos::hwloc::available()) { - if (Kokkos::hwloc::get_available_threads_per_core() < - static_cast(nthreads)) { - nthreads = Kokkos::hwloc::get_available_threads_per_core() * - Kokkos::hwloc::get_available_numa_count(); - } - } - -#ifdef KOKKOS_ENABLE_SERIAL - if (std::is_same::value || - std::is_same::value) { - nthreads = 1; - } -#endif - - args.num_threads = nthreads; - } - - if (do_numa) { - int numa = 1; - if (Kokkos::hwloc::available()) { - numa = Kokkos::hwloc::get_available_numa_count(); - } - -#ifdef KOKKOS_ENABLE_SERIAL - if (std::is_same::value || - std::is_same::value) { - numa = 1; - } -#endif - - args.num_numa = numa; - } - - if (do_device) { - args.device_id = 0; - } - - if (do_tune) { - args.tune_internals = true; - } - - return args; -} - -void check_correct_initialization(const Kokkos::InitArguments& argstruct) { - ASSERT_EQ(Kokkos::DefaultExecutionSpace::impl_is_initialized(), 1); - ASSERT_EQ(Kokkos::HostSpace::execution_space::impl_is_initialized(), 1); - - // Figure out the number of threads the HostSpace ExecutionSpace should have - // initialized to. 
- int expected_nthreads = argstruct.num_threads; - -#ifdef KOKKOS_ENABLE_OPENMP - if (std::is_same::value) { - // use openmp default num threads - if (expected_nthreads < 0 || - (expected_nthreads == 0 && !Kokkos::hwloc::available())) { - expected_nthreads = omp_get_max_threads(); - } - // use hwloc if available - else if (expected_nthreads == 0 && Kokkos::hwloc::available()) { - expected_nthreads = Kokkos::hwloc::get_available_numa_count() * - Kokkos::hwloc::get_available_cores_per_numa() * - Kokkos::hwloc::get_available_threads_per_core(); - } - } -#endif - - if (expected_nthreads < 1) { - if (Kokkos::hwloc::available()) { - expected_nthreads = Kokkos::hwloc::get_available_numa_count() * - Kokkos::hwloc::get_available_cores_per_numa() * - Kokkos::hwloc::get_available_threads_per_core(); - } else { - expected_nthreads = 1; - } - -#ifdef KOKKOS_ENABLE_SERIAL - if (std::is_same::value || - std::is_same::value) { - expected_nthreads = 1; - } -#endif - -#ifdef KOKKOS_ENABLE_HPX - // HPX uses all cores on machine by default. Skip this test. - if (std::is_same::value || - std::is_same::value) { - return; - } -#endif - } - - int expected_numa = argstruct.num_numa; - - if (expected_numa < 1) { - if (Kokkos::hwloc::available()) { - expected_numa = Kokkos::hwloc::get_available_numa_count(); - } else { - expected_numa = 1; - } - -#ifdef KOKKOS_ENABLE_SERIAL - if (std::is_same::value || - std::is_same::value) - expected_numa = 1; -#endif - } - - ASSERT_EQ(Kokkos::HostSpace::execution_space().impl_thread_pool_size(), - expected_nthreads); - -#ifdef KOKKOS_ENABLE_CUDA - if (std::is_same::value) { - int device; - cudaGetDevice(&device); - - int expected_device = argstruct.device_id; - if (argstruct.device_id < 0) { - expected_device = Kokkos::Cuda().cuda_device(); - } - - ASSERT_EQ(expected_device, device); - } -#endif - ASSERT_EQ(argstruct.tune_internals, Kokkos::tune_internals()); -} - -// TODO: Add check whether correct number of threads are actually started. 
-void test_no_arguments() { - Kokkos::initialize(); - check_correct_initialization(Kokkos::InitArguments()); - Kokkos::finalize(); -} - -void test_commandline_args(int nargs, char** args, - const Kokkos::InitArguments& argstruct) { - Kokkos::initialize(nargs, args); - check_correct_initialization(argstruct); - Kokkos::finalize(); -} - -void test_initstruct_args(const Kokkos::InitArguments& args) { - Kokkos::initialize(args); - check_correct_initialization(args); - Kokkos::finalize(); -} - -} // namespace Impl - -#ifdef KOKKOS_DEFAULTDEVICETYPE_INIT_TEST_01 -TEST(defaultdevicetypeinit, no_args) { Impl::test_no_arguments(); } -#endif - -#ifdef KOKKOS_DEFAULTDEVICETYPE_INIT_TEST_02 -TEST(defaultdevicetypeinit, commandline_args_empty) { - Kokkos::InitArguments argstruct; - int nargs = 0; - char** args = Impl::init_kokkos_args(false, false, false, false, false, nargs, - argstruct); - Impl::test_commandline_args(nargs, args, argstruct); - - Impl::cleanup_memory(); - delete[] args; -} -#endif - -#ifdef KOKKOS_DEFAULTDEVICETYPE_INIT_TEST_03 -TEST(defaultdevicetypeinit, commandline_args_other) { - Kokkos::InitArguments argstruct; - int nargs = 0; - char** args = Impl::init_kokkos_args(false, false, false, true, false, nargs, - argstruct); - Impl::test_commandline_args(nargs, args, argstruct); - - Impl::cleanup_memory(); - delete[] args; -} -#endif - -#ifdef KOKKOS_DEFAULTDEVICETYPE_INIT_TEST_04 -TEST(defaultdevicetypeinit, commandline_args_nthreads) { - Kokkos::InitArguments argstruct; - int nargs = 0; - char** args = Impl::init_kokkos_args(true, false, false, false, false, nargs, - argstruct); - Impl::test_commandline_args(nargs, args, argstruct); - - Impl::cleanup_memory(); - delete[] args; -} -#endif - -#ifdef KOKKOS_DEFAULTDEVICETYPE_INIT_TEST_05 -TEST(defaultdevicetypeinit, commandline_args_nthreads_numa) { - Kokkos::InitArguments argstruct; - int nargs = 0; - char** args = - Impl::init_kokkos_args(true, true, false, false, false, nargs, argstruct); - Impl::test_commandline_args(nargs, args, argstruct); - - Impl::cleanup_memory(); - - delete[] args; -} -#endif - -#ifdef KOKKOS_DEFAULTDEVICETYPE_INIT_TEST_06 -TEST(defaultdevicetypeinit, commandline_args_nthreads_numa_device) { - Kokkos::InitArguments argstruct; - int nargs = 0; - char** args = - Impl::init_kokkos_args(true, true, true, false, false, nargs, argstruct); - Impl::test_commandline_args(nargs, args, argstruct); - - Impl::cleanup_memory(); - - delete[] args; -} -#endif - -#ifdef KOKKOS_DEFAULTDEVICETYPE_INIT_TEST_07 -TEST(defaultdevicetypeinit, commandline_args_nthreads_device) { - Kokkos::InitArguments argstruct; - int nargs = 0; - char** args = - Impl::init_kokkos_args(true, false, true, false, false, nargs, argstruct); - Impl::test_commandline_args(nargs, args, argstruct); - - Impl::cleanup_memory(); - delete[] args; -} -#endif - -#ifdef KOKKOS_DEFAULTDEVICETYPE_INIT_TEST_08 -TEST(defaultdevicetypeinit, commandline_args_numa_device) { - Kokkos::InitArguments argstruct; - int nargs = 0; - char** args = - Impl::init_kokkos_args(false, true, true, false, false, nargs, argstruct); - Impl::test_commandline_args(nargs, args, argstruct); - - Impl::cleanup_memory(); - delete[] args; -} -#endif - -#ifdef KOKKOS_DEFAULTDEVICETYPE_INIT_TEST_09 -TEST(defaultdevicetypeinit, commandline_args_device) { - Kokkos::InitArguments argstruct; - int nargs = 0; - char** args = Impl::init_kokkos_args(false, false, true, false, false, nargs, - argstruct); - Impl::test_commandline_args(nargs, args, argstruct); - - Impl::cleanup_memory(); - delete[] args; -} 
-#endif
-
-#ifdef KOKKOS_DEFAULTDEVICETYPE_INIT_TEST_10
-TEST(defaultdevicetypeinit, commandline_args_nthreads_numa_device_other) {
-  Kokkos::InitArguments argstruct;
-  int nargs = 0;
-  char** args =
-      Impl::init_kokkos_args(true, true, true, true, false, nargs, argstruct);
-  Impl::test_commandline_args(nargs, args, argstruct);
-  Impl::cleanup_memory();
-  delete[] args;
-}
-#endif
-
-#ifdef KOKKOS_DEFAULTDEVICETYPE_INIT_TEST_11
-TEST(defaultdevicetypeinit, commandline_args_nthreads_numa_device_other_tune) {
-  Kokkos::InitArguments argstruct;
-  int nargs = 0;
-  char** args =
-      Impl::init_kokkos_args(true, true, true, true, true, nargs, argstruct);
-  Impl::test_commandline_args(nargs, args, argstruct);
-  Impl::cleanup_memory();
-  delete[] args;
-}
-#endif
-
-#ifdef KOKKOS_DEFAULTDEVICETYPE_INIT_TEST_12
-TEST(defaultdevicetypeinit, initstruct_default) {
-  Kokkos::InitArguments args;
-  Impl::test_initstruct_args(args);
-}
-#endif
-
-#ifdef KOKKOS_DEFAULTDEVICETYPE_INIT_TEST_13
-TEST(defaultdevicetypeinit, initstruct_nthreads) {
-  Kokkos::InitArguments args = Impl::init_initstruct(true, false, false, false);
-  Impl::test_initstruct_args(args);
-}
-#endif
-
-#ifdef KOKKOS_DEFAULTDEVICETYPE_INIT_TEST_14
-TEST(defaultdevicetypeinit, initstruct_nthreads_numa) {
-  Kokkos::InitArguments args = Impl::init_initstruct(true, true, false, false);
-  Impl::test_initstruct_args(args);
-}
-#endif
-
-#ifdef KOKKOS_DEFAULTDEVICETYPE_INIT_TEST_15
-TEST(defaultdevicetypeinit, initstruct_device) {
-  Kokkos::InitArguments args = Impl::init_initstruct(false, false, true, false);
-  Impl::test_initstruct_args(args);
-}
-#endif
-
-#ifdef KOKKOS_DEFAULTDEVICETYPE_INIT_TEST_16
-TEST(defaultdevicetypeinit, initstruct_nthreads_device) {
-  Kokkos::InitArguments args = Impl::init_initstruct(true, false, true, false);
-  Impl::test_initstruct_args(args);
-}
-#endif
-
-#ifdef KOKKOS_DEFAULTDEVICETYPE_INIT_TEST_17
-TEST(defaultdevicetypeinit, initstruct_nthreads_numa_device) {
-  Kokkos::InitArguments args = Impl::init_initstruct(true, true, true, false);
-  Impl::test_initstruct_args(args);
-}
-#endif
-
-#ifdef KOKKOS_DEFAULTDEVICETYPE_INIT_TEST_18
-TEST(defaultdevicetypeinit, initstruct_nthreads_numa_device_tune) {
-  Kokkos::InitArguments args = Impl::init_initstruct(true, true, true, true);
-  Impl::test_initstruct_args(args);
-}
-#endif
-
-}  // namespace Test
-
-#endif
diff --git a/core/unit_test/TestDeviceAndThreads.py b/core/unit_test/TestDeviceAndThreads.py
index 1d3ff8eea7e..63d26ad41a4 100644
--- a/core/unit_test/TestDeviceAndThreads.py
+++ b/core/unit_test/TestDeviceAndThreads.py
@@ -17,6 +17,8 @@
 
 import unittest
 import subprocess
+import platform
+import os
 
 PREFIX = "$"
 EXECUTABLE = "$"
@@ -30,7 +32,22 @@ def GetFlag(flag, *extra_args):
     return int(p.stdout)
 
 def GetNumThreads(max_threads):
-    for x in [1, 2, 3, 5, 7]:
+    args = []
+    name = platform.system()
+    if name == 'Darwin':
+        args = ['sysctl', '-n', 'hw.physicalcpu_max']
+    elif name == 'Linux':
+        args = ['nproc', '--all']
+    else:
+        args = ['wmic', 'cpu', 'get', 'NumberOfCores']
+
+    result = subprocess.run(args, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+    output = result.stdout.decode('utf-8')
+    phys_cores_count = int(output)
+    looplist = [1] + [i*phys_cores_count for i in [1,2,3,4,5,6,7]] \
+        if GetFlag("hwloc_enabled") else [1,2,3,4,5]
+
+    for x in looplist:
         if x >= max_threads:
             break
         yield x
@@ -48,13 +65,25 @@ def test_num_threads(self):
                 "num_threads",
                 "--kokkos-num-threads={}".format(num_threads)))
 
+    def test_num_devices(self):
+        if "KOKKOS_VISIBLE_DEVICES" in os.environ:
+            self.skipTest("KOKKOS_VISIBLE_DEVICES environment variable is set")
+        num_devices = GetFlag("num_devices")
+        self.assertNotEqual(num_devices, 0)
+        if num_devices == -1:
+            self.skipTest("no device backend enabled")
+        self.assertGreaterEqual(num_devices, 1)
+
     def test_device_id(self):
-        device_count = GetFlag("device_count")
-        if device_count == 0:
-            self.skipTest("no device detected")
+        if "KOKKOS_VISIBLE_DEVICES" in os.environ:
+            self.skipTest("KOKKOS_VISIBLE_DEVICES environment variable is set")
+        num_devices = GetFlag("num_devices")
+        if num_devices == -1:
+            self.assertEqual(-1, GetFlag("device_id"))
+            self.skipTest("no device backend enabled")
         # by default use the first GPU available for execution
         self.assertEqual(0, GetFlag("device_id"))
-        for device_id in range(device_count):
+        for device_id in range(num_devices):
             self.assertEqual(
                 device_id,
                 GetFlag(
diff --git a/core/unit_test/TestExecutionSpace.hpp b/core/unit_test/TestExecutionSpace.hpp
index 6f0f159c174..983a5975afd 100644
--- a/core/unit_test/TestExecutionSpace.hpp
+++ b/core/unit_test/TestExecutionSpace.hpp
@@ -25,13 +25,7 @@ struct CheckClassWithExecutionSpaceAsDataMemberIsCopyable {
   Kokkos::DefaultExecutionSpace device;
   Kokkos::DefaultHostExecutionSpace host;
 
-  KOKKOS_FUNCTION void operator()(int, int& e) const {
-    // not actually doing anything useful, mostly checking that
-    // ExecutionSpace::in_parallel() is callable
-    if (static_cast(device.in_parallel()) < 0) {
-      ++e;
-    }
-  }
+  KOKKOS_FUNCTION void operator()(int i, int& e) const { e += i; }
 
   CheckClassWithExecutionSpaceAsDataMemberIsCopyable() {
     int errors;
diff --git a/core/unit_test/TestFunctorAnalysis.hpp b/core/unit_test/TestFunctorAnalysis.hpp
index c024526111b..e58324144e4 100644
--- a/core/unit_test/TestFunctorAnalysis.hpp
+++ b/core/unit_test/TestFunctorAnalysis.hpp
@@ -59,16 +59,15 @@ void test_functor_analysis() {
 
   using R01 = typename A01::Reducer;
 
-  static_assert(std::is_void::value, "");
-  static_assert(std::is_void::value, "");
-  static_assert(std::is_void::value, "");
-  static_assert(std::is_same::value,
-                "");
-
-  static_assert(!A01::has_join_member_function, "");
-  static_assert(!A01::has_init_member_function, "");
-  static_assert(!A01::has_final_member_function, "");
-  static_assert(A01::StaticValueSize == 0, "");
+  static_assert(std::is_void::value);
+  static_assert(std::is_void::value);
+  static_assert(std::is_void::value);
+  static_assert(std::is_same::value);
+
+  static_assert(!A01::has_join_member_function);
+  static_assert(!A01::has_init_member_function);
+  static_assert(!A01::has_final_member_function);
+  static_assert(A01::StaticValueSize == 0);
 
   ASSERT_EQ(R01(c01).length(), 0);
 
   //------------------------------
@@ -78,16 +77,15 @@ void test_functor_analysis() {
       Kokkos::RangePolicy, decltype(c02), void>;
   using R02 = typename A02::Reducer;
 
-  static_assert(std::is_same::value, "");
-  static_assert(std::is_same::value, "");
-  static_assert(std::is_same::value, "");
-  static_assert(std::is_same::value,
-                "");
+  static_assert(std::is_same::value);
+  static_assert(std::is_same::value);
+  static_assert(std::is_same::value);
+  static_assert(std::is_same::value);
 
-  static_assert(!A02::has_join_member_function, "");
-  static_assert(!A02::has_init_member_function, "");
-  static_assert(!A02::has_final_member_function, "");
-  static_assert(A02::StaticValueSize == sizeof(double), "");
+  static_assert(!A02::has_join_member_function);
+  static_assert(!A02::has_init_member_function);
+  static_assert(!A02::has_final_member_function);
+
static_assert(A02::StaticValueSize == sizeof(double)); ASSERT_EQ(R02(c02).length(), 1); //------------------------------ @@ -99,23 +97,19 @@ void test_functor_analysis() { using R03 = typename A03::Reducer; static_assert(std::is_same::value, - ""); + TestFunctorAnalysis_03::value_type>::value); static_assert(std::is_same::value, - ""); + TestFunctorAnalysis_03::value_type*>::value); static_assert(std::is_same::value, - ""); + TestFunctorAnalysis_03::value_type&>::value); static_assert( - std::is_same::value, - ""); + std::is_same::value); - static_assert(A03::has_join_member_function, ""); - static_assert(A03::has_init_member_function, ""); - static_assert(!A03::has_final_member_function, ""); - static_assert( - A03::StaticValueSize == sizeof(TestFunctorAnalysis_03::value_type), ""); + static_assert(A03::has_join_member_function); + static_assert(A03::has_init_member_function); + static_assert(!A03::has_final_member_function); + static_assert(A03::StaticValueSize == + sizeof(TestFunctorAnalysis_03::value_type)); ASSERT_EQ(R03(c03).length(), 1); //------------------------------ diff --git a/core/unit_test/TestHalfOperators.hpp b/core/unit_test/TestHalfOperators.hpp index 752e3b50816..c69cdd57034 100644 --- a/core/unit_test/TestHalfOperators.hpp +++ b/core/unit_test/TestHalfOperators.hpp @@ -268,96 +268,6 @@ enum OP_TESTS { N_OP_TESTS }; -// volatile-qualified parameter type 'volatile half_type' is deprecated -#if !defined(KOKKOS_ENABLE_CXX20) && !defined(KOKKOS_ENABLE_CXX23) -template -struct Functor_TestHalfVolatileOperators { - volatile half_type h_lhs, h_rhs; - view_type actual_lhs, expected_lhs; - double d_lhs, d_rhs; - Functor_TestHalfVolatileOperators(volatile half_type lhs = half_type(0), - volatile half_type rhs = half_type(0)) - : h_lhs(lhs), h_rhs(rhs) { - actual_lhs = view_type("actual_lhs", N_OP_TESTS); - expected_lhs = view_type("expected_lhs", N_OP_TESTS); - half_type nv_tmp; - nv_tmp = h_lhs; - d_lhs = static_cast(nv_tmp); - nv_tmp = h_rhs; - d_rhs = static_cast(nv_tmp); - if (std::is_same::value) { - auto run_on_host = *this; - run_on_host(0); - } else { - Kokkos::parallel_for("Test::Functor_TestHalfVolatileOperators", - Kokkos::RangePolicy(0, 1), *this); - } - } - - KOKKOS_FUNCTION - void operator()(int) const { - volatile half_type tmp_lhs; - half_type nv_tmp; - - // Initialze output views to catch missing test invocations - for (int i = 0; i < N_OP_TESTS; ++i) { - actual_lhs(i) = 1; - expected_lhs(i) = -1; - } - - nv_tmp = h_lhs; - actual_lhs(ASSIGN) = static_cast(nv_tmp); - expected_lhs(ASSIGN) = d_lhs; - - actual_lhs(LT_H_H) = h_lhs < h_rhs; - expected_lhs(LT_H_H) = d_lhs < d_rhs; - - actual_lhs(LE_H_H) = h_lhs <= h_rhs; - expected_lhs(LE_H_H) = d_lhs <= d_rhs; - - actual_lhs(NEQ) = h_lhs != h_rhs; - expected_lhs(NEQ) = d_lhs != d_rhs; - - actual_lhs(GT_H_H) = h_lhs > h_rhs; - expected_lhs(GT_H_H) = d_lhs > d_rhs; - - actual_lhs(GE_H_H) = h_lhs >= h_rhs; - expected_lhs(GE_H_H) = d_lhs >= d_rhs; - - actual_lhs(EQ) = h_lhs == h_rhs; - expected_lhs(EQ) = d_lhs == d_rhs; - - tmp_lhs = h_lhs; - tmp_lhs += h_rhs; - nv_tmp = tmp_lhs; - actual_lhs(CADD_H_H) = static_cast(nv_tmp); - expected_lhs(CADD_H_H) = d_lhs; - expected_lhs(CADD_H_H) += d_rhs; - - tmp_lhs = h_lhs; - tmp_lhs -= h_rhs; - nv_tmp = tmp_lhs; - actual_lhs(CSUB_H_H) = static_cast(nv_tmp); - expected_lhs(CSUB_H_H) = d_lhs; - expected_lhs(CSUB_H_H) -= d_rhs; - - tmp_lhs = h_lhs; - tmp_lhs *= h_rhs; - nv_tmp = tmp_lhs; - actual_lhs(CMUL_H_H) = static_cast(nv_tmp); - expected_lhs(CMUL_H_H) = d_lhs; - 
expected_lhs(CMUL_H_H) *= d_rhs; - - tmp_lhs = h_lhs; - tmp_lhs /= h_rhs; - nv_tmp = tmp_lhs; - actual_lhs(CDIV_H_H) = static_cast(nv_tmp); - expected_lhs(CDIV_H_H) = d_lhs; - expected_lhs(CDIV_H_H) /= d_rhs; - } -}; -#endif - template struct Functor_TestHalfOperators { half_type h_lhs, h_rhs; @@ -995,33 +905,6 @@ void __test_half_operators(half_type h_lhs, half_type h_rhs) { static_cast(epsilon)); } -// volatile-qualified parameter type 'volatile half_type' is deprecated -#if !defined(KOKKOS_ENABLE_CXX20) && !defined(KOKKOS_ENABLE_CXX23) - // Test partial volatile support - volatile half_type _h_lhs = h_lhs; - volatile half_type _h_rhs = h_rhs; - Functor_TestHalfVolatileOperators f_volatile_device( - _h_lhs, _h_rhs); - Functor_TestHalfVolatileOperators f_volatile_host( - _h_lhs, _h_rhs); - - ExecutionSpace().fence(); - Kokkos::deep_copy(f_device_actual_lhs, f_device.actual_lhs); - Kokkos::deep_copy(f_device_expected_lhs, f_device.expected_lhs); - for (int op_test = 0; op_test < N_OP_TESTS; op_test++) { - // printf("op_test = %d\n", op_test); - if (op_test == ASSIGN || op_test == LT_H_H || op_test == LE_H_H || - op_test == NEQ || op_test == EQ || op_test == GT_H_H || - op_test == GE_H_H || op_test == CADD_H_H || op_test == CSUB_H_H || - op_test == CMUL_H_H || op_test == CDIV_H_H) { - ASSERT_NEAR(f_device_actual_lhs(op_test), f_device_expected_lhs(op_test), - static_cast(epsilon)); - ASSERT_NEAR(f_host.actual_lhs(op_test), f_host.expected_lhs(op_test), - static_cast(epsilon)); - } - } -#endif - // is_trivially_copyable is false with the addition of explicit // copy constructors that are required for supporting reductions // ASSERT_TRUE(std::is_trivially_copyable::value); diff --git a/core/unit_test/TestHostSharedPtrAccessOnDevice.hpp b/core/unit_test/TestHostSharedPtrAccessOnDevice.hpp index 3ee2ff52051..467b9ad157f 100644 --- a/core/unit_test/TestHostSharedPtrAccessOnDevice.hpp +++ b/core/unit_test/TestHostSharedPtrAccessOnDevice.hpp @@ -37,7 +37,7 @@ template struct CheckAccessStoredPointerAndDereferenceOnDevice { SmartPtr m_device_ptr; using ElementType = typename SmartPtr::element_type; - static_assert(std::is_same::value, ""); + static_assert(std::is_same::value); CheckAccessStoredPointerAndDereferenceOnDevice(SmartPtr device_ptr) : m_device_ptr(device_ptr) { diff --git a/core/unit_test/TestInitializationSettings.cpp b/core/unit_test/TestInitializationSettings.cpp index f5be0e47aab..40dc3f11df3 100644 --- a/core/unit_test/TestInitializationSettings.cpp +++ b/core/unit_test/TestInitializationSettings.cpp @@ -20,30 +20,6 @@ namespace { -#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_3 -void take_initialization_settings(Kokkos::InitializationSettings const&) {} - -TEST(defaultdevicetype, - init_arguments_implicit_conversion_to_initialization_settings) { - Kokkos::InitArguments arguments; - take_initialization_settings(arguments); // check that conversion is implicit - arguments.device_id = 1; - arguments.tune_internals = true; - Kokkos::InitializationSettings settings{arguments}; - EXPECT_FALSE(settings.has_num_threads()); - EXPECT_TRUE(settings.has_device_id()); - EXPECT_EQ(settings.get_device_id(), 1); - EXPECT_FALSE(settings.has_num_devices()); - EXPECT_FALSE(settings.has_skip_device()); - EXPECT_FALSE(settings.has_disable_warnings()); - EXPECT_TRUE(settings.has_tune_internals()); - EXPECT_TRUE(settings.get_tune_internals()); - EXPECT_FALSE(settings.has_tools_help()); - EXPECT_FALSE(settings.has_tools_libs()); - EXPECT_FALSE(settings.has_tools_args()); -} -#endif - TEST(defaultdevicetype, 
initialization_settings) { auto const settings = Kokkos::InitializationSettings() .set_num_threads(255) @@ -52,8 +28,6 @@ TEST(defaultdevicetype, initialization_settings) { EXPECT_TRUE(settings.has_num_threads()); EXPECT_EQ(settings.get_num_threads(), 255); EXPECT_FALSE(settings.has_device_id()); - EXPECT_FALSE(settings.has_num_devices()); - EXPECT_FALSE(settings.has_skip_device()); EXPECT_TRUE(settings.has_disable_warnings()); EXPECT_FALSE(settings.get_disable_warnings()); EXPECT_FALSE(settings.has_tune_internals()); @@ -75,8 +49,6 @@ constexpr bool test_initialization_settings_getter() { TYPE>::value); CHECK_INITIALIZATION_SETTINGS_GETTER_RETURN_TYPE(num_threads, int); CHECK_INITIALIZATION_SETTINGS_GETTER_RETURN_TYPE(device_id, int); - CHECK_INITIALIZATION_SETTINGS_GETTER_RETURN_TYPE(num_devices, int); - CHECK_INITIALIZATION_SETTINGS_GETTER_RETURN_TYPE(skip_device, int); CHECK_INITIALIZATION_SETTINGS_GETTER_RETURN_TYPE(disable_warnings, bool); CHECK_INITIALIZATION_SETTINGS_GETTER_RETURN_TYPE(tune_internals, bool); CHECK_INITIALIZATION_SETTINGS_GETTER_RETURN_TYPE(tools_help, bool); diff --git a/core/unit_test/TestJoinBackwardCompatibility.hpp b/core/unit_test/TestJoinBackwardCompatibility.hpp index 24cf52aa709..efe4a2307a8 100644 --- a/core/unit_test/TestJoinBackwardCompatibility.hpp +++ b/core/unit_test/TestJoinBackwardCompatibility.hpp @@ -36,9 +36,8 @@ KOKKOS_FUNCTION constexpr MyErrorCode operator|(MyErrorCode lhs, } static_assert((no_error | error_operator_plus_equal_volatile) == - error_operator_plus_equal_volatile, - ""); -static_assert((error_join_volatile | error_operator_plus_equal) == 0b101, ""); + error_operator_plus_equal_volatile); +static_assert((error_join_volatile | error_operator_plus_equal) == 0b101); struct MyJoinBackCompatValueType { MyErrorCode err = no_error; diff --git a/core/unit_test/TestMDRangePolicyCTAD.cpp b/core/unit_test/TestMDRangePolicyCTAD.cpp new file mode 100644 index 00000000000..b2c3d021c35 --- /dev/null +++ b/core/unit_test/TestMDRangePolicyCTAD.cpp @@ -0,0 +1,138 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#include + +namespace { + +struct TestMDRangePolicyCTAD { + template + static void maybe_unused(Ts&&...) 
{} + + struct SomeExecutionSpace { + using execution_space = SomeExecutionSpace; + using size_type = size_t; + }; + static_assert(Kokkos::is_execution_space_v); + + struct ImplicitlyConvertibleToDefaultExecutionSpace { + [[maybe_unused]] operator Kokkos::DefaultExecutionSpace() const { + return Kokkos::DefaultExecutionSpace(); + } + }; + static_assert(!Kokkos::is_execution_space_v< + ImplicitlyConvertibleToDefaultExecutionSpace>); + + [[maybe_unused]] static inline Kokkos::DefaultExecutionSpace des; + [[maybe_unused]] static inline ImplicitlyConvertibleToDefaultExecutionSpace + notEs; + [[maybe_unused]] static inline SomeExecutionSpace ses; + + [[maybe_unused]] static inline int t[5]; + [[maybe_unused]] static inline int64_t tt[5]; + [[maybe_unused]] static inline Kokkos::Array a; + [[maybe_unused]] static inline Kokkos::Array aa; + [[maybe_unused]] static inline int64_t i64; + + // Workaround for nvc++ (CUDA-11.7-NVHPC) ignoring [[maybe_unused]] on + // ImplicitlyConvertibleToDefaultExecutionSpace::operator + // Kokkos::DefaultExecutionSpace() const + [[maybe_unused]] static inline Kokkos::DefaultExecutionSpace notEsToDes = + notEs; + + // Workaround for HIP-ROCm-5.2 "declared but never referenced" + TestMDRangePolicyCTAD() { + maybe_unused(des, notEs, ses, t, tt, a, aa, notEsToDes, i64); + } + + // MDRangePolicy with C array parameters + + static_assert( + std::is_same_v>, + decltype(Kokkos::MDRangePolicy(t, t))>); + static_assert( + std::is_same_v>, + decltype(Kokkos::MDRangePolicy(t, t, tt))>); + static_assert( + std::is_same_v>, + decltype(Kokkos::MDRangePolicy(des, t, tt))>); + static_assert( + std::is_same_v>, + decltype(Kokkos::MDRangePolicy(notEs, t, t))>); + + static_assert( + std::is_same_v< + Kokkos::MDRangePolicy>, + decltype(Kokkos::MDRangePolicy(ses, t, t))>); + + // MDRangePolicy with Kokkos::initializer_list parameters + + static_assert(std::is_same_v>, + decltype(Kokkos::MDRangePolicy( + {1, 2, 3, 4, 5, 6}, {1, 2, 3, 4, 5, 6}))>); + + static_assert(std::is_same_v>, + decltype(Kokkos::MDRangePolicy( + {1, 2, 3, 4, 5, 6}, {1, 2, 3, 4, 5, 6}, + {i64, i64, i64, i64, i64, i64}))>); + + static_assert(std::is_same_v>, + decltype(Kokkos::MDRangePolicy( + des, {1, 2, 3, 4, 5, 6}, + {i64, i64, i64, i64, i64, i64}))>); + + static_assert( + std::is_same_v>, + decltype(Kokkos::MDRangePolicy(notEs, {1, 2, 3, 4, 5, 6}, + {1, 2, 3, 4, 5, 6}))>); + + static_assert( + std::is_same_v>, + decltype(Kokkos::MDRangePolicy(ses, {1, 2, 3, 4, 5, 6}, + {1, 2, 3, 4, 5, 6}))>); + + // MDRangePolicy with Kokkos::Array parameters + + static_assert( + std::is_same_v>, + decltype(Kokkos::MDRangePolicy(a, a))>); + static_assert( + std::is_same_v>, + decltype(Kokkos::MDRangePolicy(a, a, aa))>); + static_assert( + std::is_same_v>, + decltype(Kokkos::MDRangePolicy(des, a, a))>); + static_assert( + std::is_same_v>, + decltype(Kokkos::MDRangePolicy(notEs, a, a))>); + static_assert( + std::is_same_v>, + decltype(Kokkos::MDRangePolicy(des, a, a, aa))>); + static_assert( + std::is_same_v>, + decltype(Kokkos::MDRangePolicy(notEs, a, a, aa))>); + + static_assert( + std::is_same_v< + Kokkos::MDRangePolicy>, + decltype(Kokkos::MDRangePolicy(ses, a, a))>); + static_assert( + std::is_same_v< + Kokkos::MDRangePolicy>, + decltype(Kokkos::MDRangePolicy(ses, a, a, aa))>); +}; + +} // namespace diff --git a/core/unit_test/TestMDRangePolicyConstructors.hpp b/core/unit_test/TestMDRangePolicyConstructors.hpp index f577f415e7c..6f241b45d47 100644 --- a/core/unit_test/TestMDRangePolicyConstructors.hpp +++ 
b/core/unit_test/TestMDRangePolicyConstructors.hpp @@ -18,6 +18,8 @@ #include +#include + namespace { template @@ -86,12 +88,56 @@ TEST(TEST_CATEGORY_DEATH, policy_bounds_unsafe_narrowing_conversions) { using Policy = Kokkos::MDRangePolicy, Kokkos::IndexType>; + std::string msg = + "Kokkos::MDRangePolicy bound type error: an unsafe implicit conversion " + "is " + "performed on a bound (-1) in dimension (0), which may not preserve its " + "original value.\n"; + std::string expected = std::regex_replace(msg, std::regex("\\(|\\)"), "\\$&"); + ::testing::FLAGS_gtest_death_test_style = "threadsafe"; - ASSERT_DEATH( - { - (void)Policy({-1, 0}, {2, 3}); - }, - "unsafe narrowing conversion"); + ASSERT_DEATH({ (void)Policy({-1, 0}, {2, 3}); }, expected); +} + +TEST(TEST_CATEGORY_DEATH, policy_invalid_bounds) { + using Policy = Kokkos::MDRangePolicy>; + + ::testing::FLAGS_gtest_death_test_style = "threadsafe"; + + auto [dim0, dim1] = (Policy::inner_direction == Kokkos::Iterate::Right) + ? std::make_pair(1, 0) + : std::make_pair(0, 1); + std::string msg1 = + "Kokkos::MDRangePolicy bounds error: The lower bound (100) is greater " + "than its upper bound (90) in dimension " + + std::to_string(dim0) + ".\n"; + + std::string msg2 = + "Kokkos::MDRangePolicy bounds error: The lower bound (100) is greater " + "than its upper bound (90) in dimension " + + std::to_string(dim1) + ".\n"; + +#if !defined(KOKKOS_ENABLE_DEPRECATED_CODE_4) + // escape the parentheses in the regex to match the error message + msg1 = std::regex_replace(msg1, std::regex("\\(|\\)"), "\\$&"); + (void)msg2; + ASSERT_DEATH({ (void)Policy({100, 100}, {90, 90}); }, msg1); +#else + if (!Kokkos::show_warnings()) { + GTEST_SKIP() << "Kokkos warning messages are disabled"; + } + + ::testing::internal::CaptureStderr(); + (void)Policy({100, 100}, {90, 90}); +#ifdef KOKKOS_ENABLE_DEPRECATION_WARNINGS + ASSERT_EQ(::testing::internal::GetCapturedStderr(), msg1 + msg2); +#else + ASSERT_TRUE(::testing::internal::GetCapturedStderr().empty()); + (void)msg1; + (void)msg2; +#endif + +#endif } #endif diff --git a/core/unit_test/TestMathematicalFunctions.hpp b/core/unit_test/TestMathematicalFunctions.hpp index 424ba05a904..ad035d4e4bf 100644 --- a/core/unit_test/TestMathematicalFunctions.hpp +++ b/core/unit_test/TestMathematicalFunctions.hpp @@ -287,21 +287,20 @@ struct FloatingPointComparison { public: template - KOKKOS_FUNCTION bool compare_near_zero(FPT const& fpv, double ulp) const { + KOKKOS_FUNCTION bool compare_near_zero(FPT const& fpv, int ulp) const { auto abs_tol = eps(fpv) * ulp; bool ar = absolute(fpv) < abs_tol; if (!ar) { Kokkos::printf("absolute value exceeds tolerance [|%e| > %e]\n", - (double)fpv, abs_tol); + (double)fpv, (double)abs_tol); } return ar; } template - KOKKOS_FUNCTION bool compare(Lhs const& lhs, Rhs const& rhs, - double ulp) const { + KOKKOS_FUNCTION bool compare(Lhs const& lhs, Rhs const& rhs, int ulp) const { if (lhs == 0) { return compare_near_zero(rhs, ulp); } else if (rhs == 0) { @@ -315,7 +314,7 @@ struct FloatingPointComparison { bool ar = abs_diff == 0 || rel_diff < rel_tol; if (!ar) { Kokkos::printf("relative difference exceeds tolerance [%e > %e]\n", - (double)rel_diff, rel_tol); + (double)rel_diff, (double)rel_tol); } return ar; @@ -348,7 +347,7 @@ struct math_function_name; } \ MATHEMATICAL_FUNCTIONS_TEST_UNREACHABLE \ } \ - static KOKKOS_FUNCTION double ulp_factor() { return ULP_FACTOR; } \ + static KOKKOS_FUNCTION int ulp_factor() { return ULP_FACTOR; } \ }; \ using kk_##FUNC = MathUnaryFunction_##FUNC; \ 
template <> \ @@ -373,7 +372,7 @@ struct math_function_name; math_unary_function_return_type_t>::value); \ return REF_FUNC; \ } \ - static KOKKOS_FUNCTION double ulp_factor() { return ULP_FACTOR; } \ + static KOKKOS_FUNCTION int ulp_factor() { return ULP_FACTOR; } \ }; \ using kk_##FUNC = MathUnaryFunction_##FUNC; \ template <> \ @@ -477,7 +476,7 @@ DEFINE_UNARY_FUNCTION_EVAL(logb, 2); } \ MATHEMATICAL_FUNCTIONS_TEST_UNREACHABLE \ } \ - static KOKKOS_FUNCTION double ulp_factor() { return ULP_FACTOR; } \ + static KOKKOS_FUNCTION int ulp_factor() { return ULP_FACTOR; } \ }; \ using kk_##FUNC = MathBinaryFunction_##FUNC; \ template <> \ @@ -511,7 +510,7 @@ DEFINE_BINARY_FUNCTION_EVAL(copysign, 1); math_ternary_function_return_type_t>::value); \ return std::FUNC(x, y, z); \ } \ - static KOKKOS_FUNCTION double ulp_factor() { return ULP_FACTOR; } \ + static KOKKOS_FUNCTION int ulp_factor() { return ULP_FACTOR; } \ }; \ using kk3_##FUNC = MathTernaryFunction_##FUNC; \ template <> \ @@ -1307,12 +1306,12 @@ struct TestAbsoluteValueFunction { if (abs(static_cast(4.f)) != static_cast(4.f) || abs(static_cast(-4.f)) != static_cast(4.f)) { ++e; - KOKKOS_IMPL_DO_NOT_USE_PRINTF("failed abs(KE::half_t)\n"); + Kokkos::printf("failed abs(KE::half_t)\n"); } if (abs(static_cast(4.f)) != static_cast(4.f) || abs(static_cast(-4.f)) != static_cast(4.f)) { ++e; - KOKKOS_IMPL_DO_NOT_USE_PRINTF("failed abs(KE::bhalf_t)\n"); + Kokkos::printf("failed abs(KE::bhalf_t)\n"); } if (abs(5.) != 5. || abs(-5.) != 5.) { ++e; @@ -1332,19 +1331,17 @@ struct TestAbsoluteValueFunction { Kokkos::printf("failed abs(floating_point) special values\n"); } - static_assert(std::is_same::value, ""); - static_assert(std::is_same::value, ""); - static_assert(std::is_same::value, ""); + static_assert(std::is_same::value); + static_assert(std::is_same::value); + static_assert(std::is_same::value); static_assert(std::is_same(4.f))), - KE::half_t>::value, - ""); + KE::half_t>::value); static_assert(std::is_same(4.f))), - KE::bhalf_t>::value, - ""); - static_assert(std::is_same::value, ""); - static_assert(std::is_same::value, ""); + KE::bhalf_t>::value); + static_assert(std::is_same::value); + static_assert(std::is_same::value); #ifdef MATHEMATICAL_FUNCTIONS_HAVE_LONG_DOUBLE_OVERLOADS - static_assert(std::is_same::value, ""); + static_assert(std::is_same::value); #endif } }; @@ -1365,26 +1362,26 @@ struct TestFloatingPointAbsoluteValueFunction { using Kokkos::fabs; if (fabs(4.f) != 4.f || fabs(-4.f) != 4.f) { ++e; - KOKKOS_IMPL_DO_NOT_USE_PRINTF("failed fabs(float)\n"); + Kokkos::printf("failed fabs(float)\n"); } if (fabs(static_cast(4.f)) != static_cast(4.f) || fabs(static_cast(-4.f)) != static_cast(4.f)) { ++e; - KOKKOS_IMPL_DO_NOT_USE_PRINTF("failed fabs(KE::half_t)\n"); + Kokkos::printf("failed fabs(KE::half_t)\n"); } if (fabs(static_cast(4.f)) != static_cast(4.f) || fabs(static_cast(-4.f)) != static_cast(4.f)) { ++e; - KOKKOS_IMPL_DO_NOT_USE_PRINTF("failed fabs(KE::bhalf_t)\n"); + Kokkos::printf("failed fabs(KE::bhalf_t)\n"); } if (fabs(5.) != 5. || fabs(-5.) != 5.) { ++e; - KOKKOS_IMPL_DO_NOT_USE_PRINTF("failed fabs(double)\n"); + Kokkos::printf("failed fabs(double)\n"); } #ifdef MATHEMATICAL_FUNCTIONS_HAVE_LONG_DOUBLE_OVERLOADS if (fabs(6.l) != 6.l || fabs(-6.l) != 6.l) { ++e; - KOKKOS_IMPL_DO_NOT_USE_PRINTF("failed fabs(long double)\n"); + Kokkos::printf("failed fabs(long double)\n"); } #endif // special values @@ -1392,8 +1389,7 @@ struct TestFloatingPointAbsoluteValueFunction { using Kokkos::isnan; if (fabs(-0.) != 0. 
|| !isinf(fabs(-INFINITY)) || !isnan(fabs(-NAN))) { ++e; - KOKKOS_IMPL_DO_NOT_USE_PRINTF( - "failed fabs(floating_point) special values\n"); + Kokkos::printf("failed fabs(floating_point) special values\n"); } static_assert(std::is_same(4.f))), @@ -1425,7 +1421,7 @@ struct TestFloatingPointRemainderFunction : FloatingPointComparison { if (!compare(fmod(6.2f, 4.f), 2.2f, 1) && !compare(fmod(-6.2f, 4.f), -2.2f, 1)) { ++e; - KOKKOS_IMPL_DO_NOT_USE_PRINTF("failed fmod(float)\n"); + Kokkos::printf("failed fmod(float)\n"); } if (!compare( fmod(static_cast(6.2f), static_cast(4.f)), @@ -1434,7 +1430,7 @@ struct TestFloatingPointRemainderFunction : FloatingPointComparison { fmod(static_cast(-6.2f), static_cast(4.f)), -static_cast(2.2f), 1)) { ++e; - KOKKOS_IMPL_DO_NOT_USE_PRINTF("failed fmod(KE::half_t)\n"); + Kokkos::printf("failed fmod(KE::half_t)\n"); } if (!compare( fmod(static_cast(6.2f), static_cast(4.f)), @@ -1443,17 +1439,17 @@ struct TestFloatingPointRemainderFunction : FloatingPointComparison { static_cast(4.f)), -static_cast(2.2f), 1)) { ++e; - KOKKOS_IMPL_DO_NOT_USE_PRINTF("failed fmod(KE::bhalf_t)\n"); + Kokkos::printf("failed fmod(KE::bhalf_t)\n"); } if (!compare(fmod(6.2, 4.), 2.2, 1) && !compare(fmod(-6.2, 4.), -2.2, 1)) { ++e; - KOKKOS_IMPL_DO_NOT_USE_PRINTF("failed fmod(double)\n"); + Kokkos::printf("failed fmod(double)\n"); } #ifdef MATHEMATICAL_FUNCTIONS_HAVE_LONG_DOUBLE_OVERLOADS if (!compare(fmod(6.2l, 4.l), 2.2l, 1) && !compare(fmod(-6.2l, 4.l), -2.2l, 1)) { ++e; - KOKKOS_IMPL_DO_NOT_USE_PRINTF("failed fmod(long double)\n"); + Kokkos::printf("failed fmod(long double)\n"); } #endif // special values @@ -1462,23 +1458,19 @@ struct TestFloatingPointRemainderFunction : FloatingPointComparison { if (!isinf(fmod(-KE::infinity::value, 1.f)) && !isnan(fmod(-KE::quiet_NaN::value, 1.f))) { ++e; - KOKKOS_IMPL_DO_NOT_USE_PRINTF( - "failed fmod(floating_point) special values\n"); + Kokkos::printf("failed fmod(floating_point) special values\n"); } static_assert(std::is_same(4.f), static_cast(4.f))), - KE::half_t>::value, - ""); + KE::half_t>::value); static_assert(std::is_same(4.f), static_cast(4.f))), - KE::bhalf_t>::value, - ""); - static_assert(std::is_same::value, ""); - static_assert(std::is_same::value, ""); + KE::bhalf_t>::value); + static_assert(std::is_same::value); + static_assert(std::is_same::value); #ifdef MATHEMATICAL_FUNCTIONS_HAVE_LONG_DOUBLE_OVERLOADS - static_assert(std::is_same::value, - ""); + static_assert(std::is_same::value); #endif } }; @@ -1502,7 +1494,7 @@ struct TestIEEEFloatingPointRemainderFunction : FloatingPointComparison { if (!compare(remainder(6.2f, 4.f), 2.2f, 2) && !compare(remainder(-6.2f, 4.f), 2.2f, 1)) { ++e; - KOKKOS_IMPL_DO_NOT_USE_PRINTF("failed remainder(float)\n"); + Kokkos::printf("failed remainder(float)\n"); } if (!compare(remainder(static_cast(6.2f), static_cast(4.f)), @@ -1511,7 +1503,7 @@ struct TestIEEEFloatingPointRemainderFunction : FloatingPointComparison { static_cast(4.f)), -static_cast(2.2f), 1)) { ++e; - KOKKOS_IMPL_DO_NOT_USE_PRINTF("failed remainder(KE::half_t)\n"); + Kokkos::printf("failed remainder(KE::half_t)\n"); } if (!compare(remainder(static_cast(6.2f), static_cast(4.f)), @@ -1520,18 +1512,18 @@ struct TestIEEEFloatingPointRemainderFunction : FloatingPointComparison { static_cast(4.f)), -static_cast(2.2f), 1)) { ++e; - KOKKOS_IMPL_DO_NOT_USE_PRINTF("failed remainder(KE::bhalf_t)\n"); + Kokkos::printf("failed remainder(KE::bhalf_t)\n"); } if (!compare(remainder(6.2, 4.), 2.2, 2) && !compare(remainder(-6.2, 4.), 2.2, 1)) { 
++e; - KOKKOS_IMPL_DO_NOT_USE_PRINTF("failed remainder(double)\n"); + Kokkos::printf("failed remainder(double)\n"); } #ifdef MATHEMATICAL_FUNCTIONS_HAVE_LONG_DOUBLE_OVERLOADS if (!compare(remainder(6.2l, 4.l), 2.2l, 1) && !compare(remainder(-6.2l, 4.l), -2.2l, 1)) { ++e; - KOKKOS_IMPL_DO_NOT_USE_PRINTF("failed remainder(long double)\n"); + Kokkos::printf("failed remainder(long double)\n"); } #endif // special values @@ -1540,26 +1532,23 @@ struct TestIEEEFloatingPointRemainderFunction : FloatingPointComparison { if (!isinf(remainder(-KE::infinity::value, 1.f)) && !isnan(remainder(-KE::quiet_NaN::value, 1.f))) { ++e; - KOKKOS_IMPL_DO_NOT_USE_PRINTF( + Kokkos::printf( "failed remainder(floating_point) special values\n"); } static_assert( std::is_same(4.f), static_cast(4.f))), - KE::half_t>::value, - ""); + KE::half_t>::value); static_assert( std::is_same(4.f), static_cast(4.f))), - KE::bhalf_t>::value, - ""); - static_assert(std::is_same::value, - ""); - static_assert(std::is_same::value, ""); + KE::bhalf_t>::value); + static_assert(std::is_same::value); + static_assert(std::is_same::value); #ifdef MATHEMATICAL_FUNCTIONS_HAVE_LONG_DOUBLE_OVERLOADS static_assert( - std::is_same::value, ""); + std::is_same::value); #endif } }; @@ -1765,7 +1754,7 @@ struct TestIsNaN { #endif ) { ++e; - KOKKOS_IMPL_DO_NOT_USE_PRINTF("failed isnan(KE::half_t)\n"); + Kokkos::printf("failed isnan(KE::half_t)\n"); } if (isnan(static_cast(2.f)) #ifndef KOKKOS_COMPILER_NVHPC // FIXME_NVHPC 23.7 @@ -1775,7 +1764,7 @@ struct TestIsNaN { #endif ) { ++e; - KOKKOS_IMPL_DO_NOT_USE_PRINTF("failed isnan(KE::bhalf_t)\n"); + Kokkos::printf("failed isnan(KE::bhalf_t)\n"); } if (isnan(3.) #ifndef KOKKOS_COMPILER_NVHPC // FIXME_NVHPC 23.7 @@ -1801,11 +1790,11 @@ struct TestIsNaN { Kokkos::printf("failed isnan(floating_point) special values\n"); } - static_assert(std::is_same::value, ""); - static_assert(std::is_same::value, ""); - static_assert(std::is_same::value, ""); + static_assert(std::is_same::value); + static_assert(std::is_same::value); + static_assert(std::is_same::value); #ifdef MATHEMATICAL_FUNCTIONS_HAVE_LONG_DOUBLE_OVERLOADS - static_assert(std::is_same::value, ""); + static_assert(std::is_same::value); #endif } }; diff --git a/core/unit_test/TestMathematicalSpecialFunctions.hpp b/core/unit_test/TestMathematicalSpecialFunctions.hpp index 06c84c75137..7969dc86864 100644 --- a/core/unit_test/TestMathematicalSpecialFunctions.hpp +++ b/core/unit_test/TestMathematicalSpecialFunctions.hpp @@ -1213,13 +1213,13 @@ struct TestComplexBesselI0K0Function { } EXPECT_EQ(h_ref_cbk0(0), h_cbk0(0)); - int upper_limit = N; + int upper_limit_0 = N; // FIXME_SYCL Failing for Intel GPUs, 19 is the first failing test case #if defined(KOKKOS_ENABLE_SYCL) && defined(KOKKOS_ARCH_INTEL_GPU) if (std::is_same_v) - upper_limit = 19; + upper_limit_0 = 19; #endif - for (int i = 1; i < upper_limit; i++) { + for (int i = 1; i < upper_limit_0; i++) { EXPECT_LE(Kokkos::abs(h_cbk0(i) - h_ref_cbk0(i)), Kokkos::abs(h_ref_cbk0(i)) * 1e-13) << "at index " << i; @@ -1462,13 +1462,13 @@ struct TestComplexBesselI1K1Function { } EXPECT_EQ(h_ref_cbk1(0), h_cbk1(0)); - int upper_limit = N; + int upper_limit_1 = N; // FIXME_SYCL Failing for Intel GPUs, 8 is the first failing test case #if defined(KOKKOS_ENABLE_SYCL) && defined(KOKKOS_ARCH_INTEL_GPU) if (std::is_same_v) - upper_limit = 8; + upper_limit_1 = 8; #endif - for (int i = 1; i < upper_limit; i++) { + for (int i = 1; i < upper_limit_1; i++) { EXPECT_LE(Kokkos::abs(h_cbk1(i) - h_ref_cbk1(i)), 
Kokkos::abs(h_ref_cbk1(i)) * 1e-13) << "at index " << i; @@ -1718,20 +1718,26 @@ struct TestComplexBesselH1Function { ((HIP_VERSION_MAJOR == 5) && \ !((HIP_VERSION_MINOR == 5) || (HIP_VERSION_MINOR == 6))) EXPECT_EQ(h_ref_ch10(0), h_ch10(0)); - for (int i = 1; i < N; i++) { + int upper_limit_10 = N; +// FIXME_SYCL Failing for Intel GPUs, 17 is the first failing test case +#if defined(KOKKOS_ENABLE_SYCL) && defined(KOKKOS_ARCH_INTEL_GPU) + if (std::is_same_v) + upper_limit_10 = 17; +#endif + for (int i = 1; i < upper_limit_10; i++) { EXPECT_LE(Kokkos::abs(h_ch10(i) - h_ref_ch10(i)), Kokkos::abs(h_ref_ch10(i)) * 1e-13) << "at index " << i; } EXPECT_EQ(h_ref_ch11(0), h_ch11(0)); - int upper_limit = N; - // FIXME_SYCL Failing for Intel GPUs, 16 is the first failing test case + int upper_limit_11 = N; + // FIXME_SYCL Failing for Intel GPUs, 2 is the first failing test case #if defined(KOKKOS_ENABLE_SYCL) && defined(KOKKOS_ARCH_INTEL_GPU) if (std::is_same_v) - upper_limit = 16; + upper_limit_11 = 2; #endif - for (int i = 1; i < upper_limit; i++) { + for (int i = 1; i < upper_limit_11; i++) { EXPECT_LE(Kokkos::abs(h_ch11(i) - h_ref_ch11(i)), Kokkos::abs(h_ref_ch11(i)) * 1e-13) << "at index " << i; @@ -1912,19 +1918,26 @@ struct TestComplexBesselH2Function { ((HIP_VERSION_MAJOR == 5) && \ !((HIP_VERSION_MINOR == 5) || (HIP_VERSION_MINOR == 6))) EXPECT_EQ(h_ref_ch20(0), h_ch20(0)); - for (int i = 1; i < N; i++) { + int upper_limit_20 = N; +// FIXME_SYCL Failing for Intel GPUs, 16 is the first failing test case +#if defined(KOKKOS_ENABLE_SYCL) && defined(KOKKOS_ARCH_INTEL_GPU) + if (std::is_same_v) + upper_limit_20 = 16; +#endif + for (int i = 1; i < upper_limit_20; i++) { EXPECT_LE(Kokkos::abs(h_ch20(i) - h_ref_ch20(i)), - Kokkos::abs(h_ref_ch20(i)) * 1e-13); + Kokkos::abs(h_ref_ch20(i)) * 1e-13) + << "at index " << i; } EXPECT_EQ(h_ref_ch21(0), h_ch21(0)); - int upper_limit = N; - // FIXME_SYCL Failing for Intel GPUs, 17 is the first failing test case + int upper_limit_21 = N; + // FIXME_SYCL Failing for Intel GPUs, 1 is the first failing test case #if defined(KOKKOS_ENABLE_SYCL) && defined(KOKKOS_ARCH_INTEL_GPU) if (std::is_same_v) - upper_limit = 17; + upper_limit_21 = 1; #endif - for (int i = 1; i < upper_limit; i++) { + for (int i = 1; i < upper_limit_21; i++) { EXPECT_LE(Kokkos::abs(h_ch21(i) - h_ref_ch21(i)), Kokkos::abs(h_ref_ch21(i)) * 1e-13) << "at index " << i; @@ -1954,31 +1967,61 @@ TEST(TEST_CATEGORY, mathspecialfunc_errorfunc) { #endif TEST(TEST_CATEGORY, mathspecialfunc_cbesselj0y0) { +#if defined(KOKKOS_ENABLE_OPENMPTARGET) && defined(KOKKOS_ARCH_INTEL_GPU) + if (std::is_same_v) + GTEST_SKIP() << "skipping since test is known to fail with OpenMPTarget on " + "Intel GPUs"; // FIXME_OPENMPTARGET +#endif TestComplexBesselJ0Y0Function test; test.testit(); } TEST(TEST_CATEGORY, mathspecialfunc_cbesselj1y1) { +#if defined(KOKKOS_ENABLE_OPENMPTARGET) && defined(KOKKOS_ARCH_INTEL_GPU) + if (std::is_same_v) + GTEST_SKIP() << "skipping since test is known to fail with OpenMPTarget on " + "Intel GPUs"; // FIXME_OPENMPTARGET +#endif TestComplexBesselJ1Y1Function test; test.testit(); } TEST(TEST_CATEGORY, mathspecialfunc_cbesseli0k0) { +#if defined(KOKKOS_ENABLE_OPENMPTARGET) && defined(KOKKOS_ARCH_INTEL_GPU) + if (std::is_same_v) + GTEST_SKIP() << "skipping since test is known to fail with OpenMPTarget on " + "Intel GPUs"; // FIXME_OPENMPTARGET +#endif TestComplexBesselI0K0Function test; test.testit(); } TEST(TEST_CATEGORY, mathspecialfunc_cbesseli1k1) { +#if 
defined(KOKKOS_ENABLE_OPENMPTARGET) && defined(KOKKOS_ARCH_INTEL_GPU) + if (std::is_same_v) + GTEST_SKIP() << "skipping since test is known to fail with OpenMPTarget on " + "Intel GPUs"; // FIXME_OPENMPTARGET +#endif TestComplexBesselI1K1Function test; test.testit(); } TEST(TEST_CATEGORY, mathspecialfunc_cbesselh1stkind) { +#if defined(KOKKOS_ENABLE_OPENMPTARGET) && defined(KOKKOS_ARCH_INTEL_GPU) + if (std::is_same_v) + GTEST_SKIP() << "skipping since test is known to fail with OpenMPTarget on " + "Intel GPUs"; // FIXME_OPENMPTARGET +#endif TestComplexBesselH1Function test; test.testit(); } TEST(TEST_CATEGORY, mathspecialfunc_cbesselh2ndkind) { +#if defined(KOKKOS_ENABLE_OPENMPTARGET) && defined(KOKKOS_ARCH_INTEL_GPU) + if (std::is_same_v) + GTEST_SKIP() << "skipping since test is known to fail with OpenMPTarget on " + "Intel GPUs"; // FIXME_OPENMPTARGET +#endif TestComplexBesselH2Function test; test.testit(); } diff --git a/core/unit_test/TestNonTrivialScalarTypes.hpp b/core/unit_test/TestNonTrivialScalarTypes.hpp index eaf7a4125cc..116ac58c39f 100644 --- a/core/unit_test/TestNonTrivialScalarTypes.hpp +++ b/core/unit_test/TestNonTrivialScalarTypes.hpp @@ -214,7 +214,7 @@ struct point_t { uint8_t x, y, z; KOKKOS_FUNCTION - point_t() : x(1), y(1), z(1){}; + point_t() : x(0), y(0), z(0){}; KOKKOS_FUNCTION point_t(const point_t &val) : x(val.x), y(val.y), z(val.z){}; diff --git a/core/unit_test/TestNumericTraits.hpp b/core/unit_test/TestNumericTraits.hpp index ec1c1e0ca0b..81a9d0a5e0d 100644 --- a/core/unit_test/TestNumericTraits.hpp +++ b/core/unit_test/TestNumericTraits.hpp @@ -210,9 +210,10 @@ TEST(TEST_CATEGORY, numeric_traits_infinity) { #endif TestNumericTraits(); TestNumericTraits(); - // FIXME_NVHPC long double not supported -#if !defined(KOKKOS_ENABLE_CUDA) || \ - !defined(KOKKOS_COMPILER_NVHPC) // 23.7 long double + // FIXME_NVHPC 23.7 long double + // FIXME_OPENMPTARGET long double on Intel GPUs +#if (!defined(KOKKOS_ENABLE_CUDA) || !defined(KOKKOS_COMPILER_NVHPC)) && \ + (!defined(KOKKOS_ENABLE_OPENMPTARGET) || !defined(KOKKOS_ARCH_INTEL_GPU)) TestNumericTraits(); #endif } @@ -224,9 +225,9 @@ TEST(TEST_CATEGORY, numeric_traits_epsilon) { #endif TestNumericTraits(); TestNumericTraits(); - // FIXME_NVHPC long double not supported -#if !defined(KOKKOS_ENABLE_CUDA) || \ - !defined(KOKKOS_COMPILER_NVHPC) // 23.7 long double: + // FIXME_NVHPC 23.7 long double +#if (!defined(KOKKOS_ENABLE_CUDA) || !defined(KOKKOS_COMPILER_NVHPC)) && \ + (!defined(KOKKOS_ENABLE_OPENMPTARGET) || !defined(KOKKOS_ARCH_INTEL_GPU)) TestNumericTraits(); #endif } @@ -239,9 +240,9 @@ TEST(TEST_CATEGORY, numeric_traits_round_error) { #endif TestNumericTraits(); TestNumericTraits(); - // FIXME_NVHPC long double not supported -#if !defined(KOKKOS_ENABLE_CUDA) || \ - !defined(KOKKOS_COMPILER_NVHPC) // 23.7 long double: + // FIXME_NVHPC 23.7 long double +#if (!defined(KOKKOS_ENABLE_CUDA) || !defined(KOKKOS_COMPILER_NVHPC)) && \ + (!defined(KOKKOS_ENABLE_OPENMPTARGET) || !defined(KOKKOS_ARCH_INTEL_GPU)) TestNumericTraits(); #endif } @@ -253,9 +254,9 @@ TEST(TEST_CATEGORY, numeric_traits_norm_min) { #endif TestNumericTraits(); TestNumericTraits(); - // FIXME_NVHPC long double not supported -#if !defined(KOKKOS_ENABLE_CUDA) || \ - !defined(KOKKOS_COMPILER_NVHPC) // 23.7 long double: + // FIXME_NVHPC 23.7 long double +#if (!defined(KOKKOS_ENABLE_CUDA) || !defined(KOKKOS_COMPILER_NVHPC)) && \ + (!defined(KOKKOS_ENABLE_OPENMPTARGET) || !defined(KOKKOS_ARCH_INTEL_GPU)) TestNumericTraits(); #endif } @@ -263,9 +264,9 @@ 
TEST(TEST_CATEGORY, numeric_traits_norm_min) { TEST(TEST_CATEGORY, numeric_traits_denorm_min) { TestNumericTraits(); TestNumericTraits(); - // FIXME_NVHPC long double not supported -#if !defined(KOKKOS_ENABLE_CUDA) || \ - !defined(KOKKOS_COMPILER_NVHPC) // 23.7 long double: + // FIXME_NVHPC 23.7 long double +#if (!defined(KOKKOS_ENABLE_CUDA) || !defined(KOKKOS_COMPILER_NVHPC)) && \ + (!defined(KOKKOS_ENABLE_OPENMPTARGET) || !defined(KOKKOS_ARCH_INTEL_GPU)) TestNumericTraits(); #endif } @@ -302,8 +303,10 @@ TEST(TEST_CATEGORY, numeric_traits_finite_min_max) { TestNumericTraits(); TestNumericTraits(); TestNumericTraits(); -#if !defined(KOKKOS_ENABLE_CUDA) || \ - !defined(KOKKOS_COMPILER_NVHPC) // 23.7 long double: + // FIXME_NVHPC 23.7 long double + // FIXME_OPENMPTARGET long double on Intel GPUs +#if (!defined(KOKKOS_ENABLE_CUDA) || !defined(KOKKOS_COMPILER_NVHPC)) && \ + (!defined(KOKKOS_ENABLE_OPENMPTARGET) || !defined(KOKKOS_ARCH_INTEL_GPU)) TestNumericTraits(); TestNumericTraits(); #endif @@ -326,8 +329,10 @@ TEST(TEST_CATEGORY, numeric_traits_digits) { TestNumericTraits(); TestNumericTraits(); TestNumericTraits(); -#if !defined(KOKKOS_ENABLE_CUDA) || \ - !defined(KOKKOS_COMPILER_NVHPC) // 23.7 long double: + // FIXME_NVHPC 23.7 long double + // FIXME_OPENMPTARGET long double on Intel GPUs +#if (!defined(KOKKOS_ENABLE_CUDA) || !defined(KOKKOS_COMPILER_NVHPC)) && \ + (!defined(KOKKOS_ENABLE_OPENMPTARGET) || !defined(KOKKOS_ARCH_INTEL_GPU)) TestNumericTraits(); #endif } @@ -349,8 +354,10 @@ TEST(TEST_CATEGORY, numeric_traits_digits10) { TestNumericTraits(); TestNumericTraits(); TestNumericTraits(); -#if !defined(KOKKOS_ENABLE_CUDA) || \ - !defined(KOKKOS_COMPILER_NVHPC) // 23.7 long double: + // FIXME_NVHPC 23.7 long double + // FIXME_OPENMPTARGET long double on Intel GPUs +#if (!defined(KOKKOS_ENABLE_CUDA) || !defined(KOKKOS_COMPILER_NVHPC)) && \ + (!defined(KOKKOS_ENABLE_OPENMPTARGET) || !defined(KOKKOS_ARCH_INTEL_GPU)) TestNumericTraits(); #endif } @@ -358,8 +365,10 @@ TEST(TEST_CATEGORY, numeric_traits_digits10) { TEST(TEST_CATEGORY, numeric_traits_max_digits10) { TestNumericTraits(); TestNumericTraits(); -#if !defined(KOKKOS_ENABLE_CUDA) || \ - !defined(KOKKOS_COMPILER_NVHPC) // 23.7 long double: + // FIXME_NVHPC 23.7 long double + // FIXME_OPENMPTARGET long double on Intel GPUs +#if (!defined(KOKKOS_ENABLE_CUDA) || !defined(KOKKOS_COMPILER_NVHPC)) && \ + (!defined(KOKKOS_ENABLE_OPENMPTARGET) || !defined(KOKKOS_ARCH_INTEL_GPU)) TestNumericTraits(); #endif } @@ -380,8 +389,10 @@ TEST(TEST_CATEGORY, numeric_traits_radix) { TestNumericTraits(); TestNumericTraits(); TestNumericTraits(); -#if !defined(KOKKOS_ENABLE_CUDA) || \ - !defined(KOKKOS_COMPILER_NVHPC) // 23.7 long double: + // FIXME_NVHPC 23.7 long double + // FIXME_OPENMPTARGET long double on Intel GPUs +#if (!defined(KOKKOS_ENABLE_CUDA) || !defined(KOKKOS_COMPILER_NVHPC)) && \ + (!defined(KOKKOS_ENABLE_OPENMPTARGET) || !defined(KOKKOS_ARCH_INTEL_GPU)) TestNumericTraits(); #endif } @@ -395,8 +406,10 @@ TEST(TEST_CATEGORY, numeric_traits_min_max_exponent) { TestNumericTraits(); TestNumericTraits(); TestNumericTraits(); -#if !defined(KOKKOS_ENABLE_CUDA) || \ - !defined(KOKKOS_COMPILER_NVHPC) // 23.7 long double: + // FIXME_NVHPC 23.7 long double + // FIXME_OPENMPTARGET long double on Intel GPUs +#if (!defined(KOKKOS_ENABLE_CUDA) || !defined(KOKKOS_COMPILER_NVHPC)) && \ + (!defined(KOKKOS_ENABLE_OPENMPTARGET) || !defined(KOKKOS_ARCH_INTEL_GPU)) TestNumericTraits(); TestNumericTraits(); #endif @@ -407,8 +420,10 @@ TEST(TEST_CATEGORY, 
numeric_traits_min_max_exponent10) { TestNumericTraits(); TestNumericTraits(); TestNumericTraits(); -#if !defined(KOKKOS_ENABLE_CUDA) || \ - !defined(KOKKOS_COMPILER_NVHPC) // 23.7 long double: + // FIXME_NVHPC 23.7 long double + // FIXME_OPENMPTARGET long double on Intel GPUs +#if (!defined(KOKKOS_ENABLE_CUDA) || !defined(KOKKOS_COMPILER_NVHPC)) && \ + (!defined(KOKKOS_ENABLE_OPENMPTARGET) || !defined(KOKKOS_ARCH_INTEL_GPU)) TestNumericTraits(); TestNumericTraits(); #endif @@ -426,8 +441,10 @@ TEST(TEST_CATEGORY, numeric_traits_quiet_and_signaling_nan) { TestNumericTraits(); TestNumericTraits(); TestNumericTraits(); -#if !defined(KOKKOS_ENABLE_CUDA) || \ - !defined(KOKKOS_COMPILER_NVHPC) // 23.7 long double: + // FIXME_NVHPC 23.7 long double + // FIXME_OPENMPTARGET long double on Intel GPUs +#if (!defined(KOKKOS_ENABLE_CUDA) || !defined(KOKKOS_COMPILER_NVHPC)) && \ + (!defined(KOKKOS_ENABLE_OPENMPTARGET) || !defined(KOKKOS_ARCH_INTEL_GPU)) TestNumericTraits(); TestNumericTraits(); #endif @@ -442,7 +459,7 @@ struct HasNoSpecialization {}; using TRAIT##_value_t = decltype(Kokkos::Experimental::TRAIT::value); \ template \ using has_##TRAIT = Kokkos::is_detected; \ - static_assert(!has_##TRAIT::value, ""); + static_assert(!has_##TRAIT::value); CHECK_TRAIT_IS_SFINAE_FRIENDLY(infinity) CHECK_TRAIT_IS_SFINAE_FRIENDLY(finite_min) @@ -524,39 +541,39 @@ CHECK_SAME_AS_NUMERIC_LIMITS_MEMBER_FUNCTION(long double, denorm_min); #endif // clang-format off -static_assert(Kokkos::Experimental::norm_min::value == std::numeric_limits< float>::min(), ""); -static_assert(Kokkos::Experimental::norm_min::value == std::numeric_limits< double>::min(), ""); -static_assert(Kokkos::Experimental::norm_min::value == std::numeric_limits::min(), ""); +static_assert(Kokkos::Experimental::norm_min::value == std::numeric_limits< float>::min()); +static_assert(Kokkos::Experimental::norm_min::value == std::numeric_limits< double>::min()); +static_assert(Kokkos::Experimental::norm_min::value == std::numeric_limits::min()); // integer types -static_assert(Kokkos::Experimental::finite_min::value == std::numeric_limits< char>::min(), ""); -static_assert(Kokkos::Experimental::finite_min::value == std::numeric_limits< signed char>::min(), ""); -static_assert(Kokkos::Experimental::finite_min::value == std::numeric_limits< unsigned char>::min(), ""); -static_assert(Kokkos::Experimental::finite_min::value == std::numeric_limits< short>::min(), ""); -static_assert(Kokkos::Experimental::finite_min::value == std::numeric_limits< unsigned short>::min(), ""); -static_assert(Kokkos::Experimental::finite_min::value == std::numeric_limits< int>::min(), ""); -static_assert(Kokkos::Experimental::finite_min::value == std::numeric_limits< unsigned int>::min(), ""); -static_assert(Kokkos::Experimental::finite_min::value == std::numeric_limits< long int>::min(), ""); -static_assert(Kokkos::Experimental::finite_min::value == std::numeric_limits< unsigned long int>::min(), ""); -static_assert(Kokkos::Experimental::finite_min::value == std::numeric_limits< long long int>::min(), ""); -static_assert(Kokkos::Experimental::finite_min::value == std::numeric_limits::min(), ""); -static_assert(Kokkos::Experimental::finite_max::value == std::numeric_limits< char>::max(), ""); -static_assert(Kokkos::Experimental::finite_max::value == std::numeric_limits< signed char>::max(), ""); -static_assert(Kokkos::Experimental::finite_max::value == std::numeric_limits< unsigned char>::max(), ""); -static_assert(Kokkos::Experimental::finite_max::value == 
std::numeric_limits< short>::max(), ""); -static_assert(Kokkos::Experimental::finite_max::value == std::numeric_limits< unsigned short>::max(), ""); -static_assert(Kokkos::Experimental::finite_max::value == std::numeric_limits< int>::max(), ""); -static_assert(Kokkos::Experimental::finite_max::value == std::numeric_limits< unsigned int>::max(), ""); -static_assert(Kokkos::Experimental::finite_max::value == std::numeric_limits< long int>::max(), ""); -static_assert(Kokkos::Experimental::finite_max::value == std::numeric_limits< unsigned long int>::max(), ""); -static_assert(Kokkos::Experimental::finite_max::value == std::numeric_limits< long long int>::max(), ""); -static_assert(Kokkos::Experimental::finite_max::value == std::numeric_limits::max(), ""); +static_assert(Kokkos::Experimental::finite_min::value == std::numeric_limits< char>::min()); +static_assert(Kokkos::Experimental::finite_min::value == std::numeric_limits< signed char>::min()); +static_assert(Kokkos::Experimental::finite_min::value == std::numeric_limits< unsigned char>::min()); +static_assert(Kokkos::Experimental::finite_min::value == std::numeric_limits< short>::min()); +static_assert(Kokkos::Experimental::finite_min::value == std::numeric_limits< unsigned short>::min()); +static_assert(Kokkos::Experimental::finite_min::value == std::numeric_limits< int>::min()); +static_assert(Kokkos::Experimental::finite_min::value == std::numeric_limits< unsigned int>::min()); +static_assert(Kokkos::Experimental::finite_min::value == std::numeric_limits< long int>::min()); +static_assert(Kokkos::Experimental::finite_min::value == std::numeric_limits< unsigned long int>::min()); +static_assert(Kokkos::Experimental::finite_min::value == std::numeric_limits< long long int>::min()); +static_assert(Kokkos::Experimental::finite_min::value == std::numeric_limits::min()); +static_assert(Kokkos::Experimental::finite_max::value == std::numeric_limits< char>::max()); +static_assert(Kokkos::Experimental::finite_max::value == std::numeric_limits< signed char>::max()); +static_assert(Kokkos::Experimental::finite_max::value == std::numeric_limits< unsigned char>::max()); +static_assert(Kokkos::Experimental::finite_max::value == std::numeric_limits< short>::max()); +static_assert(Kokkos::Experimental::finite_max::value == std::numeric_limits< unsigned short>::max()); +static_assert(Kokkos::Experimental::finite_max::value == std::numeric_limits< int>::max()); +static_assert(Kokkos::Experimental::finite_max::value == std::numeric_limits< unsigned int>::max()); +static_assert(Kokkos::Experimental::finite_max::value == std::numeric_limits< long int>::max()); +static_assert(Kokkos::Experimental::finite_max::value == std::numeric_limits< unsigned long int>::max()); +static_assert(Kokkos::Experimental::finite_max::value == std::numeric_limits< long long int>::max()); +static_assert(Kokkos::Experimental::finite_max::value == std::numeric_limits::max()); // floating point types -static_assert(Kokkos::Experimental::finite_min::value == -std::numeric_limits< float>::max(), ""); -static_assert(Kokkos::Experimental::finite_min::value == -std::numeric_limits< double>::max(), ""); -static_assert(Kokkos::Experimental::finite_min::value == -std::numeric_limits::max(), ""); -static_assert(Kokkos::Experimental::finite_max::value == std::numeric_limits< float>::max(), ""); -static_assert(Kokkos::Experimental::finite_max::value == std::numeric_limits< double>::max(), ""); -static_assert(Kokkos::Experimental::finite_max::value == std::numeric_limits::max(), ""); 
+static_assert(Kokkos::Experimental::finite_min::value == -std::numeric_limits< float>::max()); +static_assert(Kokkos::Experimental::finite_min::value == -std::numeric_limits< double>::max()); +static_assert(Kokkos::Experimental::finite_min::value == -std::numeric_limits::max()); +static_assert(Kokkos::Experimental::finite_max::value == std::numeric_limits< float>::max()); +static_assert(Kokkos::Experimental::finite_max::value == std::numeric_limits< double>::max()); +static_assert(Kokkos::Experimental::finite_max::value == std::numeric_limits::max()); // clang-format on CHECK_SAME_AS_NUMERIC_LIMITS_MEMBER_CONSTANT(bool, digits); @@ -623,15 +640,13 @@ CHECK_SAME_AS_NUMERIC_LIMITS_MEMBER_CONSTANT(long double, max_exponent10); #undef CHECK_SAME_AS_NUMERIC_LIMITS_MEMBER_FUNCTION #undef CHECK_SAME_AS_NUMERIC_LIMITS_MEMBER_CONSTANT -#define CHECK_NAN_SAME_AS_NUMERIC_LIMITS_MEMBER_FUNCTION(T, TRAIT) \ - static_assert(Kokkos::Experimental::TRAIT::value != \ - Kokkos::Experimental::TRAIT::value, \ - ""); \ - static_assert( \ - std::numeric_limits::TRAIT() != std::numeric_limits::TRAIT(), ""); \ - static_assert(Kokkos::Experimental::TRAIT::value != \ - std::numeric_limits::TRAIT(), \ - "") +#define CHECK_NAN_SAME_AS_NUMERIC_LIMITS_MEMBER_FUNCTION(T, TRAIT) \ + static_assert(Kokkos::Experimental::TRAIT::value != \ + Kokkos::Experimental::TRAIT::value); \ + static_assert(std::numeric_limits::TRAIT() != \ + std::numeric_limits::TRAIT()); \ + static_assert(Kokkos::Experimental::TRAIT::value != \ + std::numeric_limits::TRAIT()) // Workaround compiler issue error: expression must have a constant value // See kokkos/kokkos#4574 @@ -651,14 +666,11 @@ CHECK_NAN_SAME_AS_NUMERIC_LIMITS_MEMBER_FUNCTION(long double, signaling_NaN); #define CHECK_INSTANTIATED_ON_CV_QUALIFIED_TYPES(T, TRAIT) \ static_assert(Kokkos::Experimental::TRAIT::value == \ - Kokkos::Experimental::TRAIT::value, \ - ""); \ + Kokkos::Experimental::TRAIT::value); \ static_assert(Kokkos::Experimental::TRAIT::value == \ - Kokkos::Experimental::TRAIT::value, \ - ""); \ + Kokkos::Experimental::TRAIT::value); \ static_assert(Kokkos::Experimental::TRAIT::value == \ - Kokkos::Experimental::TRAIT::value, \ - "") + Kokkos::Experimental::TRAIT::value) #define CHECK_INSTANTIATED_ON_CV_QUALIFIED_TYPES_FLOATING_POINT(TRAIT) \ CHECK_INSTANTIATED_ON_CV_QUALIFIED_TYPES(float, TRAIT); \ @@ -706,17 +718,13 @@ CHECK_INSTANTIATED_ON_CV_QUALIFIED_TYPES_FLOATING_POINT(max_exponent10); #define CHECK_NAN_INSTANTIATED_ON_CV_QUALIFIED_TYPES(T, TRAIT) \ static_assert(Kokkos::Experimental::TRAIT::value != \ - Kokkos::Experimental::TRAIT::value, \ - ""); \ + Kokkos::Experimental::TRAIT::value); \ static_assert(Kokkos::Experimental::TRAIT::value != \ - Kokkos::Experimental::TRAIT::value, \ - ""); \ + Kokkos::Experimental::TRAIT::value); \ static_assert(Kokkos::Experimental::TRAIT::value != \ - Kokkos::Experimental::TRAIT::value, \ - ""); \ + Kokkos::Experimental::TRAIT::value); \ static_assert(Kokkos::Experimental::TRAIT::value != \ - Kokkos::Experimental::TRAIT::value, \ - "") + Kokkos::Experimental::TRAIT::value) #define CHECK_NAN_INSTANTIATED_ON_CV_QUALIFIED_TYPES_FLOATING_POINT(TRAIT) \ CHECK_NAN_INSTANTIATED_ON_CV_QUALIFIED_TYPES(float, TRAIT); \ diff --git a/core/unit_test/TestOccupancyControlTrait.hpp b/core/unit_test/TestOccupancyControlTrait.hpp new file mode 100644 index 00000000000..345a906d668 --- /dev/null +++ b/core/unit_test/TestOccupancyControlTrait.hpp @@ -0,0 +1,80 @@ +//@HEADER +// ************************************************************************ 
+// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#include + +namespace { + +template +void test_policy_execution(const Kokkos::RangePolicy& policy) { + Kokkos::parallel_for(policy, KOKKOS_LAMBDA(int){}); +} +template +void test_policy_execution(const Kokkos::TeamPolicy& policy) { + Kokkos::parallel_for( + policy, + KOKKOS_LAMBDA( + const typename Kokkos::TeamPolicy::member_type&){}); +} +template +void test_policy_execution(const Kokkos::MDRangePolicy& policy) { + Kokkos::parallel_for(policy, KOKKOS_LAMBDA(int, int){}); +} + +template +void test_prefer_desired_occupancy(Policy policy) { + using Kokkos::Experimental::DesiredOccupancy; + using Kokkos::Experimental::MaximizeOccupancy; + using Kokkos::Experimental::prefer; + using Kokkos::Experimental::WorkItemProperty; + + // MaximizeOccupancy -> MaximizeOccupancy + auto const policy_still_no_occ = prefer(policy, MaximizeOccupancy{}); + test_policy_execution(policy_still_no_occ); + + // MaximizeOccupancy -> DesiredOccupancy + auto const policy_with_occ = + prefer(policy_still_no_occ, DesiredOccupancy{33}); + test_policy_execution(policy_with_occ); + + // DesiredOccupancy -> DesiredOccupancy + auto const policy_change_occ = prefer(policy_with_occ, DesiredOccupancy{24}); + test_policy_execution(policy_change_occ); + + // DesiredOccupancy -> DesiredOccupancy w/ hint + auto policy_with_occ_and_hint = Kokkos::Experimental::require( + policy_change_occ, + Kokkos::Experimental::WorkItemProperty::HintLightWeight); + test_policy_execution(policy_with_occ_and_hint); + + // DesiredOccupancy -> MaximizeOccupancy + auto const policy_drop_occ = + prefer(policy_with_occ_and_hint, MaximizeOccupancy{}); + test_policy_execution(policy_drop_occ); +} + +// FIXME_MSVC_WITH_CUDA +// This test doesn't compile with CUDA on Windows +#if !(defined(_WIN32) && defined(KOKKOS_ENABLE_CUDA)) +TEST(TEST_CATEGORY, occupancy_control) { + test_prefer_desired_occupancy(Kokkos::RangePolicy(0, 1)); + test_prefer_desired_occupancy( + Kokkos::TeamPolicy{1, Kokkos::AUTO}); + test_prefer_desired_occupancy( + Kokkos::MDRangePolicy>{{0, 0}, {1, 1}}); +} +#endif +} // namespace diff --git a/core/unit_test/TestParseCmdLineArgsAndEnvVars.cpp b/core/unit_test/TestParseCmdLineArgsAndEnvVars.cpp index 176ce9b5fed..a56dfd9efc7 100644 --- a/core/unit_test/TestParseCmdLineArgsAndEnvVars.cpp +++ b/core/unit_test/TestParseCmdLineArgsAndEnvVars.cpp @@ -166,22 +166,6 @@ TEST(defaultdevicetype, cmd_line_args_device_id) { EXPECT_REMAINING_COMMAND_LINE_ARGUMENTS(cla, {"--dummy"}); } -TEST(defaultdevicetype, cmd_line_args_num_devices) { - CmdLineArgsHelper cla = {{ - "--kokkos-num-devices=5,6", - "--kokkos-num-devices=7", - "-v", - }}; - Kokkos::InitializationSettings settings; - Kokkos::Impl::parse_command_line_arguments(cla.argc(), cla.argv(), settings); - EXPECT_TRUE(settings.has_num_devices()); - EXPECT_EQ(settings.get_num_devices(), 7); - // this is the current behavior, not suggesting this cannot be revisited - EXPECT_TRUE(settings.has_skip_device()) << "behavior changed see comment"; - EXPECT_EQ(settings.get_skip_device(), 6) << "behavior changed see comment"; - 
EXPECT_REMAINING_COMMAND_LINE_ARGUMENTS(cla, {"-v"}); -} - TEST(defaultdevicetype, cmd_line_args_disable_warning) { CmdLineArgsHelper cla = {{ "--kokkos-disable-warnings=1", @@ -351,20 +335,6 @@ TEST(defaultdevicetype, env_vars_device_id) { EXPECT_EQ(settings.get_device_id(), 33); } -TEST(defaultdevicetype, env_vars_num_devices) { - EnvVarsHelper ev = {{ - {"KOKKOS_NUM_DEVICES", "4"}, - {"KOKKOS_SKIP_DEVICE", "1"}, - }}; - SKIP_IF_ENVIRONMENT_VARIABLE_ALREADY_SET(ev); - Kokkos::InitializationSettings settings; - Kokkos::Impl::parse_environment_variables(settings); - EXPECT_TRUE(settings.has_num_devices()); - EXPECT_EQ(settings.get_num_devices(), 4); - EXPECT_TRUE(settings.has_skip_device()); - EXPECT_EQ(settings.get_skip_device(), 1); -} - TEST(defaultdevicetype, env_vars_disable_warnings) { for (auto const& value_true : {"1", "true", "TRUE", "yEs"}) { EnvVarsHelper ev = {{ @@ -420,22 +390,20 @@ TEST(defaultdevicetype, env_vars_tune_internals) { } TEST(defaultdevicetype, visible_devices) { -#define KOKKOS_TEST_VISIBLE_DEVICES(ENV, CNT, DEV) \ - do { \ - EnvVarsHelper ev{ENV}; \ - SKIP_IF_ENVIRONMENT_VARIABLE_ALREADY_SET(ev); \ - Kokkos::InitializationSettings settings; \ - Kokkos::Impl::parse_environment_variables(settings); \ - auto computed = Kokkos::Impl::get_visible_devices(settings, CNT); \ - std::vector expected = DEV; \ - EXPECT_EQ(expected.size(), computed.size()) \ - << ev << "device count: " << CNT; \ - auto n = std::min(expected.size(), computed.size()); \ - for (int i = 0; i < n; ++i) { \ - EXPECT_EQ(expected[i], computed[i]) \ - << "devices differ at index " << i << '\n' \ - << ev << "device count: " << CNT; \ - } \ +#define KOKKOS_TEST_VISIBLE_DEVICES(ENV, CNT, DEV) \ + do { \ + EnvVarsHelper ev{ENV}; \ + SKIP_IF_ENVIRONMENT_VARIABLE_ALREADY_SET(ev); \ + auto computed = Kokkos::Impl::get_visible_devices(CNT); \ + std::vector expected = DEV; \ + EXPECT_EQ(expected.size(), computed.size()) \ + << ev << "device count: " << CNT; \ + auto n = std::min(expected.size(), computed.size()); \ + for (int i = 0; i < n; ++i) { \ + EXPECT_EQ(expected[i], computed[i]) \ + << "devices differ at index " << i << '\n' \ + << ev << "device count: " << CNT; \ + } \ } while (false) #define DEV(...) \ @@ -444,6 +412,8 @@ TEST(defaultdevicetype, visible_devices) { // first test with all environment variables that are involved in determining // the visible devices so user set var do not mess up the logic below. + // KOKKOS_NUM_DEVICES and KOKKOS_SKIP_DEVICE are deprecated since 3.7 and are + // not taken into account anymore. KOKKOS_TEST_VISIBLE_DEVICES( ENV({"KOKKOS_VISIBLE_DEVICES", "2,1"}, {"KOKKOS_NUM_DEVICES", "8"}, {"KOKKOS_SKIP_DEVICE", "1"}), @@ -452,10 +422,10 @@ TEST(defaultdevicetype, visible_devices) { ENV({"KOKKOS_VISIBLE_DEVICES", "2,1"}, {"KOKKOS_NUM_DEVICES", "8"}, ), 6, DEV(2, 1)); KOKKOS_TEST_VISIBLE_DEVICES(ENV({"KOKKOS_NUM_DEVICES", "3"}), 6, - DEV(0, 1, 2)); + DEV(0, 1, 2, 3, 4, 5)); KOKKOS_TEST_VISIBLE_DEVICES( ENV({"KOKKOS_NUM_DEVICES", "4"}, {"KOKKOS_SKIP_DEVICE", "1"}, ), 6, - DEV(0, 2, 3)); + DEV(0, 1, 2, 3, 4, 5)); KOKKOS_TEST_VISIBLE_DEVICES(ENV({"KOKKOS_VISIBLE_DEVICES", "1,3,4"}), 6, DEV(1, 3, 4)); KOKKOS_TEST_VISIBLE_DEVICES( diff --git a/core/unit_test/TestRangePolicyCTAD.cpp b/core/unit_test/TestRangePolicyCTAD.cpp new file mode 100644 index 00000000000..20288e2b40a --- /dev/null +++ b/core/unit_test/TestRangePolicyCTAD.cpp @@ -0,0 +1,150 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 
4.0
+// Copyright (2022) National Technology & Engineering
+// Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions.
+// See https://kokkos.org/LICENSE for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//@HEADER
+
+#include <Kokkos_Core.hpp>
+#include "Kokkos_Core_fwd.hpp"
+
+namespace {
+
+struct TestRangePolicyCTAD {
+  struct SomeExecutionSpace {
+    using execution_space = SomeExecutionSpace;
+    using size_type       = size_t;
+
+    [[maybe_unused]] static int concurrency() { return 0; }
+  };
+  static_assert(Kokkos::is_execution_space_v<SomeExecutionSpace>);
+
+  struct ImplicitlyConvertibleToDefaultExecutionSpace {
+    [[maybe_unused]] operator Kokkos::DefaultExecutionSpace() const {
+      return Kokkos::DefaultExecutionSpace();
+    }
+  };
+  static_assert(!Kokkos::is_execution_space_v<
+                ImplicitlyConvertibleToDefaultExecutionSpace>);
+
+  [[maybe_unused]] static inline auto i64 = int64_t();
+  [[maybe_unused]] static inline auto i32 = int32_t();
+  [[maybe_unused]] static inline auto cs  = Kokkos::ChunkSize(0);
+  [[maybe_unused]] static inline auto des = Kokkos::DefaultExecutionSpace();
+  [[maybe_unused]] static inline auto nes =
+      ImplicitlyConvertibleToDefaultExecutionSpace();
+  [[maybe_unused]] static inline auto ses = SomeExecutionSpace();
+
+  // RangePolicy()
+
+  [[maybe_unused]] static inline auto rp = Kokkos::RangePolicy{};
+  static_assert(std::is_same_v<Kokkos::RangePolicy<>, decltype(rp)>);
+
+  // RangePolicy(index_type, index_type)
+
+  [[maybe_unused]] static inline auto rpi64i64 = Kokkos::RangePolicy(i64, i64);
+  static_assert(std::is_same_v<Kokkos::RangePolicy<>, decltype(rpi64i64)>);
+
+  [[maybe_unused]] static inline auto rpi64i32 = Kokkos::RangePolicy(i64, i32);
+  static_assert(std::is_same_v<Kokkos::RangePolicy<>, decltype(rpi64i32)>);
+
+  [[maybe_unused]] static inline auto rpi32i64 = Kokkos::RangePolicy(i32, i64);
+  static_assert(std::is_same_v<Kokkos::RangePolicy<>, decltype(rpi32i64)>);
+
+  [[maybe_unused]] static inline auto rpi32i32 = Kokkos::RangePolicy(i32, i32);
+  static_assert(std::is_same_v<Kokkos::RangePolicy<>, decltype(rpi32i32)>);
+
+  // RangePolicy(index_type, index_type, ChunkSize)
+
+  [[maybe_unused]] static inline auto rpi64i64cs =
+      Kokkos::RangePolicy(i64, i64, cs);
+  static_assert(std::is_same_v<Kokkos::RangePolicy<>, decltype(rpi64i64cs)>);
+
+  [[maybe_unused]] static inline auto rpi64i32cs =
+      Kokkos::RangePolicy(i64, i32, cs);
+  static_assert(std::is_same_v<Kokkos::RangePolicy<>, decltype(rpi64i32cs)>);
+
+  [[maybe_unused]] static inline auto rpi32i64cs =
+      Kokkos::RangePolicy(i32, i64, cs);
+  static_assert(std::is_same_v<Kokkos::RangePolicy<>, decltype(rpi32i64cs)>);
+
+  [[maybe_unused]] static inline auto rpi32i32cs =
+      Kokkos::RangePolicy(i32, i32, cs);
+  static_assert(std::is_same_v<Kokkos::RangePolicy<>, decltype(rpi32i32cs)>);
+
+  // RangePolicy(execution_space, index_type, index_type)
+
+  [[maybe_unused]] static inline auto rpdesi64i64 =
+      Kokkos::RangePolicy(des, i64, i64);
+  static_assert(std::is_same_v<Kokkos::RangePolicy<>, decltype(rpdesi64i64)>);
+
+  [[maybe_unused]] static inline auto rpdesi32i32 =
+      Kokkos::RangePolicy(des, i32, i32);
+  static_assert(std::is_same_v<Kokkos::RangePolicy<>, decltype(rpdesi32i32)>);
+
+  [[maybe_unused]] static inline auto rpnesi64i64 =
+      Kokkos::RangePolicy(nes, i64, i64);
+  static_assert(std::is_same_v<Kokkos::RangePolicy<>, decltype(rpnesi64i64)>);
+
+  [[maybe_unused]] static inline auto rpnesi32i32 =
+      Kokkos::RangePolicy(nes, i32, i32);
+  static_assert(std::is_same_v<Kokkos::RangePolicy<>, decltype(rpnesi32i32)>);
+
+  [[maybe_unused]] static inline auto rpsesi64i64 =
+      Kokkos::RangePolicy(ses, i64, i64);
+  static_assert(std::is_same_v<Kokkos::RangePolicy<SomeExecutionSpace>,
+                               decltype(rpsesi64i64)>);
+
+  [[maybe_unused]] static inline auto rpsesi32i32 =
+      Kokkos::RangePolicy(ses, i32, i32);
+  static_assert(std::is_same_v<Kokkos::RangePolicy<SomeExecutionSpace>,
+                               decltype(rpsesi32i32)>);
+
+  // RangePolicy(execution_space, index_type, index_type, ChunkSize)
+
+  [[maybe_unused]] static inline auto rpdesi64i64cs =
+      Kokkos::RangePolicy(des, i64, i64, cs);
+  static_assert(std::is_same_v<Kokkos::RangePolicy<>, decltype(rpdesi64i64cs)>);
+
+  [[maybe_unused]] static inline auto rpdesi32i32cs =
+      Kokkos::RangePolicy(des, i32, i32, cs);
+  static_assert(std::is_same_v<Kokkos::RangePolicy<>, decltype(rpdesi32i32cs)>);
+
+  [[maybe_unused]] static inline auto rpnesi64i64cs =
+      Kokkos::RangePolicy(nes, i64, i64, cs);
+  static_assert(std::is_same_v<Kokkos::RangePolicy<>, decltype(rpnesi64i64cs)>);
+
+  [[maybe_unused]] static inline auto rpnesi32i32cs =
+      Kokkos::RangePolicy(nes, i32, i32, cs);
+  static_assert(std::is_same_v<Kokkos::RangePolicy<>, decltype(rpnesi32i32cs)>);
+
+  [[maybe_unused]] static inline auto rpsesi64i64cs =
+      Kokkos::RangePolicy(ses, i64, i64, cs);
+  static_assert(std::is_same_v<Kokkos::RangePolicy<SomeExecutionSpace>,
+                               decltype(rpsesi64i64cs)>);
+
+  [[maybe_unused]] static inline auto rpsesi32i32cs =
+      Kokkos::RangePolicy(ses, i32, i32, cs);
+  static_assert(std::is_same_v<Kokkos::RangePolicy<SomeExecutionSpace>,
+                               decltype(rpsesi32i32cs)>);
+
+};  // TestRangePolicyCTAD struct
+
+// To eliminate maybe_unused warning on some compilers
+
+[[maybe_unused]] const Kokkos::DefaultExecutionSpace nestodes =
+    TestRangePolicyCTAD::ImplicitlyConvertibleToDefaultExecutionSpace();
+
+[[maybe_unused]] const auto sesconcurrency =
+    TestRangePolicyCTAD::ses.concurrency();
+
+}  // namespace
diff --git a/core/unit_test/TestRangePolicyConstructors.hpp b/core/unit_test/TestRangePolicyConstructors.hpp
index 0a7e59ed980..c8c1542af13 100644
--- a/core/unit_test/TestRangePolicyConstructors.hpp
+++ b/core/unit_test/TestRangePolicyConstructors.hpp
@@ -18,6 +18,9 @@
 
 #include <Kokkos_Core.hpp>
 
+#include <limits>
+#include <regex>
+
 namespace {
 
 TEST(TEST_CATEGORY, range_policy_runtime_parameters) {
@@ -70,4 +73,127 @@ TEST(TEST_CATEGORY, range_policy_runtime_parameters) {
   }
 }
 
+TEST(TEST_CATEGORY_DEATH, range_policy_invalid_bounds) {
+  using Policy    = Kokkos::RangePolicy<TEST_EXECSPACE>;
+  using ChunkSize = Kokkos::ChunkSize;
+
+  std::string msg =
+      "Kokkos::RangePolicy bounds error: The lower bound (100) is greater than "
+      "the upper bound (90).\n";
+#ifndef KOKKOS_ENABLE_DEPRECATED_CODE_4
+  // escape the parentheses in the regex to match the error message
+  msg = std::regex_replace(msg, std::regex("\\(|\\)"), "\\$&");
+  ASSERT_DEATH({ (void)Policy(100, 90); }, msg);
+
+  ASSERT_DEATH({ (void)Policy(TEST_EXECSPACE(), 100, 90, ChunkSize(10)); },
+               msg);
+#else
+
+  if (!Kokkos::show_warnings()) {
+    GTEST_SKIP() << "Kokkos warning messages are disabled";
+  }
+
+  {
+    ::testing::internal::CaptureStderr();
+    Policy policy(100, 90);
+    ASSERT_EQ((int)policy.begin(), 0);
+    ASSERT_EQ((int)policy.end(), 0);
+#ifdef KOKKOS_ENABLE_DEPRECATION_WARNINGS
+    ASSERT_EQ(::testing::internal::GetCapturedStderr(), msg);
+#else
+    ASSERT_TRUE(::testing::internal::GetCapturedStderr().empty());
+    (void)msg;
+#endif
+  }
+
+  {
+    ::testing::internal::CaptureStderr();
+    Policy policy(TEST_EXECSPACE(), 100, 90, ChunkSize(10));
+    ASSERT_EQ((int)policy.begin(), 0);
+    ASSERT_EQ((int)policy.end(), 0);
+#ifdef KOKKOS_ENABLE_DEPRECATION_WARNINGS
+    ASSERT_EQ(::testing::internal::GetCapturedStderr(), msg);
+#else
+    ASSERT_TRUE(::testing::internal::GetCapturedStderr().empty());
+    (void)msg;
+#endif
+  }
+
+#endif
+}
+
+TEST(TEST_CATEGORY_DEATH, range_policy_implicitly_converted_bounds) {
+  using UIntIndexType = Kokkos::IndexType<unsigned>;
+  using
IntIndexType = Kokkos::IndexType; + using UIntPolicy = Kokkos::RangePolicy; + using IntPolicy = Kokkos::RangePolicy; + + std::string msg = + "Kokkos::RangePolicy bound type error: an unsafe implicit conversion is " + "performed on a bound (), which may not preserve its original value.\n"; + + auto get_error_msg = [](auto str, auto val) { + return str.insert(str.find("(") + 1, std::to_string(val).c_str()); + }; +#ifndef KOKKOS_ENABLE_DEPRECATED_CODE_4 + std::string expected = std::regex_replace(msg, std::regex("\\(|\\)"), "\\$&"); + { + int test_val = -1; + ASSERT_DEATH({ (void)UIntPolicy(test_val, 10); }, + get_error_msg(expected, test_val)); + } + { + unsigned test_val = std::numeric_limits::max(); + ASSERT_DEATH({ (void)IntPolicy(0u, test_val); }, + get_error_msg(expected, test_val)); + } + { + long long test_val = std::numeric_limits::max(); + ASSERT_DEATH({ (void)IntPolicy(0LL, test_val); }, + get_error_msg(expected, test_val)); + } + { + int test_val = -1; + ASSERT_DEATH({ (void)UIntPolicy(test_val, 10, Kokkos::ChunkSize(2)); }, + get_error_msg(expected, test_val)); + } + +#else + { + ::testing::internal::CaptureStderr(); + int test_val = -1; + UIntPolicy policy(test_val, 10); + ASSERT_EQ(policy.begin(), 0u); + ASSERT_EQ(policy.end(), 0u); +#ifdef KOKKOS_ENABLE_DEPRECATION_WARNINGS + if (Kokkos::show_warnings()) { + auto s = std::string(::testing::internal::GetCapturedStderr()); + ASSERT_EQ(s.substr(0, s.find("\n") + 1), get_error_msg(msg, test_val)); + } +#else + ASSERT_TRUE(::testing::internal::GetCapturedStderr().empty()); + (void)msg; + (void)get_error_msg; +#endif + } + { + ::testing::internal::CaptureStderr(); + unsigned test_val = std::numeric_limits::max(); + IntPolicy policy(0u, test_val); + ASSERT_EQ(policy.begin(), 0); + ASSERT_EQ(policy.end(), 0); +#ifdef KOKKOS_ENABLE_DEPRECATION_WARNINGS + if (Kokkos::show_warnings()) { + auto s = std::string(::testing::internal::GetCapturedStderr()); + ASSERT_EQ(s.substr(0, s.find("\n") + 1), get_error_msg(msg, test_val)); + } +#else + ASSERT_TRUE(::testing::internal::GetCapturedStderr().empty()); + (void)msg; + (void)get_error_msg; +#endif + } +#endif +} + } // namespace diff --git a/core/unit_test/TestReducers.hpp b/core/unit_test/TestReducers.hpp index 957b9a0ca1a..fbcb9629af0 100644 --- a/core/unit_test/TestReducers.hpp +++ b/core/unit_test/TestReducers.hpp @@ -19,6 +19,7 @@ #include #include +#include //-------------------------------------------------------------------------- @@ -46,6 +47,37 @@ struct TestReducers { void operator()(const int& i, Scalar& value) const { value += values(i); } }; + struct TeamSumFunctor { + using member_type = typename Kokkos::TeamPolicy::member_type; + + KOKKOS_INLINE_FUNCTION + void operator()(const member_type& m, Scalar& value) const { + if (m.team_rank() == m.team_size() - 1) value += Scalar(1); + } + }; + + struct TeamSumNestedFunctor { + using member_type = typename Kokkos::TeamPolicy::member_type; + + SumFunctor f; + int M, N; + Kokkos::View result; + + TeamSumNestedFunctor(SumFunctor& f_, const int M_, const int N_, + Kokkos::View result_) + : f(f_), M(M_), N(N_), result(result_) {} + + KOKKOS_INLINE_FUNCTION + void operator()(const member_type& m) const { + const int i = m.league_rank(); + Scalar local_scalar; + Kokkos::Sum reducer_scalar( + local_scalar); + Kokkos::parallel_reduce(Kokkos::TeamThreadRange(m, N), f, reducer_scalar); + result(i) = local_scalar; + } + }; + struct ProdFunctor { Kokkos::View values; @@ -319,6 +351,102 @@ struct TestReducers { value = value || values(i); } }; + + 
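The nested-reduction pattern that TeamSumNestedFunctor exercises above looks, in isolation, roughly like the sketch below. The function name, view shapes, and scalar type are illustrative, not part of this change:

#include <Kokkos_Core.hpp>

void team_sums(Kokkos::View<double*> out, Kokkos::View<double**> data) {
  using team_policy = Kokkos::TeamPolicy<>;
  using member_type = team_policy::member_type;
  const int n       = data.extent_int(1);
  Kokkos::parallel_for(
      "team_sums", team_policy(out.extent_int(0), Kokkos::AUTO),
      KOKKOS_LAMBDA(const member_type& team) {
        double partial = 0;
        // Each team reduces one row into a Sum reducer ...
        Kokkos::parallel_reduce(
            Kokkos::TeamThreadRange(team, n),
            [&](int j, double& acc) { acc += data(team.league_rank(), j); },
            Kokkos::Sum<double>(partial));
        // ... and a single thread publishes the team's result.
        Kokkos::single(Kokkos::PerTeam(team),
                       [&]() { out(team.league_rank()) = partial; });
      });
}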
// get number of teams for TeamPolicy depending on the tested type + constexpr static int get_num_teams() { + if constexpr (sizeof(Scalar) == 1) { + return 126; + } else if constexpr (std::is_same_v) { + return 256; + } + + return 1024; + } + + static void test_sum_team_policy(int N, SumFunctor f, Scalar reference_sum) { +#ifdef KOKKOS_ENABLE_OPENACC + if constexpr (std::is_same_v && + (std::is_same_v || + std::is_same_v)) { + return; // FIXME_OPENACC + } +#endif + + Scalar sum_scalar; + Kokkos::View sum_view("result"); + Kokkos::deep_copy(sum_view, Scalar(1)); + + // Test team policy reduction + { + constexpr int num_teams = get_num_teams(); + TeamSumFunctor tf; + // FIXME_OPENMPTARGET temporary restriction for team size to be at least + // 32 +#ifdef KOKKOS_ENABLE_OPENMPTARGET + int team_size = + std::is_same::value + ? 32 + : 1; +#else + int team_size = 1; +#endif + auto team_pol = Kokkos::TeamPolicy(num_teams, team_size); + Kokkos::parallel_reduce(team_pol, tf, sum_view); + Kokkos::deep_copy(sum_scalar, sum_view); + ASSERT_EQ(sum_scalar, Scalar{num_teams}) << "num_teams: " << num_teams; + } + + // Test TeamThreadRange level reduction with 0 work produces 0 result + { + const int league_size = 1; + Kokkos::View result("result", league_size); + TeamSumNestedFunctor tnf(f, league_size, 0, result); + // FIXME_OPENMPTARGET temporary restriction for team size to be at least + // 32 +#ifdef KOKKOS_ENABLE_OPENMPTARGET + int team_size = + std::is_same::value + ? 32 + : 1; +#else + int team_size = 1; +#endif + auto team_pol = Kokkos::TeamPolicy(1, team_size); + Kokkos::parallel_for(team_pol, tnf); + auto result_h = + Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), result); + ASSERT_EQ(result_h(0), Scalar{0}) << "N: " << N; + } + + // Same test as above, but with inner reduction over N, and league_size=10 + { + const int league_size = 10; + Kokkos::View result("result", league_size); + TeamSumNestedFunctor tnf(f, league_size, N, result); + // FIXME_OPENMPTARGET temporary restriction for team size to be at least + // 32 +#ifdef KOKKOS_ENABLE_OPENMPTARGET + int initial_team_size = + std::is_same_v ? 
32 + : 1; +#else + int initial_team_size = 1; +#endif + auto team_size_max = + Kokkos::TeamPolicy(league_size, initial_team_size) + .team_size_max(tnf, Kokkos::ParallelForTag()); + auto team_size = std::min(team_size_max, TEST_EXECSPACE().concurrency()); + auto team_pol = Kokkos::TeamPolicy(league_size, team_size); + Kokkos::parallel_for(team_pol, tnf); + auto result_h = + Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), result); + for (int i = 0; i < result_h.extent_int(0); ++i) { + ASSERT_EQ(result_h(i), reference_sum) << "N: " << N; + } + } + } + static void test_sum(int N) { Kokkos::View values("Values", N); auto h_values = Kokkos::create_mirror_view(values); @@ -374,6 +502,8 @@ struct TestReducers { ASSERT_EQ(sum_scalar_view, reference_sum) << "N: " << N; } + test_sum_team_policy(N, f, reference_sum); + { Kokkos::View sum_view("View"); sum_view() = Scalar(1); diff --git a/core/unit_test/TestReducers_d.hpp b/core/unit_test/TestReducers_d.hpp index 19eaa6d7000..ecf851aa108 100644 --- a/core/unit_test/TestReducers_d.hpp +++ b/core/unit_test/TestReducers_d.hpp @@ -80,6 +80,20 @@ TEST(TEST_CATEGORY, reducers_int8_t) { TestReducers::test_prod(4); } +TEST(TEST_CATEGORY, reducers_int16_t) { + using ThisTestType = int16_t; + + TestReducers::test_sum(1); + TestReducers::test_sum(2); + TestReducers::test_sum(3); + TestReducers::test_sum(4); + + TestReducers::test_prod(1); + TestReducers::test_prod(2); + TestReducers::test_prod(3); + TestReducers::test_prod(4); +} + #if !defined(KOKKOS_ENABLE_HIP) && !defined(KOKKOS_ENABLE_OPENMPTARGET) // TODO - resolve: "Kokkos_HIP_Vectorization.hpp:80:15: error: call to // implicitly-deleted default constructor of 'conv_type' diff --git a/core/unit_test/TestSwap.hpp b/core/unit_test/TestSwap.hpp new file mode 100644 index 00000000000..4e98351cf19 --- /dev/null +++ b/core/unit_test/TestSwap.hpp @@ -0,0 +1,68 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. 
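The file introduced below, TestSwap.hpp, exercises Kokkos::kokkos_swap, a device-callable counterpart to std::swap with scalar and C-array overloads. A minimal host-side sketch of the API (the wrapper function is illustrative; inside kernels the calls look identical):

#include <Kokkos_Core.hpp>

void swap_demo() {
  int a = 1, b = 2;
  Kokkos::kokkos_swap(a, b);  // now a == 2, b == 1
  int u[3] = {1, 2, 3}, v[3] = {4, 5, 6};
  Kokkos::kokkos_swap(u, v);  // the array overload swaps element-wise
}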
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#include + +#include +#include +#include + +namespace { + +template +struct TestSwap { + KOKKOS_FUNCTION void operator()(int, int& err) const { + { + int a = 1; + int b = 2; + Kokkos::kokkos_swap(a, b); + if (!(a == 2 && b == 1)) { + Kokkos::printf("Failed Kokkos::kokkos_swap(int, int)\n"); + ++err; + } + } + { + float a = 1; + float b = 2; + Kokkos::kokkos_swap(a, b); + if (!(a == 2 && b == 1)) { + Kokkos::printf("Failed Kokkos::kokkos_swap(float, float)\n"); + ++err; + } + } + { + int a[3] = {1, 2, 3}; + int b[3] = {4, 5, 6}; + Kokkos::kokkos_swap(a, b); + if (!(a[0] == 4 && a[1] == 5 && a[2] == 6 && b[0] == 1 && b[1] == 2 && + b[2] == 3)) { + Kokkos::printf("Failed Kokkos::kokkos_swap(int[3], int[3])\n"); + ++err; + } + } + } + + TestSwap() { + int errors; + Kokkos::parallel_reduce( + "TestSwap", Kokkos::RangePolicy(0, 1), *this, errors); + EXPECT_EQ(errors, 0); + } +}; + +TEST(TEST_CATEGORY, kokkos_swap) { TestSwap(); } + +} // namespace diff --git a/core/unit_test/TestTeamBasic.hpp b/core/unit_test/TestTeamBasic.hpp index c395bc0837c..a3d84c5e16b 100644 --- a/core/unit_test/TestTeamBasic.hpp +++ b/core/unit_test/TestTeamBasic.hpp @@ -280,7 +280,7 @@ namespace Test { // Test for non-arithmetic type TEST(TEST_CATEGORY, team_broadcast_long_wrapper) { - static_assert(!std::is_arithmetic::value, ""); + static_assert(!std::is_arithmetic::value); TestTeamBroadcast, long_wrapper>::test_teambroadcast(0, 1); diff --git a/core/unit_test/TestTeamMDRange.hpp b/core/unit_test/TestTeamMDRange.hpp index 6e65cde0cf8..81931467c5a 100644 --- a/core/unit_test/TestTeamMDRange.hpp +++ b/core/unit_test/TestTeamMDRange.hpp @@ -169,7 +169,14 @@ struct TestTeamThreadMDRangeParallelFor : public TestTeamMDParallelFor { FillFlattenedIndex fillFlattenedIndex(leagueSize, n0, n1); Kokkos::parallel_for( - Kokkos::TeamPolicy(leagueSize, Kokkos::AUTO), + Kokkos::TeamPolicy(leagueSize, Kokkos::AUTO, +#ifdef KOKKOS_ENABLE_OPENMPTARGET + 2 +#else + Kokkos::TeamPolicy< + ExecSpace>::vector_length_max() +#endif + ), KOKKOS_LAMBDA(const TeamType& team) { int leagueRank = team.league_rank(); @@ -202,7 +209,14 @@ struct TestTeamThreadMDRangeParallelFor : public TestTeamMDParallelFor { FillFlattenedIndex fillFlattenedIndex(leagueSize, n0, n1, n2); Kokkos::parallel_for( - Kokkos::TeamPolicy(leagueSize, Kokkos::AUTO), + Kokkos::TeamPolicy(leagueSize, Kokkos::AUTO, +#ifdef KOKKOS_ENABLE_OPENMPTARGET + 2 +#else + Kokkos::TeamPolicy< + ExecSpace>::vector_length_max() +#endif + ), KOKKOS_LAMBDA(const TeamType& team) { int leagueRank = team.league_rank(); @@ -236,7 +250,14 @@ struct TestTeamThreadMDRangeParallelFor : public TestTeamMDParallelFor { FillFlattenedIndex fillFlattenedIndex(leagueSize, n0, n1, n2, n3); Kokkos::parallel_for( - Kokkos::TeamPolicy(leagueSize, Kokkos::AUTO), + Kokkos::TeamPolicy(leagueSize, Kokkos::AUTO, +#ifdef KOKKOS_ENABLE_OPENMPTARGET + 2 +#else + Kokkos::TeamPolicy< + ExecSpace>::vector_length_max() +#endif + ), KOKKOS_LAMBDA(const TeamType& team) { int leagueRank = team.league_rank(); @@ -272,7 +293,14 @@ struct TestTeamThreadMDRangeParallelFor : public TestTeamMDParallelFor { FillFlattenedIndex fillFlattenedIndex(leagueSize, n0, n1, n2, n3, n4); Kokkos::parallel_for( - Kokkos::TeamPolicy(leagueSize, Kokkos::AUTO), + Kokkos::TeamPolicy(leagueSize, Kokkos::AUTO, +#ifdef KOKKOS_ENABLE_OPENMPTARGET + 2 +#else + Kokkos::TeamPolicy< + ExecSpace>::vector_length_max() +#endif + ), KOKKOS_LAMBDA(const TeamType& team) { int leagueRank = 
team.league_rank(); @@ -310,7 +338,14 @@ struct TestTeamThreadMDRangeParallelFor : public TestTeamMDParallelFor { FillFlattenedIndex fillFlattenedIndex(leagueSize, n0, n1, n2, n3, n4, n5); Kokkos::parallel_for( - Kokkos::TeamPolicy(leagueSize, Kokkos::AUTO), + Kokkos::TeamPolicy(leagueSize, Kokkos::AUTO, +#ifdef KOKKOS_ENABLE_OPENMPTARGET + 2 +#else + Kokkos::TeamPolicy< + ExecSpace>::vector_length_max() +#endif + ), KOKKOS_LAMBDA(const TeamType& team) { int leagueRank = team.league_rank(); @@ -350,7 +385,14 @@ struct TestTeamThreadMDRangeParallelFor : public TestTeamMDParallelFor { n6); Kokkos::parallel_for( - Kokkos::TeamPolicy(leagueSize, Kokkos::AUTO), + Kokkos::TeamPolicy(leagueSize, Kokkos::AUTO, +#ifdef KOKKOS_ENABLE_OPENMPTARGET + 2 +#else + Kokkos::TeamPolicy< + ExecSpace>::vector_length_max() +#endif + ), KOKKOS_LAMBDA(const TeamType& team) { int leagueRank = team.league_rank(); @@ -420,7 +462,14 @@ struct TestThreadVectorMDRangeParallelFor : public TestTeamMDParallelFor { FillFlattenedIndex fillFlattenedIndex(leagueSize, n0, n1, n2); Kokkos::parallel_for( - Kokkos::TeamPolicy(leagueSize, Kokkos::AUTO), + Kokkos::TeamPolicy(leagueSize, Kokkos::AUTO, +#ifdef KOKKOS_ENABLE_OPENMPTARGET + 2 +#else + Kokkos::TeamPolicy< + ExecSpace>::vector_length_max() +#endif + ), KOKKOS_LAMBDA(const TeamType& team) { int leagueRank = team.league_rank(); @@ -457,7 +506,14 @@ struct TestThreadVectorMDRangeParallelFor : public TestTeamMDParallelFor { FillFlattenedIndex fillFlattenedIndex(leagueSize, n0, n1, n2, n3); Kokkos::parallel_for( - Kokkos::TeamPolicy(leagueSize, Kokkos::AUTO), + Kokkos::TeamPolicy(leagueSize, Kokkos::AUTO, +#ifdef KOKKOS_ENABLE_OPENMPTARGET + 2 +#else + Kokkos::TeamPolicy< + ExecSpace>::vector_length_max() +#endif + ), KOKKOS_LAMBDA(const TeamType& team) { int leagueRank = team.league_rank(); @@ -496,7 +552,14 @@ struct TestThreadVectorMDRangeParallelFor : public TestTeamMDParallelFor { FillFlattenedIndex fillFlattenedIndex(leagueSize, n0, n1, n2, n3, n4); Kokkos::parallel_for( - Kokkos::TeamPolicy(leagueSize, Kokkos::AUTO), + Kokkos::TeamPolicy(leagueSize, Kokkos::AUTO, +#ifdef KOKKOS_ENABLE_OPENMPTARGET + 2 +#else + Kokkos::TeamPolicy< + ExecSpace>::vector_length_max() +#endif + ), KOKKOS_LAMBDA(const TeamType& team) { int leagueRank = team.league_rank(); @@ -536,7 +599,14 @@ struct TestThreadVectorMDRangeParallelFor : public TestTeamMDParallelFor { FillFlattenedIndex fillFlattenedIndex(leagueSize, n0, n1, n2, n3, n4, n5); Kokkos::parallel_for( - Kokkos::TeamPolicy(leagueSize, Kokkos::AUTO), + Kokkos::TeamPolicy(leagueSize, Kokkos::AUTO, +#ifdef KOKKOS_ENABLE_OPENMPTARGET + 2 +#else + Kokkos::TeamPolicy< + ExecSpace>::vector_length_max() +#endif + ), KOKKOS_LAMBDA(const TeamType& team) { int leagueRank = team.league_rank(); @@ -579,7 +649,14 @@ struct TestThreadVectorMDRangeParallelFor : public TestTeamMDParallelFor { n6); Kokkos::parallel_for( - Kokkos::TeamPolicy(leagueSize, Kokkos::AUTO), + Kokkos::TeamPolicy(leagueSize, Kokkos::AUTO, +#ifdef KOKKOS_ENABLE_OPENMPTARGET + 2 +#else + Kokkos::TeamPolicy< + ExecSpace>::vector_length_max() +#endif + ), KOKKOS_LAMBDA(const TeamType& team) { int leagueRank = team.league_rank(); @@ -620,7 +697,14 @@ struct TestTeamVectorMDRangeParallelFor : public TestTeamMDParallelFor { FillFlattenedIndex fillFlattenedIndex(leagueSize, n0, n1); Kokkos::parallel_for( - Kokkos::TeamPolicy(leagueSize, Kokkos::AUTO), + Kokkos::TeamPolicy(leagueSize, Kokkos::AUTO, +#ifdef KOKKOS_ENABLE_OPENMPTARGET + 2 +#else + Kokkos::TeamPolicy< + 
ExecSpace>::vector_length_max() +#endif + ), KOKKOS_LAMBDA(const TeamType& team) { int leagueRank = team.league_rank(); @@ -653,7 +737,14 @@ struct TestTeamVectorMDRangeParallelFor : public TestTeamMDParallelFor { FillFlattenedIndex fillFlattenedIndex(leagueSize, n0, n1, n2); Kokkos::parallel_for( - Kokkos::TeamPolicy(leagueSize, Kokkos::AUTO), + Kokkos::TeamPolicy(leagueSize, Kokkos::AUTO, +#ifdef KOKKOS_ENABLE_OPENMPTARGET + 2 +#else + Kokkos::TeamPolicy< + ExecSpace>::vector_length_max() +#endif + ), KOKKOS_LAMBDA(const TeamType& team) { int leagueRank = team.league_rank(); @@ -687,7 +778,14 @@ struct TestTeamVectorMDRangeParallelFor : public TestTeamMDParallelFor { FillFlattenedIndex fillFlattenedIndex(leagueSize, n0, n1, n2, n3); Kokkos::parallel_for( - Kokkos::TeamPolicy(leagueSize, Kokkos::AUTO), + Kokkos::TeamPolicy(leagueSize, Kokkos::AUTO, +#ifdef KOKKOS_ENABLE_OPENMPTARGET + 2 +#else + Kokkos::TeamPolicy< + ExecSpace>::vector_length_max() +#endif + ), KOKKOS_LAMBDA(const TeamType& team) { int leagueRank = team.league_rank(); @@ -723,7 +821,14 @@ struct TestTeamVectorMDRangeParallelFor : public TestTeamMDParallelFor { FillFlattenedIndex fillFlattenedIndex(leagueSize, n0, n1, n2, n3, n4); Kokkos::parallel_for( - Kokkos::TeamPolicy(leagueSize, Kokkos::AUTO), + Kokkos::TeamPolicy(leagueSize, Kokkos::AUTO, +#ifdef KOKKOS_ENABLE_OPENMPTARGET + 2 +#else + Kokkos::TeamPolicy< + ExecSpace>::vector_length_max() +#endif + ), KOKKOS_LAMBDA(const TeamType& team) { int leagueRank = team.league_rank(); @@ -761,7 +866,14 @@ struct TestTeamVectorMDRangeParallelFor : public TestTeamMDParallelFor { FillFlattenedIndex fillFlattenedIndex(leagueSize, n0, n1, n2, n3, n4, n5); Kokkos::parallel_for( - Kokkos::TeamPolicy(leagueSize, Kokkos::AUTO), + Kokkos::TeamPolicy(leagueSize, Kokkos::AUTO, +#ifdef KOKKOS_ENABLE_OPENMPTARGET + 2 +#else + Kokkos::TeamPolicy< + ExecSpace>::vector_length_max() +#endif + ), KOKKOS_LAMBDA(const TeamType& team) { int leagueRank = team.league_rank(); @@ -801,7 +913,14 @@ struct TestTeamVectorMDRangeParallelFor : public TestTeamMDParallelFor { n6); Kokkos::parallel_for( - Kokkos::TeamPolicy(leagueSize, Kokkos::AUTO), + Kokkos::TeamPolicy(leagueSize, Kokkos::AUTO, +#ifdef KOKKOS_ENABLE_OPENMPTARGET + 2 +#else + Kokkos::TeamPolicy< + ExecSpace>::vector_length_max() +#endif + ), KOKKOS_LAMBDA(const TeamType& team) { int leagueRank = team.league_rank(); @@ -908,13 +1027,20 @@ struct TestTeamThreadMDRangeParallelReduce : public TestTeamMDParallelReduce { v(i, j, k) = fillFlattenedIndex(i, j, k); }); - DataType finalSum = 0; + DataType finalSum; Kokkos::parallel_reduce( - Kokkos::TeamPolicy(leagueSize, Kokkos::AUTO), + Kokkos::TeamPolicy(leagueSize, Kokkos::AUTO, +#ifdef KOKKOS_ENABLE_OPENMPTARGET + 2 +#else + Kokkos::TeamPolicy< + ExecSpace>::vector_length_max() +#endif + ), KOKKOS_LAMBDA(TeamType const& team, DataType& leagueSum) { - auto leagueRank = team.league_rank(); - DataType teamSum = 0; + auto leagueRank = team.league_rank(); + DataType teamSum; Kokkos::parallel_reduce( Kokkos::TeamThreadMDRange, TeamType>( @@ -923,7 +1049,13 @@ struct TestTeamThreadMDRangeParallelReduce : public TestTeamMDParallelReduce { threadSum += v(leagueRank, i, j); }, teamSum); - leagueSum += teamSum; + // FIXME_OPENMPTARGET +#ifdef KOKKOS_ENABLE_OPENMPTARGET + if (team.team_rank() == 0) leagueSum += teamSum; +#else + Kokkos::single(Kokkos::PerTeam(team), + [&]() { leagueSum += teamSum; }); +#endif }, finalSum); @@ -952,13 +1084,20 @@ struct TestTeamThreadMDRangeParallelReduce : public 
TestTeamMDParallelReduce { v(i, j, k, l) = fillFlattenedIndex(i, j, k, l); }); - DataType finalSum = 0; + DataType finalSum; Kokkos::parallel_reduce( - Kokkos::TeamPolicy(leagueSize, Kokkos::AUTO), + Kokkos::TeamPolicy(leagueSize, Kokkos::AUTO, +#ifdef KOKKOS_ENABLE_OPENMPTARGET + 2 +#else + Kokkos::TeamPolicy< + ExecSpace>::vector_length_max() +#endif + ), KOKKOS_LAMBDA(TeamType const& team, DataType& leagueSum) { - auto leagueRank = team.league_rank(); - DataType teamSum = 0; + auto leagueRank = team.league_rank(); + DataType teamSum; Kokkos::parallel_reduce( Kokkos::TeamThreadMDRange, TeamType>( @@ -966,7 +1105,13 @@ struct TestTeamThreadMDRangeParallelReduce : public TestTeamMDParallelReduce { [=](const int& i, const int& j, const int& k, DataType& threadSum) { threadSum += v(leagueRank, i, j, k); }, teamSum); - leagueSum += teamSum; +// FIXME_OPENMPTARGET +#ifdef KOKKOS_ENABLE_OPENMPTARGET + if (team.team_rank() == 0) leagueSum += teamSum; +#else + Kokkos::single(Kokkos::PerTeam(team), + [&]() { leagueSum += teamSum; }); +#endif }, finalSum); @@ -997,13 +1142,20 @@ struct TestTeamThreadMDRangeParallelReduce : public TestTeamMDParallelReduce { v(i, j, k, l, m) = fillFlattenedIndex(i, j, k, l, m); }); - DataType finalSum = 0; + DataType finalSum; Kokkos::parallel_reduce( - Kokkos::TeamPolicy(leagueSize, Kokkos::AUTO), + Kokkos::TeamPolicy(leagueSize, Kokkos::AUTO, +#ifdef KOKKOS_ENABLE_OPENMPTARGET + 2 +#else + Kokkos::TeamPolicy< + ExecSpace>::vector_length_max() +#endif + ), KOKKOS_LAMBDA(TeamType const& team, DataType& leagueSum) { - auto leagueRank = team.league_rank(); - DataType teamSum = 0; + auto leagueRank = team.league_rank(); + DataType teamSum; Kokkos::parallel_reduce( Kokkos::TeamThreadMDRange, TeamType>( @@ -1013,7 +1165,13 @@ struct TestTeamThreadMDRangeParallelReduce : public TestTeamMDParallelReduce { threadSum += v(leagueRank, i, j, k, l); }, teamSum); - leagueSum += teamSum; +// FIXME_OPENMPTARGET +#ifdef KOKKOS_ENABLE_OPENMPTARGET + if (team.team_rank() == 0) leagueSum += teamSum; +#else + Kokkos::single(Kokkos::PerTeam(team), + [&]() { leagueSum += teamSum; }); +#endif }, finalSum); @@ -1045,13 +1203,20 @@ struct TestTeamThreadMDRangeParallelReduce : public TestTeamMDParallelReduce { v(i, j, k, l, m, n) = fillFlattenedIndex(i, j, k, l, m, n); }); - DataType finalSum = 0; + DataType finalSum; Kokkos::parallel_reduce( - Kokkos::TeamPolicy(leagueSize, Kokkos::AUTO), + Kokkos::TeamPolicy(leagueSize, Kokkos::AUTO, +#ifdef KOKKOS_ENABLE_OPENMPTARGET + 2 +#else + Kokkos::TeamPolicy< + ExecSpace>::vector_length_max() +#endif + ), KOKKOS_LAMBDA(TeamType const& team, DataType& leagueSum) { - auto leagueRank = team.league_rank(); - DataType teamSum = 0; + auto leagueRank = team.league_rank(); + DataType teamSum; Kokkos::parallel_reduce( Kokkos::TeamThreadMDRange, TeamType>( @@ -1061,7 +1226,13 @@ struct TestTeamThreadMDRangeParallelReduce : public TestTeamMDParallelReduce { threadSum += v(leagueRank, i, j, k, l, m); }, teamSum); - leagueSum += teamSum; + // FIXME_OPENMPTARGET +#ifdef KOKKOS_ENABLE_OPENMPTARGET + if (team.team_rank() == 0) leagueSum += teamSum; +#else + Kokkos::single(Kokkos::PerTeam(team), + [&]() { leagueSum += teamSum; }); +#endif }, finalSum); @@ -1100,13 +1271,20 @@ struct TestTeamThreadMDRangeParallelReduce : public TestTeamMDParallelReduce { } }); - DataType finalSum = 0; + DataType finalSum; Kokkos::parallel_reduce( - Kokkos::TeamPolicy(leagueSize, Kokkos::AUTO), + Kokkos::TeamPolicy(leagueSize, Kokkos::AUTO, +#ifdef KOKKOS_ENABLE_OPENMPTARGET + 2 
+#else + Kokkos::TeamPolicy< + ExecSpace>::vector_length_max() +#endif + ), KOKKOS_LAMBDA(TeamType const& team, DataType& leagueSum) { - auto leagueRank = team.league_rank(); - DataType teamSum = 0; + auto leagueRank = team.league_rank(); + DataType teamSum; Kokkos::parallel_reduce( Kokkos::TeamThreadMDRange, TeamType>( @@ -1116,7 +1294,13 @@ struct TestTeamThreadMDRangeParallelReduce : public TestTeamMDParallelReduce { threadSum += v(leagueRank, i, j, k, l, m, n); }, teamSum); - leagueSum += teamSum; +// FIXME_OPENMPTARGET +#ifdef KOKKOS_ENABLE_OPENMPTARGET + if (team.team_rank() == 0) leagueSum += teamSum; +#else + Kokkos::single(Kokkos::PerTeam(team), + [&]() { leagueSum += teamSum; }); +#endif }, finalSum); @@ -1157,13 +1341,20 @@ struct TestTeamThreadMDRangeParallelReduce : public TestTeamMDParallelReduce { } }); - DataType finalSum = 0; + DataType finalSum; Kokkos::parallel_reduce( - Kokkos::TeamPolicy(leagueSize, Kokkos::AUTO), + Kokkos::TeamPolicy(leagueSize, Kokkos::AUTO, +#ifdef KOKKOS_ENABLE_OPENMPTARGET + 2 +#else + Kokkos::TeamPolicy< + ExecSpace>::vector_length_max() +#endif + ), KOKKOS_LAMBDA(TeamType const& team, DataType& leagueSum) { - auto leagueRank = team.league_rank(); - DataType teamSum = 0; + auto leagueRank = team.league_rank(); + DataType teamSum; Kokkos::parallel_reduce( Kokkos::TeamThreadMDRange, TeamType>( @@ -1174,7 +1365,13 @@ struct TestTeamThreadMDRangeParallelReduce : public TestTeamMDParallelReduce { threadSum += v(leagueRank, i, j, k, l, m, n, o); }, teamSum); - leagueSum += teamSum; +// FIXME_OPENMPTARGET +#ifdef KOKKOS_ENABLE_OPENMPTARGET + if (team.team_rank() == 0) leagueSum += teamSum; +#else + Kokkos::single(Kokkos::PerTeam(team), + [&]() { leagueSum += teamSum; }); +#endif }, finalSum); @@ -1207,20 +1404,26 @@ struct TestThreadVectorMDRangeParallelReduce : public TestTeamMDParallelReduce { v(i, j, k, l) = fillFlattenedIndex(i, j, k, l); }); - DataType finalSum = 0; + DataType finalSum; Kokkos::parallel_reduce( - Kokkos::TeamPolicy(leagueSize, Kokkos::AUTO), + Kokkos::TeamPolicy(leagueSize, Kokkos::AUTO, +#ifdef KOKKOS_ENABLE_OPENMPTARGET + 2 +#else + Kokkos::TeamPolicy< + ExecSpace>::vector_length_max() +#endif + ), KOKKOS_LAMBDA(TeamType const& team, DataType& leagueSum) { - auto leagueRank = team.league_rank(); - DataType teamSum = 0; + auto leagueRank = team.league_rank(); auto teamThreadRange = Kokkos::TeamThreadRange(team, n0); auto threadVectorRange = Kokkos::ThreadVectorMDRange, TeamType>( team, n1, n2); - Kokkos::parallel_for(teamThreadRange, [=, &teamSum](const int& i) { + Kokkos::parallel_for(teamThreadRange, [=, &leagueSum](const int& i) { DataType threadSum = 0; Kokkos::parallel_reduce( threadVectorRange, @@ -1228,11 +1431,9 @@ struct TestThreadVectorMDRangeParallelReduce : public TestTeamMDParallelReduce { vectorSum += v(leagueRank, i, j, k); }, threadSum); - - teamSum += threadSum; + Kokkos::single(Kokkos::PerThread(team), + [&]() { leagueSum += threadSum; }); }); - - leagueSum += teamSum; }, finalSum); @@ -1263,20 +1464,26 @@ struct TestThreadVectorMDRangeParallelReduce : public TestTeamMDParallelReduce { v(i, j, k, l, m) = fillFlattenedIndex(i, j, k, l, m); }); - DataType finalSum = 0; + DataType finalSum; Kokkos::parallel_reduce( - Kokkos::TeamPolicy(leagueSize, Kokkos::AUTO), + Kokkos::TeamPolicy(leagueSize, Kokkos::AUTO, +#ifdef KOKKOS_ENABLE_OPENMPTARGET + 2 +#else + Kokkos::TeamPolicy< + ExecSpace>::vector_length_max() +#endif + ), KOKKOS_LAMBDA(TeamType const& team, DataType& leagueSum) { - auto leagueRank = 
team.league_rank(); - DataType teamSum = 0; + auto leagueRank = team.league_rank(); auto teamThreadRange = Kokkos::TeamThreadRange(team, n0); auto threadVectorRange = Kokkos::ThreadVectorMDRange, TeamType>( team, n1, n2, n3); - Kokkos::parallel_for(teamThreadRange, [=, &teamSum](const int& i) { + Kokkos::parallel_for(teamThreadRange, [=, &leagueSum](const int& i) { DataType threadSum = 0; Kokkos::parallel_reduce( threadVectorRange, @@ -1286,10 +1493,9 @@ struct TestThreadVectorMDRangeParallelReduce : public TestTeamMDParallelReduce { }, threadSum); - teamSum += threadSum; + Kokkos::single(Kokkos::PerThread(team), + [&]() { leagueSum += threadSum; }); }); - - leagueSum += teamSum; }, finalSum); @@ -1321,20 +1527,26 @@ struct TestThreadVectorMDRangeParallelReduce : public TestTeamMDParallelReduce { v(i, j, k, l, m, n) = fillFlattenedIndex(i, j, k, l, m, n); }); - DataType finalSum = 0; + DataType finalSum; Kokkos::parallel_reduce( - Kokkos::TeamPolicy(leagueSize, Kokkos::AUTO), + Kokkos::TeamPolicy(leagueSize, Kokkos::AUTO, +#ifdef KOKKOS_ENABLE_OPENMPTARGET + 2 +#else + Kokkos::TeamPolicy< + ExecSpace>::vector_length_max() +#endif + ), KOKKOS_LAMBDA(TeamType const& team, DataType& leagueSum) { - auto leagueRank = team.league_rank(); - DataType teamSum = 0; + auto leagueRank = team.league_rank(); auto teamThreadRange = Kokkos::TeamThreadRange(team, n0); auto threadVectorRange = Kokkos::ThreadVectorMDRange, TeamType>( team, n1, n2, n3, n4); - Kokkos::parallel_for(teamThreadRange, [=, &teamSum](const int& i) { + Kokkos::parallel_for(teamThreadRange, [=, &leagueSum](const int& i) { DataType threadSum = 0; Kokkos::parallel_reduce( threadVectorRange, @@ -1344,10 +1556,9 @@ struct TestThreadVectorMDRangeParallelReduce : public TestTeamMDParallelReduce { }, threadSum); - teamSum += threadSum; + Kokkos::single(Kokkos::PerThread(team), + [&]() { leagueSum += threadSum; }); }); - - leagueSum += teamSum; }, finalSum); @@ -1384,20 +1595,26 @@ struct TestThreadVectorMDRangeParallelReduce : public TestTeamMDParallelReduce { } }); - DataType finalSum = 0; + DataType finalSum; Kokkos::parallel_reduce( - Kokkos::TeamPolicy(leagueSize, Kokkos::AUTO), + Kokkos::TeamPolicy(leagueSize, Kokkos::AUTO, +#ifdef KOKKOS_ENABLE_OPENMPTARGET + 2 +#else + Kokkos::TeamPolicy< + ExecSpace>::vector_length_max() +#endif + ), KOKKOS_LAMBDA(TeamType const& team, DataType& leagueSum) { - auto leagueRank = team.league_rank(); - DataType teamSum = 0; + auto leagueRank = team.league_rank(); auto teamThreadRange = Kokkos::TeamThreadRange(team, n0); auto threadVectorRange = Kokkos::ThreadVectorMDRange, TeamType>( team, n1, n2, n3, n4, n5); - Kokkos::parallel_for(teamThreadRange, [=, &teamSum](const int& i) { + Kokkos::parallel_for(teamThreadRange, [=, &leagueSum](const int& i) { DataType threadSum = 0; Kokkos::parallel_reduce( threadVectorRange, @@ -1407,10 +1624,9 @@ struct TestThreadVectorMDRangeParallelReduce : public TestTeamMDParallelReduce { }, threadSum); - teamSum += threadSum; + Kokkos::single(Kokkos::PerThread(team), + [&]() { leagueSum += threadSum; }); }); - - leagueSum += teamSum; }, finalSum); @@ -1451,20 +1667,26 @@ struct TestThreadVectorMDRangeParallelReduce : public TestTeamMDParallelReduce { } }); - DataType finalSum = 0; + DataType finalSum; Kokkos::parallel_reduce( - Kokkos::TeamPolicy(leagueSize, Kokkos::AUTO), + Kokkos::TeamPolicy(leagueSize, Kokkos::AUTO, +#ifdef KOKKOS_ENABLE_OPENMPTARGET + 2 +#else + Kokkos::TeamPolicy< + ExecSpace>::vector_length_max() +#endif + ), KOKKOS_LAMBDA(TeamType const& team, 
DataType& leagueSum) { - auto leagueRank = team.league_rank(); - DataType teamSum = 0; + auto leagueRank = team.league_rank(); auto teamThreadRange = Kokkos::TeamThreadRange(team, n0); auto threadVectorRange = Kokkos::ThreadVectorMDRange, TeamType>( team, n1, n2, n3, n4, n5, n6); - Kokkos::parallel_for(teamThreadRange, [=, &teamSum](const int& i) { + Kokkos::parallel_for(teamThreadRange, [=, &leagueSum](const int& i) { DataType threadSum = 0; Kokkos::parallel_reduce( threadVectorRange, @@ -1474,10 +1696,9 @@ struct TestThreadVectorMDRangeParallelReduce : public TestTeamMDParallelReduce { }, threadSum); - teamSum += threadSum; + Kokkos::single(Kokkos::PerThread(team), + [&]() { leagueSum += threadSum; }); }); - - leagueSum += teamSum; }, finalSum); @@ -1510,13 +1731,20 @@ struct TestTeamVectorMDRangeParallelReduce : public TestTeamMDParallelReduce { v(i, j, k, l) = fillFlattenedIndex(i, j, k, l); }); - DataType finalSum = 0; + DataType finalSum; Kokkos::parallel_reduce( - Kokkos::TeamPolicy(leagueSize, Kokkos::AUTO), + Kokkos::TeamPolicy(leagueSize, Kokkos::AUTO, +#ifdef KOKKOS_ENABLE_OPENMPTARGET + 2 +#else + Kokkos::TeamPolicy< + ExecSpace>::vector_length_max() +#endif + ), KOKKOS_LAMBDA(TeamType const& team, DataType& leagueSum) { - auto leagueRank = team.league_rank(); - DataType teamSum = 0; + auto leagueRank = team.league_rank(); + DataType teamSum; auto teamVectorRange = Kokkos::TeamVectorMDRange, TeamType>( @@ -1527,7 +1755,13 @@ struct TestTeamVectorMDRangeParallelReduce : public TestTeamMDParallelReduce { [=](const int& i, const int& j, const int& k, DataType& vectorSum) { vectorSum += v(leagueRank, i, j, k); }, teamSum); - leagueSum += teamSum; +// FIXME_OPENMPTARGET +#ifdef KOKKOS_ENABLE_OPENMPTARGET + if (team.team_rank() == 0) leagueSum += teamSum; +#else + Kokkos::single(Kokkos::PerTeam(team), + [&]() { leagueSum += teamSum; }); +#endif }, finalSum); @@ -1558,13 +1792,20 @@ struct TestTeamVectorMDRangeParallelReduce : public TestTeamMDParallelReduce { v(i, j, k, l, m) = fillFlattenedIndex(i, j, k, l, m); }); - DataType finalSum = 0; + DataType finalSum; Kokkos::parallel_reduce( - Kokkos::TeamPolicy(leagueSize, Kokkos::AUTO), + Kokkos::TeamPolicy(leagueSize, Kokkos::AUTO, +#ifdef KOKKOS_ENABLE_OPENMPTARGET + 2 +#else + Kokkos::TeamPolicy< + ExecSpace>::vector_length_max() +#endif + ), KOKKOS_LAMBDA(TeamType const& team, DataType& leagueSum) { - auto leagueRank = team.league_rank(); - DataType teamSum = 0; + auto leagueRank = team.league_rank(); + DataType teamSum; auto teamVectorRange = Kokkos::TeamVectorMDRange, TeamType>( @@ -1577,7 +1818,13 @@ struct TestTeamVectorMDRangeParallelReduce : public TestTeamMDParallelReduce { vectorSum += v(leagueRank, i, j, k, l); }, teamSum); - leagueSum += teamSum; +// FIXME_OPENMPTARGET +#ifdef KOKKOS_ENABLE_OPENMPTARGET + if (team.team_rank() == 0) leagueSum += teamSum; +#else + Kokkos::single(Kokkos::PerTeam(team), + [&]() { leagueSum += teamSum; }); +#endif }, finalSum); @@ -1609,13 +1856,20 @@ struct TestTeamVectorMDRangeParallelReduce : public TestTeamMDParallelReduce { v(i, j, k, l, m, n) = fillFlattenedIndex(i, j, k, l, m, n); }); - DataType finalSum = 0; + DataType finalSum; Kokkos::parallel_reduce( - Kokkos::TeamPolicy(leagueSize, Kokkos::AUTO), + Kokkos::TeamPolicy(leagueSize, Kokkos::AUTO, +#ifdef KOKKOS_ENABLE_OPENMPTARGET + 2 +#else + Kokkos::TeamPolicy< + ExecSpace>::vector_length_max() +#endif + ), KOKKOS_LAMBDA(TeamType const& team, DataType& leagueSum) { - auto leagueRank = team.league_rank(); - DataType teamSum = 0; + auto 
leagueRank = team.league_rank(); + DataType teamSum; auto teamVectorRange = Kokkos::TeamVectorMDRange, TeamType>( @@ -1628,7 +1882,13 @@ struct TestTeamVectorMDRangeParallelReduce : public TestTeamMDParallelReduce { vectorSum += v(leagueRank, i, j, k, l, m); }, teamSum); - leagueSum += teamSum; +// FIXME_OPENMPTARGET +#ifdef KOKKOS_ENABLE_OPENMPTARGET + if (team.team_rank() == 0) leagueSum += teamSum; +#else + Kokkos::single(Kokkos::PerTeam(team), + [&]() { leagueSum += teamSum; }); +#endif }, finalSum); @@ -1665,13 +1925,20 @@ struct TestTeamVectorMDRangeParallelReduce : public TestTeamMDParallelReduce { } }); - DataType finalSum = 0; + DataType finalSum; Kokkos::parallel_reduce( - Kokkos::TeamPolicy(leagueSize, Kokkos::AUTO), + Kokkos::TeamPolicy(leagueSize, Kokkos::AUTO, +#ifdef KOKKOS_ENABLE_OPENMPTARGET + 2 +#else + Kokkos::TeamPolicy< + ExecSpace>::vector_length_max() +#endif + ), KOKKOS_LAMBDA(TeamType const& team, DataType& leagueSum) { - auto leagueRank = team.league_rank(); - DataType teamSum = 0; + auto leagueRank = team.league_rank(); + DataType teamSum; auto teamVectorRange = Kokkos::TeamVectorMDRange, TeamType>( @@ -1684,7 +1951,13 @@ struct TestTeamVectorMDRangeParallelReduce : public TestTeamMDParallelReduce { vectorSum += v(leagueRank, i, j, k, l, m, n); }, teamSum); - leagueSum += teamSum; +// FIXME_OPENMPTARGET +#ifdef KOKKOS_ENABLE_OPENMPTARGET + if (team.team_rank() == 0) leagueSum += teamSum; +#else + Kokkos::single(Kokkos::PerTeam(team), + [&]() { leagueSum += teamSum; }); +#endif }, finalSum); @@ -1725,13 +1998,20 @@ struct TestTeamVectorMDRangeParallelReduce : public TestTeamMDParallelReduce { } }); - DataType finalSum = 0; + DataType finalSum; Kokkos::parallel_reduce( - Kokkos::TeamPolicy(leagueSize, Kokkos::AUTO), + Kokkos::TeamPolicy(leagueSize, Kokkos::AUTO, +#ifdef KOKKOS_ENABLE_OPENMPTARGET + 2 +#else + Kokkos::TeamPolicy< + ExecSpace>::vector_length_max() +#endif + ), KOKKOS_LAMBDA(TeamType const& team, DataType& leagueSum) { - auto leagueRank = team.league_rank(); - DataType teamSum = 0; + auto leagueRank = team.league_rank(); + DataType teamSum; auto teamVectorRange = Kokkos::TeamVectorMDRange, TeamType>( @@ -1745,7 +2025,13 @@ struct TestTeamVectorMDRangeParallelReduce : public TestTeamMDParallelReduce { vectorSum += v(leagueRank, i, j, k, l, m, n, o); }, teamSum); - leagueSum += teamSum; +// FIXME_OPENMPTARGET +#ifdef KOKKOS_ENABLE_OPENMPTARGET + if (team.team_rank() == 0) leagueSum += teamSum; +#else + Kokkos::single(Kokkos::PerTeam(team), + [&]() { leagueSum += teamSum; }); +#endif }, finalSum); @@ -1904,13 +2190,6 @@ TEST(TEST_CATEGORY, ThreadVectorMDRangeParallelReduce) { GTEST_SKIP() << "skipping because of bug in group_barrier implementation"; #endif -// FIXME_OPENMPTARGET_CRAY: The unit tests fails correctness. -#if defined(KOKKOS_ENABLE_OPENMPTARGET) && defined(KOKKOS_COMPILER_CRAYCLANG) - if (std::is_same_v) - GTEST_SKIP() << "Cray compiler fails correctness at runtime with the " - "OpenMPTarget backend."; -#endif - TestThreadVectorMDRangeParallelReduce:: test_parallel_reduce_for_4D_ThreadVectorMDRange(dims); TestThreadVectorMDRangeParallelReduce:: @@ -1944,13 +2223,6 @@ TEST(TEST_CATEGORY, TeamVectorMDRangeParallelReduce) { GTEST_SKIP() << "skipping because of bug in group_barrier implementation"; #endif -// FIXME_OPENMPTARGET_CRAY: The unit tests fails correctness. 
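Two changes recur throughout the hunks above: the TeamPolicy constructors now pass an explicit vector length (2 on OpenMPTarget, vector_length_max() otherwise), and the per-team contribution to the league-level reduction is wrapped in Kokkos::single. The latter matters because every member of a team executes the reduction lambda, so a bare `leagueSum += teamSum;` is applied once per member. A self-contained sketch of the corrected pattern, with illustrative names:

#include <Kokkos_Core.hpp>

double league_total(int league_size) {
  using policy_type = Kokkos::TeamPolicy<>;
  double total      = 0;
  Kokkos::parallel_reduce(
      "league_total",
      policy_type(league_size, Kokkos::AUTO, policy_type::vector_length_max()),
      KOKKOS_LAMBDA(const policy_type::member_type& team, double& league_sum) {
        double team_sum = 1.0;  // stand-in for a nested team-level reduction
        // Restrict the update to one thread per team; otherwise team_sum
        // would be added once per team member, overcounting by team_size().
        Kokkos::single(Kokkos::PerTeam(team),
                       [&]() { league_sum += team_sum; });
      },
      total);
  return total;  // == league_size
}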
-#if defined(KOKKOS_ENABLE_OPENMPTARGET) && defined(KOKKOS_COMPILER_CRAYCLANG) - if (std::is_same_v) - GTEST_SKIP() << "Cray compiler fails correctness at runtime with the " - "OpenMPTarget backend."; -#endif - TestTeamVectorMDRangeParallelReduce:: test_parallel_reduce_for_4D_TeamVectorMDRange(dims); TestTeamVectorMDRangeParallelReduce:: diff --git a/core/unit_test/TestTeamPolicyConstructors.hpp b/core/unit_test/TestTeamPolicyConstructors.hpp index 5b0bfdb1755..9d89f757086 100644 --- a/core/unit_test/TestTeamPolicyConstructors.hpp +++ b/core/unit_test/TestTeamPolicyConstructors.hpp @@ -20,11 +20,24 @@ namespace { +struct SomeTag {}; + +struct FunctorFor { + KOKKOS_FUNCTION + void operator()( + Kokkos::TeamPolicy::member_type const&) const {} + + KOKKOS_FUNCTION + void operator()( + SomeTag, Kokkos::TeamPolicy::member_type const&) const {} +}; + template void test_run_time_parameters() { int league_size = 131; using ExecutionSpace = typename Policy::execution_space; + using ParallelTag = Kokkos::ParallelForTag; int team_size = 4 < ExecutionSpace().concurrency() ? 4 : ExecutionSpace().concurrency(); #ifdef KOKKOS_ENABLE_HPX @@ -44,6 +57,8 @@ void test_run_time_parameters() { ASSERT_EQ(p1.team_size(), team_size); ASSERT_GT(p1.chunk_size(), 0); ASSERT_EQ(p1.scratch_size(0), 0u); + ASSERT_GT(p1.team_size_max(FunctorFor(), ParallelTag()), 0); + ASSERT_GT(p1.team_size_recommended(FunctorFor(), ParallelTag()), 0); Policy p2 = p1.set_chunk_size(chunk_size); ASSERT_EQ(p1.league_size(), league_size); @@ -112,6 +127,8 @@ void test_run_time_parameters() { Policy p8; // default constructed ASSERT_EQ(p8.league_size(), 0); ASSERT_EQ(p8.scratch_size(0), 0u); + ASSERT_GT(p8.team_size_max(FunctorFor(), ParallelTag()), 0); + ASSERT_GT(p8.team_size_recommended(FunctorFor(), ParallelTag()), 0); p8 = p3; // call assignment operator ASSERT_EQ(p3.league_size(), league_size); ASSERT_EQ(p3.team_size(), team_size); @@ -121,11 +138,25 @@ void test_run_time_parameters() { ASSERT_EQ(p8.team_size(), team_size); ASSERT_EQ(p8.chunk_size(), chunk_size); ASSERT_EQ(p8.scratch_size(0), size_t(scratch_size)); + + Policy p9(league_size, Kokkos::AUTO); + ASSERT_EQ(p9.league_size(), league_size); + ASSERT_GT(p9.team_size_max(FunctorFor(), ParallelTag()), 0); + ASSERT_GT(p9.team_size_recommended(FunctorFor(), ParallelTag()), 0); + + Policy p10(league_size, team_size, Kokkos::AUTO); + ASSERT_EQ(p10.league_size(), league_size); + ASSERT_EQ(p10.team_size(), team_size); + ASSERT_GT(p10.team_size_max(FunctorFor(), ParallelTag()), 0); + ASSERT_GT(p10.team_size_recommended(FunctorFor(), ParallelTag()), 0); + + Policy p11(league_size, Kokkos::AUTO, Kokkos::AUTO); + ASSERT_EQ(p11.league_size(), league_size); + ASSERT_GT(p11.team_size_max(FunctorFor(), ParallelTag()), 0); + ASSERT_GT(p11.team_size_recommended(FunctorFor(), ParallelTag()), 0); } TEST(TEST_CATEGORY, team_policy_runtime_parameters) { - struct SomeTag {}; - using TestExecSpace = TEST_EXECSPACE; using DynamicSchedule = Kokkos::Schedule; using LongIndex = Kokkos::IndexType; diff --git a/core/unit_test/TestTeamVector.hpp b/core/unit_test/TestTeamVector.hpp index 39122736ed7..5e16539d652 100644 --- a/core/unit_test/TestTeamVector.hpp +++ b/core/unit_test/TestTeamVector.hpp @@ -1012,7 +1012,6 @@ struct checkScan { }; } // namespace VectorScanReducer -#if !defined(KOKKOS_IMPL_CUDA_CLANG_WORKAROUND) TEST(TEST_CATEGORY, team_vector) { ASSERT_TRUE((TestTeamVector::Test(0))); ASSERT_TRUE((TestTeamVector::Test(1))); @@ -1028,9 +1027,7 @@ TEST(TEST_CATEGORY, team_vector) { 
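The new p9/p10/p11 checks in TestTeamPolicyConstructors.hpp above rely on team_size_max() and team_size_recommended(), which report, for a given functor and parallel pattern, the largest valid and the suggested team size. A sketch of typical user-side usage (the functor is hypothetical):

#include <Kokkos_Core.hpp>

struct IllustrativeFunctor {
  KOKKOS_FUNCTION void operator()(
      const Kokkos::TeamPolicy<>::member_type&) const {}
};

void choose_team_size(int league_size) {
  Kokkos::TeamPolicy<> query_policy(league_size, Kokkos::AUTO);
  const int max_size = query_policy.team_size_max(IllustrativeFunctor{},
                                                  Kokkos::ParallelForTag());
  const int rec_size = query_policy.team_size_recommended(
      IllustrativeFunctor{}, Kokkos::ParallelForTag());
  // An explicit team size must not exceed max_size; rec_size is a sane default.
  Kokkos::parallel_for(Kokkos::TeamPolicy<>(league_size, rec_size),
                       IllustrativeFunctor{});
  (void)max_size;
}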
ASSERT_TRUE((TestTeamVector::Test(11))); ASSERT_TRUE((TestTeamVector::Test(12))); } -#endif -#if !defined(KOKKOS_IMPL_CUDA_CLANG_WORKAROUND) TEST(TEST_CATEGORY, triple_nested_parallelism) { // With KOKKOS_ENABLE_DEBUG enabled, the functor uses too many registers to run // with a team size of 32 on GPUs, 16 is the max possible (at least on a K80 @@ -1055,7 +1052,6 @@ TEST(TEST_CATEGORY, triple_nested_parallelism) { TestTripleNestedReduce(8192, 2048, 16, 16); TestTripleNestedReduce(8192, 2048, 7, 16); } -#endif TEST(TEST_CATEGORY, parallel_scan_with_reducers) { using T = double; diff --git a/core/unit_test/TestUtilities.hpp b/core/unit_test/TestUtilities.hpp index b1f9d30c1fc..ad5a0df92de 100644 --- a/core/unit_test/TestUtilities.hpp +++ b/core/unit_test/TestUtilities.hpp @@ -25,20 +25,18 @@ namespace Test { void test_is_specialization_of() { using Kokkos::Impl::is_specialization_of; - static_assert(is_specialization_of, Kokkos::pair>{}, - ""); - static_assert(!is_specialization_of, Kokkos::pair>{}, ""); - static_assert(is_specialization_of, Kokkos::View>{}, ""); + static_assert(is_specialization_of, Kokkos::pair>{}); + static_assert(!is_specialization_of, Kokkos::pair>{}); + static_assert(is_specialization_of, Kokkos::View>{}); // NOTE Not removing cv-qualifiers - static_assert(!is_specialization_of const, Kokkos::View>{}, - ""); + static_assert( + !is_specialization_of const, Kokkos::View>{}); // NOTE Would not compile because Kokkos::Array takes a non-type template // parameter - // static_assert(is_specialization_of, Kokkos::Array>{}, - // ""); + // static_assert(is_specialization_of, + // Kokkos::Array>{}); // But this is fine of course - static_assert(!is_specialization_of, Kokkos::pair>{}, - ""); + static_assert(!is_specialization_of, Kokkos::pair>{}); } namespace { diff --git a/core/unit_test/TestViewAPI.hpp b/core/unit_test/TestViewAPI.hpp index ffc500e4a9a..ca098dbc247 100644 --- a/core/unit_test/TestViewAPI.hpp +++ b/core/unit_test/TestViewAPI.hpp @@ -958,8 +958,7 @@ class TestViewAPI { using mirror_type = typename view_type::HostMirror; static_assert(std::is_same::value, - ""); + typename mirror_type::memory_space>::value); view_type a("a"); mirror_type am = Kokkos::create_mirror_view(a); @@ -1005,25 +1004,25 @@ class TestViewAPI { hView3 hv_3("dView3::HostMirror", N0); hView4 hv_4("dView4::HostMirror", N0); - dView0 dv_0_1(nullptr, 0); + dView0 dv_0_1(nullptr); dView0 dv_0_2(hv_0.label(), hv_0.layout()); - dView1 dv_1_1(nullptr, 0); + dView1 dv_1_1(nullptr, N0); dView1 dv_1_2(hv_1.label(), hv_1.layout()); - dView2 dv_2_1(nullptr, 0); + dView2 dv_2_1(nullptr, N0); dView2 dv_2_2(hv_2.label(), hv_2.layout()); - dView3 dv_3_1(nullptr, 0); + dView3 dv_3_1(nullptr, N0); dView3 dv_3_2(hv_3.label(), hv_3.layout()); - dView4 dv_4_1(nullptr, 0); + dView4 dv_4_1(nullptr, N0); dView4 dv_4_2(hv_4.label(), hv_4.layout()); } static void run_test_contruction_from_layout_2() { using dView3_0 = Kokkos::View; - using dView3_1 = Kokkos::View; + using dView3_1 = Kokkos::View; using dView3_2 = Kokkos::View; using dView3_3 = Kokkos::View; @@ -1554,6 +1553,7 @@ class TestViewAPI { Kokkos::CudaUVMSpace>::value) return; #endif + bool did_throw = false; auto alloc_size = std::numeric_limits::max() - 42; try { auto should_always_fail = dView1("hello_world_failure", alloc_size); @@ -1585,7 +1585,9 @@ class TestViewAPI { "because of an unknown error.", msg); } #endif + did_throw = true; } + ASSERT_TRUE(did_throw); } }; diff --git a/core/unit_test/TestViewAPI_d.hpp b/core/unit_test/TestViewAPI_d.hpp index 
08d21f54499..b0d759ffccc 100644
--- a/core/unit_test/TestViewAPI_d.hpp
+++ b/core/unit_test/TestViewAPI_d.hpp
@@ -27,8 +27,19 @@ TEST(TEST_CATEGORY, view_api_d) {
 }
 
 TEST(TEST_CATEGORY, view_allocation_error) {
+#if defined(__has_feature)
+#if __has_feature(address_sanitizer)
+  GTEST_SKIP() << "AddressSanitizer detects allocating too much memory "
+                  "preventing our checks from running";
+#endif
+#endif
 #if ((HIP_VERSION_MAJOR == 5) && (HIP_VERSION_MINOR == 3))
   GTEST_SKIP() << "ROCm 5.3 segfaults when trying to allocate too much memory";
+#endif
+#if defined(KOKKOS_ENABLE_OPENACC)  // FIXME_OPENACC
+  if (std::is_same_v<TEST_EXECSPACE, Kokkos::Experimental::OpenACC>) {
+    GTEST_SKIP() << "acc_malloc() not properly returning nullptr";
+  }
 #endif
   TestViewAPI<double, TEST_EXECSPACE>::run_test_error();
 }
diff --git a/core/unit_test/TestViewCopy_a.hpp b/core/unit_test/TestViewCopy_a.hpp
index 3bfc93aadac..a4735b29988 100644
--- a/core/unit_test/TestViewCopy_a.hpp
+++ b/core/unit_test/TestViewCopy_a.hpp
@@ -147,6 +147,40 @@ TEST(TEST_CATEGORY, view_copy_tests) {
       Kokkos::deep_copy(s_a, hs_a);
       ASSERT_TRUE(run_check(s_a, 6));
     }
+  } else {
+    // These copies won't succeed, but they should each throw
+    // an exception whose message contains the view labels,
+    // and the names of the views' memory spaces.
+    //
+    // Note: original a,b both have the same device type,
+    // and their mirrors have the same device type.
+    using memory_space        = typename decltype(a)::memory_space;
+    using mirror_memory_space = typename decltype(h_a)::memory_space;
+    bool threw = false;
+    std::string msg;
+    try {
+      Kokkos::deep_copy(hs_b, s_b);
+    } catch (std::exception& e) {
+      threw = true;
+      msg   = e.what();
+    }
+    ASSERT_TRUE(threw);
+    ASSERT_NE(msg.find(hs_b.label()), std::string::npos);
+    ASSERT_NE(msg.find(s_b.label()), std::string::npos);
+    ASSERT_NE(msg.find(memory_space().name()), std::string::npos);
+    ASSERT_NE(msg.find(mirror_memory_space().name()), std::string::npos);
+    threw = false;
+    try {
+      Kokkos::deep_copy(s_a, hs_a);
+    } catch (std::exception& e) {
+      threw = true;
+      msg   = e.what();
+    }
+    ASSERT_TRUE(threw);
+    ASSERT_NE(msg.find(s_a.label()), std::string::npos);
+    ASSERT_NE(msg.find(hs_a.label()), std::string::npos);
+    ASSERT_NE(msg.find(memory_space().name()), std::string::npos);
+    ASSERT_NE(msg.find(mirror_memory_space().name()), std::string::npos);
   }
 
   // Contiguous copies
diff --git a/core/unit_test/TestViewCtorDimMatch.hpp b/core/unit_test/TestViewCtorDimMatch.hpp
index d71841eef84..40b7737f2e4 100644
--- a/core/unit_test/TestViewCtorDimMatch.hpp
+++ b/core/unit_test/TestViewCtorDimMatch.hpp
@@ -19,33 +19,72 @@
 
 namespace Test {
 
-#define LIVE(EXPR, ARGS, DYNRANK) EXPECT_NO_THROW(EXPR)
-#define DIE(EXPR, ARGS, DYNRANK) \
-  ASSERT_DEATH( \
-      EXPR, \
-      "Constructor for Kokkos View 'v_" #ARGS \
-      "' has mismatched number of arguments. Number of arguments = " #ARGS \
-      " but dynamic rank = " #DYNRANK)
+template <int rank, int dynrank, class RankType, size_t... Is>
+void test_matching_arguments_rank_helper(std::index_sequence<Is...>) {
+  constexpr int nargs = sizeof...(Is);
+  using view_type     = Kokkos::View<RankType>;
+  if (nargs == rank || nargs == dynrank) {
+    EXPECT_NO_THROW({ view_type v("v", ((Is * 0) + 1)...); });
+    EXPECT_NO_THROW({ view_type v(nullptr, ((Is * 0) + 1)...); });
+  } else {
+    ASSERT_DEATH(
+        { view_type v("v", ((Is * 0) + 1)...); },
+        "Constructor for Kokkos::View 'v' has mismatched number of arguments. "
" + "The number of arguments = " + + std::to_string(nargs) + + " neither matches the dynamic rank = " + std::to_string(dynrank) + + " nor the total rank = " + std::to_string(rank)); + ASSERT_DEATH( + { view_type v(nullptr, ((Is * 0) + 1)...); }, + "Constructor for Kokkos::View 'UNMANAGED' has mismatched number of " + "arguments. " + "The number of arguments = " + + std::to_string(nargs) + + " neither matches the dynamic rank = " + std::to_string(dynrank) + + " nor the total rank = " + std::to_string(rank)); + } +} -#define PARAM_0 -#define PARAM_1 1 -#define PARAM_2 1, 1 -#define PARAM_3 1, 1, 1 -#define PARAM_4 1, 1, 1, 1 -#define PARAM_5 1, 1, 1, 1, 1 -#define PARAM_6 1, 1, 1, 1, 1, 1 -#define PARAM_7 1, 1, 1, 1, 1, 1, 1 +template class RankType> +void test_matching_arguments_rank() { + test_matching_arguments_rank_helper::type>( + std::make_index_sequence<0>()); + test_matching_arguments_rank_helper::type>( + std::make_index_sequence<1>()); + test_matching_arguments_rank_helper::type>( + std::make_index_sequence<2>()); + test_matching_arguments_rank_helper::type>( + std::make_index_sequence<3>()); + test_matching_arguments_rank_helper::type>( + std::make_index_sequence<4>()); + test_matching_arguments_rank_helper::type>( + std::make_index_sequence<5>()); + test_matching_arguments_rank_helper::type>( + std::make_index_sequence<6>()); + test_matching_arguments_rank_helper::type>( + std::make_index_sequence<7>()); + test_matching_arguments_rank_helper::type>( + std::make_index_sequence<8>()); +} -#define PARAM_0_RANK 0 -#define PARAM_1_RANK 1 -#define PARAM_2_RANK 2 -#define PARAM_3_RANK 3 -#define PARAM_4_RANK 4 -#define PARAM_5_RANK 5 -#define PARAM_6_RANK 6 -#define PARAM_7_RANK 7 +template +struct DynamicRank { + using type = typename DynamicRank::type*; +}; -using DType = int; +template <> +struct DynamicRank<0> { + using type = int; +}; // Skip test execution when KOKKOS_ENABLE_OPENMPTARGET is enabled until // Kokkos::abort() aborts properly on that backend @@ -53,348 +92,110 @@ using DType = int; TEST(TEST_CATEGORY_DEATH, view_construction_with_wrong_params_dyn) { ::testing::FLAGS_gtest_death_test_style = "threadsafe"; - using DType_0 = DType; - using DType_1 = DType *; - using DType_2 = DType **; - using DType_3 = DType ***; - using DType_4 = DType ****; - using DType_5 = DType *****; - using DType_6 = DType ******; - using DType_7 = DType *******; - { - // test View parameters for View dim = 0, dynamic = 0 - LIVE({ Kokkos::View v_0("v_0" PARAM_0); }, 0, 0); - DIE({ Kokkos::View v_1("v_1", PARAM_1); }, 1, 0); - DIE({ Kokkos::View v_2("v_2", PARAM_2); }, 2, 0); - DIE({ Kokkos::View v_3("v_3", PARAM_3); }, 3, 0); - DIE({ Kokkos::View v_4("v_4", PARAM_4); }, 4, 0); - DIE({ Kokkos::View v_5("v_5", PARAM_5); }, 5, 0); - DIE({ Kokkos::View v_6("v_6", PARAM_6); }, 6, 0); - DIE({ Kokkos::View v_7("v_7", PARAM_7); }, 7, 0); - } - - { - // test View parameters for View dim = 1, dynamic = 1 - DIE({ Kokkos::View v_0("v_0" PARAM_0); }, 0, 1); - LIVE({ Kokkos::View v_1("v_1", PARAM_1); }, 1, 1); - DIE({ Kokkos::View v_2("v_2", PARAM_2); }, 2, 1); - DIE({ Kokkos::View v_3("v_3", PARAM_3); }, 3, 1); - DIE({ Kokkos::View v_4("v_4", PARAM_4); }, 4, 1); - DIE({ Kokkos::View v_5("v_5", PARAM_5); }, 5, 1); - DIE({ Kokkos::View v_6("v_6", PARAM_6); }, 6, 1); - DIE({ Kokkos::View v_7("v_7", PARAM_7); }, 7, 1); - } - - { - // test View parameters for View dim = 2, dynamic = 2 - DIE({ Kokkos::View v_0("v_0" PARAM_0); }, 0, 2); - DIE({ Kokkos::View v_1("v_1", PARAM_1); }, 1, 2); - LIVE({ Kokkos::View 
v_2("v_2", PARAM_2); }, 2, 2); - DIE({ Kokkos::View v_3("v_3", PARAM_3); }, 3, 2); - DIE({ Kokkos::View v_4("v_4", PARAM_4); }, 4, 2); - DIE({ Kokkos::View v_5("v_5", PARAM_5); }, 5, 2); - DIE({ Kokkos::View v_6("v_6", PARAM_6); }, 6, 2); - DIE({ Kokkos::View v_7("v_7", PARAM_7); }, 7, 2); - } - - { - // test View parameters for View dim = 3, dynamic = 3 - DIE({ Kokkos::View v_0("v_0" PARAM_0); }, 0, 3); - DIE({ Kokkos::View v_1("v_1", PARAM_1); }, 1, 3); - DIE({ Kokkos::View v_2("v_2", PARAM_2); }, 2, 3); - LIVE({ Kokkos::View v_3("v_3", PARAM_3); }, 3, 3); - DIE({ Kokkos::View v_4("v_4", PARAM_4); }, 4, 3); - DIE({ Kokkos::View v_5("v_5", PARAM_5); }, 5, 3); - DIE({ Kokkos::View v_6("v_6", PARAM_6); }, 6, 3); - DIE({ Kokkos::View v_7("v_7", PARAM_7); }, 7, 3); - } - - { - // test View parameters for View dim = 4, dynamic = 4 - DIE({ Kokkos::View v_0("v_0" PARAM_0); }, 0, 4); - DIE({ Kokkos::View v_1("v_1", PARAM_1); }, 1, 4); - DIE({ Kokkos::View v_2("v_2", PARAM_2); }, 2, 4); - DIE({ Kokkos::View v_3("v_3", PARAM_3); }, 3, 4); - LIVE({ Kokkos::View v_4("v_4", PARAM_4); }, 4, 4); - DIE({ Kokkos::View v_5("v_5", PARAM_5); }, 5, 4); - DIE({ Kokkos::View v_6("v_6", PARAM_6); }, 6, 4); - DIE({ Kokkos::View v_7("v_7", PARAM_7); }, 7, 4); - } - - { - // test View parameters for View dim = 5, dynamic = 5 - DIE({ Kokkos::View v_0("v_0" PARAM_0); }, 0, 5); - DIE({ Kokkos::View v_1("v_1", PARAM_1); }, 1, 5); - DIE({ Kokkos::View v_2("v_2", PARAM_2); }, 2, 5); - DIE({ Kokkos::View v_3("v_3", PARAM_3); }, 3, 5); - DIE({ Kokkos::View v_4("v_4", PARAM_4); }, 4, 5); - LIVE({ Kokkos::View v_5("v_5", PARAM_5); }, 5, 5); - DIE({ Kokkos::View v_6("v_6", PARAM_6); }, 6, 5); - DIE({ Kokkos::View v_7("v_7", PARAM_7); }, 7, 5); - } +#ifdef KOKKOS_ENABLE_DEBUG_BOUNDS_CHECKS + test_matching_arguments_rank<0, 0, DynamicRank>(); // dim = 0, dynamic = 0 + test_matching_arguments_rank<1, 1, DynamicRank>(); // dim = 1, dynamic = 1 + test_matching_arguments_rank<2, 2, DynamicRank>(); // dim = 2, dynamic = 2 + test_matching_arguments_rank<3, 3, DynamicRank>(); // dim = 3, dynamic = 3 + test_matching_arguments_rank<4, 4, DynamicRank>(); // dim = 4, dynamic = 4 + test_matching_arguments_rank<5, 5, DynamicRank>(); // dim = 5, dynamic = 5 + test_matching_arguments_rank<6, 6, DynamicRank>(); // dim = 6, dynamic = 6 + test_matching_arguments_rank<7, 7, DynamicRank>(); // dim = 7, dynamic = 7 + test_matching_arguments_rank<8, 8, DynamicRank>(); // dim = 8, dynamic = 8 +#endif +} - { - // test View parameters for View dim = 6, dynamic = 6 - DIE({ Kokkos::View v_0("v_0" PARAM_0); }, 0, 6); - DIE({ Kokkos::View v_1("v_1", PARAM_1); }, 1, 6); - DIE({ Kokkos::View v_2("v_2", PARAM_2); }, 2, 6); - DIE({ Kokkos::View v_3("v_3", PARAM_3); }, 3, 6); - DIE({ Kokkos::View v_4("v_4", PARAM_4); }, 4, 6); - DIE({ Kokkos::View v_5("v_5", PARAM_5); }, 5, 6); - LIVE({ Kokkos::View v_6("v_6", PARAM_6); }, 6, 6); - DIE({ Kokkos::View v_7("v_7", PARAM_7); }, 7, 6); - } +template +struct StaticRank { + using type = typename StaticRank::type[1]; +}; - { - // test View parameters for View dim = 7, dynamic = 7 - DIE({ Kokkos::View v_0("v_0" PARAM_0); }, 0, 7); - DIE({ Kokkos::View v_1("v_1", PARAM_1); }, 1, 7); - DIE({ Kokkos::View v_2("v_2", PARAM_2); }, 2, 7); - DIE({ Kokkos::View v_3("v_3", PARAM_3); }, 3, 7); - DIE({ Kokkos::View v_4("v_4", PARAM_4); }, 4, 7); - DIE({ Kokkos::View v_5("v_5", PARAM_5); }, 5, 7); - DIE({ Kokkos::View v_6("v_6", PARAM_6); }, 6, 7); - LIVE({ Kokkos::View v_7("v_7", PARAM_7); }, 7, 7); - } -} +template <> +struct 
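The DynamicRank builder used above generates the `int`, `int*`, `int**`, ... data types that the removed DType_N aliases spelled out by hand. The template parameter lists are stripped in this rendering of the diff; `<int N>` below is the assumed form:

```cpp
#include <type_traits>

template <int N>
struct DynamicRank {
  using type = typename DynamicRank<N - 1>::type*;  // one '*' per rank level
};

template <>
struct DynamicRank<0> {
  using type = int;  // rank-0 base case
};

static_assert(std::is_same_v<DynamicRank<1>::type, int*>);
static_assert(std::is_same_v<DynamicRank<3>::type, int***>);

int main() {}
```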
StaticRank<0> { + using type = int; +}; TEST(TEST_CATEGORY_DEATH, view_construction_with_wrong_params_stat) { ::testing::FLAGS_gtest_death_test_style = "threadsafe"; - using DType_0 = DType; - using DType_1 = DType[1]; - using DType_2 = DType[1][1]; - using DType_3 = DType[1][1][1]; - using DType_4 = DType[1][1][1][1]; - using DType_5 = DType[1][1][1][1][1]; - using DType_6 = DType[1][1][1][1][1][1]; - using DType_7 = DType[1][1][1][1][1][1][1]; - { - // test View parameters for View dim = 0, dynamic = 0 - LIVE({ Kokkos::View v_0("v_0" PARAM_0); }, 0, 0); - DIE({ Kokkos::View v_1("v_1", PARAM_1); }, 1, 0); - DIE({ Kokkos::View v_2("v_2", PARAM_2); }, 2, 0); - DIE({ Kokkos::View v_3("v_3", PARAM_3); }, 3, 0); - DIE({ Kokkos::View v_4("v_4", PARAM_4); }, 4, 0); - DIE({ Kokkos::View v_5("v_5", PARAM_5); }, 5, 0); - DIE({ Kokkos::View v_6("v_6", PARAM_6); }, 6, 0); - DIE({ Kokkos::View v_7("v_7", PARAM_7); }, 7, 0); - } - - { - // test View parameters for View dim = 1, dynamic = 0 - LIVE({ Kokkos::View v_0("v_0" PARAM_0); }, 0, 0); - LIVE({ Kokkos::View v_1("v_1", PARAM_1); }, 1, 0); - DIE({ Kokkos::View v_2("v_2", PARAM_2); }, 2, 0); - DIE({ Kokkos::View v_3("v_3", PARAM_3); }, 3, 0); - DIE({ Kokkos::View v_4("v_4", PARAM_4); }, 4, 0); - DIE({ Kokkos::View v_5("v_5", PARAM_5); }, 5, 0); - DIE({ Kokkos::View v_6("v_6", PARAM_6); }, 6, 0); - DIE({ Kokkos::View v_7("v_7", PARAM_7); }, 7, 0); - } - - { - // test View parameters for View dim = 2, dynamic = 0 - LIVE({ Kokkos::View v_0("v_0" PARAM_0); }, 0, 0); - DIE({ Kokkos::View v_1("v_1", PARAM_1); }, 1, 0); - LIVE({ Kokkos::View v_2("v_2", PARAM_2); }, 2, 0); - DIE({ Kokkos::View v_3("v_3", PARAM_3); }, 3, 0); - DIE({ Kokkos::View v_4("v_4", PARAM_4); }, 4, 0); - DIE({ Kokkos::View v_5("v_5", PARAM_5); }, 5, 0); - DIE({ Kokkos::View v_6("v_6", PARAM_6); }, 6, 0); - DIE({ Kokkos::View v_7("v_7", PARAM_7); }, 7, 0); - } - - { - // test View parameters for View dim = 3, dynamic = 0 - LIVE({ Kokkos::View v_0("v_0" PARAM_0); }, 0, 0); - DIE({ Kokkos::View v_1("v_1", PARAM_1); }, 1, 0); - DIE({ Kokkos::View v_2("v_2", PARAM_2); }, 2, 0); - LIVE({ Kokkos::View v_3("v_3", PARAM_3); }, 3, 0); - DIE({ Kokkos::View v_4("v_4", PARAM_4); }, 4, 0); - DIE({ Kokkos::View v_5("v_5", PARAM_5); }, 5, 0); - DIE({ Kokkos::View v_6("v_6", PARAM_6); }, 6, 0); - DIE({ Kokkos::View v_7("v_7", PARAM_7); }, 7, 0); - } - - { - // test View parameters for View dim = 4, dynamic = 0 - LIVE({ Kokkos::View v_0("v_0" PARAM_0); }, 0, 0); - DIE({ Kokkos::View v_1("v_1", PARAM_1); }, 1, 0); - DIE({ Kokkos::View v_2("v_2", PARAM_2); }, 2, 0); - DIE({ Kokkos::View v_3("v_3", PARAM_3); }, 3, 0); - LIVE({ Kokkos::View v_4("v_4", PARAM_4); }, 4, 0); - DIE({ Kokkos::View v_5("v_5", PARAM_5); }, 5, 0); - DIE({ Kokkos::View v_6("v_6", PARAM_6); }, 6, 0); - DIE({ Kokkos::View v_7("v_7", PARAM_7); }, 7, 0); - } +#ifdef KOKKOS_ENABLE_DEBUG_BOUNDS_CHECKS + test_matching_arguments_rank<0, 0, StaticRank>(); // dim = 0, dynamic = 0 + test_matching_arguments_rank<1, 0, StaticRank>(); // dim = 1, dynamic = 0 + test_matching_arguments_rank<2, 0, StaticRank>(); // dim = 2, dynamic = 0 + test_matching_arguments_rank<3, 0, StaticRank>(); // dim = 3, dynamic = 0 + test_matching_arguments_rank<4, 0, StaticRank>(); // dim = 4, dynamic = 0 + test_matching_arguments_rank<5, 0, StaticRank>(); // dim = 5, dynamic = 0 + test_matching_arguments_rank<6, 0, StaticRank>(); // dim = 6, dynamic = 0 + test_matching_arguments_rank<7, 0, StaticRank>(); // dim = 7, dynamic = 0 + test_matching_arguments_rank<8, 0, 
StaticRank>(); // dim = 8, dynamic = 0 +#endif +} - { - // test View parameters for View dim = 5, dynamic = 0 - LIVE({ Kokkos::View v_0("v_0" PARAM_0); }, 0, 0); - DIE({ Kokkos::View v_1("v_1", PARAM_1); }, 1, 0); - DIE({ Kokkos::View v_2("v_2", PARAM_2); }, 2, 0); - DIE({ Kokkos::View v_3("v_3", PARAM_3); }, 3, 0); - DIE({ Kokkos::View v_4("v_4", PARAM_4); }, 4, 0); - LIVE({ Kokkos::View v_5("v_5", PARAM_5); }, 5, 0); - DIE({ Kokkos::View v_6("v_6", PARAM_6); }, 6, 0); - DIE({ Kokkos::View v_7("v_7", PARAM_7); }, 7, 0); - } +template +struct MixedRank { + using type = typename DynamicRank::type[1]; +}; - { - // test View parameters for View dim = 6, dynamic = 0 - LIVE({ Kokkos::View v_0("v_0" PARAM_0); }, 0, 0); - DIE({ Kokkos::View v_1("v_1", PARAM_1); }, 1, 0); - DIE({ Kokkos::View v_2("v_2", PARAM_2); }, 2, 0); - DIE({ Kokkos::View v_3("v_3", PARAM_3); }, 3, 0); - DIE({ Kokkos::View v_4("v_4", PARAM_4); }, 4, 0); - DIE({ Kokkos::View v_5("v_5", PARAM_5); }, 5, 0); - LIVE({ Kokkos::View v_6("v_6", PARAM_6); }, 6, 0); - DIE({ Kokkos::View v_7("v_7", PARAM_7); }, 7, 0); - } - - { - // test View parameters for View dim = 7, dynamic = 0 - LIVE({ Kokkos::View v_0("v_0" PARAM_0); }, 0, 0); - DIE({ Kokkos::View v_1("v_1", PARAM_1); }, 1, 0); - DIE({ Kokkos::View v_2("v_2", PARAM_2); }, 2, 0); - DIE({ Kokkos::View v_3("v_3", PARAM_3); }, 3, 0); - DIE({ Kokkos::View v_4("v_4", PARAM_4); }, 4, 0); - DIE({ Kokkos::View v_5("v_5", PARAM_5); }, 5, 0); - DIE({ Kokkos::View v_6("v_6", PARAM_6); }, 6, 0); - LIVE({ Kokkos::View v_7("v_7", PARAM_7); }, 7, 0); - } -} +template <> +struct MixedRank<0> { + using type = int; +}; TEST(TEST_CATEGORY_DEATH, view_construction_with_wrong_params_mix) { ::testing::FLAGS_gtest_death_test_style = "threadsafe"; - using DType_0 = DType; - using DType_1 = DType[1]; - using DType_2 = DType * [1]; - using DType_3 = DType * * [1]; - using DType_4 = DType ** * [1]; - using DType_5 = DType *** * [1]; - using DType_6 = DType **** * [1]; - using DType_7 = DType ***** * [1]; - { - // test View parameters for View dim = 0, dynamic = 0 - LIVE({ Kokkos::View v_0("v_0" PARAM_0); }, 0, 0); - DIE({ Kokkos::View v_1("v_1", PARAM_1); }, 1, 0); - DIE({ Kokkos::View v_2("v_2", PARAM_2); }, 2, 0); - DIE({ Kokkos::View v_3("v_3", PARAM_3); }, 3, 0); - DIE({ Kokkos::View v_4("v_4", PARAM_4); }, 4, 0); - DIE({ Kokkos::View v_5("v_5", PARAM_5); }, 5, 0); - DIE({ Kokkos::View v_6("v_6", PARAM_6); }, 6, 0); - DIE({ Kokkos::View v_7("v_7", PARAM_7); }, 7, 0); - } - - { - // test View parameters for View dim = 1, dynamic = 0 - LIVE({ Kokkos::View v_0("v_0" PARAM_0); }, 0, 0); - LIVE({ Kokkos::View v_1("v_1", PARAM_1); }, 1, 0); - DIE({ Kokkos::View v_2("v_2", PARAM_2); }, 2, 0); - DIE({ Kokkos::View v_3("v_3", PARAM_3); }, 3, 0); - DIE({ Kokkos::View v_4("v_4", PARAM_4); }, 4, 0); - DIE({ Kokkos::View v_5("v_5", PARAM_5); }, 5, 0); - DIE({ Kokkos::View v_6("v_6", PARAM_6); }, 6, 0); - DIE({ Kokkos::View v_7("v_7", PARAM_7); }, 7, 0); - } - - { - // test View parameters for View dim = 2, dynamic = 1 - DIE({ Kokkos::View v_0("v_0" PARAM_0); }, 0, 1); - LIVE({ Kokkos::View v_1("v_1", PARAM_1); }, 1, 1); - LIVE({ Kokkos::View v_2("v_2", PARAM_2); }, 2, 1); - DIE({ Kokkos::View v_3("v_3", PARAM_3); }, 3, 1); - DIE({ Kokkos::View v_4("v_4", PARAM_4); }, 4, 1); - DIE({ Kokkos::View v_5("v_5", PARAM_5); }, 5, 1); - DIE({ Kokkos::View v_6("v_6", PARAM_6); }, 6, 1); - DIE({ Kokkos::View v_7("v_7", PARAM_7); }, 7, 1); - } - - { - // test View parameters for View dim = 3, dynamic = 2 - DIE({ Kokkos::View 
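StaticRank and MixedRank complete the picture, generating the all-static `int[1][1]...` types and the mixed `int*...* [1]` types that the _stat and _mix tests used to write out as DType_N aliases. A self-contained sketch, under the same `<int N>` assumption:

```cpp
#include <type_traits>

template <int N>
struct DynamicRank { using type = typename DynamicRank<N - 1>::type*; };
template <>
struct DynamicRank<0> { using type = int; };

// All extents static: one [1] per rank level.
template <int N>
struct StaticRank { using type = typename StaticRank<N - 1>::type[1]; };
template <>
struct StaticRank<0> { using type = int; };

// All but the last extent dynamic: pointers wrapped in a trailing [1].
template <int N>
struct MixedRank { using type = typename DynamicRank<N - 1>::type[1]; };
template <>
struct MixedRank<0> { using type = int; };

// Mirrors the removed hand-written aliases (e.g. the rank-3 DType_3 forms):
static_assert(std::is_same_v<StaticRank<3>::type, int[1][1][1]>);
static_assert(std::is_same_v<MixedRank<3>::type, int** [1]>);
static_assert(std::is_same_v<DynamicRank<3>::type, int***>);

int main() {}
```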
v_0("v_0" PARAM_0); }, 0, 2); - DIE({ Kokkos::View v_1("v_1", PARAM_1); }, 1, 2); - LIVE({ Kokkos::View v_2("v_2", PARAM_2); }, 2, 2); - LIVE({ Kokkos::View v_3("v_3", PARAM_3); }, 3, 2); - DIE({ Kokkos::View v_4("v_4", PARAM_4); }, 4, 2); - DIE({ Kokkos::View v_5("v_5", PARAM_5); }, 5, 2); - DIE({ Kokkos::View v_6("v_6", PARAM_6); }, 6, 2); - DIE({ Kokkos::View v_7("v_7", PARAM_7); }, 7, 2); - } +#ifdef KOKKOS_ENABLE_DEBUG_BOUNDS_CHECKS + test_matching_arguments_rank<0, 0, MixedRank>(); // dim = 0, dynamic = 0 + test_matching_arguments_rank<1, 0, MixedRank>(); // dim = 1, dynamic = 0 + test_matching_arguments_rank<2, 1, MixedRank>(); // dim = 2, dynamic = 1 + test_matching_arguments_rank<3, 2, MixedRank>(); // dim = 3, dynamic = 2 + test_matching_arguments_rank<4, 3, MixedRank>(); // dim = 4, dynamic = 3 + test_matching_arguments_rank<5, 4, MixedRank>(); // dim = 5, dynamic = 4 + test_matching_arguments_rank<6, 5, MixedRank>(); // dim = 6, dynamic = 5 + test_matching_arguments_rank<7, 6, MixedRank>(); // dim = 7, dynamic = 6 + test_matching_arguments_rank<8, 7, MixedRank>(); // dim = 8, dynamic = 7 +#endif +} - { - // test View parameters for View dim = 4, dynamic = 3 - DIE({ Kokkos::View v_0("v_0" PARAM_0); }, 0, 3); - DIE({ Kokkos::View v_1("v_1", PARAM_1); }, 1, 3); - DIE({ Kokkos::View v_2("v_2", PARAM_2); }, 2, 3); - LIVE({ Kokkos::View v_3("v_3", PARAM_3); }, 3, 3); - LIVE({ Kokkos::View v_4("v_4", PARAM_4); }, 4, 3); - DIE({ Kokkos::View v_5("v_5", PARAM_5); }, 5, 3); - DIE({ Kokkos::View v_6("v_6", PARAM_6); }, 6, 3); - DIE({ Kokkos::View v_7("v_7", PARAM_7); }, 7, 3); - } +#define CHECK_DEATH(EXPR) \ + ASSERT_DEATH(EXPR, \ + "The specified run-time extent for Kokkos::View 'v' does not " \ + "match the compile-time extent in dimension 0. The given " \ + "extent is 2 but should be 1.") - { - // test View parameters for View dim = 5, dynamic = 4 - DIE({ Kokkos::View v_0("v_0" PARAM_0); }, 0, 4); - DIE({ Kokkos::View v_1("v_1", PARAM_1); }, 1, 4); - DIE({ Kokkos::View v_2("v_2", PARAM_2); }, 2, 4); - DIE({ Kokkos::View v_3("v_3", PARAM_3); }, 3, 4); - LIVE({ Kokkos::View v_4("v_4", PARAM_4); }, 4, 4); - LIVE({ Kokkos::View v_5("v_5", PARAM_5); }, 5, 4); - DIE({ Kokkos::View v_6("v_6", PARAM_6); }, 6, 4); - DIE({ Kokkos::View v_7("v_7", PARAM_7); }, 7, 4); - } +#define CHECK_DEATH_UNMANAGED(EXPR) \ + ASSERT_DEATH( \ + EXPR, \ + "The specified run-time extent for Kokkos::View 'UNMANAGED' does not " \ + "match the compile-time extent in dimension 0. 
The given " \ + "extent is 2 but should be 1.") - { - // test View parameters for View dim = 6, dynamic = 5 - DIE({ Kokkos::View v_0("v_0" PARAM_0); }, 0, 5); - DIE({ Kokkos::View v_1("v_1", PARAM_1); }, 1, 5); - DIE({ Kokkos::View v_2("v_2", PARAM_2); }, 2, 5); - DIE({ Kokkos::View v_3("v_3", PARAM_3); }, 3, 5); - DIE({ Kokkos::View v_4("v_4", PARAM_4); }, 4, 5); - LIVE({ Kokkos::View v_5("v_5", PARAM_5); }, 5, 5); - LIVE({ Kokkos::View v_6("v_6", PARAM_6); }, 6, 5); - DIE({ Kokkos::View v_7("v_7", PARAM_7); }, 7, 5); - } +TEST(TEST_CATEGORY_DEATH, view_construction_with_wrong_static_extents) { + ::testing::FLAGS_gtest_death_test_style = "threadsafe"; - { - // test View parameters for View dim = 7, dynamic = 6 - DIE({ Kokkos::View v_0("v_0" PARAM_0); }, 0, 6); - DIE({ Kokkos::View v_1("v_1", PARAM_1); }, 1, 6); - DIE({ Kokkos::View v_2("v_2", PARAM_2); }, 2, 6); - DIE({ Kokkos::View v_3("v_3", PARAM_3); }, 3, 6); - DIE({ Kokkos::View v_4("v_4", PARAM_4); }, 4, 6); - DIE({ Kokkos::View v_5("v_5", PARAM_5); }, 5, 6); - LIVE({ Kokkos::View v_6("v_6", PARAM_6); }, 6, 6); - LIVE({ Kokkos::View v_7("v_7", PARAM_7); }, 7, 6); - } +#ifdef KOKKOS_ENABLE_DEBUG_BOUNDS_CHECKS + // clang-format off + CHECK_DEATH({ Kokkos::View v("v", 2); }); + CHECK_DEATH({ Kokkos::View v("v", 2, 1); }); + CHECK_DEATH({ Kokkos::View v("v", 2, 1, 1); }); + CHECK_DEATH({ Kokkos::View v("v", 2, 1, 1, 1); }); + CHECK_DEATH({ Kokkos::View v("v", 2, 1, 1, 1, 1); }); + CHECK_DEATH({ Kokkos::View v("v", 2, 1, 1, 1, 1, 1); }); + CHECK_DEATH({ Kokkos::View v("v", 2, 1, 1, 1, 1, 1, 1); }); + CHECK_DEATH({ Kokkos::View v("v", 2, 1, 1, 1, 1, 1, 1, 1); }); + + CHECK_DEATH_UNMANAGED({ Kokkos::View v(nullptr, 2); }); + CHECK_DEATH_UNMANAGED({ Kokkos::View v(nullptr, 2, 1); }); + CHECK_DEATH_UNMANAGED({ Kokkos::View v(nullptr, 2, 1, 1); }); + CHECK_DEATH_UNMANAGED({ Kokkos::View v(nullptr, 2, 1, 1, 1); }); + CHECK_DEATH_UNMANAGED({ Kokkos::View v(nullptr, 2, 1, 1, 1, 1); }); + CHECK_DEATH_UNMANAGED({ Kokkos::View v(nullptr, 2, 1, 1, 1, 1, 1); }); + CHECK_DEATH_UNMANAGED({ Kokkos::View v(nullptr, 2, 1, 1, 1, 1, 1, 1); }); + CHECK_DEATH_UNMANAGED({ Kokkos::View v(nullptr, 2, 1, 1, 1, 1, 1, 1, 1); }); + // clang-format on +#endif } -#endif // KOKKOS_ENABLE_OPENMPTARGET - -#undef PARAM_0 -#undef PARAM_1 -#undef PARAM_2 -#undef PARAM_3 -#undef PARAM_4 -#undef PARAM_5 -#undef PARAM_6 -#undef PARAM_7 -#undef PARAM_0_RANK -#undef PARAM_1_RANK -#undef PARAM_2_RANK -#undef PARAM_3_RANK -#undef PARAM_4_RANK -#undef PARAM_5_RANK -#undef PARAM_6_RANK -#undef PARAM_7_RANK - -#undef DType - -#undef LIVE -#undef DIE +#undef CHECK_DEATH +#endif // KOKKOS_ENABLE_OPENMPTARGET } // namespace Test diff --git a/core/unit_test/TestViewEmptyRuntimeUnmanaged.hpp b/core/unit_test/TestViewEmptyRuntimeUnmanaged.hpp new file mode 100644 index 00000000000..b156b72860e --- /dev/null +++ b/core/unit_test/TestViewEmptyRuntimeUnmanaged.hpp @@ -0,0 +1,55 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. 
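The CHECK_DEATH and CHECK_DEATH_UNMANAGED assertions above pin down the other half of the contract: a run-time extent may be passed for a compile-time dimension, but it must equal the static extent, and a mismatch aborts when debug bounds checking is enabled. A sketch of both sides (the abort path is left commented out):

```cpp
#include <Kokkos_Core.hpp>

int main(int argc, char* argv[]) {
  Kokkos::initialize(argc, argv);
  {
    // Run-time extent 1 matches the static extent 1: fine.
    Kokkos::View<int[1]> ok("v", 1);
    // Run-time extent 2 contradicts the static extent 1: aborts with
    // "The specified run-time extent ... does not match the compile-time
    // extent" when KOKKOS_ENABLE_DEBUG_BOUNDS_CHECK is defined.
    // Kokkos::View<int[1]> bad("v", 2);
  }
  Kokkos::finalize();
}
```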
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#include + +#include + +namespace { + +template +void test_empty_view_runtime_unmanaged() { + T d{}; + auto* p = reinterpret_cast(0xABADBABE); + + (void)Kokkos::View(p); + (void)Kokkos::View(&d); + (void)Kokkos::View(nullptr); + (void)Kokkos::View(NULL); // NOLINT(modernize-use-nullptr) + (void)Kokkos::View(0); // NOLINT(modernize-use-nullptr) + + (void)Kokkos::View(p, 0); + (void)Kokkos::View(&d, 0); + (void)Kokkos::View(nullptr, 0); + (void)Kokkos::View(NULL, 0); // NOLINT(modernize-use-nullptr) + (void)Kokkos::View(0, 0); // NOLINT(modernize-use-nullptr) + + (void)Kokkos::View(p, 0, 0); + (void)Kokkos::View(&d, 0, 0); + (void)Kokkos::View(nullptr, 0, 0); + (void)Kokkos::View(NULL, 0, 0); // NOLINT(modernize-use-nullptr) + (void)Kokkos::View(0, 0, 0); // NOLINT(modernize-use-nullptr) +} + +TEST(TEST_CATEGORY, view_empty_runtime_unmanaged) { + test_empty_view_runtime_unmanaged(); + test_empty_view_runtime_unmanaged(); + test_empty_view_runtime_unmanaged(); + test_empty_view_runtime_unmanaged(); + test_empty_view_runtime_unmanaged(); +} + +} // namespace diff --git a/core/unit_test/TestViewMapping_a.hpp b/core/unit_test/TestViewMapping_a.hpp index 9173f0d4316..a4dfdb26e3f 100644 --- a/core/unit_test/TestViewMapping_a.hpp +++ b/core/unit_test/TestViewMapping_a.hpp @@ -73,67 +73,67 @@ void test_view_mapping() { ASSERT_LE(sizeof(dim_s0_s0_s0_s0_s0_s0_s0), 8 * sizeof(unsigned)); ASSERT_EQ(sizeof(dim_s0_s0_s0_s0_s0_s0_s0_s0), 8 * sizeof(unsigned)); #endif - static_assert(int(dim_0::rank) == int(0), ""); - static_assert(int(dim_0::rank_dynamic) == int(0), ""); - static_assert(int(dim_0::ArgN0) == 1, ""); - static_assert(int(dim_0::ArgN1) == 1, ""); - static_assert(int(dim_0::ArgN2) == 1, ""); - - static_assert(int(dim_s2::rank) == int(1), ""); - static_assert(int(dim_s2::rank_dynamic) == int(0), ""); - static_assert(int(dim_s2::ArgN0) == 2, ""); - static_assert(int(dim_s2::ArgN1) == 1, ""); - - static_assert(int(dim_s2_s3::rank) == int(2), ""); - static_assert(int(dim_s2_s3::rank_dynamic) == int(0), ""); - static_assert(int(dim_s2_s3::ArgN0) == 2, ""); - static_assert(int(dim_s2_s3::ArgN1) == 3, ""); - static_assert(int(dim_s2_s3::ArgN2) == 1, ""); - - static_assert(int(dim_s2_s3_s4::rank) == int(3), ""); - static_assert(int(dim_s2_s3_s4::rank_dynamic) == int(0), ""); - static_assert(int(dim_s2_s3_s4::ArgN0) == 2, ""); - static_assert(int(dim_s2_s3_s4::ArgN1) == 3, ""); - static_assert(int(dim_s2_s3_s4::ArgN2) == 4, ""); - static_assert(int(dim_s2_s3_s4::ArgN3) == 1, ""); - - static_assert(int(dim_s0::rank) == int(1), ""); - static_assert(int(dim_s0::rank_dynamic) == int(1), ""); - - static_assert(int(dim_s0_s3::rank) == int(2), ""); - static_assert(int(dim_s0_s3::rank_dynamic) == int(1), ""); - static_assert(int(dim_s0_s3::ArgN0) == 0, ""); - static_assert(int(dim_s0_s3::ArgN1) == 3, ""); - - static_assert(int(dim_s0_s3_s4::rank) == int(3), ""); - static_assert(int(dim_s0_s3_s4::rank_dynamic) == int(1), ""); - static_assert(int(dim_s0_s3_s4::ArgN0) == 0, ""); - static_assert(int(dim_s0_s3_s4::ArgN1) == 3, ""); - static_assert(int(dim_s0_s3_s4::ArgN2) == 4, ""); - - static_assert(int(dim_s0_s0_s4::rank) == int(3), ""); - static_assert(int(dim_s0_s0_s4::rank_dynamic) == int(2), ""); - static_assert(int(dim_s0_s0_s4::ArgN0) == 0, ""); - static_assert(int(dim_s0_s0_s4::ArgN1) == 0, ""); - static_assert(int(dim_s0_s0_s4::ArgN2) == 4, ""); - - static_assert(int(dim_s0_s0_s0::rank) == int(3), ""); - 
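The TestViewEmptyRuntimeUnmanaged.hpp cases above all rely on the same guarantee: a zero-extent unmanaged View built from a null (or otherwise unusable) pointer allocates nothing and is safe to construct and destroy. A companion usage sketch; the is_allocated() check is an assumption about the tested behavior, not part of the diff:

```cpp
#include <Kokkos_Core.hpp>
#include <cassert>

int main(int argc, char* argv[]) {
  Kokkos::initialize(argc, argv);
  {
    // Unmanaged, zero extent: no allocation, no deallocation on destruction.
    Kokkos::View<int*, Kokkos::HostSpace> v(nullptr, 0);
    assert(v.size() == 0);
    assert(v.data() == nullptr);
    assert(!v.is_allocated());
  }
  Kokkos::finalize();
}
```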
static_assert(int(dim_s0_s0_s0::rank_dynamic) == int(3), ""); - - static_assert(int(dim_s0_s0_s0_s0::rank) == int(4), ""); - static_assert(int(dim_s0_s0_s0_s0::rank_dynamic) == int(4), ""); - - static_assert(int(dim_s0_s0_s0_s0_s0::rank) == int(5), ""); - static_assert(int(dim_s0_s0_s0_s0_s0::rank_dynamic) == int(5), ""); - - static_assert(int(dim_s0_s0_s0_s0_s0_s0::rank) == int(6), ""); - static_assert(int(dim_s0_s0_s0_s0_s0_s0::rank_dynamic) == int(6), ""); - - static_assert(int(dim_s0_s0_s0_s0_s0_s0_s0::rank) == int(7), ""); - static_assert(int(dim_s0_s0_s0_s0_s0_s0_s0::rank_dynamic) == int(7), ""); - - static_assert(int(dim_s0_s0_s0_s0_s0_s0_s0_s0::rank) == int(8), ""); - static_assert(int(dim_s0_s0_s0_s0_s0_s0_s0_s0::rank_dynamic) == int(8), ""); + static_assert(int(dim_0::rank) == int(0)); + static_assert(int(dim_0::rank_dynamic) == int(0)); + static_assert(int(dim_0::ArgN0) == 1); + static_assert(int(dim_0::ArgN1) == 1); + static_assert(int(dim_0::ArgN2) == 1); + + static_assert(int(dim_s2::rank) == int(1)); + static_assert(int(dim_s2::rank_dynamic) == int(0)); + static_assert(int(dim_s2::ArgN0) == 2); + static_assert(int(dim_s2::ArgN1) == 1); + + static_assert(int(dim_s2_s3::rank) == int(2)); + static_assert(int(dim_s2_s3::rank_dynamic) == int(0)); + static_assert(int(dim_s2_s3::ArgN0) == 2); + static_assert(int(dim_s2_s3::ArgN1) == 3); + static_assert(int(dim_s2_s3::ArgN2) == 1); + + static_assert(int(dim_s2_s3_s4::rank) == int(3)); + static_assert(int(dim_s2_s3_s4::rank_dynamic) == int(0)); + static_assert(int(dim_s2_s3_s4::ArgN0) == 2); + static_assert(int(dim_s2_s3_s4::ArgN1) == 3); + static_assert(int(dim_s2_s3_s4::ArgN2) == 4); + static_assert(int(dim_s2_s3_s4::ArgN3) == 1); + + static_assert(int(dim_s0::rank) == int(1)); + static_assert(int(dim_s0::rank_dynamic) == int(1)); + + static_assert(int(dim_s0_s3::rank) == int(2)); + static_assert(int(dim_s0_s3::rank_dynamic) == int(1)); + static_assert(int(dim_s0_s3::ArgN0) == 0); + static_assert(int(dim_s0_s3::ArgN1) == 3); + + static_assert(int(dim_s0_s3_s4::rank) == int(3)); + static_assert(int(dim_s0_s3_s4::rank_dynamic) == int(1)); + static_assert(int(dim_s0_s3_s4::ArgN0) == 0); + static_assert(int(dim_s0_s3_s4::ArgN1) == 3); + static_assert(int(dim_s0_s3_s4::ArgN2) == 4); + + static_assert(int(dim_s0_s0_s4::rank) == int(3)); + static_assert(int(dim_s0_s0_s4::rank_dynamic) == int(2)); + static_assert(int(dim_s0_s0_s4::ArgN0) == 0); + static_assert(int(dim_s0_s0_s4::ArgN1) == 0); + static_assert(int(dim_s0_s0_s4::ArgN2) == 4); + + static_assert(int(dim_s0_s0_s0::rank) == int(3)); + static_assert(int(dim_s0_s0_s0::rank_dynamic) == int(3)); + + static_assert(int(dim_s0_s0_s0_s0::rank) == int(4)); + static_assert(int(dim_s0_s0_s0_s0::rank_dynamic) == int(4)); + + static_assert(int(dim_s0_s0_s0_s0_s0::rank) == int(5)); + static_assert(int(dim_s0_s0_s0_s0_s0::rank_dynamic) == int(5)); + + static_assert(int(dim_s0_s0_s0_s0_s0_s0::rank) == int(6)); + static_assert(int(dim_s0_s0_s0_s0_s0_s0::rank_dynamic) == int(6)); + + static_assert(int(dim_s0_s0_s0_s0_s0_s0_s0::rank) == int(7)); + static_assert(int(dim_s0_s0_s0_s0_s0_s0_s0::rank_dynamic) == int(7)); + + static_assert(int(dim_s0_s0_s0_s0_s0_s0_s0_s0::rank) == int(8)); + static_assert(int(dim_s0_s0_s0_s0_s0_s0_s0_s0::rank_dynamic) == int(8)); dim_s0 d1(2, 3, 4, 5, 6, 7, 8, 9); dim_s0_s0 d2(2, 3, 4, 5, 6, 7, 8, 9); @@ -514,11 +514,11 @@ void test_view_mapping() { { using namespace Kokkos::Impl; - static_assert(rank_dynamic<>::value == 0, ""); - static_assert(rank_dynamic<1>::value == 
0, ""); - static_assert(rank_dynamic<0>::value == 1, ""); - static_assert(rank_dynamic<0, 1>::value == 1, ""); - static_assert(rank_dynamic<0, 0, 1>::value == 2, ""); + static_assert(rank_dynamic<>::value == 0); + static_assert(rank_dynamic<1>::value == 0); + static_assert(rank_dynamic<0>::value == 1); + static_assert(rank_dynamic<0, 1>::value == 1); + static_assert(rank_dynamic<0, 0, 1>::value == 2); } { @@ -529,54 +529,48 @@ void test_view_mapping() { using a_const_int_r1 = ViewArrayAnalysis; using a_const_int_r5 = ViewArrayAnalysis; - static_assert(a_int_r1::dimension::rank == 1, ""); - static_assert(a_int_r1::dimension::rank_dynamic == 1, ""); - static_assert(a_int_r5::dimension::ArgN0 == 0, ""); - static_assert(a_int_r5::dimension::ArgN1 == 0, ""); - static_assert(a_int_r5::dimension::ArgN2 == 4, ""); - static_assert(a_int_r5::dimension::ArgN3 == 5, ""); - static_assert(a_int_r5::dimension::ArgN4 == 6, ""); - static_assert(a_int_r5::dimension::ArgN5 == 1, ""); + static_assert(a_int_r1::dimension::rank == 1); + static_assert(a_int_r1::dimension::rank_dynamic == 1); + static_assert(a_int_r5::dimension::ArgN0 == 0); + static_assert(a_int_r5::dimension::ArgN1 == 0); + static_assert(a_int_r5::dimension::ArgN2 == 4); + static_assert(a_int_r5::dimension::ArgN3 == 5); + static_assert(a_int_r5::dimension::ArgN4 == 6); + static_assert(a_int_r5::dimension::ArgN5 == 1); static_assert( - std::is_same >::value, - ""); + std::is_same >::value); static_assert( - std::is_same::value, ""); + std::is_same::value); - static_assert(a_const_int_r1::dimension::rank == 1, ""); - static_assert(a_const_int_r1::dimension::rank_dynamic == 1, ""); + static_assert(a_const_int_r1::dimension::rank == 1); + static_assert(a_const_int_r1::dimension::rank_dynamic == 1); static_assert(std::is_same >::value, - ""); - static_assert( - std::is_same::value, - ""); + ViewDimension<0> >::value); + static_assert(std::is_same::value); - static_assert(a_const_int_r5::dimension::rank == 5, ""); - static_assert(a_const_int_r5::dimension::rank_dynamic == 2, ""); + static_assert(a_const_int_r5::dimension::rank == 5); + static_assert(a_const_int_r5::dimension::rank_dynamic == 2); - static_assert(a_const_int_r5::dimension::ArgN0 == 0, ""); - static_assert(a_const_int_r5::dimension::ArgN1 == 0, ""); - static_assert(a_const_int_r5::dimension::ArgN2 == 4, ""); - static_assert(a_const_int_r5::dimension::ArgN3 == 5, ""); - static_assert(a_const_int_r5::dimension::ArgN4 == 6, ""); - static_assert(a_const_int_r5::dimension::ArgN5 == 1, ""); + static_assert(a_const_int_r5::dimension::ArgN0 == 0); + static_assert(a_const_int_r5::dimension::ArgN1 == 0); + static_assert(a_const_int_r5::dimension::ArgN2 == 4); + static_assert(a_const_int_r5::dimension::ArgN3 == 5); + static_assert(a_const_int_r5::dimension::ArgN4 == 6); + static_assert(a_const_int_r5::dimension::ArgN5 == 1); static_assert(std::is_same >::value, - ""); - static_assert( - std::is_same::value, - ""); + ViewDimension<0, 0, 4, 5, 6> >::value); + static_assert(std::is_same::value); - static_assert(a_int_r5::dimension::rank == 5, ""); - static_assert(a_int_r5::dimension::rank_dynamic == 2, ""); + static_assert(a_int_r5::dimension::rank == 5); + static_assert(a_int_r5::dimension::rank_dynamic == 2); static_assert(std::is_same >::value, - ""); + ViewDimension<0, 0, 4, 5, 6> >::value); static_assert( - std::is_same::value, ""); + std::is_same::value); } { @@ -587,15 +581,15 @@ void test_view_mapping() { // Dimensions of t_i4 are appended to the multdimensional array. 
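The rank_dynamic trait exercised above counts the leading dynamic extents, where an argument of 0 marks a dynamic dimension. A simplified stand-in that satisfies the same assertions:

```cpp
#include <cstddef>

template <std::size_t... Args>
struct rank_dynamic;

template <>
struct rank_dynamic<> {
  static constexpr unsigned value = 0;
};

// A leading 0 is a dynamic extent; the first nonzero extent stops the count.
template <std::size_t N0, std::size_t... Ns>
struct rank_dynamic<N0, Ns...> {
  static constexpr unsigned value =
      N0 == 0 ? 1u + rank_dynamic<Ns...>::value : 0u;
};

static_assert(rank_dynamic<>::value == 0);
static_assert(rank_dynamic<1>::value == 0);
static_assert(rank_dynamic<0>::value == 1);
static_assert(rank_dynamic<0, 1>::value == 1);
static_assert(rank_dynamic<0, 0, 1>::value == 2);

int main() {}
```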
using a_int_r5 = ViewArrayAnalysis; - static_assert(a_int_r5::dimension::rank == 5, ""); - static_assert(a_int_r5::dimension::rank_dynamic == 3, ""); - static_assert(a_int_r5::dimension::ArgN0 == 0, ""); - static_assert(a_int_r5::dimension::ArgN1 == 0, ""); - static_assert(a_int_r5::dimension::ArgN2 == 0, ""); - static_assert(a_int_r5::dimension::ArgN3 == 3, ""); - static_assert(a_int_r5::dimension::ArgN4 == 4, ""); + static_assert(a_int_r5::dimension::rank == 5); + static_assert(a_int_r5::dimension::rank_dynamic == 3); + static_assert(a_int_r5::dimension::ArgN0 == 0); + static_assert(a_int_r5::dimension::ArgN1 == 0); + static_assert(a_int_r5::dimension::ArgN2 == 0); + static_assert(a_int_r5::dimension::ArgN3 == 3); + static_assert(a_int_r5::dimension::ArgN4 == 4); static_assert( - std::is_same::value, ""); + std::is_same::value); } { @@ -603,71 +597,54 @@ void test_view_mapping() { using a_const_int_r1 = ViewDataAnalysis; - static_assert(std::is_void::value, ""); + static_assert(std::is_void::value); static_assert(std::is_same >::value, - ""); + Kokkos::Impl::ViewDimension<0> >::value); static_assert( - std::is_same::value, ""); + std::is_same::value); static_assert( - std::is_same::value, - ""); + std::is_same::value); static_assert(std::is_same::value, - ""); + const int*>::value); static_assert( - std::is_same::value, - ""); + std::is_same::value); static_assert(std::is_same::value, - ""); + const int>::value); static_assert(std::is_same::value, - ""); + const int*>::value); static_assert( - std::is_same::value, ""); - static_assert( - std::is_same::value, - ""); + std::is_same::value); + static_assert(std::is_same::value); using a_const_int_r3 = ViewDataAnalysis; - static_assert(std::is_void::value, ""); + static_assert(std::is_void::value); static_assert(std::is_same >::value, - ""); + Kokkos::Impl::ViewDimension<0, 0, 4> >::value); static_assert( - std::is_same::value, - ""); + std::is_same::value); static_assert( - std::is_same::value, - ""); + std::is_same::value); static_assert(std::is_same::value, - ""); + const int* * [4]>::value); static_assert(std::is_same::value, - ""); + const int* * [4]>::value); static_assert(std::is_same::value, - ""); + const int>::value); static_assert(std::is_same::value, - ""); + const int* * [4]>::value); static_assert(std::is_same::value, - ""); - static_assert( - std::is_same::value, - ""); + int* * [4]>::value); + static_assert(std::is_same::value); static_assert( std::is_same::value, - ""); + int* * [4]>::value); // std::cout << "typeid( const int**[4] ).name() = " << typeid( const // int**[4] ).name() << std::endl; diff --git a/core/unit_test/TestViewMapping_b.hpp b/core/unit_test/TestViewMapping_b.hpp index 9ac4e7da845..4aee035d17a 100644 --- a/core/unit_test/TestViewMapping_b.hpp +++ b/core/unit_test/TestViewMapping_b.hpp @@ -156,7 +156,7 @@ TEST(TEST_CATEGORY, view_mapping_assignable) { using dst_traits = Kokkos::ViewTraits; using src_traits = Kokkos::ViewTraits; using mapping = Kokkos::Impl::ViewMapping; - static_assert(mapping::is_assignable, ""); + static_assert(mapping::is_assignable); Kokkos::View src; Kokkos::View dst(src); @@ -167,7 +167,7 @@ TEST(TEST_CATEGORY, view_mapping_assignable) { using dst_traits = Kokkos::ViewTraits; using src_traits = Kokkos::ViewTraits; using mapping = Kokkos::Impl::ViewMapping; - static_assert(mapping::is_assignable, ""); + static_assert(mapping::is_assignable); Kokkos::View src; Kokkos::View dst(src); @@ -180,7 +180,7 @@ TEST(TEST_CATEGORY, view_mapping_assignable) { using src_traits = 
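The ViewArrayAnalysis and ViewDataAnalysis assertions above all follow from one decomposition of the declared data type: each leading `*` contributes a dynamic extent and each trailing `[N]` a static one. A toy trait (hypothetical, far simpler than the real one) that reproduces the asserted numbers:

```cpp
#include <cstddef>
#include <type_traits>

template <class T>
struct analyze {
  static constexpr unsigned rank = 0;
  static constexpr unsigned rank_dynamic = 0;
  using value_type = T;
};

template <class T>
struct analyze<T*> {  // a '*' adds one dynamic dimension
  static constexpr unsigned rank = analyze<T>::rank + 1;
  static constexpr unsigned rank_dynamic = analyze<T>::rank_dynamic + 1;
  using value_type = typename analyze<T>::value_type;
};

template <class T, std::size_t N>
struct analyze<T[N]> {  // a '[N]' adds one static dimension
  static constexpr unsigned rank = analyze<T>::rank + 1;
  static constexpr unsigned rank_dynamic = analyze<T>::rank_dynamic;
  using value_type = typename analyze<T>::value_type;
};

// Matches the ViewDimension<0, 0, 4> decomposition asserted above:
static_assert(analyze<int* * [4]>::rank == 3);
static_assert(analyze<int* * [4]>::rank_dynamic == 2);
static_assert(std::is_same_v<analyze<int* * [4]>::value_type, int>);

int main() {}
```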
Kokkos::ViewTraits; using mapping = Kokkos::Impl::ViewMapping; - static_assert(mapping::is_assignable, ""); + static_assert(mapping::is_assignable); Kokkos::View src; Kokkos::View dst(src); @@ -193,7 +193,7 @@ TEST(TEST_CATEGORY, view_mapping_assignable) { using src_traits = Kokkos::ViewTraits; using mapping = Kokkos::Impl::ViewMapping; - static_assert(mapping::is_assignable, ""); + static_assert(mapping::is_assignable); Kokkos::View src; Kokkos::View dst(src); @@ -206,7 +206,7 @@ TEST(TEST_CATEGORY, view_mapping_assignable) { using src_traits = Kokkos::ViewTraits; using mapping = Kokkos::Impl::ViewMapping; - static_assert(!mapping::is_assignable, ""); + static_assert(!mapping::is_assignable); } { // Assignment of rank-2 Right = Left @@ -215,7 +215,7 @@ TEST(TEST_CATEGORY, view_mapping_assignable) { using src_traits = Kokkos::ViewTraits; using mapping = Kokkos::Impl::ViewMapping; - static_assert(!mapping::is_assignable, ""); + static_assert(!mapping::is_assignable); } } @@ -226,7 +226,7 @@ TEST(TEST_CATEGORY, view_mapping_trivially_copyable) { using src_traits = dst_traits; using mapping = Kokkos::Impl::ViewMapping; - static_assert(std::is_trivially_copyable{}, ""); + static_assert(std::is_trivially_copyable{}); } } // namespace Test diff --git a/core/unit_test/TestViewOutOfBoundsAccess.hpp b/core/unit_test/TestViewOutOfBoundsAccess.hpp new file mode 100644 index 00000000000..2716856c1fc --- /dev/null +++ b/core/unit_test/TestViewOutOfBoundsAccess.hpp @@ -0,0 +1,175 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#include +#include + +#include + +namespace { + +TEST(TEST_CATEGORY, append_formatted_multidimensional_index) { + using Kokkos::Impl::append_formatted_multidimensional_index; + { + char buffer[64] = "my prefix "; + append_formatted_multidimensional_index(buffer, 1); + EXPECT_STREQ(buffer, "my prefix [1]"); + } + { + char buffer[64] = "I was here"; + append_formatted_multidimensional_index(buffer, 1, 2, 3); + EXPECT_STREQ(buffer, "I was here[1,2,3]"); + } + { + char buffer[64] = "with mixed integer types "; + append_formatted_multidimensional_index(buffer, 1u, -2); + EXPECT_STREQ(buffer, "with mixed integer types [1,-2]"); + } +} + +#ifdef KOKKOS_ENABLE_DEBUG_BOUNDS_CHECK + +template +struct TestViewOutOfBoundAccess { + View v; + static constexpr auto rank = View::rank; + + template + KOKKOS_FUNCTION decltype(auto) bad_access(std::index_sequence) const { + return v((Is * 1 + Is == 0 ? v.extent(Is) + 3 : 0)...); + } + + KOKKOS_FUNCTION void operator()(int) const { + ++bad_access(std::make_index_sequence{}); + } + + template + std::string get_details(std::index_sequence) { + std::stringstream ss; + ss << "with indices \\["; + ((ss << (Is == 0 ? v.extent(Is) + 3 : 0) + << (Is == View::rank() - 1 ? "\\]" : ",")), + ...); + ss << " but extents \\["; + ((ss << v.extent(Is) << (Is == View::rank() - 1 ? 
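For reference, a standalone approximation (assumed behavior, not the real Impl helper) of the formatting that append_formatted_multidimensional_index is tested for above:

```cpp
#include <cstddef>
#include <cstdio>
#include <cstring>

// Appends "[i0,i1,...]" to a prefix already in the buffer.
template <class... Indices>
void append_index(char* buffer, std::size_t size, Indices... idx) {
  bool first = true;
  std::snprintf(buffer + std::strlen(buffer), size - std::strlen(buffer), "[");
  // Fold over the pack, comma-separating the indices.
  ((std::snprintf(buffer + std::strlen(buffer), size - std::strlen(buffer),
                  "%s%lld", first ? "" : ",", static_cast<long long>(idx)),
    first = false),
   ...);
  std::snprintf(buffer + std::strlen(buffer), size - std::strlen(buffer), "]");
}

int main() {
  char buffer[64] = "my prefix ";
  append_index(buffer, sizeof(buffer), 1, 2, 3);
  std::puts(buffer);  // my prefix [1,2,3]
}
```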
"\\]" : ",")), ...); + return ss.str(); + } + + auto get_details() { + return get_details(std::make_index_sequence()); + } + + TestViewOutOfBoundAccess(View w, ExecutionSpace const& s, std::string matcher) + : v(std::move(w)) { + constexpr bool view_accessible_from_execution_space = + Kokkos::SpaceAccessibility< + /*AccessSpace=*/ExecutionSpace, + /*MemorySpace=*/typename View::memory_space>::accessible; + EXPECT_TRUE(view_accessible_from_execution_space); + + matcher += ".*" + get_details(); + + EXPECT_DEATH( + { + Kokkos::parallel_for(Kokkos::RangePolicy(s, 0, 1), + *this); + Kokkos::fence(); + }, + matcher); + } +}; + +template +auto make_view_impl(LblOrPtr x, std::index_sequence) { + return View(x, (Is + 1)...); +} + +template +auto make_view(LblOrPtr x) { + return make_view_impl(std::move(x), + std::make_index_sequence()); +} + +template +void test_view_out_of_bounds_access() { + ExecutionSpace const exec_space{}; + // clang-format off + using V1 = Kokkos::View; + using V2 = Kokkos::View; + using V3 = Kokkos::View; + using V4 = Kokkos::View; + using V5 = Kokkos::View; + using V6 = Kokkos::View; + using V7 = Kokkos::View; + using V8 = Kokkos::View; + std::string const prefix = "Kokkos::View ERROR: out of bounds access"; + std::string const lbl = "my_label"; + TestViewOutOfBoundAccess(make_view(lbl), exec_space, prefix + ".*" + lbl); + TestViewOutOfBoundAccess(make_view(lbl), exec_space, prefix + ".*" + lbl); + TestViewOutOfBoundAccess(make_view(lbl), exec_space, prefix + ".*" + lbl); + TestViewOutOfBoundAccess(make_view(lbl), exec_space, prefix + ".*" + lbl); + TestViewOutOfBoundAccess(make_view(lbl), exec_space, prefix + ".*" + lbl); + TestViewOutOfBoundAccess(make_view(lbl), exec_space, prefix + ".*" + lbl); + TestViewOutOfBoundAccess(make_view(lbl), exec_space, prefix + ".*" + lbl); + TestViewOutOfBoundAccess(make_view(lbl), exec_space, prefix + ".*" + lbl); + int* const ptr = nullptr; + TestViewOutOfBoundAccess(make_view(ptr), exec_space, prefix + ".*UNMANAGED"); + TestViewOutOfBoundAccess(make_view(ptr), exec_space, prefix + ".*UNMANAGED"); + TestViewOutOfBoundAccess(make_view(ptr), exec_space, prefix + ".*UNMANAGED"); + TestViewOutOfBoundAccess(make_view(ptr), exec_space, prefix + ".*UNMANAGED"); + TestViewOutOfBoundAccess(make_view(ptr), exec_space, prefix + ".*UNMANAGED"); + TestViewOutOfBoundAccess(make_view(ptr), exec_space, prefix + ".*UNMANAGED"); + TestViewOutOfBoundAccess(make_view(ptr), exec_space, prefix + ".*UNMANAGED"); + TestViewOutOfBoundAccess(make_view(ptr), exec_space, prefix + ".*UNMANAGED"); + // clang-format on +} + +TEST(TEST_CATEGORY_DEATH, view_out_of_bounds_access) { + ::testing::FLAGS_gtest_death_test_style = "threadsafe"; + + using ExecutionSpace = TEST_EXECSPACE; + + if (false && Kokkos::SpaceAccessibility< + /*AccessSpace=*/ExecutionSpace, + /*MemorySpace=*/Kokkos::HostSpace>::accessible) { + GTEST_SKIP() << "skipping since no memory access violation would occur"; + } + +#if defined(KOKKOS_ENABLE_SYCL) && defined(NDEBUG) // FIXME_SYCL + if (std::is_same_v) { + GTEST_SKIP() << "skipping SYCL device-side abort does not work when NDEBUG " + "is defined"; + } +#endif +#if defined(KOKKOS_ENABLE_OPENMPTARGET) // FIXME_OPENMPTARGET + if (std::is_same_v) { + GTEST_SKIP() << "skipping because OpenMPTarget backend is currently not " + "able to abort from the device"; + } +#endif +#if defined(KOKKOS_ENABLE_OPENACC) // FIXME_OPENACC + if (std::is_same::value) { + GTEST_SKIP() << "skipping because OpenACC backend is currently not " + "able to abort from the 
device"; } #endif + test_view_out_of_bounds_access<ExecutionSpace>(); } + +#endif + +} // namespace diff --git a/core/unit_test/UnitTest_DeviceAndThreads.cpp b/core/unit_test/UnitTest_DeviceAndThreads.cpp index b522ac3e69b..25442146fba 100644 --- a/core/unit_test/UnitTest_DeviceAndThreads.cpp +++ b/core/unit_test/UnitTest_DeviceAndThreads.cpp @@ -19,22 +19,23 @@ #include #include -int get_device_count() { +int get_num_devices() { + int num_devices; #if defined(KOKKOS_ENABLE_CUDA) - int count; - KOKKOS_IMPL_CUDA_SAFE_CALL(cudaGetDeviceCount(&count)); - return count; + KOKKOS_IMPL_CUDA_SAFE_CALL(cudaGetDeviceCount(&num_devices)); #elif defined(KOKKOS_ENABLE_HIP) - int count; - KOKKOS_IMPL_HIP_SAFE_CALL(hipGetDevice(&count)); - return count; + KOKKOS_IMPL_HIP_SAFE_CALL(hipGetDeviceCount(&num_devices)); #elif defined(KOKKOS_ENABLE_OPENMPTARGET) - return omp_get_num_devices(); + num_devices = omp_get_num_devices(); #elif defined(KOKKOS_ENABLE_OPENACC) - return acc_get_num_devices(acc_get_device_type()); + num_devices = acc_get_num_devices(acc_get_device_type()); +#elif defined(KOKKOS_ENABLE_SYCL) + num_devices = sycl::device::get_devices(sycl::info::device_type::gpu).size(); #else - return 0; + num_devices = -1; #endif + assert(num_devices == Kokkos::num_devices()); + return num_devices; } int get_device_id() { @@ -44,15 +45,17 @@ int get_device_id() { #elif defined(KOKKOS_ENABLE_HIP) KOKKOS_IMPL_HIP_SAFE_CALL(hipGetDevice(&device_id)); #elif defined(KOKKOS_ENABLE_OPENMPTARGET) - device_id = omp_get_device_num(); + device_id = omp_get_default_device(); #elif defined(KOKKOS_ENABLE_OPENACC) - device_id = acc_get_device_num(acc_get_device_type()); + device_id = acc_get_device_num(acc_get_device_type()); #elif defined(KOKKOS_ENABLE_SYCL) - // FIXME_SYCL ? - assert(false); - return -2; + // Unable to query the underlying runtime because SYCL has no notion of a + // device currently being used. We go through the Kokkos runtime, which + // makes the assert below pointless, but it still lets us check that + // Kokkos selected the device we asked for from the Python tests. + device_id = Kokkos::device_id(); #else - device_id = -1; + device_id = -1; #endif assert(device_id == Kokkos::device_id()); return device_id; } @@ -68,6 +71,14 @@ int get_max_threads() { #endif } +int get_hwloc_enabled() { +#ifdef KOKKOS_ENABLE_HWLOC + return 1; +#else + return 0; +#endif +} + int get_num_threads() { int const num_threads = Kokkos::DefaultHostExecutionSpace().concurrency(); assert(num_threads == Kokkos::num_threads()); @@ -90,9 +101,10 @@ int print_flag(std::string const& flag) { KOKKOS_TEST_PRINT_FLAG(num_threads); KOKKOS_TEST_PRINT_FLAG(max_threads); KOKKOS_TEST_PRINT_FLAG(device_id); - KOKKOS_TEST_PRINT_FLAG(device_count); + KOKKOS_TEST_PRINT_FLAG(num_devices); KOKKOS_TEST_PRINT_FLAG(disable_warnings); KOKKOS_TEST_PRINT_FLAG(tune_internals); + KOKKOS_TEST_PRINT_FLAG(hwloc_enabled); #undef KOKKOS_TEST_PRINT_FLAG diff --git a/core/unit_test/configuration/test-code/test_config_arch_list.bash b/core/unit_test/configuration/test-code/test_config_arch_list.bash index 8fe8e2b5ece..8bc8ef21cd0 100755 --- a/core/unit_test/configuration/test-code/test_config_arch_list.bash +++ b/core/unit_test/configuration/test-code/test_config_arch_list.bash @@ -4,7 +4,7 @@ HostArch=(SNB HSW SKX KNL) DeviceArch=(Kepler35 Kepler37 Pascal60 Pascal61 Volta70) if [ ! 
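The reworked get_num_devices/get_device_id helpers funnel every backend through one invariant: whatever the native runtime reports must agree with what Kokkos itself reports. Host-only builds report -1 devices, matching the fallback branch above. The same invariant can be probed directly:

```cpp
#include <Kokkos_Core.hpp>
#include <iostream>

int main(int argc, char* argv[]) {
  Kokkos::initialize(argc, argv);
  {
    // -1 when no device backend is enabled, per the fallback above.
    std::cout << "num_devices: " << Kokkos::num_devices() << '\n';
    std::cout << "device_id:   " << Kokkos::device_id() << '\n';
  }
  Kokkos::finalize();
}
```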
-z "$KOKKOS_HOST_ARCH_TEST" ]; then export KOKKOS_ARCH_TEST=1 - HostArch=(WSM SNB HSW SKX WSM AMDAVX ARMv80 ARMv81 BDW KNC KNL BGQ Power7 Power8 Power9 Zen Zen2 Zen3 ARMv8_ThunderX ARMv8_ThunderX2) + HostArch=(SNB HSW SKX AMDAVX ARMv80 ARMv81 BDW KNC KNL Power8 Power9 Zen Zen2 Zen3 ARMv8_ThunderX ARMv8_ThunderX2) DeviceArch=() fi diff --git a/core/unit_test/cuda/TestCuda_InterOp_StreamsMultiGPU.cpp b/core/unit_test/cuda/TestCuda_InterOp_StreamsMultiGPU.cpp new file mode 100644 index 00000000000..d94735ceb23 --- /dev/null +++ b/core/unit_test/cuda/TestCuda_InterOp_StreamsMultiGPU.cpp @@ -0,0 +1,268 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#include +#include + +namespace { + +struct StreamsAndDevices { + std::array streams; + std::array devices; + + StreamsAndDevices() { + int n_devices; + KOKKOS_IMPL_CUDA_SAFE_CALL(cudaGetDeviceCount(&n_devices)); + + devices = {0, n_devices - 1}; + for (int i = 0; i < 2; ++i) { + KOKKOS_IMPL_CUDA_SAFE_CALL(cudaSetDevice(devices[i])); + KOKKOS_IMPL_CUDA_SAFE_CALL(cudaStreamCreate(&streams[i])); + } + } + StreamsAndDevices(const StreamsAndDevices &) = delete; + StreamsAndDevices &operator=(const StreamsAndDevices &) = delete; + ~StreamsAndDevices() { + for (int i = 0; i < 2; ++i) { + KOKKOS_IMPL_CUDA_SAFE_CALL(cudaSetDevice(devices[i])); + KOKKOS_IMPL_CUDA_SAFE_CALL(cudaStreamDestroy(streams[i])); + } + } +}; + +std::array get_execution_spaces( + const StreamsAndDevices &streams_and_devices) { + TEST_EXECSPACE exec0(streams_and_devices.streams[0]); + TEST_EXECSPACE exec1(streams_and_devices.streams[1]); + + // Must return void to use ASSERT_EQ + [&]() { + ASSERT_EQ(exec0.cuda_device(), streams_and_devices.devices[0]); + ASSERT_EQ(exec1.cuda_device(), streams_and_devices.devices[1]); + }(); + + return {exec0, exec1}; +} + +// Test Interoperability with Cuda Streams +void test_policies(TEST_EXECSPACE exec0, Kokkos::View v0, + TEST_EXECSPACE exec, Kokkos::View v) { + using MemorySpace = typename TEST_EXECSPACE::memory_space; + + Kokkos::deep_copy(exec, v, 5); + Kokkos::deep_copy(exec0, v0, 5); + + Kokkos::deep_copy(v, v0); + + int sum; + int sum0; + + Kokkos::parallel_for("Test::cuda::raw_cuda_stream::Range_0", + Kokkos::RangePolicy(exec0, 0, 100), + Test::FunctorRange(v0)); + Kokkos::parallel_for("Test::cuda::raw_cuda_stream::Range", + Kokkos::RangePolicy(exec, 0, 100), + Test::FunctorRange(v)); + Kokkos::parallel_reduce( + "Test::cuda::raw_cuda_stream::RangeReduce_0", + Kokkos::RangePolicy>(exec0, + 0, 100), + Test::FunctorRangeReduce(v0), sum0); + Kokkos::parallel_reduce( + "Test::cuda::raw_cuda_stream::RangeReduce", + Kokkos::RangePolicy>(exec, 0, + 100), + Test::FunctorRangeReduce(v), sum); + ASSERT_EQ(600, sum0); + ASSERT_EQ(600, sum); + + Kokkos::parallel_for("Test::cuda::raw_cuda_stream::MDRange_0", + Kokkos::MDRangePolicy>( + exec0, {0, 0}, {10, 10}), + Test::FunctorMDRange(v0)); + Kokkos::parallel_for("Test::cuda::raw_cuda_stream::MDRange", + Kokkos::MDRangePolicy>( + exec, {0, 0}, {10, 10}), + Test::FunctorMDRange(v)); + 
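get_execution_spaces above shows the core interop contract: a TEST_EXECSPACE (Kokkos::Cuda) instance wraps a user-provided cudaStream_t without taking ownership, so the caller keeps the stream alive and destroys it afterwards. Reduced to its essentials (a sketch assuming a single stream on a single device):

```cpp
#include <Kokkos_Core.hpp>

int main(int argc, char* argv[]) {
  Kokkos::initialize(argc, argv);
  {
    cudaStream_t stream;
    KOKKOS_IMPL_CUDA_SAFE_CALL(cudaStreamCreate(&stream));
    {
      Kokkos::Cuda exec(stream);  // non-owning wrapper around the stream
      Kokkos::View<int*, Kokkos::CudaSpace> v(
          Kokkos::view_alloc("v", exec), 100);
      Kokkos::parallel_for(
          "fill", Kokkos::RangePolicy<Kokkos::Cuda>(exec, 0, 100),
          KOKKOS_LAMBDA(int i) { v(i) = i; });
      exec.fence();  // waits on this stream only, not the whole device
    }
    KOKKOS_IMPL_CUDA_SAFE_CALL(cudaStreamDestroy(stream));
  }
  Kokkos::finalize();
}
```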
Kokkos::parallel_reduce("Test::cuda::raw_cuda_stream::MDRangeReduce_0", + Kokkos::MDRangePolicy, + Kokkos::LaunchBounds<128, 2>>( + exec0, {0, 0}, {10, 10}), + Test::FunctorMDRangeReduce(v0), sum0); + Kokkos::parallel_reduce("Test::cuda::raw_cuda_stream::MDRangeReduce", + Kokkos::MDRangePolicy, + Kokkos::LaunchBounds<128, 2>>( + exec, {0, 0}, {10, 10}), + Test::FunctorMDRangeReduce(v), sum); + ASSERT_EQ(700, sum0); + ASSERT_EQ(700, sum); + + Kokkos::parallel_for("Test::cuda::raw_cuda_stream::Team_0", + Kokkos::TeamPolicy(exec0, 10, 10), + Test::FunctorTeam(v0)); + Kokkos::parallel_for("Test::cuda::raw_cuda_stream::Team", + Kokkos::TeamPolicy(exec, 10, 10), + Test::FunctorTeam(v)); + Kokkos::parallel_reduce( + "Test::cuda::raw_cuda_stream::Team_0", + Kokkos::TeamPolicy>(exec0, + 10, 10), + Test::FunctorTeamReduce(v0), sum0); + Kokkos::parallel_reduce( + "Test::cuda::raw_cuda_stream::Team", + Kokkos::TeamPolicy>(exec, 10, + 10), + Test::FunctorTeamReduce(v), sum); + ASSERT_EQ(800, sum0); + ASSERT_EQ(800, sum); +} + +TEST(cuda_multi_gpu, managed_views) { + StreamsAndDevices streams_and_devices; + { + std::array execs = + get_execution_spaces(streams_and_devices); + + Kokkos::View view0( + Kokkos::view_alloc("v0", execs[0]), 100); + Kokkos::View view(Kokkos::view_alloc("v", execs[1]), + 100); + + test_policies(execs[0], view0, execs[1], view); + } +} + +TEST(cuda_multi_gpu, unmanaged_views) { + StreamsAndDevices streams_and_devices; + { + std::array execs = + get_execution_spaces(streams_and_devices); + + KOKKOS_IMPL_CUDA_SAFE_CALL(cudaSetDevice(execs[0].cuda_device())); + int *p0; + KOKKOS_IMPL_CUDA_SAFE_CALL( + cudaMalloc(reinterpret_cast(&p0), sizeof(int) * 100)); + Kokkos::View view0(p0, 100); + + KOKKOS_IMPL_CUDA_SAFE_CALL(cudaSetDevice(execs[1].cuda_device())); + int *p; + KOKKOS_IMPL_CUDA_SAFE_CALL( + cudaMalloc(reinterpret_cast(&p), sizeof(int) * 100)); + Kokkos::View view(p, 100); + + test_policies(execs[0], view0, execs[1], view); + KOKKOS_IMPL_CUDA_SAFE_CALL(cudaFree(p0)); + KOKKOS_IMPL_CUDA_SAFE_CALL(cudaFree(p)); + } +} + +struct ScratchFunctor { + int scratch_size; + int R; + + ScratchFunctor(int scratch_size_, int R_) + : scratch_size(scratch_size_), R(R_) {} + + KOKKOS_FUNCTION + void operator()(const Kokkos::TeamPolicy::member_type &team, + int &error_accum) const { + Kokkos::View scratch_mem( + team.team_scratch(1), scratch_size); + + // Initialize scratch memory + Kokkos::parallel_for(Kokkos::TeamVectorRange(team, 0, scratch_size), + [&](int i) { scratch_mem(i) = 0; }); + team.team_barrier(); + + // Increment each entry in scratch memory R times + for (int r = 0; r < R; ++r) { + Kokkos::parallel_for(Kokkos::TeamVectorRange(team, 0, scratch_size), + [&](int i) { scratch_mem(i) += 1; }); + } + team.team_barrier(); + + // Check that each scratch entry has been incremented exactly R times + int team_error_accum; + auto R_loc = R; // avoid implicit capture of this + Kokkos::parallel_reduce( + Kokkos::TeamVectorRange(team, 0, scratch_size), + [&](int i, int &tsum) { + if (scratch_mem(i) != R_loc) { + tsum += 1; + } + }, + team_error_accum); + Kokkos::single(Kokkos::PerTeam(team), + [&]() { error_accum += team_error_accum; }); + } +}; + +void test_scratch(TEST_EXECSPACE exec0, TEST_EXECSPACE exec1) { + constexpr int N = 10; + constexpr int R = 1000; + constexpr int scratch_size = 100; + using ScratchType = Kokkos::View; + + // Test allocating and using scratch space + ScratchFunctor f(scratch_size, R); + + auto policy0 = + Kokkos::TeamPolicy(exec0, N, 10) + .set_scratch_size( + 1, 
Kokkos::PerTeam(ScratchType::shmem_size(scratch_size))); + auto policy1 = + Kokkos::TeamPolicy(exec1, N, 10) + .set_scratch_size( + 1, Kokkos::PerTeam(ScratchType::shmem_size(scratch_size))); + + int error0, error1; + + Kokkos::parallel_reduce("test_scratch_device_0", policy0, f, error0); + Kokkos::parallel_reduce("test_scratch_device_1", policy1, f, error1); + ASSERT_EQ(error0, 0); + ASSERT_EQ(error1, 0); + + // Request larger scratch size to trigger a realloc and test + const auto new_scratch_size = scratch_size + 10; + ScratchFunctor f_more_scratch(new_scratch_size, R); + + auto policy0_more_scratch = + Kokkos::TeamPolicy(exec0, N, 10) + .set_scratch_size( + 1, Kokkos::PerTeam(ScratchType::shmem_size(new_scratch_size))); + auto policy1_more_scratch = + Kokkos::TeamPolicy(exec1, N, 10) + .set_scratch_size( + 1, Kokkos::PerTeam(ScratchType::shmem_size(new_scratch_size))); + + Kokkos::parallel_reduce("test_realloc_scratch_device_0", policy0_more_scratch, + f_more_scratch, error0); + Kokkos::parallel_reduce("test_realloc_scratch_device_1", policy1_more_scratch, + f_more_scratch, error1); + ASSERT_EQ(error0, 0); + ASSERT_EQ(error1, 0); +} + +TEST(cuda_multi_gpu, scratch_space) { + StreamsAndDevices streams_and_devices; + { + std::array execs = + get_execution_spaces(streams_and_devices); + + test_scratch(execs[0], execs[1]); + } +} +} // namespace diff --git a/core/unit_test/cuda/TestCuda_Spaces.cpp b/core/unit_test/cuda/TestCuda_Spaces.cpp index ae603101abb..11fe6b8555b 100644 --- a/core/unit_test/cuda/TestCuda_Spaces.cpp +++ b/core/unit_test/cuda/TestCuda_Spaces.cpp @@ -29,200 +29,166 @@ __global__ void test_cuda_spaces_int_value(int *ptr) { TEST(cuda, space_access) { static_assert(Kokkos::Impl::MemorySpaceAccess::assignable, - ""); + Kokkos::HostSpace>::assignable); static_assert( Kokkos::Impl::MemorySpaceAccess::assignable, - ""); + Kokkos::CudaHostPinnedSpace>::assignable); - static_assert(!Kokkos::Impl::MemorySpaceAccess::assignable, - ""); + static_assert( + !Kokkos::Impl::MemorySpaceAccess::assignable); - static_assert(!Kokkos::Impl::MemorySpaceAccess::accessible, - ""); + static_assert( + !Kokkos::Impl::MemorySpaceAccess::accessible); static_assert( !Kokkos::Impl::MemorySpaceAccess::assignable, - ""); + Kokkos::CudaUVMSpace>::assignable); static_assert( Kokkos::Impl::MemorySpaceAccess::accessible, - ""); + Kokkos::CudaUVMSpace>::accessible); //-------------------------------------- static_assert(Kokkos::Impl::MemorySpaceAccess::assignable, - ""); + Kokkos::CudaSpace>::assignable); static_assert( Kokkos::Impl::MemorySpaceAccess::assignable, - ""); + Kokkos::CudaUVMSpace>::assignable); - static_assert( - !Kokkos::Impl::MemorySpaceAccess::assignable, - ""); + static_assert(!Kokkos::Impl::MemorySpaceAccess< + Kokkos::CudaSpace, Kokkos::CudaHostPinnedSpace>::assignable); static_assert( Kokkos::Impl::MemorySpaceAccess::accessible, - ""); + Kokkos::CudaHostPinnedSpace>::accessible); - static_assert(!Kokkos::Impl::MemorySpaceAccess::assignable, - ""); + static_assert( + !Kokkos::Impl::MemorySpaceAccess::assignable); - static_assert(!Kokkos::Impl::MemorySpaceAccess::accessible, - ""); + static_assert( + !Kokkos::Impl::MemorySpaceAccess::accessible); //-------------------------------------- static_assert( Kokkos::Impl::MemorySpaceAccess::assignable, - ""); + Kokkos::CudaUVMSpace>::assignable); - static_assert(!Kokkos::Impl::MemorySpaceAccess::assignable, - ""); + static_assert( + !Kokkos::Impl::MemorySpaceAccess::assignable); static_assert(Kokkos::Impl::MemorySpaceAccess::accessible, - ""); + 
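The scratch tests above exercise the standard two-step pattern: reserve per-team level-1 scratch on the policy, then bind an unmanaged scratch-space View inside the kernel. Minimal form (Kokkos::AUTO team size is an arbitrary choice here):

```cpp
#include <Kokkos_Core.hpp>

int main(int argc, char* argv[]) {
  Kokkos::initialize(argc, argv);
  {
    using policy_type  = Kokkos::TeamPolicy<Kokkos::Cuda>;
    using member_type  = policy_type::member_type;
    using scratch_view =
        Kokkos::View<int*, Kokkos::Cuda::scratch_memory_space,
                     Kokkos::MemoryTraits<Kokkos::Unmanaged>>;
    constexpr int scratch_len = 100;

    // Step 1: tell the policy how much level-1 scratch each team needs.
    auto policy = policy_type(Kokkos::Cuda{}, 10, Kokkos::AUTO)
                      .set_scratch_size(
                          1, Kokkos::PerTeam(
                                 scratch_view::shmem_size(scratch_len)));

    // Step 2: carve a View out of the team's scratch inside the kernel.
    Kokkos::parallel_for(
        "scratch_demo", policy, KOKKOS_LAMBDA(const member_type& team) {
          scratch_view mem(team.team_scratch(1), scratch_len);
          Kokkos::parallel_for(Kokkos::TeamVectorRange(team, scratch_len),
                               [&](int i) { mem(i) = i; });
        });
    Kokkos::fence();
  }
  Kokkos::finalize();
}
```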
Kokkos::CudaSpace>::accessible); - static_assert(!Kokkos::Impl::MemorySpaceAccess::assignable, - ""); - - static_assert(!Kokkos::Impl::MemorySpaceAccess::accessible, - ""); + static_assert( + !Kokkos::Impl::MemorySpaceAccess::assignable); static_assert( !Kokkos::Impl::MemorySpaceAccess::assignable, - ""); + Kokkos::HostSpace>::accessible); + + static_assert(!Kokkos::Impl::MemorySpaceAccess< + Kokkos::CudaUVMSpace, Kokkos::CudaHostPinnedSpace>::assignable); static_assert( Kokkos::Impl::MemorySpaceAccess::accessible, - ""); + Kokkos::CudaHostPinnedSpace>::accessible); //-------------------------------------- static_assert( Kokkos::Impl::MemorySpaceAccess::assignable, - ""); + Kokkos::CudaHostPinnedSpace>::assignable); - static_assert(!Kokkos::Impl::MemorySpaceAccess::assignable, - ""); + static_assert( + !Kokkos::Impl::MemorySpaceAccess::assignable); static_assert(Kokkos::Impl::MemorySpaceAccess::accessible, - ""); + Kokkos::HostSpace>::accessible); - static_assert(!Kokkos::Impl::MemorySpaceAccess::assignable, - ""); + static_assert( + !Kokkos::Impl::MemorySpaceAccess::assignable); - static_assert(!Kokkos::Impl::MemorySpaceAccess::accessible, - ""); + static_assert( + !Kokkos::Impl::MemorySpaceAccess::accessible); static_assert( !Kokkos::Impl::MemorySpaceAccess::assignable, - ""); + Kokkos::CudaUVMSpace>::assignable); static_assert( Kokkos::Impl::MemorySpaceAccess::accessible, - ""); + Kokkos::CudaUVMSpace>::accessible); //-------------------------------------- static_assert( - !Kokkos::SpaceAccessibility::accessible, - ""); + !Kokkos::SpaceAccessibility::accessible); static_assert( - Kokkos::SpaceAccessibility::accessible, - ""); + Kokkos::SpaceAccessibility::accessible); static_assert(Kokkos::SpaceAccessibility::accessible, - ""); + Kokkos::CudaUVMSpace>::accessible); static_assert( Kokkos::SpaceAccessibility::accessible, - ""); + Kokkos::CudaHostPinnedSpace>::accessible); static_assert(!Kokkos::SpaceAccessibility::accessible, - ""); + Kokkos::CudaSpace>::accessible); static_assert(Kokkos::SpaceAccessibility::accessible, - ""); + Kokkos::CudaUVMSpace>::accessible); static_assert( Kokkos::SpaceAccessibility::accessible, - ""); + Kokkos::CudaHostPinnedSpace>::accessible); static_assert(std::is_same::Space, - Kokkos::HostSpace>::value, - ""); + Kokkos::HostSpace>::value); static_assert( std::is_same::Space, Kokkos::Device>::value, - ""); + Kokkos::CudaUVMSpace>>::value); static_assert( std::is_same::Space, - Kokkos::CudaHostPinnedSpace>::value, - ""); + Kokkos::CudaHostPinnedSpace>::value); static_assert(std::is_same, Kokkos::Device>::value, - ""); + Kokkos::CudaUVMSpace>>::value); static_assert( Kokkos::SpaceAccessibility::Space, - Kokkos::HostSpace>::accessible, - ""); + Kokkos::HostSpace>::accessible); static_assert(Kokkos::SpaceAccessibility< - Kokkos::Impl::HostMirror::Space, - Kokkos::HostSpace>::accessible, - ""); + Kokkos::Impl::HostMirror::Space, + Kokkos::HostSpace>::accessible); static_assert(Kokkos::SpaceAccessibility< - Kokkos::Impl::HostMirror::Space, - Kokkos::HostSpace>::accessible, - ""); + Kokkos::Impl::HostMirror::Space, + Kokkos::HostSpace>::accessible); - static_assert( - Kokkos::SpaceAccessibility< - Kokkos::Impl::HostMirror::Space, - Kokkos::HostSpace>::accessible, - ""); + static_assert(Kokkos::SpaceAccessibility< + Kokkos::Impl::HostMirror::Space, + Kokkos::HostSpace>::accessible); #ifdef KOKKOS_ENABLE_CUDA_UVM using uvm_view = Kokkos::View; static_assert(std::is_same::Space; static_assert(Kokkos::SpaceAccessibility::accessible, - ""); + 
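Two distinct relations are asserted throughout this test: accessible (code running in the first space may dereference allocations from the second) and assignable (a View of the second space may be assigned to a View type of the first without a deep copy). Two representative pairs from the assertions above, restated standalone:

```cpp
#include <Kokkos_Core.hpp>

// Host code can freely reach pinned host memory...
static_assert(Kokkos::Impl::MemorySpaceAccess<
              Kokkos::HostSpace, Kokkos::CudaHostPinnedSpace>::assignable);
// ...but must never dereference device-resident CudaSpace allocations.
static_assert(!Kokkos::Impl::MemorySpaceAccess<
              Kokkos::HostSpace, Kokkos::CudaSpace>::accessible);

int main() {}
```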
Kokkos::HostSpace>::accessible); static_assert( - Kokkos::SpaceAccessibility::accessible, - ""); + Kokkos::SpaceAccessibility::accessible); static_assert( - Kokkos::SpaceAccessibility::accessible, - ""); + Kokkos::SpaceAccessibility::accessible); } } // namespace Test diff --git a/core/unit_test/default/TestDefaultDeviceTypeInit_1.cpp b/core/unit_test/default/TestDefaultDeviceTypeInit_1.cpp deleted file mode 100644 index 348b9feeab0..00000000000 --- a/core/unit_test/default/TestDefaultDeviceTypeInit_1.cpp +++ /dev/null @@ -1,18 +0,0 @@ -//@HEADER -// ************************************************************************ -// -// Kokkos v. 4.0 -// Copyright (2022) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. -// -// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. -// See https://kokkos.org/LICENSE for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//@HEADER - -#define KOKKOS_DEFAULTDEVICETYPE_INIT_TEST_01 -#include diff --git a/core/unit_test/default/TestDefaultDeviceTypeInit_10.cpp b/core/unit_test/default/TestDefaultDeviceTypeInit_10.cpp deleted file mode 100644 index a77a55ea653..00000000000 --- a/core/unit_test/default/TestDefaultDeviceTypeInit_10.cpp +++ /dev/null @@ -1,18 +0,0 @@ -//@HEADER -// ************************************************************************ -// -// Kokkos v. 4.0 -// Copyright (2022) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. -// -// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. -// See https://kokkos.org/LICENSE for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//@HEADER - -#define KOKKOS_DEFAULTDEVICETYPE_INIT_TEST_10 -#include diff --git a/core/unit_test/default/TestDefaultDeviceTypeInit_11.cpp b/core/unit_test/default/TestDefaultDeviceTypeInit_11.cpp deleted file mode 100644 index 1b6a140920c..00000000000 --- a/core/unit_test/default/TestDefaultDeviceTypeInit_11.cpp +++ /dev/null @@ -1,18 +0,0 @@ -//@HEADER -// ************************************************************************ -// -// Kokkos v. 4.0 -// Copyright (2022) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. -// -// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. -// See https://kokkos.org/LICENSE for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//@HEADER - -#define KOKKOS_DEFAULTDEVICETYPE_INIT_TEST_11 -#include diff --git a/core/unit_test/default/TestDefaultDeviceTypeInit_12.cpp b/core/unit_test/default/TestDefaultDeviceTypeInit_12.cpp deleted file mode 100644 index 316bc85526f..00000000000 --- a/core/unit_test/default/TestDefaultDeviceTypeInit_12.cpp +++ /dev/null @@ -1,18 +0,0 @@ -//@HEADER -// ************************************************************************ -// -// Kokkos v. 4.0 -// Copyright (2022) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. 
-// -// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. -// See https://kokkos.org/LICENSE for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//@HEADER - -#define KOKKOS_DEFAULTDEVICETYPE_INIT_TEST_12 -#include diff --git a/core/unit_test/default/TestDefaultDeviceTypeInit_13.cpp b/core/unit_test/default/TestDefaultDeviceTypeInit_13.cpp deleted file mode 100644 index 6344960a1cf..00000000000 --- a/core/unit_test/default/TestDefaultDeviceTypeInit_13.cpp +++ /dev/null @@ -1,18 +0,0 @@ -//@HEADER -// ************************************************************************ -// -// Kokkos v. 4.0 -// Copyright (2022) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. -// -// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. -// See https://kokkos.org/LICENSE for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//@HEADER - -#define KOKKOS_DEFAULTDEVICETYPE_INIT_TEST_13 -#include diff --git a/core/unit_test/default/TestDefaultDeviceTypeInit_14.cpp b/core/unit_test/default/TestDefaultDeviceTypeInit_14.cpp deleted file mode 100644 index 4515174b82b..00000000000 --- a/core/unit_test/default/TestDefaultDeviceTypeInit_14.cpp +++ /dev/null @@ -1,18 +0,0 @@ -//@HEADER -// ************************************************************************ -// -// Kokkos v. 4.0 -// Copyright (2022) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. -// -// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. -// See https://kokkos.org/LICENSE for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//@HEADER - -#define KOKKOS_DEFAULTDEVICETYPE_INIT_TEST_14 -#include diff --git a/core/unit_test/default/TestDefaultDeviceTypeInit_15.cpp b/core/unit_test/default/TestDefaultDeviceTypeInit_15.cpp deleted file mode 100644 index 7ead50f0944..00000000000 --- a/core/unit_test/default/TestDefaultDeviceTypeInit_15.cpp +++ /dev/null @@ -1,18 +0,0 @@ -//@HEADER -// ************************************************************************ -// -// Kokkos v. 4.0 -// Copyright (2022) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. -// -// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. -// See https://kokkos.org/LICENSE for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//@HEADER - -#define KOKKOS_DEFAULTDEVICETYPE_INIT_TEST_15 -#include diff --git a/core/unit_test/default/TestDefaultDeviceTypeInit_16.cpp b/core/unit_test/default/TestDefaultDeviceTypeInit_16.cpp deleted file mode 100644 index e12b9b3894a..00000000000 --- a/core/unit_test/default/TestDefaultDeviceTypeInit_16.cpp +++ /dev/null @@ -1,18 +0,0 @@ -//@HEADER -// ************************************************************************ -// -// Kokkos v. 4.0 -// Copyright (2022) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. 
-// -// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. -// See https://kokkos.org/LICENSE for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//@HEADER - -#define KOKKOS_DEFAULTDEVICETYPE_INIT_TEST_16 -#include diff --git a/core/unit_test/default/TestDefaultDeviceTypeInit_17.cpp b/core/unit_test/default/TestDefaultDeviceTypeInit_17.cpp deleted file mode 100644 index 959d0ab7503..00000000000 --- a/core/unit_test/default/TestDefaultDeviceTypeInit_17.cpp +++ /dev/null @@ -1,18 +0,0 @@ -//@HEADER -// ************************************************************************ -// -// Kokkos v. 4.0 -// Copyright (2022) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. -// -// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. -// See https://kokkos.org/LICENSE for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//@HEADER - -#define KOKKOS_DEFAULTDEVICETYPE_INIT_TEST_17 -#include diff --git a/core/unit_test/default/TestDefaultDeviceTypeInit_18.cpp b/core/unit_test/default/TestDefaultDeviceTypeInit_18.cpp deleted file mode 100644 index 07d841519dc..00000000000 --- a/core/unit_test/default/TestDefaultDeviceTypeInit_18.cpp +++ /dev/null @@ -1,18 +0,0 @@ -//@HEADER -// ************************************************************************ -// -// Kokkos v. 4.0 -// Copyright (2022) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. -// -// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. -// See https://kokkos.org/LICENSE for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//@HEADER - -#define KOKKOS_DEFAULTDEVICETYPE_INIT_TEST_18 -#include diff --git a/core/unit_test/default/TestDefaultDeviceTypeInit_2.cpp b/core/unit_test/default/TestDefaultDeviceTypeInit_2.cpp deleted file mode 100644 index 042a515b16a..00000000000 --- a/core/unit_test/default/TestDefaultDeviceTypeInit_2.cpp +++ /dev/null @@ -1,18 +0,0 @@ -//@HEADER -// ************************************************************************ -// -// Kokkos v. 4.0 -// Copyright (2022) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. -// -// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. -// See https://kokkos.org/LICENSE for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//@HEADER - -#define KOKKOS_DEFAULTDEVICETYPE_INIT_TEST_02 -#include diff --git a/core/unit_test/default/TestDefaultDeviceTypeInit_3.cpp b/core/unit_test/default/TestDefaultDeviceTypeInit_3.cpp deleted file mode 100644 index dba401e5bcf..00000000000 --- a/core/unit_test/default/TestDefaultDeviceTypeInit_3.cpp +++ /dev/null @@ -1,18 +0,0 @@ -//@HEADER -// ************************************************************************ -// -// Kokkos v. 4.0 -// Copyright (2022) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. 
-// -// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. -// See https://kokkos.org/LICENSE for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//@HEADER - -#define KOKKOS_DEFAULTDEVICETYPE_INIT_TEST_03 -#include diff --git a/core/unit_test/default/TestDefaultDeviceTypeInit_4.cpp b/core/unit_test/default/TestDefaultDeviceTypeInit_4.cpp deleted file mode 100644 index a44c58bdb55..00000000000 --- a/core/unit_test/default/TestDefaultDeviceTypeInit_4.cpp +++ /dev/null @@ -1,18 +0,0 @@ -//@HEADER -// ************************************************************************ -// -// Kokkos v. 4.0 -// Copyright (2022) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. -// -// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. -// See https://kokkos.org/LICENSE for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//@HEADER - -#define KOKKOS_DEFAULTDEVICETYPE_INIT_TEST_04 -#include diff --git a/core/unit_test/default/TestDefaultDeviceTypeInit_5.cpp b/core/unit_test/default/TestDefaultDeviceTypeInit_5.cpp deleted file mode 100644 index cac0841dd83..00000000000 --- a/core/unit_test/default/TestDefaultDeviceTypeInit_5.cpp +++ /dev/null @@ -1,18 +0,0 @@ -//@HEADER -// ************************************************************************ -// -// Kokkos v. 4.0 -// Copyright (2022) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. -// -// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. -// See https://kokkos.org/LICENSE for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//@HEADER - -#define KOKKOS_DEFAULTDEVICETYPE_INIT_TEST_05 -#include diff --git a/core/unit_test/default/TestDefaultDeviceTypeInit_6.cpp b/core/unit_test/default/TestDefaultDeviceTypeInit_6.cpp deleted file mode 100644 index bafe3b3fd2a..00000000000 --- a/core/unit_test/default/TestDefaultDeviceTypeInit_6.cpp +++ /dev/null @@ -1,18 +0,0 @@ -//@HEADER -// ************************************************************************ -// -// Kokkos v. 4.0 -// Copyright (2022) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. -// -// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. -// See https://kokkos.org/LICENSE for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//@HEADER - -#define KOKKOS_DEFAULTDEVICETYPE_INIT_TEST_06 -#include diff --git a/core/unit_test/default/TestDefaultDeviceTypeInit_7.cpp b/core/unit_test/default/TestDefaultDeviceTypeInit_7.cpp deleted file mode 100644 index 3a4dd9d2533..00000000000 --- a/core/unit_test/default/TestDefaultDeviceTypeInit_7.cpp +++ /dev/null @@ -1,18 +0,0 @@ -//@HEADER -// ************************************************************************ -// -// Kokkos v. 4.0 -// Copyright (2022) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. 
-// -// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. -// See https://kokkos.org/LICENSE for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//@HEADER - -#define KOKKOS_DEFAULTDEVICETYPE_INIT_TEST_07 -#include diff --git a/core/unit_test/default/TestDefaultDeviceTypeInit_8.cpp b/core/unit_test/default/TestDefaultDeviceTypeInit_8.cpp deleted file mode 100644 index 4e92aae565a..00000000000 --- a/core/unit_test/default/TestDefaultDeviceTypeInit_8.cpp +++ /dev/null @@ -1,18 +0,0 @@ -//@HEADER -// ************************************************************************ -// -// Kokkos v. 4.0 -// Copyright (2022) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. -// -// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. -// See https://kokkos.org/LICENSE for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//@HEADER - -#define KOKKOS_DEFAULTDEVICETYPE_INIT_TEST_08 -#include diff --git a/core/unit_test/default/TestDefaultDeviceTypeInit_9.cpp b/core/unit_test/default/TestDefaultDeviceTypeInit_9.cpp deleted file mode 100644 index 44b8f3428d9..00000000000 --- a/core/unit_test/default/TestDefaultDeviceTypeInit_9.cpp +++ /dev/null @@ -1,18 +0,0 @@ -//@HEADER -// ************************************************************************ -// -// Kokkos v. 4.0 -// Copyright (2022) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. -// -// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. -// See https://kokkos.org/LICENSE for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//@HEADER - -#define KOKKOS_DEFAULTDEVICETYPE_INIT_TEST_09 -#include diff --git a/core/unit_test/hip/TestHIP_Memory_Requirements.cpp b/core/unit_test/hip/TestHIP_Memory_Requirements.cpp index 8c72e9f2972..a213453ea18 100644 --- a/core/unit_test/hip/TestHIP_Memory_Requirements.cpp +++ b/core/unit_test/hip/TestHIP_Memory_Requirements.cpp @@ -48,6 +48,9 @@ TEST(hip, memory_requirements) { // we want all user-facing memory in hip to be coarse grained. 
As of // today(07.01.22) the documentation is not reliable/correct, we test the // memory on the device and host + // FIXME_HIP + GTEST_SKIP() << "skipping the test because the CI on MI100 returns: error( " + "hipErrorInvalidValue)"; KOKKOS_TEST_MEMORY_COARSEGRAINEDNESS(Kokkos::HIPSpace, int, 10); KOKKOS_TEST_MEMORY_COARSEGRAINEDNESS(Kokkos::HIPHostPinnedSpace, int, 10); KOKKOS_TEST_MEMORY_COARSEGRAINEDNESS(Kokkos::HIPManagedSpace, int, 10); diff --git a/core/unit_test/hip/TestHIP_Spaces.cpp b/core/unit_test/hip/TestHIP_Spaces.cpp index 14fd4e28837..8f7499c244b 100644 --- a/core/unit_test/hip/TestHIP_Spaces.cpp +++ b/core/unit_test/hip/TestHIP_Spaces.cpp @@ -29,198 +29,164 @@ __global__ void test_hip_spaces_int_value(int *ptr) { TEST(hip, space_access) { static_assert(Kokkos::Impl::MemorySpaceAccess::assignable, - ""); + Kokkos::HostSpace>::assignable); static_assert( Kokkos::Impl::MemorySpaceAccess::assignable, - ""); + Kokkos::HIPHostPinnedSpace>::assignable); static_assert(!Kokkos::Impl::MemorySpaceAccess::assignable, - ""); + Kokkos::HIPSpace>::assignable); static_assert(!Kokkos::Impl::MemorySpaceAccess::accessible, - ""); + Kokkos::HIPSpace>::accessible); static_assert( !Kokkos::Impl::MemorySpaceAccess::assignable, - ""); + Kokkos::HIPManagedSpace>::assignable); static_assert( Kokkos::Impl::MemorySpaceAccess::accessible, - ""); + Kokkos::HIPManagedSpace>::accessible); //-------------------------------------- static_assert(Kokkos::Impl::MemorySpaceAccess::assignable, - ""); + Kokkos::HIPSpace>::assignable); static_assert( !Kokkos::Impl::MemorySpaceAccess::assignable, - ""); + Kokkos::HIPHostPinnedSpace>::assignable); static_assert( Kokkos::Impl::MemorySpaceAccess::accessible, - ""); + Kokkos::HIPHostPinnedSpace>::accessible); - static_assert(!Kokkos::Impl::MemorySpaceAccess::assignable, - ""); + static_assert( + !Kokkos::Impl::MemorySpaceAccess::assignable); - static_assert(!Kokkos::Impl::MemorySpaceAccess::accessible, - ""); + static_assert( + !Kokkos::Impl::MemorySpaceAccess::accessible); static_assert( Kokkos::Impl::MemorySpaceAccess::assignable, - ""); + Kokkos::HIPManagedSpace>::assignable); static_assert( Kokkos::Impl::MemorySpaceAccess::accessible, - ""); + Kokkos::HIPManagedSpace>::accessible); //-------------------------------------- static_assert( Kokkos::Impl::MemorySpaceAccess::assignable, - ""); + Kokkos::HIPHostPinnedSpace>::assignable); - static_assert(!Kokkos::Impl::MemorySpaceAccess::assignable, - ""); + static_assert( + !Kokkos::Impl::MemorySpaceAccess::assignable); static_assert(Kokkos::Impl::MemorySpaceAccess::accessible, - ""); + Kokkos::HostSpace>::accessible); static_assert(!Kokkos::Impl::MemorySpaceAccess::assignable, - ""); + Kokkos::HIPSpace>::assignable); static_assert(!Kokkos::Impl::MemorySpaceAccess::accessible, - ""); + Kokkos::HIPSpace>::accessible); static_assert( !Kokkos::Impl::MemorySpaceAccess::assignable, - ""); + Kokkos::HIPManagedSpace>::assignable); static_assert( Kokkos::Impl::MemorySpaceAccess::accessible, - ""); + Kokkos::HIPManagedSpace>::accessible); //-------------------------------------- static_assert( Kokkos::Impl::MemorySpaceAccess::assignable, - ""); + Kokkos::HIPManagedSpace>::assignable); - static_assert(!Kokkos::Impl::MemorySpaceAccess::assignable, - ""); + static_assert( + !Kokkos::Impl::MemorySpaceAccess::assignable); - static_assert(!Kokkos::Impl::MemorySpaceAccess::accessible, - ""); + static_assert( + !Kokkos::Impl::MemorySpaceAccess::accessible); static_assert(!Kokkos::Impl::MemorySpaceAccess::assignable, - ""); + 
Kokkos::HIPSpace>::assignable); static_assert(Kokkos::Impl::MemorySpaceAccess::accessible, - ""); + Kokkos::HIPSpace>::accessible); static_assert( !Kokkos::Impl::MemorySpaceAccess::assignable, - ""); + Kokkos::HIPHostPinnedSpace>::assignable); static_assert( Kokkos::Impl::MemorySpaceAccess::accessible, - ""); + Kokkos::HIPHostPinnedSpace>::accessible); //-------------------------------------- static_assert( - !Kokkos::SpaceAccessibility::accessible, - ""); + !Kokkos::SpaceAccessibility::accessible); static_assert( - Kokkos::SpaceAccessibility::accessible, - ""); + Kokkos::SpaceAccessibility::accessible); static_assert( Kokkos::SpaceAccessibility::accessible, - ""); + Kokkos::HIPHostPinnedSpace>::accessible); - static_assert(Kokkos::SpaceAccessibility::accessible, - ""); + static_assert( + Kokkos::SpaceAccessibility::accessible); static_assert(!Kokkos::SpaceAccessibility::accessible, - ""); + Kokkos::HIPSpace>::accessible); static_assert( Kokkos::SpaceAccessibility::accessible, - ""); + Kokkos::HIPHostPinnedSpace>::accessible); - static_assert(Kokkos::SpaceAccessibility::accessible, - ""); + static_assert( + Kokkos::SpaceAccessibility::accessible); static_assert(std::is_same::Space, - Kokkos::HostSpace>::value, - ""); + Kokkos::HostSpace>::value); static_assert( std::is_same::Space, - Kokkos::HIPHostPinnedSpace>::value, - ""); + Kokkos::HIPHostPinnedSpace>::value); static_assert( std::is_same::Space, Kokkos::Device>::value, - ""); + Kokkos::HIPManagedSpace>>::value); static_assert( Kokkos::SpaceAccessibility::Space, - Kokkos::HostSpace>::accessible, - ""); + Kokkos::HostSpace>::accessible); static_assert(Kokkos::SpaceAccessibility< - Kokkos::Impl::HostMirror::Space, - Kokkos::HostSpace>::accessible, - ""); + Kokkos::Impl::HostMirror::Space, + Kokkos::HostSpace>::accessible); static_assert(Kokkos::SpaceAccessibility< - Kokkos::Impl::HostMirror::Space, - Kokkos::HostSpace>::accessible, - ""); + Kokkos::Impl::HostMirror::Space, + Kokkos::HostSpace>::accessible); static_assert(Kokkos::SpaceAccessibility< - Kokkos::Impl::HostMirror::Space, - Kokkos::HostSpace>::accessible, - ""); + Kokkos::Impl::HostMirror::Space, + Kokkos::HostSpace>::accessible); } template diff --git a/core/unit_test/incremental/Test01_execspace.hpp b/core/unit_test/incremental/Test01_execspace.hpp index 25c7138ed3c..d7b2a57b442 100644 --- a/core/unit_test/incremental/Test01_execspace.hpp +++ b/core/unit_test/incremental/Test01_execspace.hpp @@ -62,8 +62,10 @@ struct TestIncrExecSpace { auto concurrency = ExecSpace().concurrency(); ASSERT_GT(concurrency, 0); +#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 int in_parallel = ExecSpace::in_parallel(); ASSERT_FALSE(in_parallel); +#endif const char* name = ExecSpace::name(); std::cout << name << std::endl; diff --git a/core/unit_test/openmp/TestOpenMP_PartitionMaster.cpp b/core/unit_test/openmp/TestOpenMP_PartitionMaster.cpp deleted file mode 100644 index 92b8032bf0c..00000000000 --- a/core/unit_test/openmp/TestOpenMP_PartitionMaster.cpp +++ /dev/null @@ -1,105 +0,0 @@ -//@HEADER -// ************************************************************************ -// -// Kokkos v. 4.0 -// Copyright (2022) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. -// -// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. -// See https://kokkos.org/LICENSE for license information. 
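The Test01_execspace hunk above wraps the call to the deprecated ExecSpace::in_parallel() in a KOKKOS_ENABLE_DEPRECATED_CODE_4 guard. A hedged sketch of the same guard pattern in user code, assuming a host execution space; check_not_nested is a hypothetical helper:

#include <Kokkos_Core.hpp>

void check_not_nested() {
#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4
  // in_parallel() is deprecated in Kokkos 4; only compile the call when
  // the backward-compatibility option is enabled at configure time.
  const bool nested = Kokkos::DefaultHostExecutionSpace::in_parallel();
  (void)nested;  // e.g. assert(!nested) when called outside a parallel region
#endif
}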
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//@HEADER - -#include -#include - -#include - -namespace Test { - -#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_3 -TEST(openmp, partition_master) { - using Mutex = Kokkos::Experimental::MasterLock; - - Mutex mtx; - int errors = 0; - - auto master = [&errors, &mtx](int /*partition_id*/, int /*num_partitions*/) { - const int pool_size = Kokkos::OpenMP().impl_thread_pool_size(); - - { - std::unique_lock lock(mtx); - if (Kokkos::OpenMP::in_parallel()) { - ++errors; - } - if (Kokkos::OpenMP::impl_thread_pool_rank() != 0) { - ++errors; - } - } - - { - int local_errors = 0; - Kokkos::parallel_reduce( - Kokkos::RangePolicy(0, 1000), - [pool_size](const int, int& errs) { - if (Kokkos::OpenMP().impl_thread_pool_size() != pool_size) { - ++errs; - } - }, - local_errors); - Kokkos::atomic_add(&errors, local_errors); - } - - Kokkos::Experimental::UniqueToken token; - - Kokkos::View count("", token.size()); - - Kokkos::parallel_for(Kokkos::RangePolicy(0, 1000), - [=](const int) { - int i = token.acquire(); - ++count[i]; - token.release(i); - }); - - Kokkos::View sum(""); - Kokkos::parallel_for( - Kokkos::RangePolicy(0, token.size()), - [=](const int i) { Kokkos::atomic_add(sum.data(), count[i]); }); - - if (sum() != 1000) { - Kokkos::atomic_add(&errors, 1); - } - }; - - master(0, 1); - - ASSERT_EQ(errors, 0); - - Kokkos::OpenMP::partition_master(master); - ASSERT_EQ(errors, 0); - - Kokkos::OpenMP::partition_master(master, 4, 0); - ASSERT_EQ(errors, 0); - - Kokkos::OpenMP::partition_master(master, 0, 4); - ASSERT_EQ(errors, 0); - - Kokkos::OpenMP::partition_master(master, 2, 2); - ASSERT_EQ(errors, 0); - - Kokkos::OpenMP::partition_master(master, 8, 0); - ASSERT_EQ(errors, 0); - - Kokkos::OpenMP::partition_master(master, 0, 8); - ASSERT_EQ(errors, 0); - - Kokkos::OpenMP::partition_master(master, 8, 8); - ASSERT_EQ(errors, 0); -} -#endif - -} // namespace Test diff --git a/core/unit_test/sycl/TestSYCL_Spaces.cpp b/core/unit_test/sycl/TestSYCL_Spaces.cpp index 914f8432488..a4fd053e83d 100644 --- a/core/unit_test/sycl/TestSYCL_Spaces.cpp +++ b/core/unit_test/sycl/TestSYCL_Spaces.cpp @@ -21,235 +21,192 @@ namespace Test { TEST(sycl, space_access) { static_assert(Kokkos::Impl::MemorySpaceAccess::assignable, - ""); + Kokkos::HostSpace>::assignable); static_assert(Kokkos::Impl::MemorySpaceAccess< - Kokkos::HostSpace, - Kokkos::Experimental::SYCLHostUSMSpace>::assignable, - ""); + Kokkos::HostSpace, + Kokkos::Experimental::SYCLHostUSMSpace>::assignable); static_assert(!Kokkos::Impl::MemorySpaceAccess< - Kokkos::HostSpace, - Kokkos::Experimental::SYCLDeviceUSMSpace>::assignable, - ""); + Kokkos::HostSpace, + Kokkos::Experimental::SYCLDeviceUSMSpace>::assignable); static_assert(!Kokkos::Impl::MemorySpaceAccess< - Kokkos::HostSpace, - Kokkos::Experimental::SYCLDeviceUSMSpace>::accessible, - ""); + Kokkos::HostSpace, + Kokkos::Experimental::SYCLDeviceUSMSpace>::accessible); static_assert(!Kokkos::Impl::MemorySpaceAccess< - Kokkos::HostSpace, - Kokkos::Experimental::SYCLSharedUSMSpace>::assignable, - ""); + Kokkos::HostSpace, + Kokkos::Experimental::SYCLSharedUSMSpace>::assignable); static_assert(Kokkos::Impl::MemorySpaceAccess< - Kokkos::HostSpace, - Kokkos::Experimental::SYCLSharedUSMSpace>::accessible, - ""); + Kokkos::HostSpace, + Kokkos::Experimental::SYCLSharedUSMSpace>::accessible); //-------------------------------------- static_assert(Kokkos::Impl::MemorySpaceAccess< - Kokkos::Experimental::SYCLDeviceUSMSpace, - 
Kokkos::Experimental::SYCLDeviceUSMSpace>::assignable, - ""); + Kokkos::Experimental::SYCLDeviceUSMSpace, + Kokkos::Experimental::SYCLDeviceUSMSpace>::assignable); static_assert(Kokkos::Impl::MemorySpaceAccess< - Kokkos::Experimental::SYCLDeviceUSMSpace, - Kokkos::Experimental::SYCLSharedUSMSpace>::assignable, - ""); + Kokkos::Experimental::SYCLDeviceUSMSpace, + Kokkos::Experimental::SYCLSharedUSMSpace>::assignable); static_assert(!Kokkos::Impl::MemorySpaceAccess< - Kokkos::Experimental::SYCLDeviceUSMSpace, - Kokkos::Experimental::SYCLHostUSMSpace>::assignable, - ""); + Kokkos::Experimental::SYCLDeviceUSMSpace, + Kokkos::Experimental::SYCLHostUSMSpace>::assignable); static_assert(Kokkos::Impl::MemorySpaceAccess< - Kokkos::Experimental::SYCLDeviceUSMSpace, - Kokkos::Experimental::SYCLHostUSMSpace>::accessible, - ""); + Kokkos::Experimental::SYCLDeviceUSMSpace, + Kokkos::Experimental::SYCLHostUSMSpace>::accessible); static_assert( !Kokkos::Impl::MemorySpaceAccess::assignable, - ""); + Kokkos::HostSpace>::assignable); static_assert( !Kokkos::Impl::MemorySpaceAccess::accessible, - ""); + Kokkos::HostSpace>::accessible); //-------------------------------------- static_assert(Kokkos::Impl::MemorySpaceAccess< - Kokkos::Experimental::SYCLSharedUSMSpace, - Kokkos::Experimental::SYCLSharedUSMSpace>::assignable, - ""); + Kokkos::Experimental::SYCLSharedUSMSpace, + Kokkos::Experimental::SYCLSharedUSMSpace>::assignable); static_assert(!Kokkos::Impl::MemorySpaceAccess< - Kokkos::Experimental::SYCLSharedUSMSpace, - Kokkos::Experimental::SYCLDeviceUSMSpace>::assignable, - ""); + Kokkos::Experimental::SYCLSharedUSMSpace, + Kokkos::Experimental::SYCLDeviceUSMSpace>::assignable); static_assert(Kokkos::Impl::MemorySpaceAccess< - Kokkos::Experimental::SYCLSharedUSMSpace, - Kokkos::Experimental::SYCLDeviceUSMSpace>::accessible, - ""); + Kokkos::Experimental::SYCLSharedUSMSpace, + Kokkos::Experimental::SYCLDeviceUSMSpace>::accessible); static_assert( !Kokkos::Impl::MemorySpaceAccess::assignable, - ""); + Kokkos::HostSpace>::assignable); static_assert( !Kokkos::Impl::MemorySpaceAccess::accessible, - ""); + Kokkos::HostSpace>::accessible); static_assert(!Kokkos::Impl::MemorySpaceAccess< - Kokkos::Experimental::SYCLSharedUSMSpace, - Kokkos::Experimental::SYCLHostUSMSpace>::assignable, - ""); + Kokkos::Experimental::SYCLSharedUSMSpace, + Kokkos::Experimental::SYCLHostUSMSpace>::assignable); static_assert(Kokkos::Impl::MemorySpaceAccess< - Kokkos::Experimental::SYCLSharedUSMSpace, - Kokkos::Experimental::SYCLHostUSMSpace>::accessible, - ""); + Kokkos::Experimental::SYCLSharedUSMSpace, + Kokkos::Experimental::SYCLHostUSMSpace>::accessible); //-------------------------------------- static_assert(Kokkos::Impl::MemorySpaceAccess< - Kokkos::Experimental::SYCLHostUSMSpace, - Kokkos::Experimental::SYCLHostUSMSpace>::assignable, - ""); + Kokkos::Experimental::SYCLHostUSMSpace, + Kokkos::Experimental::SYCLHostUSMSpace>::assignable); static_assert( !Kokkos::Impl::MemorySpaceAccess::assignable, - ""); + Kokkos::HostSpace>::assignable); static_assert( Kokkos::Impl::MemorySpaceAccess::accessible, - ""); + Kokkos::HostSpace>::accessible); static_assert(!Kokkos::Impl::MemorySpaceAccess< - Kokkos::Experimental::SYCLHostUSMSpace, - Kokkos::Experimental::SYCLDeviceUSMSpace>::assignable, - ""); + Kokkos::Experimental::SYCLHostUSMSpace, + Kokkos::Experimental::SYCLDeviceUSMSpace>::assignable); static_assert(!Kokkos::Impl::MemorySpaceAccess< - Kokkos::Experimental::SYCLHostUSMSpace, - 
Kokkos::Experimental::SYCLDeviceUSMSpace>::accessible, - ""); + Kokkos::Experimental::SYCLHostUSMSpace, + Kokkos::Experimental::SYCLDeviceUSMSpace>::accessible); static_assert(!Kokkos::Impl::MemorySpaceAccess< - Kokkos::Experimental::SYCLHostUSMSpace, - Kokkos::Experimental::SYCLSharedUSMSpace>::assignable, - ""); + Kokkos::Experimental::SYCLHostUSMSpace, + Kokkos::Experimental::SYCLSharedUSMSpace>::assignable); static_assert(Kokkos::Impl::MemorySpaceAccess< - Kokkos::Experimental::SYCLHostUSMSpace, - Kokkos::Experimental::SYCLSharedUSMSpace>::accessible, - ""); + Kokkos::Experimental::SYCLHostUSMSpace, + Kokkos::Experimental::SYCLSharedUSMSpace>::accessible); //-------------------------------------- static_assert(!Kokkos::SpaceAccessibility::accessible, - ""); + Kokkos::HostSpace>::accessible); static_assert(Kokkos::SpaceAccessibility< - Kokkos::Experimental::SYCL, - Kokkos::Experimental::SYCLDeviceUSMSpace>::accessible, - ""); + Kokkos::Experimental::SYCL, + Kokkos::Experimental::SYCLDeviceUSMSpace>::accessible); static_assert(Kokkos::SpaceAccessibility< - Kokkos::Experimental::SYCL, - Kokkos::Experimental::SYCLSharedUSMSpace>::accessible, - ""); + Kokkos::Experimental::SYCL, + Kokkos::Experimental::SYCLSharedUSMSpace>::accessible); static_assert(Kokkos::SpaceAccessibility< - Kokkos::Experimental::SYCL, - Kokkos::Experimental::SYCLHostUSMSpace>::accessible, - ""); + Kokkos::Experimental::SYCL, + Kokkos::Experimental::SYCLHostUSMSpace>::accessible); static_assert(!Kokkos::SpaceAccessibility< - Kokkos::HostSpace, - Kokkos::Experimental::SYCLDeviceUSMSpace>::accessible, - ""); + Kokkos::HostSpace, + Kokkos::Experimental::SYCLDeviceUSMSpace>::accessible); static_assert(Kokkos::SpaceAccessibility< - Kokkos::HostSpace, - Kokkos::Experimental::SYCLSharedUSMSpace>::accessible, - ""); + Kokkos::HostSpace, + Kokkos::Experimental::SYCLSharedUSMSpace>::accessible); static_assert(Kokkos::SpaceAccessibility< - Kokkos::HostSpace, - Kokkos::Experimental::SYCLHostUSMSpace>::accessible, - ""); + Kokkos::HostSpace, + Kokkos::Experimental::SYCLHostUSMSpace>::accessible); static_assert( std::is_same::Space, - Kokkos::HostSpace>::value, - ""); + Kokkos::HostSpace>::value); static_assert( std::is_same< Kokkos::Impl::HostMirror< Kokkos::Experimental::SYCLSharedUSMSpace>::Space, Kokkos::Device>::value, - ""); + Kokkos::Experimental::SYCLSharedUSMSpace>>::value); static_assert( Kokkos::Impl::MemorySpaceAccess::accessible, - ""); + Kokkos::HostSpace>::accessible); static_assert(Kokkos::Impl::MemorySpaceAccess< - Kokkos::HostSpace, - Kokkos::Experimental::SYCLHostUSMSpace>::accessible, - ""); + Kokkos::HostSpace, + Kokkos::Experimental::SYCLHostUSMSpace>::accessible); static_assert(std::is_same::Space, - Kokkos::Experimental::SYCLHostUSMSpace>::value, - ""); + Kokkos::Experimental::SYCLHostUSMSpace>::value); static_assert( std::is_same< Kokkos::Device, Kokkos::Device>::value, - ""); + Kokkos::Experimental::SYCLSharedUSMSpace>>::value); static_assert(Kokkos::SpaceAccessibility< - Kokkos::Impl::HostMirror::Space, - Kokkos::HostSpace>::accessible, - ""); + Kokkos::Impl::HostMirror::Space, + Kokkos::HostSpace>::accessible); static_assert(Kokkos::SpaceAccessibility< - Kokkos::Impl::HostMirror< - Kokkos::Experimental::SYCLDeviceUSMSpace>::Space, - Kokkos::HostSpace>::accessible, - ""); + Kokkos::Impl::HostMirror< + Kokkos::Experimental::SYCLDeviceUSMSpace>::Space, + Kokkos::HostSpace>::accessible); static_assert(Kokkos::SpaceAccessibility< - Kokkos::Impl::HostMirror< - Kokkos::Experimental::SYCLSharedUSMSpace>::Space, - 
Kokkos::HostSpace>::accessible, - ""); + Kokkos::Impl::HostMirror< + Kokkos::Experimental::SYCLSharedUSMSpace>::Space, + Kokkos::HostSpace>::accessible); static_assert(Kokkos::SpaceAccessibility< - Kokkos::Impl::HostMirror< - Kokkos::Experimental::SYCLHostUSMSpace>::Space, - Kokkos::HostSpace>::accessible, - ""); + Kokkos::Impl::HostMirror< + Kokkos::Experimental::SYCLHostUSMSpace>::Space, + Kokkos::HostSpace>::accessible); } TEST(sycl, uvm) { diff --git a/core/unit_test/tools/TestEventCorrectness.hpp b/core/unit_test/tools/TestEventCorrectness.hpp index 3c85f661aae..946169a786d 100644 --- a/core/unit_test/tools/TestEventCorrectness.hpp +++ b/core/unit_test/tools/TestEventCorrectness.hpp @@ -409,14 +409,19 @@ TEST(kokkosp, parallel_scan_no_fence) { << "skipping since the OpenMPTarget backend has unexpected fences"; #endif + // Execute the parallel_scan first without looking for fence events. + // Depending on the backend implementation and the order of tests, + // it might be that the first call to parallel_scan is reallocating scratch + // memory which implies a fence when deallocating. We are not interested in + // detecting this event. + TestScanFunctor tf; + Kokkos::parallel_scan("dogs", Kokkos::RangePolicy<>(0, 1), tf); + using namespace Kokkos::Test::Tools; listen_tool_events(Config::DisableAll(), Config::EnableKernels(), Config::EnableFences()); auto success = validate_absence( - [=]() { - TestScanFunctor tf; - Kokkos::parallel_scan("dogs", Kokkos::RangePolicy<>(0, 1), tf); - }, + [=]() { Kokkos::parallel_scan("dogs", Kokkos::RangePolicy<>(0, 1), tf); }, [=](BeginFenceEvent begin_event) { if (begin_event.name.find("Debug Only Check for Execution Error") != std::string::npos || @@ -450,13 +455,20 @@ TEST(kokkosp, parallel_scan_no_fence_view) { << "skipping since the OpenMPTarget backend has unexpected fences"; #endif + // Execute the parallel_scan first without looking for fence events. + // Depending on the backend implementation and the order of tests, + // it might be that the first call to parallel_scan is reallocating scratch + // memory which implies a fence when deallocating. We are not interested in + // detecting this event. + TestScanFunctor tf; + Kokkos::View v("scan_result"); + Kokkos::parallel_scan("dogs", Kokkos::RangePolicy<>(0, 1), tf, v); + using namespace Kokkos::Test::Tools; listen_tool_events(Config::DisableAll(), Config::EnableKernels(), Config::EnableFences()); - Kokkos::View v("scan_result"); auto success = validate_absence( [=]() { - TestScanFunctor tf; Kokkos::parallel_scan("dogs", Kokkos::RangePolicy<>(0, 1), tf, v); }, [=](BeginFenceEvent begin_event) { diff --git a/core/unit_test/tools/TestLogicalSpaces.hpp b/core/unit_test/tools/TestLogicalSpaces.hpp deleted file mode 100644 index 4e56f8996a0..00000000000 --- a/core/unit_test/tools/TestLogicalSpaces.hpp +++ /dev/null @@ -1,177 +0,0 @@ -//@HEADER -// ************************************************************************ -// -// Kokkos v. 4.0 -// Copyright (2022) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. -// -// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. -// See https://kokkos.org/LICENSE for license information. 
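The TestEventCorrectness change above moves the first parallel_scan out of the instrumented region: depending on the backend, the first scan may allocate scratch memory, and releasing that scratch fences, which would trip the fence-absence check. A hedged sketch of the warm-up idiom, where TestScanFunctor and listen_tool_events are the helpers from the test file and the kernel labels are illustrative:

// Warm up: let any one-time scratch allocation (and the fence that its
// deallocation implies) happen before fence events are recorded.
TestScanFunctor tf;
Kokkos::parallel_scan("warmup", Kokkos::RangePolicy<>(0, 1), tf);

// Only now enable the fence listeners; the measured scan should reuse the
// scratch buffer and therefore trigger no fence event.
listen_tool_events(Config::DisableAll(), Config::EnableKernels(),
                   Config::EnableFences());
Kokkos::parallel_scan("measured", Kokkos::RangePolicy<>(0, 1), tf);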
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//@HEADER -#include -#include -#include "Kokkos_Core.hpp" - -#include - -namespace Test { - -void debug_print(const Kokkos_Profiling_SpaceHandle hand, const char* name, - const void* ptr, const size_t size) { - std::cout << "Alloc: " << hand.name << ", [" << name << "," << ptr << "] " - << size << std::endl; -} -void debug_dealloc(const Kokkos_Profiling_SpaceHandle hand, const char* name, - const void* ptr, const size_t size) { - std::cout << "Dealloc: " << hand.name << ", [" << name << "," << ptr << "] " - << size << std::endl; -} - -void fail_on_event(const Kokkos::Profiling::SpaceHandle, const char*, - const void*, const uint64_t) { - ASSERT_TRUE(false) << "Unexpected memory event"; -} - -void expect_no_events() { - Kokkos::Tools::Experimental::set_allocate_data_callback(&fail_on_event); - Kokkos::Tools::Experimental::set_deallocate_data_callback(&fail_on_event); -} - -std::string expected_view_name; -std::string expected_space_name; -std::string error_message; -void expect_allocation_event(const std::string evn, const std::string esn, - const std::string em) { - expected_view_name = evn; - expected_space_name = esn; - error_message = em; - Kokkos::Tools::Experimental::set_allocate_data_callback( - [](const Kokkos_Profiling_SpaceHandle hand, const char* name, const void*, - const uint64_t) { - ASSERT_EQ(std::string(hand.name), expected_space_name) - << error_message << " (bad handle)"; - ASSERT_EQ(std::string(name), expected_view_name) - << error_message << " (bad view name)"; - expect_no_events(); - }); -} -void expect_deallocation_event(const std::string& evn, const std::string& esn, - const std::string em) { - expected_view_name = evn; - expected_space_name = esn; - error_message = em; - Kokkos::Tools::Experimental::set_deallocate_data_callback( - [](const Kokkos_Profiling_SpaceHandle hand, const char* name, const void*, - const uint64_t) { - ASSERT_EQ(std::string(hand.name), expected_space_name) - << error_message << " (bad handle)"; - ASSERT_EQ(std::string(name), expected_view_name) - << error_message << " (bad view name)"; - expect_no_events(); - }); -} - -struct TestSpaceNamer { - static constexpr const char* get_name() { return "TestSpace"; } -}; -struct TestSpaceNamerTwo { - static constexpr const char* get_name() { return "YoDawg"; } -}; -struct TestSpaceNamerThree { - static constexpr const char* get_name() { return "CustomAccessSpace"; } -}; -using fake_memory_space = Kokkos::Experimental::LogicalMemorySpace< - Kokkos::HostSpace, Kokkos::DefaultHostExecutionSpace, TestSpaceNamer, - Kokkos::Experimental::LogicalSpaceSharesAccess::shared_access>; - -void test_view_construct() { - { - expect_allocation_event("puppy_view", "TestSpace", "View allocation"); - Kokkos::View pup_view("puppy_view", 1000); - expect_deallocation_event("puppy_view", "TestSpace", "View free"); - } - Kokkos::Tools::Experimental::pause_tools(); -} -void test_malloc_free() { - expect_allocation_event("does_malloc_work", "TestSpace", - "Error in malloc event"); - auto* temp = - Kokkos::kokkos_malloc("does_malloc_work", 1000); - expect_deallocation_event("does_malloc_work", "TestSpace", "Error in free"); - Kokkos::kokkos_free(temp); - Kokkos::Tools::Experimental::pause_tools(); -} -void test_chained_spaces() { - using doubly_fake_memory_space = Kokkos::Experimental::LogicalMemorySpace< - fake_memory_space, Kokkos::DefaultHostExecutionSpace, TestSpaceNamerTwo, - Kokkos::Experimental::LogicalSpaceSharesAccess::shared_access>; - { - 
expect_allocation_event("xzibit_dot_jpeg", "YoDawg", - "Chained space view allocation"); - Kokkos::View pup_view("xzibit_dot_jpeg", - 1000); - expect_deallocation_event("xzibit_dot_jpeg", "YoDawg", - "Chained space free"); - } - Kokkos::Tools::Experimental::pause_tools(); -} -void test_space_allocations() { - fake_memory_space debug_space; - expect_allocation_event("allocation_from_space", "TestSpace", - "Space allocation"); - auto* temp = debug_space.allocate("allocation_from_space", 1000); - expect_deallocation_event("allocation_from_space", "TestSpace", - "Space deallocation"); - debug_space.deallocate("allocation_from_space", temp, 1000); - Kokkos::Tools::Experimental::pause_tools(); -} -template -struct AccessCheckKernel { - Kokkos::View data; - KOKKOS_FUNCTION void operator()(const int i) const { data[i] = i; } -}; - -template -void test_allowed_access() { - constexpr const int data_size = 1000; - // We use an unmananged View here since we want to detect a memory access - // violation in the parallel_for and not in the initialization of the View. - std::vector test_data(data_size); - Kokkos::View test_view(test_data.data(), data_size); - AccessCheckKernel functor{test_view}; - Kokkos::parallel_for( - "access_allowed", - Kokkos::RangePolicy(0, data_size), - functor); - Kokkos::fence(); -} - -using semantically_independent_logical_space = - Kokkos::Experimental::LogicalMemorySpace< - Kokkos::HostSpace, Kokkos::DefaultHostExecutionSpace, - TestSpaceNamerThree, - Kokkos::Experimental::LogicalSpaceSharesAccess::no_shared_access>; - -TEST(defaultdevicetype, logical_space_views) { test_view_construct(); } -TEST(defaultdevicetype, logical_space_malloc) { test_malloc_free(); } -TEST(defaultdevicetype, logical_space_alloc) { test_space_allocations(); } -TEST(defaultdevicetype, chained_logical_spaces) { test_chained_spaces(); } -TEST(defaultdevicetype, access_allowed) { - test_allowed_access(); -} -// FIXME_SYCL -#if !(defined(KOKKOS_COMPILER_INTEL_LLVM) && defined(KOKKOS_ENABLE_SYCL)) -TEST(defaultdevicetype_DeathTest, access_forbidden) { - ::testing::FLAGS_gtest_death_test_style = "threadsafe"; - ASSERT_DEATH( - { test_allowed_access(); }, - "Kokkos::View ERROR: attempt to access inaccessible memory space"); -} -#endif - -} // namespace Test diff --git a/core/unit_test/tools/TestProfilingSection.cpp b/core/unit_test/tools/TestProfilingSection.cpp index 318766ac455..9d35d67feb0 100644 --- a/core/unit_test/tools/TestProfilingSection.cpp +++ b/core/unit_test/tools/TestProfilingSection.cpp @@ -108,8 +108,8 @@ TEST(defaultdevicetype, profiling_section) { } using Kokkos::Profiling::ProfilingSection; -static_assert(!std::is_default_constructible::value, ""); -static_assert(!std::is_copy_constructible::value, ""); -static_assert(!std::is_move_constructible::value, ""); -static_assert(!std::is_copy_assignable::value, ""); -static_assert(!std::is_move_assignable::value, ""); +static_assert(!std::is_default_constructible::value); +static_assert(!std::is_copy_constructible::value); +static_assert(!std::is_move_constructible::value); +static_assert(!std::is_copy_assignable::value); +static_assert(!std::is_move_assignable::value); diff --git a/example/tutorial/01_hello_world/hello_world.cpp b/example/tutorial/01_hello_world/hello_world.cpp index 5b8a21af833..22b8b6d63c8 100644 --- a/example/tutorial/01_hello_world/hello_world.cpp +++ b/example/tutorial/01_hello_world/hello_world.cpp @@ -58,12 +58,7 @@ struct hello_world { // is unnecessary but harmless. 
KOKKOS_INLINE_FUNCTION void operator()(const int i) const { - // FIXME_SYCL needs workaround for printf -#ifndef __SYCL_DEVICE_ONLY__ - printf("Hello from i = %i\n", i); -#else - (void)i; -#endif + Kokkos::printf("Hello from i = %i\n", i); } }; diff --git a/example/tutorial/01_hello_world_lambda/hello_world_lambda.cpp b/example/tutorial/01_hello_world_lambda/hello_world_lambda.cpp index c78f3076361..909765e1fc3 100644 --- a/example/tutorial/01_hello_world_lambda/hello_world_lambda.cpp +++ b/example/tutorial/01_hello_world_lambda/hello_world_lambda.cpp @@ -76,13 +76,9 @@ int main(int argc, char* argv[]) { #if defined(KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA) Kokkos::parallel_for( 15, KOKKOS_LAMBDA(const int i) { - // FIXME_SYCL needs workaround for printf -#ifndef __SYCL_DEVICE_ONLY__ - // printf works in a CUDA parallel kernel; std::ostream does not. - printf("Hello from i = %i\n", i); -#else - (void)i; -#endif + // Kokkos::printf works for all backends in a parallel kernel; + // std::ostream does not. + Kokkos::printf("Hello from i = %i\n", i); }); #endif // You must call finalize() after you are done using Kokkos. diff --git a/example/tutorial/Hierarchical_Parallelism/01_thread_teams/thread_teams.cpp b/example/tutorial/Hierarchical_Parallelism/01_thread_teams/thread_teams.cpp index b041f8d435b..ee3f4721d91 100644 --- a/example/tutorial/Hierarchical_Parallelism/01_thread_teams/thread_teams.cpp +++ b/example/tutorial/Hierarchical_Parallelism/01_thread_teams/thread_teams.cpp @@ -47,13 +47,9 @@ struct hello_world { // The TeamPolicy<>::member_type provides functions to query the multi // dimensional index of a thread as well as the number of thread-teams and // the size of each team. -#ifndef __SYCL_DEVICE_ONLY__ - // FIXME_SYCL needs printf workaround - printf("Hello World: %i %i // %i %i\n", thread.league_rank(), - thread.team_rank(), thread.league_size(), thread.team_size()); -#else - (void)thread; -#endif + Kokkos::printf("Hello World: %i %i // %i %i\n", thread.league_rank(), + thread.team_rank(), thread.league_size(), + thread.team_size()); } }; diff --git a/example/tutorial/Hierarchical_Parallelism/01_thread_teams_lambda/thread_teams_lambda.cpp b/example/tutorial/Hierarchical_Parallelism/01_thread_teams_lambda/thread_teams_lambda.cpp index 933b254f7c7..1e6812adead 100644 --- a/example/tutorial/Hierarchical_Parallelism/01_thread_teams_lambda/thread_teams_lambda.cpp +++ b/example/tutorial/Hierarchical_Parallelism/01_thread_teams_lambda/thread_teams_lambda.cpp @@ -57,16 +57,12 @@ int main(int narg, char* args[]) { policy, KOKKOS_LAMBDA(const team_member& thread, int& lsum) { lsum += 1; - // TeamPolicy<>::member_type provides functions to query the - // multidimensional index of a thread, as well as the number of - // thread teams and the size of each team. -#ifndef __SYCL_DEVICE_ONLY__ - // FIXME_SYCL needs workaround for printf - printf("Hello World: %i %i // %i %i\n", thread.league_rank(), - thread.team_rank(), thread.league_size(), thread.team_size()); -#else - (void)thread; -#endif + // TeamPolicy<>::member_type provides functions to query the + // multidimensional index of a thread, as well as the number of + // thread teams and the size of each team.
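The tutorial hunks above and below replace the raw printf (and its SYCL guard) with Kokkos::printf, which can be called from kernels on every backend. A self-contained sketch, assuming a Kokkos version that ships Kokkos::printf:

#include <Kokkos_Core.hpp>

int main(int argc, char* argv[]) {
  Kokkos::initialize(argc, argv);
  {
    // Kokkos::printf works inside device code on all backends; the
    // ordering of output across iterations is unspecified.
    Kokkos::parallel_for(
        4, KOKKOS_LAMBDA(const int i) { Kokkos::printf("i = %d\n", i); });
    Kokkos::fence();
  }
  Kokkos::finalize();
}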
+ Kokkos::printf("Hello World: %i %i // %i %i\n", thread.league_rank(), + thread.team_rank(), thread.league_size(), + thread.team_size()); }, sum); #endif diff --git a/example/tutorial/Hierarchical_Parallelism/02_nested_parallel_for/nested_parallel_for.cpp b/example/tutorial/Hierarchical_Parallelism/02_nested_parallel_for/nested_parallel_for.cpp index 398810d1331..75d6089e9af 100644 --- a/example/tutorial/Hierarchical_Parallelism/02_nested_parallel_for/nested_parallel_for.cpp +++ b/example/tutorial/Hierarchical_Parallelism/02_nested_parallel_for/nested_parallel_for.cpp @@ -43,16 +43,11 @@ struct hello_world { // the operator using a team_policy acts like a parallel region for the // team. That means that everything outside of the nested parallel_for is // also executed by all threads of the team. - Kokkos::parallel_for(Kokkos::TeamThreadRange(thread, 31), - [&](const int& i) { -#ifndef __SYCL_DEVICE_ONLY__ - // FIXME_SYCL needs printf workaround - printf("Hello World: (%i , %i) executed loop %i \n", - thread.league_rank(), thread.team_rank(), i); -#else - (void) i; -#endif - }); + Kokkos::parallel_for( + Kokkos::TeamThreadRange(thread, 31), [&](const int& i) { + Kokkos::printf("Hello World: (%i , %i) executed loop %i \n", + thread.league_rank(), thread.team_rank(), i); + }); } }; diff --git a/generate_makefile.bash b/generate_makefile.bash index 301a1fceb5a..25370daa3f2 100755 --- a/generate_makefile.bash +++ b/generate_makefile.bash @@ -170,12 +170,9 @@ display_help_text() { echo " ARMV8_THUNDERX = ARMv8 Cavium ThunderX CPU" echo " ARMV8_THUNDERX2 = ARMv8 Cavium ThunderX2 CPU" echo " [IBM]" - echo " BGQ = IBM Blue Gene Q" - echo " Power7 = IBM POWER7 and POWER7+ CPUs" echo " Power8 = IBM POWER8 CPUs" echo " Power9 = IBM POWER9 CPUs" echo " [Intel]" - echo " WSM = Intel Westmere CPUs" echo " SNB = Intel Sandy/Ivy Bridge CPUs" echo " HSW = Intel Haswell CPUs" echo " BDW = Intel Broadwell Xeon E-class CPUs" @@ -226,7 +223,6 @@ display_help_text() { echo "--with-gtest=/Path/To/Gtest: Set path to gtest. (Used in unit and performance" echo " tests.)" echo "--with-hwloc=/Path/To/Hwloc: Set path to hwloc library." - echo "--with-memkind=/Path/To/MemKind: Set path to memkind library." echo "--with-options=[OPT]: Additional options to Kokkos:" echo " compiler_warnings" echo " aggressive_vectorization = add ivdep on loops" @@ -342,10 +338,6 @@ do KOKKOS_HWLOC=ON HWLOC_PATH="${key#*=}" ;; - --with-memkind*) - KOKKOS_MEMKIND=ON - MEMKIND_PATH="${key#*=}" - ;; --arch*) KOKKOS_ARCH="${key#*=}" ;; @@ -452,15 +444,6 @@ else KOKKOS_HWLOC_CMD= fi -if [ "$KOKKOS_MEMKIND" == "ON" ]; then - KOKKOS_MEMKIND_CMD=-DKokkos_ENABLE_MEMKIND=ON - if [ "$MEMKIND_PATH" != "" ]; then - KOKKOS_MEMKIND_PATH_CMD=-DMEMKIND_ROOT=$MEMKIND_PATH - fi -else - KOKKOS_MEMKIND_CMD= -fi - if [ ! 
-e ${KOKKOS_PATH}/CMakeLists.txt ]; then if [ "${KOKKOS_PATH}" == "" ]; then CM_SCRIPT=$0 @@ -506,5 +489,5 @@ if [[ ${COMPILER} == *clang* ]]; then fi fi -echo cmake $COMPILER_CMD -DCMAKE_CXX_FLAGS="${KOKKOS_CXXFLAGS}" -DCMAKE_EXE_LINKER_FLAGS="${KOKKOS_LDFLAGS}" -DCMAKE_INSTALL_PREFIX=${PREFIX} ${KOKKOS_DEVICE_CMD} ${KOKKOS_ARCH_CMD} -DKokkos_ENABLE_TESTS=${KOKKOS_DO_TESTS} -DKokkos_ENABLE_EXAMPLES=${KOKKOS_DO_EXAMPLES} ${KOKKOS_OPTION_CMD} ${KOKKOS_CUDA_OPTION_CMD} ${KOKKOS_HIP_OPTION_CMD} -DCMAKE_VERBOSE_MAKEFILE=ON -DCMAKE_CXX_EXTENSIONS=OFF ${STANDARD_CMD} ${KOKKOS_DEBUG_CMD} ${KOKKOS_BC_CMD} ${KOKKOS_HWLOC_CMD} ${KOKKOS_HWLOC_PATH_CMD} ${KOKKOS_MEMKIND_CMD} ${KOKKOS_MEMKIND_PATH_CMD} -DKokkos_ENABLE_DEPRECATION_WARNINGS=${KOKKOS_DEPRECATED_CODE_WARNINGS} -DKokkos_ENABLE_DEPRECATED_CODE_4=${KOKKOS_DEPRECATED_CODE} ${KOKKOS_PATH} -cmake $COMPILER_CMD -DCMAKE_CXX_FLAGS="${KOKKOS_CXXFLAGS//\"}" -DCMAKE_EXE_LINKER_FLAGS="${KOKKOS_LDFLAGS//\"}" -DCMAKE_INSTALL_PREFIX=${PREFIX} ${KOKKOS_DEVICE_CMD} ${KOKKOS_ARCH_CMD} -DKokkos_ENABLE_TESTS=${KOKKOS_DO_TESTS} -DKokkos_ENABLE_EXAMPLES=${KOKKOS_DO_EXAMPLES} ${KOKKOS_OPTION_CMD} ${KOKKOS_CUDA_OPTION_CMD} ${KOKKOS_HIP_OPTION_CMD} -DCMAKE_VERBOSE_MAKEFILE=ON -DCMAKE_CXX_EXTENSIONS=OFF ${STANDARD_CMD} ${KOKKOS_DEBUG_CMD} ${KOKKOS_BC_CMD} ${KOKKOS_HWLOC_CMD} ${KOKKOS_HWLOC_PATH_CMD} ${KOKKOS_MEMKIND_CMD} ${KOKKOS_MEMKIND_PATH_CMD} ${PASSTHRU_CMAKE_FLAGS} -DKokkos_ENABLE_DEPRECATION_WARNINGS=${KOKKOS_DEPRECATED_CODE_WARNINGS} -DKokkos_ENABLE_DEPRECATED_CODE_4=${KOKKOS_DEPRECATED_CODE} ${KOKKOS_PATH} +echo cmake $COMPILER_CMD -DCMAKE_CXX_FLAGS="${KOKKOS_CXXFLAGS}" -DCMAKE_EXE_LINKER_FLAGS="${KOKKOS_LDFLAGS}" -DCMAKE_INSTALL_PREFIX=${PREFIX} ${KOKKOS_DEVICE_CMD} ${KOKKOS_ARCH_CMD} -DKokkos_ENABLE_TESTS=${KOKKOS_DO_TESTS} -DKokkos_ENABLE_EXAMPLES=${KOKKOS_DO_EXAMPLES} ${KOKKOS_OPTION_CMD} ${KOKKOS_CUDA_OPTION_CMD} ${KOKKOS_HIP_OPTION_CMD} -DCMAKE_VERBOSE_MAKEFILE=ON -DCMAKE_CXX_EXTENSIONS=OFF ${STANDARD_CMD} ${KOKKOS_DEBUG_CMD} ${KOKKOS_BC_CMD} ${KOKKOS_HWLOC_CMD} ${KOKKOS_HWLOC_PATH_CMD} -DKokkos_ENABLE_DEPRECATION_WARNINGS=${KOKKOS_DEPRECATED_CODE_WARNINGS} -DKokkos_ENABLE_DEPRECATED_CODE_4=${KOKKOS_DEPRECATED_CODE} ${KOKKOS_PATH} +cmake $COMPILER_CMD -DCMAKE_CXX_FLAGS="${KOKKOS_CXXFLAGS//\"}" -DCMAKE_EXE_LINKER_FLAGS="${KOKKOS_LDFLAGS//\"}" -DCMAKE_INSTALL_PREFIX=${PREFIX} ${KOKKOS_DEVICE_CMD} ${KOKKOS_ARCH_CMD} -DKokkos_ENABLE_TESTS=${KOKKOS_DO_TESTS} -DKokkos_ENABLE_EXAMPLES=${KOKKOS_DO_EXAMPLES} ${KOKKOS_OPTION_CMD} ${KOKKOS_CUDA_OPTION_CMD} ${KOKKOS_HIP_OPTION_CMD} -DCMAKE_VERBOSE_MAKEFILE=ON -DCMAKE_CXX_EXTENSIONS=OFF ${STANDARD_CMD} ${KOKKOS_DEBUG_CMD} ${KOKKOS_BC_CMD} ${KOKKOS_HWLOC_CMD} ${KOKKOS_HWLOC_PATH_CMD} ${PASSTHRU_CMAKE_FLAGS} -DKokkos_ENABLE_DEPRECATION_WARNINGS=${KOKKOS_DEPRECATED_CODE_WARNINGS} -DKokkos_ENABLE_DEPRECATED_CODE_4=${KOKKOS_DEPRECATED_CODE} ${KOKKOS_PATH} diff --git a/gnu_generate_makefile.bash b/gnu_generate_makefile.bash index 5ea159cdd47..7a197bb71d4 100755 --- a/gnu_generate_makefile.bash +++ b/gnu_generate_makefile.bash @@ -74,9 +74,6 @@ do --with-hwloc*) HWLOC_PATH="${key#*=}" ;; - --with-memkind*) - MEMKIND_PATH="${key#*=}" - ;; --arch*) KOKKOS_ARCH="${key#*=}" ;; @@ -148,12 +145,9 @@ do echo " ARMv8-ThunderX = ARMv8 Cavium ThunderX CPU" echo " ARMv8-TX2 = ARMv8 Cavium ThunderX2 CPU" echo " [IBM]" - echo " BGQ = IBM Blue Gene Q" - echo " Power7 = IBM POWER7 and POWER7+ CPUs" echo " Power8 = IBM POWER8 CPUs" echo " Power9 = IBM POWER9 CPUs" echo " [Intel]" - echo " WSM = Intel Westmere CPUs" echo " SNB = Intel 
Sandy/Ivy Bridge CPUs" echo " HSW = Intel Haswell CPUs" echo " BDW = Intel Broadwell Xeon E-class CPUs" @@ -198,7 +192,6 @@ do echo "--with-gtest=/Path/To/Gtest: Set path to gtest. (Used in unit and performance" echo " tests.)" echo "--with-hwloc=/Path/To/Hwloc: Set path to hwloc library." - echo "--with-memkind=/Path/To/MemKind: Set path to memkind library." echo "--with-options=[OPT]: Additional options to Kokkos:" echo " compiler_warnings" echo " aggressive_vectorization = add ivdep on loops" @@ -298,11 +291,6 @@ if [ ${#HWLOC_PATH} -gt 0 ]; then KOKKOS_USE_TPLS="${KOKKOS_USE_TPLS},hwloc" fi -if [ ${#MEMKIND_PATH} -gt 0 ]; then - KOKKOS_SETTINGS="${KOKKOS_SETTINGS} MEMKIND_PATH=${MEMKIND_PATH}" - KOKKOS_USE_TPLS="${KOKKOS_USE_TPLS},experimental_memkind" -fi - if [ ${#KOKKOS_USE_TPLS} -gt 0 ]; then KOKKOS_SETTINGS="${KOKKOS_SETTINGS} KOKKOS_USE_TPLS=${KOKKOS_USE_TPLS}" fi diff --git a/master_history.txt b/master_history.txt index a43b5276a83..bd122a456bd 100644 --- a/master_history.txt +++ b/master_history.txt @@ -35,3 +35,4 @@ tag: 4.0.01 date: 04:26:2023 master: aa1f48f3 release: 5893754f tag: 4.1.00 date: 06:20:2023 master: 62d2b6c8 release: adde1e6a tag: 4.2.00 date: 11:09:2023 master: 1a3ea28f release: abe01c88 tag: 4.2.01 date: 01:30:2024 master: 71a9bcae release: 221e5f7a +tag: 4.3.00 date: 04:03:2024 master: e0dc0128 release: f08217a4 diff --git a/scripts/docker/Dockerfile.clang b/scripts/docker/Dockerfile.clang index 5c6abc1c6de..b493c3bbff0 100644 --- a/scripts/docker/Dockerfile.clang +++ b/scripts/docker/Dockerfile.clang @@ -1,49 +1,13 @@ -FROM ubuntu:18.04 +FROM ubuntu:20.04 RUN apt-get update && apt-get install -y \ bc \ git \ build-essential \ + clang-format-8 \ wget \ - ccache \ && \ apt-get clean && \ rm -rf /var/lib/apt/lists/* -RUN KEYDUMP_URL=https://cloud.cees.ornl.gov/download && \ - KEYDUMP_FILE=keydump && \ - wget --quiet ${KEYDUMP_URL}/${KEYDUMP_FILE} && \ - wget --quiet ${KEYDUMP_URL}/${KEYDUMP_FILE}.sig && \ - gpg --import ${KEYDUMP_FILE} && \ - gpg --verify ${KEYDUMP_FILE}.sig ${KEYDUMP_FILE} && \ - rm ${KEYDUMP_FILE}* - -ARG CMAKE_VERSION=3.16.8 -ENV CMAKE_DIR=/opt/cmake -RUN CMAKE_URL=https://github.com/Kitware/CMake/releases/download/v${CMAKE_VERSION} && \ - CMAKE_SCRIPT=cmake-${CMAKE_VERSION}-Linux-x86_64.sh && \ - CMAKE_SHA256=cmake-${CMAKE_VERSION}-SHA-256.txt && \ - wget --quiet ${CMAKE_URL}/${CMAKE_SHA256} && \ - wget --quiet ${CMAKE_URL}/${CMAKE_SHA256}.asc && \ - wget --quiet ${CMAKE_URL}/${CMAKE_SCRIPT} && \ - gpg --verify ${CMAKE_SHA256}.asc ${CMAKE_SHA256} && \ - grep -i ${CMAKE_SCRIPT} ${CMAKE_SHA256} | sed -e s/linux/Linux/ | sha256sum --check && \ - mkdir -p ${CMAKE_DIR} && \ - sh ${CMAKE_SCRIPT} --skip-license --prefix=${CMAKE_DIR} && \ - rm cmake* -ENV PATH=${CMAKE_DIR}/bin:$PATH - -ENV LLVM_DIR=/opt/llvm -RUN LLVM_VERSION=8.0.0 && \ - LLVM_URL=https://releases.llvm.org/${LLVM_VERSION}/clang+llvm-${LLVM_VERSION}-x86_64-linux-gnu-ubuntu-18.04.tar.xz && \ - LLVM_ARCHIVE=llvm-${LLVM_VERSION}.tar.xz && \ - SCRATCH_DIR=/scratch && mkdir -p ${SCRATCH_DIR} && cd ${SCRATCH_DIR} && \ - wget --quiet ${LLVM_URL} --output-document=${LLVM_ARCHIVE} && \ - wget --quiet ${LLVM_URL}.sig --output-document=${LLVM_ARCHIVE}.sig && \ - gpg --verify ${LLVM_ARCHIVE}.sig ${LLVM_ARCHIVE} && \ - mkdir -p ${LLVM_DIR} && \ - tar -xvf ${LLVM_ARCHIVE} -C ${LLVM_DIR} --strip-components=1 && \ - echo "${LLVM_DIR}/lib" > /etc/ld.so.conf.d/llvm.conf && ldconfig && \ - rm -rf /root/.gnupg && \ - rm -rf ${SCRATCH_DIR} -ENV PATH=${LLVM_DIR}/bin:$PATH +ENV 
CLANG_FORMAT_EXE=clang-format-8 diff --git a/scripts/docker/Dockerfile.openmptarget b/scripts/docker/Dockerfile.openmptarget index 708cf533b8a..22edcda2a07 100644 --- a/scripts/docker/Dockerfile.openmptarget +++ b/scripts/docker/Dockerfile.openmptarget @@ -38,7 +38,7 @@ RUN CMAKE_URL=https://github.com/Kitware/CMake/releases/download/v${CMAKE_VERSIO rm ${CMAKE_SCRIPT} ENV PATH=${CMAKE_DIR}/bin:$PATH -ARG LLVM_VERSION=llvmorg-17.0.1 +ARG LLVM_VERSION=llvmorg-17.0.3 ENV LLVM_DIR=/opt/llvm RUN LLVM_URL=https://github.com/llvm/llvm-project/archive &&\ LLVM_ARCHIVE=${LLVM_VERSION}.tar.gz &&\ diff --git a/scripts/docker/Dockerfile.sycl b/scripts/docker/Dockerfile.sycl index 714461bfe6a..87864da1bf7 100644 --- a/scripts/docker/Dockerfile.sycl +++ b/scripts/docker/Dockerfile.sycl @@ -55,3 +55,12 @@ RUN wget https://registrationcenter-download.intel.com/akdlm/irc_nas/19133/l_one chmod +x ./l_oneDPL_p_2022.0.0.25335.sh && \ ./l_oneDPL_p_2022.0.0.25335.sh -a -s --eula accept && \ rm l_oneDPL_p_2022.0.0.25335.sh + +# clang++ +ENV PATH=/opt/intel/oneapi/compiler/latest/linux/bin-llvm/:$PATH +# sycl-ls, icpx +ENV PATH=/opt/intel/oneapi/compiler/latest/linux/bin/:$PATH +# libsycl +ENV LD_LIBRARY_PATH=/opt/intel/oneapi/compiler/latest/linux/lib:$LD_LIBRARY_PATH +# libsvml +ENV LD_LIBRARY_PATH=/opt/intel/oneapi/compiler/latest/linux/compiler/lib/intel64_lin:$LD_LIBRARY_PATH diff --git a/scripts/testing_scripts/generate_makefile.bash b/scripts/testing_scripts/generate_makefile.bash index ae1db3186f7..830d7b12d90 100755 --- a/scripts/testing_scripts/generate_makefile.bash +++ b/scripts/testing_scripts/generate_makefile.bash @@ -59,9 +59,6 @@ do --with-hwloc*) HWLOC_PATH="${key#*=}" ;; - --with-memkind*) - MEMKIND_PATH="${key#*=}" - ;; --arch*) KOKKOS_ARCH="${key#*=}" ;; @@ -136,12 +133,9 @@ do echo " ARMv8-ThunderX = ARMv8 Cavium ThunderX CPU" echo " ARMv8-TX2 = ARMv8 Cavium ThunderX2 CPU" echo " [IBM]" - echo " BGQ = IBM Blue Gene Q" - echo " Power7 = IBM POWER7 and POWER7+ CPUs" echo " Power8 = IBM POWER8 CPUs" echo " Power9 = IBM POWER9 CPUs" echo " [Intel]" - echo " WSM = Intel Westmere CPUs" echo " SNB = Intel Sandy/Ivy Bridge CPUs" echo " HSW = Intel Haswell CPUs" echo " BDW = Intel Broadwell Xeon E-class CPUs" @@ -177,7 +171,6 @@ do echo "--with-gtest=/Path/To/Gtest: Set path to gtest. (Used in unit and performance" echo " tests.)" echo "--with-hwloc=/Path/To/Hwloc: Set path to hwloc library." - echo "--with-memkind=/Path/To/MemKind: Set path to memkind library." echo "--with-options=[OPT]: Additional options to Kokkos:" echo " compiler_warnings" echo " aggressive_vectorization = add ivdep on loops" @@ -269,11 +262,6 @@ if [ ${#HWLOC_PATH} -gt 0 ]; then KOKKOS_USE_TPLS="${KOKKOS_USE_TPLS},hwloc" fi -if [ ${#MEMKIND_PATH} -gt 0 ]; then - KOKKOS_SETTINGS="${KOKKOS_SETTINGS} MEMKIND_PATH=${MEMKIND_PATH}" - KOKKOS_USE_TPLS="${KOKKOS_USE_TPLS},experimental_memkind" -fi - if [ ${#KOKKOS_USE_TPLS} -gt 0 ]; then KOKKOS_SETTINGS="${KOKKOS_SETTINGS} KOKKOS_USE_TPLS=${KOKKOS_USE_TPLS}" fi diff --git a/simd/src/Kokkos_SIMD_AVX2.hpp b/simd/src/Kokkos_SIMD_AVX2.hpp index 521160b76fc..6d0956f3832 100644 --- a/simd/src/Kokkos_SIMD_AVX2.hpp +++ b/simd/src/Kokkos_SIMD_AVX2.hpp @@ -30,9 +30,11 @@ "Kokkos_SIMD_AVX2.hpp must be included before Kokkos_SIMD_Common_Math.hpp!" #endif -// FIXME_HIP ROCm 5.6 and 5.7 can't compile with the intrinsic used here. 
-#if defined(__HIPCC__) && (HIP_VERSION_MAJOR == 5) && \ - ((HIP_VERSION_MINOR == 6) || (HIP_VERSION_MINOR == 7)) +// FIXME_HIP ROCm 5.6, 5.7, and 6.0 can't compile with the intrinsic used here. +#if defined(__HIPCC__) && \ + (((HIP_VERSION_MAJOR == 5) && \ + ((HIP_VERSION_MINOR == 6) || (HIP_VERSION_MINOR == 7))) || \ + ((HIP_VERSION_MAJOR == 6) && ((HIP_VERSION_MINOR == 0)))) #define KOKKOS_IMPL_WORKAROUND_ROCM_AVX2_ISSUE #endif @@ -563,10 +565,18 @@ class simd> { element_aligned_tag) { m_value = _mm256_loadu_pd(ptr); } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_from(value_type const* ptr, + vector_aligned_tag) { + m_value = _mm256_load_pd(ptr); + } KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_to( value_type* ptr, element_aligned_tag) const { _mm256_storeu_pd(ptr, m_value); } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_to(value_type* ptr, + vector_aligned_tag) const { + _mm256_store_pd(ptr, m_value); + } KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION constexpr explicit operator __m256d() const { return m_value; @@ -818,10 +828,18 @@ class simd> { element_aligned_tag) { m_value = _mm_loadu_ps(ptr); } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_from(value_type const* ptr, + vector_aligned_tag) { + m_value = _mm_load_ps(ptr); + } KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_to( value_type* ptr, element_aligned_tag) const { _mm_storeu_ps(ptr, m_value); } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_to(value_type* ptr, + vector_aligned_tag) const { + _mm_store_ps(ptr, m_value); + } KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION constexpr explicit operator __m128() const { return m_value; @@ -1059,17 +1077,31 @@ class simd> { } KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_from(value_type const* ptr, element_aligned_tag) { - // FIXME_HIP ROCm 5.6 can't compile with the intrinsic used here. + // FIXME_HIP ROCm 5.6, 5.7, and 6.0 can't compile with the intrinsic used + // here. #ifdef KOKKOS_IMPL_WORKAROUND_ROCM_AVX2_ISSUE m_value = _mm_loadu_si128(reinterpret_cast<__m128i const*>(ptr)); #else m_value = _mm_maskload_epi32(ptr, static_cast<__m128i>(mask_type(true))); +#endif + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_from(value_type const* ptr, + vector_aligned_tag) { + // FIXME_HIP ROCm 5.6, 5.7, and 6.0 can't compile with the intrinsic used here.
+#ifdef KOKKOS_IMPL_WORKAROUND_ROCM_AVX2_ISSUE + m_value = _mm_load_si128(reinterpret_cast<__m128i const*>(ptr)); +#else + m_value = _mm_maskload_epi32(ptr, static_cast<__m128i>(mask_type(true))); #endif } KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_to( value_type* ptr, element_aligned_tag) const { _mm_maskstore_epi32(ptr, static_cast<__m128i>(mask_type(true)), m_value); } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_to(value_type* ptr, + vector_aligned_tag) const { + _mm_maskstore_epi32(ptr, static_cast<__m128i>(mask_type(true)), m_value); + } KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION constexpr explicit operator __m128i() const { return m_value; @@ -1111,6 +1143,11 @@ class simd> { return simd( _mm_add_epi32(static_cast<__m128i>(lhs), static_cast<__m128i>(rhs))); } + [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend simd operator*( + simd const& lhs, simd const& rhs) noexcept { + return simd( + _mm_mullo_epi32(static_cast<__m128i>(lhs), static_cast<__m128i>(rhs))); + } [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend simd operator>>( simd const& lhs, int rhs) noexcept { @@ -1249,6 +1286,15 @@ class simd> { #else m_value = _mm256_maskload_epi64(reinterpret_cast(ptr), static_cast<__m256i>(mask_type(true))); +#endif + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_from(value_type const* ptr, + vector_aligned_tag) { +#ifdef KOKKOS_IMPL_WORKAROUND_ROCM_AVX2_ISSUE + m_value = _mm256_load_si256(reinterpret_cast<__m256i const*>(ptr)); +#else + m_value = _mm256_maskload_epi64(reinterpret_cast(ptr), + static_cast<__m256i>(mask_type(true))); #endif } KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_to( @@ -1256,6 +1302,11 @@ class simd> { _mm256_maskstore_epi64(reinterpret_cast(ptr), static_cast<__m256i>(mask_type(true)), m_value); } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_to(value_type* ptr, + vector_aligned_tag) const { + _mm256_maskstore_epi64(reinterpret_cast(ptr), + static_cast<__m256i>(mask_type(true)), m_value); + } KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION constexpr explicit operator __m256i() const { return m_value; @@ -1278,6 +1329,13 @@ class simd> { _mm256_add_epi64(static_cast<__m256i>(lhs), static_cast<__m256i>(rhs))); } + // fallback simd multiplication using generator constructor + // multiplying vectors of 64-bit signed integers is not available in AVX2 + [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend simd operator*( + simd const& lhs, simd const& rhs) noexcept { + return simd([&](std::size_t i) { return lhs[i] * rhs[i]; }); + } + // AVX2 only has eq and gt comparisons for int64 [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend mask_type operator==(simd const& lhs, simd const& rhs) noexcept { @@ -1306,17 +1364,19 @@ class simd> { return !(lhs == rhs); } + // fallback simd shift right arithmetic using generator constructor // Shift right arithmetic for 64bit packed ints is not availalbe in AVX2 - // [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend simd( - // simd const& lhs, int rhs) noexcept { - // return simd(_mm256_srai_epi64(static_cast<__m256i>(lhs), rhs)); - // } + [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend simd operator>>( + simd const& lhs, int rhs) noexcept { + return simd([&](std::size_t i) { return lhs[i] >> rhs; }); + } - // [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend simd( - // simd const& lhs, simd const& rhs) noexcept { - // return simd(_mm256_srav_epi64(static_cast<__m256i>(lhs), - // static_cast<__m256i>(rhs)))); - // } + // fallback simd shift right arithmetic using 
generator constructor + // Shift right arithmetic for 64bit packed ints is not available in AVX2 + [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend simd operator>>( + simd const& lhs, simd const& rhs) noexcept { + return simd([&](std::size_t i) { return lhs[i] >> rhs[i]; }); + } [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend simd operator<<( simd const& lhs, int rhs) noexcept { @@ -1444,6 +1504,15 @@ class simd> { #else m_value = _mm256_maskload_epi64(reinterpret_cast(ptr), static_cast<__m256i>(mask_type(true))); +#endif + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_from(value_type const* ptr, + vector_aligned_tag) { +#ifdef KOKKOS_IMPL_WORKAROUND_ROCM_AVX2_ISSUE + m_value = _mm256_load_si256(reinterpret_cast<__m256i const*>(ptr)); +#else + m_value = _mm256_maskload_epi64(reinterpret_cast(ptr), + static_cast<__m256i>(mask_type(true))); #endif } KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION constexpr explicit operator __m256i() @@ -1460,6 +1529,14 @@ class simd> { return simd( _mm256_sub_epi64(static_cast<__m256i>(lhs), static_cast<__m256i>(rhs))); } + + // fallback simd multiplication using generator constructor + // multiplying vectors of 64-bit unsigned integers is not available in AVX2 + [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend simd operator*( + simd const& lhs, simd const& rhs) noexcept { + return simd([&](std::size_t i) { return lhs[i] * rhs[i]; }); + } + [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend simd operator>>( simd const& lhs, int rhs) noexcept { return _mm256_srli_epi64(static_cast<__m256i>(lhs), rhs); @@ -1588,6 +1665,11 @@ class const_where_expression>, static_cast<__m256d>(m_value)); } KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION + void copy_to(double* mem, vector_aligned_tag) const { + _mm256_maskstore_pd(mem, _mm256_castpd_si256(static_cast<__m256d>(m_mask)), + static_cast<__m256d>(m_value)); + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void scatter_to( double* mem, simd> const& index) const { @@ -1624,6 +1706,11 @@ class where_expression>, mem, _mm256_castpd_si256(static_cast<__m256d>(m_mask)))); } KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION + void copy_from(double const* mem, vector_aligned_tag) { + m_value = value_type(_mm256_maskload_pd( + mem, _mm256_castpd_si256(static_cast<__m256d>(m_mask)))); + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void gather_from( double const* mem, simd> const& index) { @@ -1667,6 +1754,11 @@ class const_where_expression>, static_cast<__m128>(m_value)); } KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION + void copy_to(float* mem, vector_aligned_tag) const { + _mm_maskstore_ps(mem, _mm_castps_si128(static_cast<__m128>(m_mask)), + static_cast<__m128>(m_value)); + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void scatter_to( float* mem, simd> const& index) const { @@ -1703,6 +1795,11 @@ class where_expression>, _mm_maskload_ps(mem, _mm_castps_si128(static_cast<__m128>(m_mask)))); } KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION + void copy_from(float const* mem, vector_aligned_tag) { + m_value = value_type( + _mm_maskload_ps(mem, _mm_castps_si128(static_cast<__m128>(m_mask)))); + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void gather_from( float const* mem, simd> const& index) { @@ -1746,6 +1843,12 @@ class const_where_expression< _mm_maskstore_epi32(mem, static_cast<__m128i>(m_mask), static_cast<__m128i>(m_value)); } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION + void copy_to(std::int32_t* mem, vector_aligned_tag) const { + _mm_maskstore_epi32(mem, static_cast<__m128i>(m_mask), + static_cast<__m128i>(m_value)); + } + 
KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void scatter_to( std::int32_t* mem, @@ -1786,6 +1889,16 @@ class where_expression>, m_value = value_type(_mm_maskload_epi32(mem, static_cast<__m128i>(m_mask))); #endif } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION + void copy_from(std::int32_t const* mem, vector_aligned_tag) { +#ifdef KOKKOS_IMPL_WORKAROUND_ROCM_AVX2_ISSUE + __m128i tmp = _mm_load_si128(reinterpret_cast<__m128i const*>(mem)); + m_value = value_type(_mm_and_si128(tmp, static_cast<__m128i>(m_mask))); +#else + m_value = value_type(_mm_maskload_epi32(mem, static_cast<__m128i>(m_mask))); +#endif + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void gather_from( std::int32_t const* mem, @@ -1833,6 +1946,13 @@ class const_where_expression< static_cast<__m256i>(m_mask), static_cast<__m256i>(m_value)); } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_to(std::int64_t* mem, + vector_aligned_tag) const { + _mm256_maskstore_epi64(reinterpret_cast(mem), + static_cast<__m256i>(m_mask), + static_cast<__m256i>(m_value)); + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void scatter_to( std::int64_t* mem, @@ -1874,6 +1994,17 @@ class where_expression>, reinterpret_cast(mem), static_cast<__m256i>(m_mask))); #endif } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_from(std::int64_t const* mem, + vector_aligned_tag) { +#ifdef KOKKOS_IMPL_WORKAROUND_ROCM_AVX2_ISSUE + __m256i tmp = _mm256_load_si256(reinterpret_cast<__m256i const*>(mem)); + m_value = value_type(_mm256_and_si256(tmp, static_cast<__m256i>(m_mask))); +#else + m_value = value_type(_mm256_maskload_epi64( + reinterpret_cast(mem), static_cast<__m256i>(m_mask))); +#endif + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void gather_from( std::int64_t const* mem, @@ -1922,6 +2053,13 @@ class const_where_expression< static_cast<__m256i>(m_mask), static_cast<__m256i>(m_value)); } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_to(std::uint64_t* mem, + vector_aligned_tag) const { + _mm256_maskstore_epi64(reinterpret_cast(mem), + static_cast<__m256i>(m_mask), + static_cast<__m256i>(m_value)); + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void scatter_to( std::uint64_t* mem, @@ -1963,6 +2101,17 @@ class where_expression>, reinterpret_cast(mem), static_cast<__m256i>(m_mask))); #endif } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_from(std::uint64_t const* mem, + vector_aligned_tag) { +#ifdef KOKKOS_IMPL_WORKAROUND_ROCM_AVX2_ISSUE + __m256i tmp = _mm256_load_si256(reinterpret_cast<__m256i const*>(mem)); + m_value = value_type(_mm256_and_si256(tmp, static_cast<__m256i>(m_mask))); +#else + m_value = value_type(_mm256_maskload_epi64( + reinterpret_cast(mem), static_cast<__m256i>(m_mask))); +#endif + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void gather_from( std::uint64_t const* mem, diff --git a/simd/src/Kokkos_SIMD_AVX512.hpp b/simd/src/Kokkos_SIMD_AVX512.hpp index c5d1717ad4e..7fa35c204ae 100644 --- a/simd/src/Kokkos_SIMD_AVX512.hpp +++ b/simd/src/Kokkos_SIMD_AVX512.hpp @@ -193,10 +193,18 @@ class simd> { element_aligned_tag) { m_value = _mm512_loadu_pd(ptr); } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_from(value_type const* ptr, + vector_aligned_tag) { + m_value = _mm512_load_pd(ptr); + } KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_to( value_type* ptr, element_aligned_tag) const { _mm512_storeu_pd(ptr, m_value); } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_to(value_type* ptr, + vector_aligned_tag) const { + _mm512_store_pd(ptr, m_value); + } KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION constexpr explicit operator __m512d() const { return m_value; @@ 
-475,10 +483,18 @@ class simd> { element_aligned_tag) { m_value = _mm256_loadu_ps(ptr); } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_from(value_type const* ptr, + vector_aligned_tag) { + m_value = _mm256_load_ps(ptr); + } KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_to( value_type* ptr, element_aligned_tag) const { _mm256_storeu_ps(ptr, m_value); } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_to(value_type* ptr, + vector_aligned_tag) const { + _mm256_store_ps(ptr, m_value); + } KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION constexpr explicit operator __m256() const { return m_value; @@ -735,15 +751,25 @@ class simd> { operator[](std::size_t i) const { return reinterpret_cast(&m_value)[i]; } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_from(value_type const* ptr, + element_aligned_tag) { + m_value = _mm256_mask_loadu_epi32( + _mm256_set1_epi32(0), static_cast<__mmask8>(mask_type(true)), ptr); + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_from(value_type const* ptr, + vector_aligned_tag) { + m_value = _mm256_mask_load_epi32( + _mm256_set1_epi32(0), static_cast<__mmask8>(mask_type(true)), ptr); + } KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_to( value_type* ptr, element_aligned_tag) const { _mm256_mask_storeu_epi32(ptr, static_cast<__mmask8>(mask_type(true)), m_value); } - KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_from(value_type const* ptr, - element_aligned_tag) { - m_value = _mm256_mask_loadu_epi32( - _mm256_set1_epi32(0), static_cast<__mmask8>(mask_type(true)), ptr); + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_to(value_type* ptr, + vector_aligned_tag) const { + _mm256_mask_store_epi32(ptr, static_cast<__mmask8>(mask_type(true)), + m_value); } KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION constexpr explicit operator __m256i() const { @@ -934,21 +960,30 @@ class simd> { operator[](std::size_t i) const { return reinterpret_cast(&m_value)[i]; } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_from(value_type const* ptr, + element_aligned_tag) { + m_value = _mm256_mask_loadu_epi32( + _mm256_set1_epi32(0), static_cast<__mmask8>(mask_type(true)), ptr); + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_from(value_type const* ptr, + vector_aligned_tag) { + m_value = _mm256_mask_load_epi32( + _mm256_set1_epi32(0), static_cast<__mmask8>(mask_type(true)), ptr); + } KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_to( value_type* ptr, element_aligned_tag) const { _mm256_mask_storeu_epi32(ptr, static_cast<__mmask8>(mask_type(true)), m_value); } - KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_from(value_type const* ptr, - element_aligned_tag) { - m_value = _mm256_mask_loadu_epi32( - _mm256_set1_epi32(0), static_cast<__mmask8>(mask_type(true)), ptr); + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_to(value_type* ptr, + vector_aligned_tag) const { + _mm256_mask_store_epi32(ptr, static_cast<__mmask8>(mask_type(true)), + m_value); } KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION constexpr explicit operator __m256i() const { return m_value; } - [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend simd operator*( simd const& lhs, simd const& rhs) noexcept { return simd(_mm256_mullo_epi32(static_cast<__m256i>(lhs), @@ -1130,10 +1165,19 @@ class simd> { element_aligned_tag) { m_value = _mm512_loadu_si512(ptr); } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_from(value_type const* ptr, + vector_aligned_tag) { + m_value = _mm512_load_si512(ptr); + } KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_to( value_type* ptr, element_aligned_tag) const { _mm512_storeu_si512(ptr, 
m_value); } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_to(value_type* ptr, + vector_aligned_tag) const { + _mm512_store_si512(ptr, m_value); + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION constexpr explicit operator __m512i() const { return m_value; @@ -1331,10 +1375,19 @@ class simd> { element_aligned_tag) { m_value = _mm512_loadu_si512(ptr); } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_from(value_type const* ptr, + vector_aligned_tag) { + m_value = _mm512_load_si512(ptr); + } KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_to( value_type* ptr, element_aligned_tag) const { _mm512_storeu_si512(ptr, m_value); } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_to(value_type* ptr, + vector_aligned_tag) const { + _mm512_store_si512(ptr, m_value); + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION constexpr explicit operator __m512i() const { return m_value; @@ -1505,6 +1558,11 @@ class const_where_expression>, static_cast<__m512d>(m_value)); } KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION + void copy_to(double* mem, vector_aligned_tag) const { + _mm512_mask_store_pd(mem, static_cast<__mmask8>(m_mask), + static_cast<__m512d>(m_value)); + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void scatter_to( double* mem, simd> const& index) const { @@ -1541,6 +1599,11 @@ class where_expression>, _mm512_set1_pd(0.0), static_cast<__mmask8>(m_mask), mem)); } KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION + void copy_from(double const* mem, vector_aligned_tag) { + m_value = value_type(_mm512_mask_load_pd( + _mm512_set1_pd(0.0), static_cast<__mmask8>(m_mask), mem)); + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void gather_from( double const* mem, simd> const& index) { @@ -1584,6 +1647,11 @@ class const_where_expression>, static_cast<__m256>(m_value)); } KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION + void copy_to(float* mem, vector_aligned_tag) const { + _mm256_mask_store_ps(mem, static_cast<__mmask8>(m_mask), + static_cast<__m256>(m_value)); + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void scatter_to( float* mem, simd> const& index) const { @@ -1619,6 +1687,10 @@ class where_expression>, m_value = value_type(_mm256_mask_loadu_ps( _mm256_set1_ps(0.0), static_cast<__mmask8>(m_mask), mem)); } + void copy_from(float const* mem, vector_aligned_tag) { + m_value = value_type(_mm256_mask_load_ps( + _mm256_set1_ps(0.0), static_cast<__mmask8>(m_mask), mem)); + } KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void gather_from( float const* mem, @@ -1666,6 +1738,12 @@ class const_where_expression< _mm256_mask_storeu_epi32(mem, static_cast<__mmask8>(m_mask), static_cast<__m256i>(m_value)); } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION + void copy_to(std::int32_t* mem, vector_aligned_tag) const { + _mm256_mask_store_epi32(mem, static_cast<__mmask8>(m_mask), + static_cast<__m256i>(m_value)); + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void scatter_to( std::int32_t* mem, @@ -1702,6 +1780,11 @@ class where_expression>, m_value = value_type(_mm256_mask_loadu_epi32( _mm256_set1_epi32(0), static_cast<__mmask8>(m_mask), mem)); } + void copy_from(std::int32_t const* mem, vector_aligned_tag) { + m_value = value_type(_mm256_mask_load_epi32( + _mm256_set1_epi32(0), static_cast<__mmask8>(m_mask), mem)); + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void gather_from( std::int32_t const* mem, @@ -1710,6 +1793,7 @@ class where_expression>, static_cast<__m256i>(m_value), static_cast<__mmask8>(m_mask), static_cast<__m256i>(index), mem, 4)); } + template (m_mask), static_cast<__m256i>(m_value)); } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION + void copy_to(std::uint32_t* mem, 
vector_aligned_tag) const { + _mm256_mask_store_epi32(mem, static_cast<__mmask8>(m_mask), + static_cast<__m256i>(m_value)); + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void scatter_to( std::uint32_t* mem, @@ -1784,6 +1874,12 @@ class where_expression>, m_value = value_type(_mm256_mask_loadu_epi32( _mm256_set1_epi32(0), static_cast<__mmask8>(m_mask), mem)); } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION + void copy_from(std::uint32_t const* mem, vector_aligned_tag) { + m_value = value_type(_mm256_mask_load_epi32( + _mm256_set1_epi32(0), static_cast<__mmask8>(m_mask), mem)); + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void gather_from( std::uint32_t const* mem, @@ -1792,6 +1888,7 @@ class where_expression>, static_cast<__m256i>(m_value), static_cast<__mmask8>(m_mask), static_cast<__m256i>(index), mem, 4)); } + template (m_mask), static_cast<__m512i>(m_value)); } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION + void copy_to(std::int64_t* mem, vector_aligned_tag) const { + _mm512_mask_store_epi64(mem, static_cast<__mmask8>(m_mask), + static_cast<__m512i>(m_value)); + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void scatter_to( std::int64_t* mem, @@ -1866,6 +1969,12 @@ class where_expression>, m_value = value_type(_mm512_mask_loadu_epi64( _mm512_set1_epi64(0.0), static_cast<__mmask8>(m_mask), mem)); } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION + void copy_from(std::int64_t const* mem, vector_aligned_tag) { + m_value = value_type(_mm512_mask_load_epi64( + _mm512_set1_epi64(0.0), static_cast<__mmask8>(m_mask), mem)); + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void gather_from( std::int64_t const* mem, @@ -1874,6 +1983,7 @@ class where_expression>, static_cast<__m512i>(m_value), static_cast<__mmask8>(m_mask), static_cast<__m256i>(index), mem, 8)); } + template (m_mask), static_cast<__m512i>(m_value)); } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION + void copy_to(std::uint64_t* mem, vector_aligned_tag) const { + _mm512_mask_store_epi64(mem, static_cast<__mmask8>(m_mask), + static_cast<__m512i>(m_value)); + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void scatter_to( std::uint64_t* mem, @@ -1949,6 +2065,11 @@ class where_expression>, _mm512_set1_epi64(0.0), static_cast<__mmask8>(m_mask), mem)); } KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION + void copy_from(std::uint64_t const* mem, vector_aligned_tag) { + m_value = value_type(_mm512_mask_load_epi64( + _mm512_set1_epi64(0.0), static_cast<__mmask8>(m_mask), mem)); + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void gather_from( std::uint64_t const* mem, simd> const& index) { @@ -1956,6 +2077,7 @@ class where_expression>, static_cast<__m512i>(m_value), static_cast<__mmask8>(m_mask), static_cast<__m256i>(index), mem, 8)); } + template class simd_mask; -struct element_aligned_tag {}; +class simd_alignment_vector_aligned {}; + +template <typename... Flags> +struct simd_flags {}; + +inline constexpr simd_flags<> simd_flag_default{}; +inline constexpr simd_flags<simd_alignment_vector_aligned> simd_flag_aligned{}; + +using element_aligned_tag = simd_flags<>; +using vector_aligned_tag = simd_flags<simd_alignment_vector_aligned>; // class template declarations for const_where_expression and where_expression @@ -117,48 +126,6 @@ template return const_where_expression(mask, value); } -// fallback simd multiplication using generator constructor -// At the time of this writing, this fallback is only used -// to multiply vectors of 64-bit signed integers for the AVX2 backend - -template -[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd operator*( - simd const& lhs, simd const& rhs) { - return simd([&](std::size_t i) { return lhs[i] * rhs[i]; }); -} - -// fallback simd 
shift using generator constructor -// At the time of this edit, only the fallback for shift vectors of -// 64-bit signed integers for the AVX2 backend is used - -template >> -[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd operator>>( - simd const& lhs, int rhs) { - return simd([&](std::size_t i) { return lhs[i] >> rhs; }); -} - -template >> -[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd operator<<( - simd const& lhs, int rhs) { - return simd([&](std::size_t i) { return lhs[i] << rhs; }); -} - -template >> -[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd operator>>( - simd const& lhs, simd const& rhs) { - return simd([&](std::size_t i) { return lhs[i] >> rhs[i]; }); -} - -template >> -[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd operator<<( - simd const& lhs, simd const& rhs) { - return simd([&](std::size_t i) { return lhs[i] << rhs[i]; }); -} - // The code below provides: // operator@(simd, Arithmetic) // operator@(Arithmetic, simd) diff --git a/simd/src/Kokkos_SIMD_NEON.hpp b/simd/src/Kokkos_SIMD_NEON.hpp index 43ece203890..efc81135d16 100644 --- a/simd/src/Kokkos_SIMD_NEON.hpp +++ b/simd/src/Kokkos_SIMD_NEON.hpp @@ -363,10 +363,18 @@ class simd> { element_aligned_tag) { m_value = vld1q_f64(ptr); } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_from(value_type const* ptr, + vector_aligned_tag) { + m_value = vld1q_f64(ptr); + } KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_to( value_type* ptr, element_aligned_tag) const { vst1q_f64(ptr, m_value); } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_to(value_type* ptr, + vector_aligned_tag) const { + vst1q_f64(ptr, m_value); + } KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION constexpr explicit operator float64x2_t() const { return m_value; @@ -607,10 +615,18 @@ class simd> { element_aligned_tag) { m_value = vld1_f32(ptr); } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_from(value_type const* ptr, + vector_aligned_tag) { + m_value = vld1_f32(ptr); + } KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_to( value_type* ptr, element_aligned_tag) const { vst1_f32(ptr, m_value); } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_to(value_type* ptr, + vector_aligned_tag) const { + vst1_f32(ptr, m_value); + } KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION constexpr explicit operator float32x2_t() const { return m_value; @@ -844,10 +860,18 @@ class simd> { element_aligned_tag) { m_value = vld1_s32(ptr); } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_from(value_type const* ptr, + vector_aligned_tag) { + m_value = vld1_s32(ptr); + } KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_to( value_type* ptr, element_aligned_tag) const { vst1_s32(ptr, m_value); } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_to(value_type* ptr, + vector_aligned_tag) const { + vst1_s32(ptr, m_value); + } KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION constexpr explicit operator int32x2_t() const { return m_value; @@ -868,7 +892,11 @@ class simd> { return simd( vadd_s32(static_cast(lhs), static_cast(rhs))); } - + [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend simd operator*( + simd const& lhs, simd const& rhs) noexcept { + return simd( + vmul_s32(static_cast(lhs), static_cast(rhs))); + } [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend mask_type operator==(simd const& lhs, simd const& rhs) noexcept { return mask_type( @@ -1044,10 +1072,18 @@ class simd> { element_aligned_tag) { m_value = vld1q_s64(ptr); } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_from(value_type const* ptr, + vector_aligned_tag) { + m_value = 
vld1q_s64(ptr); + } KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_to( value_type* ptr, element_aligned_tag) const { vst1q_s64(ptr, m_value); } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_to(value_type* ptr, + vector_aligned_tag) const { + vst1q_s64(ptr, m_value); + } KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION constexpr explicit operator int64x2_t() const { return m_value; @@ -1068,7 +1104,10 @@ class simd> { return simd( vaddq_s64(static_cast(lhs), static_cast(rhs))); } - + [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend simd operator*( + simd const& lhs, simd const& rhs) noexcept { + return simd([&](std::size_t i) { return lhs[i] * rhs[i]; }); + } [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend mask_type operator==(simd const& lhs, simd const& rhs) noexcept { return mask_type( @@ -1246,6 +1285,18 @@ class simd> { element_aligned_tag) { m_value = vld1q_u64(ptr); } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_from(value_type const* ptr, + vector_aligned_tag) { + m_value = vld1q_u64(ptr); + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_to( + value_type* ptr, element_aligned_tag) const { + vst1q_u64(ptr, m_value); + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_to(value_type* ptr, + vector_aligned_tag) const { + vst1q_u64(ptr, m_value); + } KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION constexpr explicit operator uint64x2_t() const { return m_value; @@ -1261,7 +1312,10 @@ class simd> { return simd( vaddq_u64(static_cast(lhs), static_cast(rhs))); } - + [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend simd operator*( + simd const& lhs, simd const& rhs) noexcept { + return simd([&](std::size_t i) { return lhs[i] * rhs[i]; }); + } [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend simd operator&( simd const& lhs, simd const& rhs) noexcept { return simd( @@ -1386,6 +1440,11 @@ class const_where_expression>, if (m_mask[1]) mem[1] = m_value[1]; } KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION + void copy_to(double* mem, vector_aligned_tag) const { + if (m_mask[0]) mem[0] = m_value[0]; + if (m_mask[1]) mem[1] = m_value[1]; + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void scatter_to( double* mem, simd> const& index) const { @@ -1421,6 +1480,11 @@ class where_expression>, if (m_mask[1]) m_value[1] = mem[1]; } KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION + void copy_from(double const* mem, vector_aligned_tag) { + if (m_mask[0]) m_value[0] = mem[0]; + if (m_mask[1]) m_value[1] = mem[1]; + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void gather_from( double const* mem, simd> const& index) { @@ -1464,6 +1528,11 @@ class const_where_expression>, if (m_mask[1]) mem[1] = m_value[1]; } KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION + void copy_to(float* mem, vector_aligned_tag) const { + if (m_mask[0]) mem[0] = m_value[0]; + if (m_mask[1]) mem[1] = m_value[1]; + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void scatter_to( float* mem, simd> const& index) const { @@ -1498,6 +1567,10 @@ class where_expression>, if (m_mask[0]) m_value[0] = mem[0]; if (m_mask[1]) m_value[1] = mem[1]; } + void copy_from(float const* mem, vector_aligned_tag) { + if (m_mask[0]) m_value[0] = mem[0]; + if (m_mask[1]) m_value[1] = mem[1]; + } KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void gather_from( float const* mem, @@ -1542,6 +1615,12 @@ class const_where_expression< if (m_mask[0]) mem[0] = m_value[0]; if (m_mask[1]) mem[1] = m_value[1]; } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION + void copy_to(std::int32_t* mem, vector_aligned_tag) const { + if (m_mask[0]) mem[0] = m_value[0]; + if (m_mask[1]) mem[1] = m_value[1]; 
+ } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void scatter_to( std::int32_t* mem, @@ -1577,6 +1656,12 @@ class where_expression>, if (m_mask[0]) m_value[0] = mem[0]; if (m_mask[1]) m_value[1] = mem[1]; } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION + void copy_from(std::int32_t const* mem, vector_aligned_tag) { + if (m_mask[0]) m_value[0] = mem[0]; + if (m_mask[1]) m_value[1] = mem[1]; + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void gather_from( std::int32_t const* mem, @@ -1584,6 +1669,7 @@ class where_expression>, if (m_mask[0]) m_value[0] = mem[index[0]]; if (m_mask[1]) m_value[1] = mem[index[1]]; } + template < class U, std::enable_if_t< @@ -1622,6 +1708,12 @@ class const_where_expression< if (m_mask[0]) mem[0] = m_value[0]; if (m_mask[1]) mem[1] = m_value[1]; } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION + void copy_to(std::int64_t* mem, vector_aligned_tag) const { + if (m_mask[0]) mem[0] = m_value[0]; + if (m_mask[1]) mem[1] = m_value[1]; + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void scatter_to( std::int64_t* mem, @@ -1657,6 +1749,12 @@ class where_expression>, if (m_mask[0]) m_value[0] = mem[0]; if (m_mask[1]) m_value[1] = mem[1]; } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION + void copy_from(std::int64_t const* mem, vector_aligned_tag) { + if (m_mask[0]) m_value[0] = mem[0]; + if (m_mask[1]) m_value[1] = mem[1]; + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void gather_from( std::int64_t const* mem, @@ -1664,6 +1762,7 @@ class where_expression>, if (m_mask[0]) m_value[0] = mem[index[0]]; if (m_mask[1]) m_value[1] = mem[index[1]]; } + template < class U, std::enable_if_t>, if (m_mask[0]) m_value[0] = mem[0]; if (m_mask[1]) m_value[1] = mem[1]; } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION + void copy_from(std::uint64_t const* mem, vector_aligned_tag) { + if (m_mask[0]) m_value[0] = mem[0]; + if (m_mask[1]) m_value[1] = mem[1]; + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void gather_from( std::uint64_t const* mem, @@ -1744,6 +1855,7 @@ class where_expression>, if (m_mask[0]) m_value[0] = mem[index[0]]; if (m_mask[1]) m_value[1] = mem[index[1]]; } + template { element_aligned_tag) { m_value = *ptr; } + KOKKOS_FORCEINLINE_FUNCTION void copy_from(T const* ptr, vector_aligned_tag) { + m_value = *ptr; + } KOKKOS_FORCEINLINE_FUNCTION void copy_to(T* ptr, element_aligned_tag) const { *ptr = m_value; } + KOKKOS_FORCEINLINE_FUNCTION void copy_to(T* ptr, vector_aligned_tag) const { + *ptr = m_value; + } + KOKKOS_FORCEINLINE_FUNCTION reference operator[](std::size_t) { return m_value; } @@ -308,6 +315,10 @@ class const_where_expression, void copy_to(T* mem, element_aligned_tag) const { if (static_cast(m_mask)) *mem = static_cast(m_value); } + KOKKOS_FORCEINLINE_FUNCTION + void copy_to(T* mem, vector_aligned_tag) const { + if (static_cast(m_mask)) *mem = static_cast(m_value); + } template KOKKOS_FORCEINLINE_FUNCTION std::enable_if_t> scatter_to(T* mem, simd const& index) const { @@ -315,13 +326,13 @@ class const_where_expression, mem[static_cast(index)] = static_cast(m_value); } - [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION value_type const& - impl_get_value() const { + [[nodiscard]] KOKKOS_FORCEINLINE_FUNCTION value_type const& impl_get_value() + const { return m_value; } - [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION mask_type const& - impl_get_mask() const { + [[nodiscard]] KOKKOS_FORCEINLINE_FUNCTION mask_type const& impl_get_mask() + const { return m_mask; } }; @@ -344,6 +355,10 @@ class where_expression, void copy_from(T const* mem, element_aligned_tag) { if (static_cast(this->m_mask)) 
this->m_value = *mem; } + KOKKOS_FORCEINLINE_FUNCTION + void copy_from(T const* mem, vector_aligned_tag) { + if (static_cast(this->m_mask)) this->m_value = *mem; + } template KOKKOS_FORCEINLINE_FUNCTION std::enable_if_t> gather_from(T const* mem, simd const& index) { diff --git a/simd/unit_tests/TestSIMD.cpp b/simd/unit_tests/TestSIMD.cpp index 61c076e8246..7a1f9be2a0f 100644 --- a/simd/unit_tests/TestSIMD.cpp +++ b/simd/unit_tests/TestSIMD.cpp @@ -21,3 +21,4 @@ #include #include #include +#include diff --git a/simd/unit_tests/include/SIMDTesting_Ops.hpp b/simd/unit_tests/include/SIMDTesting_Ops.hpp index 6529f20e66a..c587ccf3046 100644 --- a/simd/unit_tests/include/SIMDTesting_Ops.hpp +++ b/simd/unit_tests/include/SIMDTesting_Ops.hpp @@ -209,4 +209,165 @@ class shift_left { } }; +class cbrt_op { + public: + template + auto on_host(T const& a) const { +#if defined(KOKKOS_ENABLE_DEPRECATED_CODE_4) + return Kokkos::Experimental::cbrt(a); +#else + return Kokkos::cbrt(a); +#endif + } + template + auto on_host_serial(T const& a) const { + return Kokkos::cbrt(a); + } +}; + +class exp_op { + public: + template + auto on_host(T const& a) const { +#if defined(KOKKOS_ENABLE_DEPRECATED_CODE_4) + return Kokkos::Experimental::exp(a); +#else + return Kokkos::exp(a); +#endif + } + template + auto on_host_serial(T const& a) const { + return Kokkos::exp(a); + } +}; + +class log_op { + public: + template + auto on_host(T const& a) const { +#if defined(KOKKOS_ENABLE_DEPRECATED_CODE_4) + return Kokkos::Experimental::log(a); +#else + return Kokkos::log(a); +#endif + } + template + auto on_host_serial(T const& a) const { + return Kokkos::log(a); + } +}; + +class hmin { + public: + template + auto on_host(T const& a) const { + return Kokkos::Experimental::hmin(a); + } + template + auto on_host_serial(T const& a) const { + using DataType = typename T::value_type::value_type; + + auto const& v = a.impl_get_value(); + auto const& m = a.impl_get_mask(); + auto result = Kokkos::reduction_identity::min(); + for (std::size_t i = 0; i < v.size(); ++i) { + if (m[i]) result = Kokkos::min(result, v[i]); + } + return result; + } + + template + KOKKOS_INLINE_FUNCTION auto on_device(T const& a) const { + return Kokkos::Experimental::hmin(a); + } + template + KOKKOS_INLINE_FUNCTION auto on_device_serial(T const& a) const { + using DataType = typename T::value_type::value_type; + + auto const& v = a.impl_get_value(); + auto const& m = a.impl_get_mask(); + auto result = Kokkos::reduction_identity::min(); + for (std::size_t i = 0; i < v.size(); ++i) { + if (m[i]) result = Kokkos::min(result, v[i]); + } + return result; + } +}; + +class hmax { + public: + template + auto on_host(T const& a) const { + return Kokkos::Experimental::hmax(a); + } + template + auto on_host_serial(T const& a) const { + using DataType = typename T::value_type::value_type; + + auto const& v = a.impl_get_value(); + auto const& m = a.impl_get_mask(); + auto result = Kokkos::reduction_identity::max(); + for (std::size_t i = 0; i < v.size(); ++i) { + if (m[i]) result = Kokkos::max(result, v[i]); + } + return result; + } + + template + KOKKOS_INLINE_FUNCTION auto on_device(T const& a) const { + return Kokkos::Experimental::hmax(a); + } + template + KOKKOS_INLINE_FUNCTION auto on_device_serial(T const& a) const { + using DataType = typename T::value_type::value_type; + + auto const& v = a.impl_get_value(); + auto const& m = a.impl_get_mask(); + auto result = Kokkos::reduction_identity::max(); + for (std::size_t i = 0; i < v.size(); ++i) { + if (m[i]) result = 
Kokkos::max(result, v[i]); + } + return result; + } +}; + +class reduce { + public: + template + auto on_host(T const& a) const { + using DataType = typename T::value_type::value_type; + return Kokkos::Experimental::reduce(a, DataType(0), std::plus<>()); + } + template + auto on_host_serial(T const& a) const { + using DataType = typename T::value_type::value_type; + + auto const& v = a.impl_get_value(); + auto const& m = a.impl_get_mask(); + auto result = Kokkos::reduction_identity::sum(); + for (std::size_t i = 0; i < v.size(); ++i) { + if (m[i]) result += v[i]; + } + return result; + } + + template + KOKKOS_INLINE_FUNCTION auto on_device(T const& a) const { + using DataType = typename T::value_type::value_type; + return Kokkos::Experimental::reduce(a, DataType(0), std::plus<>()); + } + template + KOKKOS_INLINE_FUNCTION auto on_device_serial(T const& a) const { + using DataType = typename T::value_type::value_type; + + auto const& v = a.impl_get_value(); + auto const& m = a.impl_get_mask(); + auto result = Kokkos::reduction_identity::sum(); + for (std::size_t i = 0; i < v.size(); ++i) { + if (m[i]) result += v[i]; + } + return result; + } +}; + #endif diff --git a/simd/unit_tests/include/SIMDTesting_Utilities.hpp b/simd/unit_tests/include/SIMDTesting_Utilities.hpp index ae2ab2c697c..d36e1e5afc5 100644 --- a/simd/unit_tests/include/SIMDTesting_Utilities.hpp +++ b/simd/unit_tests/include/SIMDTesting_Utilities.hpp @@ -93,7 +93,7 @@ class load_element_aligned { bool host_load(T const* mem, std::size_t n, Kokkos::Experimental::simd& result) const { if (n < result.size()) return false; - result.copy_from(mem, Kokkos::Experimental::element_aligned_tag()); + result.copy_from(mem, Kokkos::Experimental::simd_flag_default); return true; } template @@ -101,7 +101,26 @@ class load_element_aligned { T const* mem, std::size_t n, Kokkos::Experimental::simd& result) const { if (n < result.size()) return false; - result.copy_from(mem, Kokkos::Experimental::element_aligned_tag()); + result.copy_from(mem, Kokkos::Experimental::simd_flag_default); + return true; + } +}; + +class load_vector_aligned { + public: + template + bool host_load(T const* mem, std::size_t n, + Kokkos::Experimental::simd& result) const { + if (n < result.size()) return false; + result.copy_from(mem, Kokkos::Experimental::simd_flag_aligned); + return true; + } + template + KOKKOS_INLINE_FUNCTION bool device_load( + T const* mem, std::size_t n, + Kokkos::Experimental::simd& result) const { + if (n < result.size()) return false; + result.copy_from(mem, Kokkos::Experimental::simd_flag_aligned); return true; } }; @@ -116,8 +135,7 @@ class load_masked { for (std::size_t i = 0; i < n; ++i) { mask[i] = true; } - where(mask, result) - .copy_from(mem, Kokkos::Experimental::element_aligned_tag()); + where(mask, result).copy_from(mem, Kokkos::Experimental::simd_flag_default); where(!mask, result) = 0; return true; } @@ -130,8 +148,7 @@ class load_masked { for (std::size_t i = 0; i < n; ++i) { mask[i] = true; } - where(mask, result) - .copy_from(mem, Kokkos::Experimental::element_aligned_tag()); + where(mask, result).copy_from(mem, Kokkos::Experimental::simd_flag_default); where(!mask, result) = T(0); return true; } diff --git a/simd/unit_tests/include/TestSIMD_GeneratorCtors.hpp b/simd/unit_tests/include/TestSIMD_GeneratorCtors.hpp index 4af08c266bb..23e3826c752 100644 --- a/simd/unit_tests/include/TestSIMD_GeneratorCtors.hpp +++ b/simd/unit_tests/include/TestSIMD_GeneratorCtors.hpp @@ -37,10 +37,10 @@ inline void host_check_gen_ctor() { } 
simd_type rhs; - rhs.copy_from(init, Kokkos::Experimental::element_aligned_tag()); + rhs.copy_from(init, Kokkos::Experimental::simd_flag_default); simd_type blend; - blend.copy_from(expected, Kokkos::Experimental::element_aligned_tag()); + blend.copy_from(expected, Kokkos::Experimental::simd_flag_default); #if !(defined(KOKKOS_ENABLE_CUDA) && defined(KOKKOS_COMPILER_MSVC)) if constexpr (std::is_same_v) { @@ -98,7 +98,7 @@ KOKKOS_INLINE_FUNCTION void device_check_gen_ctor() { simd_type basic(KOKKOS_LAMBDA(std::size_t i) { return init[i]; }); simd_type rhs; - rhs.copy_from(init, Kokkos::Experimental::element_aligned_tag()); + rhs.copy_from(init, Kokkos::Experimental::simd_flag_default); device_check_equality(basic, rhs, lanes); simd_type lhs(KOKKOS_LAMBDA(std::size_t i) { return init[i] * 9; }); @@ -106,7 +106,7 @@ KOKKOS_INLINE_FUNCTION void device_check_gen_ctor() { KOKKOS_LAMBDA(std::size_t i) { return (mask[i]) ? lhs[i] : rhs[i]; }); simd_type blend; - blend.copy_from(expected, Kokkos::Experimental::element_aligned_tag()); + blend.copy_from(expected, Kokkos::Experimental::simd_flag_default); device_check_equality(result, blend, lanes); } diff --git a/simd/unit_tests/include/TestSIMD_MathOps.hpp b/simd/unit_tests/include/TestSIMD_MathOps.hpp index 802e41efe5f..59f2f6c18fd 100644 --- a/simd/unit_tests/include/TestSIMD_MathOps.hpp +++ b/simd/unit_tests/include/TestSIMD_MathOps.hpp @@ -61,13 +61,18 @@ void host_check_math_op_one_loader(UnaryOp unary_op, std::size_t n, simd_type arg; bool const loaded_arg = loader.host_load(args + i, nlanes, arg); if (!loaded_arg) continue; - auto computed_result = unary_op.on_host(arg); - decltype(computed_result) expected_result; + decltype(unary_op.on_host(arg)) expected_result; for (std::size_t lane = 0; lane < simd_type::size(); ++lane) { - if (lane < nlanes) + if (lane < nlanes) { + if constexpr (std::is_same_v || + std::is_same_v || + std::is_same_v) + arg[lane] = Kokkos::abs(arg[lane]); expected_result[lane] = unary_op.on_host_serial(T(arg[lane])); + } } + auto computed_result = unary_op.on_host(arg); host_check_equality(expected_result, computed_result, nlanes); } } @@ -78,6 +83,7 @@ inline void host_check_math_op_all_loaders(Op op, std::size_t n, host_check_math_op_one_loader(op, n, args...); host_check_math_op_one_loader(op, n, args...); host_check_math_op_one_loader(op, n, args...); + host_check_math_op_one_loader(op, n, args...); } template @@ -96,6 +102,13 @@ inline void host_check_all_math_ops(const DataType (&first_args)[n], // TODO: Place fallback implementations for all simd integer types if constexpr (std::is_floating_point_v) { host_check_math_op_all_loaders(divides(), n, first_args, second_args); + +#if defined(__INTEL_COMPILER) && \ + (defined(KOKKOS_ARCH_AVX2) || defined(KOKKOS_ARCH_AVX512XEON)) + host_check_math_op_all_loaders(cbrt_op(), n, first_args); + host_check_math_op_all_loaders(exp_op(), n, first_args); + host_check_math_op_all_loaders(log_op(), n, first_args); +#endif } } @@ -109,23 +122,29 @@ inline void host_check_abi_size() { template inline void host_check_math_ops() { constexpr size_t n = 11; + constexpr size_t alignment = + Kokkos::Experimental::simd::size() * sizeof(DataType); host_check_abi_size(); if constexpr (!std::is_integral_v) { - DataType const first_args[n] = {0.1, 0.4, 0.5, 0.7, 1.0, 1.5, - -2.0, 10.0, 0.0, 1.2, -2.8}; - DataType const second_args[n] = {1.0, 0.2, 1.1, 1.8, -0.1, -3.0, - -2.4, 1.0, 13.0, -3.2, -2.1}; + alignas(alignment) DataType const first_args[n] = { + 0.1, 0.4, 0.5, 0.7, 1.0, 1.5, -2.0, 
10.0, 0.0, 1.2, -2.8}; + alignas(alignment) DataType const second_args[n] = { + 1.0, 0.2, 1.1, 1.8, -0.1, -3.0, -2.4, 1.0, 13.0, -3.2, -2.1}; host_check_all_math_ops(first_args, second_args); } else { if constexpr (std::is_signed_v) { - DataType const first_args[n] = {1, 2, -1, 10, 0, 1, -2, 10, 0, 1, -2}; - DataType const second_args[n] = {1, 2, 1, 1, 1, -3, -2, 1, 13, -3, -2}; + alignas(alignment) + DataType const first_args[n] = {1, 2, -1, 10, 0, 1, -2, 10, 0, 1, -2}; + alignas(alignment) DataType const second_args[n] = {1, 2, 1, 1, 1, -3, + -2, 1, 13, -3, -2}; host_check_all_math_ops(first_args, second_args); } else { - DataType const first_args[n] = {1, 2, 1, 10, 0, 1, 2, 10, 0, 1, 2}; - DataType const second_args[n] = {1, 2, 1, 1, 1, 3, 2, 1, 13, 3, 2}; + alignas(alignment) + DataType const first_args[n] = {1, 2, 1, 10, 0, 1, 2, 10, 0, 1, 2}; + alignas(alignment) + DataType const second_args[n] = {1, 2, 1, 1, 1, 3, 2, 1, 13, 3, 2}; host_check_all_math_ops(first_args, second_args); } } @@ -202,6 +221,7 @@ KOKKOS_INLINE_FUNCTION void device_check_math_op_all_loaders(Op op, device_check_math_op_one_loader(op, n, args...); device_check_math_op_one_loader(op, n, args...); device_check_math_op_one_loader(op, n, args...); + device_check_math_op_one_loader(op, n, args...); } template @@ -282,8 +302,13 @@ TEST(simd, host_math_ops) { } TEST(simd, device_math_ops) { - Kokkos::parallel_for(Kokkos::RangePolicy>(0, 1), - simd_device_math_ops_functor()); +#ifdef KOKKOS_ENABLE_OPENMPTARGET // FIXME_OPENMPTARGET + GTEST_SKIP() + << "skipping because of a non-deterministic failure reporting: " + "Failure to synchronize stream (nil): Error in " + "cuStreamSynchronize: an illegal memory access was encountered"; +#endif + Kokkos::parallel_for(1, simd_device_math_ops_functor()); } #endif diff --git a/simd/unit_tests/include/TestSIMD_Reductions.hpp b/simd/unit_tests/include/TestSIMD_Reductions.hpp new file mode 100644 index 00000000000..b3c7ac9a01e --- /dev/null +++ b/simd/unit_tests/include/TestSIMD_Reductions.hpp @@ -0,0 +1,184 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOS_TEST_SIMD_REDUCTIONS_HPP +#define KOKKOS_TEST_SIMD_REDUCTIONS_HPP + +#include +#include + +template +inline void host_check_reduction_one_loader(ReductionOp reduce_op, + std::size_t n, T const* args) { + Loader loader; + using simd_type = Kokkos::Experimental::simd; + using mask_type = typename Kokkos::Experimental::simd::mask_type; + constexpr std::size_t width = simd_type::size(); + + for (std::size_t i = 0; i < n; i += width) { + std::size_t const nremaining = n - i; + std::size_t const nlanes = Kokkos::min(nremaining, width); + simd_type arg; + bool const loaded_arg = loader.host_load(args + i, nlanes, arg); + if (!loaded_arg) continue; + + mask_type mask(false); + for (std::size_t j = 0; j < nlanes; ++j) { + mask[j] = true; + } + auto value = where(mask, arg); + auto expected = reduce_op.on_host_serial(value); + auto computed = reduce_op.on_host(value); + + gtest_checker().equality(expected, computed); + } +} + +template +inline void host_check_reduction_all_loaders(ReductionOp reduce_op, + std::size_t n, T const* args) { + host_check_reduction_one_loader(reduce_op, n, + args); + host_check_reduction_one_loader(reduce_op, n, args); + host_check_reduction_one_loader(reduce_op, n, args); +} + +template +inline void host_check_all_reductions(const DataType (&args)[n]) { + host_check_reduction_all_loaders(hmin(), n, args); + host_check_reduction_all_loaders(hmax(), n, args); + host_check_reduction_all_loaders(reduce(), n, args); +} + +template +inline void host_check_reductions() { + constexpr size_t n = 11; + + if constexpr (std::is_signed_v) { + DataType const args[n] = {1, 2, -1, 10, 0, 1, -2, 10, 0, 1, -2}; + host_check_all_reductions(args); + } else { + DataType const args[n] = {1, 2, 1, 10, 0, 1, 2, 10, 0, 1, 2}; + host_check_all_reductions(args); + } +} + +template +inline void host_check_reductions_all_types( + Kokkos::Experimental::Impl::data_types) { + (host_check_reductions(), ...); +} + +template +inline void host_check_reductions_all_abis( + Kokkos::Experimental::Impl::abi_set) { + using DataTypes = Kokkos::Experimental::Impl::data_type_set; + (host_check_reductions_all_types(DataTypes()), ...); +} + +template +KOKKOS_INLINE_FUNCTION void device_check_reduction_one_loader( + ReductionOp reduce_op, std::size_t n, T const* args) { + Loader loader; + using simd_type = Kokkos::Experimental::simd; + using mask_type = typename Kokkos::Experimental::simd::mask_type; + constexpr std::size_t width = simd_type::size(); + + for (std::size_t i = 0; i < n; i += width) { + std::size_t const nremaining = n - i; + std::size_t const nlanes = Kokkos::min(nremaining, width); + simd_type arg; + bool const loaded_arg = loader.device_load(args + i, nlanes, arg); + if (!loaded_arg) continue; + + mask_type mask(false); + for (std::size_t j = 0; j < nlanes; ++j) { + mask[j] = true; + } + auto value = where(mask, arg); + auto expected = reduce_op.on_device_serial(value); + auto computed = reduce_op.on_device(value); + + kokkos_checker().equality(expected, computed); + } +} + +template +KOKKOS_INLINE_FUNCTION void device_check_reduction_all_loaders( + ReductionOp reduce_op, std::size_t n, T const* args) { + device_check_reduction_one_loader(reduce_op, n, + args); + device_check_reduction_one_loader(reduce_op, n, args); + device_check_reduction_one_loader(reduce_op, n, args); +} + +template +KOKKOS_INLINE_FUNCTION void device_check_all_reductions( + const DataType (&args)[n]) { + device_check_reduction_all_loaders(hmin(), 
n, args); + device_check_reduction_all_loaders(hmax(), n, args); + device_check_reduction_all_loaders(reduce(), n, args); +} + +template +KOKKOS_INLINE_FUNCTION void device_check_reductions() { + constexpr size_t n = 11; + + if constexpr (std::is_signed_v) { + DataType const args[n] = {1, 2, -1, 10, 0, 1, -2, 10, 0, 1, -2}; + device_check_all_reductions(args); + } else { + DataType const args[n] = {1, 2, 1, 10, 0, 1, 2, 10, 0, 1, 2}; + device_check_all_reductions(args); + } +} + +template +KOKKOS_INLINE_FUNCTION void device_check_reductions_all_types( + Kokkos::Experimental::Impl::data_types) { + (device_check_reductions(), ...); +} + +template +KOKKOS_INLINE_FUNCTION void device_check_reductions_all_abis( + Kokkos::Experimental::Impl::abi_set) { + using DataTypes = Kokkos::Experimental::Impl::data_type_set; + (device_check_reductions_all_types(DataTypes()), ...); +} + +class simd_device_reduction_functor { + public: + KOKKOS_INLINE_FUNCTION void operator()(int) const { + device_check_reductions_all_abis( + Kokkos::Experimental::Impl::device_abi_set()); + } +}; + +TEST(simd, host_reductions) { + host_check_reductions_all_abis(Kokkos::Experimental::Impl::host_abi_set()); +} + +TEST(simd, device_reductions) { +#ifdef KOKKOS_ENABLE_OPENMPTARGET // FIXME_OPENMPTARGET + GTEST_SKIP() + << "skipping because of a non-deterministic failure reporting: " + "Failure to synchronize stream (nil): Error in " + "cuStreamSynchronize: an illegal memory access was encountered"; +#endif + Kokkos::parallel_for(1, simd_device_reduction_functor()); +} + +#endif diff --git a/simd/unit_tests/include/TestSIMD_ShiftOps.hpp b/simd/unit_tests/include/TestSIMD_ShiftOps.hpp index f6fdcb920ed..ffdd2cba4a0 100644 --- a/simd/unit_tests/include/TestSIMD_ShiftOps.hpp +++ b/simd/unit_tests/include/TestSIMD_ShiftOps.hpp @@ -85,10 +85,11 @@ inline void host_check_shift_op_all_loaders(ShiftOp shift_op, shift_by, n); host_check_shift_on_one_loader(shift_op, test_vals, shift_by, n); + host_check_shift_on_one_loader(shift_op, test_vals, + shift_by, n); Kokkos::Experimental::simd shift_by_lanes; - shift_by_lanes.copy_from(shift_by, - Kokkos::Experimental::element_aligned_tag()); + shift_by_lanes.copy_from(shift_by, Kokkos::Experimental::simd_flag_default); host_check_shift_by_lanes_on_one_loader( shift_op, test_vals, shift_by_lanes); @@ -96,6 +97,8 @@ inline void host_check_shift_op_all_loaders(ShiftOp shift_op, shift_by_lanes); host_check_shift_by_lanes_on_one_loader( shift_op, test_vals, shift_by_lanes); + host_check_shift_by_lanes_on_one_loader( + shift_op, test_vals, shift_by_lanes); } template @@ -104,12 +107,14 @@ inline void host_check_shift_ops() { using simd_type = Kokkos::Experimental::simd; constexpr std::size_t width = simd_type::size(); constexpr std::size_t num_cases = 8; + constexpr size_t alignment = + Kokkos::Experimental::simd::size() * sizeof(DataType); DataType max = std::numeric_limits::max(); - DataType shift_by[num_cases] = { + alignas(alignment) DataType shift_by[num_cases] = { 0, 1, 3, width / 2, width / 2 + 1, width - 1, width, width + 1}; - DataType test_vals[width]; + alignas(alignment) DataType test_vals[width]; for (std::size_t i = 0; i < width; ++i) { DataType inc = max / width; test_vals[i] = i * inc + 1; @@ -201,10 +206,11 @@ KOKKOS_INLINE_FUNCTION void device_check_shift_op_all_loaders( shift_by, n); device_check_shift_on_one_loader(shift_op, test_vals, shift_by, n); + device_check_shift_on_one_loader( + shift_op, test_vals, shift_by, n); Kokkos::Experimental::simd shift_by_lanes; - 
shift_by_lanes.copy_from(shift_by, - Kokkos::Experimental::element_aligned_tag()); + shift_by_lanes.copy_from(shift_by, Kokkos::Experimental::simd_flag_default); device_check_shift_by_lanes_on_one_loader( shift_op, test_vals, shift_by_lanes); @@ -212,6 +218,8 @@ KOKKOS_INLINE_FUNCTION void device_check_shift_op_all_loaders( shift_op, test_vals, shift_by_lanes); device_check_shift_by_lanes_on_one_loader( shift_op, test_vals, shift_by_lanes); + device_check_shift_by_lanes_on_one_loader( + shift_op, test_vals, shift_by_lanes); } template diff --git a/simd/unit_tests/include/TestSIMD_WhereExpressions.hpp b/simd/unit_tests/include/TestSIMD_WhereExpressions.hpp index 129f2b0d5c9..152fd9e9840 100644 --- a/simd/unit_tests/include/TestSIMD_WhereExpressions.hpp +++ b/simd/unit_tests/include/TestSIMD_WhereExpressions.hpp @@ -29,7 +29,7 @@ inline void host_check_where_expr_scatter_to() { std::size_t nlanes = simd_type::size(); DataType init[] = {11, 13, 17, 19, 23, 29, 31, 37}; simd_type src; - src.copy_from(init, Kokkos::Experimental::element_aligned_tag()); + src.copy_from(init, Kokkos::Experimental::simd_flag_default); for (std::size_t idx = 0; idx < nlanes; ++idx) { mask_type mask(true); @@ -46,7 +46,7 @@ inline void host_check_where_expr_scatter_to() { where(mask, src).scatter_to(dst, index); simd_type dst_simd; - dst_simd.copy_from(dst, Kokkos::Experimental::element_aligned_tag()); + dst_simd.copy_from(dst, Kokkos::Experimental::simd_flag_default); host_check_equality(expected_result, dst_simd, nlanes); } @@ -107,7 +107,7 @@ KOKKOS_INLINE_FUNCTION void device_check_where_expr_scatter_to() { std::size_t nlanes = simd_type::size(); DataType init[] = {11, 13, 17, 19, 23, 29, 31, 37}; simd_type src; - src.copy_from(init, Kokkos::Experimental::element_aligned_tag()); + src.copy_from(init, Kokkos::Experimental::simd_flag_default); for (std::size_t idx = 0; idx < nlanes; ++idx) { mask_type mask(true); @@ -124,7 +124,7 @@ KOKKOS_INLINE_FUNCTION void device_check_where_expr_scatter_to() { where(mask, src).scatter_to(dst, index); simd_type dst_simd; - dst_simd.copy_from(dst, Kokkos::Experimental::element_aligned_tag()); + dst_simd.copy_from(dst, Kokkos::Experimental::simd_flag_default); device_check_equality(expected_result, dst_simd, nlanes); } diff --git a/tpls/desul/Config.hpp.cmake.in b/tpls/desul/Config.hpp.cmake.in index a7bc738191e..aed7ecfabc9 100644 --- a/tpls/desul/Config.hpp.cmake.in +++ b/tpls/desul/Config.hpp.cmake.in @@ -14,6 +14,8 @@ SPDX-License-Identifier: (BSD-3-Clause) #cmakedefine DESUL_ATOMICS_ENABLE_HIP #cmakedefine DESUL_ATOMICS_ENABLE_HIP_SEPARABLE_COMPILATION #cmakedefine DESUL_ATOMICS_ENABLE_SYCL +#cmakedefine DESUL_ATOMICS_ENABLE_SYCL_SEPARABLE_COMPILATION #cmakedefine DESUL_ATOMICS_ENABLE_OPENMP +#cmakedefine DESUL_ATOMICS_ENABLE_OPENACC #endif diff --git a/tpls/desul/include/desul/atomics/Adapt_SYCL.hpp b/tpls/desul/include/desul/atomics/Adapt_SYCL.hpp index 082fc132de5..15c6d78d94b 100644 --- a/tpls/desul/include/desul/atomics/Adapt_SYCL.hpp +++ b/tpls/desul/include/desul/atomics/Adapt_SYCL.hpp @@ -88,15 +88,18 @@ using sycl_atomic_ref = sycl::atomic_ref; #endif -// FIXME_SYCL Use SYCL_EXT_ONEAPI_DEVICE_GLOBAL when available instead #ifdef DESUL_SYCL_DEVICE_GLOBAL_SUPPORTED -// FIXME_SYCL The compiler forces us to use device_image_scope. Drop this when possible. 
+#ifdef SYCL_EXT_ONEAPI_DEVICE_GLOBAL +template +using sycl_device_global = sycl::ext::oneapi::experimental::device_global; +#else template using sycl_device_global = sycl::ext::oneapi::experimental::device_global< T, decltype(sycl::ext::oneapi::experimental::properties( sycl::ext::oneapi::experimental::device_image_scope))>; #endif +#endif } // namespace Impl } // namespace desul diff --git a/tpls/desul/include/desul/atomics/Compare_Exchange.hpp b/tpls/desul/include/desul/atomics/Compare_Exchange.hpp index e91569e1dee..72639fc4932 100644 --- a/tpls/desul/include/desul/atomics/Compare_Exchange.hpp +++ b/tpls/desul/include/desul/atomics/Compare_Exchange.hpp @@ -26,6 +26,9 @@ SPDX-License-Identifier: (BSD-3-Clause) #ifdef DESUL_HAVE_OPENMP_ATOMICS #include #endif +#ifdef DESUL_HAVE_OPENACC_ATOMICS +#include +#endif #ifdef DESUL_HAVE_SYCL_ATOMICS #include #endif diff --git a/tpls/desul/include/desul/atomics/Compare_Exchange_OpenACC.hpp b/tpls/desul/include/desul/atomics/Compare_Exchange_OpenACC.hpp new file mode 100644 index 00000000000..77149bd4741 --- /dev/null +++ b/tpls/desul/include/desul/atomics/Compare_Exchange_OpenACC.hpp @@ -0,0 +1,153 @@ +/* +Copyright (c) 2019, Lawrence Livermore National Security, LLC +and DESUL project contributors. See the COPYRIGHT file for details. +Source: https://github.com/desul/desul + +SPDX-License-Identifier: (BSD-3-Clause) +*/ + +#ifndef DESUL_ATOMICS_COMPARE_EXCHANGE_OPENACC_HPP_ +#define DESUL_ATOMICS_COMPARE_EXCHANGE_OPENACC_HPP_ + +#include + +#include +#include +#include + +namespace desul { +namespace Impl { + +#ifdef __NVCOMPILER + +#pragma acc routine seq +template +T device_atomic_exchange(T* dest, T value, MemoryOrder, MemoryScope /*scope*/) { + if constexpr (std::is_arithmetic_v && ((sizeof(T) == 4) || (sizeof(T) == 8))) { + T return_val; +#pragma acc atomic capture + { + return_val = *dest; + *dest = value; + } + return return_val; + } else { + // FIXME_OPENACC + if (acc_on_device(acc_device_not_host)) { + printf( + "DESUL error in device_atomic_exchange(): Not supported atomic operation in " + "the OpenACC backend\n"); + } + // Acquire a lock for the address + // while (!lock_address_openacc((void*)dest, scope)) { + // } + // device_atomic_thread_fence(MemoryOrderAcquire(), scope); + T return_val = *dest; + *dest = value; + // device_atomic_thread_fence(MemoryOrderRelease(), scope); + // unlock_address_openacc((void*)dest, scope); + return return_val; + } +} + +#pragma acc routine seq +template +T device_atomic_compare_exchange( + T* dest, T compare, T value, MemoryOrder, MemoryScope scope) { + // Floating point types treated separately to work around compiler errors + // "parse invalid cast opcode for cast from 'i32' to 'float'".
+ // Also not just "forwarding" arguments to atomicCAS because it does not have an + // overload that takes int64_t + if constexpr (std::is_integral_v && ((sizeof(T) == 4) || (sizeof(T) == 8))) { + static_assert(sizeof(unsigned int) == 4); + static_assert(sizeof(unsigned long long int) == 8); + using cas_t = + std::conditional_t<(sizeof(T) == 4), unsigned int, unsigned long long int>; + cas_t return_val = atomicCAS(reinterpret_cast(dest), + reinterpret_cast(compare), + reinterpret_cast(value)); + return reinterpret_cast(return_val); +#ifdef DESUL_CUDA_ARCH_IS_PRE_PASCAL + } else if constexpr (std::is_same_v) { +#else + } else if constexpr (std::is_same_v || std::is_same_v) { +#endif + return atomicCAS(dest, compare, value); + } else { + // FIXME_OPENACC + if (acc_on_device(acc_device_not_host)) { + printf( + "DESUL error in device_atomic_compare_exchange(): Not supported atomic " + "operation in the OpenACC backend\n"); + } + T current_val = *dest; + // Acquire a lock for the address + // while (!lock_address_openacc((void*)dest, scope)) { + //} + // device_atomic_thread_fence(MemoryOrderAcquire(), scope); + if (current_val == compare) { + *dest = value; + // device_atomic_thread_fence(MemoryOrderRelease(), scope); + } + // unlock_address_openacc((void*)dest, scope); + return current_val; + } +} + +#else // not NVHPC + +#pragma acc routine seq +template +T device_atomic_exchange(T* dest, T value, MemoryOrder, MemoryScope) { + if constexpr (std::is_arithmetic_v) { + T return_val; +#pragma acc atomic capture + { + return_val = *dest; + *dest = value; + } + return return_val; + } else { + // FIXME_OPENACC + printf( + "DESUL error in device_atomic_exchange(): Not supported atomic operation in " + "the OpenACC backend\n"); + // Acquire a lock for the address + // while (!lock_address_openacc((void*)dest, scope)) { + // } + // device_atomic_thread_fence(MemoryOrderAcquire(), scope); + T return_val = *dest; + *dest = value; + // device_atomic_thread_fence(MemoryOrderRelease(), scope); + // unlock_address_openacc((void*)dest, scope); + return return_val; + } +} + +#pragma acc routine seq +template +T device_atomic_compare_exchange( + T* dest, T compare, T value, MemoryOrder, MemoryScope scope) { + // FIXME_OPENACC + printf( + "DESUL error in device_atomic_compare_exchange(): Not supported atomic operation " + "in the OpenACC backend\n"); + T current_val = *dest; + // Acquire a lock for the address + // while (!lock_address_openacc((void*)dest, scope)) { + //} + // device_atomic_thread_fence(MemoryOrderAcquire(), scope); + if (current_val == compare) { + *dest = value; + // device_atomic_thread_fence(MemoryOrderRelease(), scope); + } + // unlock_address_openacc((void*)dest, scope); + return current_val; +} + +#endif + +} // namespace Impl +} // namespace desul + +#endif diff --git a/tpls/desul/include/desul/atomics/Fetch_Op.hpp b/tpls/desul/include/desul/atomics/Fetch_Op.hpp index adf75c57437..1b161397c74 100644 --- a/tpls/desul/include/desul/atomics/Fetch_Op.hpp +++ b/tpls/desul/include/desul/atomics/Fetch_Op.hpp @@ -23,6 +23,9 @@ SPDX-License-Identifier: (BSD-3-Clause) #ifdef DESUL_HAVE_OPENMP_ATOMICS #include #endif +#ifdef DESUL_HAVE_OPENACC_ATOMICS +#include +#endif #ifdef DESUL_HAVE_SYCL_ATOMICS #include #endif diff --git a/tpls/desul/include/desul/atomics/Fetch_Op_OpenACC.hpp b/tpls/desul/include/desul/atomics/Fetch_Op_OpenACC.hpp new file mode 100644 index 00000000000..ab570ac5787 --- /dev/null +++ b/tpls/desul/include/desul/atomics/Fetch_Op_OpenACC.hpp @@ -0,0 +1,431 @@ +/* +Copyright (c) 
2019, Lawrence Livermore National Security, LLC +and DESUL project contributors. See the COPYRIGHT file for details. +Source: https://github.com/desul/desul + +SPDX-License-Identifier: (BSD-3-Clause) +*/ +#ifndef DESUL_ATOMICS_FETCH_OP_OPENACC_HPP_ +#define DESUL_ATOMICS_FETCH_OP_OPENACC_HPP_ + +#include // min, max +#include +#include + +namespace desul { +namespace Impl { + +#ifdef __NVCOMPILER + +template +inline constexpr bool is_openacc_integral_type_v = + std::is_same_v || std::is_same_v || + std::is_same_v; + +template +inline constexpr bool is_openacc_arithmetic_type_v = std::is_same_v || +#ifndef DESUL_CUDA_ARCH_IS_PRE_PASCAL + std::is_same_v || +#endif + is_openacc_integral_type_v; + +#else + +template +inline constexpr bool is_openacc_integral_type_v = std::is_integral_v; + +template +inline constexpr bool is_openacc_arithmetic_type_v = std::is_arithmetic_v; + +#endif + +// +#pragma acc routine seq +template +std::enable_if_t, T> device_atomic_fetch_add( + T* ptr, const T val, MemoryOrderRelaxed, MemoryScopeDevice) { + T old; +#pragma acc atomic capture + { + old = *ptr; + *ptr += val; + } + return old; +} + +#pragma acc routine seq +template +std::enable_if_t, T> device_atomic_fetch_inc( + T* ptr, MemoryOrderRelaxed, MemoryScopeDevice) { + T old; +#pragma acc atomic capture + { + old = *ptr; + *ptr += T(1); + } + return old; +} + +#pragma acc routine seq +template +std::enable_if_t, T> device_atomic_fetch_sub( + T* ptr, const T val, MemoryOrderRelaxed, MemoryScopeDevice) { + T old; +#pragma acc atomic capture + { + old = *ptr; + *ptr -= val; + } + return old; +} + +#pragma acc routine seq +template +std::enable_if_t, T> device_atomic_fetch_dec( + T* ptr, MemoryOrderRelaxed, MemoryScopeDevice) { + T old; +#pragma acc atomic capture + { + old = *ptr; + *ptr -= T(1); + } + return old; +} + +#pragma acc routine seq +template +std::enable_if_t, T> device_atomic_fetch_mul( + T* ptr, const T val, MemoryOrderRelaxed, MemoryScopeDevice) { + T old; +#pragma acc atomic capture + { + old = *ptr; + *ptr *= val; + } + return old; +} + +#pragma acc routine seq +template +std::enable_if_t, T> device_atomic_fetch_div( + T* ptr, const T val, MemoryOrderRelaxed, MemoryScopeDevice) { + T old; +#pragma acc atomic capture + { + old = *ptr; + *ptr /= val; + } + return old; +} + +#pragma acc routine seq +template +std::enable_if_t, T> device_atomic_fetch_lshift( + T* ptr, const unsigned int val, MemoryOrderRelaxed, MemoryScopeDevice) { + T old; +#pragma acc atomic capture + { + old = *ptr; + *ptr = *ptr << val; + } + return old; +} + +#pragma acc routine seq +template +std::enable_if_t, T> device_atomic_fetch_rshift( + T* ptr, const unsigned int val, MemoryOrderRelaxed, MemoryScopeDevice) { + T old; +#pragma acc atomic capture + { + old = *ptr; + *ptr = *ptr >> val; + } + return old; +} + +#ifdef __NVCOMPILER +#pragma acc routine seq +template +std::enable_if_t, T> device_atomic_fetch_max( + T* ptr, const T val, MemoryOrderRelaxed, MemoryScopeDevice) { + T old; + old = atomicMax(ptr, val); + return old; +} +#endif + +#ifdef __NVCOMPILER +#pragma acc routine seq +template +std::enable_if_t, T> device_atomic_fetch_min( + T* ptr, const T val, MemoryOrderRelaxed, MemoryScopeDevice) { + int old; + old = atomicMin(ptr, val); + return old; +} +#endif + +#pragma acc routine seq +template +std::enable_if_t, T> device_atomic_fetch_and( + T* ptr, const T val, MemoryOrderRelaxed, MemoryScopeDevice) { + T old; +#pragma acc atomic capture + { + old = *ptr; + *ptr &= val; + } + return old; +} + +#pragma acc routine 
seq +template +std::enable_if_t, T> device_atomic_fetch_or( + T* ptr, const T val, MemoryOrderRelaxed, MemoryScopeDevice) { + T old; +#pragma acc atomic capture + { + old = *ptr; + *ptr |= val; + } + return old; +} + +#pragma acc routine seq +template +std::enable_if_t, T> device_atomic_fetch_xor( + T* ptr, const T val, MemoryOrderRelaxed, MemoryScopeDevice) { + T old; +#pragma acc atomic capture + { + old = *ptr; + *ptr ^= val; + } + return old; +} +// + +// +#pragma acc routine seq +template +std::enable_if_t, T> device_atomic_add_fetch( + T* ptr, const T val, MemoryOrderRelaxed, MemoryScopeDevice) { + T tmp; +#pragma acc atomic capture + { + *ptr += val; + tmp = *ptr; + } + return tmp; +} + +#pragma acc routine seq +template +std::enable_if_t, T> device_atomic_inc_fetch( + T* ptr, MemoryOrderRelaxed, MemoryScopeDevice) { + T tmp; +#pragma acc atomic capture + { + *ptr += T(1); + tmp = *ptr; + } + return tmp; +} + +#pragma acc routine seq +template +std::enable_if_t, T> device_atomic_sub_fetch( + T* ptr, const T val, MemoryOrderRelaxed, MemoryScopeDevice) { + T tmp; +#pragma acc atomic capture + { + *ptr -= val; + tmp = *ptr; + } + return tmp; +} + +#pragma acc routine seq +template +std::enable_if_t, T> device_atomic_dec_fetch( + T* ptr, MemoryOrderRelaxed, MemoryScopeDevice) { + T tmp; +#pragma acc atomic capture + { + *ptr -= T(1); + tmp = *ptr; + } + return tmp; +} + +#pragma acc routine seq +template +std::enable_if_t, T> device_atomic_mul_fetch( + T* ptr, const T val, MemoryOrderRelaxed, MemoryScopeDevice) { + T tmp; +#pragma acc atomic capture + { + *ptr *= val; + tmp = *ptr; + } + return tmp; +} + +#pragma acc routine seq +template +std::enable_if_t, T> device_atomic_div_fetch( + T* ptr, const T val, MemoryOrderRelaxed, MemoryScopeDevice) { + T tmp; +#pragma acc atomic capture + { + *ptr /= val; + tmp = *ptr; + } + return tmp; +} + +#pragma acc routine seq +template +std::enable_if_t, T> device_atomic_lshift_fetch( + T* ptr, const unsigned int val, MemoryOrderRelaxed, MemoryScopeDevice) { + T tmp; +#pragma acc atomic capture + { + *ptr = *ptr << val; + tmp = *ptr; + } + return tmp; +} + +#pragma acc routine seq +template +std::enable_if_t, T> device_atomic_rshift_fetch( + T* ptr, const unsigned int val, MemoryOrderRelaxed, MemoryScopeDevice) { + T tmp; +#pragma acc atomic capture + { + *ptr = *ptr >> val; + tmp = *ptr; + } + return tmp; +} + +#ifdef __NVCOMPILER +#pragma acc routine seq +template +std::enable_if_t, T> device_atomic_max_fetch( + T* ptr, const T val, MemoryOrderRelaxed, MemoryScopeDevice) { + T tmp; + tmp = atomicMax(ptr, val); + tmp = std::max(tmp, val); + return tmp; +} +#endif + +#ifdef __NVCOMPILER +#pragma acc routine seq +template +std::enable_if_t, T> device_atomic_min_fetch( + T* ptr, const T val, MemoryOrderRelaxed, MemoryScopeDevice) { + T tmp; + tmp = atomicMin(ptr, val); + tmp = std::min(tmp, val); + return tmp; +} +#endif + +#pragma acc routine seq +template +std::enable_if_t, T> device_atomic_and_fetch( + T* ptr, const T val, MemoryOrderRelaxed, MemoryScopeDevice) { + T tmp; +#pragma acc atomic capture + { + *ptr &= val; + tmp = *ptr; + } + return tmp; +} + +#pragma acc routine seq +template +std::enable_if_t, T> device_atomic_or_fetch( + T* ptr, const T val, MemoryOrderRelaxed, MemoryScopeDevice) { + T tmp; +#pragma acc atomic capture + { + *ptr |= val; + tmp = *ptr; + } + return tmp; +} + +#pragma acc routine seq +template +std::enable_if_t, T> device_atomic_xor_fetch( + T* ptr, const T val, MemoryOrderRelaxed, MemoryScopeDevice) { + T tmp; +#pragma 
acc atomic capture + { + *ptr ^= val; + tmp = *ptr; + } + return tmp; +} +// + +// +#pragma acc routine seq +template +std::enable_if_t, void> device_atomic_store( + T* const ptr, const T val, MemoryOrderRelaxed, MemoryScopeDevice) { +#pragma acc atomic write + *ptr = val; +} + +#pragma acc routine seq +template +std::enable_if_t, void> device_atomic_store( + T* const ptr, const T val, MemoryOrderRelease, MemoryScopeDevice) { + if (acc_on_device(acc_device_not_host)) { + printf( + "DESUL error in device_atomic_store(MemoryOrderRelease): Not supported atomic " + "operation in the OpenACC backend\n"); + } +#pragma acc atomic write + *ptr = val; +} + +#pragma acc routine seq +template +std::enable_if_t, T> device_atomic_load( + const T* const ptr, MemoryOrderRelaxed, MemoryScopeDevice) { + T retval; +#pragma acc atomic read + retval = *ptr; + return retval; +} + +#pragma acc routine seq +template +std::enable_if_t, T> device_atomic_load( + const T* const ptr, MemoryOrderAcquire, MemoryScopeDevice) { + if (acc_on_device(acc_device_not_host)) { + printf( + "DESUL error in device_atomic_load(MemoryOrderAcquire): Not supported atomic " + "operation in the OpenACC backend\n"); + } + T retval; +#pragma acc atomic read + retval = *ptr; + return retval; +} +// + +} // namespace Impl +} // namespace desul + +#endif diff --git a/tpls/desul/include/desul/atomics/Generic.hpp b/tpls/desul/include/desul/atomics/Generic.hpp index fef10222e34..fa71477c299 100644 --- a/tpls/desul/include/desul/atomics/Generic.hpp +++ b/tpls/desul/include/desul/atomics/Generic.hpp @@ -18,11 +18,14 @@ SPDX-License-Identifier: (BSD-3-Clause) namespace desul { +DESUL_IMPL_ACC_ROUTINE_DIRECTIVE template DESUL_INLINE_FUNCTION void atomic_thread_fence(MemoryOrder order, MemoryScope scope) { DESUL_IF_ON_DEVICE(return Impl::device_atomic_thread_fence(order, scope);) DESUL_IF_ON_HOST(return Impl::host_atomic_thread_fence(order, scope);) } + +DESUL_IMPL_ACC_ROUTINE_DIRECTIVE template DESUL_INLINE_FUNCTION T atomic_exchange(T* dest, T val, MemoryOrder order, MemoryScope scope) { @@ -30,6 +33,7 @@ atomic_exchange(T* dest, T val, MemoryOrder order, MemoryScope scope) { DESUL_IF_ON_HOST(return Impl::host_atomic_exchange(dest, val, order, scope);) } +DESUL_IMPL_ACC_ROUTINE_DIRECTIVE template DESUL_INLINE_FUNCTION T atomic_compare_exchange(T* dest, T cmp, T val, MemoryOrder order, MemoryScope scope) { @@ -40,6 +44,7 @@ atomic_compare_exchange(T* dest, T cmp, T val, MemoryOrder order, MemoryScope sc } // Fetch_Oper atomics: return value before operation +DESUL_IMPL_ACC_ROUTINE_DIRECTIVE template DESUL_INLINE_FUNCTION T atomic_fetch_add(T* const dest, const T val, MemoryOrder order, MemoryScope scope) { @@ -47,6 +52,7 @@ atomic_fetch_add(T* const dest, const T val, MemoryOrder order, MemoryScope scop DESUL_IF_ON_HOST(return Impl::host_atomic_fetch_add(dest, val, order, scope);) } +DESUL_IMPL_ACC_ROUTINE_DIRECTIVE template DESUL_INLINE_FUNCTION T atomic_fetch_sub(T* const dest, const T val, MemoryOrder order, MemoryScope scope) { @@ -54,6 +60,7 @@ atomic_fetch_sub(T* const dest, const T val, MemoryOrder order, MemoryScope scop DESUL_IF_ON_HOST(return Impl::host_atomic_fetch_sub(dest, val, order, scope);) } +DESUL_IMPL_ACC_ROUTINE_DIRECTIVE template DESUL_INLINE_FUNCTION T atomic_fetch_max(T* const dest, const T val, MemoryOrder order, MemoryScope scope) { @@ -61,6 +68,7 @@ atomic_fetch_max(T* const dest, const T val, MemoryOrder order, MemoryScope scop DESUL_IF_ON_HOST(return Impl::host_atomic_fetch_max(dest, val, order, scope);) } 
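
Every public atomic entry point in Generic.hpp gains the same `DESUL_IMPL_ACC_ROUTINE_DIRECTIVE` prefix, so the pattern is worth spelling out once. Below is a minimal schematic of what each of these hunks produces; `my_atomic_op`, `device_impl`, and `host_impl` are placeholder names for illustration only, while the macros are the real ones (Macros.hpp, further down in this patch, defines `DESUL_IMPL_ACC_ROUTINE_DIRECTIVE` to expand to `_Pragma("acc routine seq")` when OpenACC atomics are enabled, and to nothing otherwise):

    // Schematic only; my_atomic_op, device_impl, host_impl are hypothetical names.
    DESUL_IMPL_ACC_ROUTINE_DIRECTIVE  // makes the function callable from OpenACC device code
    template <class T, class MemoryOrder, class MemoryScope>
    DESUL_INLINE_FUNCTION T my_atomic_op(T* dest, T val, MemoryOrder order, MemoryScope scope) {
      DESUL_IF_ON_DEVICE(return Impl::device_impl(dest, val, order, scope);)
      DESUL_IF_ON_HOST(return Impl::host_impl(dest, val, order, scope);)
    }
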
+DESUL_IMPL_ACC_ROUTINE_DIRECTIVE template DESUL_INLINE_FUNCTION T atomic_fetch_min(T* const dest, const T val, MemoryOrder order, MemoryScope scope) { @@ -68,6 +76,7 @@ atomic_fetch_min(T* const dest, const T val, MemoryOrder order, MemoryScope scop DESUL_IF_ON_HOST(return Impl::host_atomic_fetch_min(dest, val, order, scope);) } +DESUL_IMPL_ACC_ROUTINE_DIRECTIVE template DESUL_INLINE_FUNCTION T atomic_fetch_mul(T* const dest, const T val, MemoryOrder order, MemoryScope scope) { @@ -75,6 +84,7 @@ atomic_fetch_mul(T* const dest, const T val, MemoryOrder order, MemoryScope scop DESUL_IF_ON_HOST(return Impl::host_atomic_fetch_mul(dest, val, order, scope);) } +DESUL_IMPL_ACC_ROUTINE_DIRECTIVE template DESUL_INLINE_FUNCTION T atomic_fetch_div(T* const dest, const T val, MemoryOrder order, MemoryScope scope) { @@ -82,6 +92,7 @@ atomic_fetch_div(T* const dest, const T val, MemoryOrder order, MemoryScope scop DESUL_IF_ON_HOST(return Impl::host_atomic_fetch_div(dest, val, order, scope);) } +DESUL_IMPL_ACC_ROUTINE_DIRECTIVE template DESUL_INLINE_FUNCTION T atomic_fetch_mod(T* const dest, const T val, MemoryOrder order, MemoryScope scope) { @@ -89,6 +100,7 @@ atomic_fetch_mod(T* const dest, const T val, MemoryOrder order, MemoryScope scop DESUL_IF_ON_HOST(return Impl::host_atomic_fetch_mod(dest, val, order, scope);) } +DESUL_IMPL_ACC_ROUTINE_DIRECTIVE template DESUL_INLINE_FUNCTION T atomic_fetch_and(T* const dest, const T val, MemoryOrder order, MemoryScope scope) { @@ -96,6 +108,7 @@ atomic_fetch_and(T* const dest, const T val, MemoryOrder order, MemoryScope scop DESUL_IF_ON_HOST(return Impl::host_atomic_fetch_and(dest, val, order, scope);) } +DESUL_IMPL_ACC_ROUTINE_DIRECTIVE template DESUL_INLINE_FUNCTION T atomic_fetch_or(T* const dest, const T val, MemoryOrder order, MemoryScope scope) { @@ -103,6 +116,7 @@ atomic_fetch_or(T* const dest, const T val, MemoryOrder order, MemoryScope scope DESUL_IF_ON_HOST(return Impl::host_atomic_fetch_or(dest, val, order, scope);) } +DESUL_IMPL_ACC_ROUTINE_DIRECTIVE template DESUL_INLINE_FUNCTION T atomic_fetch_xor(T* const dest, const T val, MemoryOrder order, MemoryScope scope) { @@ -110,6 +124,7 @@ atomic_fetch_xor(T* const dest, const T val, MemoryOrder order, MemoryScope scop DESUL_IF_ON_HOST(return Impl::host_atomic_fetch_xor(dest, val, order, scope);) } +DESUL_IMPL_ACC_ROUTINE_DIRECTIVE template DESUL_INLINE_FUNCTION T atomic_fetch_nand(T* const dest, const T val, MemoryOrder order, MemoryScope scope) { @@ -117,6 +132,7 @@ atomic_fetch_nand(T* const dest, const T val, MemoryOrder order, MemoryScope sco DESUL_IF_ON_HOST(return Impl::host_atomic_fetch_nand(dest, val, order, scope);) } +DESUL_IMPL_ACC_ROUTINE_DIRECTIVE template DESUL_INLINE_FUNCTION T atomic_fetch_lshift(T* const dest, const unsigned int val, @@ -126,6 +142,7 @@ DESUL_INLINE_FUNCTION T atomic_fetch_lshift(T* const dest, DESUL_IF_ON_HOST(return Impl::host_atomic_fetch_lshift(dest, val, order, scope);) } +DESUL_IMPL_ACC_ROUTINE_DIRECTIVE template DESUL_INLINE_FUNCTION T atomic_fetch_rshift(T* const dest, const unsigned int val, @@ -136,6 +153,7 @@ DESUL_INLINE_FUNCTION T atomic_fetch_rshift(T* const dest, } // Oper Fetch atomics: return value after operation +DESUL_IMPL_ACC_ROUTINE_DIRECTIVE template DESUL_INLINE_FUNCTION T atomic_add_fetch(T* const dest, const T val, MemoryOrder order, MemoryScope scope) { @@ -143,6 +161,7 @@ atomic_add_fetch(T* const dest, const T val, MemoryOrder order, MemoryScope scop DESUL_IF_ON_HOST(return Impl::host_atomic_add_fetch(dest, val, order, scope);) } 
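
The two family headers above ("return value before operation" vs. "return value after operation") are the only semantic difference between `atomic_fetch_*` and `atomic_*_fetch`. A short host-side usage sketch, assuming the umbrella header and relaxed ordering at device scope; the commented values follow directly from those semantics:

    #include <desul/atomics.hpp>

    void fetch_vs_op_fetch() {
      int x = 5;
      // fetch-then-op: returns the value *before* the update
      int before = desul::atomic_fetch_add(
          &x, 2, desul::MemoryOrderRelaxed(), desul::MemoryScopeDevice());
      // before == 5, x == 7
      // op-then-fetch: returns the value *after* the update
      int after = desul::atomic_add_fetch(
          &x, 2, desul::MemoryOrderRelaxed(), desul::MemoryScopeDevice());
      // after == 9, x == 9
      (void)before; (void)after;
    }
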
+DESUL_IMPL_ACC_ROUTINE_DIRECTIVE template DESUL_INLINE_FUNCTION T atomic_sub_fetch(T* const dest, const T val, MemoryOrder order, MemoryScope scope) { @@ -150,6 +169,7 @@ atomic_sub_fetch(T* const dest, const T val, MemoryOrder order, MemoryScope scop DESUL_IF_ON_HOST(return Impl::host_atomic_sub_fetch(dest, val, order, scope);) } +DESUL_IMPL_ACC_ROUTINE_DIRECTIVE template DESUL_INLINE_FUNCTION T atomic_max_fetch(T* const dest, const T val, MemoryOrder order, MemoryScope scope) { @@ -157,6 +177,7 @@ atomic_max_fetch(T* const dest, const T val, MemoryOrder order, MemoryScope scop DESUL_IF_ON_HOST(return Impl::host_atomic_max_fetch(dest, val, order, scope);) } +DESUL_IMPL_ACC_ROUTINE_DIRECTIVE template DESUL_INLINE_FUNCTION T atomic_min_fetch(T* const dest, const T val, MemoryOrder order, MemoryScope scope) { @@ -164,6 +185,7 @@ atomic_min_fetch(T* const dest, const T val, MemoryOrder order, MemoryScope scop DESUL_IF_ON_HOST(return Impl::host_atomic_min_fetch(dest, val, order, scope);) } +DESUL_IMPL_ACC_ROUTINE_DIRECTIVE template DESUL_INLINE_FUNCTION T atomic_mul_fetch(T* const dest, const T val, MemoryOrder order, MemoryScope scope) { @@ -171,6 +193,7 @@ atomic_mul_fetch(T* const dest, const T val, MemoryOrder order, MemoryScope scop DESUL_IF_ON_HOST(return Impl::host_atomic_mul_fetch(dest, val, order, scope);) } +DESUL_IMPL_ACC_ROUTINE_DIRECTIVE template DESUL_INLINE_FUNCTION T atomic_div_fetch(T* const dest, const T val, MemoryOrder order, MemoryScope scope) { @@ -178,6 +201,7 @@ atomic_div_fetch(T* const dest, const T val, MemoryOrder order, MemoryScope scop DESUL_IF_ON_HOST(return Impl::host_atomic_div_fetch(dest, val, order, scope);) } +DESUL_IMPL_ACC_ROUTINE_DIRECTIVE template DESUL_INLINE_FUNCTION T atomic_mod_fetch(T* const dest, const T val, MemoryOrder order, MemoryScope scope) { @@ -185,6 +209,7 @@ atomic_mod_fetch(T* const dest, const T val, MemoryOrder order, MemoryScope scop DESUL_IF_ON_HOST(return Impl::host_atomic_mod_fetch(dest, val, order, scope);) } +DESUL_IMPL_ACC_ROUTINE_DIRECTIVE template DESUL_INLINE_FUNCTION T atomic_and_fetch(T* const dest, const T val, MemoryOrder order, MemoryScope scope) { @@ -192,6 +217,7 @@ atomic_and_fetch(T* const dest, const T val, MemoryOrder order, MemoryScope scop DESUL_IF_ON_HOST(return Impl::host_atomic_and_fetch(dest, val, order, scope);) } +DESUL_IMPL_ACC_ROUTINE_DIRECTIVE template DESUL_INLINE_FUNCTION T atomic_or_fetch(T* const dest, const T val, MemoryOrder order, MemoryScope scope) { @@ -199,6 +225,7 @@ atomic_or_fetch(T* const dest, const T val, MemoryOrder order, MemoryScope scope DESUL_IF_ON_HOST(return Impl::host_atomic_or_fetch(dest, val, order, scope);) } +DESUL_IMPL_ACC_ROUTINE_DIRECTIVE template DESUL_INLINE_FUNCTION T atomic_xor_fetch(T* const dest, const T val, MemoryOrder order, MemoryScope scope) { @@ -206,6 +233,7 @@ atomic_xor_fetch(T* const dest, const T val, MemoryOrder order, MemoryScope scop DESUL_IF_ON_HOST(return Impl::host_atomic_xor_fetch(dest, val, order, scope);) } +DESUL_IMPL_ACC_ROUTINE_DIRECTIVE template DESUL_INLINE_FUNCTION T atomic_nand_fetch(T* const dest, const T val, MemoryOrder order, MemoryScope scope) { @@ -213,6 +241,7 @@ atomic_nand_fetch(T* const dest, const T val, MemoryOrder order, MemoryScope sco DESUL_IF_ON_HOST(return Impl::host_atomic_nand_fetch(dest, val, order, scope);) } +DESUL_IMPL_ACC_ROUTINE_DIRECTIVE template DESUL_INLINE_FUNCTION T atomic_lshift_fetch(T* const dest, const unsigned int val, @@ -222,6 +251,7 @@ DESUL_INLINE_FUNCTION T atomic_lshift_fetch(T* const dest, 
DESUL_IF_ON_HOST(return Impl::host_atomic_lshift_fetch(dest, val, order, scope);) } +DESUL_IMPL_ACC_ROUTINE_DIRECTIVE template DESUL_INLINE_FUNCTION T atomic_rshift_fetch(T* const dest, const unsigned int val, @@ -233,6 +263,7 @@ DESUL_INLINE_FUNCTION T atomic_rshift_fetch(T* const dest, // Other atomics +DESUL_IMPL_ACC_ROUTINE_DIRECTIVE template DESUL_INLINE_FUNCTION T atomic_load(const T* const dest, MemoryOrder order, @@ -241,6 +272,7 @@ DESUL_INLINE_FUNCTION T atomic_load(const T* const dest, DESUL_IF_ON_HOST(return Impl::host_atomic_load(dest, order, scope);) } +DESUL_IMPL_ACC_ROUTINE_DIRECTIVE template DESUL_INLINE_FUNCTION void atomic_store(T* const dest, const T val, @@ -250,6 +282,7 @@ DESUL_INLINE_FUNCTION void atomic_store(T* const dest, DESUL_IF_ON_HOST(return Impl::host_atomic_store(dest, val, order, scope);) } +DESUL_IMPL_ACC_ROUTINE_DIRECTIVE template DESUL_INLINE_FUNCTION void atomic_add(T* const dest, const T val, @@ -259,6 +292,7 @@ DESUL_INLINE_FUNCTION void atomic_add(T* const dest, DESUL_IF_ON_HOST(return Impl::host_atomic_add(dest, val, order, scope);) } +DESUL_IMPL_ACC_ROUTINE_DIRECTIVE template DESUL_INLINE_FUNCTION void atomic_sub(T* const dest, const T val, @@ -268,6 +302,7 @@ DESUL_INLINE_FUNCTION void atomic_sub(T* const dest, DESUL_IF_ON_HOST(return Impl::host_atomic_sub(dest, val, order, scope);) } +DESUL_IMPL_ACC_ROUTINE_DIRECTIVE template DESUL_INLINE_FUNCTION void atomic_mul(T* const dest, const T val, @@ -277,6 +312,7 @@ DESUL_INLINE_FUNCTION void atomic_mul(T* const dest, DESUL_IF_ON_HOST(return Impl::host_atomic_mul(dest, val, order, scope);) } +DESUL_IMPL_ACC_ROUTINE_DIRECTIVE template DESUL_INLINE_FUNCTION void atomic_div(T* const dest, const T val, @@ -286,6 +322,7 @@ DESUL_INLINE_FUNCTION void atomic_div(T* const dest, DESUL_IF_ON_HOST(return Impl::host_atomic_div(dest, val, order, scope);) } +DESUL_IMPL_ACC_ROUTINE_DIRECTIVE template DESUL_INLINE_FUNCTION void atomic_min(T* const dest, const T val, @@ -295,6 +332,7 @@ DESUL_INLINE_FUNCTION void atomic_min(T* const dest, DESUL_IF_ON_HOST(return Impl::host_atomic_min(dest, val, order, scope);) } +DESUL_IMPL_ACC_ROUTINE_DIRECTIVE template DESUL_INLINE_FUNCTION void atomic_max(T* const dest, const T val, @@ -304,6 +342,7 @@ DESUL_INLINE_FUNCTION void atomic_max(T* const dest, DESUL_IF_ON_HOST(return Impl::host_atomic_max(dest, val, order, scope);) } +DESUL_IMPL_ACC_ROUTINE_DIRECTIVE template DESUL_INLINE_FUNCTION T atomic_inc_fetch(T* const dest, MemoryOrder order, @@ -312,6 +351,7 @@ DESUL_INLINE_FUNCTION T atomic_inc_fetch(T* const dest, DESUL_IF_ON_HOST(return Impl::host_atomic_inc_fetch(dest, order, scope);) } +DESUL_IMPL_ACC_ROUTINE_DIRECTIVE template DESUL_INLINE_FUNCTION T atomic_dec_fetch(T* const dest, MemoryOrder order, @@ -320,6 +360,7 @@ DESUL_INLINE_FUNCTION T atomic_dec_fetch(T* const dest, DESUL_IF_ON_HOST(return Impl::host_atomic_dec_fetch(dest, order, scope);) } +DESUL_IMPL_ACC_ROUTINE_DIRECTIVE template DESUL_INLINE_FUNCTION T atomic_fetch_inc(T* const dest, MemoryOrder order, @@ -328,6 +369,7 @@ DESUL_INLINE_FUNCTION T atomic_fetch_inc(T* const dest, DESUL_IF_ON_HOST(return Impl::host_atomic_fetch_inc(dest, order, scope);) } +DESUL_IMPL_ACC_ROUTINE_DIRECTIVE template DESUL_INLINE_FUNCTION T atomic_fetch_inc_mod(T* const dest, T val, MemoryOrder order, MemoryScope scope) { @@ -335,6 +377,7 @@ atomic_fetch_inc_mod(T* const dest, T val, MemoryOrder order, MemoryScope scope) DESUL_IF_ON_HOST(return Impl::host_atomic_fetch_inc_mod(dest, val, order, scope);) } 
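
Of the functions above, `atomic_fetch_inc_mod` is the least self-explanatory; together with `atomic_fetch_dec_mod` in the next hunk, it mirrors the wrap-around contract of CUDA's `atomicInc`/`atomicDec`. The sketch below illustrates only the intended semantics as a plain (non-atomic) function; it is not desul's implementation:

    // Contract illustration only; the real function performs this
    // read-modify-write atomically.
    unsigned fetch_inc_mod_semantics(unsigned* dest, unsigned val) {
      unsigned old = *dest;
      *dest = (old >= val) ? 0u : old + 1u;  // wrap back to 0 once val is reached
      return old;                            // value held before the operation
    }
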
+DESUL_IMPL_ACC_ROUTINE_DIRECTIVE template DESUL_INLINE_FUNCTION T atomic_fetch_dec(T* const dest, MemoryOrder order, @@ -343,6 +386,7 @@ DESUL_INLINE_FUNCTION T atomic_fetch_dec(T* const dest, DESUL_IF_ON_HOST(return Impl::host_atomic_fetch_dec(dest, order, scope);) } +DESUL_IMPL_ACC_ROUTINE_DIRECTIVE template DESUL_INLINE_FUNCTION T atomic_fetch_dec_mod(T* const dest, T val, MemoryOrder order, MemoryScope scope) { @@ -350,6 +394,7 @@ atomic_fetch_dec_mod(T* const dest, T val, MemoryOrder order, MemoryScope scope) DESUL_IF_ON_HOST(return Impl::host_atomic_fetch_dec_mod(dest, val, order, scope);) } +DESUL_IMPL_ACC_ROUTINE_DIRECTIVE template DESUL_INLINE_FUNCTION void atomic_inc(T* const dest, MemoryOrder order, @@ -358,6 +403,7 @@ DESUL_INLINE_FUNCTION void atomic_inc(T* const dest, DESUL_IF_ON_HOST(return Impl::host_atomic_inc(dest, order, scope);) } +DESUL_IMPL_ACC_ROUTINE_DIRECTIVE template DESUL_INLINE_FUNCTION void atomic_dec(T* const dest, MemoryOrder order, @@ -367,6 +413,7 @@ DESUL_INLINE_FUNCTION void atomic_dec(T* const dest, } // FIXME +DESUL_IMPL_ACC_ROUTINE_DIRECTIVE template SYCL_SPACE_ATOMIC_LOCKS_DEVICE; +#ifdef DESUL_ATOMICS_ENABLE_SYCL_SEPARABLE_COMPILATION +SYCL_EXTERNAL extern +#else +static +#endif + sycl_device_global + SYCL_SPACE_ATOMIC_LOCKS_DEVICE; -SYCL_EXTERNAL extern sycl_device_global SYCL_SPACE_ATOMIC_LOCKS_NODE; +#ifdef DESUL_ATOMICS_ENABLE_SYCL_SEPARABLE_COMPILATION +SYCL_EXTERNAL extern +#else +static +#endif + sycl_device_global + SYCL_SPACE_ATOMIC_LOCKS_NODE; #define SYCL_SPACE_ATOMIC_MASK 0x1FFFF @@ -128,6 +149,34 @@ inline void unlock_address_sycl(void* ptr, MemoryScopeNode) { lock_node_ref.exchange(0); } +#ifdef DESUL_ATOMICS_ENABLE_SYCL_SEPARABLE_COMPILATION +inline +#else +inline static +#endif + void + copy_sycl_lock_arrays_to_device(sycl::queue q) { + static bool once = [&q]() { +#ifdef SYCL_EXT_ONEAPI_DEVICE_GLOBAL + q.memcpy(SYCL_SPACE_ATOMIC_LOCKS_DEVICE, + &SYCL_SPACE_ATOMIC_LOCKS_DEVICE_h, + sizeof(int32_t*)); + q.memcpy(SYCL_SPACE_ATOMIC_LOCKS_NODE, + &SYCL_SPACE_ATOMIC_LOCKS_NODE_h, + sizeof(int32_t*)); +#else + auto device_ptr = SYCL_SPACE_ATOMIC_LOCKS_DEVICE_h; + auto node_ptr = SYCL_SPACE_ATOMIC_LOCKS_NODE_h; + q.single_task([=] { + SYCL_SPACE_ATOMIC_LOCKS_DEVICE.get() = device_ptr; + SYCL_SPACE_ATOMIC_LOCKS_NODE.get() = node_ptr; + }); +#endif + return true; + }(); + (void)once; +} + #else // not supported template @@ -155,7 +204,26 @@ inline bool lock_address_sycl(void*, MemoryScopeNode) { inline void unlock_address_sycl(void*, MemoryScopeDevice) { assert(false); } inline void unlock_address_sycl(void*, MemoryScopeNode) { assert(false); } + +#ifdef DESUL_ATOMICS_ENABLE_SYCL_SEPARABLE_COMPILATION +inline +#else +inline static +#endif + void + copy_sycl_lock_arrays_to_device(sycl::queue) { +} + #endif } // namespace Impl + +#ifdef DESUL_ATOMICS_ENABLE_SYCL_SEPARABLE_COMPILATION +inline void ensure_sycl_lock_arrays_on_device(sycl::queue) {} +#else +static inline void ensure_sycl_lock_arrays_on_device(sycl::queue q) { + Impl::copy_sycl_lock_arrays_to_device(q); +} +#endif + } // namespace desul #endif diff --git a/tpls/desul/include/desul/atomics/Lock_Based_Fetch_Op.hpp b/tpls/desul/include/desul/atomics/Lock_Based_Fetch_Op.hpp index cb97f4a906d..b6a399100b1 100644 --- a/tpls/desul/include/desul/atomics/Lock_Based_Fetch_Op.hpp +++ b/tpls/desul/include/desul/atomics/Lock_Based_Fetch_Op.hpp @@ -17,6 +17,9 @@ SPDX-License-Identifier: (BSD-3-Clause) #ifdef DESUL_HAVE_HIP_ATOMICS #include #endif +#ifdef DESUL_HAVE_OPENACC_ATOMICS +#include 
+#endif #ifdef DESUL_HAVE_SYCL_ATOMICS #include #endif diff --git a/tpls/desul/include/desul/atomics/Lock_Based_Fetch_Op_OpenACC.hpp b/tpls/desul/include/desul/atomics/Lock_Based_Fetch_Op_OpenACC.hpp new file mode 100644 index 00000000000..d4dd74588bd --- /dev/null +++ b/tpls/desul/include/desul/atomics/Lock_Based_Fetch_Op_OpenACC.hpp @@ -0,0 +1,81 @@ +/* +Copyright (c) 2019, Lawrence Livermore National Security, LLC +and DESUL project contributors. See the COPYRIGHT file for details. +Source: https://github.com/desul/desul + +SPDX-License-Identifier: (BSD-3-Clause) +*/ + +#ifndef DESUL_ATOMICS_LOCK_BASED_FETCH_OP_OPENACC_HPP_ +#define DESUL_ATOMICS_LOCK_BASED_FETCH_OP_OPENACC_HPP_ + +#include +#include +#include +#include + +namespace desul { +namespace Impl { + +template = 0> +inline T device_atomic_fetch_oper(const Oper& op, + T* const dest, + dont_deduce_this_parameter_t val, + MemoryOrder /*order*/, + MemoryScope scope) { + if (acc_on_device(acc_device_not_host)) { + printf( + "DESUL error in device_atomic_fetch_oper(): Not supported atomic operation in " + "the OpenACC backend\n"); + } + // Acquire a lock for the address + while (!lock_address((void*)dest, scope)) { + } + + device_atomic_thread_fence(MemoryOrderAcquire(), scope); + T return_val = *dest; + *dest = op.apply(return_val, val); + device_atomic_thread_fence(MemoryOrderRelease(), scope); + unlock_address((void*)dest, scope); + return return_val; +} + +template = 0> +inline T device_atomic_oper_fetch(const Oper& op, + T* const dest, + dont_deduce_this_parameter_t val, + MemoryOrder /*order*/, + MemoryScope scope) { + if (acc_on_device(acc_device_not_host)) { + printf( + "DESUL error in device_atomic_oper_fetch(): Not supported atomic operation in " + "the OpenACC backend\n"); + } + // Acquire a lock for the address + while (!lock_address((void*)dest, scope)) { + } + + device_atomic_thread_fence(MemoryOrderAcquire(), scope); + T return_val = op.apply(*dest, val); + *dest = return_val; + device_atomic_thread_fence(MemoryOrderRelease(), scope); + unlock_address((void*)dest, scope); + return return_val; +} + +} // namespace Impl +} // namespace desul + +#endif diff --git a/tpls/desul/include/desul/atomics/Macros.hpp b/tpls/desul/include/desul/atomics/Macros.hpp index 3a14b93d323..d11beb0c805 100644 --- a/tpls/desul/include/desul/atomics/Macros.hpp +++ b/tpls/desul/include/desul/atomics/Macros.hpp @@ -57,6 +57,10 @@ SPDX-License-Identifier: (BSD-3-Clause) #define DESUL_HAVE_OPENMP_ATOMICS #endif +#if defined(DESUL_ATOMICS_ENABLE_OPENACC) +#define DESUL_HAVE_OPENACC_ATOMICS +#endif + // ONLY use GNUC atomics if not explicitly say to use OpenMP atomics #if !defined(DESUL_HAVE_OPENMP_ATOMICS) && defined(__GNUC__) #define DESUL_HAVE_GCC_ATOMICS @@ -123,6 +127,30 @@ static constexpr bool desul_impl_omp_on_host() { return false; } #endif #endif +#if defined(DESUL_HAVE_OPENACC_ATOMICS) +#include +#ifdef __NVCOMPILER +// FIXME_OPENACC We cannot determine in a constant expression whether we are on host or +// on device with NVHPC. We use the device implementation on both sides.
+#define DESUL_IF_ON_DEVICE(CODE) \ + { DESUL_IMPL_STRIP_PARENS(CODE) } +#define DESUL_IF_ON_HOST(CODE) \ + {} +#else +#define DESUL_IF_ON_DEVICE(CODE) \ + if constexpr (acc_on_device(acc_device_not_host)) { \ + DESUL_IMPL_STRIP_PARENS(CODE) \ + } +#define DESUL_IF_ON_HOST(CODE) \ + if constexpr (acc_on_device(acc_device_host)) { \ + DESUL_IMPL_STRIP_PARENS(CODE) \ + } +#endif +#define DESUL_IMPL_ACC_ROUTINE_DIRECTIVE _Pragma("acc routine seq") +#else +#define DESUL_IMPL_ACC_ROUTINE_DIRECTIVE +#endif + #if !defined(DESUL_IF_ON_HOST) && !defined(DESUL_IF_ON_DEVICE) #if (defined(DESUL_ATOMICS_ENABLE_CUDA) && defined(__CUDA_ARCH__)) || \ (defined(DESUL_ATOMICS_ENABLE_HIP) && defined(__HIP_DEVICE_COMPILE__)) || \ diff --git a/tpls/desul/include/desul/atomics/Thread_Fence.hpp b/tpls/desul/include/desul/atomics/Thread_Fence.hpp index 24078aae07f..6a741f6d478 100644 --- a/tpls/desul/include/desul/atomics/Thread_Fence.hpp +++ b/tpls/desul/include/desul/atomics/Thread_Fence.hpp @@ -26,6 +26,9 @@ SPDX-License-Identifier: (BSD-3-Clause) #ifdef DESUL_HAVE_OPENMP_ATOMICS #include #endif +#ifdef DESUL_HAVE_OPENACC_ATOMICS +#include +#endif #ifdef DESUL_HAVE_SYCL_ATOMICS #include #endif diff --git a/tpls/desul/include/desul/atomics/Thread_Fence_OpenACC.hpp b/tpls/desul/include/desul/atomics/Thread_Fence_OpenACC.hpp new file mode 100644 index 00000000000..a5c8aa1c8a7 --- /dev/null +++ b/tpls/desul/include/desul/atomics/Thread_Fence_OpenACC.hpp @@ -0,0 +1,25 @@ +/* +Copyright (c) 2019, Lawrence Livermore National Security, LLC +and DESUL project contributors. See the COPYRIGHT file for details. +Source: https://github.com/desul/desul + +SPDX-License-Identifier: (BSD-3-Clause) +*/ + +#ifndef DESUL_ATOMICS_THREAD_FENCE_OPENACC_HPP_ +#define DESUL_ATOMICS_THREAD_FENCE_OPENACC_HPP_ + +namespace desul { +namespace Impl { + +#pragma acc routine seq +template +void device_atomic_thread_fence(MemoryOrder, MemoryScope) { + // FIXME_OPENACC: The current OpenACC standard does not support explicit thread fence + // operations. 
+} + +} // namespace Impl +} // namespace desul + +#endif diff --git a/tpls/desul/src/Lock_Array_SYCL.cpp b/tpls/desul/src/Lock_Array_SYCL.cpp index 9e84c60e41a..6660c76e11a 100644 --- a/tpls/desul/src/Lock_Array_SYCL.cpp +++ b/tpls/desul/src/Lock_Array_SYCL.cpp @@ -14,10 +14,12 @@ SPDX-License-Identifier: (BSD-3-Clause) namespace desul::Impl { +#ifdef DESUL_ATOMICS_ENABLE_SYCL_SEPARABLE_COMPILATION SYCL_EXTERNAL sycl_device_global SYCL_SPACE_ATOMIC_LOCKS_DEVICE; SYCL_EXTERNAL sycl_device_global SYCL_SPACE_ATOMIC_LOCKS_NODE; +#endif int32_t* SYCL_SPACE_ATOMIC_LOCKS_DEVICE_h = nullptr; int32_t* SYCL_SPACE_ATOMIC_LOCKS_NODE_h = nullptr; @@ -31,19 +33,7 @@ void init_lock_arrays_sycl(sycl::queue q) { SYCL_SPACE_ATOMIC_LOCKS_NODE_h = sycl::malloc_host(SYCL_SPACE_ATOMIC_MASK + 1, q); - // FIXME_SYCL Once supported, the following should be replaced by - // q.memcpy(SYCL_SPACE_ATOMIC_LOCKS_DEVICE, - // &SYCL_SPACE_ATOMIC_LOCKS_DEVICE_h, - // sizeof(int32_t*)); - // q.memcpy(SYCL_SPACE_ATOMIC_LOCKS_NODE, - // &SYCL_SPACE_ATOMIC_LOCKS_NODE_h, - // sizeof(int32_t*)); - auto device_ptr = SYCL_SPACE_ATOMIC_LOCKS_DEVICE_h; - auto node_ptr = SYCL_SPACE_ATOMIC_LOCKS_NODE_h; - q.single_task([=] { - SYCL_SPACE_ATOMIC_LOCKS_DEVICE.get() = device_ptr; - SYCL_SPACE_ATOMIC_LOCKS_NODE.get() = node_ptr; - }); + copy_sycl_lock_arrays_to_device(q); q.memset(SYCL_SPACE_ATOMIC_LOCKS_DEVICE_h, 0, @@ -63,7 +53,10 @@ void finalize_lock_arrays_sycl(sycl::queue q) { sycl::free(SYCL_SPACE_ATOMIC_LOCKS_NODE_h, q); SYCL_SPACE_ATOMIC_LOCKS_DEVICE_h = nullptr; SYCL_SPACE_ATOMIC_LOCKS_NODE_h = nullptr; +#ifdef DESUL_ATOMICS_ENABLE_SYCL_SEPARABLE_COMPILATION + copy_sycl_lock_arrays_to_device(q); +#endif } -} // namespace desul::Impl +} // namespace desul::Impl #endif diff --git a/tpls/mdspan/include/experimental/__p0009_bits/compressed_pair.hpp b/tpls/mdspan/include/experimental/__p0009_bits/compressed_pair.hpp index ab1561bd47f..25389a2fa5e 100644 --- a/tpls/mdspan/include/experimental/__p0009_bits/compressed_pair.hpp +++ b/tpls/mdspan/include/experimental/__p0009_bits/compressed_pair.hpp @@ -27,165 +27,165 @@ namespace detail { // For no unique address emulation, this is the case taken when neither are empty. // For real `[[no_unique_address]]`, this case is always taken. 
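
Before the renamed specializations below, a reminder of what `__compressed_pair` is for: when one of the two members is an empty class (a stateless layout mapping or accessor, say), inheriting from it instead of storing it lets the pair add no storage, emulating `[[no_unique_address]]` on compilers that lack it. A standalone illustration of the effect on common ABIs (not mdspan code):

    struct Empty {};  // e.g. a stateless accessor policy

    struct Plain    { Empty e; double* p; };  // member still costs size/padding
    struct Squashed : Empty { double* p; };   // empty-base optimization: no overhead

    static_assert(sizeof(Plain) > sizeof(double*), "member takes space");
    static_assert(sizeof(Squashed) == sizeof(double*), "base is compressed away");
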
-template struct __compressed_pair { - _MDSPAN_NO_UNIQUE_ADDRESS _T __t_val; - _MDSPAN_NO_UNIQUE_ADDRESS _U __u_val; - MDSPAN_FORCE_INLINE_FUNCTION _MDSPAN_CONSTEXPR_14 _T &__first() noexcept { return __t_val; } - MDSPAN_FORCE_INLINE_FUNCTION constexpr _T const &__first() const noexcept { - return __t_val; +template struct __compressed_pair { + _MDSPAN_NO_UNIQUE_ADDRESS _T1 __t1_val{}; + _MDSPAN_NO_UNIQUE_ADDRESS _T2 __t2_val{}; + MDSPAN_FORCE_INLINE_FUNCTION _MDSPAN_CONSTEXPR_14 _T1 &__first() noexcept { return __t1_val; } + MDSPAN_FORCE_INLINE_FUNCTION constexpr _T1 const &__first() const noexcept { + return __t1_val; } - MDSPAN_FORCE_INLINE_FUNCTION _MDSPAN_CONSTEXPR_14 _U &__second() noexcept { return __u_val; } - MDSPAN_FORCE_INLINE_FUNCTION constexpr _U const &__second() const noexcept { - return __u_val; + MDSPAN_FORCE_INLINE_FUNCTION _MDSPAN_CONSTEXPR_14 _T2 &__second() noexcept { return __t2_val; } + MDSPAN_FORCE_INLINE_FUNCTION constexpr _T2 const &__second() const noexcept { + return __t2_val; } MDSPAN_INLINE_FUNCTION_DEFAULTED - constexpr __compressed_pair() noexcept = default; + constexpr __compressed_pair() = default; MDSPAN_INLINE_FUNCTION_DEFAULTED - constexpr __compressed_pair(__compressed_pair const &) noexcept = default; + constexpr __compressed_pair(__compressed_pair const &) = default; MDSPAN_INLINE_FUNCTION_DEFAULTED - constexpr __compressed_pair(__compressed_pair &&) noexcept = default; + constexpr __compressed_pair(__compressed_pair &&) = default; MDSPAN_INLINE_FUNCTION_DEFAULTED _MDSPAN_CONSTEXPR_14_DEFAULTED __compressed_pair & - operator=(__compressed_pair const &) noexcept = default; + operator=(__compressed_pair const &) = default; MDSPAN_INLINE_FUNCTION_DEFAULTED _MDSPAN_CONSTEXPR_14_DEFAULTED __compressed_pair & - operator=(__compressed_pair &&) noexcept = default; + operator=(__compressed_pair &&) = default; MDSPAN_INLINE_FUNCTION_DEFAULTED - ~__compressed_pair() noexcept = default; - template - MDSPAN_INLINE_FUNCTION constexpr __compressed_pair(_TLike &&__t, _ULike &&__u) - : __t_val((_TLike &&) __t), __u_val((_ULike &&) __u) {} + ~__compressed_pair() = default; + template + MDSPAN_INLINE_FUNCTION constexpr __compressed_pair(_T1Like &&__t1, _T2Like &&__t2) + : __t1_val((_T1Like &&) __t1), __t2_val((_T2Like &&) __t2) {} }; #if !defined(_MDSPAN_USE_ATTRIBUTE_NO_UNIQUE_ADDRESS) // First empty. 
-template +template struct __compressed_pair< - _T, _U, - std::enable_if_t<_MDSPAN_TRAIT(std::is_empty, _T) && !_MDSPAN_TRAIT(std::is_empty, _U)>> - : private _T { - _U __u_val; - MDSPAN_FORCE_INLINE_FUNCTION _MDSPAN_CONSTEXPR_14 _T &__first() noexcept { - return *static_cast<_T *>(this); + _T1, _T2, + std::enable_if_t<_MDSPAN_TRAIT(std::is_empty, _T1) && !_MDSPAN_TRAIT(std::is_empty, _T2)>> + : private _T1 { + _T2 __t2_val{}; + MDSPAN_FORCE_INLINE_FUNCTION _MDSPAN_CONSTEXPR_14 _T1 &__first() noexcept { + return *static_cast<_T1 *>(this); } - MDSPAN_FORCE_INLINE_FUNCTION constexpr _T const &__first() const noexcept { - return *static_cast<_T const *>(this); + MDSPAN_FORCE_INLINE_FUNCTION constexpr _T1 const &__first() const noexcept { + return *static_cast<_T1 const *>(this); } - MDSPAN_FORCE_INLINE_FUNCTION _MDSPAN_CONSTEXPR_14 _U &__second() noexcept { return __u_val; } - MDSPAN_FORCE_INLINE_FUNCTION constexpr _U const &__second() const noexcept { - return __u_val; + MDSPAN_FORCE_INLINE_FUNCTION _MDSPAN_CONSTEXPR_14 _T2 &__second() noexcept { return __t2_val; } + MDSPAN_FORCE_INLINE_FUNCTION constexpr _T2 const &__second() const noexcept { + return __t2_val; } MDSPAN_INLINE_FUNCTION_DEFAULTED - constexpr __compressed_pair() noexcept = default; + constexpr __compressed_pair() = default; MDSPAN_INLINE_FUNCTION_DEFAULTED - constexpr __compressed_pair(__compressed_pair const &) noexcept = default; + constexpr __compressed_pair(__compressed_pair const &) = default; MDSPAN_INLINE_FUNCTION_DEFAULTED - constexpr __compressed_pair(__compressed_pair &&) noexcept = default; + constexpr __compressed_pair(__compressed_pair &&) = default; MDSPAN_INLINE_FUNCTION_DEFAULTED _MDSPAN_CONSTEXPR_14_DEFAULTED __compressed_pair & - operator=(__compressed_pair const &) noexcept = default; + operator=(__compressed_pair const &) = default; MDSPAN_INLINE_FUNCTION_DEFAULTED _MDSPAN_CONSTEXPR_14_DEFAULTED __compressed_pair & - operator=(__compressed_pair &&) noexcept = default; + operator=(__compressed_pair &&) = default; MDSPAN_INLINE_FUNCTION_DEFAULTED - ~__compressed_pair() noexcept = default; - template - MDSPAN_INLINE_FUNCTION constexpr __compressed_pair(_TLike &&__t, _ULike &&__u) - : _T((_TLike &&) __t), __u_val((_ULike &&) __u) {} + ~__compressed_pair() = default; + template + MDSPAN_INLINE_FUNCTION constexpr __compressed_pair(_T1Like &&__t1, _T2Like &&__t2) + : _T1((_T1Like &&) __t1), __t2_val((_T2Like &&) __t2) {} }; // Second empty. 
-template +template struct __compressed_pair< - _T, _U, - std::enable_if_t> - : private _U { - _T __t_val; - MDSPAN_FORCE_INLINE_FUNCTION _MDSPAN_CONSTEXPR_14 _T &__first() noexcept { return __t_val; } - MDSPAN_FORCE_INLINE_FUNCTION constexpr _T const &__first() const noexcept { - return __t_val; + _T1, _T2, + std::enable_if_t> + : private _T2 { + _T1 __t1_val{}; + MDSPAN_FORCE_INLINE_FUNCTION _MDSPAN_CONSTEXPR_14 _T1 &__first() noexcept { return __t1_val; } + MDSPAN_FORCE_INLINE_FUNCTION constexpr _T1 const &__first() const noexcept { + return __t1_val; } - MDSPAN_FORCE_INLINE_FUNCTION _MDSPAN_CONSTEXPR_14 _U &__second() noexcept { - return *static_cast<_U *>(this); + MDSPAN_FORCE_INLINE_FUNCTION _MDSPAN_CONSTEXPR_14 _T2 &__second() noexcept { + return *static_cast<_T2 *>(this); } - MDSPAN_FORCE_INLINE_FUNCTION constexpr _U const &__second() const noexcept { - return *static_cast<_U const *>(this); + MDSPAN_FORCE_INLINE_FUNCTION constexpr _T2 const &__second() const noexcept { + return *static_cast<_T2 const *>(this); } MDSPAN_INLINE_FUNCTION_DEFAULTED - constexpr __compressed_pair() noexcept = default; + constexpr __compressed_pair() = default; MDSPAN_INLINE_FUNCTION_DEFAULTED - constexpr __compressed_pair(__compressed_pair const &) noexcept = default; + constexpr __compressed_pair(__compressed_pair const &) = default; MDSPAN_INLINE_FUNCTION_DEFAULTED - constexpr __compressed_pair(__compressed_pair &&) noexcept = default; + constexpr __compressed_pair(__compressed_pair &&) = default; MDSPAN_INLINE_FUNCTION_DEFAULTED _MDSPAN_CONSTEXPR_14_DEFAULTED __compressed_pair & - operator=(__compressed_pair const &) noexcept = default; + operator=(__compressed_pair const &) = default; MDSPAN_INLINE_FUNCTION_DEFAULTED _MDSPAN_CONSTEXPR_14_DEFAULTED __compressed_pair & - operator=(__compressed_pair &&) noexcept = default; + operator=(__compressed_pair &&) = default; MDSPAN_INLINE_FUNCTION_DEFAULTED - ~__compressed_pair() noexcept = default; + ~__compressed_pair() = default; - template - MDSPAN_INLINE_FUNCTION constexpr __compressed_pair(_TLike &&__t, _ULike &&__u) - : _U((_ULike &&) __u), __t_val((_TLike &&) __t) {} + template + MDSPAN_INLINE_FUNCTION constexpr __compressed_pair(_T1Like &&__t1, _T2Like &&__t2) + : _T2((_T2Like &&) __t2), __t1_val((_T1Like &&) __t1) {} }; // Both empty. -template +template struct __compressed_pair< - _T, _U, - std::enable_if_t<_MDSPAN_TRAIT(std::is_empty, _T) && _MDSPAN_TRAIT(std::is_empty, _U)>> + _T1, _T2, + std::enable_if_t<_MDSPAN_TRAIT(std::is_empty, _T1) && _MDSPAN_TRAIT(std::is_empty, _T2)>> // We need to use the __no_unique_address_emulation wrapper here to avoid // base class ambiguities. #ifdef _MDSPAN_COMPILER_MSVC // MSVC doesn't allow you to access public static member functions of a type // when you *happen* to privately inherit from that type. 
- : protected __no_unique_address_emulation<_T, 0>, - protected __no_unique_address_emulation<_U, 1> + : protected __no_unique_address_emulation<_T1, 0>, + protected __no_unique_address_emulation<_T2, 1> #else - : private __no_unique_address_emulation<_T, 0>, - private __no_unique_address_emulation<_U, 1> + : private __no_unique_address_emulation<_T1, 0>, + private __no_unique_address_emulation<_T2, 1> #endif { - using __first_base_t = __no_unique_address_emulation<_T, 0>; - using __second_base_t = __no_unique_address_emulation<_U, 1>; + using __first_base_t = __no_unique_address_emulation<_T1, 0>; + using __second_base_t = __no_unique_address_emulation<_T2, 1>; - MDSPAN_FORCE_INLINE_FUNCTION _MDSPAN_CONSTEXPR_14 _T &__first() noexcept { + MDSPAN_FORCE_INLINE_FUNCTION _MDSPAN_CONSTEXPR_14 _T1 &__first() noexcept { return this->__first_base_t::__ref(); } - MDSPAN_FORCE_INLINE_FUNCTION constexpr _T const &__first() const noexcept { + MDSPAN_FORCE_INLINE_FUNCTION constexpr _T1 const &__first() const noexcept { return this->__first_base_t::__ref(); } - MDSPAN_FORCE_INLINE_FUNCTION _MDSPAN_CONSTEXPR_14 _U &__second() noexcept { + MDSPAN_FORCE_INLINE_FUNCTION _MDSPAN_CONSTEXPR_14 _T2 &__second() noexcept { return this->__second_base_t::__ref(); } - MDSPAN_FORCE_INLINE_FUNCTION constexpr _U const &__second() const noexcept { + MDSPAN_FORCE_INLINE_FUNCTION constexpr _T2 const &__second() const noexcept { return this->__second_base_t::__ref(); } MDSPAN_INLINE_FUNCTION_DEFAULTED - constexpr __compressed_pair() noexcept = default; + constexpr __compressed_pair() = default; MDSPAN_INLINE_FUNCTION_DEFAULTED - constexpr __compressed_pair(__compressed_pair const &) noexcept = default; + constexpr __compressed_pair(__compressed_pair const &) = default; MDSPAN_INLINE_FUNCTION_DEFAULTED - constexpr __compressed_pair(__compressed_pair &&) noexcept = default; + constexpr __compressed_pair(__compressed_pair &&) = default; MDSPAN_INLINE_FUNCTION_DEFAULTED _MDSPAN_CONSTEXPR_14_DEFAULTED __compressed_pair & - operator=(__compressed_pair const &) noexcept = default; + operator=(__compressed_pair const &) = default; MDSPAN_INLINE_FUNCTION_DEFAULTED _MDSPAN_CONSTEXPR_14_DEFAULTED __compressed_pair & - operator=(__compressed_pair &&) noexcept = default; + operator=(__compressed_pair &&) = default; MDSPAN_INLINE_FUNCTION_DEFAULTED - ~__compressed_pair() noexcept = default; - template - MDSPAN_INLINE_FUNCTION constexpr __compressed_pair(_TLike &&__t, _ULike &&__u) noexcept - : __first_base_t(_T((_TLike &&) __t)), - __second_base_t(_U((_ULike &&) __u)) + ~__compressed_pair() = default; + template + MDSPAN_INLINE_FUNCTION constexpr __compressed_pair(_T1Like &&__t1, _T2Like &&__t2) noexcept + : __first_base_t(_T1((_T1Like &&) __t1)), + __second_base_t(_T2((_T2Like &&) __t2)) { } }; diff --git a/tpls/mdspan/include/experimental/__p0009_bits/config.hpp b/tpls/mdspan/include/experimental/__p0009_bits/config.hpp index d35e201cebd..8e42a37ba7c 100644 --- a/tpls/mdspan/include/experimental/__p0009_bits/config.hpp +++ b/tpls/mdspan/include/experimental/__p0009_bits/config.hpp @@ -35,10 +35,17 @@ #define MDSPAN_CXX_STD_14 201402L #define MDSPAN_CXX_STD_17 201703L #define MDSPAN_CXX_STD_20 202002L +// Note GCC has not updated this in version 13 +#ifdef __clang__ +#define MDSPAN_CXX_STD_23 202302L +#else +#define MDSPAN_CXX_STD_23 202100L +#endif #define MDSPAN_HAS_CXX_14 (_MDSPAN_CPLUSPLUS >= MDSPAN_CXX_STD_14) #define MDSPAN_HAS_CXX_17 (_MDSPAN_CPLUSPLUS >= MDSPAN_CXX_STD_17) #define MDSPAN_HAS_CXX_20 (_MDSPAN_CPLUSPLUS >= 
MDSPAN_CXX_STD_20) +#define MDSPAN_HAS_CXX_23 (_MDSPAN_CPLUSPLUS >= MDSPAN_CXX_STD_23) static_assert(_MDSPAN_CPLUSPLUS >= MDSPAN_CXX_STD_14, "mdspan requires C++14 or later."); @@ -224,7 +231,7 @@ static_assert(_MDSPAN_CPLUSPLUS >= MDSPAN_CXX_STD_14, "mdspan requires C++14 or #endif #ifndef MDSPAN_CONDITIONAL_EXPLICIT -# if MDSPAN_HAS_CXX_20 && !defined(_MDSPAN_COMPILER_MSVC) +# if MDSPAN_HAS_CXX_20 # define MDSPAN_CONDITIONAL_EXPLICIT(COND) explicit(COND) # else # define MDSPAN_CONDITIONAL_EXPLICIT(COND) diff --git a/tpls/mdspan/include/experimental/__p0009_bits/extents.hpp b/tpls/mdspan/include/experimental/__p0009_bits/extents.hpp index 0dd31c4cd0a..9a28c3ed5ca 100644 --- a/tpls/mdspan/include/experimental/__p0009_bits/extents.hpp +++ b/tpls/mdspan/include/experimental/__p0009_bits/extents.hpp @@ -55,6 +55,14 @@ __check_compatible_extents( return {}; } +template +MDSPAN_INLINE_FUNCTION +static constexpr bool are_valid_indices() { + return + (std::is_convertible::value && ... && true) && + (std::is_nothrow_constructible::value && ... && true); +} + // ------------------------------------------------------------------ // ------------ static_array ---------------------------------------- // ------------------------------------------------------------------ @@ -140,7 +148,8 @@ struct index_sequence_scan_impl { template struct index_sequence_scan_impl { -#if defined(__NVCC__) || defined(__NVCOMPILER) +#if defined(__NVCC__) || defined(__NVCOMPILER) || \ + defined(_MDSPAN_COMPILER_INTEL) // NVCC warns about pointless comparison with 0 for R==0 and r being const // evaluatable and also 0. MDSPAN_INLINE_FUNCTION @@ -167,7 +176,7 @@ template <> struct index_sequence_scan_impl<0> { // all static values. template struct possibly_empty_array { - T vals[N]; + T vals[N]{}; MDSPAN_INLINE_FUNCTION constexpr T &operator[](size_t r) { return vals[r]; } MDSPAN_INLINE_FUNCTION @@ -251,12 +260,17 @@ struct maybe_static_array { #ifdef __cpp_lib_span MDSPAN_TEMPLATE_REQUIRES(class T, size_t N, - /* requires */ (N == m_size_dynamic)) + /* requires */ (N == m_size_dynamic && N > 0)) MDSPAN_INLINE_FUNCTION constexpr maybe_static_array(const std::span &vals) { for (size_t r = 0; r < N; r++) m_dyn_vals[r] = static_cast(vals[r]); } + + MDSPAN_TEMPLATE_REQUIRES(class T, size_t N, + /* requires */ (N == m_size_dynamic && N == 0)) + MDSPAN_INLINE_FUNCTION + constexpr maybe_static_array(const std::span &) : m_dyn_vals{} {} #endif // constructors from all values @@ -423,9 +437,9 @@ template class extents { class OtherIndexType, size_t N, /* requires */ ( - _MDSPAN_TRAIT(std::is_convertible, OtherIndexType, index_type) && + _MDSPAN_TRAIT(std::is_convertible, const OtherIndexType&, index_type) && _MDSPAN_TRAIT(std::is_nothrow_constructible, index_type, - OtherIndexType) && + const OtherIndexType&) && (N == m_rank || N == m_rank_dynamic))) MDSPAN_INLINE_FUNCTION MDSPAN_CONDITIONAL_EXPLICIT(N != m_rank_dynamic) @@ -436,8 +450,8 @@ template class extents { MDSPAN_TEMPLATE_REQUIRES( class OtherIndexType, size_t N, /* requires */ - (_MDSPAN_TRAIT(std::is_convertible, OtherIndexType, index_type) && - _MDSPAN_TRAIT(std::is_nothrow_constructible, index_type, OtherIndexType) && + (_MDSPAN_TRAIT(std::is_convertible, const OtherIndexType&, index_type) && + _MDSPAN_TRAIT(std::is_nothrow_constructible, index_type, const OtherIndexType&) && (N == m_rank || N == m_rank_dynamic))) MDSPAN_INLINE_FUNCTION MDSPAN_CONDITIONAL_EXPLICIT(N != m_rank_dynamic) @@ -454,6 +468,7 @@ template class extents { size_t DynCount, size_t R, class 
OtherExtents, class... DynamicValues, /* requires */ ((R < m_rank) && (static_extent(R) == dynamic_extent))) MDSPAN_INLINE_FUNCTION + constexpr vals_t __construct_vals_from_extents(std::integral_constant, std::integral_constant, const OtherExtents &exts, @@ -468,6 +483,7 @@ template class extents { size_t DynCount, size_t R, class OtherExtents, class... DynamicValues, /* requires */ ((R < m_rank) && (static_extent(R) != dynamic_extent))) MDSPAN_INLINE_FUNCTION + constexpr vals_t __construct_vals_from_extents(std::integral_constant, std::integral_constant, const OtherExtents &exts, @@ -481,6 +497,7 @@ template class extents { size_t DynCount, size_t R, class OtherExtents, class... DynamicValues, /* requires */ ((R == m_rank) && (DynCount == m_rank_dynamic))) MDSPAN_INLINE_FUNCTION + constexpr vals_t __construct_vals_from_extents(std::integral_constant, std::integral_constant, const OtherExtents &, @@ -491,17 +508,20 @@ template class extents { public: // Converting constructor from other extents specializations - MDSPAN_TEMPLATE_REQUIRES( - class OtherIndexType, size_t... OtherExtents, - /* requires */ - ( - /* multi-stage check to protect from invalid pack expansion when sizes - don't match? */ - decltype(detail::__check_compatible_extents( - std::integral_constant{}, + MDSPAN_TEMPLATE_REQUIRES( + class OtherIndexType, size_t... OtherExtents, + /* requires */ + ( + /* multi-stage check to protect from invalid pack expansion when sizes + don't match? */ + decltype(detail::__check_compatible_extents( + // using: sizeof...(Extents) == sizeof...(OtherExtents) as the second argument fails with MSVC+NVCC with some obscure expansion error + // MSVC: 19.38.33133 NVCC: 12.0 + std::integral_constant::rank() == extents::rank()>{}, std::integer_sequence{}, - std::integer_sequence{}))::value)) + std::integer_sequence{}))::value + ) + ) MDSPAN_INLINE_FUNCTION MDSPAN_CONDITIONAL_EXPLICIT((((Extents != dynamic_extent) && (OtherExtents == dynamic_extent)) || @@ -518,10 +538,14 @@ template class extents { MDSPAN_INLINE_FUNCTION friend constexpr bool operator==(const extents &lhs, const extents &rhs) noexcept { - bool value = true; - for (size_type r = 0; r < m_rank; r++) - value &= rhs.extent(r) == lhs.extent(r); - return value; + if constexpr (rank() != extents::rank()) { + return false; + } else { + using common_t = std::common_type_t; + for (size_type r = 0; r < m_rank; r++) + if(static_cast(rhs.extent(r)) != static_cast(lhs.extent(r))) return false; + } + return true; } #if !(MDSPAN_HAS_CXX_20) @@ -570,7 +594,7 @@ using dextents = typename detail::__make_dextents::type; template extents(IndexTypes...) -> extents; + ((void) sizeof(IndexTypes), ::MDSPAN_IMPL_STANDARD_NAMESPACE::dynamic_extent)...>; #endif // Helper type traits for identifying a class as extents. diff --git a/tpls/mdspan/include/experimental/__p0009_bits/layout_left.hpp b/tpls/mdspan/include/experimental/__p0009_bits/layout_left.hpp index af44494a98d..83ed9ef7fe3 100644 --- a/tpls/mdspan/include/experimental/__p0009_bits/layout_left.hpp +++ b/tpls/mdspan/include/experimental/__p0009_bits/layout_left.hpp @@ -18,6 +18,9 @@ #include "macros.hpp" #include "trait_backports.hpp" #include "extents.hpp" +#include "../__p2642_bits/layout_padded_fwd.hpp" +#include +#include namespace MDSPAN_IMPL_STANDARD_NAMESPACE { @@ -108,6 +111,36 @@ class layout_left::mapping { */ } +#if MDSPAN_HAS_CXX_17 + /** + * Converting constructor from `layout_left_padded::mapping`. 
+ * + * This overload participates in overload resolution only if _Mapping is a layout_left_padded mapping and + * extents_type is constructible from _Mapping::extents_type. + * + * \note There is currently a difference from p2642r2, where this function is specified as taking + * `layout_left_padded< padding_value >::mapping< Extents>`. However, this makes `padding_value` non-deducible. + */ + MDSPAN_TEMPLATE_REQUIRES( + class _Mapping, + /* requires */ ( + MDSPAN_IMPL_PROPOSED_NAMESPACE::detail::is_layout_left_padded_mapping<_Mapping>::value + && std::is_constructible_v + ) + ) + MDSPAN_CONDITIONAL_EXPLICIT((!std::is_convertible_v)) + mapping(const _Mapping& __other) noexcept + : __extents(__other.extents()) + { + MDSPAN_IMPL_PROPOSED_NAMESPACE::detail:: + check_padded_layout_converting_constructor_mandates(); + MDSPAN_IMPL_PROPOSED_NAMESPACE::detail:: + check_padded_layout_converting_constructor_preconditions< + extents_type>(__other); + } +#endif + MDSPAN_TEMPLATE_REQUIRES( class OtherExtents, /* requires */ ( @@ -124,13 +157,14 @@ class layout_left::mapping { * other.required_span_size() is a representable value of type index_type */ #if !defined(_MDSPAN_HAS_CUDA) && !defined(_MDSPAN_HAS_HIP) && !defined(NDEBUG) - index_type stride = 1; - for(rank_type r=0; r<__extents.rank(); r++) { - if(stride != static_cast(other.stride(r))) { - // Note this throw will lead to a terminate if triggered since this function is marked noexcept - throw std::runtime_error("Assigning layout_stride to layout_left with invalid strides."); + if constexpr (extents_type::rank() > 0) { + index_type stride = 1; + using common_t = std::common_type_t; + for(rank_type r=0; r<__extents.rank(); r++) { + if(static_cast(stride) != static_cast(other.stride(r))) + std::abort(); // ("Assigning layout_stride to layout_left with invalid strides."); + stride *= __extents.extent(r); } - stride *= __extents.extent(r); } #endif } @@ -155,10 +189,7 @@ class layout_left::mapping { class... 
Indices, /* requires */ ( (sizeof...(Indices) == extents_type::rank()) && - _MDSPAN_FOLD_AND( - (_MDSPAN_TRAIT(std::is_convertible, Indices, index_type) && - _MDSPAN_TRAIT(std::is_nothrow_constructible, index_type, Indices)) - ) + (detail::are_valid_indices()) ) ) _MDSPAN_HOST_DEVICE @@ -172,9 +203,9 @@ class layout_left::mapping { MDSPAN_INLINE_FUNCTION static constexpr bool is_always_exhaustive() noexcept { return true; } MDSPAN_INLINE_FUNCTION static constexpr bool is_always_strided() noexcept { return true; } - MDSPAN_INLINE_FUNCTION constexpr bool is_unique() const noexcept { return true; } - MDSPAN_INLINE_FUNCTION constexpr bool is_exhaustive() const noexcept { return true; } - MDSPAN_INLINE_FUNCTION constexpr bool is_strided() const noexcept { return true; } + MDSPAN_INLINE_FUNCTION static constexpr bool is_unique() noexcept { return true; } + MDSPAN_INLINE_FUNCTION static constexpr bool is_exhaustive() noexcept { return true; } + MDSPAN_INLINE_FUNCTION static constexpr bool is_strided() noexcept { return true; } MDSPAN_INLINE_FUNCTION constexpr index_type stride(rank_type i) const noexcept @@ -187,7 +218,10 @@ class layout_left::mapping { return value; } - template + MDSPAN_TEMPLATE_REQUIRES( + class OtherExtents, + /* requires */ ( Extents::rank() == OtherExtents::rank()) + ) MDSPAN_INLINE_FUNCTION friend constexpr bool operator==(mapping const& lhs, mapping const& rhs) noexcept { return lhs.extents() == rhs.extents(); @@ -195,7 +229,10 @@ class layout_left::mapping { // In C++ 20 the not equal exists if equal is found #if !(MDSPAN_HAS_CXX_20) - template + MDSPAN_TEMPLATE_REQUIRES( + class OtherExtents, + /* requires */ ( Extents::rank() == OtherExtents::rank()) + ) MDSPAN_INLINE_FUNCTION friend constexpr bool operator!=(mapping const& lhs, mapping const& rhs) noexcept { return lhs.extents() != rhs.extents(); @@ -215,6 +252,17 @@ class layout_left::mapping { private: _MDSPAN_NO_UNIQUE_ADDRESS extents_type __extents{}; + // [mdspan.submdspan.mapping], submdspan mapping specialization + template + MDSPAN_INLINE_FUNCTION + constexpr auto submdspan_mapping_impl( + SliceSpecifiers... slices) const; + + template + friend constexpr auto submdspan_mapping( + const mapping& src, SliceSpecifiers... slices) { + return src.submdspan_mapping_impl(slices...); + } }; diff --git a/tpls/mdspan/include/experimental/__p0009_bits/layout_right.hpp b/tpls/mdspan/include/experimental/__p0009_bits/layout_right.hpp index a0586484202..3d3927df7bc 100644 --- a/tpls/mdspan/include/experimental/__p0009_bits/layout_right.hpp +++ b/tpls/mdspan/include/experimental/__p0009_bits/layout_right.hpp @@ -20,6 +20,7 @@ #include "extents.hpp" #include #include "layout_stride.hpp" +#include "../__p2642_bits/layout_padded_fwd.hpp" namespace MDSPAN_IMPL_STANDARD_NAMESPACE { @@ -113,6 +114,34 @@ class layout_right::mapping { */ } + /** + * Converting constructor from `layout_right_padded::mapping`. + * + * This overload participates in overload resolution only if _Mapping is a layout_right_padded mapping and + * extents_type is constructible from _Mapping::extents_type. + * + * \note There is currently a difference from p2642r2, where this function is specified as taking + * `layout_right_padded< padding_value >::mapping< Extents>`. However, this makes `padding_value` non-deducible. 
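+   *
+   * A minimal usage sketch (illustrative only, not part of the change; assumes
+   * a padding value that already equals the trailing extent, so the padded
+   * mapping is contiguous):
+   * \code
+   * using E = extents<int, 2, 4>;
+   * layout_right_padded<4>::mapping<E> padded{E{}};  // stride(0) == 4
+   * layout_right::mapping<E> plain{padded};          // OK: preconditions hold
+   * \endcode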
+ */ +#if MDSPAN_HAS_CXX_17 + MDSPAN_TEMPLATE_REQUIRES( + class _Mapping, + /* requires */ ( + MDSPAN_IMPL_PROPOSED_NAMESPACE::detail::is_layout_right_padded_mapping<_Mapping>::value + && std::is_constructible_v)) + MDSPAN_CONDITIONAL_EXPLICIT((!std::is_convertible_v)) + mapping(const _Mapping &__other) noexcept + : __extents(__other.extents()) + { + MDSPAN_IMPL_PROPOSED_NAMESPACE::detail:: + check_padded_layout_converting_constructor_mandates(); + MDSPAN_IMPL_PROPOSED_NAMESPACE::detail:: + check_padded_layout_converting_constructor_preconditions< + extents_type>(__other); + } +#endif + MDSPAN_TEMPLATE_REQUIRES( class OtherExtents, /* requires */ ( @@ -129,13 +158,14 @@ class layout_right::mapping { * other.required_span_size() is a representable value of type index_type */ #if !defined(_MDSPAN_HAS_CUDA) && !defined(_MDSPAN_HAS_HIP) && !defined(NDEBUG) - index_type stride = 1; - for(rank_type r=__extents.rank(); r>0; r--) { - if(stride != static_cast(other.stride(r-1))) { - // Note this throw will lead to a terminate if triggered since this function is marked noexcept - throw std::runtime_error("Assigning layout_stride to layout_right with invalid strides."); + if constexpr (extents_type::rank() > 0) { + index_type stride = 1; + using common_t = std::common_type_t; + for(rank_type r=__extents.rank(); r>0; r--) { + if(static_cast(stride) != static_cast(other.stride(r-1))) + std::abort(); // ("Assigning layout_stride to layout_right with invalid strides."); + stride *= __extents.extent(r-1); } - stride *= __extents.extent(r-1); } #endif } @@ -157,13 +187,10 @@ class layout_right::mapping { //-------------------------------------------------------------------------------- MDSPAN_TEMPLATE_REQUIRES( - class... Indices, + class ... Indices, /* requires */ ( - (sizeof...(Indices) == extents_type::rank()) && - _MDSPAN_FOLD_AND( - (_MDSPAN_TRAIT(std::is_convertible, Indices, index_type) && - _MDSPAN_TRAIT(std::is_nothrow_constructible, index_type, Indices)) - ) + (sizeof...(Indices) == extents_type::rank()) && + (detail::are_valid_indices()) ) ) _MDSPAN_HOST_DEVICE @@ -174,9 +201,9 @@ class layout_right::mapping { MDSPAN_INLINE_FUNCTION static constexpr bool is_always_unique() noexcept { return true; } MDSPAN_INLINE_FUNCTION static constexpr bool is_always_exhaustive() noexcept { return true; } MDSPAN_INLINE_FUNCTION static constexpr bool is_always_strided() noexcept { return true; } - MDSPAN_INLINE_FUNCTION constexpr bool is_unique() const noexcept { return true; } - MDSPAN_INLINE_FUNCTION constexpr bool is_exhaustive() const noexcept { return true; } - MDSPAN_INLINE_FUNCTION constexpr bool is_strided() const noexcept { return true; } + MDSPAN_INLINE_FUNCTION static constexpr bool is_unique() noexcept { return true; } + MDSPAN_INLINE_FUNCTION static constexpr bool is_exhaustive() noexcept { return true; } + MDSPAN_INLINE_FUNCTION static constexpr bool is_strided() noexcept { return true; } MDSPAN_INLINE_FUNCTION constexpr index_type stride(rank_type i) const noexcept @@ -189,7 +216,10 @@ class layout_right::mapping { return value; } - template + MDSPAN_TEMPLATE_REQUIRES( + class OtherExtents, + /* requires */ ( Extents::rank() == OtherExtents::rank()) + ) MDSPAN_INLINE_FUNCTION friend constexpr bool operator==(mapping const& lhs, mapping const& rhs) noexcept { return lhs.extents() == rhs.extents(); @@ -197,7 +227,10 @@ class layout_right::mapping { // In C++ 20 the not equal exists if equal is found #if !(MDSPAN_HAS_CXX_20) - template + MDSPAN_TEMPLATE_REQUIRES( + class OtherExtents, + /* requires 
*/ (Extents::rank() == OtherExtents::rank()) + ) MDSPAN_INLINE_FUNCTION friend constexpr bool operator!=(mapping const& lhs, mapping const& rhs) noexcept { return lhs.extents() != rhs.extents(); @@ -217,6 +250,17 @@ class layout_right::mapping { private: _MDSPAN_NO_UNIQUE_ADDRESS extents_type __extents{}; + // [mdspan.submdspan.mapping], submdspan mapping specialization + template + MDSPAN_INLINE_FUNCTION + constexpr auto submdspan_mapping_impl( + SliceSpecifiers... slices) const; + + template + friend constexpr auto submdspan_mapping( + const mapping& src, SliceSpecifiers... slices) { + return src.submdspan_mapping_impl(slices...); + } }; } // end namespace MDSPAN_IMPL_STANDARD_NAMESPACE diff --git a/tpls/mdspan/include/experimental/__p0009_bits/layout_stride.hpp b/tpls/mdspan/include/experimental/__p0009_bits/layout_stride.hpp index 030a494529b..15ad577d149 100644 --- a/tpls/mdspan/include/experimental/__p0009_bits/layout_stride.hpp +++ b/tpls/mdspan/include/experimental/__p0009_bits/layout_stride.hpp @@ -88,7 +88,7 @@ struct layout_stride { : private detail::__no_unique_address_emulation< detail::__compressed_pair< Extents, - std::array + detail::possibly_empty_array > > #endif @@ -109,7 +109,7 @@ struct layout_stride { //---------------------------------------------------------------------------- - using __strides_storage_t = std::array; + using __strides_storage_t = detail::possibly_empty_array; using __member_pair_t = detail::__compressed_pair; #if defined(_MDSPAN_USE_ATTRIBUTE_NO_UNIQUE_ADDRESS) @@ -158,14 +158,16 @@ struct layout_stride { template MDSPAN_INLINE_FUNCTION static constexpr bool _eq_impl(mapping const& self, mapping const& other) noexcept { - return _MDSPAN_FOLD_AND((self.stride(Idxs) == other.stride(Idxs)) /* && ... */) - && _MDSPAN_FOLD_AND((self.extents().extent(Idxs) == other.extents().extent(Idxs)) /* || ... */); + using common_t = std::common_type_t; + return _MDSPAN_FOLD_AND((static_cast(self.stride(Idxs)) == static_cast(other.stride(Idxs))) /* && ... */) + && _MDSPAN_FOLD_AND((static_cast(self.extents().extent(Idxs)) == static_cast(other.extents().extent(Idxs))) /* || ... */); } template MDSPAN_INLINE_FUNCTION static constexpr bool _not_eq_impl(mapping const& self, mapping const& other) noexcept { - return _MDSPAN_FOLD_OR((self.stride(Idxs) != other.stride(Idxs)) /* || ... */) - || _MDSPAN_FOLD_OR((self.extents().extent(Idxs) != other.extents().extent(Idxs)) /* || ... */); + using common_t = std::common_type_t; + return _MDSPAN_FOLD_OR((static_cast(self.stride(Idxs)) != static_cast(other.stride(Idxs))) /* || ... */) + || _MDSPAN_FOLD_OR((static_cast(self.extents().extent(Idxs)) != static_cast(other.extents().extent(Idxs))) /* || ... */); } template @@ -205,6 +207,11 @@ struct layout_stride { } #endif + MDSPAN_INLINE_FUNCTION + static constexpr std::array return_strides(const __strides_storage_t& s) { + return std::array{s[Idxs]...}; + } + template MDSPAN_INLINE_FUNCTION static constexpr size_t __return_zero() { return 0; } @@ -218,6 +225,21 @@ struct layout_stride { // Can't use defaulted parameter in the __deduction_workaround template because of a bug in MSVC warning C4348. 
using __impl = __deduction_workaround>; + static constexpr __strides_storage_t strides_storage(std::true_type) { + __strides_storage_t s{}; + + extents_type e; + index_type stride = 1; + for(int r = static_cast(extents_type::rank() - 1); r >= 0; r--) { + s[r] = stride; + stride *= e.extent(r); + } + + return s; + } + static constexpr __strides_storage_t strides_storage(std::false_type) { + return {}; + } //---------------------------------------------------------------------------- @@ -233,7 +255,21 @@ struct layout_stride { //-------------------------------------------------------------------------------- - MDSPAN_INLINE_FUNCTION_DEFAULTED constexpr mapping() noexcept = default; + MDSPAN_INLINE_FUNCTION_DEFAULTED constexpr mapping() noexcept +#if defined(_MDSPAN_USE_ATTRIBUTE_NO_UNIQUE_ADDRESS) + : __members{ +#else + : __base_t(__base_t{__member_pair_t( +#endif + extents_type(), + __strides_storage_t(strides_storage(std::integral_constant 0)>{})) +#if defined(_MDSPAN_USE_ATTRIBUTE_NO_UNIQUE_ADDRESS) + } +#else + )}) +#endif + {} + MDSPAN_INLINE_FUNCTION_DEFAULTED constexpr mapping(mapping const&) noexcept = default; MDSPAN_TEMPLATE_REQUIRES( @@ -332,10 +368,10 @@ struct layout_stride { ) #endif MDSPAN_CONDITIONAL_EXPLICIT( - (!std::is_convertible::value) && - (detail::__is_mapping_of || - detail::__is_mapping_of || - detail::__is_mapping_of) + !(std::is_convertible::value && + (detail::__is_mapping_of || + detail::__is_mapping_of || + detail::__is_mapping_of)) ) // needs two () due to comma MDSPAN_INLINE_FUNCTION _MDSPAN_CONSTEXPR_14 mapping(StridedLayoutMapping const& other) noexcept // NOLINT(google-explicit-constructor) @@ -374,7 +410,7 @@ struct layout_stride { MDSPAN_INLINE_FUNCTION constexpr std::array< index_type, extents_type::rank() > strides() const noexcept { - return __strides_storage(); + return __impl::return_strides(__strides_storage()); } MDSPAN_INLINE_FUNCTION @@ -393,8 +429,7 @@ struct layout_stride { class... 
Indices, /* requires */ ( sizeof...(Indices) == Extents::rank() && - _MDSPAN_FOLD_AND(_MDSPAN_TRAIT(std::is_convertible, Indices, index_type) /*&& ...*/ ) && - _MDSPAN_FOLD_AND(_MDSPAN_TRAIT(std::is_nothrow_constructible, index_type, Indices) /*&& ...*/) + (detail::are_valid_indices()) ) ) MDSPAN_FORCE_INLINE_FUNCTION @@ -410,17 +445,37 @@ struct layout_stride { MDSPAN_INLINE_FUNCTION static constexpr bool is_unique() noexcept { return true; } MDSPAN_INLINE_FUNCTION _MDSPAN_CONSTEXPR_14 bool is_exhaustive() const noexcept { - return required_span_size() == __get_size(extents(), std::make_index_sequence()); + if constexpr (extents_type::rank() == 0) + return true; + else { + index_type span_size = required_span_size(); + if (span_size == static_cast(0)) { + if constexpr (extents_type::rank() == 1) { + return stride(0) == 1; + } else { + rank_type r_largest = 0; + for (rank_type r = 1; r < extents_type::rank(); r++) { + if (stride(r) > stride(r_largest)) { + r_largest = r; + } + } + for (rank_type r = 0; r < extents_type::rank(); r++) { + if (extents().extent(r) == 0 && r != r_largest) { + return false; + } + } + return true; + } + } else { + return required_span_size() == __get_size(extents(), std::make_index_sequence()); + } + } } MDSPAN_INLINE_FUNCTION static constexpr bool is_strided() noexcept { return true; } MDSPAN_INLINE_FUNCTION - constexpr index_type stride(rank_type r) const noexcept -#if MDSPAN_HAS_CXX_20 - requires ( Extents::rank() > 0 ) -#endif - { + constexpr index_type stride(rank_type r) const noexcept { return __strides_storage()[r]; } @@ -444,10 +499,13 @@ struct layout_stride { MDSPAN_INLINE_FUNCTION friend constexpr bool operator==(const mapping& x, const StridedLayoutMapping& y) noexcept { bool strides_match = true; - for(rank_type r = 0; r < extents_type::rank(); r++) - strides_match = strides_match && (x.stride(r) == y.stride(r)); + if constexpr (extents_type::rank() > 0) { + using common_t = std::common_type_t; + for(rank_type r = 0; r < extents_type::rank(); r++) + strides_match = strides_match && (static_cast(x.stride(r)) == static_cast(y.stride(r))); + } return (x.extents() == y.extents()) && - (__impl::__OFFSET(y)== static_cast(0)) && + (__impl::__OFFSET(y) == static_cast(0)) && strides_match; } @@ -489,6 +547,17 @@ struct layout_stride { } #endif + // [mdspan.submdspan.mapping], submdspan mapping specialization + template + MDSPAN_INLINE_FUNCTION + constexpr auto submdspan_mapping_impl( + SliceSpecifiers... slices) const; + + template + friend constexpr auto submdspan_mapping( + const mapping& src, SliceSpecifiers... slices) { + return src.submdspan_mapping_impl(slices...); + } }; }; diff --git a/tpls/mdspan/include/experimental/__p0009_bits/mdspan.hpp b/tpls/mdspan/include/experimental/__p0009_bits/mdspan.hpp index 6febe300215..d6ec49e65bf 100644 --- a/tpls/mdspan/include/experimental/__p0009_bits/mdspan.hpp +++ b/tpls/mdspan/include/experimental/__p0009_bits/mdspan.hpp @@ -55,6 +55,13 @@ class mdspan ReferenceType __callop(mdspan const& __self, const std::array& indices) noexcept { return __self.__accessor_ref().access(__self.__ptr_ref(), __self.__mapping_ref()(indices[Idxs]...)); } +#ifdef __cpp_lib_span + template + MDSPAN_FORCE_INLINE_FUNCTION static constexpr + ReferenceType __callop(mdspan const& __self, const std::span& indices) noexcept { + return __self.__accessor_ref().access(__self.__ptr_ref(), __self.__mapping_ref()(indices[Idxs]...)); + } +#endif }; public: @@ -109,9 +116,8 @@ class mdspan MDSPAN_TEMPLATE_REQUIRES( class... 
SizeTypes, /* requires */ ( - _MDSPAN_FOLD_AND(_MDSPAN_TRAIT(std::is_convertible, SizeTypes, index_type) /* && ... */) && - _MDSPAN_FOLD_AND(_MDSPAN_TRAIT(std::is_nothrow_constructible, index_type, SizeTypes) /* && ... */) && ((sizeof...(SizeTypes) == rank()) || (sizeof...(SizeTypes) == rank_dynamic())) && + (detail::are_valid_indices()) && _MDSPAN_TRAIT(std::is_constructible, mapping_type, extents_type) && _MDSPAN_TRAIT(std::is_default_constructible, accessor_type) ) @@ -125,8 +131,8 @@ class mdspan MDSPAN_TEMPLATE_REQUIRES( class SizeType, size_t N, /* requires */ ( - _MDSPAN_TRAIT(std::is_convertible, SizeType, index_type) && - _MDSPAN_TRAIT(std::is_nothrow_constructible, index_type, SizeType) && + _MDSPAN_TRAIT(std::is_convertible, const SizeType&, index_type) && + _MDSPAN_TRAIT(std::is_nothrow_constructible, index_type, const SizeType&) && ((N == rank()) || (N == rank_dynamic())) && _MDSPAN_TRAIT(std::is_constructible, mapping_type, extents_type) && _MDSPAN_TRAIT(std::is_default_constructible, accessor_type) @@ -142,8 +148,8 @@ class mdspan MDSPAN_TEMPLATE_REQUIRES( class SizeType, size_t N, /* requires */ ( - _MDSPAN_TRAIT(std::is_convertible, SizeType, index_type) && - _MDSPAN_TRAIT(std::is_nothrow_constructible, index_type, SizeType) && + _MDSPAN_TRAIT(std::is_convertible, const SizeType&, index_type) && + _MDSPAN_TRAIT(std::is_nothrow_constructible, index_type, const SizeType&) && ((N == rank()) || (N == rank_dynamic())) && _MDSPAN_TRAIT(std::is_constructible, mapping_type, extents_type) && _MDSPAN_TRAIT(std::is_default_constructible, accessor_type) @@ -160,7 +166,7 @@ class mdspan (MDSPAN_INLINE_FUNCTION constexpr), mdspan, (data_handle_type p, const extents_type& exts), , /* requires */ (_MDSPAN_TRAIT(std::is_default_constructible, accessor_type) && - _MDSPAN_TRAIT(std::is_constructible, mapping_type, extents_type)) + _MDSPAN_TRAIT(std::is_constructible, mapping_type, const extents_type&)) ) : __members(std::move(p), __map_acc_pair_t(mapping_type(exts), accessor_type())) { } @@ -179,10 +185,14 @@ class mdspan MDSPAN_TEMPLATE_REQUIRES( class OtherElementType, class OtherExtents, class OtherLayoutPolicy, class OtherAccessor, /* requires */ ( - _MDSPAN_TRAIT(std::is_constructible, mapping_type, typename OtherLayoutPolicy::template mapping) && - _MDSPAN_TRAIT(std::is_constructible, accessor_type, OtherAccessor) + _MDSPAN_TRAIT(std::is_constructible, mapping_type, const typename OtherLayoutPolicy::template mapping&) && + _MDSPAN_TRAIT(std::is_constructible, accessor_type, const OtherAccessor&) ) ) + MDSPAN_CONDITIONAL_EXPLICIT( + !_MDSPAN_TRAIT(std::is_convertible, const typename OtherLayoutPolicy::template mapping&, mapping_type) || + !_MDSPAN_TRAIT(std::is_convertible, const OtherAccessor&, accessor_type) + ) MDSPAN_INLINE_FUNCTION constexpr mdspan(const mdspan& other) : __members(other.__ptr_ref(), __map_acc_pair_t(other.__mapping_ref(), other.__accessor_ref())) @@ -226,8 +236,8 @@ class mdspan MDSPAN_TEMPLATE_REQUIRES( class SizeType, /* requires */ ( - _MDSPAN_TRAIT(std::is_convertible, SizeType, index_type) && - _MDSPAN_TRAIT(std::is_nothrow_constructible, index_type, SizeType) + _MDSPAN_TRAIT(std::is_convertible, const SizeType&, index_type) && + _MDSPAN_TRAIT(std::is_nothrow_constructible, index_type, const SizeType&) ) ) MDSPAN_FORCE_INLINE_FUNCTION @@ -240,8 +250,8 @@ class mdspan MDSPAN_TEMPLATE_REQUIRES( class SizeType, /* requires */ ( - _MDSPAN_TRAIT(std::is_convertible, SizeType, index_type) && - _MDSPAN_TRAIT(std::is_nothrow_constructible, index_type, SizeType) + 
_MDSPAN_TRAIT(std::is_convertible, const SizeType&, index_type) && + _MDSPAN_TRAIT(std::is_nothrow_constructible, index_type, const SizeType&) ) ) MDSPAN_FORCE_INLINE_FUNCTION @@ -271,9 +281,8 @@ class mdspan MDSPAN_TEMPLATE_REQUIRES( class... SizeTypes, /* requires */ ( - _MDSPAN_FOLD_AND(_MDSPAN_TRAIT(std::is_convertible, SizeTypes, index_type) /* && ... */) && - _MDSPAN_FOLD_AND(_MDSPAN_TRAIT(std::is_nothrow_constructible, index_type, SizeTypes) /* && ... */) && - extents_type::rank() == sizeof...(SizeTypes) + extents_type::rank() == sizeof...(SizeTypes) && + (detail::are_valid_indices()) ) ) MDSPAN_FORCE_INLINE_FUNCTION @@ -285,8 +294,8 @@ class mdspan MDSPAN_TEMPLATE_REQUIRES( class SizeType, /* requires */ ( - _MDSPAN_TRAIT(std::is_convertible, SizeType, index_type) && - _MDSPAN_TRAIT(std::is_nothrow_constructible, index_type, SizeType) + _MDSPAN_TRAIT(std::is_convertible, const SizeType&, index_type) && + _MDSPAN_TRAIT(std::is_nothrow_constructible, index_type, const SizeType&) ) ) MDSPAN_FORCE_INLINE_FUNCTION @@ -299,8 +308,8 @@ class mdspan MDSPAN_TEMPLATE_REQUIRES( class SizeType, /* requires */ ( - _MDSPAN_TRAIT(std::is_convertible, SizeType, index_type) && - _MDSPAN_TRAIT(std::is_nothrow_constructible, index_type, SizeType) + _MDSPAN_TRAIT(std::is_convertible, const SizeType&, index_type) && + _MDSPAN_TRAIT(std::is_nothrow_constructible, index_type, const SizeType&) ) ) MDSPAN_FORCE_INLINE_FUNCTION @@ -311,7 +320,7 @@ class mdspan #endif // __cpp_lib_span #endif // MDSPAN_USE_PAREN_OPERATOR - MDSPAN_INLINE_FUNCTION constexpr size_t size() const noexcept { + MDSPAN_INLINE_FUNCTION constexpr size_type size() const noexcept { return __impl::__size(*this); }; @@ -346,13 +355,13 @@ class mdspan //-------------------------------------------------------------------------------- // [mdspan.basic.obs], mdspan observers of the mapping - MDSPAN_INLINE_FUNCTION static constexpr bool is_always_unique() noexcept { return mapping_type::is_always_unique(); }; - MDSPAN_INLINE_FUNCTION static constexpr bool is_always_exhaustive() noexcept { return mapping_type::is_always_exhaustive(); }; - MDSPAN_INLINE_FUNCTION static constexpr bool is_always_strided() noexcept { return mapping_type::is_always_strided(); }; + MDSPAN_INLINE_FUNCTION static constexpr bool is_always_unique() { return mapping_type::is_always_unique(); }; + MDSPAN_INLINE_FUNCTION static constexpr bool is_always_exhaustive() { return mapping_type::is_always_exhaustive(); }; + MDSPAN_INLINE_FUNCTION static constexpr bool is_always_strided() { return mapping_type::is_always_strided(); }; - MDSPAN_INLINE_FUNCTION constexpr bool is_unique() const noexcept { return __mapping_ref().is_unique(); }; - MDSPAN_INLINE_FUNCTION constexpr bool is_exhaustive() const noexcept { return __mapping_ref().is_exhaustive(); }; - MDSPAN_INLINE_FUNCTION constexpr bool is_strided() const noexcept { return __mapping_ref().is_strided(); }; + MDSPAN_INLINE_FUNCTION constexpr bool is_unique() const { return __mapping_ref().is_unique(); }; + MDSPAN_INLINE_FUNCTION constexpr bool is_exhaustive() const { return __mapping_ref().is_exhaustive(); }; + MDSPAN_INLINE_FUNCTION constexpr bool is_strided() const { return __mapping_ref().is_strided(); }; MDSPAN_INLINE_FUNCTION constexpr index_type stride(size_t r) const { return __mapping_ref().stride(r); }; private: @@ -374,7 +383,7 @@ class mdspan #if defined(_MDSPAN_USE_CLASS_TEMPLATE_ARGUMENT_DEDUCTION) MDSPAN_TEMPLATE_REQUIRES( class ElementType, class... 
SizeTypes, - /* requires */ _MDSPAN_FOLD_AND(_MDSPAN_TRAIT(std::is_integral, SizeTypes) /* && ... */) && + /* requires */ _MDSPAN_FOLD_AND(_MDSPAN_TRAIT(std::is_convertible, SizeTypes, size_t) /* && ... */) && (sizeof...(SizeTypes) > 0) ) MDSPAN_DEDUCTION_GUIDE explicit mdspan(ElementType*, SizeTypes...) diff --git a/tpls/mdspan/include/experimental/__p1684_bits/mdarray.hpp b/tpls/mdspan/include/experimental/__p1684_bits/mdarray.hpp index 3950273a83d..bdc5925f715 100644 --- a/tpls/mdspan/include/experimental/__p1684_bits/mdarray.hpp +++ b/tpls/mdspan/include/experimental/__p1684_bits/mdarray.hpp @@ -103,8 +103,8 @@ class mdarray { MDSPAN_TEMPLATE_REQUIRES( class... SizeTypes, /* requires */ ( - _MDSPAN_FOLD_AND(_MDSPAN_TRAIT( std::is_convertible, SizeTypes, index_type) /* && ... */) && - _MDSPAN_TRAIT( std::is_constructible, extents_type, SizeTypes...) && + (::MDSPAN_IMPL_STANDARD_NAMESPACE::detail::are_valid_indices()) && + _MDSPAN_TRAIT( std::is_constructible, extents_type, SizeTypes...) && _MDSPAN_TRAIT( std::is_constructible, mapping_type, extents_type) && (_MDSPAN_TRAIT( std::is_constructible, container_type, size_t) || container_is_array::value) && @@ -133,61 +133,29 @@ class mdarray { ) : map_(m), ctr_(container_is_array::construct(map_)) { } - // Constructors from container - MDSPAN_TEMPLATE_REQUIRES( - class... SizeTypes, - /* requires */ ( - _MDSPAN_FOLD_AND(_MDSPAN_TRAIT( std::is_convertible, SizeTypes, index_type) /* && ... */) && - _MDSPAN_TRAIT( std::is_constructible, extents_type, SizeTypes...) && - _MDSPAN_TRAIT( std::is_constructible, mapping_type, extents_type) - ) - ) - MDSPAN_INLINE_FUNCTION - explicit constexpr mdarray(const container_type& ctr, SizeTypes... dynamic_extents) - : map_(extents_type(dynamic_extents...)), ctr_(ctr) - { assert(ctr.size() >= static_cast(map_.required_span_size())); } - - MDSPAN_FUNCTION_REQUIRES( (MDSPAN_INLINE_FUNCTION constexpr), - mdarray, (const container_type& ctr, const extents_type& exts), , + mdarray, (const extents_type& exts, const container_type& ctr), , /* requires */ (_MDSPAN_TRAIT( std::is_constructible, mapping_type, extents_type)) ) : map_(exts), ctr_(ctr) { assert(ctr.size() >= static_cast(map_.required_span_size())); } - constexpr mdarray(const container_type& ctr, const mapping_type& m) + constexpr mdarray(const mapping_type& m, const container_type& ctr) : map_(m), ctr_(ctr) { assert(ctr.size() >= static_cast(map_.required_span_size())); } - - // Constructors from container - MDSPAN_TEMPLATE_REQUIRES( - class... SizeTypes, - /* requires */ ( - _MDSPAN_FOLD_AND(_MDSPAN_TRAIT( std::is_convertible, SizeTypes, index_type) /* && ... */) && - _MDSPAN_TRAIT( std::is_constructible, extents_type, SizeTypes...) && - _MDSPAN_TRAIT( std::is_constructible, mapping_type, extents_type) - ) - ) - MDSPAN_INLINE_FUNCTION - explicit constexpr mdarray(container_type&& ctr, SizeTypes... 
dynamic_extents) - : map_(extents_type(dynamic_extents...)), ctr_(std::move(ctr)) - { assert(ctr_.size() >= static_cast(map_.required_span_size())); } - - MDSPAN_FUNCTION_REQUIRES( (MDSPAN_INLINE_FUNCTION constexpr), - mdarray, (container_type&& ctr, const extents_type& exts), , + mdarray, (const extents_type& exts, container_type&& ctr), , /* requires */ (_MDSPAN_TRAIT( std::is_constructible, mapping_type, extents_type)) ) : map_(exts), ctr_(std::move(ctr)) { assert(ctr_.size() >= static_cast(map_.required_span_size())); } - constexpr mdarray(container_type&& ctr, const mapping_type& m) + constexpr mdarray(const mapping_type& m, container_type&& ctr) : map_(m), ctr_(std::move(ctr)) { assert(ctr_.size() >= static_cast(map_.required_span_size())); } - MDSPAN_TEMPLATE_REQUIRES( class OtherElementType, class OtherExtents, class OtherLayoutPolicy, class OtherContainer, /* requires */ ( @@ -229,7 +197,7 @@ class mdarray { _MDSPAN_TRAIT( std::is_constructible, mapping_type, extents_type)) ) MDSPAN_INLINE_FUNCTION - constexpr mdarray(const container_type& ctr, const extents_type& exts, const Alloc& a) + constexpr mdarray(const extents_type& exts, const container_type& ctr, const Alloc& a) : map_(exts), ctr_(ctr, a) { assert(ctr_.size() >= static_cast(map_.required_span_size())); } @@ -238,7 +206,7 @@ class mdarray { /* requires */ (_MDSPAN_TRAIT( std::is_constructible, container_type, size_t, Alloc)) ) MDSPAN_INLINE_FUNCTION - constexpr mdarray(const container_type& ctr, const mapping_type& map, const Alloc& a) + constexpr mdarray(const mapping_type& map, const container_type& ctr, const Alloc& a) : map_(map), ctr_(ctr, a) { assert(ctr_.size() >= static_cast(map_.required_span_size())); } @@ -248,7 +216,7 @@ class mdarray { _MDSPAN_TRAIT( std::is_constructible, mapping_type, extents_type)) ) MDSPAN_INLINE_FUNCTION - constexpr mdarray(container_type&& ctr, const extents_type& exts, const Alloc& a) + constexpr mdarray(const extents_type& exts, container_type&& ctr, const Alloc& a) : map_(exts), ctr_(std::move(ctr), a) { assert(ctr_.size() >= static_cast(map_.required_span_size())); } @@ -257,7 +225,7 @@ class mdarray { /* requires */ (_MDSPAN_TRAIT( std::is_constructible, container_type, size_t, Alloc)) ) MDSPAN_INLINE_FUNCTION - constexpr mdarray(container_type&& ctr, const mapping_type& map, const Alloc& a) + constexpr mdarray(const mapping_type& map, container_type&& ctr, const Alloc& a) : map_(map), ctr_(std::move(ctr), a) { assert(ctr_.size() >= map_.required_span_size()); } @@ -344,8 +312,8 @@ class mdarray { MDSPAN_TEMPLATE_REQUIRES( class... SizeTypes, /* requires */ ( - _MDSPAN_FOLD_AND(_MDSPAN_TRAIT( std::is_convertible, SizeTypes, index_type) /* && ... */) && - extents_type::rank() == sizeof...(SizeTypes) + (::MDSPAN_IMPL_STANDARD_NAMESPACE::detail::are_valid_indices()) && + extents_type::rank() == sizeof...(SizeTypes) ) ) MDSPAN_FORCE_INLINE_FUNCTION @@ -356,8 +324,8 @@ class mdarray { MDSPAN_TEMPLATE_REQUIRES( class... SizeTypes, /* requires */ ( - _MDSPAN_FOLD_AND(_MDSPAN_TRAIT( std::is_convertible, SizeTypes, index_type) /* && ... 
*/) && - extents_type::rank() == sizeof...(SizeTypes) + (::MDSPAN_IMPL_STANDARD_NAMESPACE::detail::are_valid_indices()) && + extents_type::rank() == sizeof...(SizeTypes) ) ) MDSPAN_FORCE_INLINE_FUNCTION @@ -433,8 +401,9 @@ class mdarray { class OtherElementType, class OtherExtents, class OtherLayoutType, class OtherAccessorType, /* requires */ ( - _MDSPAN_TRAIT(std::is_assignable, mdspan_type, - mdspan) + _MDSPAN_TRAIT(std::is_assignable, + mdspan, + mdspan_type) ) ) constexpr operator mdspan () { @@ -445,8 +414,9 @@ class mdarray { class OtherElementType, class OtherExtents, class OtherLayoutType, class OtherAccessorType, /* requires */ ( - _MDSPAN_TRAIT(std::is_assignable, const_mdspan_type, - mdspan) + _MDSPAN_TRAIT(std::is_assignable, + mdspan, + const_mdspan_type) ) ) constexpr operator mdspan () const { diff --git a/tpls/mdspan/include/experimental/__p2630_bits/strided_slice.hpp b/tpls/mdspan/include/experimental/__p2630_bits/strided_slice.hpp index 58f38620ba1..89ba8202fb1 100644 --- a/tpls/mdspan/include/experimental/__p2630_bits/strided_slice.hpp +++ b/tpls/mdspan/include/experimental/__p2630_bits/strided_slice.hpp @@ -20,7 +20,6 @@ #include namespace MDSPAN_IMPL_STANDARD_NAMESPACE { -namespace MDSPAN_IMPL_PROPOSED_NAMESPACE { namespace { template @@ -29,6 +28,7 @@ namespace { template struct __mdspan_is_integral_constant>: std::true_type {}; } + // Slice Specifier allowing for strides and compile time extent template struct strided_slice { @@ -36,14 +36,13 @@ struct strided_slice { using extent_type = ExtentType; using stride_type = StrideType; - OffsetType offset; - ExtentType extent; - StrideType stride; + _MDSPAN_NO_UNIQUE_ADDRESS OffsetType offset{}; + _MDSPAN_NO_UNIQUE_ADDRESS ExtentType extent{}; + _MDSPAN_NO_UNIQUE_ADDRESS StrideType stride{}; static_assert(std::is_integral_v || __mdspan_is_integral_constant::value); static_assert(std::is_integral_v || __mdspan_is_integral_constant::value); static_assert(std::is_integral_v || __mdspan_is_integral_constant::value); }; -} // MDSPAN_IMPL_PROPOSED_NAMESPACE } // MDSPAN_IMPL_STANDARD_NAMESPACE diff --git a/tpls/mdspan/include/experimental/__p2630_bits/submdspan.hpp b/tpls/mdspan/include/experimental/__p2630_bits/submdspan.hpp index b9672b7f9ac..abddd0b59df 100644 --- a/tpls/mdspan/include/experimental/__p2630_bits/submdspan.hpp +++ b/tpls/mdspan/include/experimental/__p2630_bits/submdspan.hpp @@ -20,23 +20,21 @@ #include "submdspan_mapping.hpp" namespace MDSPAN_IMPL_STANDARD_NAMESPACE { -namespace MDSPAN_IMPL_PROPOSED_NAMESPACE { template MDSPAN_INLINE_FUNCTION constexpr auto submdspan(const mdspan &src, SliceSpecifiers... 
slices) { - const auto sub_mapping_offset = submdspan_mapping(src.mapping(), slices...); + const auto sub_submdspan_mapping_result = submdspan_mapping(src.mapping(), slices...); // NVCC has a problem with the deduction so lets figure out the type - using sub_mapping_t = std::remove_cv_t; + using sub_mapping_t = std::remove_cv_t; using sub_extents_t = typename sub_mapping_t::extents_type; using sub_layout_t = typename sub_mapping_t::layout_type; using sub_accessor_t = typename AccessorPolicy::offset_policy; return mdspan( - src.accessor().offset(src.data_handle(), sub_mapping_offset.offset), - sub_mapping_offset.mapping, + src.accessor().offset(src.data_handle(), sub_submdspan_mapping_result.offset), + sub_submdspan_mapping_result.mapping, sub_accessor_t(src.accessor())); } -} // namespace MDSPAN_IMPL_PROPOSED_NAMESPACE } // namespace MDSPAN_IMPL_STANDARD_NAMESPACE diff --git a/tpls/mdspan/include/experimental/__p2630_bits/submdspan_extents.hpp b/tpls/mdspan/include/experimental/__p2630_bits/submdspan_extents.hpp index f56ce023f16..c3b2f78fb99 100644 --- a/tpls/mdspan/include/experimental/__p2630_bits/submdspan_extents.hpp +++ b/tpls/mdspan/include/experimental/__p2630_bits/submdspan_extents.hpp @@ -20,7 +20,6 @@ #include "strided_slice.hpp" namespace MDSPAN_IMPL_STANDARD_NAMESPACE { -namespace MDSPAN_IMPL_PROPOSED_NAMESPACE { namespace detail { // Mapping from submapping ranks to srcmapping ranks @@ -319,5 +318,4 @@ constexpr auto submdspan_extents(const extents &src_exts, return detail::extents_constructor::next_extent( src_exts, slices...); } -} // namespace MDSPAN_IMPL_PROPOSED_NAMESPACE } // namespace MDSPAN_IMPL_STANDARD_NAMESPACE diff --git a/tpls/mdspan/include/experimental/__p2630_bits/submdspan_mapping.hpp b/tpls/mdspan/include/experimental/__p2630_bits/submdspan_mapping.hpp index 48778d57e75..ca6948c9a9f 100644 --- a/tpls/mdspan/include/experimental/__p2630_bits/submdspan_mapping.hpp +++ b/tpls/mdspan/include/experimental/__p2630_bits/submdspan_mapping.hpp @@ -22,21 +22,15 @@ #include // index_sequence namespace MDSPAN_IMPL_STANDARD_NAMESPACE { -namespace MDSPAN_IMPL_PROPOSED_NAMESPACE { //****************************************** // Return type of submdspan_mapping overloads //****************************************** -template struct mapping_offset { - Mapping mapping; +template struct submdspan_mapping_result { + _MDSPAN_NO_UNIQUE_ADDRESS LayoutMapping mapping{}; size_t offset; }; -} // namespace MDSPAN_IMPL_PROPOSED_NAMESPACE namespace detail { -using MDSPAN_IMPL_PROPOSED_NAMESPACE::detail::first_of; -using MDSPAN_IMPL_PROPOSED_NAMESPACE::detail::stride_of; -using MDSPAN_IMPL_PROPOSED_NAMESPACE::detail::inv_map_rank; - // constructs sub strides template MDSPAN_INLINE_FUNCTION @@ -98,17 +92,15 @@ struct preserve_layout_left_mapping, SubRank, #pragma diag_suppress = implicit_return_from_non_void_function #endif // Actual submdspan mapping call -template +template +template MDSPAN_INLINE_FUNCTION constexpr auto -submdspan_mapping(const layout_left::mapping &src_mapping, - SliceSpecifiers... slices) { - using MDSPAN_IMPL_PROPOSED_NAMESPACE::submdspan_extents; - using MDSPAN_IMPL_PROPOSED_NAMESPACE::mapping_offset; +layout_left::mapping::submdspan_mapping_impl(SliceSpecifiers... 
slices) const { // compute sub extents using src_ext_t = Extents; - auto dst_ext = submdspan_extents(src_mapping.extents(), slices...); + auto dst_ext = submdspan_extents(extents(), slices...); using dst_ext_t = decltype(dst_ext); // figure out sub layout type @@ -121,18 +113,18 @@ submdspan_mapping(const layout_left::mapping &src_mapping, if constexpr (std::is_same_v) { // layout_left case - return mapping_offset{ + return submdspan_mapping_result{ dst_mapping_t(dst_ext), - static_cast(src_mapping(detail::first_of(slices)...))}; + static_cast(this->operator()(detail::first_of(slices)...))}; } else { // layout_stride case auto inv_map = detail::inv_map_rank( std::integral_constant(), std::index_sequence<>(), slices...); - return mapping_offset{ + return submdspan_mapping_result{ dst_mapping_t(dst_ext, detail::construct_sub_strides( - src_mapping, inv_map, + *this, inv_map, // HIP needs deduction guides to have markups so we need to be explicit // NVCC 11.0 has a bug with deduction guide here, tested that 11.2 does not have the issue #if defined(_MDSPAN_HAS_HIP) || (defined(__NVCC__) && (__CUDACC_VER_MAJOR__ * 100 + __CUDACC_VER_MINOR__ * 10) < 1120) @@ -140,7 +132,7 @@ submdspan_mapping(const layout_left::mapping &src_mapping, #else std::tuple{detail::stride_of(slices)...})), #endif - static_cast(src_mapping(detail::first_of(slices)...))}; + static_cast(this->operator()(detail::first_of(slices)...))}; } #if defined(__NVCC__) && !defined(__CUDA_ARCH__) && defined(__GNUC__) __builtin_unreachable(); @@ -207,17 +199,15 @@ struct preserve_layout_right_mapping, SubRank, #pragma diagnostic push #pragma diag_suppress = implicit_return_from_non_void_function #endif -template +template +template MDSPAN_INLINE_FUNCTION constexpr auto -submdspan_mapping(const layout_right::mapping &src_mapping, - SliceSpecifiers... slices) { - using MDSPAN_IMPL_PROPOSED_NAMESPACE::submdspan_extents; - using MDSPAN_IMPL_PROPOSED_NAMESPACE::mapping_offset; - +layout_right::mapping::submdspan_mapping_impl( + SliceSpecifiers... 
slices) const { // get sub extents using src_ext_t = Extents; - auto dst_ext = submdspan_extents(src_mapping.extents(), slices...); + auto dst_ext = submdspan_extents(extents(), slices...); using dst_ext_t = decltype(dst_ext); // determine new layout type @@ -230,18 +220,18 @@ submdspan_mapping(const layout_right::mapping &src_mapping, if constexpr (std::is_same_v) { // layout_right case - return mapping_offset{ + return submdspan_mapping_result{ dst_mapping_t(dst_ext), - static_cast(src_mapping(detail::first_of(slices)...))}; + static_cast(this->operator()(detail::first_of(slices)...))}; } else { // layout_stride case auto inv_map = detail::inv_map_rank( std::integral_constant(), std::index_sequence<>(), slices...); - return mapping_offset{ + return submdspan_mapping_result{ dst_mapping_t(dst_ext, detail::construct_sub_strides( - src_mapping, inv_map, + *this, inv_map, // HIP needs deduction guides to have markups so we need to be explicit // NVCC 11.0 has a bug with deduction guide here, tested that 11.2 does not have the issue #if defined(_MDSPAN_HAS_HIP) || (defined(__NVCC__) && (__CUDACC_VER_MAJOR__ * 100 + __CUDACC_VER_MINOR__ * 10) < 1120) @@ -249,7 +239,7 @@ submdspan_mapping(const layout_right::mapping &src_mapping, #else std::tuple{detail::stride_of(slices)...})), #endif - static_cast(src_mapping(detail::first_of(slices)...))}; + static_cast(this->operator()(detail::first_of(slices)...))}; } #if defined(__NVCC__) && !defined(__CUDA_ARCH__) && defined(__GNUC__) __builtin_unreachable(); @@ -270,23 +260,22 @@ submdspan_mapping(const layout_right::mapping &src_mapping, //********************************** // layout_stride submdspan_mapping //********************************* -template +template +template MDSPAN_INLINE_FUNCTION constexpr auto -submdspan_mapping(const layout_stride::mapping &src_mapping, - SliceSpecifiers... slices) { - using MDSPAN_IMPL_PROPOSED_NAMESPACE::submdspan_extents; - using MDSPAN_IMPL_PROPOSED_NAMESPACE::mapping_offset; - auto dst_ext = submdspan_extents(src_mapping.extents(), slices...); +layout_stride::mapping::submdspan_mapping_impl( + SliceSpecifiers... slices) const { + auto dst_ext = submdspan_extents(extents(), slices...); using dst_ext_t = decltype(dst_ext); auto inv_map = detail::inv_map_rank( std::integral_constant(), std::index_sequence<>(), slices...); using dst_mapping_t = typename layout_stride::template mapping; - return mapping_offset{ + return submdspan_mapping_result{ dst_mapping_t(dst_ext, detail::construct_sub_strides( - src_mapping, inv_map, + *this, inv_map, // HIP needs deduction guides to have markups so we need to be explicit // NVCC 11.0 has a bug with deduction guide here, tested that 11.2 does not have the issue #if defined(_MDSPAN_HAS_HIP) || (defined(__NVCC__) && (__CUDACC_VER_MAJOR__ * 100 + __CUDACC_VER_MINOR__ * 10) < 1120) @@ -294,6 +283,7 @@ submdspan_mapping(const layout_stride::mapping &src_mapping, #else std::tuple(detail::stride_of(slices)...))), #endif - static_cast(src_mapping(detail::first_of(slices)...))}; + static_cast(this->operator()(detail::first_of(slices)...))}; } + } // namespace MDSPAN_IMPL_STANDARD_NAMESPACE diff --git a/tpls/mdspan/include/experimental/__p2642_bits/layout_padded.hpp b/tpls/mdspan/include/experimental/__p2642_bits/layout_padded.hpp new file mode 100644 index 00000000000..a8014867923 --- /dev/null +++ b/tpls/mdspan/include/experimental/__p2642_bits/layout_padded.hpp @@ -0,0 +1,793 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 
4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER +#pragma once + +#include +#include "layout_padded_fwd.hpp" +#include "../__p0009_bits/dynamic_extent.hpp" +#include "../__p0009_bits/extents.hpp" +#include "../__p0009_bits/mdspan.hpp" +#include "../__p0009_bits/layout_left.hpp" +#include "../__p0009_bits/layout_right.hpp" +#include "../__p0009_bits/layout_stride.hpp" + +namespace MDSPAN_IMPL_STANDARD_NAMESPACE { +namespace MDSPAN_IMPL_PROPOSED_NAMESPACE { + +namespace detail { +template +MDSPAN_INLINE_FUNCTION +constexpr _T +find_next_multiple(_T alignment, _T offset) +{ + if ( alignment == 0 ) { + return _T(0); + } else { + return ( ( offset + alignment - 1 ) / alignment) * alignment; + } +} + +template +MDSPAN_INLINE_FUNCTION constexpr size_t get_actual_static_padding_value() { + constexpr auto rank = _ExtentsType::rank(); + + if constexpr (rank <= typename _ExtentsType::rank_type(1)) { + return 0; + } else if constexpr (_PaddingValue != dynamic_extent && + _ExtentsType::static_extent(_ExtentToPadIdx) != + dynamic_extent) { + static_assert( + (_PaddingValue != 0) || + (_ExtentsType::static_extent(_ExtentToPadIdx) == 0), + "padding stride can be 0 only if " + "extents_type::static_extent(extent-to-pad) is 0 or dynamic_extent"); + return find_next_multiple(_PaddingValue, + _ExtentsType::static_extent(_ExtentToPadIdx)); + } else { + return dynamic_extent; + } +} + +template +struct static_array_type_for_padded_extent +{ + static constexpr size_t padding_value = _PaddingValue; + using index_type = typename _Extents::index_type; + using extents_type = _Extents; + using type = ::MDSPAN_IMPL_STANDARD_NAMESPACE::detail::maybe_static_array< + index_type, size_t, dynamic_extent, + detail::get_actual_static_padding_value()>; +}; + +template +struct static_array_type_for_padded_extent<_PaddingValue, _Extents, + _ExtentToPadIdx, Rank, std::enable_if_t> { + using index_type = typename _Extents::index_type; + using extents_type = _Extents; + using type = + ::MDSPAN_IMPL_STANDARD_NAMESPACE::detail::maybe_static_array< + index_type, size_t, dynamic_extent, 0>; +}; + +template +struct padded_extent { + static constexpr size_t padding_value = _PaddingValue; + using index_type = typename _Extents::index_type; + using extents_type = _Extents; + using static_array_type = typename static_array_type_for_padded_extent< + padding_value, _Extents, _ExtentToPadIdx, _Extents::rank()>::type; + + static constexpr auto static_value() { return static_array_type::static_value(0); } + + MDSPAN_INLINE_FUNCTION + static constexpr static_array_type + init_padding(const _Extents &exts) { + if constexpr ((_Extents::rank() > 1) && (padding_value == dynamic_extent)) { + return {exts.extent(_ExtentToPadIdx)}; + } else { + return init_padding(exts, padding_value); + } + } + + MDSPAN_INLINE_FUNCTION static constexpr static_array_type + init_padding([[maybe_unused]] const _Extents &exts, + [[maybe_unused]] index_type pv) { + if constexpr (_Extents::rank() > 1) { + return {find_next_multiple(pv, + exts.extent(_ExtentToPadIdx))}; + } else { + return {}; + } + } + + template + MDSPAN_INLINE_FUNCTION static constexpr static_array_type + init_padding([[maybe_unused]] const _Mapping &other_mapping, + 
std::integral_constant<size_t, _PaddingStrideIdx>) {
+    if constexpr (_Extents::rank() > 1) {
+      return {other_mapping.stride(_PaddingStrideIdx)};
+    } else {
+      return {};
+    }
+  }
+};
+} // namespace detail
+
+template <size_t PaddingValue>
+template <class Extents>
+class layout_left_padded<PaddingValue>::mapping {
+public:
+  static constexpr size_t padding_value = PaddingValue;
+
+  using extents_type = Extents;
+  using index_type = typename extents_type::index_type;
+  using size_type = typename extents_type::size_type;
+  using rank_type = typename extents_type::rank_type;
+  using layout_type = layout_left_padded<padding_value>;
+
+#ifndef MDSPAN_INTERNAL_TEST
+private:
+#endif // MDSPAN_INTERNAL_TEST
+
+  static constexpr rank_type padded_stride_idx = detail::layout_padded_constants<layout_type, extents_type>::padded_stride_idx;
+  static constexpr rank_type extent_to_pad_idx = detail::layout_padded_constants<layout_type, extents_type>::extent_to_pad_idx;
+
+  static_assert((padding_value != 0)
+                || (extents_type::static_extent(extent_to_pad_idx) == 0)
+                || (extents_type::static_extent(extent_to_pad_idx) == dynamic_extent),
+                "if padding stride is 0, static_extent(extent-to-pad-rank) must also be 0 or dynamic_extent");
+
+  using padded_stride_type = detail::padded_extent< padding_value, extents_type, extent_to_pad_idx >;
+
+  static constexpr size_t static_padding_stride = padded_stride_type::static_value();
+
+  typename padded_stride_type::static_array_type padded_stride = {};
+  extents_type exts = {};
+
+  constexpr index_type compute_offset(std::index_sequence<>) const {
+    return 0;
+  }
+
+  template <size_t Rank, class IndexOffset>
+  constexpr index_type compute_offset(std::index_sequence<Rank>,
+                                      IndexOffset index_offset) const {
+    return index_offset;
+  }
+
+  template <size_t... Ranks, class... IndexOffsets>
+  constexpr index_type compute_offset(std::index_sequence<Ranks...>,
+                                      IndexOffsets... index_offsets) const {
+    index_type indices[] = {static_cast<index_type>(index_offsets)...};
+    // self-recursive fold trick from
+    // https://github.com/llvm/llvm-project/blob/96e1914aa2e6d8966acbfbe2f4d184201f1aa318/libcxx/include/mdspan/layout_left.h#L144
+    index_type res = 0;
+    ((res = indices[extents_type::rank() - 1 - Ranks] +
+            ((extents_type::rank() - 1 - Ranks) == extent_to_pad_idx
+                 ? padded_stride.value(0)
+                 : exts.extent(extents_type::rank() - 1 - Ranks)) *
+                res),
+     ...);
+    return res;
+  }
+
+public:
+#if !MDSPAN_HAS_CXX_20
+  MDSPAN_INLINE_FUNCTION_DEFAULTED
+  constexpr mapping()
+      : mapping(extents_type{})
+  {}
+#else
+  MDSPAN_INLINE_FUNCTION_DEFAULTED
+  constexpr mapping()
+    requires(static_padding_stride != dynamic_extent) = default;
+
+  MDSPAN_INLINE_FUNCTION
+  constexpr mapping()
+    requires(static_padding_stride == dynamic_extent)
+      : mapping(extents_type{})
+  {}
+#endif
+
+  MDSPAN_INLINE_FUNCTION_DEFAULTED constexpr mapping(const mapping&) noexcept = default;
+  MDSPAN_INLINE_FUNCTION_DEFAULTED mapping& operator=(const mapping&) noexcept = default;
+
+  /**
+   * Initializes the mapping with the given extents.
+   *
+   * \param ext the given extents
+   */
+  MDSPAN_INLINE_FUNCTION
+  constexpr mapping(const extents_type& ext)
+      : padded_stride(padded_stride_type::init_padding(ext)), exts(ext)
+  {}
+
+  /**
+   * Initializes the mapping with the given extents and the specified padding value.
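+   *
+   * A minimal sketch (illustrative only, not part of the change; assumes a
+   * rank-2 all-dynamic extents type):
+   * \code
+   * using E = extents<int, dynamic_extent, dynamic_extent>;
+   * layout_left_padded<dynamic_extent>::mapping<E> m{E{3, 7}, 4};
+   * // m.stride(1) == 4: extent(0) == 3, rounded up to the next multiple of 4
+   * \endcode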
+ * + * This overload participates in overload resolution only if `is_convertible_v` + * is `true` and `is_nothrow_constructible_v` is `true` + * + * \param ext the given extents + * \param padding_value the padding value + */ + MDSPAN_TEMPLATE_REQUIRES( + class _Size, + /* requires */ ( + std::is_convertible_v<_Size, index_type> + && std::is_nothrow_constructible_v + ) + ) + MDSPAN_INLINE_FUNCTION + constexpr mapping(const extents_type &ext, _Size dynamic_padding_value) + : padded_stride(padded_stride_type::init_padding(ext, dynamic_padding_value)), exts(ext) + { + assert((padding_value == dynamic_extent) || (static_cast(padding_value) == static_cast(dynamic_padding_value))); + } + + /** + * Converting constructor from `layout_left::mapping`. + * + * This overload participates in overload resolution only if `is_constructible_v` is true. + * If `OtherExtents::rank() > 1` then one of `padding_value`, `static_extent(0)`, or `OtherExtents::static_extent(0)` must be `dynamic_extent`; + * otherwise, `OtherExtents::static_extent(0)` must be equal to the least multiple of `padding_value` greater than or equal to `extents_type::static_extent(0)` + */ + MDSPAN_TEMPLATE_REQUIRES( + class _OtherExtents, + /* requires */ ( + std::is_constructible_v + ) + ) + MDSPAN_CONDITIONAL_EXPLICIT((!std::is_convertible_v<_OtherExtents, extents_type>)) + constexpr mapping(const layout_left::mapping<_OtherExtents> &other_mapping) + : padded_stride(padded_stride_type::init_padding(other_mapping, std::integral_constant{})), + exts(other_mapping.extents()) + { + static_assert((_OtherExtents::rank() > 1) || (static_padding_stride != dynamic_extent) || (_OtherExtents::static_extent(extent_to_pad_idx) != dynamic_extent) + || (static_padding_stride == _OtherExtents::static_extent(extent_to_pad_idx))); + } + + /** + * Converting constructor from `layout_stride::mapping`. + * + * This overload participates in overload resolution only if `is_constructible_v` is true + */ + MDSPAN_TEMPLATE_REQUIRES( + class _OtherExtents, + /* requires */ ( + std::is_constructible_v + ) + ) + MDSPAN_CONDITIONAL_EXPLICIT((extents_type::rank() > 0)) + constexpr mapping(const layout_stride::mapping<_OtherExtents> &other_mapping) + : padded_stride(padded_stride_type::init_padding(other_mapping, std::integral_constant{})), + exts(other_mapping.extents()) + { + } + + /** + * Converting constructor from `layout_left_padded::mapping`. + * + * This overload participates in overload resolution only if `is_constructible_v` is true. + * Either `padding_value` or `OtherPaddingStride` must be `std::dynamic_extent`, or `padding_value == OtherPaddingStride`. + */ + MDSPAN_TEMPLATE_REQUIRES( + class _Mapping, + /* requires */ ( + detail::is_layout_left_padded_mapping<_Mapping>::value + && std::is_constructible_v + ) + ) + MDSPAN_CONDITIONAL_EXPLICIT((extents_type::rank() > 1 && (padding_value == dynamic_extent || _Mapping::padding_value == dynamic_extent))) + constexpr + mapping(const _Mapping &other_mapping) + : padded_stride(padded_stride_type::init_padding(other_mapping, std::integral_constant{})), + exts(other_mapping.extents()) + { + static_assert(padding_value == dynamic_extent || + _Mapping::padding_value == dynamic_extent || + padding_value == _Mapping::padding_value); + } + + /** + * Converting constructor from `layout_right_padded::mapping`. + * + * This overload participates in overload resolution only if `extents_type::rank()` is 0 or 1 and `is_constructible_v` is `true`. 
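+   *
+   * A minimal sketch (illustrative only, not part of the change; at rank 1 no
+   * padding is applied, so the left- and right-padded layouts agree):
+   * \code
+   * using E = extents<int, 5>;
+   * layout_right_padded<4>::mapping<E> r{E{}};
+   * layout_left_padded<dynamic_extent>::mapping<E> l{r};  // both contiguous
+   * \endcode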
+ */ + MDSPAN_TEMPLATE_REQUIRES( + class _Mapping, + /* requires */ ( + detail::is_layout_right_padded_mapping<_Mapping>::value + && extents_type::rank() <= 1 + && std::is_constructible_v + ) + ) + MDSPAN_CONDITIONAL_EXPLICIT((!std::is_convertible_v)) + constexpr + mapping(const _Mapping &other_mapping) noexcept + : padded_stride(padded_stride_type::init_padding(other_mapping.extents(), other_mapping.extents().extent(extent_to_pad_idx))), + exts(other_mapping.extents()) + {} + + constexpr const extents_type &extents() const noexcept + { + return exts; + } + + constexpr std::array + strides() const noexcept + { + if constexpr ( extents_type::rank() == 0 ) { + return {}; + } else if constexpr ( extents_type::rank() == 1 ) { + return {1}; + } else { + index_type value = 1; + std::array s{}; + s[extent_to_pad_idx] = value; + value *= padded_stride.value(0); + for (rank_type r = extent_to_pad_idx + 1; r < extents_type::rank() - 1; ++r) + { + s[r] = value; + value *= exts.extent(r); + } + s[extents_type::rank() - 1] = value; + return s; + } + } + + constexpr index_type + required_span_size() const noexcept + { + if constexpr ( extents_type::rank() == 0 ) { + return 1; + } else if constexpr ( extents_type::rank() == 1 ) { + return exts.extent(0); + } else { + index_type value = padded_stride.value(0); + for (rank_type r = 1; r < extents_type::rank(); ++r) { + value *= exts.extent(r); + } + return value; + } + } + + /** + * Return the mapping given the provided indices per rank. + * + * This overload participates in overload resolution only if: + * - `sizeof...(Indices) == extents_type::rank()`, + * - `(is_convertible_v && ...) is true`, and + * - (is_nothrow_constructible_v && ...) is true. + */ + MDSPAN_TEMPLATE_REQUIRES( + class... _Indices, + /* requires */ ( + sizeof...(_Indices) == extents_type::rank() && + (::MDSPAN_IMPL_STANDARD_NAMESPACE::detail::are_valid_indices()) + ) + ) + constexpr size_t operator()(_Indices... idxs) const noexcept + { + return compute_offset(std::index_sequence_for<_Indices...>{}, idxs...); + } + + static constexpr bool is_always_unique() noexcept { return true; } + static constexpr bool is_always_exhaustive() noexcept + { + return (extents_type::rank() <= rank_type(1)) + || (extents_type::static_extent(extent_to_pad_idx) != dynamic_extent + && extents_type::static_extent(extent_to_pad_idx) == padded_stride_type::static_value()); + } + static constexpr bool is_always_strided() noexcept { return true; } + + static constexpr bool is_unique() noexcept { return true; } + constexpr bool is_exhaustive() const noexcept + { + return (extents_type::rank() < 2) + || (exts.extent(extent_to_pad_idx) == padded_stride.value(0)); + } + static constexpr bool is_strided() noexcept { return true; } + + constexpr index_type stride(rank_type r) const noexcept + { + assert(r < extents_type::rank()); + if(r == 0) return index_type(1); + + index_type value = padded_stride.value(0); + for (rank_type k = 1; k < r; k++) value *= exts.extent(k); + + return value; + } + + /** + * Equality operator between `layout_left_padded`s + * + * This overload only participates in overload resolution if `OtherExtents::rank() == extents_type::rank()`. + * + * \note There is currently a difference from p2642r2, where this function is specified as taking + * `layout_left_padded< padding_value >::mapping< Extents>`. However, this makes `padding_value` non-deducible. 
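+   *
+   * A minimal sketch (illustrative only, not part of the change): mappings with
+   * different padding_value template arguments compare equal when their extents
+   * and padded strides match:
+   * \code
+   * using E = extents<int, 8, 2>;
+   * layout_left_padded<8>::mapping<E> a{E{}};
+   * layout_left_padded<dynamic_extent>::mapping<E> b{E{}, 8};
+   * // a == b: same extents and stride(1) == 8 for both
+   * \endcode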
+ */ + MDSPAN_TEMPLATE_REQUIRES( + class _Mapping, + /* requires */ ( + detail::is_layout_left_padded_mapping<_Mapping>::value + && (_Mapping::extents_type::rank() == extents_type::rank()) + ) + ) + friend constexpr bool operator==(const mapping &left, const _Mapping &right) noexcept + { + // Workaround for some compilers not short-circuiting properly with compile-time checks + // i.e. we can't access stride(_padding_stride_idx) of a rank 0 mapping + bool strides_equal = true; + if constexpr (extents_type::rank() > rank_type(1)) + { + strides_equal = left.stride(padded_stride_idx) == right.stride(padded_stride_idx); + } + return (left.extents() == right.extents()) && strides_equal; + } + +#if !MDSPAN_HAS_CXX_20 + /** + * Inequality operator between `layout_left_padded`s + * + * This overload only participates in overload resolution if `OtherExtents::rank() == extents_type::rank()`. + */ + MDSPAN_TEMPLATE_REQUIRES( + class _Mapping, + /* requires */ ( + detail::is_layout_left_padded_mapping<_Mapping>::value + && (_Mapping::extents_type::rank() == extents_type::rank()) + ) + ) + friend constexpr bool operator!=(const mapping &left, const _Mapping &right) noexcept + { + return !(left == right); + } +#endif +}; + +template +template +class layout_right_padded::mapping { +public: + static constexpr size_t padding_value = PaddingValue; + + using extents_type = Extents; + using index_type = typename extents_type::index_type; + using size_type = typename extents_type::size_type; + using rank_type = typename extents_type::rank_type; + using layout_type = layout_right_padded; + +#ifndef MDSPAN_INTERNAL_TEST + private: +#endif // MDSPAN_INTERNAL_TEST + + static constexpr rank_type padded_stride_idx = detail::layout_padded_constants::padded_stride_idx; + static constexpr rank_type extent_to_pad_idx = detail::layout_padded_constants::extent_to_pad_idx; + + static_assert((padding_value != 0) + || (extents_type::static_extent(extent_to_pad_idx) == 0) + || (extents_type::static_extent(extent_to_pad_idx) == dynamic_extent), + "if padding stride is 0, static_extent(extent-to-pad-rank) must also be 0 or dynamic_extent"); + + using padded_stride_type = detail::padded_extent< padding_value, extents_type, extent_to_pad_idx >; + static constexpr size_t static_padding_stride = padded_stride_type::static_value(); + + typename padded_stride_type::static_array_type padded_stride = {}; + extents_type exts = {}; + + constexpr index_type compute_offset(std::index_sequence<>) const { + return 0; + } + + template + constexpr index_type compute_offset(std::index_sequence, + IndexOffset index_offset) const { + return index_offset; + } + + template + constexpr index_type compute_offset(std::index_sequence, + IndexOffsets... index_offsets) const { + // self-recursive fold trick from + // https://github.com/llvm/llvm-project/blob/4d9771741d40cc9cfcccb6b033f43689d36b705a/libcxx/include/mdspan/layout_right.h#L141 + index_type res = 0; + ((res = static_cast(index_offsets) + + (Ranks == extent_to_pad_idx ? 
padded_stride.value(0) + : exts.extent(Ranks)) * + res), + ...); + return res; + } + +public: +#if !MDSPAN_HAS_CXX_20 + MDSPAN_INLINE_FUNCTION_DEFAULTED + constexpr mapping() + : mapping(extents_type{}) + {} +#else + MDSPAN_INLINE_FUNCTION_DEFAULTED + constexpr mapping() + requires(static_padding_stride != dynamic_extent) = default; + + MDSPAN_INLINE_FUNCTION + constexpr mapping() + requires(static_padding_stride == dynamic_extent) + : mapping(extents_type{}) + {} +#endif + + MDSPAN_INLINE_FUNCTION_DEFAULTED constexpr mapping(const mapping&) noexcept = default; + MDSPAN_INLINE_FUNCTION_DEFAULTED mapping& operator=(const mapping&) noexcept = default; + + /** + * Initializes the mapping with the given extents. + * + * \param ext the given extents + */ + MDSPAN_INLINE_FUNCTION + constexpr mapping(const extents_type &ext) + : padded_stride(padded_stride_type::init_padding(ext)), exts(ext) {} + + /** + * Initializes the mapping with the given extents and the specified padding value. + * + * This overload participates in overload resolution only if `is_convertible_v` + * is `true` and `is_nothrow_constructible_v` is `true` + * + * \param ext the given extents + * \param padding_value the padding value + */ + MDSPAN_TEMPLATE_REQUIRES( + class _Size, + /* requires */ ( + std::is_convertible_v<_Size, index_type> + && std::is_nothrow_constructible_v + ) + ) + MDSPAN_INLINE_FUNCTION + constexpr mapping(const extents_type &ext, _Size dynamic_padding_value) + : padded_stride(padded_stride_type::init_padding(ext, static_cast(dynamic_padding_value))), + exts(ext) { + assert((padding_value == dynamic_extent) || + (static_cast(padding_value) == static_cast(dynamic_padding_value))); + } + + /** + * Converting constructor from `layout_right::mapping`. + * + * This overload participates in overload resolution only if `is_constructible_v` is true. + * If `OtherExtents::rank() > 1` then one of `padding_value`, `static_extent(0)`, or `OtherExtents::static_extent(0)` must be `dynamic_extent`; + * otherwise, `OtherExtents::static_extent(0)` must be equal to the least multiple of `padding_value` greater than or equal to `extents_type::static_extent(0)` + */ + MDSPAN_TEMPLATE_REQUIRES( + class _OtherExtents, + /* requires */ ( + std::is_constructible_v + ) + ) + MDSPAN_CONDITIONAL_EXPLICIT((!std::is_convertible_v<_OtherExtents, extents_type>)) + constexpr mapping(const layout_right::mapping<_OtherExtents> &other_mapping) + : padded_stride(padded_stride_type::init_padding(other_mapping, std::integral_constant{})), + exts(other_mapping.extents()) + { + static_assert((_OtherExtents::rank() > 1) || (padded_stride_type::static_value() != dynamic_extent) || (_OtherExtents::static_extent(extent_to_pad_idx) != dynamic_extent) + || (padded_stride_type::static_value() == _OtherExtents::static_extent(extent_to_pad_idx))); + } + + /** + * Converting constructor from `layout_stride::mapping`. + * + * This overload participates in overload resolution only if `is_constructible_v` is true + */ + MDSPAN_TEMPLATE_REQUIRES( + class _OtherExtents, + /* requires */ ( + std::is_constructible_v + ) + ) + MDSPAN_CONDITIONAL_EXPLICIT((extents_type::rank() > 0)) + constexpr mapping(const layout_stride::mapping<_OtherExtents> &other_mapping) + : padded_stride(padded_stride_type::init_padding(other_mapping, std::integral_constant{})), + exts(other_mapping.extents()) + {} + + /** + * Converting constructor from `layout_right_padded::mapping`. + * + * This overload participates in overload resolution only if `is_constructible_v` is true. 
+
+  /**
+   * Converting constructor from `layout_right_padded::mapping`.
+   *
+   * This overload participates in overload resolution only if `is_constructible_v<extents_type, OtherExtents>` is true.
+   * Either `padding_value` or `OtherPaddingStride` must be `std::dynamic_extent`, or `padding_value == OtherPaddingStride`.
+   */
+  MDSPAN_TEMPLATE_REQUIRES(
+    class _Mapping,
+    /* requires */ (
+      detail::is_layout_right_padded_mapping<_Mapping>::value
+      && std::is_constructible_v<extents_type, typename _Mapping::extents_type>
+    )
+  )
+  MDSPAN_CONDITIONAL_EXPLICIT((extents_type::rank() > 1 &&
+                               (padding_value == dynamic_extent ||
+                                _Mapping::padding_value == dynamic_extent)))
+  constexpr mapping(const _Mapping &other_mapping)
+      : padded_stride(padded_stride_type::init_padding(other_mapping, std::integral_constant<size_t, padded_stride_idx>{})),
+        exts(other_mapping.extents())
+  {
+    static_assert(padding_value == dynamic_extent ||
+                  _Mapping::padding_value == dynamic_extent ||
+                  padding_value == _Mapping::padding_value);
+  }
+
+  /**
+   * Converting constructor from `layout_left_padded::mapping`.
+   *
+   * This overload participates in overload resolution only if `extents_type::rank()` is 0 or 1 and `is_constructible_v<extents_type, OtherExtents>` is `true`.
+   */
+  MDSPAN_TEMPLATE_REQUIRES(
+    class _Mapping,
+    /* requires */ (
+      detail::is_layout_left_padded_mapping<_Mapping>::value
+      && extents_type::rank() <= 1
+      && std::is_constructible_v<extents_type, typename _Mapping::extents_type>
+    )
+  )
+  MDSPAN_CONDITIONAL_EXPLICIT((!std::is_convertible_v<typename _Mapping::extents_type, extents_type>))
+  constexpr mapping(const _Mapping &other_mapping) noexcept
+      : padded_stride(padded_stride_type::init_padding(other_mapping.extents(), other_mapping.extents().extent(extent_to_pad_idx))),
+        exts(other_mapping.extents())
+  {}
+
+  constexpr const extents_type &extents() const noexcept
+  {
+    return exts;
+  }
+
+  constexpr std::array<index_type, extents_type::rank()>
+  strides() const noexcept
+  {
+    if constexpr ( extents_type::rank() == 0 ) {
+      return {};
+    } else if constexpr ( extents_type::rank() == 1 ) {
+      return {1};
+    } else {
+      index_type value = 1;
+      std::array<index_type, extents_type::rank()> s{};
+      s[extent_to_pad_idx] = value;
+      value *= padded_stride.value(0);
+      for (rank_type r = extent_to_pad_idx - 1; r > 0; --r)
+      {
+        s[r] = value;
+        value *= exts.extent(r);
+      }
+      s[0] = value;
+      return s;
+    }
+  }
+
+  constexpr index_type
+  required_span_size() const noexcept
+  {
+    if constexpr ( extents_type::rank() == 0 ) {
+      return 1;
+    } else if constexpr ( extents_type::rank() == 1 ) {
+      return exts.extent(0);
+    } else {
+      index_type value = 1;
+      for (rank_type r = 0; r < extent_to_pad_idx; ++r)
+      {
+        value *= exts.extent(r);
+      }
+      return value * padded_stride.value(0);
+    }
+  }
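Concretely, `strides()` and `required_span_size()` above reserve whole padded rows. A compile-time sketch (invented names, not part of the patch) for a rank-3 right-padded mapping with extents (2, 3, 5) and the last extent padded to 8:

#include <cstddef>

namespace stride_sketch {
constexpr std::size_t e0 = 2, e1 = 3, e2 = 5, p = 8;  // e2 = 5 rounds up to p = 8
constexpr std::size_t stride2 = 1;                    // rightmost stride is always 1
constexpr std::size_t stride1 = p;                    // padded stride replaces e2
constexpr std::size_t stride0 = e1 * p;               // remaining strides multiply extents
static_assert(stride0 == 24 && stride1 == 8 && stride2 == 1);
// Only e0 * e1 * e2 elements are addressable in bounds, but the mapping
// reserves whole padded rows, so the required span is e0 * e1 * p.
static_assert(e0 * e1 * e2 == 30);
static_assert(e0 * e1 * p == 48);
}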
+
+  /**
+   * Return the linear offset given the provided indices per rank.
+   *
+   * This overload participates in overload resolution only if:
+   * - `sizeof...(Indices) == extents_type::rank()`,
+   * - `(is_convertible_v<Indices, index_type> && ...)` is `true`, and
+   * - `(is_nothrow_constructible_v<index_type, Indices> && ...)` is `true`.
+   */
+  MDSPAN_TEMPLATE_REQUIRES(
+    class... _Indices,
+    /* requires */ (
+      sizeof...(_Indices) == extents_type::rank() &&
+      (::MDSPAN_IMPL_STANDARD_NAMESPACE::detail::are_valid_indices<index_type, _Indices...>())
+    )
+  )
+  constexpr size_t operator()(_Indices... idxs) const noexcept
+  {
+    return compute_offset(std::index_sequence_for<_Indices...>{}, idxs...);
+  }
+
+  static constexpr bool is_always_unique() noexcept { return true; }
+  static constexpr bool is_always_exhaustive() noexcept
+  {
+    return (extents_type::rank() <= rank_type(1))
+           || (extents_type::static_extent(extent_to_pad_idx) != dynamic_extent
+               && extents_type::static_extent(extent_to_pad_idx) == padded_stride_type::static_value());
+  }
+  static constexpr bool is_always_strided() noexcept { return true; }
+
+  static constexpr bool is_unique() noexcept { return true; }
+  constexpr bool is_exhaustive() const noexcept
+  {
+    return (extents_type::rank() < 2)
+           || (exts.extent(extent_to_pad_idx) == padded_stride.value(0));
+  }
+  static constexpr bool is_strided() noexcept { return true; }
+
+  constexpr index_type stride(rank_type r) const noexcept
+  {
+    assert(r < extents_type::rank());
+    if (r == extents_type::rank() - 1) return index_type(1);
+
+    index_type value = padded_stride.value(0);
+    for (rank_type k = extents_type::rank() - 2; k > r; k--) value *= exts.extent(k);
+
+    return value;
+  }
+
+  /**
+   * Equality operator between `layout_right_padded`s
+   *
+   * This overload only participates in overload resolution if `OtherExtents::rank() == extents_type::rank()`.
+   *
+   * \note There is currently a difference from p2642r2, where this function is specified as taking
+   * `layout_right_padded< padding_value >::mapping< Extents>`. However, this makes `padding_value` non-deducible.
+   */
+  MDSPAN_TEMPLATE_REQUIRES(
+    class _Mapping,
+    /* requires */ (
+      detail::is_layout_right_padded_mapping<_Mapping>::value
+      && (_Mapping::extents_type::rank() == extents_type::rank())
+    )
+  )
+  friend constexpr bool operator==(const mapping &left, const _Mapping &right) noexcept
+  {
+    // Workaround for some compilers not short-circuiting properly with compile-time checks,
+    // i.e. we can't access stride(padded_stride_idx) of a rank-0 mapping
+    bool strides_equal = true;
+    if constexpr (extents_type::rank() > rank_type(1))
+    {
+      strides_equal = left.stride(padded_stride_idx) == right.stride(padded_stride_idx);
+    }
+    return (left.extents() == right.extents()) && strides_equal;
+  }
+
+#if !MDSPAN_HAS_CXX_20
+  /**
+   * Inequality operator between `layout_right_padded`s
+   *
+   * This overload only participates in overload resolution if `OtherExtents::rank() == extents_type::rank()`.
+   */
+  MDSPAN_TEMPLATE_REQUIRES(
+    class _Mapping,
+    /* requires */ (
+      detail::is_layout_right_padded_mapping<_Mapping>::value
+      && (_Mapping::extents_type::rank() == extents_type::rank())
+    )
+  )
+  friend constexpr bool operator!=(const mapping &left, const _Mapping &right) noexcept
+  {
+    return !(left == right);
+  }
+#endif
+};
+}
+}
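The `is_exhaustive()` query above is a single comparison: the mapping covers its required span without gaps exactly when the padded stride equals the extent being padded. A sketch under the same `std::experimental` namespace assumption as earlier, not part of the patch:

#include <mdspan/mdspan.hpp>
#include <cassert>
#include <cstddef>

void exhaustive_sketch() {
  namespace stdex = std::experimental;
  using Ext = stdex::extents<std::size_t, 3, 5>;
  using Map = stdex::layout_right_padded<stdex::dynamic_extent>::mapping<Ext>;
  Map tight{Ext{}, 5};   // 5 rounds up to 5: no padding inserted
  Map padded{Ext{}, 8};  // 5 rounds up to 8: a three-element gap per row
  assert(tight.is_exhaustive());
  assert(!padded.is_exhaustive());
}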
diff --git a/tpls/mdspan/include/experimental/__p2642_bits/layout_padded_fwd.hpp b/tpls/mdspan/include/experimental/__p2642_bits/layout_padded_fwd.hpp
new file mode 100644
index 00000000000..945f091a2dc
--- /dev/null
+++ b/tpls/mdspan/include/experimental/__p2642_bits/layout_padded_fwd.hpp
@@ -0,0 +1,117 @@
+//@HEADER
+// ************************************************************************
+//
+// Kokkos v. 4.0
+// Copyright (2022) National Technology & Engineering
+// Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions.
+//
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//@HEADER
+#pragma once
+
+#include <cassert>
+#include "../__p0009_bits/dynamic_extent.hpp"
+
+namespace MDSPAN_IMPL_STANDARD_NAMESPACE {
+namespace MDSPAN_IMPL_PROPOSED_NAMESPACE {
+
+template <size_t PaddingValue = dynamic_extent>
+struct layout_left_padded {
+  template <class _Extents>
+  class mapping;
+};
+
+template <size_t PaddingValue = dynamic_extent>
+struct layout_right_padded {
+  template <class _Extents>
+  class mapping;
+};
+
+namespace detail {
+// The layout_padded_constants structs are only useful if rank > 1, otherwise they may wrap
+template <class _Layout, class _ExtentsType>
+struct layout_padded_constants;
+
+template <class _ExtentsType, size_t _PaddingStride>
+struct layout_padded_constants<layout_left_padded<_PaddingStride>, _ExtentsType>
+{
+  using rank_type = typename _ExtentsType::rank_type;
+  static constexpr rank_type padded_stride_idx = 1;
+  static constexpr rank_type extent_to_pad_idx = 0;
+};
+
+template <class _ExtentsType, size_t _PaddingStride>
+struct layout_padded_constants<layout_right_padded<_PaddingStride>, _ExtentsType>
+{
+  using rank_type = typename _ExtentsType::rank_type;
+  static constexpr rank_type padded_stride_idx = _ExtentsType::rank() - 2;
+  static constexpr rank_type extent_to_pad_idx = _ExtentsType::rank() - 1;
+};
+
+template <class _Layout>
+struct is_layout_left_padded : std::false_type {};
+
+template <size_t _PaddingStride>
+struct is_layout_left_padded<layout_left_padded<_PaddingStride>> : std::true_type {};
+
+template <class _Mapping, class _Enabled = void>
+struct is_layout_left_padded_mapping : std::false_type {};
+
+template <class _Mapping>
+struct is_layout_left_padded_mapping<_Mapping,
+  std::enable_if_t<std::is_same<_Mapping, typename layout_left_padded<_Mapping::padding_value>::template mapping<typename _Mapping::extents_type>>::value>>
+    : std::true_type {};
+
+template <class _Layout>
+struct is_layout_right_padded : std::false_type {};
+
+template <size_t _PaddingStride>
+struct is_layout_right_padded<layout_right_padded<_PaddingStride>> : std::true_type {};
+
+template <class _Mapping, class _Enabled = void>
+struct is_layout_right_padded_mapping : std::false_type {};
+
+template <class _Mapping>
+struct is_layout_right_padded_mapping<_Mapping,
+  std::enable_if_t<std::is_same<_Mapping, typename layout_right_padded<_Mapping::padding_value>::template mapping<typename _Mapping::extents_type>>::value>>
+    : std::true_type {};
+
+template <class _LayoutExtentsType, class _PaddedLayoutMappingType>
+constexpr void check_padded_layout_converting_constructor_mandates()
+{
+  if constexpr (_LayoutExtentsType::rank() > 1) {
+    using extents_type = typename _PaddedLayoutMappingType::extents_type;
+    constexpr auto padding_value = _PaddedLayoutMappingType::padding_value;
+    constexpr auto idx = layout_padded_constants<typename _PaddedLayoutMappingType::layout_type, _LayoutExtentsType>::extent_to_pad_idx;
+    if constexpr ((_LayoutExtentsType::static_extent(idx) != dynamic_extent) &&
+                  (extents_type::static_extent(idx) != dynamic_extent) &&
+                  (padding_value != dynamic_extent)) {
+      if constexpr (padding_value == 0) {
+        static_assert(_LayoutExtentsType::static_extent(idx) == 0);
+      } else {
+        static_assert(
+            _LayoutExtentsType::static_extent(idx) % padding_value == 0);
+      }
+    }
+  }
+}
+
+template <class _ExtentsType, class _OtherMapping>
+constexpr void check_padded_layout_converting_constructor_preconditions([[maybe_unused]] const _OtherMapping &other_mapping) {
+  if constexpr (_ExtentsType::rank() > 1) {
+    constexpr auto padded_stride_idx =
+        layout_padded_constants<typename _OtherMapping::layout_type, _ExtentsType>::padded_stride_idx;
+    constexpr auto extent_to_pad_idx = layout_padded_constants<typename _OtherMapping::layout_type, _ExtentsType>::extent_to_pad_idx;
+    assert(other_mapping.stride(padded_stride_idx) == other_mapping.extents().extent(extent_to_pad_idx));
+  }
+}
+}
+}
+}
diff --git a/tpls/mdspan/include/mdspan/mdspan.hpp b/tpls/mdspan/include/mdspan/mdspan.hpp
index b440873526a..ac72a1a4e64 100644
--- a/tpls/mdspan/include/mdspan/mdspan.hpp
+++ b/tpls/mdspan/include/mdspan/mdspan.hpp
@@ -35,6 +35,7 @@
 #include "../experimental/__p0009_bits/layout_right.hpp"
 #include "../experimental/__p0009_bits/macros.hpp"
 #if MDSPAN_HAS_CXX_17
+#include "../experimental/__p2642_bits/layout_padded.hpp"
 #include "../experimental/__p2630_bits/submdspan.hpp"
 #endif
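With the include wired into mdspan.hpp, the padded layouts become available in any C++17 build. An end-to-end sketch, again assuming the standalone `std::experimental` namespace and the default configuration in which mdspan's paren call operator is enabled pre-C++23; the numbers and names are illustrative only, not part of the patch:

#include <mdspan/mdspan.hpp>
#include <cstddef>

int padded_mdspan_sketch() {
  namespace stdex = std::experimental;
  // Column-major 3 x 3 matrix whose columns are padded to 4 elements,
  // e.g. to keep every column aligned for vectorized access.
  using Ext = stdex::extents<std::size_t, 3, 3>;
  using Layout = stdex::layout_left_padded<4>;
  float storage[4 * 3] = {};
  stdex::mdspan<float, Ext, Layout> m{storage, Layout::mapping<Ext>{}};
  m(2, 1) = 1.0f;  // element offset 2 + 1 * 4 == 6 in storage
  return static_cast<int>(m.mapping().required_span_size());  // 4 * 3 == 12
}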