# Fix remaining swarm D->H->D copies #1122
# This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
# Learn more about bidirectional Unicode characters
# Extended CI workflow: workflow name, triggers, concurrency policy and the
# environment variables shared by all jobs.
name: CI extended

on:
  # run every day at 06:00 UTC
  schedule:
    - cron: '0 6 * * *'
  # when triggered manually
  workflow_dispatch:
  # when auto merge is enabled (hack to make sure it's run before merging)
  pull_request:
    types: [auto_merge_enabled]

# Cancel "duplicated" workflows triggered by pushes to internal
# branches with associated PRs.
concurrency:
  group: ${{ github.ref }}-${{ github.head_ref }}-CI-extended
  cancel-in-progress: true

env:
  # make ctest print the output of failing tests
  CTEST_OUTPUT_ON_FAILURE: 1
  CMAKE_BUILD_PARALLEL_LEVEL: 5  # num threads for build
  # NOTE(review): presumably picked up by the project's CMake setup as the
  # default machine configuration file - confirm against the CMake sources.
  MACHINE_CFG: cmake/machinecfg/CI.cmake
  # Open MPI MCA parameters passed via the environment
  OMPI_MCA_mpi_common_cuda_event_max: 1000
  # https://github.com/open-mpi/ompi/issues/4948#issuecomment-395468231
  OMPI_MCA_btl_vader_single_copy_mechanism: none
jobs:
  # Builds every device/parallel combination on the A100 runner, then runs the
  # performance tests (serial only), the regression tests and, for the
  # cuda + mpi combination, an Ascent integration check.
  perf-and-regression:
    strategy:
      matrix:
        device: ['cuda', 'host']
        parallel: ['serial', 'mpi']
    runs-on: [self-hosted, A100]
    container:
      image: ghcr.io/parthenon-hpc-lab/cuda11.6-mpi-hdf5-ascent
      # map to local user id on CI machine to allow writing to build cache
      options: --user 1001
    steps:
      - uses: actions/checkout@v3
        with:
          submodules: 'true'
      - name: Setup cache for gold standard
        uses: actions/cache@v3
        with:
          path: tst/regression/gold_standard/
          key: gold-standard
      # ASAN is switched on for host builds only; the result is exposed as a
      # step output that the Configure step consumes.
      - name: Set vars based on matrix
        id: cmake-vars
        run: |
          if [ "${{ matrix.device }}" = "host" ]; then
            echo "enable_asan=ON" >> "$GITHUB_OUTPUT"
          else
            echo "enable_asan=OFF" >> "$GITHUB_OUTPUT"
          fi
      - name: Configure
        run: |
          cmake -B build \
            -DCMAKE_BUILD_TYPE=Release \
            -DENABLE_ASAN=${{ steps.cmake-vars.outputs.enable_asan }} \
            -DMACHINE_VARIANT=${{ matrix.device }}-${{ matrix.parallel }}
      - name: Build
        run: cmake --build build
      # run performance "unit" tests (none use MPI)
      - name: Performance tests
        if: ${{ matrix.parallel == 'serial' }}
        run: |
          cd build
          # Pick GPU with most available memory
          export CUDA_VISIBLE_DEVICES=$(nvidia-smi --query-gpu=memory.free,index --format=csv,nounits,noheader | sort -nr | head -1 | awk '{ print $NF }')
          # Sanitizers options (leak detection is disabled)
          export ASAN_OPTIONS=abort_on_error=1:fast_unwind_on_malloc=1
          export UBSAN_OPTIONS=print_stacktrace=0
          export LSAN_OPTIONS=detect_leaks=0
          ctest -L performance -LE perf-reg
      # run regression tests
      - name: Regression tests
        run: |
          cd build
          # Pick GPU with most available memory
          export CUDA_VISIBLE_DEVICES=$(nvidia-smi --query-gpu=memory.free,index --format=csv,nounits,noheader | sort -nr | head -1 | awk '{ print $NF }')
          # Sanitizers options (disable leak detection for MPI runs, due to OpenMPI leaks)
          export ASAN_OPTIONS=abort_on_error=1:fast_unwind_on_malloc=1
          export UBSAN_OPTIONS=print_stacktrace=0
          export LSAN_OPTIONS=detect_leaks=0
          ctest -L regression -L ${{ matrix.parallel }} -LE perf-reg --timeout 3600
      # Test Ascent integration (only most complex setup with MPI and on device)
      - name: Ascent tests
        if: ${{ matrix.parallel == 'mpi' && matrix.device == 'cuda' }}
        run: |
          # separate build tree with Ascent support enabled
          cmake -B build-ascent \
            -DCMAKE_BUILD_TYPE=Release \
            -DMACHINE_VARIANT=${{ matrix.device }}-${{ matrix.parallel }} \
            -DPARTHENON_ENABLE_ASCENT=ON \
            -DAscent_DIR=/usr/local/ascent-develop/lib/cmake/ascent
          cmake --build build-ascent
          cd example/advection/
          # Pick GPU with most available memory
          export CUDA_VISIBLE_DEVICES=$(nvidia-smi --query-gpu=memory.free,index --format=csv,nounits,noheader | sort -nr | head -1 | awk '{ print $NF }')
          mpirun -np 2 ../../build-ascent/example/advection/advection-example \
            -i parthinput.advection \
            parthenon/output5/dt=0.05 \
            parthenon/time/tlim=0.1
          # check if file exists
          if [ ! -f "ascent_render_57.png" ]; then
            echo "'ascent_render_57.png' does not exist."
            exit 1
          fi
      - name: Upload logs and convergence results
        # Upload even when an earlier step failed: the logs and convergence
        # plots are most useful for diagnosing a failing run.
        if: ${{ !cancelled() }}
        # v3 of the artifact actions has been shut down on github.com
        uses: actions/upload-artifact@v4
        with:
          name: log-and-convergence-${{ matrix.device }}-${{ matrix.parallel }}
          path: |
            build/CMakeFiles/CMakeOutput.log
            build/tst/regression/outputs/advection_convergence*/advection-errors.dat
            build/tst/regression/outputs/advection_convergence*/advection-errors.png
            example/advection/ascent_render_57.png
          retention-days: 3

  # Builds the HIP variants with hipcc on the navi1030 runner and runs the
  # performance tests (serial only) and the regression tests.
  perf-and-regression-amdgpu:
    strategy:
      matrix:
        parallel: ['serial', 'mpi']
    runs-on: [self-hosted, navi1030]
    container:
      image: ghcr.io/parthenon-hpc-lab/rocm5.4.3-mpi-hdf5
      # Map to local user id on CI machine to allow writing to build cache and
      # forward device handles to access AMD GPU within container
      options: --user 1000 -w /home/ci --device /dev/kfd --device /dev/dri --security-opt seccomp=unconfined
    env:
      CMAKE_GENERATOR: Ninja
      CMAKE_BUILD_PARALLEL_LEVEL: 8  # num threads for build
    steps:
      - uses: actions/checkout@v3
        with:
          submodules: 'true'
      - name: Setup cache for gold standard
        uses: actions/cache@v3
        with:
          path: tst/regression/gold_standard/
          key: gold-standard
      - name: Configure
        run: |
          # passes an explicit machine config on the command line (different
          # file than the workflow-level MACHINE_CFG environment value)
          cmake -B build \
            -DMACHINE_CFG=${PWD}/cmake/machinecfg/GitHubActions.cmake \
            -DCMAKE_BUILD_TYPE=Release \
            -DMACHINE_VARIANT=hip-${{ matrix.parallel }} \
            -DCMAKE_CXX_COMPILER=hipcc
      - name: Build
        run: cmake --build build
      # run performance "unit" tests (none use MPI)
      - name: Performance tests
        if: ${{ matrix.parallel == 'serial' }}
        run: |
          cd build
          ctest -L performance -LE perf-reg
      # run regression tests
      - name: Regression tests
        run: |
          cd build
          ctest -L regression -L ${{ matrix.parallel }} -LE perf-reg --timeout 3600
      - name: Upload logs and convergence results
        # Upload even when an earlier step failed: the logs and convergence
        # plots are most useful for diagnosing a failing run.
        if: ${{ !cancelled() }}
        # v3 of the artifact actions has been shut down on github.com
        uses: actions/upload-artifact@v4
        with:
          name: log-and-convergence-${{ matrix.parallel }}
          path: |
            build/CMakeFiles/CMakeOutput.log
            build/tst/regression/outputs/advection_convergence*/advection-errors.dat
            build/tst/regression/outputs/advection_convergence*/advection-errors.png
          retention-days: 3