From ab1819a7e41c762cfee7017f4d6d274a47ffd724 Mon Sep 17 00:00:00 2001
From: lenka
Date: Tue, 2 Apr 2024 09:50:50 -0700
Subject: [PATCH] status wait time fix; test to fail

---
Notes (ignored by `git am` / `git apply`; not part of the commit message):

* This copy's line structure was rebuilt from a whitespace-collapsed version.
  Blank context lines were placed from the hunk line counts. The leading
  indentation of the shell lines inside the hunks is assumed to be 4 spaces
  (TODO confirm against the target files). If a hunk is rejected, retry with
  `git apply --ignore-whitespace`.
* NOTE(review): in test_sbatch_script.sh the unchanged context comments still
  name the old squeue codes ('PD', 'R', 'CF'), while the new code compares
  against the long state names PENDING / RUNNING / COMPLETED.
* NOTE(review): an empty STATUS is treated as "still running", so the loop has
  no exit if `scontrol show job` never reports a JobState. Consider a timeout.
* NOTE(review): any state other than PENDING / RUNNING / COMPLETED falls into
  the failure branch. Presumably a transitional state such as COMPLETING would
  too; confirm against the Slurm job state list.

 test/mpi_tests/local_checks.sh       |  4 ++--
 test/mpi_tests/test_sbatch_script.sh | 20 ++++++++++++--------
 2 files changed, 14 insertions(+), 10 deletions(-)
 mode change 100644 => 100755 test/mpi_tests/test_sbatch_script.sh

diff --git a/test/mpi_tests/local_checks.sh b/test/mpi_tests/local_checks.sh
index 102979fcb0..9817e293fc 100644
--- a/test/mpi_tests/local_checks.sh
+++ b/test/mpi_tests/local_checks.sh
@@ -5,7 +5,7 @@
 #SBATCH --mem-per-cpu=16G
 #SBATCH --partition=expansion
 
-export MODULE_PATH=/groups/esm/modules:$MODULE_PATH
+export MODULEPATH="/groups/esm/modules:$MODULEPATH"
 module purge
 module load climacommon/2024_03_18
 
@@ -53,6 +53,6 @@ else
 fi
 
 # Trouble shooting?
-# - ensure you're using the latest module file of climacommon
+# - ensure you're using the latest module file of climacommon and set MODULEPATH to the correct location
 # - ensure you're using the latest version of ClimaCoupler.jl
 # - did you cd to your version of ClimaCoupler.jl?
diff --git a/test/mpi_tests/test_sbatch_script.sh b/test/mpi_tests/test_sbatch_script.sh
old mode 100644
new mode 100755
index 1056cfe182..83f1a9ac78
--- a/test/mpi_tests/test_sbatch_script.sh
+++ b/test/mpi_tests/test_sbatch_script.sh
@@ -1,5 +1,11 @@
 #!/bin/bash
 
+# This script submits a job to the Slurm scheduler and waits for it to finish. It
+# reports the job status every 30 seconds until the job completes. If the job
+# fails or is terminated, the script prints an error message and exits with a
+# non-zero status code. This is used by Buildkite to determine whether the job
+# truly succeeded or failed.
+
 # Submit the sbatch script and capture its job ID
 JOB_ID=$(sbatch test/mpi_tests/local_checks.sh | awk '{print $4}')
 echo "Submitted job with ID: $JOB_ID, output log: slurm-$JOB_ID.out"
@@ -7,22 +13,20 @@ START_TIME=$(date +%s)
 # Loop until the job finishes
 while true; do
     # Check the status of the job
-    STATUS=$(squeue -j $JOB_ID | grep $JOB_ID | awk '{print $5}')
-    echo "Pre eval status: $STATUS"
+    STATUS=$(scontrol show job $JOB_ID | grep -oP 'JobState=\K\S+')
+    sleep 30
     ELAPSED_TIME=$(( $(date +%s) - $START_TIME ))
     # If the job status is 'PD' (pending) or 'R' (running), wait and continue checking
-    if [ "$STATUS" == "PD" ] || [ "$STATUS" == "R" ]; then
-        sleep 20
+    if [ "$STATUS" == "" ] || [ "$STATUS" == "PENDING" ] || [ "$STATUS" == "RUNNING" ]; then
         echo "Job is still running... Elapsed time: $ELAPSED_TIME seconds."
     # If the job status is 'CF' (completed successfully), print success message and exit
-    elif [ "$STATUS" == "CF" ]; then
+    elif [ "$STATUS" == "COMPLETED" ]; then
         echo "Job completed successfully."
-        echo "Post eval status: $STATUS"
         exit 0
     # If the job status is anything else, print error message and exit
     else
         echo "Error: Job failed or terminated. See slurm-$JOB_ID.out for more information."
-        echo "Post eval status: $STATUS"
+        cat "slurm-$JOB_ID.out"
         exit 1
     fi
-done
\ No newline at end of file
+done