Skip to content

Commit

Permalink
status wait time fix; test to fail
Browse files Browse the repository at this point in the history
  • Loading branch information
LenkaNovak committed Apr 2, 2024
1 parent ec7b507 commit ab1819a
Show file tree
Hide file tree
Showing 2 changed files with 14 additions and 10 deletions.
4 changes: 2 additions & 2 deletions test/mpi_tests/local_checks.sh
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
#SBATCH --mem-per-cpu=16G
#SBATCH --partition=expansion

export MODULE_PATH=/groups/esm/modules:$MODULE_PATH
export MODULEPATH="/groups/esm/modules:$MODULEPATH"
module purge
module load climacommon/2024_03_18

Expand Down Expand Up @@ -53,6 +53,6 @@ else
fi

# Troubleshooting?
# - ensure you're using the latest module file of climacommon
# - ensure you're using the latest module file of climacommon and set MODULEPATH to the correct location
# - ensure you're using the latest version of ClimaCoupler.jl
# - did you cd to your version of ClimaCoupler.jl?
20 changes: 12 additions & 8 deletions test/mpi_tests/test_sbatch_script.sh
100644 → 100755
Original file line number Diff line number Diff line change
@@ -1,28 +1,32 @@
#!/bin/bash

# This script submits a job to the Slurm scheduler and waits for it to finish. It
# reports the job status every 30 seconds until the job completes. If the job
# fails or is terminated, the script prints an error message and exits with a
# non-zero status code. This is used by Buildkite to determine whether the job
# truly succeeded or failed.

# Submit the sbatch script and capture its job ID
JOB_ID=$(sbatch test/mpi_tests/local_checks.sh | awk '{print $4}')
echo "Submitted job with ID: $JOB_ID, output log: slurm-$JOB_ID.out"
START_TIME=$(date +%s)
# Loop until the job finishes
while true; do
# Check the status of the job
STATUS=$(squeue -j $JOB_ID | grep $JOB_ID | awk '{print $5}')
echo "Pre eval status: $STATUS"
STATUS=$(scontrol show job $JOB_ID | grep -oP 'JobState=\K\S+')
sleep 30
ELAPSED_TIME=$(( $(date +%s) - $START_TIME ))
# If the job state is empty, 'PENDING' or 'RUNNING' (squeue short codes 'PD' / 'R'), wait and continue checking
if [ "$STATUS" == "PD" ] || [ "$STATUS" == "R" ]; then
sleep 20
if [ "$STATUS" == "" ] || [ "$STATUS" == "PENDING" ] || [ "$STATUS" == "RUNNING" ]; then
echo "Job is still running... Elapsed time: $ELAPSED_TIME seconds."
# If the job state is 'COMPLETED' (squeue short code 'CD'; note that 'CF' means CONFIGURING, not completed), print success message and exit
elif [ "$STATUS" == "CF" ]; then
elif [ "$STATUS" == "COMPLETED" ]; then
echo "Job completed successfully."
echo "Post eval status: $STATUS"
exit 0
# If the job status is anything else, print error message and exit
else
echo "Error: Job failed or terminated. See slurm-$JOB_ID.out for more information."
echo "Post eval status: $STATUS"
cat "slurm-$JOB_ID.out"
exit 1
fi
done
done

0 comments on commit ab1819a

Please sign in to comment.