From 3bdc5ab6b4eaf17a60f43ee77b7538b4270fa19c Mon Sep 17 00:00:00 2001
From: lenka
Date: Thu, 28 Mar 2024 17:22:26 -0700
Subject: [PATCH] sep script

Move the sbatch submit-and-monitor logic out of the inline Buildkite
command and into test/mpi_tests/test_sbatch_script.sh.

The script submits test/mpi_tests/local_checks.sh, polls the job state
every 60 seconds, and exits 0 only when Slurm reports the job as
COMPLETED. Any other final state, or a failed submission, exits 1.

---
Review notes (not part of the commit message):
 * pipeline.yml: the step runs the script through bash with a path
   relative to the repository root, because the file is added with mode
   100644 and itself refers to test/mpi_tests/local_checks.sh relative
   to the root. Assumes Buildkite runs commands from the checkout root.
 * script: restored the missing '#' on the first comment; 'CF' is the
   Slurm code for CONFIGURING, not for a completed job, so the state is
   now read in long form and COMPLETED is the only success state; sacct
   is consulted once squeue no longer lists the job.
 * Leading whitespace in the pipeline.yml hunk was reconstructed and the
   post-image blob hashes on the index lines were not recomputed; check
   both against the tree before applying.

 .buildkite/pipeline.yml              |  7 ++--
 test/mpi_tests/test_sbatch_script.sh | 56 +++++++++++++++++++++++++++++
 2 files changed, 58 insertions(+), 5 deletions(-)
 create mode 100644 test/mpi_tests/test_sbatch_script.sh

diff --git a/.buildkite/pipeline.yml b/.buildkite/pipeline.yml
index 1ba80b177..758f335ce 100644
--- a/.buildkite/pipeline.yml
+++ b/.buildkite/pipeline.yml
@@ -433,11 +433,8 @@ steps:
     steps:
       - label: "Submit and Monitor sbatch Job on Caltech HPC"
         # check that (1) the script can be succesfully submitted, (2) it runs successfully
-        command:
-          - |
-            # Submit the sbatch script and capture its job ID
-            # JOB_ID=$(sbatch test/mpi_tests/local_checks.sh | awk '{print $4}')
-        agents:
+        command: "bash test/mpi_tests/test_sbatch_script.sh"
+        agents:
           slurm_ntasks: 1

   - wait
diff --git a/test/mpi_tests/test_sbatch_script.sh b/test/mpi_tests/test_sbatch_script.sh
new file mode 100644
index 000000000..1e7f7ebde
--- /dev/null
+++ b/test/mpi_tests/test_sbatch_script.sh
@@ -0,0 +1,56 @@
+#!/bin/bash
+#
+# Submit test/mpi_tests/local_checks.sh with sbatch and wait for it to finish.
+# Must be run from the repository root. Exits 0 only if the job's final Slurm
+# state is COMPLETED; exits 1 if submission fails or the job ends otherwise.
+
+set -u
+
+readonly POLL_INTERVAL=60 # seconds between job state checks
+
+# Print the long-form Slurm state (e.g. PENDING, RUNNING, COMPLETED) of job $1.
+# squeue is asked first; if it reports nothing (the job has left the queue),
+# sacct is asked for the recorded state. Prints an empty line if both are silent.
+job_state() {
+    local state
+    state=$(squeue -h -j "$1" -o "%T" 2>/dev/null | head -n 1)
+    if [[ -z "$state" ]]; then
+        state=$(sacct -X -n -P -j "$1" -o State 2>/dev/null | head -n 1)
+    fi
+    # sacct reports cancellations as "CANCELLED by <uid>"; keep the first word
+    printf '%s\n' "${state%% *}"
+}
+
+# Submit the sbatch script and capture its job ID
+JOB_ID=$(sbatch test/mpi_tests/local_checks.sh | awk '{print $4}')
+if [[ -z "$JOB_ID" ]]; then
+    echo "Error: sbatch did not return a job ID." >&2
+    exit 1
+fi
+echo "Submitted job with ID: $JOB_ID, output log: slurm-$JOB_ID.out"
+START_TIME=$(date +%s)
+
+# Loop until the job finishes
+while true; do
+    # Check the status of the job
+    STATUS=$(job_state "$JOB_ID")
+    ELAPSED_TIME=$(($(date +%s) - START_TIME))
+    case "$STATUS" in
+        # Job is queued or still active: wait and continue checking
+        PENDING | CONFIGURING | RUNNING | COMPLETING)
+            echo "Job is still running... Elapsed time: $ELAPSED_TIME seconds."
+            sleep "$POLL_INTERVAL"
+            ;;
+        # Job finished with a COMPLETED state: print success message and exit
+        COMPLETED)
+            echo "Job completed successfully."
+            exit 0
+            ;;
+        # Any other state (or none reported): print error message and exit
+        *)
+            echo "Final job state: ${STATUS:-unknown}"
+            echo "Error: Job failed or terminated. See slurm-$JOB_ID.out for more information."
+            exit 1
+            ;;
+    esac
+done