-
Notifications
You must be signed in to change notification settings - Fork 5
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
add error up climacommons test error test wait split cmds split cmds split to diff steps gp fix no { rm init one ln cmd clean options now sol passes test to fail on BK revs try try pip fix try no bl ln try try sep script sep script sep script sep script try try try revert fail test exit 0 try try fail test print status status wait time fix; test to fail try to fail, turn off depot test to pass
- Loading branch information
1 parent
4e528c2
commit 6e5e843
Showing
7 changed files
with
112 additions
and
34 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
19 changes: 19 additions & 0 deletions
19
config/model_configs/coarse_single_ft64_hourly_checkpoints_restart.yml
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,19 @@ | ||
anim: false | ||
apply_limiter: false | ||
dt: "400secs" | ||
dt_cpl: 400 | ||
dt_save_restart: "10days" | ||
dt_save_to_sol: "1days" | ||
energy_check: false | ||
h_elem: 6 | ||
hourly_checkpoint: true | ||
hourly_checkpoint_dt: 1 | ||
job_id: "coarse_single_ft64_hourly_checkpoints_restart" | ||
mode_name: "amip" | ||
moist: "equil" | ||
mono_surface: false | ||
precip_model: "0M" | ||
rad: "gray" | ||
run_name: "coarse_single_ft64_hourly_checkpoints_restart" | ||
t_end: "800secs" | ||
vert_diff: "true" |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,32 @@ | ||
#!/bin/bash

# This script submits a job to the Slurm scheduler and waits for it to finish.
# It polls the job state every 30 seconds until the job reaches a terminal
# state. If the job fails or is terminated, the script prints an error message
# (followed by the job's output log, when available) and exits with a non-zero
# status code. This is used by Buildkite to determine whether the job truly
# succeeded or failed.
#
# Exit status:
#   0 - the job reached the COMPLETED state
#   1 - submission failed, or the job ended in any other terminal state

# Seconds to wait between two consecutive job-state polls.
POLL_INTERVAL=30

# Submit the sbatch script and capture its job ID. sbatch reports
# "Submitted batch job <id>", so the ID is the fourth whitespace-separated field.
JOB_ID=$(sbatch test/mpi_tests/local_checks.sh | awk '{print $4}')

# Without a numeric job ID there is nothing to poll: `scontrol show job` with an
# empty argument would report on unrelated jobs (or nothing at all), so bail out.
if ! [[ "$JOB_ID" =~ ^[0-9]+$ ]]; then
  echo "Error: job submission failed (sbatch returned no valid job ID: '$JOB_ID')." >&2
  exit 1
fi

echo "Submitted job with ID: $JOB_ID, output log: slurm-$JOB_ID.out"
START_TIME=$(date +%s)

# Loop until the job reaches a terminal state
while true; do
  # Query the current long-form job state (e.g. PENDING, RUNNING, COMPLETED)
  STATUS=$(scontrol show job "$JOB_ID" | grep -oP 'JobState=\K\S+')
  ELAPSED_TIME=$(( $(date +%s) - START_TIME ))

  case "$STATUS" in
    # The job finished successfully
    COMPLETED)
      echo "Job completed successfully."
      exit 0
      ;;
    # Non-terminal states: the job is queued, starting up, running, or winding
    # down. An empty state (scontrol returned nothing parseable) is also
    # treated as "not finished yet". Sleep, then poll again.
    "" | PENDING | CONFIGURING | RUNNING | COMPLETING | SUSPENDED | \
      REQUEUED | RESIZING | SIGNALING | STAGE_OUT)
      echo "Job is still running... Elapsed time: $ELAPSED_TIME seconds."
      sleep "$POLL_INTERVAL"
      ;;
    # Any other state (FAILED, CANCELLED, TIMEOUT, NODE_FAIL, ...) is a failure
    *)
      echo "Error: Job failed or terminated. See slurm-$JOB_ID.out for more information."
      if [[ -r "slurm-$JOB_ID.out" ]]; then
        cat "slurm-$JOB_ID.out"
      else
        echo "Warning: slurm-$JOB_ID.out not found or not readable (final job state: $STATUS)." >&2
      fi
      exit 1
      ;;
  esac
done