Skip to content

Commit

Permalink
Merge pull request #121 from CliMA/ne/improve_slurm_job_status
Browse files Browse the repository at this point in the history
Use `squeue` to find a Slurm job's status
  • Loading branch information
nefrathenrici authored Dec 14, 2024
2 parents 81dbec4 + 5a6ae95 commit 60be971
Show file tree
Hide file tree
Showing 2 changed files with 50 additions and 27 deletions.
53 changes: 39 additions & 14 deletions src/slurm.jl
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@

export kwargs, slurm_model_run, wait_for_jobs

# Initial code is common to PBS and Slurm schedulers
Expand Down Expand Up @@ -104,8 +105,10 @@ end
# Predicates over a job status symbol (the same symbols that the job-ID
# methods obtain from `job_status`).
job_pending(status::Symbol) = status === :PENDING
job_running(status::Symbol) = status === :RUNNING
job_success(status::Symbol) = status === :COMPLETED
job_failed(status::Symbol) = status === :FAILED
# A job counts as completed once it has either succeeded or failed.
job_completed(status::Symbol) = job_success(status) || job_failed(status)

# Convenience methods taking a job ID: look up its current status with
# `job_status`, then apply the matching predicate to the result.
job_pending(jobid) = jobid |> job_status |> job_pending
job_running(jobid) = jobid |> job_status |> job_running
job_success(jobid) = jobid |> job_status |> job_success
job_failed(jobid) = jobid |> job_status |> job_failed
Expand Down Expand Up @@ -281,22 +284,44 @@ wait_for_jobs(
)

"""
job_status(jobid)
job_status(job_id)
Parse the slurm jobid's state and return one of three status symbols: :COMPLETED, :FAILED, or :RUNNING.
Parse the slurm job_id's state and return one of three status symbols: :PENDING, :RUNNING, or :COMPLETED.
"""
function job_status(jobid::SlurmJobID)
failure_statuses = ("FAILED", "CANCELLED+", "CANCELLED")
output = readchomp(`sacct -j $jobid --format=State --noheader`)
# Jobs usually have multiple statuses
statuses = strip.(split(output, "\n"))
if all(s -> s == "COMPLETED", statuses)
return :COMPLETED
elseif any(s -> s in failure_statuses, statuses)
return :FAILED
else
return :RUNNING
end
function job_status(job_id::SlurmJobID)
    # Ask `squeue` for the state of `job_id` and reduce it to one of three symbols:
    #   :PENDING   - queued, configuring, requeued or resizing
    #   :RUNNING   - running, completing, staged, suspended or stopped
    #   :COMPLETED - `squeue` prints nothing for the job, rejects the ID as
    #                invalid, or reports a state that is not recognised
    #                (in the last case a warning is logged)
    cmd = `squeue -j $job_id --format=%T --noheader`

    # Capture stdout and stderr separately. `ignorestatus` stops a non-zero exit
    # from throwing, so the error text can be inspected below instead.
    # The locals are deliberately not called `stdout`/`stderr`, which would
    # shadow the Base streams inside this function.
    out_pipe = Pipe()
    err_pipe = Pipe()
    process = run(pipeline(ignorestatus(cmd); stdout = out_pipe, stderr = err_pipe))
    # Close the write ends so the reads below terminate at end-of-stream.
    close(out_pipe.in)
    close(err_pipe.in)
    # Strip the trailing newline so that the emptiness check and the log
    # messages see only the state text itself.
    status = strip(String(read(out_pipe)))
    err_output = String(read(err_pipe))
    exit_code = process.exitcode

    # https://slurm.schedmd.com/job_state_codes.html
    # "RESIZING" is classified as pending only; it used to be listed under the
    # running states as well, where it could never be reached.
    pending_statuses = (
        "PENDING",
        "CONFIGURING",
        "REQUEUE_FED",
        "REQUEUE_HOLD",
        "REQUEUED",
        "RESIZING",
    )
    running_statuses = ("RUNNING", "COMPLETING", "STAGED", "SUSPENDED", "STOPPED")
    invalid_job_err = "slurm_load_jobs error: Invalid job id specified"
    @debug "Queried squeue for job status" job_id status exit_code stderr = err_output

    # No output and no error: the job is no longer in the queue.
    if isempty(status) && exit_code == 0 && isempty(err_output)
        return :COMPLETED
    end
    # squeue does not recognise the ID: treated as a job that has finished.
    if exit_code != 0 && contains(err_output, invalid_job_err)
        return :COMPLETED
    end

    # Substring matching, with pending taking precedence over running.
    any(state -> contains(status, state), pending_statuses) && return :PENDING
    any(state -> contains(status, state), running_statuses) && return :RUNNING

    # Fallback: keep the historical behaviour of reporting :COMPLETED, but attach
    # the exit code and stderr so a failed `squeue` call can be diagnosed.
    @warn "Job ID $job_id has unknown status `$status`. Marking as completed" exit_code stderr =
        err_output
    return :COMPLETED
end

"""
Expand Down
24 changes: 11 additions & 13 deletions test/slurm_unit_tests.jl
Original file line number Diff line number Diff line change
Expand Up @@ -77,32 +77,30 @@ end
# Test job lifecycle
test_cmd = """
#!/bin/bash
#SBATCH --time=00:00:10
sleep 10
#SBATCH --time=00:01:00
sleep 30
"""

# Submit the test script and check the job's initial state; the last check
# below also accepts a job that the scheduler has not started yet (pending).
jobid = submit_cmd_helper(test_cmd)
@test CAL.job_status(jobid) == :RUNNING
@test CAL.job_running(CAL.job_status(jobid))
@test CAL.job_running(jobid) || CAL.job_pending(jobid)

sleep(180) # Ensure job finishes. To debug, lower sleep time or comment out the code block
sleep(480) # Ensure job finishes. To debug, lower sleep time or comment it out
# After the wait the job is expected to be reported as completed and successful,
# both via the status symbol and via the job-ID convenience methods.
@test CAL.job_status(jobid) == :COMPLETED
@test CAL.job_completed(CAL.job_status(jobid))
@test CAL.job_success(CAL.job_status(jobid))
@test CAL.job_completed(jobid)
@test CAL.job_success(jobid)

# Test job cancellation
jobid = submit_cmd_helper(test_cmd)
CAL.kill_job(jobid)
# NOTE(review): the short sleeps presumably give the scheduler time to register
# the cancellation before the status is queried - confirm.
sleep(1)
@test CAL.job_status(jobid) == :FAILED
@test CAL.job_completed(CAL.job_status(jobid)) &&
CAL.job_failed(CAL.job_status(jobid))
sleep(5)
@test CAL.job_status(jobid) == :COMPLETED
@test CAL.job_completed(jobid)

# Test batch cancellation
# Submit five copies of the test script, then kill them all via broadcasting.
jobids = ntuple(x -> submit_cmd_helper(test_cmd), 5)

CAL.kill_job.(jobids)
sleep(5)
# Check the reported status of each killed job.
for jobid in jobids
@test CAL.job_completed(CAL.job_status(jobid))
@test CAL.job_failed(CAL.job_status(jobid))
@test CAL.job_completed(jobid)
end

0 comments on commit 60be971

Please sign in to comment.