Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Use squeue to find a Slurm job's status #121

Merged
merged 1 commit into from
Dec 14, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
53 changes: 39 additions & 14 deletions src/slurm.jl
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@

export kwargs, slurm_model_run, wait_for_jobs

# Initial code is common to PBS and Slurm schedulers
Expand Down Expand Up @@ -104,8 +105,10 @@ end
# Predicates over a job status symbol (the value that `job_status` produces).
job_pending(status::Symbol) = status === :PENDING
job_running(status::Symbol) = status === :RUNNING
job_success(status::Symbol) = status === :COMPLETED
job_failed(status::Symbol) = status === :FAILED
# A job is "completed" once it has either succeeded or failed.
job_completed(status::Symbol) = job_success(status) || job_failed(status)

# Convenience methods: look up the job's current status via `job_status`,
# then apply the matching `Symbol` predicate to it.
job_pending(jobid) = jobid |> job_status |> job_pending
job_running(jobid) = jobid |> job_status |> job_running
job_success(jobid) = jobid |> job_status |> job_success
job_failed(jobid) = jobid |> job_status |> job_failed
Expand Down Expand Up @@ -281,22 +284,44 @@ wait_for_jobs(
)

"""
job_status(jobid)
job_status(job_id)

Parse the slurm jobid's state and return one of three status symbols: :COMPLETED, :FAILED, or :RUNNING.
Query the Slurm job_id's state and return one of three status symbols: :PENDING, :RUNNING, or :COMPLETED. Jobs that failed, were cancelled, or report an unrecognized state are also returned as :COMPLETED.
"""
function job_status(jobid::SlurmJobID)
failure_statuses = ("FAILED", "CANCELLED+", "CANCELLED")
output = readchomp(`sacct -j $jobid --format=State --noheader`)
# Jobs usually have multiple statuses
statuses = strip.(split(output, "\n"))
if all(s -> s == "COMPLETED", statuses)
return :COMPLETED
elseif any(s -> s in failure_statuses, statuses)
return :FAILED
else
return :RUNNING
end
function job_status(job_id::SlurmJobID)
    # Query `squeue` for the state of `job_id` and map it to one of
    # `:PENDING`, `:RUNNING` or `:COMPLETED`. A job that `squeue` no longer
    # lists (empty output, or an "Invalid job id" error) is treated as
    # completed, as is any state not recognized below (with a warning).
    cmd = `squeue -j $job_id --format=%T --noheader`

    # Capture stdout and stderr separately. `ignorestatus` stops `run` from
    # throwing on a non-zero exit so the exit code can be inspected below.
    # Local names deliberately avoid shadowing `Base.stdout` / `Base.stderr`.
    out_pipe = Pipe()
    err_pipe = Pipe()
    process = run(pipeline(ignorestatus(cmd), stdout = out_pipe, stderr = err_pipe))
    # Close the write ends so the reads below reach EOF instead of blocking.
    close(out_pipe.in)
    close(err_pipe.in)
    # Drop the trailing newline so the empty-output check and the warning
    # message see the bare state text.
    status = strip(String(read(out_pipe)))
    err_output = String(read(err_pipe))
    exit_code = process.exitcode

    # https://slurm.schedmd.com/job_state_codes.html
    pending_statuses = (
        "PENDING",
        "CONFIGURING",
        "REQUEUE_FED",
        "REQUEUE_HOLD",
        "REQUEUED",
        "RESIZING",
    )
    # "RESIZING" is intentionally listed only under pending: the pending check
    # runs first, so a duplicate entry here could never be reached.
    running_statuses = ("RUNNING", "COMPLETING", "STAGED", "SUSPENDED", "STOPPED")
    invalid_job_err = "slurm_load_jobs error: Invalid job id specified"
    @debug job_id status exit_code stderr = err_output

    # No output and no error: the job is no longer listed by squeue.
    isempty(status) && exit_code == 0 && err_output == "" && return :COMPLETED
    # squeue does not know this job id (any more).
    exit_code != 0 && contains(err_output, invalid_job_err) && return :COMPLETED

    # Substring match, since the output may hold more than one state line.
    any(str -> contains(status, str), pending_statuses) && return :PENDING
    any(str -> contains(status, str), running_statuses) && return :RUNNING

    @warn "Job ID $job_id has unknown status `$status`. Marking as completed"
    return :COMPLETED
end

"""
Expand Down
24 changes: 11 additions & 13 deletions test/slurm_unit_tests.jl
Original file line number Diff line number Diff line change
Expand Up @@ -77,32 +77,30 @@ end
# Test job lifecycle
test_cmd = """
#!/bin/bash
#SBATCH --time=00:00:10
sleep 10
#SBATCH --time=00:01:00
sleep 30
"""

jobid = submit_cmd_helper(test_cmd)
@test CAL.job_status(jobid) == :RUNNING
@test CAL.job_running(CAL.job_status(jobid))
@test CAL.job_running(jobid) || CAL.job_pending(jobid)

sleep(180) # Ensure job finishes. To debug, lower sleep time or comment out the code block
sleep(480) # Ensure job finishes. To debug, lower sleep time or comment it out
@test CAL.job_status(jobid) == :COMPLETED
@test CAL.job_completed(CAL.job_status(jobid))
@test CAL.job_success(CAL.job_status(jobid))
@test CAL.job_completed(jobid)
@test CAL.job_success(jobid)

# Test job cancellation
jobid = submit_cmd_helper(test_cmd)
CAL.kill_job(jobid)
sleep(1)
@test CAL.job_status(jobid) == :FAILED
@test CAL.job_completed(CAL.job_status(jobid)) &&
CAL.job_failed(CAL.job_status(jobid))
sleep(5)
@test CAL.job_status(jobid) == :COMPLETED
@test CAL.job_completed(jobid)

# Test batch cancellation
jobids = ntuple(x -> submit_cmd_helper(test_cmd), 5)

CAL.kill_job.(jobids)
sleep(5)
for jobid in jobids
@test CAL.job_completed(CAL.job_status(jobid))
@test CAL.job_failed(CAL.job_status(jobid))
@test CAL.job_completed(jobid)
end
Loading