Skip to content

Commit

Permalink
slurm handle squeue timeouts (#209)
Browse files Browse the repository at this point in the history
Slurm: handle squeue time-outs by interpreting stderr and raising a specific error when a known failure message is found in it.
  • Loading branch information
johrstrom authored Oct 1, 2020
1 parent c27cebd commit a7eeac1
Show file tree
Hide file tree
Showing 2 changed files with 48 additions and 1 deletion.
19 changes: 18 additions & 1 deletion lib/ood_core/job/adapters/slurm.rb
Original file line number Diff line number Diff line change
Expand Up @@ -80,6 +80,9 @@ class Batch
# from
class Error < StandardError; end

# Signals that a Slurm command reported a socket time-out.
#
# Raised by +interpret_and_raise+ when stderr has a line beginning with
# "slurm_load_jobs error: Socket timed out". Being a subclass of Error, it is
# also caught by any handler that rescues Error.
class SlurmTimeoutError < Error
end

# @param cluster [#to_s, nil] the cluster name
# @param conf [#to_s, nil] path to the slurm conf
# @param bin [#to_s] path to slurm installation binaries
Expand Down Expand Up @@ -147,6 +150,9 @@ def get_jobs(id: "", owner: nil, attrs: nil)
end
jobs
end
rescue SlurmTimeoutError
# TODO: could use a log entry here
return [{ id: id, state: 'undetermined' }]
end

def squeue_fields(attrs)
Expand Down Expand Up @@ -303,7 +309,18 @@ def call(cmd, *args, env: {}, stdin: "")

cmd, args = OodCore::Job::Adapters::Helper.ssh_wrap(submit_host, cmd, args, strict_host_checking)
o, e, s = Open3.capture3(env, cmd, *(args.map(&:to_s)), stdin_data: stdin.to_s)
s.success? ? o : raise(Error, e)
s.success? ? interpret_and_raise(o, e) : raise(Error, e)
end

# Decide, from stderr, whether a command that exited successfully really
# succeeded. Slurm can exit 0 even when the command failed, so the exit
# status alone is not enough.
#
# @param stdout [String] the captured standard output of the command
# @param stderr [String] the captured standard error of the command
# @return [String] +stdout+, untouched, when stderr is empty or holds nothing
#   this method recognizes
# @raise [SlurmTimeoutError] when any line of stderr begins with
#   "slurm_load_jobs error: Socket timed out"; stderr is used as the message
def interpret_and_raise(stdout, stderr)
  unless stderr.empty?
    # `^` anchors at the start of any line, so the marker is found even when
    # other output precedes it in stderr.
    socket_timeout = /^slurm_load_jobs error: Socket timed out/
    raise SlurmTimeoutError, stderr if stderr =~ socket_timeout
  end

  stdout
end

def squeue_attrs_for_info_attrs(attrs)
Expand Down
30 changes: 30 additions & 0 deletions spec/job/adapters/slurm_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -1076,6 +1076,36 @@ def job_info(opts = {})
# [:account, :job_id, :job_name, :node_list, :partition, :scheduled_nodes, :state_compact, :time_used, :user])
# end
end

describe "#get_jobs" do
  # The exact command line the examples below require Open3.capture3 to be
  # called with when looking up job "123".
  let(:squeue_args) do
    [
      "squeue",
      "--all",
      "--states=all",
      "--noconvert",
      "-o",
      "\u001E%a\u001F%A\u001F%B\u001F%c\u001F%C\u001F%d\u001F%D\u001F%e\u001F%E\u001F%f\u001F%F\u001F%g\u001F%G\u001F%h\u001F%H\u001F%i\u001F%I\u001F%j\u001F%J\u001F%k\u001F%K\u001F%l\u001F%L\u001F%m\u001F%M\u001F%n\u001F%N\u001F%o\u001F%O\u001F%q\u001F%P\u001F%Q\u001F%r\u001F%S\u001F%t\u001F%T\u001F%u\u001F%U\u001F%v\u001F%V\u001F%w\u001F%W\u001F%x\u001F%X\u001F%y\u001F%Y\u001F%z\u001F%Z\u001F%b",
      "-j",
      "123"
    ]
  end

  # Stub Open3.capture3 (for the squeue invocation above, with an empty env
  # and empty stdin) to hand back the given output and exit status.
  #
  # @param stdout [String] what the fake squeue prints on stdout
  # @param stderr [String] what the fake squeue prints on stderr
  # @param success [Boolean] value the fake process status returns for success?
  def stub_squeue(stdout:, stderr:, success:)
    exit_status = double("success?" => success)
    capture = receive(:capture3)
              .with({}, *squeue_args, stdin_data: "")
              .and_return([stdout, stderr, exit_status])

    allow(Open3).to(capture)
  end

  it "handles Slurm socket timeouts" do
    # squeue exits successfully but reports the time-out on stderr.
    stub_squeue(
      stdout: "CLUSTER: saturn",
      stderr: "slurm_load_jobs error: Socket timed out on send/recv operation",
      success: true
    )

    undetermined = [{ id: '123', state: 'undetermined' }]
    expect(batch.get_jobs(id: '123')).to eq(undetermined)
  end

  it "still propogates non-zero exitting errors" do
    # A failing exit status must surface as the adapter's generic error.
    stub_squeue(stdout: "", stderr: "Some unhandled error", success: false)

    expect { batch.get_jobs(id: '123') }.to raise_error(Slurm::Batch::Error)
  end
end

end

describe "customizing bin paths" do
Expand Down

0 comments on commit a7eeac1

Please sign in to comment.