From 94ef6d4d9f259ea9127c02875b248d357a635d92 Mon Sep 17 00:00:00 2001
From: Aday Bujeda
Date: Wed, 23 Oct 2024 12:39:32 +0100
Subject: [PATCH 1/3] Added support to execute sacct to get job historic data for metrics

---
 lib/ood_core/job/adapters/slurm.rb | 60 ++++++++++++++++++++++++++++++
 1 file changed, 60 insertions(+)

diff --git a/lib/ood_core/job/adapters/slurm.rb b/lib/ood_core/job/adapters/slurm.rb
index 72f11121..a920395e 100644
--- a/lib/ood_core/job/adapters/slurm.rb
+++ b/lib/ood_core/job/adapters/slurm.rb
@@ -323,6 +323,38 @@ def all_squeue_fields
             }
           end
 
+          # Metrics fields requested from a formatted `sacct` call
+          def sacct_metrics_fields
+            {
+              # The user name of the user who ran the job.
+              user: 'User',
+              # Job Id for reference
+              job_id: 'JobId',
+              # The job's elapsed time.
+              elapsed: 'Elapsed',
+              # Minimum required memory for the job
+              req_mem: 'ReqMem',
+              # Count of allocated CPUs
+              alloc_cpus: 'AllocCPUS',
+              # Number of requested CPUs.
+              req_cpus: 'ReqCPUS',
+              # What the timelimit was/is for the job
+              time_limit: 'Timelimit',
+              # Displays the job status, or state
+              state: 'State',
+              # The sum of the SystemCPU and UserCPU time used by the job or job step
+              total_cpu: 'TotalCPU',
+              # Maximum resident set size of all tasks in job.
+              max_rss: 'MaxRSS',
+              # The time the job was submitted. In the same format as End.
+              submit: 'Submit',
+              # Initiation time of the job. In the same format as End.
+              start: 'Start',
+              # Trackable resources. These are the minimum resource counts requested by the job/step at submission time.
+              req_tres: 'ReqTRES'
+            }
+          end
+
           def queues
             info_raw = call('scontrol', 'show', 'part', '-o')
 
@@ -357,6 +389,30 @@ def nodes
             end.compact
           end
 
+          def sacct_metrics(job_ids, states, from, to)
+            #https://slurm.schedmd.com/sacct.html
+            fields = sacct_metrics_fields
+            args = ['-P'] # Output will be delimited
+            args.concat ['--delimiter', UNIT_SEPARATOR]
+            args.concat ['-n'] # No header
+            args.concat ['--units', 'G'] # Memory units in GB
+            args.concat ['-o', fields.values.join(',')] # Required data
+            args.concat ['--state', states.join(',')] unless states.empty? # Filter by these states
+            args.concat ['-j', job_ids.join(',')] unless job_ids.empty? # Filter by these job ids
+            args.concat ['-S', from] if from # Filter from this date
+            args.concat ['-E', to] if to # Filter until this date
+
+            metrics = []
+            StringIO.open(call('sacct', *args)) do |output|
+              output.each_line do |line|
+                # Replace blank values with nil (plain Ruby: String#blank? would require ActiveSupport)
+                values = line.strip.split(UNIT_SEPARATOR).map { |value| value.strip.empty? ? nil : value }
+                metrics << Hash[fields.keys.zip(values)] unless values.empty?
+              end
+            end
+            metrics
+          end
+
           private
             def str_to_queue_info(line)
               hsh = line.split(' ').map do |token|
@@ -699,6 +755,10 @@ def nodes
           @slurm.nodes
         end
 
+        def sacct_metrics(job_ids: [], states: [], from: nil, to: nil)
+          @slurm.sacct_metrics(job_ids, states, from, to)
+        end
+
         private
           # Convert duration to seconds
           def duration_in_seconds(time)

From 4a7696cd76483aec0f07c2ca294897939e0589aa Mon Sep 17 00:00:00 2001
From: Aday Bujeda
Date: Mon, 28 Oct 2024 12:58:46 +0000
Subject: [PATCH 2/3] Updated sacct call to return an array of info objects

---
 lib/ood_core/job/adapters/slurm.rb | 52 +++++++++++++++++++++++++-----
 1 file changed, 44 insertions(+), 8 deletions(-)

diff --git a/lib/ood_core/job/adapters/slurm.rb b/lib/ood_core/job/adapters/slurm.rb
index a920395e..d1624c33 100644
--- a/lib/ood_core/job/adapters/slurm.rb
+++ b/lib/ood_core/job/adapters/slurm.rb
@@ -330,6 +330,8 @@ def sacct_metrics_fields
               user: 'User',
               # Job Id for reference
               job_id: 'JobId',
+              # The name of the job or job step
+              job_name: 'JobName',
               # The job's elapsed time.
               elapsed: 'Elapsed',
               # Minimum required memory for the job
@@ -346,10 +348,12 @@ def sacct_metrics_fields
               total_cpu: 'TotalCPU',
               # Maximum resident set size of all tasks in job.
               max_rss: 'MaxRSS',
+              # Identifies the partition on which the job ran.
+              partition: 'Partition',
               # The time the job was submitted. In the same format as End.
-              submit: 'Submit',
+              submit_time: 'Submit',
               # Initiation time of the job. In the same format as End.
-              start: 'Start',
+              start_time: 'Start',
               # Trackable resources. These are the minimum resource counts requested by the job/step at submission time.
               req_tres: 'ReqTRES'
             }
@@ -389,13 +393,14 @@ def nodes
             end.compact
           end
 
-          def sacct_metrics(job_ids, states, from, to)
-            #https://slurm.schedmd.com/sacct.html
+          def sacct_metrics(job_ids, states, from, to, show_steps)
+            # https://slurm.schedmd.com/sacct.html
             fields = sacct_metrics_fields
             args = ['-P'] # Output will be delimited
             args.concat ['--delimiter', UNIT_SEPARATOR]
             args.concat ['-n'] # No header
             args.concat ['--units', 'G'] # Memory units in GB
+            args.concat ['--allocations'] unless show_steps # Show statistics relevant to the job, not taking steps into consideration
             args.concat ['-o', fields.values.join(',')] # Required data
             args.concat ['--state', states.join(',')] unless states.empty? # Filter by these states
             args.concat ['-j', job_ids.join(',')] unless job_ids.empty? # Filter by these job ids
@@ -518,8 +523,23 @@ def squeue_attrs_for_info_attrs(attrs)
           'SE' => :completed, # SPECIAL_EXIT
           'ST' => :running, # STOPPED
           'S' => :suspended, # SUSPENDED
-          'TO' => :completed, # TIMEOUT
-          'OOM' => :completed # OUT_OF_MEMORY
+          'TO' => :completed,  # TIMEOUT
+          'OOM' => :completed, # OUT_OF_MEMORY
+
+          'BOOT_FAIL' => :completed,
+          'CANCELLED' => :completed,
+          'COMPLETED' => :completed,
+          'DEADLINE' => :completed,
+          'FAILED' => :completed,
+          'NODE_FAIL' => :completed,
+          'OUT_OF_MEMORY' => :completed,
+          'PENDING' => :queued,
+          'PREEMPTED' => :completed,
+          'RUNNING' => :running,
+          'REQUEUED' => :queued,
+          'REVOKED' => :completed,
+          'SUSPENDED' => :suspended,
+          'TIMEOUT' => :completed,
         }
 
         # @api private
@@ -755,10 +775,26 @@ def nodes
           @slurm.nodes
         end
 
-        def sacct_metrics(job_ids: [], states: [], from: nil, to: nil)
-          @slurm.sacct_metrics(job_ids, states, from, to)
+        def sacct_metrics(job_ids: [], states: [], from: nil, to: nil, show_steps: false)
+          @slurm.sacct_metrics(job_ids, states, from, to, show_steps).map do |v|
+            Info.new(
+              id: v[:job_id],
+              status: get_state(v[:state].to_s.split.first), # "CANCELLED by <uid>" -> "CANCELLED"
+              job_name: v[:job_name],
+              job_owner: v[:user],
+              procs: v[:alloc_cpus],
+              queue_name: v[:partition],
+              wallclock_time: duration_in_seconds(v[:elapsed]),
+              wallclock_limit: duration_in_seconds(v[:time_limit]),
+              cpu_time: duration_in_seconds(v[:total_cpu]),
+              submission_time: v[:submit_time] ? Time.parse(v[:submit_time]) : nil,
+              dispatch_time: (v[:start_time].nil? || v[:start_time] == "N/A") ? nil : Time.parse(v[:start_time]),
+              native: v,
+              gpus: self.class.gpus_from_gres(v[:req_tres])
+            )
+          end
         end
 
         private
           # Convert duration to seconds
           def duration_in_seconds(time)

From a925368673328db953233546cecf2a930bc21d2a Mon Sep 17 00:00:00 2001
From: Aday Bujeda
Date: Tue, 29 Oct 2024 10:00:19 +0000
Subject: [PATCH 3/3] Added custom parse_time method to sacct call

---
 lib/ood_core/job/adapters/slurm.rb | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

diff --git a/lib/ood_core/job/adapters/slurm.rb b/lib/ood_core/job/adapters/slurm.rb
index d1624c33..fffda124 100644
--- a/lib/ood_core/job/adapters/slurm.rb
+++ b/lib/ood_core/job/adapters/slurm.rb
@@ -787,8 +787,8 @@ def sacct_metrics(job_ids: [], states: [], from: nil, to: nil, show_steps: false
               wallclock_time: duration_in_seconds(v[:elapsed]),
               wallclock_limit: duration_in_seconds(v[:time_limit]),
               cpu_time: duration_in_seconds(v[:total_cpu]),
-              submission_time: v[:submit_time] ? Time.parse(v[:submit_time]) : nil,
-              dispatch_time: (v[:start_time].nil? || v[:start_time] == "N/A") ? nil : Time.parse(v[:start_time]),
+              submission_time: parse_time(v[:submit_time]),
+              dispatch_time: parse_time(v[:start_time]),
               native: v,
               gpus: self.class.gpus_from_gres(v[:req_tres])
             )
@@ -809,6 +809,13 @@ def seconds_to_duration(time)
             "%02d:%02d:%02d" % [time/3600, time/60%60, time%60]
           end
 
+          # Parse a Slurm date/time string; nil, blank and N/A, NONE or UNKNOWN values yield nil
+          def parse_time(date_time)
+            return nil if date_time.to_s.empty? || %w[N/A NONE UNKNOWN].include?(date_time.to_s.upcase)
+
+            Time.parse(date_time)
+          end
+
           # Convert host list string to individual nodes
           # "em082"
           # "em[014,055-056,161]"