Commit f11417a: Fix per-pc resource limits
severinson committed Jun 22, 2023
1 parent f869cea
Showing 19 changed files with 768 additions and 596 deletions.
6 changes: 1 addition & 5 deletions internal/armada/configuration/types.go
@@ -233,12 +233,8 @@ type PriorityClass struct {
 	Priority int32
 	// If true, Armada may preempt jobs of this class to improve fairness.
 	Preemptible bool
-	// Limits resources assigned to jobs of priority equal to or lower than that of this priority class.
+	// Limits resources assigned to jobs of this priority class.
 	// Specifically, jobs of this priority class are only scheduled if doing so does not exceed this limit.
-	//
-	// For example, if priority is 10 and MaximumResourceFractionPerQueue is map[string]float64{"cpu": 0.3},
-	// jobs of this priority class are not scheduled if doing so would cause the total resources assigned
-	// to jobs of priority 10 or lower from the same queue to exceed 30% of the total.
	MaximumResourceFractionPerQueue map[string]float64
 	// Per-pool override of MaximumResourceFractionPerQueue.
 	// If missing for a particular pool, MaximumResourceFractionPerQueue is used instead for that pool.
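To make the revised comment concrete: under the new semantics, the limit applies to this priority class alone rather than to all classes at or below its numeric priority. A minimal, self-contained sketch; the type below is a stand-in mirroring the fields shown above, and the values are illustrative, not taken from this commit:

package main

import "fmt"

// PriorityClass is a stand-in mirroring the fields shown in the diff above.
type PriorityClass struct {
	Priority                        int32
	Preemptible                     bool
	MaximumResourceFractionPerQueue map[string]float64
}

func main() {
	// Illustrative values: under the new semantics, this caps jobs of this
	// priority class alone (not jobs of lower priority too) at 30% of the
	// pool's CPU per queue.
	pc := PriorityClass{
		Priority:                        10,
		Preemptible:                     true,
		MaximumResourceFractionPerQueue: map[string]float64{"cpu": 0.3},
	}
	fmt.Printf("%+v\n", pc)
}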
205 changes: 121 additions & 84 deletions internal/armada/server/lease.go
@@ -40,6 +40,7 @@ import (
 	schedulerconstraints "github.com/armadaproject/armada/internal/scheduler/constraints"
 	schedulercontext "github.com/armadaproject/armada/internal/scheduler/context"
 	"github.com/armadaproject/armada/internal/scheduler/database"
+	"github.com/armadaproject/armada/internal/scheduler/interfaces"
 	schedulerinterfaces "github.com/armadaproject/armada/internal/scheduler/interfaces"
 	"github.com/armadaproject/armada/internal/scheduler/nodedb"
 	"github.com/armadaproject/armada/internal/scheduler/schedulerobjects"
@@ -282,10 +283,31 @@ func (q *AggregatedQueueServer) getJobs(ctx context.Context, req *api.StreamingL
 		})
 	}
 
+	// Map queue names to priority factor for all active queues, i.e.,
+	// all queues for which the jobs queue has not been deleted automatically by Redis.
+	queues, err := q.queueRepository.GetAllQueues()
+	if err != nil {
+		return nil, err
+	}
+	priorityFactorByQueue := make(map[string]float64, len(queues))
+	apiQueues := make([]*api.Queue, len(queues))
+	for i, queue := range queues {
+		priorityFactorByQueue[queue.Name] = float64(queue.PriorityFactor)
+		apiQueues[i] = &api.Queue{Name: queue.Name}
+	}
+	activeQueues, err := q.jobRepository.FilterActiveQueues(apiQueues)
+	if err != nil {
+		return nil, err
+	}
+	priorityFactorByActiveQueue := make(map[string]float64, len(activeQueues))
+	for _, queue := range activeQueues {
+		priorityFactorByActiveQueue[queue.Name] = priorityFactorByQueue[queue.Name]
+	}
+
 	// Nodes to be considered by the scheduler.
 	lastSeen := q.clock.Now()
 	nodes := make([]*schedulerobjects.Node, 0, len(req.Nodes))
-	allocatedByQueueForCluster := make(map[string]schedulerobjects.QuantityByPriorityAndResourceType)
+	allocatedByQueueAndPriorityClassForCluster := make(map[string]schedulerobjects.QuantityByTAndResourceType[string], len(queues))
 	jobIdsByGangId := make(map[string]map[string]bool)
 	gangIdByJobId := make(map[string]string)
 	nodeIdByJobId := make(map[string]string)
@@ -332,11 +354,9 @@ func (q *AggregatedQueueServer) getJobs(ctx context.Context, req *api.StreamingL
 		}
 
 		// Aggregate total resources allocated by queue for this cluster.
-		allocatedByQueueForCluster = scheduler.UpdateUsage(
-			allocatedByQueueForCluster,
-			jobs,
-			q.schedulingConfig.Preemption.PriorityClasses,
-			scheduler.Add,
+		allocatedByQueueAndPriorityClassForCluster = updateAllocatedByQueueAndPriorityClass(
+			allocatedByQueueAndPriorityClassForCluster,
+			add, jobs,
 		)
 
 		// Group gangs.
@@ -398,30 +418,36 @@ func (q *AggregatedQueueServer) getJobs(ctx context.Context, req *api.StreamingL
 		return nil, err
 	}
 
-	// Load executor reports for all clusters, and insert an updated report for this cluster.
+	// Load allocation reports for all executors from Redis.
 	reportsByExecutor, err := q.usageRepository.GetClusterQueueResourceUsage()
 	if err != nil {
 		return nil, err
 	}
-	executorReport := &schedulerobjects.ClusterResourceUsageReport{
+
+	// Insert an updated report for the current executor, which includes information received in this lease call.
+	currentExecutorReport := &schedulerobjects.ClusterResourceUsageReport{
 		Pool:    req.Pool,
 		Created: q.clock.Now(),
-		ResourcesByQueue: make(map[string]*schedulerobjects.QueueClusterResourceUsage),
+		ResourcesByQueue: make(map[string]*schedulerobjects.QueueClusterResourceUsage, len(queues)),
 	}
-	for queue, allocated := range allocatedByQueueForCluster {
-		executorReport.ResourcesByQueue[queue] = &schedulerobjects.QueueClusterResourceUsage{
-			Created:             executorReport.Created,
-			Queue:               queue,
-			ExecutorId:          req.ClusterId,
-			ResourcesByPriority: allocated.DeepCopy(),
+	for queue, allocatedByPriorityClass := range allocatedByQueueAndPriorityClassForCluster {
+		currentExecutorReport.ResourcesByQueue[queue] = &schedulerobjects.QueueClusterResourceUsage{
+			Created:                      currentExecutorReport.Created,
+			Queue:                        queue,
+			ExecutorId:                   req.ClusterId,
+			ResourcesByPriorityClassName: armadamaps.DeepCopy(allocatedByPriorityClass),
 		}
 	}
-	reportsByExecutor[req.ClusterId] = executorReport
-	if err := q.usageRepository.UpdateClusterQueueResourceUsage(req.ClusterId, executorReport); err != nil {
+	reportsByExecutor[req.ClusterId] = currentExecutorReport
+
+	// Write the updated report into Redis to make the information available to other replicas of the server.
+	if err := q.usageRepository.UpdateClusterQueueResourceUsage(req.ClusterId, currentExecutorReport); err != nil {
 		return nil, errors.WithMessagef(err, "failed to update cluster usage for cluster %s", req.ClusterId)
 	}
-	allocatedByQueueForPool := q.aggregateUsage(reportsByExecutor, req.Pool)
-	log.Infof("allocated resources per queue for pool %s before scheduling: %v", req.Pool, allocatedByQueueForPool)
+
+	// Aggregate allocation across all clusters.
+	allocatedByQueueAndPriorityClassForPool := q.aggregateAllocationAcrossExecutor(reportsByExecutor, req.Pool)
+	log.Infof("allocated resources per queue for pool %s before scheduling: %v", req.Pool, allocatedByQueueAndPriorityClassForPool)
 
 	// Store executor details in Redis so they can be used by submit checks and the new scheduler.
 	if err := q.executorRepository.StoreExecutor(ctx, &schedulerobjects.Executor{
@@ -435,29 +461,7 @@ func (q *AggregatedQueueServer) getJobs(ctx context.Context, req *api.StreamingL
 		log.WithError(err).Warnf("could not store executor details for cluster %s", req.ClusterId)
 	}
 
-	// Map queue names to priority factor for all active queues, i.e.,
-	// all queues for which the jobs queue has not been deleted automatically by Redis.
-	queues, err := q.queueRepository.GetAllQueues()
-	if err != nil {
-		return nil, err
-	}
-	priorityFactorByQueue := make(map[string]float64, len(queues))
-	apiQueues := make([]*api.Queue, len(queues))
-	for i, queue := range queues {
-		priorityFactorByQueue[queue.Name] = float64(queue.PriorityFactor)
-		apiQueues[i] = &api.Queue{Name: queue.Name}
-	}
-	activeQueues, err := q.jobRepository.FilterActiveQueues(apiQueues)
-	if err != nil {
-		return nil, err
-	}
-	priorityFactorByActiveQueue := make(map[string]float64, len(activeQueues))
-	for _, queue := range activeQueues {
-		priorityFactorByActiveQueue[queue.Name] = priorityFactorByQueue[queue.Name]
-	}
-
-	// Give Schedule() a 3 second shorter deadline than ctx,
-	// to give it a chance to finish up before ctx is cancelled.
+	// Give Schedule() a 3 second shorter deadline than ctx to give it a chance to finish up before ctx deadline.
 	if deadline, ok := ctx.Deadline(); ok {
 		var cancel context.CancelFunc
 		ctx, cancel = context.WithDeadline(ctx, deadline.Add(-3*time.Second))
@@ -473,7 +477,7 @@ func (q *AggregatedQueueServer) getJobs(ctx context.Context, req *api.StreamingL
 		schedulerobjects.ResourceList{Resources: totalCapacity},
 	)
 	for queue, priorityFactor := range priorityFactorByQueue {
-		if err := sctx.AddQueueSchedulingContext(queue, priorityFactor, allocatedByQueueForPool[queue]); err != nil {
+		if err := sctx.AddQueueSchedulingContext(queue, priorityFactor, allocatedByQueueAndPriorityClassForPool[queue]); err != nil {
 			return nil, err
 		}
 	}
@@ -629,38 +633,37 @@ func (q *AggregatedQueueServer) getJobs(ctx context.Context, req *api.StreamingL
 	}
 
 	// Update resource cluster report to account for preempted/leased jobs and write it to Redis.
-	allocatedByQueueForCluster = scheduler.UpdateUsage(
-		allocatedByQueueForCluster,
-		result.PreemptedJobs,
-		q.schedulingConfig.Preemption.PriorityClasses,
-		scheduler.Subtract,
+	allocatedByQueueAndPriorityClassForCluster = updateAllocatedByQueueAndPriorityClass(
+		allocatedByQueueAndPriorityClassForCluster,
+		subtract, result.PreemptedJobs,
 	)
-	for queue, m := range allocatedByQueueForCluster {
+	for queue, m := range allocatedByQueueAndPriorityClassForCluster {
 		// Any quantity in the negative indicates a resource accounting problem.
-		if !m.IsStrictlyNonNegative() {
-			log.Errorf("unexpected negative resource quantity for queue %s: %v", queue, m)
+		for _, rl := range m {
+			if !rl.IsStrictlyNonNegative() {
+				return nil, errors.Errorf("unexpected negative resource quantity for queue %s: %v", queue, m)
+			}
 		}
 	}
-	allocatedByQueueForCluster = scheduler.UpdateUsage(
-		allocatedByQueueForCluster,
-		successfullyLeasedApiJobs,
-		q.schedulingConfig.Preemption.PriorityClasses,
-		scheduler.Add,
+	allocatedByQueueAndPriorityClassForCluster = updateAllocatedByQueueAndPriorityClass(
+		allocatedByQueueAndPriorityClassForCluster,
+		add, successfullyLeasedApiJobs,
 	)
-	executorReport.Created = q.clock.Now()
-	for queue, usage := range allocatedByQueueForCluster {
-		executorReport.ResourcesByQueue[queue] = &schedulerobjects.QueueClusterResourceUsage{
-			Created:             executorReport.Created,
-			Queue:               queue,
-			ExecutorId:          req.ClusterId,
-			ResourcesByPriority: usage.DeepCopy(),
+	currentExecutorReport.Created = q.clock.Now()
+	for queue, usage := range allocatedByQueueAndPriorityClassForCluster {
+		currentExecutorReport.ResourcesByQueue[queue] = &schedulerobjects.QueueClusterResourceUsage{
+			Created:                      currentExecutorReport.Created,
+			Queue:                        queue,
+			ExecutorId:                   req.ClusterId,
+			ResourcesByPriorityClassName: armadamaps.DeepCopy(usage),
 		}
 	}
-	if err := q.usageRepository.UpdateClusterQueueResourceUsage(req.ClusterId, executorReport); err != nil {
+	if err := q.usageRepository.UpdateClusterQueueResourceUsage(req.ClusterId, currentExecutorReport); err != nil {
 		logging.WithStacktrace(log, err).Errorf("failed to update cluster usage")
 	}
-	allocatedByQueueForPool = q.aggregateUsage(reportsByExecutor, req.Pool)
-	log.Infof("allocated resources per queue for pool %s after scheduling: %v", req.Pool, allocatedByQueueForPool)
+
+	allocatedByQueueAndPriorityClassForPool = q.aggregateAllocationAcrossExecutor(reportsByExecutor, req.Pool)
+	log.Infof("allocated resources per queue for pool %s after scheduling: %v", req.Pool, allocatedByQueueAndPriorityClassForPool)
 
 	// Optionally set node id selectors on scheduled jobs.
 	if q.schedulingConfig.Preemption.SetNodeIdSelector {
@@ -742,31 +745,65 @@ func (q *AggregatedQueueServer) getJobs(ctx context.Context, req *api.StreamingL
 	return successfullyLeasedApiJobs, nil
 }
 
-// aggregateUsage Creates a map of resource usage first by cluster and then by queue.
-// Clusters in pools other than pool are excluded.
-func (q *AggregatedQueueServer) aggregateUsage(reportsByCluster map[string]*schedulerobjects.ClusterResourceUsageReport, pool string) map[string]schedulerobjects.QuantityByPriorityAndResourceType {
-	const activeClusterExpiry = 10 * time.Minute
+type addOrSubtract int
+
+const (
+	add addOrSubtract = iota
+	subtract
+)
+
+func updateAllocatedByQueueAndPriorityClass[T interfaces.LegacySchedulerJob](allocatedByQueueAndPriorityClass map[string]schedulerobjects.QuantityByTAndResourceType[string], op addOrSubtract, jobs []T) map[string]schedulerobjects.QuantityByTAndResourceType[string] {
[CI annotation: GitHub Actions / go-lint (1.20) flags the signature above (line 755 in internal/armada/server/lease.go): line is 266 characters (lll)]
+	if allocatedByQueueAndPriorityClass == nil {
+		allocatedByQueueAndPriorityClass = make(map[string]schedulerobjects.QuantityByTAndResourceType[string], 256)
+	}
+	for _, job := range jobs {
+		allocatedByPriorityClassName := allocatedByQueueAndPriorityClass[job.GetQueue()]
+		if allocatedByPriorityClassName == nil {
+			allocatedByPriorityClassName = make(map[string]schedulerobjects.ResourceList)
+			allocatedByQueueAndPriorityClass[job.GetQueue()] = allocatedByPriorityClassName
+		}
+		allocated := allocatedByPriorityClassName[job.GetPriorityClassName()]
+		if op == add {
+			allocated.AddV1ResourceList(job.GetResourceRequirements().Requests)
+		} else if op == subtract {
+			allocated.SubV1ResourceList(job.GetResourceRequirements().Requests)
+		} else {
+			panic(fmt.Sprintf("unknown op %d", op))
+		}
+		allocatedByPriorityClassName[job.GetPriorityClassName()] = allocated
+	}
+	return allocatedByQueueAndPriorityClass
+}
+
+func (q *AggregatedQueueServer) aggregateAllocationAcrossExecutor(reportsByExecutor map[string]*schedulerobjects.ClusterResourceUsageReport, pool string) map[string]schedulerobjects.QuantityByTAndResourceType[string] {
 	now := q.clock.Now()
-	aggregatedUsageByQueue := make(map[string]schedulerobjects.QuantityByPriorityAndResourceType)
-	for _, clusterReport := range reportsByCluster {
-		if clusterReport.Pool != pool {
+	// Separate resource accounting per pool.
+	allocatedByQueueAndPriorityClass := make(map[string]schedulerobjects.QuantityByTAndResourceType[string])
+	for _, executorReport := range reportsByExecutor {
+		if executorReport.Pool != pool {
+			// Only consider executors in the specified pool.
 			continue
 		}
-		if !clusterReport.Created.Add(activeClusterExpiry).After(now) {
-			// Stale report; omit.
-			continue
+		if q.schedulingConfig.ExecutorTimeout != 0 {
+			reportAge := now.Sub(executorReport.Created)
+			if reportAge > q.schedulingConfig.ExecutorTimeout {
+				// Stale report; omit.
+				continue
+			}
 		}
-		for queue, report := range clusterReport.ResourcesByQueue {
-			quantityByPriorityAndResourceType, ok := aggregatedUsageByQueue[queue]
-			if !ok {
-				quantityByPriorityAndResourceType = make(schedulerobjects.QuantityByPriorityAndResourceType)
-				aggregatedUsageByQueue[queue] = quantityByPriorityAndResourceType
+		for queue, queueReport := range executorReport.ResourcesByQueue {
+			allocatedByPriorityClass := allocatedByQueueAndPriorityClass[queue]
+			if allocatedByPriorityClass == nil {
+				allocatedByPriorityClass = make(map[string]schedulerobjects.ResourceList)
+				allocatedByQueueAndPriorityClass[queue] = allocatedByPriorityClass
 			}
-			quantityByPriorityAndResourceType.Add(report.ResourcesByPriority)
+			for priorityClassName, allocated := range queueReport.ResourcesByPriorityClassName {
+				rl := allocatedByPriorityClass[priorityClassName]
+				rl.Add(allocated)
+				allocatedByPriorityClass[priorityClassName] = rl
+			}
 		}
 	}
-	return aggregatedUsageByQueue
+	return allocatedByQueueAndPriorityClass
 }
 
 func (q *AggregatedQueueServer) decompressJobOwnershipGroups(jobs []*api.Job) error {
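The central change in lease.go above is the bookkeeping key: allocation is now tracked per queue and priority class name (a string) instead of per queue and numeric priority. A self-contained sketch of the resulting shape, using stand-in types that mirror the schedulerobjects definitions (names and values here are illustrative):

package main

import (
	"fmt"

	"k8s.io/apimachinery/pkg/api/resource"
)

// Stand-ins for the schedulerobjects types used above; the real
// definitions live in internal/scheduler/schedulerobjects.
type ResourceList struct {
	Resources map[string]resource.Quantity
}

type QuantityByTAndResourceType[T comparable] map[T]ResourceList

func main() {
	// After this commit, per-cluster accounting is keyed
	// queue -> priority class name -> resource name -> quantity.
	allocated := map[string]QuantityByTAndResourceType[string]{
		"queue-a": {
			"armada-default": {Resources: map[string]resource.Quantity{
				"cpu":    resource.MustParse("8"),
				"memory": resource.MustParse("32Gi"),
			}},
		},
	}
	fmt.Println(allocated["queue-a"]["armada-default"].Resources["cpu"].String())
}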
4 changes: 2 additions & 2 deletions internal/executor/utilisation/cluster_utilisation.go
@@ -281,8 +281,8 @@ func groupPodsByNodes(pods []*v1.Pod) map[string][]*v1.Pod {
 	return podsByNodes
 }
 
-func allocatedByPriorityAndResourceTypeFromPods(pods []*v1.Pod) schedulerobjects.QuantityByPriorityAndResourceType {
-	rv := make(schedulerobjects.QuantityByPriorityAndResourceType)
+func allocatedByPriorityAndResourceTypeFromPods(pods []*v1.Pod) schedulerobjects.QuantityByTAndResourceType[int32] {
+	rv := make(schedulerobjects.QuantityByTAndResourceType[int32])
 	for _, pod := range pods {
 		var priority int32 = 0
 		if pod.Spec.Priority != nil {
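On the executor side the map stays keyed by numeric pod priority; only the key type of the now-generic map changed. A minimal sketch of the priority-key extraction visible in this hunk, assuming the standard Kubernetes API types (the real function additionally sums container resource requests into rv):

package main

import (
	"fmt"

	v1 "k8s.io/api/core/v1"
)

// podPriority mirrors the nil-check above: pods without an explicit
// priority are accounted under priority 0.
func podPriority(pod *v1.Pod) int32 {
	var priority int32 = 0
	if pod.Spec.Priority != nil {
		priority = *pod.Spec.Priority
	}
	return priority
}

func main() {
	p := int32(100)
	fmt.Println(podPriority(&v1.Pod{}))                               // 0
	fmt.Println(podPriority(&v1.Pod{Spec: v1.PodSpec{Priority: &p}})) // 100
}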
40 changes: 0 additions & 40 deletions internal/scheduler/common.go
@@ -116,46 +116,6 @@ func JobsSummary(jobs []interfaces.LegacySchedulerJob) string {
 	)
 }
 
-type AddOrSubtract int
-
-const (
-	Add AddOrSubtract = iota
-	Subtract
-)
-
-func UpdateUsage[S ~[]E, E interfaces.LegacySchedulerJob](
-	usage map[string]schedulerobjects.QuantityByPriorityAndResourceType,
-	jobs S,
-	priorityClasses map[string]configuration.PriorityClass,
-	addOrSubtract AddOrSubtract,
-) map[string]schedulerobjects.QuantityByPriorityAndResourceType {
-	if usage == nil {
-		usage = make(map[string]schedulerobjects.QuantityByPriorityAndResourceType)
-	}
-	for _, job := range jobs {
-		req := PodRequirementFromLegacySchedulerJob(job, priorityClasses)
-		if req == nil {
-			continue
-		}
-		requests := schedulerobjects.ResourceListFromV1ResourceList(req.ResourceRequirements.Requests)
-		queue := job.GetQueue()
-		m := usage[queue]
-		if m == nil {
-			m = make(schedulerobjects.QuantityByPriorityAndResourceType)
-		}
-		switch addOrSubtract {
-		case Add:
-			m.Add(schedulerobjects.QuantityByPriorityAndResourceType{req.Priority: requests})
-		case Subtract:
-			m.Sub(schedulerobjects.QuantityByPriorityAndResourceType{req.Priority: requests})
-		default:
-			panic(fmt.Sprintf("invalid operation %d", addOrSubtract))
-		}
-		usage[queue] = m
-	}
-	return usage
-}
-
 func jobSchedulingContextsFromJobs[T interfaces.LegacySchedulerJob](jobs []T, executorId string, priorityClasses map[string]configuration.PriorityClass) []*schedulercontext.JobSchedulingContext {
 	if jobs == nil {
 		return nil
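The UpdateUsage helper deleted here is superseded by updateAllocatedByQueueAndPriorityClass in internal/armada/server/lease.go. The essence of the change is the accounting key, sketched below with stand-in types (the real ones live in internal/scheduler/schedulerobjects):

package sketch

import "k8s.io/apimachinery/pkg/api/resource"

type ResourceList struct{ Resources map[string]resource.Quantity }

// Before this commit: queue -> numeric priority -> resources
// (schedulerobjects.QuantityByPriorityAndResourceType).
type allocatedByQueueAndPriority map[string]map[int32]ResourceList

// After this commit: queue -> priority class name -> resources
// (schedulerobjects.QuantityByTAndResourceType[string]).
type allocatedByQueueAndPriorityClass map[string]map[string]ResourceList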
