Skip to content

Commit

Permalink
Fix Metrics (#3812)
Browse files Browse the repository at this point in the history
* fix node label bug (#181)

* lint

Signed-off-by: Chris Martin <chris@cmartinit.co.uk>

* lint

Signed-off-by: Chris Martin <chris@cmartinit.co.uk>

---------

Signed-off-by: Chris Martin <chris@cmartinit.co.uk>
Co-authored-by: Christopher Martin <Chris.Martin@gresearch.co.uk>
Co-authored-by: Chris Martin <chris@cmartinit.co.uk>
  • Loading branch information
3 people authored Jul 23, 2024
1 parent 7d4ed88 commit 27a9e3b
Show file tree
Hide file tree
Showing 2 changed files with 66 additions and 14 deletions.
5 changes: 2 additions & 3 deletions internal/scheduler/metrics/metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -393,7 +393,6 @@ func appendLabelsFromJob(labels []string, job *jobdb.Job) []string {
executor := executorNameFromRun(job.LatestRun())
labels = append(labels, job.Queue())
labels = append(labels, executor)
labels = append(labels, "") // No nodeType.
return labels
}

Expand Down Expand Up @@ -498,7 +497,7 @@ func (m *Metrics) counterVectorsFromResource(resource v1.ResourceName) (*prometh
Name: name,
Help: resource.String() + "resource counter.",
},
[]string{"state", "category", "subCategory", "queue", "cluster", "nodeType", "node"},
[]string{"state", "category", "subCategory", "queue", "cluster"},
)
m.resourceCounters[resource] = c
}
Expand All @@ -514,7 +513,7 @@ func (m *Metrics) counterVectorsFromResource(resource v1.ResourceName) (*prometh
Name: name,
Help: resource.String() + "-second resource counter.",
},
[]string{"priorState", "state", "category", "subCategory", "queue", "cluster", "nodeType", "node"},
[]string{"priorState", "state", "category", "subCategory", "queue", "cluster"},
)
m.resourceCounters[resourceSeconds] = cSeconds
}
Expand Down
75 changes: 64 additions & 11 deletions internal/scheduler/metrics/metrics_test.go
Original file line number Diff line number Diff line change
@@ -1,21 +1,74 @@
package metrics

import (
"regexp"
"testing"
"time"

"github.com/stretchr/testify/assert"
"github.com/google/uuid"
"github.com/stretchr/testify/require"
v1 "k8s.io/api/core/v1"

"github.com/armadaproject/armada/internal/common/armadacontext"
"github.com/armadaproject/armada/internal/scheduler/configuration"
"github.com/armadaproject/armada/internal/scheduler/context"
"github.com/armadaproject/armada/internal/scheduler/schedulerobjects"
"github.com/armadaproject/armada/internal/scheduler/testfixtures"
"github.com/armadaproject/armada/pkg/armadaevents"
)

func TestFoo(t *testing.T) {
r, err := regexp.Compile("foo.*bar")
// TestUpdate is a smoke test for the Metrics update methods: it builds a
// Metrics instance that tracks the "cpu" resource, derives one job in each
// lifecycle state (queued, leased, pending, running, finished, preempted) and
// requires that every Update* call returns a nil error. It does not inspect
// the metric values that are recorded.
func TestUpdate(t *testing.T) {
ctx := armadacontext.Background()

// No error regexes are tracked; only the "cpu" resource gets counters.
metrics, err := New(configuration.MetricsConfig{
TrackedErrorRegexes: nil,
TrackedResourceNames: []v1.ResourceName{"cpu"},
ResetInterval: 24 * time.Hour,
})
require.NoError(t, err)
// NOTE(review): the assert.* lines below reference `r`, which is not defined
// in this function; they look like removed lines of the former TestFoo that
// the diff rendering left in place — confirm against the actual file.
assert.True(t, r.MatchString("foobar"))
assert.True(t, r.MatchString("foo bar"))
assert.True(t, r.MatchString("foo and bar"))
assert.True(t, r.MatchString("this is foo and bar so"))
assert.False(t, r.MatchString("barfoo"))
assert.False(t, r.MatchString("foo"))
assert.False(t, r.MatchString("bar"))

// Base instant; each later state transition is stamped one second after the
// previous one via addSeconds.
now := time.Now()

// The job id is a freshly generated UUID string, so it can be parsed back
// with uuid.MustParse below.
queuedJob := testfixtures.NewJob(uuid.NewString(),
"test-jobset",
"test-queue",
1,
&schedulerobjects.JobSchedulingInfo{},
true,
0,
false,
false,
false,
time.Now().UnixNano(),
true)

// NOTE(review): the map is named "by run id" but the key is derived from the
// job id (queuedJob.Id()) — confirm whether UpdateFailed is expected to find
// this entry.
jobRunErrorsByRunId := map[uuid.UUID]*armadaevents.Error{
uuid.MustParse(queuedJob.Id()): {
Terminal: true,
Reason: &armadaevents.Error_PodError{
PodError: &armadaevents.PodError{
Message: "my error",
},
},
},
}

// Each job below is derived from the previous one by updating its latest run
// with the next timestamp.
leasedJob := queuedJob.WithNewRun("test-executor", "node1", "test-node", "test-pool", 1)
pendingJob := leasedJob.WithUpdatedRun(leasedJob.LatestRun().WithPendingTime(addSeconds(now, 1)))
runningJob := pendingJob.WithUpdatedRun(pendingJob.LatestRun().WithRunningTime(addSeconds(now, 2)))
finishedJob := runningJob.WithUpdatedRun(runningJob.LatestRun().WithTerminatedTime(addSeconds(now, 3)))
// NOTE(review): the preempted run is built from runningJob.LatestRun(), not
// finishedJob.LatestRun(), unlike the chain above — confirm this is intended.
preemptedJob := finishedJob.WithUpdatedRun(runningJob.LatestRun().WithPreemptedTime(addSeconds(now, 4)))

// The same finishedJob is reused for the succeeded, cancelled and failed
// updates; only the absence of an error is checked.
require.NoError(t, metrics.UpdateQueued(queuedJob))
require.NoError(t, metrics.UpdateLeased(context.JobSchedulingContextFromJob(leasedJob)))
require.NoError(t, metrics.UpdatePending(pendingJob))
require.NoError(t, metrics.UpdateRunning(runningJob))
require.NoError(t, metrics.UpdateSucceeded(finishedJob))
require.NoError(t, metrics.UpdateCancelled(finishedJob))
require.NoError(t, metrics.UpdateFailed(ctx, finishedJob, jobRunErrorsByRunId))
require.NoError(t, metrics.UpdatePreempted(preemptedJob))
}

// addSeconds returns a pointer to a new time.Time equal to t shifted by the
// given number of seconds (backwards when seconds is negative). The caller's
// value is not modified because t is passed by value.
func addSeconds(t time.Time, seconds int) *time.Time {
	shifted := t.Add(time.Second * time.Duration(seconds))
	return &shifted
}

0 comments on commit 27a9e3b

Please sign in to comment.