Skip to content

Commit

Permalink
Push metrics from testsuite
Browse files Browse the repository at this point in the history
  • Loading branch information
severinson committed Sep 13, 2023
1 parent ba1973f commit 83fb1a2
Show file tree
Hide file tree
Showing 3 changed files with 150 additions and 6 deletions.
14 changes: 14 additions & 0 deletions cmd/testsuite/cmd/test.go
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,8 @@ func testCmd(app *testsuite.App) *cobra.Command {
cmd.Flags().String("tests", "", "Test file pattern, e.g., './testcases/*.yaml'.")
cmd.Flags().String("junit", "", "Write a JUnit test report to this path.")
cmd.Flags().String("benchmark", "", "Write a benchmark test report to this path.")
cmd.Flags().String("prometheusPushgatewayUrl", "", "Push metrics to Prometheus pushgateway at this url.")
cmd.Flags().String("prometheusPushgatewayJobName", "armada-testsuite", "Metrics are annotated with with job=prometheusPushGatewayJobName.")
return cmd
}

Expand Down Expand Up @@ -64,6 +66,18 @@ func testCmdRunE(app *testsuite.App) func(cmd *cobra.Command, args []string) err
return errors.New("benchmark report not currently supported")
}

prometheusPushgatewayUrl, err := cmd.Flags().GetString("prometheusPushgatewayUrl")
if err != nil {
return errors.WithStack(err)
}
app.Params.PrometheusPushGatewayUrl = prometheusPushgatewayUrl

prometheusPushgatewayJobName, err := cmd.Flags().GetString("prometheusPushgatewayJobName")
if err != nil {
return errors.WithStack(err)
}
app.Params.PrometheusPushGatewayJobName = prometheusPushgatewayJobName

// Create a context that is cancelled on SIGINT/SIGTERM.
// Ensures test jobs are cancelled on ctrl-C.
ctx, cancel := context.WithCancel(context.Background())
Expand Down
135 changes: 134 additions & 1 deletion internal/testsuite/app.go
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,8 @@ import (
"github.com/hashicorp/go-multierror"
"github.com/mattn/go-zglob"
"github.com/pkg/errors"
"github.com/prometheus/client_golang/prometheus"
"github.com/prometheus/client_golang/prometheus/push"
"github.com/renstrom/shortuuid"
"golang.org/x/sync/errgroup"
apimachineryYaml "k8s.io/apimachinery/pkg/util/yaml"
Expand All @@ -29,6 +31,8 @@ import (
"github.com/armadaproject/armada/pkg/client"
)

// metricsPrefix is prepended to the name of each Prometheus metric
// that describes a test case.
const metricsPrefix = "armada_testsuite_"

type App struct {
// Parameters passed to the CLI by the user.
Params *Params
Expand All @@ -45,7 +49,13 @@ type App struct {
// and that they can be provided either dynamically on a command line, or
// statically in a config file that's reused between command runs.
type Params struct {
// Connection details for the Armada API.
ApiConnectionDetails *client.ApiConnectionDetails
// URL of a Prometheus push gateway.
// If non-empty, metrics containing test results are pushed to this gateway;
// if empty, no metrics are pushed.
PrometheusPushGatewayUrl string
// Job name under which metrics are pushed, i.e., exported metrics are
// annotated with job=PrometheusPushGatewayJobName.
// Must be non-empty. Only read when PrometheusPushGatewayUrl is non-empty.
PrometheusPushGatewayJobName string
}

// New instantiates an App with default parameters, including standard output
Expand Down Expand Up @@ -107,7 +117,7 @@ func TestSpecFromFilePath(filePath string) (*api.TestSpec, error) {
return nil, err
}

// Randomise jobSetName for each test to ensure we're only getting events for this run.
// Randomise job set for each test to ensure we're only getting events for this run.
fileName := filepath.Base(filePath)
fileName = strings.TrimSuffix(fileName, filepath.Ext(fileName))
testSpec.JobSetId = fileName + "-" + shortuuid.New()
Expand All @@ -126,13 +136,119 @@ type TestSuiteReport struct {
TestCaseReports []*TestCaseReport
}

// Describe sends to c the Prometheus metric descriptions of every test case
// report in the suite, in the order the reports are stored.
func (tsr *TestSuiteReport) Describe(c chan<- *prometheus.Desc) {
	reports := tsr.TestCaseReports
	for i := 0; i < len(reports); i++ {
		reports[i].Describe(c)
	}
}

// Collect sends to c the Prometheus metrics of every test case report in the
// suite, in the order the reports are stored.
func (tsr *TestSuiteReport) Collect(c chan<- prometheus.Metric) {
	reports := tsr.TestCaseReports
	for i := 0; i < len(reports); i++ {
		reports[i].Collect(c)
	}
}

// TestCaseReport holds the outcome of a single test case and exposes it as
// Prometheus metrics via its Describe and Collect methods.
type TestCaseReport struct {
// Buffer capturing the output written while the test case runs.
// Not set by NewTestCaseReport; the caller must assign it.
Out *bytes.Buffer
// Time at which the test case started. Set by NewTestCaseReport.
Start time.Time
// Time at which the test case finished.
Finish time.Time
// Empty if no failure was recorded. When non-empty, the metrics code
// classifies the test case as either failed or timed out based on this text.
FailureReason string
// NOTE(review): presumably benchmark results for this test case;
// not populated by the code visible here — confirm against the test runner.
BenchmarkReport *eventbenchmark.TestCaseBenchmarkReport
// Specification of the test case this report is about.
// Its Name is used as the value of the "testcase" metric label.
TestSpec *api.TestSpec

// Prometheus metric descriptions.
// Test start time in seconds since the epoch.
startTimePrometheusDesc *prometheus.Desc
// Test finish time in seconds since the epoch.
finishTimePrometheusDesc *prometheus.Desc
// Outputs 1 on test timeout and 0 otherwise.
testTimeoutPrometheusDesc *prometheus.Desc
// Outputs 1 on test failure, not including timeouts, and 0 otherwise.
testFailurePrometheusDesc *prometheus.Desc
}

// NewTestCaseReport returns a report for testSpec with Start set to the
// current time and the Prometheus metric descriptions initialised.
// All other fields, including Out, are left at their zero values.
func NewTestCaseReport(testSpec *api.TestSpec) *TestCaseReport {
	report := new(TestCaseReport)
	report.Start = time.Now()
	report.TestSpec = testSpec
	report.initialiseMetrics()
	return report
}

// initialiseMetrics creates the Prometheus descriptions of the four
// per-test-case metrics. Every metric name starts with metricsPrefix and
// every metric carries a single variable label, "testcase".
func (r *TestCaseReport) initialiseMetrics() {
	// describe builds one description; a fresh label slice is allocated per
	// call so that descriptions never share backing storage.
	describe := func(name, help string) *prometheus.Desc {
		return prometheus.NewDesc(metricsPrefix+name, help, []string{"testcase"}, nil)
	}

	r.startTimePrometheusDesc = describe(
		"test_start_time",
		"The time at which a test started.",
	)
	r.finishTimePrometheusDesc = describe(
		"test_finish_time",
		"The time at which a test finished.",
	)
	r.testTimeoutPrometheusDesc = describe(
		"test_timeout",
		"Outputs 1 on test timeout and 0 otherwise.",
	)
	r.testFailurePrometheusDesc = describe(
		"test_failure",
		"Outputs 1 on test failure, not including timeout, and 0 otherwise.",
	)
}

// Describe sends to c the descriptions of the metrics exported for this
// test case: start time, finish time, timeout, and failure, in that order.
func (r *TestCaseReport) Describe(c chan<- *prometheus.Desc) {
	descs := [...]*prometheus.Desc{
		r.startTimePrometheusDesc,
		r.finishTimePrometheusDesc,
		r.testTimeoutPrometheusDesc,
		r.testFailurePrometheusDesc,
	}
	for _, desc := range descs {
		c <- desc
	}
}

// Collect sends to c one sample of each metric exported for this test case,
// labelled with the test case name (r.TestSpec.Name). The samples are sent in
// the order: start time, finish time, failure, timeout.
//
// The failure and timeout gauges are mutually exclusive: a non-empty
// FailureReason is reported as a failure if it matches one of the known
// failure messages and as a timeout otherwise.
//
// r.TestSpec must be non-nil. MustNewConstMetric panics if a sample is
// inconsistent with its description.
func (r *TestCaseReport) Collect(c chan<- prometheus.Metric) {
	// Timestamps are point-in-time values rather than cumulative totals, so
	// they are exported as gauges. A counter must never decrease or be
	// negative, which a Unix timestamp does not guarantee (e.g., a zero
	// Finish yields a negative number of seconds).
	c <- prometheus.MustNewConstMetric(
		r.startTimePrometheusDesc,
		prometheus.GaugeValue,
		float64(r.Start.Unix()),
		r.TestSpec.Name,
	)
	c <- prometheus.MustNewConstMetric(
		r.finishTimePrometheusDesc,
		prometheus.GaugeValue,
		float64(r.Finish.Unix()),
		r.TestSpec.Name,
	)

	// Test failures always contain either "unexpected event for job" or "error asserting failure reason".
	// TODO(albin): Improve this.
	testFailure := 0.0
	if strings.Contains(r.FailureReason, "unexpected event for job") || strings.Contains(r.FailureReason, "error asserting failure reason") {
		testFailure = 1.0
	}
	c <- prometheus.MustNewConstMetric(
		r.testFailurePrometheusDesc,
		prometheus.GaugeValue,
		testFailure,
		r.TestSpec.Name,
	)

	// We assume that any other failures are due to timeout.
	// TODO(albin): Improve this.
	testTimeout := 0.0
	if r.FailureReason != "" && testFailure == 0 {
		testTimeout = 1.0
	}
	c <- prometheus.MustNewConstMetric(
		r.testTimeoutPrometheusDesc,
		prometheus.GaugeValue,
		testTimeout,
		r.TestSpec.Name,
	)
}

func (report *TestSuiteReport) NumSuccesses() int {
Expand Down Expand Up @@ -203,9 +319,26 @@ func (a *App) RunTests(ctx context.Context, testSpecs []*api.TestSpec) (*TestSui
return nil, err
}

// Optionally push metrics.
if a.Params.PrometheusPushGatewayUrl != "" {
if err := pushTestSuiteReportMetrics(rv, a.Params.PrometheusPushGatewayUrl, a.Params.PrometheusPushGatewayJobName); err != nil {
return nil, err
}
}
return rv, nil
}

// pushTestSuiteReportMetrics pushes the metrics collected from tsr to the
// Prometheus push gateway at url under the given job name.
// The push is given at most 30 seconds to complete.
// A non-nil error is returned with a stack trace attached.
func pushTestSuiteReportMetrics(tsr *TestSuiteReport, url, job string) error {
	const pushTimeout = 30 * time.Second

	ctx, cancel := context.WithTimeout(context.Background(), pushTimeout)
	defer cancel()

	// errors.WithStack returns nil when given a nil error, so a successful
	// push yields a nil result.
	return errors.WithStack(push.New(url, job).Collector(tsr).PushContext(ctx))
}

// UnmarshalTestCase unmarshalls bytes into a TestSpec.
func UnmarshalTestCase(yamlBytes []byte, testSpec *api.TestSpec) error {
var result *multierror.Error
Expand Down
7 changes: 2 additions & 5 deletions internal/testsuite/testrunner.go
Original file line number Diff line number Diff line change
Expand Up @@ -49,11 +49,8 @@ func (report *TestCaseReport) JunitTestCase() junit.Testcase {
}

func (srv *TestRunner) Run(ctx context.Context) (err error) {
report := &TestCaseReport{
Out: &bytes.Buffer{},
Start: time.Now(),
TestSpec: srv.testSpec,
}
report := NewTestCaseReport(srv.testSpec)
report.Out = &bytes.Buffer{}
out := io.MultiWriter(srv.Out, report.Out)

fmt.Fprintf(out, "test case started %s\n", srv.testSpec.ShortString())
Expand Down

0 comments on commit 83fb1a2

Please sign in to comment.