diff --git a/cmd/testsuite/cmd/test.go b/cmd/testsuite/cmd/test.go index 72eef04d966..ccf8ecdf9fd 100644 --- a/cmd/testsuite/cmd/test.go +++ b/cmd/testsuite/cmd/test.go @@ -31,6 +31,8 @@ func testCmd(app *testsuite.App) *cobra.Command { cmd.Flags().String("tests", "", "Test file pattern, e.g., './testcases/*.yaml'.") cmd.Flags().String("junit", "", "Write a JUnit test report to this path.") cmd.Flags().String("benchmark", "", "Write a benchmark test report to this path.") + cmd.Flags().String("prometheusPushgatewayUrl", "", "Push metrics to Prometheus pushgateway at this url.") + cmd.Flags().String("prometheusPushgatewayJobName", "armada-testsuite", "Metrics are annotated with job=prometheusPushgatewayJobName.") return cmd } @@ -64,6 +66,18 @@ func testCmdRunE(app *testsuite.App) func(cmd *cobra.Command, args []string) err return errors.New("benchmark report not currently supported") } + prometheusPushgatewayUrl, err := cmd.Flags().GetString("prometheusPushgatewayUrl") + if err != nil { + return errors.WithStack(err) + } + app.Params.PrometheusPushGatewayUrl = prometheusPushgatewayUrl + + prometheusPushgatewayJobName, err := cmd.Flags().GetString("prometheusPushgatewayJobName") + if err != nil { + return errors.WithStack(err) + } + app.Params.PrometheusPushGatewayJobName = prometheusPushgatewayJobName + // Create a context that is cancelled on SIGINT/SIGTERM. // Ensures test jobs are cancelled on ctrl-C. 
ctx, cancel := context.WithCancel(context.Background()) diff --git a/internal/testsuite/app.go b/internal/testsuite/app.go index 218c7269288..05a0c4402c2 100644 --- a/internal/testsuite/app.go +++ b/internal/testsuite/app.go @@ -17,6 +17,8 @@ import ( "github.com/hashicorp/go-multierror" "github.com/mattn/go-zglob" "github.com/pkg/errors" + "github.com/prometheus/client_golang/prometheus" + "github.com/prometheus/client_golang/prometheus/push" "github.com/renstrom/shortuuid" "golang.org/x/sync/errgroup" apimachineryYaml "k8s.io/apimachinery/pkg/util/yaml" @@ -29,6 +31,8 @@ import ( "github.com/armadaproject/armada/pkg/client" ) +const metricsPrefix = "armada_testsuite_" + type App struct { // Parameters passed to the CLI by the user. Params *Params @@ -45,7 +49,13 @@ type App struct { // and that they can be provided either dynamically on a command line, or // statically in a config file that's reused between command runs. type Params struct { + // Armada connection details. ApiConnectionDetails *client.ApiConnectionDetails + // If non-empty, push metrics containing test results to a Prometheus push gateway with this url. + PrometheusPushGatewayUrl string + // Exported metrics are annotated with job=PrometheusPushGatewayJobName. + // Must be non-empty. + PrometheusPushGatewayJobName string } // New instantiates an App with default parameters, including standard output @@ -107,7 +117,7 @@ func TestSpecFromFilePath(filePath string) (*api.TestSpec, error) { return nil, err } - // Randomise jobSetName for each test to ensure we're only getting events for this run. + // Randomise job set for each test to ensure we're only getting events for this run. 
fileName := filepath.Base(filePath) fileName = strings.TrimSuffix(fileName, filepath.Ext(fileName)) testSpec.JobSetId = fileName + "-" + shortuuid.New() @@ -126,6 +136,18 @@ type TestSuiteReport struct { TestCaseReports []*TestCaseReport } +func (tsr *TestSuiteReport) Describe(c chan<- *prometheus.Desc) { + for _, tcr := range tsr.TestCaseReports { + tcr.Describe(c) + } +} + +func (tsr *TestSuiteReport) Collect(c chan<- prometheus.Metric) { + for _, tcr := range tsr.TestCaseReports { + tcr.Collect(c) + } +} + type TestCaseReport struct { Out *bytes.Buffer Start time.Time @@ -133,6 +155,100 @@ type TestCaseReport struct { FailureReason string BenchmarkReport *eventbenchmark.TestCaseBenchmarkReport TestSpec *api.TestSpec + + // Prometheus metric descriptions. + // Test start time in seconds since the epoch. + startTimePrometheusDesc *prometheus.Desc + // Test finish time in seconds since the epoch. + finishTimePrometheusDesc *prometheus.Desc + // Outputs 1 on test timeout. + testTimeoutPrometheusDesc *prometheus.Desc + // Outputs 1 on test failure, not including timeouts. 
+ testFailurePrometheusDesc *prometheus.Desc +} + +func NewTestCaseReport(testSpec *api.TestSpec) *TestCaseReport { + rv := &TestCaseReport{ + Start: time.Now(), + TestSpec: testSpec, + } + rv.initialiseMetrics() + return rv +} + +func (r *TestCaseReport) initialiseMetrics() { + r.startTimePrometheusDesc = prometheus.NewDesc( + metricsPrefix+"test_start_time", + "The time at which a test started.", + []string{"testcase"}, + nil, + ) + r.finishTimePrometheusDesc = prometheus.NewDesc( + metricsPrefix+"test_finish_time", + "The time at which a test finished.", + []string{"testcase"}, + nil, + ) + r.testTimeoutPrometheusDesc = prometheus.NewDesc( + metricsPrefix+"test_timeout", + "Outputs 1 on test timeout and 0 otherwise.", + []string{"testcase"}, + nil, + ) + r.testFailurePrometheusDesc = prometheus.NewDesc( + metricsPrefix+"test_failure", + "Outputs 1 on test failure, not including timeout, and 0 otherwise.", + []string{"testcase"}, + nil, + ) +} + +func (r *TestCaseReport) Describe(c chan<- *prometheus.Desc) { + c <- r.startTimePrometheusDesc + c <- r.finishTimePrometheusDesc + c <- r.testTimeoutPrometheusDesc + c <- r.testFailurePrometheusDesc +} + +func (r *TestCaseReport) Collect(c chan<- prometheus.Metric) { + c <- prometheus.MustNewConstMetric( + r.startTimePrometheusDesc, + prometheus.CounterValue, + float64(r.Start.Unix()), + r.TestSpec.Name, + ) + c <- prometheus.MustNewConstMetric( + r.finishTimePrometheusDesc, + prometheus.CounterValue, + float64(r.Finish.Unix()), + r.TestSpec.Name, + ) + + // Test failures always contain either "unexpected event for job" or "error asserting failure reason". + // TODO(albin): Improve this. 
+ testFailure := 0.0 + if strings.Contains(r.FailureReason, "unexpected event for job") || strings.Contains(r.FailureReason, "error asserting failure reason") { + testFailure = 1.0 + } + c <- prometheus.MustNewConstMetric( + r.testFailurePrometheusDesc, + prometheus.GaugeValue, + testFailure, + r.TestSpec.Name, + ) + + // We assume that any other failures are due to timeout. + // TODO(albin): Improve this. + testTimeout := 0.0 + if r.FailureReason != "" && testFailure == 0 { + testTimeout = 1.0 + } + c <- prometheus.MustNewConstMetric( + r.testTimeoutPrometheusDesc, + prometheus.GaugeValue, + testTimeout, + r.TestSpec.Name, + ) } func (report *TestSuiteReport) NumSuccesses() int { @@ -203,9 +319,26 @@ func (a *App) RunTests(ctx context.Context, testSpecs []*api.TestSpec) (*TestSui return nil, err } + // Optionally push metrics. + if a.Params.PrometheusPushGatewayUrl != "" { + if err := pushTestSuiteReportMetrics(rv, a.Params.PrometheusPushGatewayUrl, a.Params.PrometheusPushGatewayJobName); err != nil { + return nil, err + } + } return rv, nil } +func pushTestSuiteReportMetrics(tsr *TestSuiteReport, url, job string) error { + pusher := push.New(url, job) + pusher.Collector(tsr) + ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second) + defer cancel() + if err := pusher.PushContext(ctx); err != nil { + return errors.WithStack(err) + } + return nil +} + // UnmarshalTestCase unmarshalls bytes into a TestSpec. 
func UnmarshalTestCase(yamlBytes []byte, testSpec *api.TestSpec) error { var result *multierror.Error diff --git a/internal/testsuite/testrunner.go b/internal/testsuite/testrunner.go index 2feed1285c0..3108cc9888d 100644 --- a/internal/testsuite/testrunner.go +++ b/internal/testsuite/testrunner.go @@ -49,11 +49,8 @@ func (report *TestCaseReport) JunitTestCase() junit.Testcase { } func (srv *TestRunner) Run(ctx context.Context) (err error) { - report := &TestCaseReport{ - Out: &bytes.Buffer{}, - Start: time.Now(), - TestSpec: srv.testSpec, - } + report := NewTestCaseReport(srv.testSpec) + report.Out = &bytes.Buffer{} out := io.MultiWriter(srv.Out, report.Out) fmt.Fprintf(out, "test case started %s\n", srv.testSpec.ShortString())