Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Push metrics from testsuite #2970

Merged
merged 6 commits into from
Sep 20, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 14 additions & 0 deletions cmd/testsuite/cmd/test.go
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,8 @@ func testCmd(app *testsuite.App) *cobra.Command {
cmd.Flags().String("tests", "", "Test file pattern, e.g., './testcases/*.yaml'.")
cmd.Flags().String("junit", "", "Write a JUnit test report to this path.")
cmd.Flags().String("benchmark", "", "Write a benchmark test report to this path.")
cmd.Flags().String("prometheusPushgatewayUrl", "", "Push metrics to Prometheus pushgateway at this url.")
cmd.Flags().String("prometheusPushgatewayJobName", "armada-testsuite", "Metrics are annotated with with job=prometheusPushGatewayJobName.")
return cmd
}

Expand Down Expand Up @@ -64,6 +66,18 @@ func testCmdRunE(app *testsuite.App) func(cmd *cobra.Command, args []string) err
return errors.New("benchmark report not currently supported")
}

prometheusPushgatewayUrl, err := cmd.Flags().GetString("prometheusPushgatewayUrl")
if err != nil {
return errors.WithStack(err)
}
app.Params.PrometheusPushGatewayUrl = prometheusPushgatewayUrl

prometheusPushgatewayJobName, err := cmd.Flags().GetString("prometheusPushgatewayJobName")
if err != nil {
return errors.WithStack(err)
}
app.Params.PrometheusPushGatewayJobName = prometheusPushgatewayJobName

// Create a context that is cancelled on SIGINT/SIGTERM.
// Ensures test jobs are cancelled on ctrl-C.
ctx, cancel := context.WithCancel(context.Background())
Expand Down
135 changes: 134 additions & 1 deletion internal/testsuite/app.go
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,8 @@ import (
"github.com/hashicorp/go-multierror"
"github.com/mattn/go-zglob"
"github.com/pkg/errors"
"github.com/prometheus/client_golang/prometheus"
"github.com/prometheus/client_golang/prometheus/push"
"github.com/renstrom/shortuuid"
"golang.org/x/sync/errgroup"
apimachineryYaml "k8s.io/apimachinery/pkg/util/yaml"
Expand All @@ -29,6 +31,8 @@ import (
"github.com/armadaproject/armada/pkg/client"
)

// metricsPrefix is prepended to the name of every Prometheus metric
// describing test results, e.g., armada_testsuite_test_start_time.
const metricsPrefix = "armada_testsuite_"

type App struct {
// Parameters passed to the CLI by the user.
Params *Params
Expand All @@ -45,7 +49,13 @@ type App struct {
// and that they can be provided either dynamically on a command line, or
// statically in a config file that's reused between command runs.
type Params struct {
	// Armada connection details.
	ApiConnectionDetails *client.ApiConnectionDetails
	// If non-empty, push metrics containing test results to a Prometheus push gateway with this url.
	// An empty value disables pushing entirely.
	// Populated from the --prometheusPushgatewayUrl CLI flag.
	PrometheusPushGatewayUrl string
	// Exported metrics are annotated with job=PrometheusPushGatewayJobName.
	// Must be non-empty.
	// Populated from the --prometheusPushgatewayJobName CLI flag, which defaults to "armada-testsuite".
	// Only consulted when PrometheusPushGatewayUrl is non-empty.
	PrometheusPushGatewayJobName string
}

// New instantiates an App with default parameters, including standard output
Expand Down Expand Up @@ -107,7 +117,7 @@ func TestSpecFromFilePath(filePath string) (*api.TestSpec, error) {
return nil, err
}

// Randomise jobSetName for each test to ensure we're only getting events for this run.
// Randomise job set for each test to ensure we're only getting events for this run.
fileName := filepath.Base(filePath)
fileName = strings.TrimSuffix(fileName, filepath.Ext(fileName))
testSpec.JobSetId = fileName + "-" + shortuuid.New()
Expand All @@ -126,13 +136,119 @@ type TestSuiteReport struct {
TestCaseReports []*TestCaseReport
}

// Describe implements prometheus.Collector.
// It forwards the channel to every test case report in order, so that each
// one can send its own metric descriptors.
func (tsr *TestSuiteReport) Describe(c chan<- *prometheus.Desc) {
	for i := range tsr.TestCaseReports {
		tsr.TestCaseReports[i].Describe(c)
	}
}

// Collect implements prometheus.Collector.
// It forwards the channel to every test case report in order, so that each
// one can send the metrics for its own test case.
func (tsr *TestSuiteReport) Collect(c chan<- prometheus.Metric) {
	for i := range tsr.TestCaseReports {
		tsr.TestCaseReports[i].Collect(c)
	}
}

// TestCaseReport holds the outcome of a single test case.
// It also acts as a Prometheus collector (see Describe and Collect) exposing
// that outcome as metrics labelled with the test case name.
type TestCaseReport struct {
	// Buffer capturing the output produced while running the test case.
	// Not set by NewTestCaseReport; the caller is expected to assign it.
	Out *bytes.Buffer
	// Time at which the test case started.
	Start time.Time
	// Time at which the test case finished.
	Finish time.Time
	// Empty if the test succeeded; otherwise describes why it failed.
	FailureReason string
	// NOTE(review): presumably benchmark data gathered from job events during the test; confirm against eventbenchmark.
	BenchmarkReport *eventbenchmark.TestCaseBenchmarkReport
	// The spec of the test case this report describes.
	// Its Name is used as the value of the "testcase" metric label.
	TestSpec *api.TestSpec

	// Prometheus metric descriptions.
	// Test start time in seconds since the epoch.
	startTimePrometheusDesc *prometheus.Desc
	// Test finish time in seconds since the epoch.
	finishTimePrometheusDesc *prometheus.Desc
	// Outputs 1 on test timeout.
	testTimeoutPrometheusDesc *prometheus.Desc
	// Outputs 1 on test failure, not including timeouts.
	testFailurePrometheusDesc *prometheus.Desc
}

// NewTestCaseReport returns a report for the given test spec with Start set
// to the current time and all Prometheus metric descriptors initialised.
// The remaining fields (e.g., Out) are left at their zero values for the
// caller to fill in.
func NewTestCaseReport(testSpec *api.TestSpec) *TestCaseReport {
	report := new(TestCaseReport)
	report.Start = time.Now()
	report.TestSpec = testSpec
	report.initialiseMetrics()
	return report
}

// initialiseMetrics creates the Prometheus descriptors for the metrics
// exported by this report. Every metric name starts with metricsPrefix and
// carries a single variable label, "testcase".
func (r *TestCaseReport) initialiseMetrics() {
	// All four descriptors share the same shape; only name and help differ.
	newDesc := func(name, help string) *prometheus.Desc {
		return prometheus.NewDesc(metricsPrefix+name, help, []string{"testcase"}, nil)
	}

	r.startTimePrometheusDesc = newDesc(
		"test_start_time",
		"The time at which a test started.",
	)
	r.finishTimePrometheusDesc = newDesc(
		"test_finish_time",
		"The time at which a test finished.",
	)
	r.testTimeoutPrometheusDesc = newDesc(
		"test_timeout",
		"Outputs 1 on test timeout and 0 otherwise.",
	)
	r.testFailurePrometheusDesc = newDesc(
		"test_failure",
		"Outputs 1 on test failure, not including timeout, and 0 otherwise.",
	)
}

// Describe implements prometheus.Collector.
// It sends the descriptors of the four per-test-case metrics: start time,
// finish time, timeout, and failure, in that order.
func (r *TestCaseReport) Describe(c chan<- *prometheus.Desc) {
	descs := [...]*prometheus.Desc{
		r.startTimePrometheusDesc,
		r.finishTimePrometheusDesc,
		r.testTimeoutPrometheusDesc,
		r.testFailurePrometheusDesc,
	}
	for _, desc := range descs {
		c <- desc
	}
}

// Collect implements prometheus.Collector.
// It sends four metrics for this test case, each labelled with
// testcase=TestSpec.Name:
//
//   - start time, in seconds since the epoch,
//   - finish time, in seconds since the epoch,
//   - test failure (1 if the test failed for a reason other than timeout, 0 otherwise),
//   - test timeout (1 if the test failed and the failure isn't classified as a test failure, 0 otherwise).
//
// At most one of failure and timeout is 1; both are 0 for a successful test.
// r.TestSpec must be non-nil.
func (r *TestCaseReport) Collect(c chan<- prometheus.Metric) {
	testCase := r.TestSpec.Name

	// Timestamps are point-in-time values rather than cumulative counts,
	// so they are exported as gauges and not as counters.
	c <- prometheus.MustNewConstMetric(
		r.startTimePrometheusDesc,
		prometheus.GaugeValue,
		float64(r.Start.Unix()),
		testCase,
	)
	c <- prometheus.MustNewConstMetric(
		r.finishTimePrometheusDesc,
		prometheus.GaugeValue,
		float64(r.Finish.Unix()),
		testCase,
	)

	// Test failures always contain either "unexpected event for job" or "error asserting failure reason".
	// TODO(albin): Improve this.
	testFailure := 0.0
	if strings.Contains(r.FailureReason, "unexpected event for job") || strings.Contains(r.FailureReason, "error asserting failure reason") {
		testFailure = 1.0
	}
	c <- prometheus.MustNewConstMetric(
		r.testFailurePrometheusDesc,
		prometheus.GaugeValue,
		testFailure,
		testCase,
	)

	// We assume that any other failures are due to timeout.
	// TODO(albin): Improve this.
	testTimeout := 0.0
	if r.FailureReason != "" && testFailure == 0 {
		testTimeout = 1.0
	}
	c <- prometheus.MustNewConstMetric(
		r.testTimeoutPrometheusDesc,
		prometheus.GaugeValue,
		testTimeout,
		testCase,
	)
}

func (report *TestSuiteReport) NumSuccesses() int {
Expand Down Expand Up @@ -203,9 +319,26 @@ func (a *App) RunTests(ctx context.Context, testSpecs []*api.TestSpec) (*TestSui
return nil, err
}

// Optionally push metrics.
if a.Params.PrometheusPushGatewayUrl != "" {
if err := pushTestSuiteReportMetrics(rv, a.Params.PrometheusPushGatewayUrl, a.Params.PrometheusPushGatewayJobName); err != nil {
return nil, err
}
}
return rv, nil
}

// pushTestSuiteReportMetrics pushes the metrics collected from tsr to the
// Prometheus push gateway at url, annotated with the given job name.
// The push is abandoned if it hasn't completed within 30 seconds.
// Any error is returned with a stack trace attached.
func pushTestSuiteReportMetrics(tsr *TestSuiteReport, url, job string) error {
	pushCtx, stop := context.WithTimeout(context.Background(), 30*time.Second)
	defer stop()

	err := push.New(url, job).Collector(tsr).PushContext(pushCtx)
	if err == nil {
		return nil
	}
	return errors.WithStack(err)
}

// UnmarshalTestCase unmarshalls bytes into a TestSpec.
func UnmarshalTestCase(yamlBytes []byte, testSpec *api.TestSpec) error {
var result *multierror.Error
Expand Down
7 changes: 2 additions & 5 deletions internal/testsuite/testrunner.go
Original file line number Diff line number Diff line change
Expand Up @@ -49,11 +49,8 @@ func (report *TestCaseReport) JunitTestCase() junit.Testcase {
}

func (srv *TestRunner) Run(ctx context.Context) (err error) {
report := &TestCaseReport{
Out: &bytes.Buffer{},
Start: time.Now(),
TestSpec: srv.testSpec,
}
report := NewTestCaseReport(srv.testSpec)
report.Out = &bytes.Buffer{}
out := io.MultiWriter(srv.Out, report.Out)

fmt.Fprintf(out, "test case started %s\n", srv.testSpec.ShortString())
Expand Down
Loading