diff --git a/cmd/pint/tests/0003_lint_workdir.txt b/cmd/pint/tests/0003_lint_workdir.txt index 50d88823..3cd88050 100644 --- a/cmd/pint/tests/0003_lint_workdir.txt +++ b/cmd/pint/tests/0003_lint_workdir.txt @@ -57,10 +57,13 @@ rules/0003.yaml:40 Warning: `instance` label should be removed when aggregating rules/0003.yaml:40 Warning: `job` label is required and should be preserved when aggregating `^.+$` rules, use `by(job, ...)`. (promql/aggregate) 40 | expr: sum(byinstance) by(instance) -rules/0003.yaml:61 Information: Using the value of `rate(errors[5m])` inside this annotation might be hard to read, consider using one of humanize template functions to make it more human friendly. (alerts/template) +rules/0003.yaml:55 Bug: `errors[1h1s]` selector is trying to query Prometheus for 1h1s worth of metrics, but 1h is the maximum allowed range query. (promql/range_query) + 55 | expr: sum(rate(errors[1h1s])) > 0.5 + +rules/0003.yaml:61 Information: Using the value of `rate(errors[1h])` inside this annotation might be hard to read, consider using one of humanize template functions to make it more human friendly. (alerts/template) 61 | summary: 'error rate: {{ $value }}' -level=INFO msg="Problems found" Fatal=1 Bug=2 Warning=10 Information=1 +level=INFO msg="Problems found" Fatal=1 Bug=3 Warning=10 Information=1 level=ERROR msg="Fatal error" err="found 2 problem(s) with severity Bug or higher" -- rules/0001.yml -- - record: colo_job:fl_cf_html_bytes_in:rate10m @@ -132,10 +135,10 @@ level=ERROR msg="Fatal error" err="found 2 problem(s) with severity Bug or highe expr: up == 0 - alert: Error Rate - expr: sum(rate(errors[5m])) > 0.5 + expr: sum(rate(errors[1h1s])) > 0.5 - alert: Error Rate - expr: sum(rate(errors[5m])) > 0.5 + expr: sum(rate(errors[1h])) > 0.5 annotations: link: http://docs summary: 'error rate: {{ $value }}' @@ -155,4 +158,9 @@ rule { strip = [ "instance" ] } } - +rule { + range_query { + max = "1h" + severity = "bug" + } +} diff --git a/docs/changelog.md b/docs/changelog.md index cac01f47..62ade3fb 100644 --- a/docs/changelog.md +++ b/docs/changelog.md @@ -6,6 +6,8 @@ - [promql/regexp](checks/promql/regexp.md) check will now look for smelly regexp selectors. See check docs for details. +- [promql/range_query](checks/promql/range_query.md) now allows to configure a custom maximum + duration for range queries - #1064. ## v0.64.1 diff --git a/docs/checks/promql/range_query.md b/docs/checks/promql/range_query.md index 484a392c..0cb58c4f 100644 --- a/docs/checks/promql/range_query.md +++ b/docs/checks/promql/range_query.md @@ -14,8 +14,8 @@ By default Prometheus keeps [15 days of data](https://prometheus.io/docs/prometh this can be customised by setting time or disk space limits. There are two main ways of configuring retention limits in Prometheus: -* time based - Prometheus will keep last N days of metrics -* disk based - Prometheus will try to use up to N bytes of disk space. +- time based - Prometheus will keep last N days of metrics +- disk based - Prometheus will try to use up to N bytes of disk space. Pint will ignore any disk space limits, since that doesn't tell us what the effective time retention is. @@ -34,13 +34,30 @@ getting results of a `avg_over_time(foo[40d])` you are getting the average value of `foo` in the last 40 days, but in reality you're only getting an average value in the last 30 days, and you cannot get any more than that. +You can also configure your own maximum allowed range duration if you want +to ensure that all queries are never requesting more than allowed range. +This can be done by adding a configuration rule as below. + ## Configuration -This check doesn't have any configuration options. +Syntax: + +```js +range_query { + max = "2h" + comment = "..." + severity = "bug|warning|info" +} +``` + +- `max` - duration for the maximum allowed query range. +- `comment` - set a custom comment that will be added to reported problems. +- `severity` - set custom severity for reported issues, defaults to `warning`. ## How to enable it -This check is enabled by default for all configured Prometheus servers. +This check is enabled by default for all configured Prometheus servers and will +validate that queries don't use ranges longer than configured Prometheus retention. Example: @@ -64,6 +81,19 @@ prometheus "dev" { } ``` +Additionally you can configure an extra rule that will enforce a custom maximum +query range duration: + +```js +rule { + range_query { + max = "4h" + comment = "You cannot use range queries with range more than 4h" + severity = "bug" + } +} +``` + ## How to disable it You can disable this check globally by adding this config block: @@ -102,6 +132,20 @@ Example: # pint disable promql/range_query(prod) ``` +To disable a custom maximum range duration rule use: + +```yaml +# pint disable promql/range_query($duration) +``` + +Where `$duration` is the value of `max` option in `range_query` rule. + +Example: + +```yaml +# pint disable promql/range_query(4h) +``` + ## How to snooze it You can disable this check until given time by adding a comment to it. Example: @@ -111,6 +155,6 @@ You can disable this check until given time by adding a comment to it. Example: ``` Where `$TIMESTAMP` is either use [RFC3339](https://www.rfc-editor.org/rfc/rfc3339) -formatted or `YYYY-MM-DD`. -Adding this comment will disable `promql/range_query` *until* `$TIMESTAMP`, after that +formatted or `YYYY-MM-DD`. +Adding this comment will disable `promql/range_query` _until_ `$TIMESTAMP`, after that check will be re-enabled. diff --git a/internal/checks/promql_range_query.go b/internal/checks/promql_range_query.go index 87590cf2..9db4e2dd 100644 --- a/internal/checks/promql_range_query.go +++ b/internal/checks/promql_range_query.go @@ -9,6 +9,7 @@ import ( promParser "github.com/prometheus/prometheus/promql/parser" "github.com/cloudflare/pint/internal/discovery" + "github.com/cloudflare/pint/internal/output" "github.com/cloudflare/pint/internal/parser" "github.com/cloudflare/pint/internal/promapi" ) @@ -17,12 +18,15 @@ const ( RangeQueryCheckName = "promql/range_query" ) -func NewRangeQueryCheck(prom *promapi.FailoverGroup) RangeQueryCheck { - return RangeQueryCheck{prom: prom} +func NewRangeQueryCheck(prom *promapi.FailoverGroup, limit time.Duration, comment string, severity Severity) RangeQueryCheck { + return RangeQueryCheck{prom: prom, limit: limit, comment: comment, severity: severity} } type RangeQueryCheck struct { - prom *promapi.FailoverGroup + prom *promapi.FailoverGroup + comment string + limit time.Duration + severity Severity } func (c RangeQueryCheck) Meta() CheckMeta { @@ -38,6 +42,9 @@ func (c RangeQueryCheck) Meta() CheckMeta { } func (c RangeQueryCheck) String() string { + if c.limit > 0 { + return fmt.Sprintf("%s(%s)", RangeQueryCheckName, output.HumanizeDuration(c.limit)) + } return fmt.Sprintf("%s(%s)", RangeQueryCheckName, c.prom.Name()) } @@ -47,11 +54,26 @@ func (c RangeQueryCheck) Reporter() string { func (c RangeQueryCheck) Check(ctx context.Context, _ discovery.Path, rule parser.Rule, _ []discovery.Entry) (problems []Problem) { expr := rule.Expr() - if expr.SyntaxError != nil { return problems } + if c.limit > 0 { + for _, problem := range c.checkNode(ctx, expr.Query, c.limit, fmt.Sprintf("%s is the maximum allowed range query.", model.Duration(c.limit))) { + problems = append(problems, Problem{ + Lines: expr.Value.Lines, + Reporter: c.Reporter(), + Text: problem.text, + Details: maybeComment(c.comment), + Severity: c.severity, + }) + } + } + + if c.prom == nil || len(problems) > 0 { + return problems + } + flags, err := c.prom.Flags(ctx) if err != nil { text, severity := textAndSeverityFromError(err, c.Reporter(), c.prom.Name(), Warning) @@ -84,7 +106,7 @@ func (c RangeQueryCheck) Check(ctx context.Context, _ discovery.Path, rule parse retention = time.Hour * 24 * 15 } - for _, problem := range c.checkNode(ctx, expr.Query, retention, flags.URI) { + for _, problem := range c.checkNode(ctx, expr.Query, retention, fmt.Sprintf("%s is configured to only keep %s of metrics history.", promText(c.prom.Name(), flags.URI), model.Duration(retention))) { problems = append(problems, Problem{ Lines: expr.Value.Lines, Reporter: c.Reporter(), @@ -96,20 +118,20 @@ func (c RangeQueryCheck) Check(ctx context.Context, _ discovery.Path, rule parse return problems } -func (c RangeQueryCheck) checkNode(ctx context.Context, node *parser.PromQLNode, retention time.Duration, uri string) (problems []exprProblem) { +func (c RangeQueryCheck) checkNode(ctx context.Context, node *parser.PromQLNode, retention time.Duration, reason string) (problems []exprProblem) { if n, ok := node.Expr.(*promParser.MatrixSelector); ok { if n.Range > retention { problems = append(problems, exprProblem{ expr: node.Expr.String(), - text: fmt.Sprintf("`%s` selector is trying to query Prometheus for %s worth of metrics, but %s is configured to only keep %s of metrics history.", - node.Expr, model.Duration(n.Range), promText(c.prom.Name(), uri), model.Duration(retention)), + text: fmt.Sprintf("`%s` selector is trying to query Prometheus for %s worth of metrics, but %s", + node.Expr, model.Duration(n.Range), reason), severity: Warning, }) } } for _, child := range node.Children { - problems = append(problems, c.checkNode(ctx, child, retention, uri)...) + problems = append(problems, c.checkNode(ctx, child, retention, reason)...) } return problems diff --git a/internal/checks/promql_range_query_test.go b/internal/checks/promql_range_query_test.go index 27fc4575..dc1bb47f 100644 --- a/internal/checks/promql_range_query_test.go +++ b/internal/checks/promql_range_query_test.go @@ -3,6 +3,7 @@ package checks_test import ( "fmt" "testing" + "time" "github.com/cloudflare/pint/internal/checks" "github.com/cloudflare/pint/internal/parser" @@ -10,7 +11,11 @@ import ( ) func newRangeQueryCheck(prom *promapi.FailoverGroup) checks.RuleChecker { - return checks.NewRangeQueryCheck(prom) + return checks.NewRangeQueryCheck(prom, 0, "", checks.Fatal) +} + +func newRangeQueryCheckWithLimit(prom *promapi.FailoverGroup) checks.RuleChecker { + return checks.NewRangeQueryCheck(prom, time.Hour*4, "some text", checks.Bug) } func retentionToLow(name, uri, metric, qr, retention string) string { @@ -214,6 +219,33 @@ func TestRangeQueryCheck(t *testing.T) { }, }, }, + { + description: "limit / 3h", + content: "- record: foo\n expr: rate(foo[3h])\n", + checker: newRangeQueryCheckWithLimit, + prometheus: noProm, + problems: noProblems, + }, + { + description: "limit / 5h", + content: "- record: foo\n expr: rate(foo[5h])\n", + checker: newRangeQueryCheckWithLimit, + prometheus: noProm, + problems: func(_ string) []checks.Problem { + return []checks.Problem{ + { + Lines: parser.LineRange{ + First: 2, + Last: 2, + }, + Reporter: "promql/range_query", + Text: "`foo[5h]` selector is trying to query Prometheus for 5h worth of metrics, but 4h is the maximum allowed range query.", + Details: "Rule comment: some text", + Severity: checks.Bug, + }, + } + }, + }, } runTests(t, testCases) } diff --git a/internal/config/__snapshots__/config_test.snap b/internal/config/__snapshots__/config_test.snap index 991ab14f..b27d96ec 100755 --- a/internal/config/__snapshots__/config_test.snap +++ b/internal/config/__snapshots__/config_test.snap @@ -2763,3 +2763,51 @@ ] } --- + +[TestGetChecksForRule/custom_range_query - 1] +{ + "ci": { + "baseBranch": "master", + "maxCommits": 20 + }, + "parser": {}, + "checks": { + "enabled": [ + "alerts/absent", + "alerts/annotation", + "alerts/count", + "alerts/external_labels", + "alerts/for", + "alerts/template", + "labels/conflict", + "promql/aggregate", + "alerts/comparison", + "promql/fragile", + "promql/range_query", + "promql/rate", + "promql/regexp", + "promql/syntax", + "promql/vector_matching", + "query/cost", + "promql/counter", + "promql/series", + "rule/dependency", + "rule/duplicate", + "rule/for", + "rule/name", + "rule/label", + "rule/link", + "rule/reject" + ] + }, + "owners": {}, + "rules": [ + { + "range_query": { + "max": "1h", + "severity": "bug" + } + } + ] +} +--- diff --git a/internal/config/config.go b/internal/config/config.go index 45e800f0..a53519c9 100644 --- a/internal/config/config.go +++ b/internal/config/config.go @@ -134,7 +134,7 @@ func (cfg *Config) GetChecksForRule(ctx context.Context, gen *PrometheusGenerato }) allChecks = append(allChecks, checkMeta{ name: checks.RangeQueryCheckName, - check: checks.NewRangeQueryCheck(p), + check: checks.NewRangeQueryCheck(p, 0, "", checks.Warning), tags: p.Tags(), }) allChecks = append(allChecks, checkMeta{ diff --git a/internal/config/config_test.go b/internal/config/config_test.go index 3e2fb757..d859b3ce 100644 --- a/internal/config/config_test.go +++ b/internal/config/config_test.go @@ -1929,6 +1929,32 @@ rule { checks.AlertsCheckName + "(prom)", }, }, + { + title: "custom range_query", + config: `rule { + range_query { + max = "1h" + severity = "bug" + } +}`, + entry: discovery.Entry{ + State: discovery.Modified, + Path: discovery.Path{ + Name: "rules.yml", + SymlinkTarget: "rules.yml", + }, + Rule: newRule(t, "- record: foo\n expr: sum(foo)\n"), + }, + checks: []string{ + checks.SyntaxCheckName, + checks.AlertForCheckName, + checks.ComparisonCheckName, + checks.TemplateCheckName, + checks.FragileCheckName, + checks.RegexpCheckName, + checks.RangeQueryCheckName + "(1h)", + }, + }, } dir := t.TempDir() @@ -2277,6 +2303,14 @@ func TestConfigErrors(t *testing.T) { }`, err: "unknown severity: xxx", }, + { + config: `rule { + range_query { + max = "abc" + } +}`, + err: `not a valid duration string: "abc"`, + }, } dir := t.TempDir() diff --git a/internal/config/range_query.go b/internal/config/range_query.go new file mode 100644 index 00000000..64ac9e0c --- /dev/null +++ b/internal/config/range_query.go @@ -0,0 +1,41 @@ +package config + +import ( + "errors" + + "github.com/cloudflare/pint/internal/checks" +) + +type RangeQuerySettings struct { + Max string `hcl:"max" json:"max"` + Comment string `hcl:"comment,optional" json:"comment,omitempty"` + Severity string `hcl:"severity,optional" json:"severity,omitempty"` +} + +func (s RangeQuerySettings) validate() error { + if s.Max != "" { + dur, err := parseDuration(s.Max) + if err != nil { + return err + } + if dur == 0 { + return errors.New("range_query max value cannot be zero") + } + } + + if s.Severity != "" { + if _, err := checks.ParseSeverity(s.Severity); err != nil { + return err + } + } + + return nil +} + +func (s RangeQuerySettings) getSeverity(fallback checks.Severity) checks.Severity { + if s.Severity != "" { + sev, _ := checks.ParseSeverity(s.Severity) + return sev + } + return fallback +} diff --git a/internal/config/range_query_test.go b/internal/config/range_query_test.go new file mode 100644 index 00000000..2b8cf903 --- /dev/null +++ b/internal/config/range_query_test.go @@ -0,0 +1,49 @@ +package config + +import ( + "errors" + "fmt" + "testing" + + "github.com/stretchr/testify/require" +) + +func TestRangeQuerySettings(t *testing.T) { + type testCaseT struct { + err error + conf RangeQuerySettings + } + + testCases := []testCaseT{ + { + conf: RangeQuerySettings{ + Max: "foo", + }, + err: errors.New(`not a valid duration string: "foo"`), + }, + { + conf: RangeQuerySettings{ + Max: "0h", + }, + err: errors.New("range_query max value cannot be zero"), + }, + { + conf: RangeQuerySettings{ + Max: "1d", + Severity: "bag", + }, + err: errors.New("unknown severity: bag"), + }, + } + + for _, tc := range testCases { + t.Run(fmt.Sprintf("%v", tc.conf), func(t *testing.T) { + err := tc.conf.validate() + if err == nil || tc.err == nil { + require.Equal(t, err, tc.err) + } else { + require.EqualError(t, err, tc.err.Error()) + } + }) + } +} diff --git a/internal/config/rule.go b/internal/config/rule.go index c46c9c55..3a5a3e3f 100644 --- a/internal/config/rule.go +++ b/internal/config/rule.go @@ -23,6 +23,7 @@ type Rule struct { Alerts *AlertsSettings `hcl:"alerts,block" json:"alerts,omitempty"` For *ForSettings `hcl:"for,block" json:"for,omitempty"` KeepFiringFor *ForSettings `hcl:"keep_firing_for,block" json:"keep_firing_for,omitempty"` + RangeQuery *RangeQuerySettings `hcl:"range_query,block" json:"range_query,omitempty"` Reject []RejectSettings `hcl:"reject,block" json:"reject,omitempty"` RuleLink []RuleLinkSettings `hcl:"link,block" json:"link,omitempty"` RuleName []RuleNameSettings `hcl:"name,block" json:"name,omitempty"` @@ -101,6 +102,12 @@ func (rule Rule) validate() (err error) { } } + if rule.RangeQuery != nil { + if err = rule.RangeQuery.validate(); err != nil { + return err + } + } + return nil } @@ -288,6 +295,15 @@ func (rule Rule) resolveChecks(ctx context.Context, path string, r parser.Rule, }) } + if rule.RangeQuery != nil { + severity := rule.RangeQuery.getSeverity(checks.Warning) + limit, _ := parseDuration(rule.RangeQuery.Max) + enabled = append(enabled, checkMeta{ + name: checks.CostCheckName, + check: checks.NewRangeQueryCheck(nil, limit, rule.RangeQuery.Comment, severity), + }) + } + return enabled }