From 9b01f7f99f4d306df1d92fd685fe282770936c3e Mon Sep 17 00:00:00 2001 From: Lukasz Mierzwa Date: Fri, 16 Aug 2024 13:50:03 +0100 Subject: [PATCH] Look for smelly selectors --- .github/spellcheck/wordlist.txt | 1 + docs/changelog.md | 7 ++ docs/checks/promql/regexp.md | 107 +++++++++++++++++++++++++- internal/checks/promql_regexp.go | 27 ++++++- internal/checks/promql_regexp_test.go | 31 +++++++- 5 files changed, 168 insertions(+), 5 deletions(-) diff --git a/.github/spellcheck/wordlist.txt b/.github/spellcheck/wordlist.txt index ed9b61d7..ef607f70 100644 --- a/.github/spellcheck/wordlist.txt +++ b/.github/spellcheck/wordlist.txt @@ -9,6 +9,7 @@ CLI cloudflare Cloudflare config +configs Deduplicate deduplicated dir diff --git a/docs/changelog.md b/docs/changelog.md index e53623b8..cac01f47 100644 --- a/docs/changelog.md +++ b/docs/changelog.md @@ -1,5 +1,12 @@ # Changelog +## v0.65.0 + +### Added + +- [promql/regexp](checks/promql/regexp.md) check will now look for smelly regexp selectors. + See check docs for details. + ## v0.64.1 ### Fixed diff --git a/docs/checks/promql/regexp.md b/docs/checks/promql/regexp.md index 09b25dd1..85fd7251 100644 --- a/docs/checks/promql/regexp.md +++ b/docs/checks/promql/regexp.md @@ -6,6 +6,11 @@ grand_parent: Documentation # promql/regexp +This checks will inspect all query metric selectors that use regexp matchers +to look for common problems. + +## Unnecessary regexp selectors + This check will warn if metric selector uses a regexp match but the regexp query doesn't have any patterns and so a simple string equality match can be used. Since regexp checks are more expensive using a simple equality check if @@ -26,13 +31,113 @@ Example of a query that wouldn't trigger this warning: foo{job=~"bar|baz"} ``` -Another problem this check will report on is redundant regexp anchors. +## Redundant anchors + As noted on [Querying Prometheus](https://prometheus.io/docs/prometheus/latest/querying/basics/) page Prometheus fully anchors all regex matchers. So a query match using `foo=~"bar.*"` will be parsed as `foo=~"^bar.*$"` and so any anchors used in the query will be redundant. This means that passing `foo=~"^bar.*$"` to the query will be parsed as `foo=~"^^bar.*$$"`, so both `^` and `$` should be skipped to avoid it. +This check will report selectors with redundant anchors. + +## Smelly selectors + +Metric labels are suppose to be opaque strings that you use for filtering, but sometimes +you might end up with a single label value that is really a few different strings concatenated together, +rather than a few different labels. Which then forces you to use regexp label selectors that target +individual parts of that one very long label. + +An example of that is a `job` label that includes too much information and instead of just being a name. +Since the `job` labels value is (by default) equal to the `job_name` field on a scrape configuration block +it's easy to end up with one very long string if you need to create a few similar scrape configs: + +```yaml +- job_name: myservice_cluster1_production + [...] + +- job_name: myservice_cluster2_production + [...] + +- job_name: myservice_cluster1_staging + [...] +``` + +In the configs above we end up with `job` label holding extra information: + +- the environment in which the service is deployed (`production` or `staging`) +- the name of the cluster (`cluster1` or `cluster2`) + +And the time series we will scrape using this config will look like this: + +```js +{job="myservice_cluster1_production", instance="..."} +{job="myservice_cluster2_production", instance="..."} +{job="myservice_cluster1_staging", instance="..."} +``` + +If we then wanted to query metrics only for the `production` environment we have to use a regexp: + +```yaml +- alert: Scrape failed + expr: up{job=~"myservice_.+_production"} == 0 +``` + +This isn't ideal for a number of reasons: + +- It's a lot less obvious what's happening here and why. +- It's much easier to have a typo in a long regexp expression. +- It's a lot harder to aggregate by cluster or environment this way. + +This is why regexp selector like this are a code smell that should be avoided. +To avoid using them one can simply set more labels on these scrape jobs: + +```yaml +- job_name: myservice_cluster1_production + [...] + relabel_configs: + - target_label: job + replacement: myservice + - target_label: cluster + replacement: cluster1 + - target_label: env + replacement: production + +- job_name: myservice_cluster2_production + [...] + relabel_configs: + - target_label: job + replacement: myservice + - target_label: cluster + replacement: cluster2 + - target_label: env + replacement: production + +- job_name: myservice_cluster1_staging + [...] + relabel_configs: + - target_label: job + replacement: myservice + - target_label: cluster + replacement: cluster1 + - target_label: env + replacement: staging +``` + +Which will result in time series with explicit labels: + +```js +{job="myservice", cluster="cluster1", env="production", instance="..."} +{job="myservice", cluster="cluster2", env="production", instance="..."} +{job="myservice", cluster="cluster1", env="staging", instance="..."} +``` + +And simple explicit queries: + +```yaml +- alert: Scrape failed + expr: up{job="myservice", env="production"} == 0 +``` ## Configuration diff --git a/internal/checks/promql_regexp.go b/internal/checks/promql_regexp.go index 4f7f88cb..30f5292b 100644 --- a/internal/checks/promql_regexp.go +++ b/internal/checks/promql_regexp.go @@ -85,8 +85,9 @@ func (c RegexpCheck) Check(_ context.Context, _ discovery.Path, rule parser.Rule re = "^(?:" + re + ")$" } - var hasFlags, isUseful, isWildcard, isLiteral, isBad bool + var hasFlags, isUseful, isWildcard, isLiteral, isBad, isSmelly bool var beginText, endText int + var lastOp syntax.Op r, _ := syntax.Parse(re, syntax.Perl) for _, s := range r.Sub { // If effective flags are different from default flags then we assume regexp is useful. @@ -94,6 +95,9 @@ func (c RegexpCheck) Check(_ context.Context, _ discovery.Path, rule parser.Rule if s.Flags > 0 && s.Flags != syntax.Perl { hasFlags = true } + if isOpSmelly(s.Op, lastOp) { + isSmelly = true + } // nolint: exhaustive switch s.Op { case syntax.OpBeginText: @@ -114,6 +118,7 @@ func (c RegexpCheck) Check(_ context.Context, _ discovery.Path, rule parser.Rule default: isUseful = true } + lastOp = s.Op } if hasFlags && !isWildcard { isUseful = true @@ -137,12 +142,16 @@ func (c RegexpCheck) Check(_ context.Context, _ discovery.Path, rule parser.Rule bad = append(bad, badMatcher{lm: lm, badAnchor: true}) isBad = true } + if isSmelly { + bad = append(bad, badMatcher{lm: lm, isSmelly: true}) + } if !isBad { good = append(good, lm) } } for _, b := range bad { var text string + s := Bug switch { case b.badAnchor: text = fmt.Sprintf("Prometheus regexp matchers are automatically fully anchored so match for `%s` will result in `%s%s\"^%s$\"`, remove regexp anchors `^` and/or `$`.", @@ -154,6 +163,9 @@ func (c RegexpCheck) Check(_ context.Context, _ discovery.Path, rule parser.Rule case b.isWildcard && b.op == labels.MatchNotEqual: text = fmt.Sprintf("Unnecessary wildcard regexp, simply use `%s` if you want to match on all time series for `%s` without the `%s` label.", makeLabel(name, slices.Concat(good, []*labels.Matcher{{Type: labels.MatchEqual, Name: b.lm.Name, Value: ""}})...), name, b.lm.Name) + case b.isSmelly: + text = fmt.Sprintf("`{%s}` looks like a smelly selector that tries to extract substrings from the value, please consider breaking down the value of this label into multiple smaller labels", b.lm.String()) + s = Warning default: text = fmt.Sprintf("Unnecessary regexp match on static string `%s`, use `%s%s%q` instead.", b.lm, b.lm.Name, b.op, b.lm.Value) @@ -164,7 +176,7 @@ func (c RegexpCheck) Check(_ context.Context, _ discovery.Path, rule parser.Rule Reporter: c.Reporter(), Text: text, Details: RegexpCheckDetails, - Severity: Bug, + Severity: s, }) } } @@ -176,6 +188,7 @@ type badMatcher struct { lm *labels.Matcher op labels.MatchType isWildcard bool + isSmelly bool badAnchor bool } @@ -203,3 +216,13 @@ func makeLabel(name string, matchers ...*labels.Matcher) string { b.WriteRune('}') return b.String() } + +func isOpSmelly(a, b syntax.Op) bool { + if a == syntax.OpLiteral && (b == syntax.OpStar || b == syntax.OpPlus) { + return true + } + if b == syntax.OpLiteral && (a == syntax.OpStar || a == syntax.OpPlus) { + return true + } + return false +} diff --git a/internal/checks/promql_regexp_test.go b/internal/checks/promql_regexp_test.go index 9870ca3c..5afa491f 100644 --- a/internal/checks/promql_regexp_test.go +++ b/internal/checks/promql_regexp_test.go @@ -30,7 +30,7 @@ func TestRegexpCheck(t *testing.T) { }, { description: "valid regexp", - content: "- record: foo\n expr: foo{job=~\"bar.+\"}\n", + content: "- record: foo\n expr: foo{job=~\"bar|foo\"}\n", checker: newRegexpCheck, prometheus: noProm, problems: noProblems, @@ -51,7 +51,7 @@ func TestRegexpCheck(t *testing.T) { }, { description: "valid partial regexp", - content: "- record: foo\n expr: foo{job=~\"prefix.*\"}\n", + content: "- record: foo\n expr: foo{job=~\"prefix(a|b)\"}\n", checker: newRegexpCheck, prometheus: noProm, problems: noProblems, @@ -404,6 +404,33 @@ func TestRegexpCheck(t *testing.T) { prometheus: noProm, problems: noProblems, }, + { + description: "smelly selector", + content: "- record: foo\n expr: foo{job=~\"service_.*_prod\"}\n", + checker: newRegexpCheck, + prometheus: noProm, + problems: func(_ string) []checks.Problem { + return []checks.Problem{ + { + Lines: parser.LineRange{ + First: 2, + Last: 2, + }, + Reporter: checks.RegexpCheckName, + Text: "`{job=~\"service_.*_prod\"}` looks like a smelly selector that tries to extract substrings from the value, please consider breaking down the value of this label into multiple smaller labels", + Details: checks.RegexpCheckDetails, + Severity: checks.Warning, + }, + } + }, + }, + { + description: "non-smelly selector", + content: "- record: foo\n expr: rate(http_requests_total{job=\"foo\", code=~\"5..\"}[5m])\n", + checker: newRegexpCheck, + prometheus: noProm, + problems: noProblems, + }, } runTests(t, testCases) }