Merge pull request #1074 from cloudflare/promql/regexp

Look for smelly selectors
cloudflare · Aug 16, 2024 · 1786c76 · 1786c76
2 parents f363eef + 9b01f7f
commit 1786c76
Show file tree

Hide file tree

Showing 5 changed files with 168 additions and 5 deletions.
diff --git a/.github/spellcheck/wordlist.txt b/.github/spellcheck/wordlist.txt
@@ -9,6 +9,7 @@ CLI
 cloudflare
 Cloudflare
 config
+configs
 Deduplicate
 deduplicated
 dir

diff --git a/docs/changelog.md b/docs/changelog.md
@@ -1,5 +1,12 @@
 # Changelog
 
+## v0.65.0
+
+### Added
+
+- [promql/regexp](checks/promql/regexp.md) check will now look for smelly regexp selectors.
+ See check docs for details.
+
 ## v0.64.1
 
 ### Fixed

diff --git a/docs/checks/promql/regexp.md b/docs/checks/promql/regexp.md
@@ -6,6 +6,11 @@ grand_parent: Documentation
 
 # promql/regexp
 
+This checks will inspect all query metric selectors that use regexp matchers
+to look for common problems.
+
+## Unnecessary regexp selectors
+
 This check will warn if metric selector uses a regexp match but the regexp query
 doesn't have any patterns and so a simple string equality match can be used.
 Since regexp checks are more expensive using a simple equality check if
@@ -26,13 +31,113 @@ Example of a query that wouldn't trigger this warning:
 foo{job=~"bar|baz"}
 ```
 
-Another problem this check will report on is redundant regexp anchors.
+## Redundant anchors
+
 As noted on [Querying Prometheus](https://prometheus.io/docs/prometheus/latest/querying/basics/)
 page Prometheus fully anchors all regex matchers.
 So a query match using `foo=~"bar.*"` will be parsed as `foo=~"^bar.*$"` and
 so any anchors used in the query will be redundant.
 This means that passing `foo=~"^bar.*$"` to the query will be parsed as
 `foo=~"^^bar.*$$"`, so both `^` and `$` should be skipped to avoid it.
+This check will report selectors with redundant anchors.
+
+## Smelly selectors
+
+Metric labels are suppose to be opaque strings that you use for filtering, but sometimes
+you might end up with a single label value that is really a few different strings concatenated together,
+rather than a few different labels. Which then forces you to use regexp label selectors that target
+individual parts of that one very long label.
+
+An example of that is a `job` label that includes too much information and instead of just being a name.
+Since the `job` labels value is (by default) equal to the `job_name` field on a scrape configuration block
+it's easy to end up with one very long string if you need to create a few similar scrape configs:
+
+```yaml
+- job_name: myservice_cluster1_production
+ [...]
+
+- job_name: myservice_cluster2_production
+ [...]
+
+- job_name: myservice_cluster1_staging
+ [...]
+```
+
+In the configs above we end up with `job` label holding extra information:
+
+- the environment in which the service is deployed (`production` or `staging`)
+- the name of the cluster (`cluster1` or `cluster2`)
+
+And the time series we will scrape using this config will look like this:
+
+```js
+{job="myservice_cluster1_production", instance="..."}
+{job="myservice_cluster2_production", instance="..."}
+{job="myservice_cluster1_staging", instance="..."}
+```
+
+If we then wanted to query metrics only for the `production` environment we have to use a regexp:
+
+```yaml
+- alert: Scrape failed
+ expr: up{job=~"myservice_.+_production"} == 0
+```
+
+This isn't ideal for a number of reasons:
+
+- It's a lot less obvious what's happening here and why.
+- It's much easier to have a typo in a long regexp expression.
+- It's a lot harder to aggregate by cluster or environment this way.
+
+This is why regexp selector like this are a code smell that should be avoided.
+To avoid using them one can simply set more labels on these scrape jobs:
+
+```yaml
+- job_name: myservice_cluster1_production
+ [...]
+ relabel_configs:
+ - target_label: job
+ replacement: myservice
+ - target_label: cluster
+ replacement: cluster1
+ - target_label: env
+ replacement: production
+
+- job_name: myservice_cluster2_production
+ [...]
+ relabel_configs:
+ - target_label: job
+ replacement: myservice
+ - target_label: cluster
+ replacement: cluster2
+ - target_label: env
+ replacement: production
+
+- job_name: myservice_cluster1_staging
+ [...]
+ relabel_configs:
+ - target_label: job
+ replacement: myservice
+ - target_label: cluster
+ replacement: cluster1
+ - target_label: env
+ replacement: staging
+```
+
+Which will result in time series with explicit labels:
+
+```js
+{job="myservice", cluster="cluster1", env="production", instance="..."}
+{job="myservice", cluster="cluster2", env="production", instance="..."}
+{job="myservice", cluster="cluster1", env="staging", instance="..."}
+```
+
+And simple explicit queries:
+
+```yaml
+- alert: Scrape failed
+ expr: up{job="myservice", env="production"} == 0
+```
 
 ## Configuration
 

diff --git a/internal/checks/promql_regexp.go b/internal/checks/promql_regexp.go
@@ -85,15 +85,19 @@ func (c RegexpCheck) Check(_ context.Context, _ discovery.Path, rule parser.Rule
  re = "^(?:" + re + ")$"
  }
 
- var hasFlags, isUseful, isWildcard, isLiteral, isBad bool
+ var hasFlags, isUseful, isWildcard, isLiteral, isBad, isSmelly bool
  var beginText, endText int
+ var lastOp syntax.Op
  r, _ := syntax.Parse(re, syntax.Perl)
  for _, s := range r.Sub {
  // If effective flags are different from default flags then we assume regexp is useful.
  // It could be case sensitive match.
  if s.Flags > 0 && s.Flags != syntax.Perl {
  hasFlags = true
  }
+ if isOpSmelly(s.Op, lastOp) {
+ isSmelly = true
+ }
  // nolint: exhaustive
  switch s.Op {
  case syntax.OpBeginText:
@@ -114,6 +118,7 @@ func (c RegexpCheck) Check(_ context.Context, _ discovery.Path, rule parser.Rule
  default:
  isUseful = true
  }
+ lastOp = s.Op
  }
  if hasFlags && !isWildcard {
  isUseful = true
@@ -137,12 +142,16 @@ func (c RegexpCheck) Check(_ context.Context, _ discovery.Path, rule parser.Rule
  bad = append(bad, badMatcher{lm: lm, badAnchor: true})
  isBad = true
  }
+ if isSmelly {
+ bad = append(bad, badMatcher{lm: lm, isSmelly: true})
+ }
  if !isBad {
  good = append(good, lm)
  }
  }
  for _, b := range bad {
  var text string
+ s := Bug
  switch {
  case b.badAnchor:
  text = fmt.Sprintf("Prometheus regexp matchers are automatically fully anchored so match for `%s` will result in `%s%s\"^%s$\"`, remove regexp anchors `^` and/or `$`.",
@@ -154,6 +163,9 @@ func (c RegexpCheck) Check(_ context.Context, _ discovery.Path, rule parser.Rule
  case b.isWildcard && b.op == labels.MatchNotEqual:
  text = fmt.Sprintf("Unnecessary wildcard regexp, simply use `%s` if you want to match on all time series for `%s` without the `%s` label.",
  makeLabel(name, slices.Concat(good, []*labels.Matcher{{Type: labels.MatchEqual, Name: b.lm.Name, Value: ""}})...), name, b.lm.Name)
+ case b.isSmelly:
+ text = fmt.Sprintf("`{%s}` looks like a smelly selector that tries to extract substrings from the value, please consider breaking down the value of this label into multiple smaller labels", b.lm.String())
+ s = Warning
  default:
  text = fmt.Sprintf("Unnecessary regexp match on static string `%s`, use `%s%s%q` instead.",
  b.lm, b.lm.Name, b.op, b.lm.Value)
@@ -164,7 +176,7 @@ func (c RegexpCheck) Check(_ context.Context, _ discovery.Path, rule parser.Rule
  Reporter: c.Reporter(),
  Text: text,
  Details: RegexpCheckDetails,
- Severity: Bug,
+ Severity: s,
  })
  }
  }
@@ -176,6 +188,7 @@ type badMatcher struct {
  lm *labels.Matcher
  op labels.MatchType
  isWildcard bool
+ isSmelly bool
  badAnchor bool
 }
 
@@ -203,3 +216,13 @@ func makeLabel(name string, matchers ...*labels.Matcher) string {
  b.WriteRune('}')
  return b.String()
 }
+
+func isOpSmelly(a, b syntax.Op) bool {
+ if a == syntax.OpLiteral && (b == syntax.OpStar || b == syntax.OpPlus) {
+ return true
+ }
+ if b == syntax.OpLiteral && (a == syntax.OpStar || a == syntax.OpPlus) {
+ return true
+ }
+ return false
+}
diff --git a/internal/checks/promql_regexp_test.go b/internal/checks/promql_regexp_test.go
@@ -30,7 +30,7 @@ func TestRegexpCheck(t *testing.T) {
  },
  {
  description: "valid regexp",
- content: "- record: foo\n expr: foo{job=~\"bar.+\"}\n",
+ content: "- record: foo\n expr: foo{job=~\"bar|foo\"}\n",
  checker: newRegexpCheck,
  prometheus: noProm,
  problems: noProblems,
@@ -51,7 +51,7 @@ func TestRegexpCheck(t *testing.T) {
  },
  {
  description: "valid partial regexp",
- content: "- record: foo\n expr: foo{job=~\"prefix.*\"}\n",
+ content: "- record: foo\n expr: foo{job=~\"prefix(a|b)\"}\n",
  checker: newRegexpCheck,
  prometheus: noProm,
  problems: noProblems,
@@ -404,6 +404,33 @@ func TestRegexpCheck(t *testing.T) {
  prometheus: noProm,
  problems: noProblems,
  },
+ {
+ description: "smelly selector",
+ content: "- record: foo\n expr: foo{job=~\"service_.*_prod\"}\n",
+ checker: newRegexpCheck,
+ prometheus: noProm,
+ problems: func(_ string) []checks.Problem {
+ return []checks.Problem{
+ {
+ Lines: parser.LineRange{
+ First: 2,
+ Last: 2,
+ },
+ Reporter: checks.RegexpCheckName,
+ Text: "`{job=~\"service_.*_prod\"}` looks like a smelly selector that tries to extract substrings from the value, please consider breaking down the value of this label into multiple smaller labels",
+ Details: checks.RegexpCheckDetails,
+ Severity: checks.Warning,
+ },
+ }
+ },
+ },
+ {
+ description: "non-smelly selector",
+ content: "- record: foo\n expr: rate(http_requests_total{job=\"foo\", code=~\"5..\"}[5m])\n",
+ checker: newRegexpCheck,
+ prometheus: noProm,
+ problems: noProblems,
+ },
  }
  runTests(t, testCases)
 }