From 9b01f7f99f4d306df1d92fd685fe282770936c3e Mon Sep 17 00:00:00 2001
From: Lukasz Mierzwa <lukasz@cloudflare.com>
Date: Fri, 16 Aug 2024 13:50:03 +0100
Subject: [PATCH] Look for smelly selectors

---
 .github/spellcheck/wordlist.txt       |   1 +
 docs/changelog.md                     |   7 ++
 docs/checks/promql/regexp.md          | 107 +++++++++++++++++++++++++-
 internal/checks/promql_regexp.go      |  27 ++++++-
 internal/checks/promql_regexp_test.go |  31 +++++++-
 5 files changed, 168 insertions(+), 5 deletions(-)

diff --git a/.github/spellcheck/wordlist.txt b/.github/spellcheck/wordlist.txt
index ed9b61d7..ef607f70 100644
--- a/.github/spellcheck/wordlist.txt
+++ b/.github/spellcheck/wordlist.txt
@@ -9,6 +9,7 @@ CLI
 cloudflare
 Cloudflare
 config
+configs
 Deduplicate
 deduplicated
 dir
diff --git a/docs/changelog.md b/docs/changelog.md
index e53623b8..cac01f47 100644
--- a/docs/changelog.md
+++ b/docs/changelog.md
@@ -1,5 +1,12 @@
 # Changelog
 
+## v0.65.0
+
+### Added
+
+- [promql/regexp](checks/promql/regexp.md) check will now look for smelly regexp selectors.
+  See check docs for details.
+
 ## v0.64.1
 
 ### Fixed
diff --git a/docs/checks/promql/regexp.md b/docs/checks/promql/regexp.md
index 09b25dd1..85fd7251 100644
--- a/docs/checks/promql/regexp.md
+++ b/docs/checks/promql/regexp.md
@@ -6,6 +6,11 @@ grand_parent: Documentation
 
 # promql/regexp
 
+This checks will inspect all query metric selectors that use regexp matchers
+to look for common problems.
+
+## Unnecessary regexp selectors
+
 This check will warn if metric selector uses a regexp match but the regexp query
 doesn't have any patterns and so a simple string equality match can be used.
 Since regexp checks are more expensive using a simple equality check if
@@ -26,13 +31,113 @@ Example of a query that wouldn't trigger this warning:
 foo{job=~"bar|baz"}
 ```
 
-Another problem this check will report on is redundant regexp anchors.
+## Redundant anchors
+
 As noted on [Querying Prometheus](https://prometheus.io/docs/prometheus/latest/querying/basics/)
 page Prometheus fully anchors all regex matchers.
 So a query match using `foo=~"bar.*"` will be parsed as `foo=~"^bar.*$"` and
 so any anchors used in the query will be redundant.
 This means that passing `foo=~"^bar.*$"` to the query will be parsed as
 `foo=~"^^bar.*$$"`, so both `^` and `$` should be skipped to avoid it.
+This check will report selectors with redundant anchors.
+
+## Smelly selectors
+
+Metric labels are suppose to be opaque strings that you use for filtering, but sometimes
+you might end up with a single label value that is really a few different strings concatenated together,
+rather than a few different labels. Which then forces you to use regexp label selectors that target
+individual parts of that one very long label.
+
+An example of that is a `job` label that includes too much information and instead of just being a name.
+Since the `job` labels value is (by default) equal to the `job_name` field on a scrape configuration block
+it's easy to end up with one very long string if you need to create a few similar scrape configs:
+
+```yaml
+- job_name: myservice_cluster1_production
+  [...]
+
+- job_name: myservice_cluster2_production
+  [...]
+
+- job_name: myservice_cluster1_staging
+  [...]
+```
+
+In the configs above we end up with `job` label holding extra information:
+
+- the environment in which the service is deployed (`production` or `staging`)
+- the name of the cluster (`cluster1` or `cluster2`)
+
+And the time series we will scrape using this config will look like this:
+
+```js
+{job="myservice_cluster1_production", instance="..."}
+{job="myservice_cluster2_production", instance="..."}
+{job="myservice_cluster1_staging", instance="..."}
+```
+
+If we then wanted to query metrics only for the `production` environment we have to use a regexp:
+
+```yaml
+- alert: Scrape failed
+  expr: up{job=~"myservice_.+_production"} == 0
+```
+
+This isn't ideal for a number of reasons:
+
+- It's a lot less obvious what's happening here and why.
+- It's much easier to have a typo in a long regexp expression.
+- It's a lot harder to aggregate by cluster or environment this way.
+
+This is why regexp selector like this are a code smell that should be avoided.
+To avoid using them one can simply set more labels on these scrape jobs:
+
+```yaml
+- job_name: myservice_cluster1_production
+  [...]
+  relabel_configs:
+    - target_label: job
+      replacement: myservice
+    - target_label: cluster
+      replacement: cluster1
+    - target_label: env
+      replacement: production
+
+- job_name: myservice_cluster2_production
+  [...]
+  relabel_configs:
+    - target_label: job
+      replacement: myservice
+    - target_label: cluster
+      replacement: cluster2
+    - target_label: env
+      replacement: production
+
+- job_name: myservice_cluster1_staging
+  [...]
+  relabel_configs:
+    - target_label: job
+      replacement: myservice
+    - target_label: cluster
+      replacement: cluster1
+    - target_label: env
+      replacement: staging
+```
+
+Which will result in time series with explicit labels:
+
+```js
+{job="myservice", cluster="cluster1", env="production", instance="..."}
+{job="myservice", cluster="cluster2", env="production", instance="..."}
+{job="myservice", cluster="cluster1", env="staging", instance="..."}
+```
+
+And simple explicit queries:
+
+```yaml
+- alert: Scrape failed
+  expr: up{job="myservice", env="production"} == 0
+```
 
 ## Configuration
 
diff --git a/internal/checks/promql_regexp.go b/internal/checks/promql_regexp.go
index 4f7f88cb..30f5292b 100644
--- a/internal/checks/promql_regexp.go
+++ b/internal/checks/promql_regexp.go
@@ -85,8 +85,9 @@ func (c RegexpCheck) Check(_ context.Context, _ discovery.Path, rule parser.Rule
 				re = "^(?:" + re + ")$"
 			}
 
-			var hasFlags, isUseful, isWildcard, isLiteral, isBad bool
+			var hasFlags, isUseful, isWildcard, isLiteral, isBad, isSmelly bool
 			var beginText, endText int
+			var lastOp syntax.Op
 			r, _ := syntax.Parse(re, syntax.Perl)
 			for _, s := range r.Sub {
 				// If effective flags are different from default flags then we assume regexp is useful.
@@ -94,6 +95,9 @@ func (c RegexpCheck) Check(_ context.Context, _ discovery.Path, rule parser.Rule
 				if s.Flags > 0 && s.Flags != syntax.Perl {
 					hasFlags = true
 				}
+				if isOpSmelly(s.Op, lastOp) {
+					isSmelly = true
+				}
 				// nolint: exhaustive
 				switch s.Op {
 				case syntax.OpBeginText:
@@ -114,6 +118,7 @@ func (c RegexpCheck) Check(_ context.Context, _ discovery.Path, rule parser.Rule
 				default:
 					isUseful = true
 				}
+				lastOp = s.Op
 			}
 			if hasFlags && !isWildcard {
 				isUseful = true
@@ -137,12 +142,16 @@ func (c RegexpCheck) Check(_ context.Context, _ discovery.Path, rule parser.Rule
 				bad = append(bad, badMatcher{lm: lm, badAnchor: true})
 				isBad = true
 			}
+			if isSmelly {
+				bad = append(bad, badMatcher{lm: lm, isSmelly: true})
+			}
 			if !isBad {
 				good = append(good, lm)
 			}
 		}
 		for _, b := range bad {
 			var text string
+			s := Bug
 			switch {
 			case b.badAnchor:
 				text = fmt.Sprintf("Prometheus regexp matchers are automatically fully anchored so match for `%s` will result in `%s%s\"^%s$\"`, remove regexp anchors `^` and/or `$`.",
@@ -154,6 +163,9 @@ func (c RegexpCheck) Check(_ context.Context, _ discovery.Path, rule parser.Rule
 			case b.isWildcard && b.op == labels.MatchNotEqual:
 				text = fmt.Sprintf("Unnecessary wildcard regexp, simply use `%s` if you want to match on all time series for `%s` without the `%s` label.",
 					makeLabel(name, slices.Concat(good, []*labels.Matcher{{Type: labels.MatchEqual, Name: b.lm.Name, Value: ""}})...), name, b.lm.Name)
+			case b.isSmelly:
+				text = fmt.Sprintf("`{%s}` looks like a smelly selector that tries to extract substrings from the value, please consider breaking down the value of this label into multiple smaller labels", b.lm.String())
+				s = Warning
 			default:
 				text = fmt.Sprintf("Unnecessary regexp match on static string `%s`, use `%s%s%q` instead.",
 					b.lm, b.lm.Name, b.op, b.lm.Value)
@@ -164,7 +176,7 @@ func (c RegexpCheck) Check(_ context.Context, _ discovery.Path, rule parser.Rule
 				Reporter: c.Reporter(),
 				Text:     text,
 				Details:  RegexpCheckDetails,
-				Severity: Bug,
+				Severity: s,
 			})
 		}
 	}
@@ -176,6 +188,7 @@ type badMatcher struct {
 	lm         *labels.Matcher
 	op         labels.MatchType
 	isWildcard bool
+	isSmelly   bool
 	badAnchor  bool
 }
 
@@ -203,3 +216,13 @@ func makeLabel(name string, matchers ...*labels.Matcher) string {
 	b.WriteRune('}')
 	return b.String()
 }
+
+func isOpSmelly(a, b syntax.Op) bool {
+	if a == syntax.OpLiteral && (b == syntax.OpStar || b == syntax.OpPlus) {
+		return true
+	}
+	if b == syntax.OpLiteral && (a == syntax.OpStar || a == syntax.OpPlus) {
+		return true
+	}
+	return false
+}
diff --git a/internal/checks/promql_regexp_test.go b/internal/checks/promql_regexp_test.go
index 9870ca3c..5afa491f 100644
--- a/internal/checks/promql_regexp_test.go
+++ b/internal/checks/promql_regexp_test.go
@@ -30,7 +30,7 @@ func TestRegexpCheck(t *testing.T) {
 		},
 		{
 			description: "valid regexp",
-			content:     "- record: foo\n  expr: foo{job=~\"bar.+\"}\n",
+			content:     "- record: foo\n  expr: foo{job=~\"bar|foo\"}\n",
 			checker:     newRegexpCheck,
 			prometheus:  noProm,
 			problems:    noProblems,
@@ -51,7 +51,7 @@ func TestRegexpCheck(t *testing.T) {
 		},
 		{
 			description: "valid partial regexp",
-			content:     "- record: foo\n  expr: foo{job=~\"prefix.*\"}\n",
+			content:     "- record: foo\n  expr: foo{job=~\"prefix(a|b)\"}\n",
 			checker:     newRegexpCheck,
 			prometheus:  noProm,
 			problems:    noProblems,
@@ -404,6 +404,33 @@ func TestRegexpCheck(t *testing.T) {
 			prometheus:  noProm,
 			problems:    noProblems,
 		},
+		{
+			description: "smelly selector",
+			content:     "- record: foo\n  expr: foo{job=~\"service_.*_prod\"}\n",
+			checker:     newRegexpCheck,
+			prometheus:  noProm,
+			problems: func(_ string) []checks.Problem {
+				return []checks.Problem{
+					{
+						Lines: parser.LineRange{
+							First: 2,
+							Last:  2,
+						},
+						Reporter: checks.RegexpCheckName,
+						Text:     "`{job=~\"service_.*_prod\"}` looks like a smelly selector that tries to extract substrings from the value, please consider breaking down the value of this label into multiple smaller labels",
+						Details:  checks.RegexpCheckDetails,
+						Severity: checks.Warning,
+					},
+				}
+			},
+		},
+		{
+			description: "non-smelly selector",
+			content:     "- record: foo\n  expr: rate(http_requests_total{job=\"foo\", code=~\"5..\"}[5m])\n",
+			checker:     newRegexpCheck,
+			prometheus:  noProm,
+			problems:    noProblems,
+		},
 	}
 	runTests(t, testCases)
 }