Skip to content

Commit

Permalink
Merge pull request #1050 from cloudflare/truncate
Browse files Browse the repository at this point in the history
Don't list all possible prometheus servers
  • Loading branch information
prymitive committed Jul 31, 2024
2 parents 9155063 + fe18dfd commit 160a26c
Show file tree
Hide file tree
Showing 3 changed files with 152 additions and 8 deletions.
17 changes: 14 additions & 3 deletions internal/checks/base_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -93,12 +93,15 @@ type problemsFn func(string) []checks.Problem

type newPrometheusFn func(string) *promapi.FailoverGroup

type newCtxFn func() context.Context
type newCtxFn func(string) context.Context

type otherPromsFn func(string) []*promapi.FailoverGroup

type checkTest struct {
description string
content string
prometheus newPrometheusFn
otherProms otherPromsFn
ctx newCtxFn
checker newCheckFn
entries []discovery.Entry
Expand Down Expand Up @@ -130,20 +133,28 @@ func runTests(t *testing.T, testCases []checkTest) {
}

var proms []*promapi.FailoverGroup
reg := prometheus.NewRegistry()
prom := tc.prometheus(uri)
if prom != nil {
proms = append(proms, prom)
reg := prometheus.NewRegistry()
prom.StartWorkers(reg)
defer prom.Close(reg)
}

if tc.otherProms != nil {
for _, op := range tc.otherProms(uri) {
proms = append(proms, op)
op.StartWorkers(reg)
defer op.Close(reg)
}
}

entries, err := parseContent(tc.content)
require.NoError(t, err, "cannot parse rule content")
for _, entry := range entries {
ctx := context.WithValue(context.Background(), promapi.AllPrometheusServers, proms)
if tc.ctx != nil {
ctx = tc.ctx()
ctx = tc.ctx(uri)
}
problems := tc.checker(prom).Check(ctx, entry.Path, entry.Rule, tc.entries)
require.Equal(t, tc.problems(uri), problems)
Expand Down
12 changes: 11 additions & 1 deletion internal/checks/promql_series.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ import (
"fmt"
"log/slog"
"regexp"
"strconv"
"strings"
"time"

Expand Down Expand Up @@ -573,7 +574,7 @@ func (c SeriesCheck) checkOtherServer(ctx context.Context, query string) string
buf.WriteString(query)
buf.WriteString("` was found on other prometheus servers:\n\n")

var matches int
var matches, skipped int
for _, prom := range servers {
slog.Debug("Checking if metric exists on any other Prometheus server", slog.String("check", c.Reporter()), slog.String("selector", query))

Expand All @@ -591,6 +592,10 @@ func (c SeriesCheck) checkOtherServer(ctx context.Context, query string) string

if series > 0 {
matches++
if matches > 10 {
skipped++
continue
}
buf.WriteString("- [")
buf.WriteString(prom.Name())
buf.WriteString("](")
Expand All @@ -600,6 +605,11 @@ func (c SeriesCheck) checkOtherServer(ctx context.Context, query string) string
buf.WriteString(")\n")
}
}
if skipped > 0 {
buf.WriteString("- and ")
buf.WriteString(strconv.Itoa(skipped))
buf.WriteString(" other server(s).\n")
}

buf.WriteString("\nYou might be trying to deploy this rule to the wrong Prometheus server instance.\n")

Expand Down
131 changes: 127 additions & 4 deletions internal/checks/promql_series_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -437,7 +437,7 @@ func TestSeriesCheck(t *testing.T) {
{
description: "#2 series never present, custom range",
content: "- record: foo\n expr: sum(notfound)\n",
ctx: func() context.Context {
ctx: func(_ string) context.Context {
s := checks.PromqlSeriesSettings{
LookbackRange: "3d",
LookbackStep: "6m",
Expand Down Expand Up @@ -669,7 +669,7 @@ func TestSeriesCheck(t *testing.T) {
description: "#2 series never present but metric ignored",
content: "- record: foo\n expr: sum(notfound)\n",
checker: newSeriesCheck,
ctx: func() context.Context {
ctx: func(_ string) context.Context {
s := checks.PromqlSeriesSettings{
IgnoreMetrics: []string{"foo", "bar", "not.+"},
}
Expand Down Expand Up @@ -1413,7 +1413,7 @@ func TestSeriesCheck(t *testing.T) {
description: "#4 metric was present but disappeared over 1h ago / ignored",
content: "- record: foo\n expr: sum(found{job=\"foo\", instance=\"bar\"})\n",
checker: newSeriesCheck,
ctx: func() context.Context {
ctx: func(_ string) context.Context {
s := checks.PromqlSeriesSettings{
IgnoreMetrics: []string{"foo", "found", "not.+"},
}
Expand Down Expand Up @@ -1990,7 +1990,7 @@ func TestSeriesCheck(t *testing.T) {
description: "#5 metric was present but not with label value",
content: "- record: foo\n expr: sum(found{notfound=\"notfound\", instance=~\".+\", not!=\"negative\", instance!~\"bad\"})\n",
checker: newSeriesCheck,
ctx: func() context.Context {
ctx: func(_ string) context.Context {
s := checks.PromqlSeriesSettings{
IgnoreMetrics: []string{"foo", "bar", "found"},
}
Expand Down Expand Up @@ -3771,6 +3771,129 @@ func TestSeriesCheck(t *testing.T) {
}
},
},
{
description: "series not present on other servers",
content: "- record: foo\n expr: notfound\n",
checker: newSeriesCheck,
prometheus: newSimpleProm,
otherProms: func(uri string) []*promapi.FailoverGroup {
var proms []*promapi.FailoverGroup
for i := range 5 {
proms = append(proms, simpleProm(fmt.Sprintf("prom%d", i), uri+"/other", time.Second, false))
}
return proms
},
problems: func(uri string) []checks.Problem {
return []checks.Problem{
{
Lines: parser.LineRange{
First: 2,
Last: 2,
},
Reporter: checks.SeriesCheckName,
Text: noMetricText("prom", uri, "notfound", "1w"),
Details: checks.SeriesCheckCommonProblemDetails,
Severity: checks.Bug,
},
}
},
mocks: []*prometheusMock{
{
conds: []requestCondition{requestPathCond{path: "/other/api/v1/query"}},
resp: respondWithEmptyVector(),
},
{
conds: []requestCondition{requireQueryPath},
resp: respondWithEmptyVector(),
},
{
conds: []requestCondition{requireRangeQueryPath},
resp: respondWithEmptyMatrix(),
},
},
},
{
description: "series present on other servers",
content: "- record: foo\n expr: notfound\n",
checker: newSeriesCheck,
prometheus: newSimpleProm,
otherProms: func(uri string) []*promapi.FailoverGroup {
var proms []*promapi.FailoverGroup
for i := range 5 {
proms = append(proms, simpleProm(fmt.Sprintf("prom%d", i), uri+"/other", time.Second, false))
}
return proms
},
problems: func(uri string) []checks.Problem {
return []checks.Problem{
{
Lines: parser.LineRange{
First: 2,
Last: 2,
},
Reporter: checks.SeriesCheckName,
Text: noMetricText("prom", uri, "notfound", "1w"),
Details: fmt.Sprintf("`notfound` was found on other prometheus servers:\n\n- [prom0](%s/other/graph?g0.expr=notfound)\n- [prom1](%s/other/graph?g0.expr=notfound)\n- [prom2](%s/other/graph?g0.expr=notfound)\n- [prom3](%s/other/graph?g0.expr=notfound)\n- [prom4](%s/other/graph?g0.expr=notfound)\n\nYou might be trying to deploy this rule to the wrong Prometheus server instance.\n", uri, uri, uri, uri, uri),
Severity: checks.Bug,
},
}
},
mocks: []*prometheusMock{
{
conds: []requestCondition{requestPathCond{path: "/other/api/v1/query"}},
resp: respondWithSingleInstantVector(),
},
{
conds: []requestCondition{requireQueryPath},
resp: respondWithEmptyVector(),
},
{
conds: []requestCondition{requireRangeQueryPath},
resp: respondWithEmptyMatrix(),
},
},
},
{
description: "series present on other servers / 15",
content: "- record: foo\n expr: notfound\n",
checker: newSeriesCheck,
prometheus: newSimpleProm,
otherProms: func(uri string) []*promapi.FailoverGroup {
var proms []*promapi.FailoverGroup
for i := range 15 {
proms = append(proms, simpleProm(fmt.Sprintf("prom%d", i), uri+"/other", time.Second, false))
}
return proms
},
problems: func(uri string) []checks.Problem {
return []checks.Problem{
{
Lines: parser.LineRange{
First: 2,
Last: 2,
},
Reporter: checks.SeriesCheckName,
Text: noMetricText("prom", uri, "notfound", "1w"),
Details: fmt.Sprintf("`notfound` was found on other prometheus servers:\n\n- [prom0](%s/other/graph?g0.expr=notfound)\n- [prom1](%s/other/graph?g0.expr=notfound)\n- [prom2](%s/other/graph?g0.expr=notfound)\n- [prom3](%s/other/graph?g0.expr=notfound)\n- [prom4](%s/other/graph?g0.expr=notfound)\n- [prom5](%s/other/graph?g0.expr=notfound)\n- [prom6](%s/other/graph?g0.expr=notfound)\n- [prom7](%s/other/graph?g0.expr=notfound)\n- [prom8](%s/other/graph?g0.expr=notfound)\n- [prom9](%s/other/graph?g0.expr=notfound)\n- and 5 other server(s).\n\nYou might be trying to deploy this rule to the wrong Prometheus server instance.\n", uri, uri, uri, uri, uri, uri, uri, uri, uri, uri),
Severity: checks.Bug,
},
}
},
mocks: []*prometheusMock{
{
conds: []requestCondition{requestPathCond{path: "/other/api/v1/query"}},
resp: respondWithSingleInstantVector(),
},
{
conds: []requestCondition{requireQueryPath},
resp: respondWithEmptyVector(),
},
{
conds: []requestCondition{requireRangeQueryPath},
resp: respondWithEmptyMatrix(),
},
},
},
}
runTests(t, testCases)
}

0 comments on commit 160a26c

Please sign in to comment.