Skip to content

Commit

Permalink
Merge pull request #767 from cloudflare/alerts/external_labels
Browse files Browse the repository at this point in the history
Added alerts/external_labels check
  • Loading branch information
prymitive authored Oct 31, 2023
2 parents 8a7b01d + 88c3052 commit 4dbe0ab
Show file tree
Hide file tree
Showing 16 changed files with 1,043 additions and 459 deletions.
8 changes: 4 additions & 4 deletions cmd/pint/tests/0023_enabled_checks.txt
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
pint.error -l debug --no-color lint rules
! stdout .
stderr 'level=DEBUG msg="Configured checks for rule" enabled=\["promql/syntax","alerts/for","alerts/comparison","alerts/template","promql/fragile","promql/regexp","promql/rate\(prom\)","promql/series\(prom\)","promql/vector_matching\(prom\)"\,"promql/range_query\(prom\)","rule/duplicate\(prom\)","labels/conflict\(prom\)"] path=rules/1.yaml rule=one'
stderr 'level=DEBUG msg="Configured checks for rule" enabled=\["promql/syntax","alerts/for","alerts/comparison","alerts/template","promql/fragile","promql/regexp","promql/rate\(prom\)","promql/series\(prom\)","promql/vector_matching\(prom\)"\,"promql/range_query\(prom\)","rule/duplicate\(prom\)","labels/conflict\(prom\)"] path=rules/1.yaml rule=two'
stderr 'level=DEBUG msg="Configured checks for rule" enabled=\["promql/syntax","alerts/for","alerts/comparison","alerts/template","promql/fragile","promql/regexp","promql/rate\(prom\)","promql/series\(prom\)","promql/vector_matching\(prom\)"\,"promql/range_query\(prom\)","rule/duplicate\(prom\)","labels/conflict\(prom\)"] path=rules/2.yaml rule=one'
stderr 'level=DEBUG msg="Configured checks for rule" enabled=\["promql/syntax","alerts/for","alerts/comparison","alerts/template","promql/fragile","promql/regexp","promql/rate\(prom\)","promql/series\(prom\)","promql/vector_matching\(prom\)"\,"promql/range_query\(prom\)","rule/duplicate\(prom\)","labels/conflict\(prom\)"] path=rules/2.yaml rule=two'
stderr 'level=DEBUG msg="Configured checks for rule" enabled=\["promql/syntax","alerts/for","alerts/comparison","alerts/template","promql/fragile","promql/regexp","promql/rate\(prom\)","promql/series\(prom\)","promql/vector_matching\(prom\)"\,"promql/range_query\(prom\)","rule/duplicate\(prom\)","labels/conflict\(prom\)","alerts/external_labels\(prom\)"] path=rules/1.yaml rule=one'
stderr 'level=DEBUG msg="Configured checks for rule" enabled=\["promql/syntax","alerts/for","alerts/comparison","alerts/template","promql/fragile","promql/regexp","promql/rate\(prom\)","promql/series\(prom\)","promql/vector_matching\(prom\)"\,"promql/range_query\(prom\)","rule/duplicate\(prom\)","labels/conflict\(prom\)","alerts/external_labels\(prom\)"] path=rules/1.yaml rule=two'
stderr 'level=DEBUG msg="Configured checks for rule" enabled=\["promql/syntax","alerts/for","alerts/comparison","alerts/template","promql/fragile","promql/regexp","promql/rate\(prom\)","promql/series\(prom\)","promql/vector_matching\(prom\)"\,"promql/range_query\(prom\)","rule/duplicate\(prom\)","labels/conflict\(prom\)","alerts/external_labels\(prom\)"] path=rules/2.yaml rule=one'
stderr 'level=DEBUG msg="Configured checks for rule" enabled=\["promql/syntax","alerts/for","alerts/comparison","alerts/template","promql/fragile","promql/regexp","promql/rate\(prom\)","promql/series\(prom\)","promql/vector_matching\(prom\)"\,"promql/range_query\(prom\)","rule/duplicate\(prom\)","labels/conflict\(prom\)","alerts/external_labels\(prom\)"] path=rules/2.yaml rule=two'

-- rules/1.yaml --
- record: one
Expand Down
1 change: 1 addition & 0 deletions cmd/pint/tests/0025_config.txt
Original file line number Diff line number Diff line change
Expand Up @@ -98,6 +98,7 @@ level=INFO msg="Loading configuration file" path=.pint.hcl
"enabled": [
"alerts/annotation",
"alerts/count",
"alerts/external_labels",
"alerts/for",
"alerts/template",
"labels/conflict",
Expand Down
3 changes: 3 additions & 0 deletions cmd/pint/tests/0037_disable_checks.txt
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,9 @@ prometheus "prom" {
uri = "http://127.0.0.1"
timeout = "5s"
}
checks {
disabled = [ "alerts/external_labels" ]
}
rule {
match {
kind = "recording"
Expand Down
4 changes: 4 additions & 0 deletions cmd/pint/tests/0054_watch_metrics_prometheus.txt
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,8 @@ parser {
# TYPE pint_check_duration_seconds summary
pint_check_duration_seconds_sum{check="alerts/comparison"}
pint_check_duration_seconds_count{check="alerts/comparison"}
pint_check_duration_seconds_sum{check="alerts/external_labels"}
pint_check_duration_seconds_count{check="alerts/external_labels"}
pint_check_duration_seconds_sum{check="alerts/for"}
pint_check_duration_seconds_count{check="alerts/for"}
pint_check_duration_seconds_sum{check="alerts/template"}
Expand Down Expand Up @@ -89,6 +91,8 @@ pint_problem{filename="rules/1.yml",kind="recording",name="aggregate",owner="",p
pint_problem{filename="rules/1.yml",kind="recording",name="aggregate",owner="",problem="couldn't run \"promql/series\" checks due to prometheus \"prom2\" at http://127.0.0.1:1054 connection error: connection refused",reporter="promql/series",severity="bug"}
pint_problem{filename="rules/1.yml",kind="recording",name="aggregate",owner="",problem="prometheus \"prom1\" at http://127.0.0.1:7054 failed with: bad_data: bogus query",reporter="promql/series",severity="bug"}
pint_problem{filename="rules/1.yml",kind="recording",name="broken",owner="",problem="syntax error: no arguments for aggregate expression provided",reporter="promql/syntax",severity="fatal"}
pint_problem{filename="rules/2.yml",kind="alerting",name="comparison",owner="bob and alice",problem="couldn't run \"alerts/external_labels\" checks due to prometheus \"prom1\" at http://127.0.0.1:7054 connection error: server_error: server error: 500",reporter="alerts/external_labels",severity="bug"}
pint_problem{filename="rules/2.yml",kind="alerting",name="comparison",owner="bob and alice",problem="couldn't run \"alerts/external_labels\" checks due to prometheus \"prom2\" at http://127.0.0.1:1054 connection error: connection refused",reporter="alerts/external_labels",severity="bug"}
pint_problem{filename="rules/2.yml",kind="alerting",name="comparison",owner="bob and alice",problem="couldn't run \"promql/range_query\" checks due to prometheus \"prom2\" at http://127.0.0.1:1054 connection error: connection refused",reporter="promql/range_query",severity="bug"}
pint_problem{filename="rules/2.yml",kind="alerting",name="comparison",owner="bob and alice",problem="couldn't run \"promql/rate\" checks due to prometheus \"prom1\" at http://127.0.0.1:7054 connection error: server_error: server error: 500",reporter="promql/rate",severity="bug"}
pint_problem{filename="rules/2.yml",kind="alerting",name="comparison",owner="bob and alice",problem="couldn't run \"promql/rate\" checks due to prometheus \"prom2\" at http://127.0.0.1:1054 connection error: connection refused",reporter="promql/rate",severity="bug"}
Expand Down
2 changes: 2 additions & 0 deletions cmd/pint/tests/0057_watch_metrics_prometheus_ignore.txt
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,8 @@ parser {
# TYPE pint_check_duration_seconds summary
pint_check_duration_seconds_sum{check="alerts/comparison"}
pint_check_duration_seconds_count{check="alerts/comparison"}
pint_check_duration_seconds_sum{check="alerts/external_labels"}
pint_check_duration_seconds_count{check="alerts/external_labels"}
pint_check_duration_seconds_sum{check="alerts/for"}
pint_check_duration_seconds_count{check="alerts/for"}
pint_check_duration_seconds_sum{check="alerts/template"}
Expand Down
2 changes: 1 addition & 1 deletion cmd/pint/tests/0103_file_disable.txt
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ level=INFO msg="Configured new Prometheus server" name=prom uris=1 tags=[] inclu
level=DEBUG msg="Generated all Prometheus servers" count=1
level=DEBUG msg="Found recording rule" path=rules/0001.yml record=colo:test1 lines=9-10
level=DEBUG msg="Starting query workers" name=prom uri=http://127.0.0.1:7103 workers=16
level=DEBUG msg="Configured checks for rule" enabled=["promql/syntax","alerts/for","alerts/comparison","alerts/template","promql/fragile","promql/regexp","promql/vector_matching(prom)","labels/conflict(prom)"] path=rules/0001.yml rule=colo:test1
level=DEBUG msg="Configured checks for rule" enabled=["promql/syntax","alerts/for","alerts/comparison","alerts/template","promql/fragile","promql/regexp","promql/vector_matching(prom)","labels/conflict(prom)","alerts/external_labels(prom)"] path=rules/0001.yml rule=colo:test1
level=DEBUG msg="Stopping query workers" name=prom uri=http://127.0.0.1:7103
-- rules/0001.yml --
# This should skip all online checks
Expand Down
1 change: 1 addition & 0 deletions cmd/pint/tests/0113_config_env_expand.txt
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@ level=INFO msg="Loading configuration file" path=.pint.hcl
"enabled": [
"alerts/annotation",
"alerts/count",
"alerts/external_labels",
"alerts/for",
"alerts/template",
"labels/conflict",
Expand Down
2 changes: 1 addition & 1 deletion cmd/pint/tests/0115_file_disable_tag.txt
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ level=INFO msg="Configured new Prometheus server" name=prom uris=1 tags=["foo","
level=DEBUG msg="Generated all Prometheus servers" count=1
level=DEBUG msg="Found recording rule" path=rules/0001.yml record=colo:test1 lines="6 8"
level=DEBUG msg="Starting query workers" name=prom uri=http://127.0.0.1:7103 workers=16
level=DEBUG msg="Configured checks for rule" enabled=["promql/syntax","alerts/for","alerts/comparison","alerts/template","promql/fragile","promql/regexp"] path=rules/0001.yml rule=colo:test1
level=DEBUG msg="Configured checks for rule" enabled=["promql/syntax","alerts/for","alerts/comparison","alerts/template","promql/fragile","promql/regexp","alerts/external_labels(prom)"] path=rules/0001.yml rule=colo:test1
level=DEBUG msg="Stopping query workers" name=prom uri=http://127.0.0.1:7103
-- rules/0001.yml --
# pint file/disable promql/series(+bar)
Expand Down
6 changes: 6 additions & 0 deletions docs/changelog.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,11 @@
# Changelog

## v0.49.0

### Added

- Added [alerts/external_labels](checks/alerts/external_labels.md) check.

## v0.48.2

### Fixed
Expand Down
118 changes: 118 additions & 0 deletions docs/checks/alerts/external_labels.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,118 @@
---
layout: default
parent: Checks
grand_parent: Documentation
---

# alerts/external_labels

Alerting rules can be templated to render the value of external labels
configured for the Prometheus server these rules are being evaluated on,
by using the `$externalLabels` variable.
[See docs](https://prometheus.io/docs/prometheus/latest/configuration/template_reference/#alert-field-templates).

This check will look for alerting rules referencing external labels that are
not present on given Prometheus server.

If we define the `cluster` label in `global:external_labels`, for example:

```yaml
global:
  external_labels:
    cluster: mycluster
```
Then we can access it in alert rules deployed to that Prometheus server
by using `$externalLabels.cluster` variable:

```yaml
- alert: Abc Is Down
  expr: up{job="abc"} == 0
  annotations:
    summary: "{{ $labels.job }} is down in {{ $externalLabels.cluster }} cluster"
```

But if we try to do that without `cluster` in `global:external_labels` configuration
then `$externalLabels.cluster` will be empty, and this is what this check would
report.

## Configuration

This check doesn't have any configuration options.

## How to enable it

This check is enabled by default for all configured Prometheus servers.

Example:

```js
prometheus "prod" {
uri = "https://prometheus-prod.example.com"
timeout = "60s"
include = [
"rules/prod/.*",
"rules/common/.*",
]
}
prometheus "dev" {
uri = "https://prometheus-dev.example.com"
timeout = "30s"
include = [
"rules/dev/.*",
"rules/common/.*",
]
}
```

## How to disable it

You can disable this check globally by adding this config block:

```js
checks {
disabled = ["alerts/external_labels"]
}
```

You can also disable it for all rules inside given file by adding
a comment anywhere in that file. Example:

```yaml
# pint file/disable alerts/external_labels
```

Or you can disable it per rule by adding a comment to it. Example:

```yaml
# pint disable alerts/external_labels
```

If you want to disable only individual instances of this check
you can add a more specific comment.

```yaml
# pint disable alerts/external_labels($prometheus)
```

Where `$prometheus` is the name of Prometheus server to disable.

Example:

```yaml
# pint disable alerts/external_labels(prod)
```

## How to snooze it

You can disable this check until given time by adding a comment to it. Example:

```yaml
# pint snooze $TIMESTAMP alerts/external_labels
```

Where `$TIMESTAMP` is either an [RFC3339](https://www.rfc-editor.org/rfc/rfc3339)
formatted timestamp or a `YYYY-MM-DD` date.
Adding this comment will disable `alerts/external_labels` _until_ `$TIMESTAMP`, after that
check will be re-enabled.
132 changes: 132 additions & 0 deletions internal/checks/alerts_external_labels.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,132 @@
package checks

import (
"context"
"fmt"

"github.com/cloudflare/pint/internal/discovery"
"github.com/cloudflare/pint/internal/parser"
"github.com/cloudflare/pint/internal/promapi"
)

// AlertsExternalLabelsCheckName is the name under which this check is
// reported and referenced in configuration and control comments.
const AlertsExternalLabelsCheckName = "alerts/external_labels"

// NewAlertsExternalLabelsCheck returns an AlertsExternalLabelsCheck bound to
// the given Prometheus failover group.
func NewAlertsExternalLabelsCheck(prom *promapi.FailoverGroup) AlertsExternalLabelsCheck {
	check := AlertsExternalLabelsCheck{prom: prom}
	return check
}

// AlertsExternalLabelsCheck is the "alerts/external_labels" check. It looks
// for alerting rule templates that reference external labels missing from
// the configuration of the Prometheus server it is bound to.
type AlertsExternalLabelsCheck struct {
// prom is the Prometheus failover group whose configuration is queried.
prom *promapi.FailoverGroup
}

// Meta returns the check metadata, flagging this check as online
// (IsOnline is set to true).
func (c AlertsExternalLabelsCheck) Meta() CheckMeta {
	meta := CheckMeta{IsOnline: true}
	return meta
}

// String returns the check name followed by the name of the Prometheus
// server in parentheses, e.g. "alerts/external_labels(prom)".
func (c AlertsExternalLabelsCheck) String() string {
	promName := c.prom.Name()
	return fmt.Sprintf("%s(%s)", AlertsExternalLabelsCheckName, promName)
}

// Reporter returns the check name without any Prometheus server suffix;
// Check uses it as the Reporter field of every Problem it produces.
func (c AlertsExternalLabelsCheck) Reporter() string {
return AlertsExternalLabelsCheckName
}

// Check inspects the labels and annotations of an alerting rule and reports a
// Bug problem for every external label referenced in a template that is not
// present in global:external_labels of the Prometheus server configuration.
//
// Recording rules and alerting rules whose expression has a syntax error are
// skipped. If the Prometheus configuration cannot be fetched, a single problem
// describing that error is returned instead.
func (c AlertsExternalLabelsCheck) Check(ctx context.Context, _ string, rule parser.Rule, _ []discovery.Entry) (problems []Problem) {
	alert := rule.AlertingRule
	if alert == nil || alert.Expr.SyntaxError != nil {
		return problems
	}

	cfg, err := c.prom.Config(ctx)
	if err != nil {
		text, severity := textAndSeverityFromError(err, c.Reporter(), c.prom.Name(), Bug)
		return append(problems, Problem{
			Fragment: fmt.Sprintf("%s: %s", alert.Alert.Key.Value, alert.Alert.Value.Value),
			Lines:    alert.Lines(),
			Reporter: c.Reporter(),
			Text:     text,
			Severity: severity,
		})
	}

	// missingText renders the problem message for one missing external label.
	missingText := func(name string) string {
		return fmt.Sprintf("template is using %q external label but %s doesn't have this label configured in global:external_labels", name, promText(c.prom.Name(), cfg.URI))
	}

	if alert.Labels != nil {
		for _, label := range alert.Labels.Items {
			// Both the key and the value can contain templates; key findings
			// are reported first, then value findings.
			missing := checkExternalLabels(label.Key.Value, label.Key.Value, cfg.Config.Global.ExternalLabels)
			missing = append(missing, checkExternalLabels(label.Key.Value, label.Value.Value, cfg.Config.Global.ExternalLabels)...)
			for _, name := range missing {
				problems = append(problems, Problem{
					Fragment: fmt.Sprintf("%s: %s", label.Key.Value, label.Value.Value),
					Lines:    label.Lines(),
					Reporter: c.Reporter(),
					Text:     missingText(name),
					Severity: Bug,
				})
			}
		}
	}

	if alert.Annotations != nil {
		for _, annotation := range alert.Annotations.Items {
			// Same key-then-value ordering as for labels above.
			missing := checkExternalLabels(annotation.Key.Value, annotation.Key.Value, cfg.Config.Global.ExternalLabels)
			missing = append(missing, checkExternalLabels(annotation.Key.Value, annotation.Value.Value, cfg.Config.Global.ExternalLabels)...)
			for _, name := range missing {
				problems = append(problems, Problem{
					Fragment: fmt.Sprintf("%s: %s", annotation.Key.Value, annotation.Value.Value),
					Lines:    annotation.Lines(),
					Reporter: c.Reporter(),
					Text:     missingText(name),
					Severity: Bug,
				})
			}
		}
	}

	return problems
}

// checkExternalLabels returns the external label names referenced by the
// template in text that are not keys of externalLabels.
//
// A variable counts as an external label reference when it has more than one
// element and its first element is one of the aliases of ".ExternalLabels";
// the second element is taken as the label name. Each label name is examined
// only once, so the result contains no duplicates and follows the order in
// which names were first seen. It returns nil when findTemplateVariables
// reports failure for the given name and text.
func checkExternalLabels(name, text string, externalLabels map[string]string) (labels []string) {
	vars, aliases, ok := findTemplateVariables(name, text)
	if !ok {
		return nil
	}

	seen := map[string]struct{}{}
	roots := aliases.varAliases(".ExternalLabels")
	for _, path := range vars {
		if len(path) < 2 {
			// Nothing follows the variable root, so no label is referenced.
			continue
		}
		for _, root := range roots {
			if path[0] != root {
				continue
			}
			label := path[1]
			if _, dup := seen[label]; dup {
				continue
			}
			seen[label] = struct{}{}
			if _, found := externalLabels[label]; !found {
				labels = append(labels, label)
			}
		}
	}

	return labels
}
Loading

0 comments on commit 4dbe0ab

Please sign in to comment.