Skip to content

Commit

Permalink
Merge pull request #1046 from cloudflare/alerts/absent
Browse files Browse the repository at this point in the history
Added alerts/absent check
  • Loading branch information
prymitive committed Jul 26, 2024
2 parents ed0906b + 1348867 commit bfeb640
Show file tree
Hide file tree
Showing 16 changed files with 716 additions and 194 deletions.
8 changes: 4 additions & 4 deletions cmd/pint/tests/0023_enabled_checks.txt
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
pint.error -l debug --no-color lint rules
! stdout .
stderr 'level=DEBUG msg="Configured checks for rule" enabled=\["promql/syntax","alerts/for","alerts/comparison","alerts/template","promql/fragile","promql/regexp","promql/rate\(prom\)","promql/series\(prom\)","promql/vector_matching\(prom\)"\,"promql/range_query\(prom\)","rule/duplicate\(prom\)","labels/conflict\(prom\)","alerts/external_labels\(prom\)","promql/counter\(prom\)"] path=rules/1.yaml rule=one'
stderr 'level=DEBUG msg="Configured checks for rule" enabled=\["promql/syntax","alerts/for","alerts/comparison","alerts/template","promql/fragile","promql/regexp","promql/rate\(prom\)","promql/series\(prom\)","promql/vector_matching\(prom\)"\,"promql/range_query\(prom\)","rule/duplicate\(prom\)","labels/conflict\(prom\)","alerts/external_labels\(prom\)","promql/counter\(prom\)"] path=rules/1.yaml rule=two'
stderr 'level=DEBUG msg="Configured checks for rule" enabled=\["promql/syntax","alerts/for","alerts/comparison","alerts/template","promql/fragile","promql/regexp","promql/rate\(prom\)","promql/series\(prom\)","promql/vector_matching\(prom\)"\,"promql/range_query\(prom\)","rule/duplicate\(prom\)","labels/conflict\(prom\)","alerts/external_labels\(prom\)","promql/counter\(prom\)"] path=rules/2.yaml rule=one'
stderr 'level=DEBUG msg="Configured checks for rule" enabled=\["promql/syntax","alerts/for","alerts/comparison","alerts/template","promql/fragile","promql/regexp","promql/rate\(prom\)","promql/series\(prom\)","promql/vector_matching\(prom\)"\,"promql/range_query\(prom\)","rule/duplicate\(prom\)","labels/conflict\(prom\)","alerts/external_labels\(prom\)","promql/counter\(prom\)"] path=rules/2.yaml rule=two'
stderr 'level=DEBUG msg="Configured checks for rule" enabled=\["promql/syntax","alerts/for","alerts/comparison","alerts/template","promql/fragile","promql/regexp","promql/rate\(prom\)","promql/series\(prom\)","promql/vector_matching\(prom\)"\,"promql/range_query\(prom\)","rule/duplicate\(prom\)","labels/conflict\(prom\)","alerts/external_labels\(prom\)","promql/counter\(prom\)","alerts/absent\(prom\)"] path=rules/1.yaml rule=one'
stderr 'level=DEBUG msg="Configured checks for rule" enabled=\["promql/syntax","alerts/for","alerts/comparison","alerts/template","promql/fragile","promql/regexp","promql/rate\(prom\)","promql/series\(prom\)","promql/vector_matching\(prom\)"\,"promql/range_query\(prom\)","rule/duplicate\(prom\)","labels/conflict\(prom\)","alerts/external_labels\(prom\)","promql/counter\(prom\)","alerts/absent\(prom\)"] path=rules/1.yaml rule=two'
stderr 'level=DEBUG msg="Configured checks for rule" enabled=\["promql/syntax","alerts/for","alerts/comparison","alerts/template","promql/fragile","promql/regexp","promql/rate\(prom\)","promql/series\(prom\)","promql/vector_matching\(prom\)"\,"promql/range_query\(prom\)","rule/duplicate\(prom\)","labels/conflict\(prom\)","alerts/external_labels\(prom\)","promql/counter\(prom\)","alerts/absent\(prom\)"] path=rules/2.yaml rule=one'
stderr 'level=DEBUG msg="Configured checks for rule" enabled=\["promql/syntax","alerts/for","alerts/comparison","alerts/template","promql/fragile","promql/regexp","promql/rate\(prom\)","promql/series\(prom\)","promql/vector_matching\(prom\)"\,"promql/range_query\(prom\)","rule/duplicate\(prom\)","labels/conflict\(prom\)","alerts/external_labels\(prom\)","promql/counter\(prom\)","alerts/absent\(prom\)"] path=rules/2.yaml rule=two'

-- rules/1.yaml --
- record: one
Expand Down
1 change: 1 addition & 0 deletions cmd/pint/tests/0025_config.txt
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ level=INFO msg="Loading configuration file" path=.pint.hcl
"parser": {},
"checks": {
"enabled": [
"alerts/absent",
"alerts/annotation",
"alerts/count",
"alerts/external_labels",
Expand Down
2 changes: 2 additions & 0 deletions cmd/pint/tests/0054_watch_metrics_prometheus.txt
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,8 @@ parser {
-- metrics.txt --
# HELP pint_check_duration_seconds How long did a check took to complete
# TYPE pint_check_duration_seconds summary
pint_check_duration_seconds_sum{check="alerts/absent"}
pint_check_duration_seconds_count{check="alerts/absent"}
pint_check_duration_seconds_sum{check="alerts/comparison"}
pint_check_duration_seconds_count{check="alerts/comparison"}
pint_check_duration_seconds_sum{check="alerts/external_labels"}
Expand Down
2 changes: 2 additions & 0 deletions cmd/pint/tests/0057_watch_metrics_prometheus_ignore.txt
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,8 @@ parser {
-- metrics.txt --
# HELP pint_check_duration_seconds How long did a check took to complete
# TYPE pint_check_duration_seconds summary
pint_check_duration_seconds_sum{check="alerts/absent"}
pint_check_duration_seconds_count{check="alerts/absent"}
pint_check_duration_seconds_sum{check="alerts/comparison"}
pint_check_duration_seconds_count{check="alerts/comparison"}
pint_check_duration_seconds_sum{check="alerts/external_labels"}
Expand Down
1 change: 1 addition & 0 deletions cmd/pint/tests/0113_config_env_expand.txt
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ level=INFO msg="Loading configuration file" path=.pint.hcl
},
"checks": {
"enabled": [
"alerts/absent",
"alerts/annotation",
"alerts/count",
"alerts/external_labels",
Expand Down
2 changes: 1 addition & 1 deletion cmd/pint/tests/0115_file_disable_tag.txt
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ level=INFO msg="Configured new Prometheus server" name=prom uris=1 uptime=up tag
level=DEBUG msg="Starting query workers" name=prom uri=http://127.0.0.1:7103 workers=16
level=DEBUG msg="Generated all Prometheus servers" count=1
level=DEBUG msg="Found recording rule" path=rules/0001.yml record=colo:test1 lines=6-8
level=DEBUG msg="Configured checks for rule" enabled=["promql/syntax","alerts/for","alerts/comparison","alerts/template","promql/fragile","promql/regexp","alerts/external_labels(prom)","promql/counter(prom)"] path=rules/0001.yml rule=colo:test1
level=DEBUG msg="Configured checks for rule" enabled=["promql/syntax","alerts/for","alerts/comparison","alerts/template","promql/fragile","promql/regexp","alerts/external_labels(prom)","promql/counter(prom)","alerts/absent(prom)"] path=rules/0001.yml rule=colo:test1
level=DEBUG msg="Scheduling Prometheus metrics metadata query" uri=http://127.0.0.1:7103 metric=foo
level=DEBUG msg="Getting prometheus metrics metadata" uri=http://127.0.0.1:7103 metric=foo
level=ERROR msg="Query returned an error" err="failed to query Prometheus metrics metadata: Get \"http://127.0.0.1:7103/api/v1/metadata?metric=foo\": dial tcp 127.0.0.1:7103: connect: connection refused" uri=http://127.0.0.1:7103 query=foo
Expand Down
8 changes: 6 additions & 2 deletions docs/changelog.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,15 +2,19 @@

## v0.63.0

### Fixed
### Added

- Fixed false positive warnings from [alerts/comparison](checks/alerts/comparison.md) when using `absent_over_time()`.
- Added [alerts/absent](checks/alerts/absent.md) check.

### Changed

- [promql/vector_matching](checks/promql/vector_matching.md) will now report more details, including which Prometheus server reports problems and which part of the query is the issue.
- GitHub report code was refactored, it should behave as before.

### Fixed

- Fixed false positive warnings from [alerts/comparison](checks/alerts/comparison.md) when using `absent_over_time()`.

## v0.62.2

### Fixed
Expand Down
94 changes: 94 additions & 0 deletions docs/checks/alerts/absent.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,94 @@
---
layout: default
parent: Checks
grand_parent: Documentation
---

# alerts/absent

This check will warn you about alerting rules that are using `absent()` calls without having `for` option set
to at least 2x scrape interval.
Using `absent()` without `for` can cause false positive alerts when Prometheus is restarted and the rule
is evaluated before the metrics tested using `absent()` are scraped. Adding a `for` option with at least
2x scrape interval is usually enough to prevent this from happening.

## Configuration

This check doesn't have any configuration options.

## How to enable it

This check is enabled by default for all configured Prometheus servers.

Example:

```js
prometheus "prod" {
uri = "https://prometheus-prod.example.com"
timeout = "60s"
include = [
"rules/prod/.*",
"rules/common/.*",
]
}

prometheus "dev" {
uri = "https://prometheus-dev.example.com"
timeout = "30s"
include = [
"rules/dev/.*",
"rules/common/.*",
]
}
```

## How to disable it

You can disable this check globally by adding this config block:

```js
checks {
disabled = ["alerts/absent"]
}
```

You can also disable it for all rules inside given file by adding
a comment anywhere in that file. Example:

```yaml
# pint file/disable alerts/absent
```

Or you can disable it per rule by adding a comment to it. Example:

```yaml
# pint disable alerts/absent
```

If you want to disable only individual instances of this check
you can add a more specific comment.

```yaml
# pint disable alerts/absent($prometheus)
```

Where `$prometheus` is the name of Prometheus server to disable.

Example:

```yaml
# pint disable alerts/absent(prod)
```

## How to snooze it

You can disable this check until given time by adding a comment to it. Example:

```yaml
# pint snooze $TIMESTAMP alerts/absent
```

Where `$TIMESTAMP` is either use [RFC3339](https://www.rfc-editor.org/rfc/rfc3339)
formatted or `YYYY-MM-DD`.
Adding this comment will disable `alerts/absent` *until* `$TIMESTAMP`, after that
check will be re-enabled.
8 changes: 4 additions & 4 deletions go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ require (
go.uber.org/atomic v1.11.0
go.uber.org/automaxprocs v1.5.3
go.uber.org/ratelimit v0.3.1
golang.org/x/exp v0.0.0-20240119083558-1b970713d09a
golang.org/x/exp v0.0.0-20240719175910-8a7402abbf56
golang.org/x/oauth2 v0.21.0
gopkg.in/yaml.v3 v3.0.1
)
Expand Down Expand Up @@ -72,11 +72,11 @@ require (
go.opentelemetry.io/otel v1.27.0 // indirect
go.opentelemetry.io/otel/metric v1.27.0 // indirect
go.opentelemetry.io/otel/trace v1.27.0 // indirect
golang.org/x/mod v0.18.0 // indirect
golang.org/x/mod v0.19.0 // indirect
golang.org/x/sync v0.7.0 // indirect
golang.org/x/sys v0.21.0 // indirect
golang.org/x/sys v0.22.0 // indirect
golang.org/x/text v0.16.0 // indirect
golang.org/x/time v0.5.0 // indirect
golang.org/x/tools v0.22.0 // indirect
golang.org/x/tools v0.23.0 // indirect
google.golang.org/protobuf v1.34.2 // indirect
)
20 changes: 10 additions & 10 deletions go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -190,18 +190,18 @@ golang.org/x/crypto v0.0.0-20191011191535-87dc89f01550/go.mod h1:yigFU9vqHzYiE8U
golang.org/x/crypto v0.0.0-20200622213623-75b288015ac9/go.mod h1:LzIPMQfyMNhhGPhUkYOs5KpL4U8rLKemX1yGLhDgUto=
golang.org/x/crypto v0.24.0 h1:mnl8DM0o513X8fdIkmyFE/5hTYxbwYOjDS/+rK6qpRI=
golang.org/x/crypto v0.24.0/go.mod h1:Z1PMYSOR5nyMcyAVAIQSKCDwalqy85Aqn1x3Ws4L5DM=
golang.org/x/exp v0.0.0-20240119083558-1b970713d09a h1:Q8/wZp0KX97QFTc2ywcOE0YRjZPVIx+MXInMzdvQqcA=
golang.org/x/exp v0.0.0-20240119083558-1b970713d09a/go.mod h1:idGWGoKP1toJGkd5/ig9ZLuPcZBC3ewk7SzmH0uou08=
golang.org/x/exp v0.0.0-20240719175910-8a7402abbf56 h1:2dVuKD2vS7b0QIHQbpyTISPd0LeHDbnYEryqj5Q1ug8=
golang.org/x/exp v0.0.0-20240719175910-8a7402abbf56/go.mod h1:M4RDyNAINzryxdtnbRXRL/OHtkFuWGRjvuhBJpk2IlY=
golang.org/x/mod v0.2.0/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA=
golang.org/x/mod v0.3.0/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA=
golang.org/x/mod v0.18.0 h1:5+9lSbEzPSdWkH32vYPBwEpX8KwDbM52Ud9xBUvNlb0=
golang.org/x/mod v0.18.0/go.mod h1:hTbmBsO62+eylJbnUtE2MGJUyE7QWk4xUqPFrRgJ+7c=
golang.org/x/mod v0.19.0 h1:fEdghXQSo20giMthA7cd28ZC+jts4amQ3YMXiP5oMQ8=
golang.org/x/mod v0.19.0/go.mod h1:hTbmBsO62+eylJbnUtE2MGJUyE7QWk4xUqPFrRgJ+7c=
golang.org/x/net v0.0.0-20190404232315-eb5bcb51f2a3/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg=
golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s=
golang.org/x/net v0.0.0-20200226121028-0de0cce0169b/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s=
golang.org/x/net v0.0.0-20201021035429-f5854403a974/go.mod h1:sp8m0HH+o8qH0wwXwYZr8TS3Oi6o0r6Gce1SSxlDquU=
golang.org/x/net v0.26.0 h1:soB7SVo0PWrY4vPW/+ay0jKDNScG2X9wFeYlXIvJsOQ=
golang.org/x/net v0.26.0/go.mod h1:5YKkiSynbBIh3p6iOc/vibscux0x38BZDkn8sCUPxHE=
golang.org/x/net v0.27.0 h1:5K3Njcw06/l2y9vpGCSdcxWOYHOUk3dVNGDXN+FvAys=
golang.org/x/net v0.27.0/go.mod h1:dDi0PyhWNoiUOrAS8uXv/vnScO4wnHQO4mj9fn/RytE=
golang.org/x/oauth2 v0.21.0 h1:tsimM75w1tF/uws5rbeHzIWxEqElMehnc+iW793zsZs=
golang.org/x/oauth2 v0.21.0/go.mod h1:XYTD2NtWslqkgxebSiOHnXEap4TF09sJSc7H1sXbhtI=
golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
Expand All @@ -214,8 +214,8 @@ golang.org/x/sys v0.0.0-20190412213103-97732733099d/go.mod h1:h1NjWce9XRLGQEsW7w
golang.org/x/sys v0.0.0-20200930185726-fdedc70b468f/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/sys v0.0.0-20220811171246-fbc7d0a398ab/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.6.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.21.0 h1:rF+pYz3DAGSQAxAu1CbC7catZg4ebC4UIeIhKxBZvws=
golang.org/x/sys v0.21.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA=
golang.org/x/sys v0.22.0 h1:RI27ohtqKCnwULzJLqkv897zojh5/DwS/ENaMzUOaWI=
golang.org/x/sys v0.22.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA=
golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ=
golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ=
golang.org/x/text v0.16.0 h1:a94ExnEXNtEwYLGJSIUxnWoxoRz/ZcCsV63ROupILh4=
Expand All @@ -226,8 +226,8 @@ golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGm
golang.org/x/tools v0.0.0-20191119224855-298f0cb1881e/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo=
golang.org/x/tools v0.0.0-20200619180055-7c47624df98f/go.mod h1:EkVYQZoAsY45+roYkvgYkIh4xh/qjgUK9TdY2XT94GE=
golang.org/x/tools v0.0.0-20210106214847-113979e3529a/go.mod h1:emZCQorbCU4vsT4fOWvOPXz4eW1wZW4PmDk9uLelYpA=
golang.org/x/tools v0.22.0 h1:gqSGLZqv+AI9lIQzniJ0nZDRG5GBPsSi+DRNHWNz6yA=
golang.org/x/tools v0.22.0/go.mod h1:aCwcsjqvq7Yqt6TNyX7QMU2enbQ/Gt0bo6krSeEri+c=
golang.org/x/tools v0.23.0 h1:SGsXPZ+2l4JsgaCKkx+FQ9YZ5XEtA1GZYuoDjenLjvg=
golang.org/x/tools v0.23.0/go.mod h1:pnu6ufv6vQkll6szChhK3C3L/ruaIv5eBeztNG8wtsI=
golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
golang.org/x/xerrors v0.0.0-20191011141410-1b5146add898/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
Expand Down
99 changes: 99 additions & 0 deletions internal/checks/alerts_absent.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,99 @@
package checks

import (
"context"
"fmt"
"time"

"github.com/cloudflare/pint/internal/discovery"
"github.com/cloudflare/pint/internal/output"
"github.com/cloudflare/pint/internal/parser"
"github.com/cloudflare/pint/internal/promapi"

"github.com/prometheus/common/model"
promParser "github.com/prometheus/prometheus/promql/parser"
)

const (
AlertsAbsentCheckName = "alerts/absent"
AlertsAbsentCheckDetails = "When Prometheus restart this alert rule might be evaluated before your service is scraped, which can cause false positives from absent() call.\nAdding `for` option that is at least 2x scrape interval will prevent this from happening."
)

func NewAlertsAbsentCheck(prom *promapi.FailoverGroup) AlertsAbsentCheck {
return AlertsAbsentCheck{
prom: prom,
}
}

type AlertsAbsentCheck struct {
prom *promapi.FailoverGroup
}

func (c AlertsAbsentCheck) Meta() CheckMeta {
return CheckMeta{
States: []discovery.ChangeType{
discovery.Noop,
discovery.Added,
discovery.Modified,
discovery.Moved,
},
IsOnline: true,
}
}

func (c AlertsAbsentCheck) String() string {
return fmt.Sprintf("%s(%s)", AlertsAbsentCheckName, c.prom.Name())
}

func (c AlertsAbsentCheck) Reporter() string {
return AlertsAbsentCheckName
}

func (c AlertsAbsentCheck) Check(ctx context.Context, _ discovery.Path, rule parser.Rule, _ []discovery.Entry) (problems []Problem) {
if rule.AlertingRule == nil {
return problems
}

if rule.AlertingRule.Expr.SyntaxError != nil {
return problems
}

if n, ok := rule.AlertingRule.Expr.Query.Expr.(*promParser.Call); !ok || n.Func.Name != "absent" {
return problems
}

cfg, err := c.prom.Config(ctx, 0)
if err != nil {
text, severity := textAndSeverityFromError(err, c.Reporter(), c.prom.Name(), Warning)
problems = append(problems, Problem{
Lines: rule.AlertingRule.Expr.Value.Lines,
Reporter: c.Reporter(),
Text: text,
Severity: severity,
})
return problems
}

if rule.AlertingRule.For != nil {
forDur, err := model.ParseDuration(rule.AlertingRule.For.Value)
if err != nil {
return problems
}
if time.Duration(forDur) >= cfg.Config.Global.ScrapeInterval*2 {
return problems
}
}

problems = append(problems, Problem{
Lines: rule.AlertingRule.Expr.Value.Lines,
Reporter: c.Reporter(),
Text: fmt.Sprintf("Alert query is using absent() which might cause false positives when %s restarts, please add `for: %s` to avoid this.",
promText(c.prom.Name(), cfg.URI),
output.HumanizeDuration((cfg.Config.Global.ScrapeInterval * 2).Round(time.Minute)),
),
Details: AlertsAbsentCheckDetails,
Severity: Warning,
})

return problems
}
Loading

0 comments on commit bfeb640

Please sign in to comment.