Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Added alerts/absent check #1046

Merged
merged 1 commit into from
Jul 26, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 4 additions & 4 deletions cmd/pint/tests/0023_enabled_checks.txt
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
pint.error -l debug --no-color lint rules
! stdout .
stderr 'level=DEBUG msg="Configured checks for rule" enabled=\["promql/syntax","alerts/for","alerts/comparison","alerts/template","promql/fragile","promql/regexp","promql/rate\(prom\)","promql/series\(prom\)","promql/vector_matching\(prom\)"\,"promql/range_query\(prom\)","rule/duplicate\(prom\)","labels/conflict\(prom\)","alerts/external_labels\(prom\)","promql/counter\(prom\)"] path=rules/1.yaml rule=one'
stderr 'level=DEBUG msg="Configured checks for rule" enabled=\["promql/syntax","alerts/for","alerts/comparison","alerts/template","promql/fragile","promql/regexp","promql/rate\(prom\)","promql/series\(prom\)","promql/vector_matching\(prom\)"\,"promql/range_query\(prom\)","rule/duplicate\(prom\)","labels/conflict\(prom\)","alerts/external_labels\(prom\)","promql/counter\(prom\)"] path=rules/1.yaml rule=two'
stderr 'level=DEBUG msg="Configured checks for rule" enabled=\["promql/syntax","alerts/for","alerts/comparison","alerts/template","promql/fragile","promql/regexp","promql/rate\(prom\)","promql/series\(prom\)","promql/vector_matching\(prom\)"\,"promql/range_query\(prom\)","rule/duplicate\(prom\)","labels/conflict\(prom\)","alerts/external_labels\(prom\)","promql/counter\(prom\)"] path=rules/2.yaml rule=one'
stderr 'level=DEBUG msg="Configured checks for rule" enabled=\["promql/syntax","alerts/for","alerts/comparison","alerts/template","promql/fragile","promql/regexp","promql/rate\(prom\)","promql/series\(prom\)","promql/vector_matching\(prom\)"\,"promql/range_query\(prom\)","rule/duplicate\(prom\)","labels/conflict\(prom\)","alerts/external_labels\(prom\)","promql/counter\(prom\)"] path=rules/2.yaml rule=two'
stderr 'level=DEBUG msg="Configured checks for rule" enabled=\["promql/syntax","alerts/for","alerts/comparison","alerts/template","promql/fragile","promql/regexp","promql/rate\(prom\)","promql/series\(prom\)","promql/vector_matching\(prom\)"\,"promql/range_query\(prom\)","rule/duplicate\(prom\)","labels/conflict\(prom\)","alerts/external_labels\(prom\)","promql/counter\(prom\)","alerts/absent\(prom\)"] path=rules/1.yaml rule=one'
stderr 'level=DEBUG msg="Configured checks for rule" enabled=\["promql/syntax","alerts/for","alerts/comparison","alerts/template","promql/fragile","promql/regexp","promql/rate\(prom\)","promql/series\(prom\)","promql/vector_matching\(prom\)"\,"promql/range_query\(prom\)","rule/duplicate\(prom\)","labels/conflict\(prom\)","alerts/external_labels\(prom\)","promql/counter\(prom\)","alerts/absent\(prom\)"] path=rules/1.yaml rule=two'
stderr 'level=DEBUG msg="Configured checks for rule" enabled=\["promql/syntax","alerts/for","alerts/comparison","alerts/template","promql/fragile","promql/regexp","promql/rate\(prom\)","promql/series\(prom\)","promql/vector_matching\(prom\)"\,"promql/range_query\(prom\)","rule/duplicate\(prom\)","labels/conflict\(prom\)","alerts/external_labels\(prom\)","promql/counter\(prom\)","alerts/absent\(prom\)"] path=rules/2.yaml rule=one'
stderr 'level=DEBUG msg="Configured checks for rule" enabled=\["promql/syntax","alerts/for","alerts/comparison","alerts/template","promql/fragile","promql/regexp","promql/rate\(prom\)","promql/series\(prom\)","promql/vector_matching\(prom\)"\,"promql/range_query\(prom\)","rule/duplicate\(prom\)","labels/conflict\(prom\)","alerts/external_labels\(prom\)","promql/counter\(prom\)","alerts/absent\(prom\)"] path=rules/2.yaml rule=two'

-- rules/1.yaml --
- record: one
Expand Down
1 change: 1 addition & 0 deletions cmd/pint/tests/0025_config.txt
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ level=INFO msg="Loading configuration file" path=.pint.hcl
"parser": {},
"checks": {
"enabled": [
"alerts/absent",
"alerts/annotation",
"alerts/count",
"alerts/external_labels",
Expand Down
2 changes: 2 additions & 0 deletions cmd/pint/tests/0054_watch_metrics_prometheus.txt
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,8 @@ parser {
-- metrics.txt --
# HELP pint_check_duration_seconds How long did a check took to complete
# TYPE pint_check_duration_seconds summary
pint_check_duration_seconds_sum{check="alerts/absent"}
pint_check_duration_seconds_count{check="alerts/absent"}
pint_check_duration_seconds_sum{check="alerts/comparison"}
pint_check_duration_seconds_count{check="alerts/comparison"}
pint_check_duration_seconds_sum{check="alerts/external_labels"}
Expand Down
2 changes: 2 additions & 0 deletions cmd/pint/tests/0057_watch_metrics_prometheus_ignore.txt
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,8 @@ parser {
-- metrics.txt --
# HELP pint_check_duration_seconds How long did a check took to complete
# TYPE pint_check_duration_seconds summary
pint_check_duration_seconds_sum{check="alerts/absent"}
pint_check_duration_seconds_count{check="alerts/absent"}
pint_check_duration_seconds_sum{check="alerts/comparison"}
pint_check_duration_seconds_count{check="alerts/comparison"}
pint_check_duration_seconds_sum{check="alerts/external_labels"}
Expand Down
1 change: 1 addition & 0 deletions cmd/pint/tests/0113_config_env_expand.txt
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ level=INFO msg="Loading configuration file" path=.pint.hcl
},
"checks": {
"enabled": [
"alerts/absent",
"alerts/annotation",
"alerts/count",
"alerts/external_labels",
Expand Down
2 changes: 1 addition & 1 deletion cmd/pint/tests/0115_file_disable_tag.txt
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ level=INFO msg="Configured new Prometheus server" name=prom uris=1 uptime=up tag
level=DEBUG msg="Starting query workers" name=prom uri=http://127.0.0.1:7103 workers=16
level=DEBUG msg="Generated all Prometheus servers" count=1
level=DEBUG msg="Found recording rule" path=rules/0001.yml record=colo:test1 lines=6-8
level=DEBUG msg="Configured checks for rule" enabled=["promql/syntax","alerts/for","alerts/comparison","alerts/template","promql/fragile","promql/regexp","alerts/external_labels(prom)","promql/counter(prom)"] path=rules/0001.yml rule=colo:test1
level=DEBUG msg="Configured checks for rule" enabled=["promql/syntax","alerts/for","alerts/comparison","alerts/template","promql/fragile","promql/regexp","alerts/external_labels(prom)","promql/counter(prom)","alerts/absent(prom)"] path=rules/0001.yml rule=colo:test1
level=DEBUG msg="Scheduling Prometheus metrics metadata query" uri=http://127.0.0.1:7103 metric=foo
level=DEBUG msg="Getting prometheus metrics metadata" uri=http://127.0.0.1:7103 metric=foo
level=ERROR msg="Query returned an error" err="failed to query Prometheus metrics metadata: Get \"http://127.0.0.1:7103/api/v1/metadata?metric=foo\": dial tcp 127.0.0.1:7103: connect: connection refused" uri=http://127.0.0.1:7103 query=foo
Expand Down
8 changes: 6 additions & 2 deletions docs/changelog.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,15 +2,19 @@

## v0.63.0

### Fixed
### Added

- Fixed false positive warnings from [alerts/comparison](checks/alerts/comparison.md) when using `absent_over_time()`.
- Added [alerts/absent](checks/alerts/absent.md) check.

### Changed

- [promql/vector_matching](checks/promql/vector_matching.md) will now report more details, including which Prometheus server reports problems and which part of the query is the issue.
- GitHub report code was refactored, it should behave as before.

### Fixed

- Fixed false positive warnings from [alerts/comparison](checks/alerts/comparison.md) when using `absent_over_time()`.

## v0.62.2

### Fixed
Expand Down
94 changes: 94 additions & 0 deletions docs/checks/alerts/absent.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,94 @@
---
layout: default
parent: Checks
grand_parent: Documentation
---

# alerts/absent

This check will warn you about alerting rules that are using `absent()` calls without having `for` option set
to at least 2x scrape interval.
Using `absent()` without `for` can cause false positive alerts when Prometheus is restarted and the rule
is evaluated before the metrics tested using `absent()` are scraped. Adding a `for` option with at least
2x scrape interval is usually enough to prevent this from happening.

## Configuration

This check doesn't have any configuration options.

## How to enable it

This check is enabled by default for all configured Prometheus servers.

Example:

```js
prometheus "prod" {
uri = "https://prometheus-prod.example.com"
timeout = "60s"
include = [
"rules/prod/.*",
"rules/common/.*",
]
}

prometheus "dev" {
uri = "https://prometheus-dev.example.com"
timeout = "30s"
include = [
"rules/dev/.*",
"rules/common/.*",
]
}
```

## How to disable it

You can disable this check globally by adding this config block:

```js
checks {
disabled = ["alerts/absent"]
}
```

You can also disable it for all rules inside given file by adding
a comment anywhere in that file. Example:

```yaml
# pint file/disable alerts/absent
```

Or you can disable it per rule by adding a comment to it. Example:

```yaml
# pint disable alerts/absent
```

If you want to disable only individual instances of this check
you can add a more specific comment.

```yaml
# pint disable alerts/absent($prometheus)
```

Where `$prometheus` is the name of Prometheus server to disable.

Example:

```yaml
# pint disable alerts/absent(prod)
```

## How to snooze it

You can disable this check until given time by adding a comment to it. Example:

```yaml
# pint snooze $TIMESTAMP alerts/absent
```

Where `$TIMESTAMP` is either an [RFC3339](https://www.rfc-editor.org/rfc/rfc3339)
formatted timestamp or a `YYYY-MM-DD` date.
Adding this comment will disable `alerts/absent` *until* `$TIMESTAMP`, after that
the check will be re-enabled.
8 changes: 4 additions & 4 deletions go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ require (
go.uber.org/atomic v1.11.0
go.uber.org/automaxprocs v1.5.3
go.uber.org/ratelimit v0.3.1
golang.org/x/exp v0.0.0-20240119083558-1b970713d09a
golang.org/x/exp v0.0.0-20240719175910-8a7402abbf56
golang.org/x/oauth2 v0.21.0
gopkg.in/yaml.v3 v3.0.1
)
Expand Down Expand Up @@ -72,11 +72,11 @@ require (
go.opentelemetry.io/otel v1.27.0 // indirect
go.opentelemetry.io/otel/metric v1.27.0 // indirect
go.opentelemetry.io/otel/trace v1.27.0 // indirect
golang.org/x/mod v0.18.0 // indirect
golang.org/x/mod v0.19.0 // indirect
golang.org/x/sync v0.7.0 // indirect
golang.org/x/sys v0.21.0 // indirect
golang.org/x/sys v0.22.0 // indirect
golang.org/x/text v0.16.0 // indirect
golang.org/x/time v0.5.0 // indirect
golang.org/x/tools v0.22.0 // indirect
golang.org/x/tools v0.23.0 // indirect
google.golang.org/protobuf v1.34.2 // indirect
)
20 changes: 10 additions & 10 deletions go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -190,18 +190,18 @@ golang.org/x/crypto v0.0.0-20191011191535-87dc89f01550/go.mod h1:yigFU9vqHzYiE8U
golang.org/x/crypto v0.0.0-20200622213623-75b288015ac9/go.mod h1:LzIPMQfyMNhhGPhUkYOs5KpL4U8rLKemX1yGLhDgUto=
golang.org/x/crypto v0.24.0 h1:mnl8DM0o513X8fdIkmyFE/5hTYxbwYOjDS/+rK6qpRI=
golang.org/x/crypto v0.24.0/go.mod h1:Z1PMYSOR5nyMcyAVAIQSKCDwalqy85Aqn1x3Ws4L5DM=
golang.org/x/exp v0.0.0-20240119083558-1b970713d09a h1:Q8/wZp0KX97QFTc2ywcOE0YRjZPVIx+MXInMzdvQqcA=
golang.org/x/exp v0.0.0-20240119083558-1b970713d09a/go.mod h1:idGWGoKP1toJGkd5/ig9ZLuPcZBC3ewk7SzmH0uou08=
golang.org/x/exp v0.0.0-20240719175910-8a7402abbf56 h1:2dVuKD2vS7b0QIHQbpyTISPd0LeHDbnYEryqj5Q1ug8=
golang.org/x/exp v0.0.0-20240719175910-8a7402abbf56/go.mod h1:M4RDyNAINzryxdtnbRXRL/OHtkFuWGRjvuhBJpk2IlY=
golang.org/x/mod v0.2.0/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA=
golang.org/x/mod v0.3.0/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA=
golang.org/x/mod v0.18.0 h1:5+9lSbEzPSdWkH32vYPBwEpX8KwDbM52Ud9xBUvNlb0=
golang.org/x/mod v0.18.0/go.mod h1:hTbmBsO62+eylJbnUtE2MGJUyE7QWk4xUqPFrRgJ+7c=
golang.org/x/mod v0.19.0 h1:fEdghXQSo20giMthA7cd28ZC+jts4amQ3YMXiP5oMQ8=
golang.org/x/mod v0.19.0/go.mod h1:hTbmBsO62+eylJbnUtE2MGJUyE7QWk4xUqPFrRgJ+7c=
golang.org/x/net v0.0.0-20190404232315-eb5bcb51f2a3/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg=
golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s=
golang.org/x/net v0.0.0-20200226121028-0de0cce0169b/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s=
golang.org/x/net v0.0.0-20201021035429-f5854403a974/go.mod h1:sp8m0HH+o8qH0wwXwYZr8TS3Oi6o0r6Gce1SSxlDquU=
golang.org/x/net v0.26.0 h1:soB7SVo0PWrY4vPW/+ay0jKDNScG2X9wFeYlXIvJsOQ=
golang.org/x/net v0.26.0/go.mod h1:5YKkiSynbBIh3p6iOc/vibscux0x38BZDkn8sCUPxHE=
golang.org/x/net v0.27.0 h1:5K3Njcw06/l2y9vpGCSdcxWOYHOUk3dVNGDXN+FvAys=
golang.org/x/net v0.27.0/go.mod h1:dDi0PyhWNoiUOrAS8uXv/vnScO4wnHQO4mj9fn/RytE=
golang.org/x/oauth2 v0.21.0 h1:tsimM75w1tF/uws5rbeHzIWxEqElMehnc+iW793zsZs=
golang.org/x/oauth2 v0.21.0/go.mod h1:XYTD2NtWslqkgxebSiOHnXEap4TF09sJSc7H1sXbhtI=
golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
Expand All @@ -214,8 +214,8 @@ golang.org/x/sys v0.0.0-20190412213103-97732733099d/go.mod h1:h1NjWce9XRLGQEsW7w
golang.org/x/sys v0.0.0-20200930185726-fdedc70b468f/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/sys v0.0.0-20220811171246-fbc7d0a398ab/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.6.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.21.0 h1:rF+pYz3DAGSQAxAu1CbC7catZg4ebC4UIeIhKxBZvws=
golang.org/x/sys v0.21.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA=
golang.org/x/sys v0.22.0 h1:RI27ohtqKCnwULzJLqkv897zojh5/DwS/ENaMzUOaWI=
golang.org/x/sys v0.22.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA=
golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ=
golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ=
golang.org/x/text v0.16.0 h1:a94ExnEXNtEwYLGJSIUxnWoxoRz/ZcCsV63ROupILh4=
Expand All @@ -226,8 +226,8 @@ golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGm
golang.org/x/tools v0.0.0-20191119224855-298f0cb1881e/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo=
golang.org/x/tools v0.0.0-20200619180055-7c47624df98f/go.mod h1:EkVYQZoAsY45+roYkvgYkIh4xh/qjgUK9TdY2XT94GE=
golang.org/x/tools v0.0.0-20210106214847-113979e3529a/go.mod h1:emZCQorbCU4vsT4fOWvOPXz4eW1wZW4PmDk9uLelYpA=
golang.org/x/tools v0.22.0 h1:gqSGLZqv+AI9lIQzniJ0nZDRG5GBPsSi+DRNHWNz6yA=
golang.org/x/tools v0.22.0/go.mod h1:aCwcsjqvq7Yqt6TNyX7QMU2enbQ/Gt0bo6krSeEri+c=
golang.org/x/tools v0.23.0 h1:SGsXPZ+2l4JsgaCKkx+FQ9YZ5XEtA1GZYuoDjenLjvg=
golang.org/x/tools v0.23.0/go.mod h1:pnu6ufv6vQkll6szChhK3C3L/ruaIv5eBeztNG8wtsI=
golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
golang.org/x/xerrors v0.0.0-20191011141410-1b5146add898/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
Expand Down
99 changes: 99 additions & 0 deletions internal/checks/alerts_absent.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,99 @@
package checks

import (
"context"
"fmt"
"time"

"github.com/cloudflare/pint/internal/discovery"
"github.com/cloudflare/pint/internal/output"
"github.com/cloudflare/pint/internal/parser"
"github.com/cloudflare/pint/internal/promapi"

"github.com/prometheus/common/model"
promParser "github.com/prometheus/prometheus/promql/parser"
)

// Identifiers and help text used by the alerts/absent check.
const (
	// AlertsAbsentCheckName is the name of this check; it is returned by
	// Reporter and used as the prefix of String.
	AlertsAbsentCheckName = "alerts/absent"

	// AlertsAbsentCheckDetails is the extended explanation attached to the
	// warning emitted when an absent() alert has a missing or too short `for`.
	AlertsAbsentCheckDetails = "When Prometheus restart this alert rule might be evaluated before your service is scraped, " +
		"which can cause false positives from absent() call.\n" +
		"Adding `for` option that is at least 2x scrape interval will prevent this from happening."
)

// NewAlertsAbsentCheck builds an AlertsAbsentCheck bound to the given
// Prometheus failover group. The group is stored as-is and is later used to
// fetch the server configuration and to name the check instance.
func NewAlertsAbsentCheck(prom *promapi.FailoverGroup) AlertsAbsentCheck {
	return AlertsAbsentCheck{prom: prom}
}

// AlertsAbsentCheck implements the alerts/absent check against a single
// Prometheus failover group.
type AlertsAbsentCheck struct {
// prom is the Prometheus failover group whose configuration Check queries
// and whose name is embedded in String.
prom *promapi.FailoverGroup
}

// Meta returns the static metadata of this check: the rule change types it
// is listed for (Noop, Added, Modified and Moved) and IsOnline set to true.
// NOTE(review): IsOnline presumably marks checks that need a live Prometheus
// server, which matches Check querying the server config; confirm against
// the CheckMeta definition.
func (c AlertsAbsentCheck) Meta() CheckMeta {
	states := []discovery.ChangeType{
		discovery.Noop,
		discovery.Added,
		discovery.Modified,
		discovery.Moved,
	}
	return CheckMeta{States: states, IsOnline: true}
}

// String returns the instance name of this check in the form
// "alerts/absent(<prometheus name>)".
func (c AlertsAbsentCheck) String() string {
	server := c.prom.Name()
	return fmt.Sprintf("%s(%s)", AlertsAbsentCheckName, server)
}

// Reporter returns the bare check name, without the Prometheus server
// suffix that String adds. Check uses it as the Reporter of every problem
// it emits.
func (c AlertsAbsentCheck) Reporter() string {
return AlertsAbsentCheckName
}

// Check warns about alerting rules whose query is a top-level absent() call
// and whose `for` is missing or shorter than twice the global scrape interval
// reported by the Prometheus server.
//
// No problems are returned when the rule is not an alerting rule, when its
// expression has a syntax error, when the top-level expression is not an
// absent() call, or when `for` cannot be parsed as a duration. If the
// Prometheus configuration cannot be fetched, a single problem describing
// that error is returned instead.
//
// The `for` value suggested in the warning is twice the scrape interval
// rounded up to a whole minute, so following the suggestion always satisfies
// this check.
func (c AlertsAbsentCheck) Check(ctx context.Context, _ discovery.Path, rule parser.Rule, _ []discovery.Entry) (problems []Problem) {
	if rule.AlertingRule == nil {
		return problems
	}

	if rule.AlertingRule.Expr.SyntaxError != nil {
		return problems
	}

	// Only a query whose outermost node is absent(...) is inspected.
	if n, ok := rule.AlertingRule.Expr.Query.Expr.(*promParser.Call); !ok || n.Func.Name != "absent" {
		return problems
	}

	cfg, err := c.prom.Config(ctx, 0)
	if err != nil {
		text, severity := textAndSeverityFromError(err, c.Reporter(), c.prom.Name(), Warning)
		problems = append(problems, Problem{
			Lines:    rule.AlertingRule.Expr.Value.Lines,
			Reporter: c.Reporter(),
			Text:     text,
			Severity: severity,
		})
		return problems
	}

	// Smallest `for` value accepted by this check.
	minFor := cfg.Config.Global.ScrapeInterval * 2

	if rule.AlertingRule.For != nil {
		forDur, err := model.ParseDuration(rule.AlertingRule.For.Value)
		if err != nil {
			// An unparsable `for` is not reported here.
			return problems
		}
		if time.Duration(forDur) >= minFor {
			return problems
		}
	}

	// Round the suggestion UP to a whole minute. Rounding to the nearest
	// minute could suggest a value below minFor (e.g. 2m20s -> 2m) or even
	// zero (e.g. 20s -> 0), which would not silence this warning.
	suggested := minFor.Truncate(time.Minute)
	if suggested < minFor {
		suggested += time.Minute
	}

	problems = append(problems, Problem{
		Lines:    rule.AlertingRule.Expr.Value.Lines,
		Reporter: c.Reporter(),
		Text: fmt.Sprintf("Alert query is using absent() which might cause false positives when %s restarts, please add `for: %s` to avoid this.",
			promText(c.prom.Name(), cfg.URI),
			output.HumanizeDuration(suggested),
		),
		Details:  AlertsAbsentCheckDetails,
		Severity: Warning,
	})

	return problems
}
Loading