Skip to content

Commit

Permalink
Fix alerts/count gap detection
Browse files: browse the repository at this point in the history
  • Loading branch information
prymitive committed Nov 9, 2023
1 parent 02875ec commit 15a0d03
Show file tree
Hide file tree
Showing 24 changed files with 35 additions and 27 deletions.
2 changes: 1 addition & 1 deletion cmd/pint/tests/0037_disable_checks.txt
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ cmp stderr stderr.txt
level=INFO msg="Loading configuration file" path=.pint.hcl
level=INFO msg="Finding all rules to check" paths=["rules"]
level=DEBUG msg="File parsed" path=rules/0001.yml rules=3
level=INFO msg="Configured new Prometheus server" name=prom uris=1 tags=[] include=[] exclude=[]
level=INFO msg="Configured new Prometheus server" name=prom uris=1 uptime=up tags=[] include=[] exclude=[]
level=DEBUG msg="Starting query workers" name=prom uri=http://127.0.0.1 workers=16
level=DEBUG msg="Generated all Prometheus servers" count=1
level=DEBUG msg="Found alerting rule" path=rules/0001.yml alert=default-for lines=1-3
Expand Down
2 changes: 1 addition & 1 deletion cmd/pint/tests/0039_prom_selected_path.txt
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ cmp stderr stderr.txt
level=INFO msg="Loading configuration file" path=.pint.hcl
level=INFO msg="Finding all rules to check" paths=["rules"]
level=DEBUG msg="File parsed" path=rules/0001.yml rules=3
level=INFO msg="Configured new Prometheus server" name=disabled uris=1 tags=[] include=["^invalid/.+$"] exclude=["^invalid/rules/.+$"]
level=INFO msg="Configured new Prometheus server" name=disabled uris=1 uptime=up tags=[] include=["^invalid/.+$"] exclude=["^invalid/rules/.+$"]
level=DEBUG msg="Starting query workers" name=disabled uri=http://127.0.0.1:123 workers=16
level=DEBUG msg="Generated all Prometheus servers" count=1
level=DEBUG msg="Found alerting rule" path=rules/0001.yml alert=first lines=1-3
Expand Down
2 changes: 1 addition & 1 deletion cmd/pint/tests/0063_lint_offline.txt
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ cmp stderr stderr.txt
-- stderr.txt --
level=INFO msg="Loading configuration file" path=.pint.hcl
level=INFO msg="Finding all rules to check" paths=["rules"]
level=INFO msg="Configured new Prometheus server" name=disabled uris=1 tags=[] include=["^invalid/.+$"] exclude=[]
level=INFO msg="Configured new Prometheus server" name=disabled uris=1 uptime=up tags=[] include=["^invalid/.+$"] exclude=[]
-- rules/ok.yml --
- record: sum:foo
expr: sum(foo)
Expand Down
2 changes: 1 addition & 1 deletion cmd/pint/tests/0080_lint_online.txt
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ cmp stderr stderr.txt
-- stderr.txt --
level=INFO msg="Loading configuration file" path=.pint.hcl
level=INFO msg="Finding all rules to check" paths=["rules"]
level=INFO msg="Configured new Prometheus server" name=prom1 uris=1 tags=[] include=[] exclude=[]
level=INFO msg="Configured new Prometheus server" name=prom1 uris=1 uptime=prometheus_ready tags=[] include=[] exclude=[]
level=WARN msg="No results for Prometheus uptime metric, you might have set uptime config option to a missing metric, please check your config" name=prom1 metric=prometheus_ready
level=WARN msg="Using dummy Prometheus uptime metric results with no gaps" name=prom1 metric=prometheus_ready
rules/1.yml:2 Warning: `http_errors_total[2d]` selector is trying to query Prometheus for 2d worth of metrics, but `prom1` Prometheus server at http://127.0.0.1:7080 is configured to only keep 1d of metrics history. (promql/range_query)
Expand Down
2 changes: 1 addition & 1 deletion cmd/pint/tests/0103_file_disable.txt
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ cmp stderr stderr.txt
level=INFO msg="Loading configuration file" path=.pint.hcl
level=INFO msg="Finding all rules to check" paths=["rules"]
level=DEBUG msg="File parsed" path=rules/0001.yml rules=1
level=INFO msg="Configured new Prometheus server" name=prom uris=1 tags=[] include=[] exclude=[]
level=INFO msg="Configured new Prometheus server" name=prom uris=1 uptime=up tags=[] include=[] exclude=[]
level=DEBUG msg="Starting query workers" name=prom uri=http://127.0.0.1:7103 workers=16
level=DEBUG msg="Generated all Prometheus servers" count=1
level=DEBUG msg="Found recording rule" path=rules/0001.yml record=colo:test1 lines=9-10
Expand Down
2 changes: 1 addition & 1 deletion cmd/pint/tests/0108_rule_duplicate.txt
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ cmp stderr stderr.txt
-- stderr.txt --
level=INFO msg="Loading configuration file" path=.pint.hcl
level=INFO msg="Finding all rules to check" paths=["rules"]
level=INFO msg="Configured new Prometheus server" name=prom uris=1 tags=[] include=[] exclude=[]
level=INFO msg="Configured new Prometheus server" name=prom uris=1 uptime=up tags=[] include=[] exclude=[]
level=ERROR msg="Query returned an error" err="failed to query Prometheus config: Get \"http://127.0.0.1:7108/api/v1/status/config\": dial tcp 127.0.0.1:7108: connect: connection refused" uri=http://127.0.0.1:7108 query=/api/v1/status/config
level=ERROR msg="Query returned an error" err="failed to query Prometheus config: Get \"http://127.0.0.1:7108/api/v1/status/config\": dial tcp 127.0.0.1:7108: connect: connection refused" uri=http://127.0.0.1:7108 query=/api/v1/status/config
level=ERROR msg="Query returned an error" err="failed to query Prometheus config: Get \"http://127.0.0.1:7108/api/v1/status/config\": dial tcp 127.0.0.1:7108: connect: connection refused" uri=http://127.0.0.1:7108 query=/api/v1/status/config
Expand Down
4 changes: 2 additions & 2 deletions cmd/pint/tests/0109_rule_duplicate_multiple_proms_include.txt
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,8 @@ cmp stderr stderr.txt
-- stderr.txt --
level=INFO msg="Loading configuration file" path=.pint.hcl
level=INFO msg="Finding all rules to check" paths=["rules"]
level=INFO msg="Configured new Prometheus server" name=prom1 uris=1 tags=[] include=["^rules/0001.yml$"] exclude=[]
level=INFO msg="Configured new Prometheus server" name=prom2 uris=1 tags=[] include=["^rules/0002.yml$"] exclude=[]
level=INFO msg="Configured new Prometheus server" name=prom1 uris=1 uptime=up tags=[] include=["^rules/0001.yml$"] exclude=[]
level=INFO msg="Configured new Prometheus server" name=prom2 uris=1 uptime=up tags=[] include=["^rules/0002.yml$"] exclude=[]
-- rules/0001.yml --
- record: "colo:duplicate"
expr: sum(foo) without(job)
Expand Down
4 changes: 2 additions & 2 deletions cmd/pint/tests/0110_rule_duplicate_multiple_proms_exclude.txt
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,8 @@ cmp stderr stderr.txt
-- stderr.txt --
level=INFO msg="Loading configuration file" path=.pint.hcl
level=INFO msg="Finding all rules to check" paths=["rules"]
level=INFO msg="Configured new Prometheus server" name=prom1 uris=1 tags=[] include=[] exclude=["^rules/0002.yml$"]
level=INFO msg="Configured new Prometheus server" name=prom2 uris=1 tags=[] include=[] exclude=["^rules/0001.yml$"]
level=INFO msg="Configured new Prometheus server" name=prom1 uris=1 uptime=up tags=[] include=[] exclude=["^rules/0002.yml$"]
level=INFO msg="Configured new Prometheus server" name=prom2 uris=1 uptime=up tags=[] include=[] exclude=["^rules/0001.yml$"]
-- rules/0001.yml --
- record: "colo:duplicate"
expr: sum(foo) without(job)
Expand Down
2 changes: 1 addition & 1 deletion cmd/pint/tests/0115_file_disable_tag.txt
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ cmp stderr stderr.txt
level=INFO msg="Loading configuration file" path=.pint.hcl
level=INFO msg="Finding all rules to check" paths=["rules"]
level=DEBUG msg="File parsed" path=rules/0001.yml rules=1
level=INFO msg="Configured new Prometheus server" name=prom uris=1 tags=["foo","bar"] include=[] exclude=[]
level=INFO msg="Configured new Prometheus server" name=prom uris=1 uptime=up tags=["foo","bar"] include=[] exclude=[]
level=DEBUG msg="Starting query workers" name=prom uri=http://127.0.0.1:7103 workers=16
level=DEBUG msg="Generated all Prometheus servers" count=1
level=DEBUG msg="Found recording rule" path=rules/0001.yml record=colo:test1 lines="6 8"
Expand Down
4 changes: 2 additions & 2 deletions cmd/pint/tests/0144_discovery_filepath.txt
Original file line number Diff line number Diff line change
Expand Up @@ -19,10 +19,10 @@ level=DEBUG msg="Rendered Prometheus server" name=prom2 uri=https://prom2.exampl
level=DEBUG msg="Path discovery match" match=^(?P<name>\w+).ya?ml$ path=prom2.yml
level=DEBUG msg="Extracted regexp variables" regexp=^(?P<name>\w+).ya?ml$ vars={"name":"prom2"}
level=DEBUG msg="Rendered Prometheus server" name=prom2 uri=https://prom2.example.com headers=["X-Host"] timeout=5s concurrency=16 rateLimit=100 uptime=up tags=["name/prom2"] required=true
level=INFO msg="Configured new Prometheus server" name=prom1 uris=2 tags=["name/prom1"] include=[] exclude=["^.*$"]
level=INFO msg="Configured new Prometheus server" name=prom1 uris=2 uptime=up tags=["name/prom1"] include=[] exclude=["^.*$"]
level=DEBUG msg="Starting query workers" name=prom1 uri=https://prom1.example.com workers=16
level=DEBUG msg="Starting query workers" name=prom1 uri=https://prom1-backup.example.com workers=16
level=INFO msg="Configured new Prometheus server" name=prom2 uris=2 tags=["name/prom2"] include=[] exclude=["^.*$"]
level=INFO msg="Configured new Prometheus server" name=prom2 uris=2 uptime=up tags=["name/prom2"] include=[] exclude=["^.*$"]
level=DEBUG msg="Starting query workers" name=prom2 uri=https://prom2.example.com workers=16
level=DEBUG msg="Starting query workers" name=prom2 uri=https://prom2-backup.example.com workers=16
level=DEBUG msg="Generated all Prometheus servers" count=2
Expand Down
4 changes: 2 additions & 2 deletions cmd/pint/tests/0145_discovery_filepath_dup.txt
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ cmp stderr stderr.txt
level=INFO msg="Loading configuration file" path=.pint.hcl
level=INFO msg="Finding all rules to check" paths=["rules"]
level=DEBUG msg="File parsed" path=rules/0001.yml rules=1
level=INFO msg="Configured new Prometheus server" name=prom2 uris=1 tags=[] include=[] exclude=[]
level=INFO msg="Configured new Prometheus server" name=prom2 uris=1 uptime=up tags=[] include=[] exclude=[]
level=DEBUG msg="Starting query workers" name=prom2 uri=https://unique.example.com workers=16
level=INFO msg="Finding Prometheus servers using file paths" dir=servers match=^(?P<name>\w+).ya?ml$
level=DEBUG msg="Path discovery match" match=^(?P<name>\w+).ya?ml$ path=prom1.yaml
Expand All @@ -21,7 +21,7 @@ level=DEBUG msg="Rendered Prometheus server" name=prom2 uri=https://prom2.exampl
level=DEBUG msg="Path discovery match" match=^(?P<name>\w+).ya?ml$ path=prom2.yml
level=DEBUG msg="Extracted regexp variables" regexp=^(?P<name>\w+).ya?ml$ vars={"name":"prom2"}
level=DEBUG msg="Rendered Prometheus server" name=prom2 uri=https://prom2.example.com headers=[] timeout=5s concurrency=16 rateLimit=100 uptime=up tags=["name/prom2"] required=true
level=INFO msg="Configured new Prometheus server" name=prom1 uris=2 tags=["name/prom1"] include=[] exclude=[]
level=INFO msg="Configured new Prometheus server" name=prom1 uris=2 uptime=up tags=["name/prom1"] include=[] exclude=[]
level=DEBUG msg="Starting query workers" name=prom1 uri=https://prom1.example.com workers=16
level=DEBUG msg="Starting query workers" name=prom1 uri=https://prom1-backup.example.com workers=16
level=DEBUG msg="Stopping query workers" name=prom2 uri=https://unique.example.com
Expand Down
2 changes: 1 addition & 1 deletion cmd/pint/tests/0149_discovery_prom.txt
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ level=DEBUG msg="Rendered Prometheus server" name=prom-ha uri=https://prom1.exam
level=DEBUG msg="Rendered Prometheus server" name=prom-ha uri=https://prom2.example.com headers=["X-Host"] timeout=5s concurrency=16 rateLimit=100 uptime=up tags=[] required=false
level=DEBUG msg="Stopping query workers" name=discovery uri=http://127.0.0.1:7149
level=DEBUG msg="Added new failover URI" name=prom-ha uri=https://prom2.example.com
level=INFO msg="Configured new Prometheus server" name=prom-ha uris=2 tags=[] include=[] exclude=["^.*$"]
level=INFO msg="Configured new Prometheus server" name=prom-ha uris=2 uptime=up tags=[] include=[] exclude=["^.*$"]
level=DEBUG msg="Starting query workers" name=prom-ha uri=https://prom1.example.com workers=16
level=DEBUG msg="Starting query workers" name=prom-ha uri=https://prom2.example.com workers=16
level=DEBUG msg="Generated all Prometheus servers" count=1
Expand Down
2 changes: 1 addition & 1 deletion cmd/pint/tests/0150_discovery_prom_dup_tags.txt
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ level=DEBUG msg="Rendered Prometheus server" name=prom-ha uri=https://prom1.exam
level=DEBUG msg="Rendered Prometheus server" name=prom-ha uri=https://prom2.example.com headers=["X-Host"] timeout=5s concurrency=16 rateLimit=100 uptime=up tags=["prom2"] required=false
level=DEBUG msg="Stopping query workers" name=discovery uri=http://127.0.0.1:7150
level=WARN msg="Duplicated prometheus server with different tags" name=prom-ha a=["prom2"] b=["prom1"]
level=INFO msg="Configured new Prometheus server" name=prom-ha uris=1 tags=["prom1"] include=[] exclude=[]
level=INFO msg="Configured new Prometheus server" name=prom-ha uris=1 uptime=up tags=["prom1"] include=[] exclude=[]
level=DEBUG msg="Starting query workers" name=prom-ha uri=https://prom1.example.com workers=16
level=DEBUG msg="Stopping query workers" name=prom-ha uri=https://prom1.example.com
level=ERROR msg="Fatal error" err="Duplicated name for Prometheus server definition: prom-ha"
Expand Down
2 changes: 1 addition & 1 deletion cmd/pint/tests/0152_discovery_prom_dup_uptime.txt
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ level=DEBUG msg="Parsed response" uri=http://127.0.0.1:7152 query=prometheus_rea
level=DEBUG msg="Rendered Prometheus server" name=prom-ha uri=https://prom1.example.com headers=[] timeout=5s concurrency=16 rateLimit=100 uptime=prom2 tags=[] required=false
level=DEBUG msg="Rendered Prometheus server" name=prom-ha uri=https://prom2.example.com headers=[] timeout=5s concurrency=16 rateLimit=100 uptime=prom2 tags=[] required=false
level=DEBUG msg="Stopping query workers" name=discovery uri=http://127.0.0.1:7152
level=INFO msg="Configured new Prometheus server" name=prom-ha uris=2 tags=[] include=[] exclude=["^.*$"]
level=INFO msg="Configured new Prometheus server" name=prom-ha uris=2 uptime=prom1 tags=[] include=[] exclude=["^.*$"]
level=DEBUG msg="Starting query workers" name=prom-ha uri=https://prom1.example.com workers=16
level=DEBUG msg="Starting query workers" name=prom-ha uri=https://prom2.example.com workers=16
level=DEBUG msg="Generated all Prometheus servers" count=1
Expand Down
2 changes: 1 addition & 1 deletion cmd/pint/tests/0155_discovery_prom_dup_include.txt
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ level=DEBUG msg="Rendered Prometheus server" name=prom-ha uri=https://prom1.exam
level=DEBUG msg="Rendered Prometheus server" name=prom-ha uri=https://prom2.example.com headers=[] timeout=2m0s concurrency=16 rateLimit=100 uptime=up tags=[] required=false
level=DEBUG msg="Stopping query workers" name=discovery uri=http://127.0.0.1:7155
level=WARN msg="Duplicated prometheus server with different include" name=prom-ha a=["^prom2$"] b=["^prom1$"]
level=INFO msg="Configured new Prometheus server" name=prom-ha uris=1 tags=[] include=["^prom1$"] exclude=[]
level=INFO msg="Configured new Prometheus server" name=prom-ha uris=1 uptime=up tags=[] include=["^prom1$"] exclude=[]
level=DEBUG msg="Starting query workers" name=prom-ha uri=https://prom1.example.com workers=16
level=DEBUG msg="Stopping query workers" name=prom-ha uri=https://prom1.example.com
level=ERROR msg="Fatal error" err="Duplicated name for Prometheus server definition: prom-ha"
Expand Down
2 changes: 1 addition & 1 deletion cmd/pint/tests/0156_discovery_prom_dup_exclude.txt
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ level=DEBUG msg="Rendered Prometheus server" name=prom-ha uri=https://prom1.exam
level=DEBUG msg="Rendered Prometheus server" name=prom-ha uri=https://prom2.example.com headers=[] timeout=2m0s concurrency=16 rateLimit=100 uptime=up tags=[] required=false
level=DEBUG msg="Stopping query workers" name=discovery uri=http://127.0.0.1:7156
level=WARN msg="Duplicated prometheus server with different exclude" name=prom-ha a=["^prom2$"] b=["^prom1$"]
level=INFO msg="Configured new Prometheus server" name=prom-ha uris=1 tags=[] include=[] exclude=["^prom1$"]
level=INFO msg="Configured new Prometheus server" name=prom-ha uris=1 uptime=up tags=[] include=[] exclude=["^prom1$"]
level=DEBUG msg="Starting query workers" name=prom-ha uri=https://prom1.example.com workers=16
level=DEBUG msg="Stopping query workers" name=prom-ha uri=https://prom1.example.com
level=ERROR msg="Fatal error" err="Duplicated name for Prometheus server definition: prom-ha"
Expand Down
4 changes: 2 additions & 2 deletions cmd/pint/tests/0157_series_other_servers.txt
Original file line number Diff line number Diff line change
Expand Up @@ -15,8 +15,8 @@ cmp stderr stderr.txt
-- stderr.txt --
level=INFO msg="Loading configuration file" path=.pint.hcl
level=INFO msg="Finding all rules to check" paths=["rules"]
level=INFO msg="Configured new Prometheus server" name=prom1 uris=1 tags=[] include=["^rules/1.yml$"] exclude=[]
level=INFO msg="Configured new Prometheus server" name=prom2 uris=1 tags=[] include=["^rules/2.yml$"] exclude=[]
level=INFO msg="Configured new Prometheus server" name=prom1 uris=1 uptime=up tags=[] include=["^rules/1.yml$"] exclude=[]
level=INFO msg="Configured new Prometheus server" name=prom2 uris=1 uptime=up tags=[] include=["^rules/2.yml$"] exclude=[]
level=WARN msg="No results for Prometheus uptime metric, you might have set uptime config option to a missing metric, please check your config" name=prom1 metric=up
level=WARN msg="Using dummy Prometheus uptime metric results with no gaps" name=prom1 metric=up
rules/1.yml:5 Bug: `prom1` Prometheus server at http://127.0.0.1:7157 didn't have any series for `only_on_prom2` metric in the last 1w. (promql/series)
Expand Down
7 changes: 7 additions & 0 deletions docs/changelog.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,12 @@
# Changelog

## v0.49.1

### Fixed

- The `alerts/count` check wasn't using the `uptime` field from `prometheus` config blocks
for metric gap detection.

## v0.49.0

### Added
Expand Down
2 changes: 1 addition & 1 deletion go.mod
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
module github.com/cloudflare/pint

go 1.21.4
go 1.21.3

require (
github.com/cespare/xxhash/v2 v2.2.0
Expand Down
2 changes: 1 addition & 1 deletion internal/checks/alerts_count.go
Original file line number Diff line number Diff line change
Expand Up @@ -77,7 +77,7 @@ func (c AlertsCheck) Check(ctx context.Context, _ string, rule parser.Rule, _ []
}

if len(qr.Series.Ranges) > 0 {
promUptime, err := c.prom.RangeQuery(ctx, "count(up)", params)
promUptime, err := c.prom.RangeQuery(ctx, fmt.Sprintf("count(%s)", c.prom.UptimeMetric()), params)
if err != nil {
slog.Warn("Cannot detect Prometheus uptime gaps", slog.Any("err", err), slog.String("name", c.prom.Name()))
} else {
Expand Down
1 change: 1 addition & 0 deletions internal/config/prometheus.go
Original file line number Diff line number Diff line change
Expand Up @@ -235,6 +235,7 @@ func (pg *PrometheusGenerator) addServer(server *promapi.FailoverGroup) error {
"Configured new Prometheus server",
slog.String("name", server.Name()),
slog.Int("uris", server.ServerCount()),
slog.String("uptime", server.UptimeMetric()),
slog.Any("tags", server.Tags()),
slog.Any("include", server.Include()),
slog.Any("exclude", server.Exclude()),
Expand Down
2 changes: 1 addition & 1 deletion tools/gofumpt/go.mod
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
module _

go 1.21.4
go 1.21.3

require mvdan.cc/gofumpt v0.5.0

Expand Down
2 changes: 1 addition & 1 deletion tools/goimports/go.mod
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
module _

go 1.21.4
go 1.21.3

require golang.org/x/tools v0.14.0

Expand Down
2 changes: 1 addition & 1 deletion tools/golangci-lint/go.mod
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
module _

go 1.21.4
go 1.21.3

require github.com/golangci/golangci-lint v1.55.2

Expand Down

0 comments on commit 15a0d03

Please sign in to comment.