Skip to content

Commit

Permalink
Merge pull request #892 from SuperQ/superq/split-histogram
Browse files Browse the repository at this point in the history
Split kube_apiserver rules
  • Loading branch information
metalmatze authored Feb 27, 2024
2 parents 883f294 + 0b456ff commit a1c276d
Show file tree
Hide file tree
Showing 5 changed files with 306 additions and 284 deletions.
165 changes: 165 additions & 0 deletions rules/kube_apiserver-availability.libsonnet
Original file line number Diff line number Diff line change
@@ -0,0 +1,165 @@
{
prometheusRules+:: {
local SLODays = $._config.SLOs.apiserver.days + 'd',
local verbs = [
{ type: 'read', selector: $._config.kubeApiserverReadSelector },
{ type: 'write', selector: $._config.kubeApiserverWriteSelector },
],

groups+: [
{
name: 'kube-apiserver-availability.rules',
interval: '3m',
rules: [
{
record: 'code_verb:apiserver_request_total:increase%s' % SLODays,
expr: |||
avg_over_time(code_verb:apiserver_request_total:increase1h[%s]) * 24 * %d
||| % [SLODays, $._config.SLOs.apiserver.days],
},
] + [
{
record: 'code:apiserver_request_total:increase%s' % SLODays,
expr: |||
sum by (%s, code) (code_verb:apiserver_request_total:increase%s{%s})
||| % [$._config.clusterLabel, SLODays, verb.selector],
labels: {
verb: verb.type,
},
}
for verb in verbs
] + [
{
record: 'cluster_verb_scope:apiserver_request_sli_duration_seconds_count:increase1h',
expr: |||
sum by (%(clusterLabel)s, verb, scope) (increase(apiserver_request_sli_duration_seconds_count{%(kubeApiserverSelector)s}[1h]))
||| % $._config,
},
{
record: 'cluster_verb_scope:apiserver_request_sli_duration_seconds_count:increase%s' % SLODays,
expr: |||
sum by (%s, verb, scope) (avg_over_time(cluster_verb_scope:apiserver_request_sli_duration_seconds_count:increase1h[%s]) * 24 * %s)
||| % [$._config.clusterLabel, SLODays, $._config.SLOs.apiserver.days],
},
{
record: 'cluster_verb_scope_le:apiserver_request_sli_duration_seconds_bucket:increase1h',
expr: |||
sum by (%(clusterLabel)s, verb, scope, le) (increase(apiserver_request_sli_duration_seconds_bucket[1h]))
||| % $._config,
},
{
record: 'cluster_verb_scope_le:apiserver_request_sli_duration_seconds_bucket:increase%s' % SLODays,
expr: |||
sum by (%s, verb, scope, le) (avg_over_time(cluster_verb_scope_le:apiserver_request_sli_duration_seconds_bucket:increase1h[%s]) * 24 * %s)
||| % [$._config.clusterLabel, SLODays, $._config.SLOs.apiserver.days],
},
{
record: 'apiserver_request:availability%s' % SLODays,
expr: |||
1 - (
(
# write too slow
sum by (%(clusterLabel)s) (cluster_verb_scope:apiserver_request_sli_duration_seconds_count:increase%(SLODays)s{%(kubeApiserverWriteSelector)s})
-
sum by (%(clusterLabel)s) (cluster_verb_scope_le:apiserver_request_sli_duration_seconds_bucket:increase%(SLODays)s{%(kubeApiserverWriteSelector)s,le="%(kubeApiserverWriteLatency)s"})
) +
(
# read too slow
sum by (%(clusterLabel)s) (cluster_verb_scope:apiserver_request_sli_duration_seconds_count:increase%(SLODays)s{%(kubeApiserverReadSelector)s})
-
(
(
sum by (%(clusterLabel)s) (cluster_verb_scope_le:apiserver_request_sli_duration_seconds_bucket:increase%(SLODays)s{%(kubeApiserverReadSelector)s,scope=~"resource|",le="%(kubeApiserverReadResourceLatency)s"})
or
vector(0)
)
+
sum by (%(clusterLabel)s) (cluster_verb_scope_le:apiserver_request_sli_duration_seconds_bucket:increase%(SLODays)s{%(kubeApiserverReadSelector)s,scope="namespace",le="%(kubeApiserverReadNamespaceLatency)s"})
+
sum by (%(clusterLabel)s) (cluster_verb_scope_le:apiserver_request_sli_duration_seconds_bucket:increase%(SLODays)s{%(kubeApiserverReadSelector)s,scope="cluster",le="%(kubeApiserverReadClusterLatency)s"})
)
) +
# errors
sum by (%(clusterLabel)s) (code:apiserver_request_total:increase%(SLODays)s{code=~"5.."} or vector(0))
)
/
sum by (%(clusterLabel)s) (code:apiserver_request_total:increase%(SLODays)s)
||| % ($._config { SLODays: SLODays }),
labels: {
verb: 'all',
},
},
{
record: 'apiserver_request:availability%s' % SLODays,
expr: |||
1 - (
sum by (%(clusterLabel)s) (cluster_verb_scope:apiserver_request_sli_duration_seconds_count:increase%(SLODays)s{%(kubeApiserverReadSelector)s})
-
(
# too slow
(
sum by (%(clusterLabel)s) (cluster_verb_scope_le:apiserver_request_sli_duration_seconds_bucket:increase%(SLODays)s{%(kubeApiserverReadSelector)s,scope=~"resource|",le="%(kubeApiserverReadResourceLatency)s"})
or
vector(0)
)
+
sum by (%(clusterLabel)s) (cluster_verb_scope_le:apiserver_request_sli_duration_seconds_bucket:increase%(SLODays)s{%(kubeApiserverReadSelector)s,scope="namespace",le="%(kubeApiserverReadNamespaceLatency)s"})
+
sum by (%(clusterLabel)s) (cluster_verb_scope_le:apiserver_request_sli_duration_seconds_bucket:increase%(SLODays)s{%(kubeApiserverReadSelector)s,scope="cluster",le="%(kubeApiserverReadClusterLatency)s"})
)
+
# errors
sum by (%(clusterLabel)s) (code:apiserver_request_total:increase%(SLODays)s{verb="read",code=~"5.."} or vector(0))
)
/
sum by (%(clusterLabel)s) (code:apiserver_request_total:increase%(SLODays)s{verb="read"})
||| % ($._config { SLODays: SLODays, days: $._config.SLOs.apiserver.days }),
labels: {
verb: 'read',
},
},
{
record: 'apiserver_request:availability%s' % SLODays,
expr: |||
1 - (
(
# too slow
sum by (%(clusterLabel)s) (cluster_verb_scope:apiserver_request_sli_duration_seconds_count:increase%(SLODays)s{%(kubeApiserverWriteSelector)s})
-
sum by (%(clusterLabel)s) (cluster_verb_scope_le:apiserver_request_sli_duration_seconds_bucket:increase%(SLODays)s{%(kubeApiserverWriteSelector)s,le="%(kubeApiserverWriteLatency)s"})
)
+
# errors
sum by (%(clusterLabel)s) (code:apiserver_request_total:increase%(SLODays)s{verb="write",code=~"5.."} or vector(0))
)
/
sum by (%(clusterLabel)s) (code:apiserver_request_total:increase%(SLODays)s{verb="write"})
||| % ($._config { SLODays: SLODays, days: $._config.SLOs.apiserver.days }),
labels: {
verb: 'write',
},
},
] + [
{
record: 'code_resource:apiserver_request_total:rate5m',
expr: |||
sum by (%s,code,resource) (rate(apiserver_request_total{%s}[5m]))
||| % [$._config.clusterLabel, std.join(',', [$._config.kubeApiserverSelector, verb.selector])],
labels: {
verb: verb.type,
},
}
for verb in verbs
] + [
{
record: 'code_verb:apiserver_request_total:increase1h',
expr: |||
sum by (%s, code, verb) (increase(apiserver_request_total{%s,verb=~"LIST|GET|POST|PUT|PATCH|DELETE",code=~"%s"}[1h]))
||| % [$._config.clusterLabel, $._config.kubeApiserverSelector, code],
}
for code in ['2..', '3..', '4..', '5..']
],
},
],
},
}
93 changes: 93 additions & 0 deletions rules/kube_apiserver-burnrate.libsonnet
Original file line number Diff line number Diff line change
@@ -0,0 +1,93 @@
{
prometheusRules+:: {
groups+: [
{
name: 'kube-apiserver-burnrate.rules',
rules: [
{
record: 'apiserver_request:burnrate%(window)s' % w,
expr: |||
(
(
# too slow
sum by (%(clusterLabel)s) (rate(apiserver_request_sli_duration_seconds_count{%(kubeApiserverSelector)s,%(kubeApiserverReadSelector)s,%(kubeApiserverNonStreamingSelector)s}[%(window)s]))
-
(
(
sum by (%(clusterLabel)s) (rate(apiserver_request_sli_duration_seconds_bucket{%(kubeApiserverSelector)s,%(kubeApiserverReadSelector)s,%(kubeApiserverNonStreamingSelector)s,scope=~"resource|",le="%(kubeApiserverReadResourceLatency)s"}[%(window)s]))
or
vector(0)
)
+
sum by (%(clusterLabel)s) (rate(apiserver_request_sli_duration_seconds_bucket{%(kubeApiserverSelector)s,%(kubeApiserverReadSelector)s,%(kubeApiserverNonStreamingSelector)s,scope="namespace",le="%(kubeApiserverReadNamespaceLatency)s"}[%(window)s]))
+
sum by (%(clusterLabel)s) (rate(apiserver_request_sli_duration_seconds_bucket{%(kubeApiserverSelector)s,%(kubeApiserverReadSelector)s,%(kubeApiserverNonStreamingSelector)s,scope="cluster",le="%(kubeApiserverReadClusterLatency)s"}[%(window)s]))
)
)
+
# errors
sum by (%(clusterLabel)s) (rate(apiserver_request_total{%(kubeApiserverSelector)s,%(kubeApiserverReadSelector)s,code=~"5.."}[%(window)s]))
)
/
sum by (%(clusterLabel)s) (rate(apiserver_request_total{%(kubeApiserverSelector)s,%(kubeApiserverReadSelector)s}[%(window)s]))
||| % {
clusterLabel: $._config.clusterLabel,
window: w,
kubeApiserverSelector: $._config.kubeApiserverSelector,
kubeApiserverReadSelector: $._config.kubeApiserverReadSelector,
kubeApiserverNonStreamingSelector: $._config.kubeApiserverNonStreamingSelector,
kubeApiserverReadResourceLatency: $._config.kubeApiserverReadResourceLatency,
kubeApiserverReadNamespaceLatency: $._config.kubeApiserverReadNamespaceLatency,
kubeApiserverReadClusterLatency: $._config.kubeApiserverReadClusterLatency,
},
labels: {
verb: 'read',
},
}
for w in std.set([ // Get the unique array of short and long window rates
w.short
for w in $._config.SLOs.apiserver.windows
] + [
w.long
for w in $._config.SLOs.apiserver.windows
])
] + [
{
record: 'apiserver_request:burnrate%(window)s' % w,
expr: |||
(
(
# too slow
sum by (%(clusterLabel)s) (rate(apiserver_request_sli_duration_seconds_count{%(kubeApiserverSelector)s,%(kubeApiserverWriteSelector)s,%(kubeApiserverNonStreamingSelector)s}[%(window)s]))
-
sum by (%(clusterLabel)s) (rate(apiserver_request_sli_duration_seconds_bucket{%(kubeApiserverSelector)s,%(kubeApiserverWriteSelector)s,%(kubeApiserverNonStreamingSelector)s,le="%(kubeApiserverWriteLatency)s"}[%(window)s]))
)
+
sum by (%(clusterLabel)s) (rate(apiserver_request_total{%(kubeApiserverSelector)s,%(kubeApiserverWriteSelector)s,code=~"5.."}[%(window)s]))
)
/
sum by (%(clusterLabel)s) (rate(apiserver_request_total{%(kubeApiserverSelector)s,%(kubeApiserverWriteSelector)s}[%(window)s]))
||| % {
clusterLabel: $._config.clusterLabel,
window: w,
kubeApiserverSelector: $._config.kubeApiserverSelector,
kubeApiserverWriteSelector: $._config.kubeApiserverWriteSelector,
kubeApiserverNonStreamingSelector: $._config.kubeApiserverNonStreamingSelector,
kubeApiserverWriteLatency: $._config.kubeApiserverWriteLatency,
},
labels: {
verb: 'write',
},
}
for w in std.set([ // Get the unique array of short and long window rates
w.short
for w in $._config.SLOs.apiserver.windows
] + [
w.long
for w in $._config.SLOs.apiserver.windows
])
],
},
],
},
}
16 changes: 16 additions & 0 deletions rules/kube_apiserver-config.libsonnet
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
{
_config+:: {
kubeApiserverSelector: 'job="kube-apiserver"',
podLabel: 'pod',
kubeApiserverReadSelector: 'verb=~"LIST|GET"',
kubeApiserverWriteSelector: 'verb=~"POST|PUT|PATCH|DELETE"',
kubeApiserverNonStreamingSelector: 'subresource!~"proxy|attach|log|exec|portforward"',
// These are buckets that exist on the apiserver_request_sli_duration_seconds_bucket histogram.
// They are what the Kubernetes SIG Scalability is using to measure availability of Kubernetes clusters.
// If you want to change these, make sure the "le" buckets exist on the histogram!
kubeApiserverReadResourceLatency: '1',
kubeApiserverReadNamespaceLatency: '5',
kubeApiserverReadClusterLatency: '30',
kubeApiserverWriteLatency: '1',
},
}
28 changes: 28 additions & 0 deletions rules/kube_apiserver-histogram.libsonnet
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
{
prometheusRules+:: {
local verbs = [
{ type: 'read', selector: $._config.kubeApiserverReadSelector },
{ type: 'write', selector: $._config.kubeApiserverWriteSelector },
],

groups+: [
{
name: 'kube-apiserver-histogram.rules',
rules:
[
{
record: 'cluster_quantile:apiserver_request_sli_duration_seconds:histogram_quantile',
expr: |||
histogram_quantile(0.99, sum by (%s, le, resource) (rate(apiserver_request_sli_duration_seconds_bucket{%s}[5m]))) > 0
||| % [$._config.clusterLabel, std.join(',', [$._config.kubeApiserverSelector, verb.selector, $._config.kubeApiserverNonStreamingSelector])],
labels: {
verb: verb.type,
quantile: '0.99',
},
}
for verb in verbs
],
},
],
},
}
Loading

0 comments on commit a1c276d

Please sign in to comment.