Skip to content

Commit

Permalink
Merge pull request #249 from vshn/redis-alerting
Browse files Browse the repository at this point in the history
Add Redis alerting
  • Loading branch information
wejdross authored Oct 24, 2023
2 parents 4d4aa06 + 060ec6e commit 99cd295
Show file tree
Hide file tree
Showing 22 changed files with 291 additions and 42 deletions.
2 changes: 1 addition & 1 deletion component/class/defaults.yml
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@ parameters:
appcat:
registry: ghcr.io
repository: vshn/appcat
tag: v4.36.0
tag: v4.37.0
apiserver:
registry: ghcr.io
repository: vshn/appcat-apiserver
Expand Down
1 change: 1 addition & 0 deletions component/component/main.jsonnet
Original file line number Diff line number Diff line change
Expand Up @@ -171,5 +171,6 @@ local emailSecret = kube.Secret(params.services.vshn.emailAlerting.secretName) {
[if params.services.vshn.enabled && params.services.vshn.postgres.enabled then 'sli_exporter/90_slo_vshn_postgresql']: slos.Get('vshn-postgresql'),
[if params.services.vshn.enabled && params.services.vshn.postgres.enabled then 'sli_exporter/90_slo_vshn_postgresql_ha']: slos.Get('vshn-postgresql-ha'),
[if params.services.vshn.enabled && params.services.vshn.redis.enabled then 'sli_exporter/90_slo_vshn_redis']: slos.Get('vshn-redis'),
[if params.services.vshn.enabled && params.services.vshn.redis.enabled then 'sli_exporter/90_slo_vshn_redis_ha']: slos.Get('vshn-redis-ha'),
}
else {}
35 changes: 31 additions & 4 deletions component/component/slos.libsonnet
Original file line number Diff line number Diff line change
Expand Up @@ -89,7 +89,7 @@ local prometheusRule(name) =
events: {
// The 0*rate(...) term ensures that the query reports an error rate for every instance, even one that has never produced a single error
error_query: 'sum(rate(appcat_probes_seconds_count{reason!="success", service="VSHNPostgreSQL", ha="true"}[{{.window}}]) or 0*rate(appcat_probes_seconds_count{service="VSHNPostgreSQL"}[{{.window}}])) by (service, namespace, name, organization, sla) or sum(0*rate(appcat_probes_seconds_count{service="VSHNPostgreSQL"}[{{.window}}])) by (service, namespace, name, organization, sla)',
total_query: 'sum(rate(appcat_probes_seconds_count{service="VSHNPostgreSQL", ha="true"}[{{.window}}])) by (service, namespace, name, organization, sla)',
total_query: 'sum(rate(appcat_probes_seconds_count{service="VSHNPostgreSQL", ha="true"}[{{.window}}])) by (service, namespace, name, organization, sla)',
},
},
alerting+: {
Expand All @@ -104,21 +104,48 @@ local prometheusRule(name) =
},
},
],
// Redis without HA
'vshn-redis': [
newSLO('uptime', 'vshn-Redis', params.slos.vshn.redis.uptime) {
newSLO('uptime', 'vshn-redis', params.slos.vshn.redis.uptime) {
description: 'Uptime SLO for Redis by VSHN',
sli: {
events: {
// The 0*rate(...) term ensures that the query reports an error rate for every instance, even one that has never produced a single error
error_query: '(sum(rate(appcat_probes_seconds_count{reason!="success", service="VSHNRedis"}[{{.window}}]) or 0*rate(appcat_probes_seconds_count{service="VSHNRedis"}[{{.window}}])) by (service, namespace, name, organization, sla) or vector(0)) - scalar(appcat:cluster:maintenance) > 0 or sum(0*rate(appcat_probes_seconds_count{service="VSHNRedis"}[{{.window}}])) by (service, namespace, name, organization, sla)',
total_query: 'sum(rate(appcat_probes_seconds_count{service="VSHNRedis"}[{{.window}}])) by (service, namespace, name, organization, sla)',
error_query: '(sum(rate(appcat_probes_seconds_count{reason!="success", service="VSHNRedis", ha="false"}[{{.window}}]) or 0*rate(appcat_probes_seconds_count{service="VSHNRedis"}[{{.window}}])) by (service, namespace, name, organization, sla) or vector(0)) - scalar(appcat:cluster:maintenance) > 0 or sum(0*rate(appcat_probes_seconds_count{service="VSHNRedis"}[{{.window}}])) by (service, namespace, name, organization, sla)',
total_query: 'sum(rate(appcat_probes_seconds_count{service="VSHNRedis", ha="false"}[{{.window}}])) by (service, namespace, name, organization, sla)',
},
},
alerting+: {
name: 'SLO_AppCat_VSHNRedisUptime',
annotations+: {
summary: 'Probes to Redis by VSHN instance fail',
},
labels+: {
service: 'VSHNRedis',
OnCall: '{{ if eq $labels.sla "guaranteed" }}true{{ else }}false{{ end }}',
},
},
},
],
'vshn-redis-ha': [
newSLO('uptime', 'vshn-redis-ha', params.slos.vshn.redis.uptime) {
description: 'Uptime SLO for High Available Redis by VSHN',
sli: {
events: {
// The 0*rate(...) term ensures that the query reports an error rate for every instance, even one that has never produced a single error
error_query: '(sum(rate(appcat_probes_seconds_count{reason!="success", service="VSHNRedis", ha="true"}[{{.window}}]) or 0*rate(appcat_probes_seconds_count{service="VSHNRedis"}[{{.window}}])) by (service, namespace, name, organization, sla) or vector(0)) - scalar(appcat:cluster:maintenance) > 0 or sum(0*rate(appcat_probes_seconds_count{service="VSHNRedis"}[{{.window}}])) by (service, namespace, name, organization, sla)',
total_query: 'sum(rate(appcat_probes_seconds_count{service="VSHNRedis", ha="true"}[{{.window}}])) by (service, namespace, name, organization, sla)',
},
},
alerting+: {
name: 'SLO_AppCat_HAVSHNRedisUptime',
annotations+: {
summary: 'Probes to HA Redis by VSHN instance fail',
},
labels+: {
service: 'VSHNRedis',
OnCall: '{{ if eq $labels.sla "guaranteed" }}true{{ else }}false{{ end }}',
},
},
},
],
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ spec:
env:
- name: PLANS_NAMESPACE
value: syn-appcat
image: ghcr.io/vshn/appcat:v4.36.0
image: ghcr.io/vshn/appcat:v4.37.0
livenessProbe:
httpGet:
path: /healthz
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ spec:
value: "false"
- name: APPCAT_SLI_VSHNREDIS
value: "false"
image: ghcr.io/vshn/appcat:v4.36.0
image: ghcr.io/vshn/appcat:v4.37.0
livenessProbe:
httpGet:
path: /healthz
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ spec:
data:
controlNamespace: syn-appcat-control
defaultPlan: standard-1
imageTag: v4.36.0
imageTag: v4.37.0
maintenanceSA: helm-based-service-maintenance
minioChartRepository: https://charts.min.io
minioChartVersion: 5.0.13
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ spec:
env:
- name: PLANS_NAMESPACE
value: syn-appcat
image: ghcr.io/vshn/appcat:v4.36.0
image: ghcr.io/vshn/appcat:v4.37.0
livenessProbe:
httpGet:
path: /healthz
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ spec:
envFrom:
- secretRef:
name: appcat-sla-reports-creds
image: ghcr.io/vshn/appcat:v4.36.0
image: ghcr.io/vshn/appcat:v4.37.0
name: sla-reporter
resources:
limits:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ spec:
value: "false"
- name: APPCAT_SLI_VSHNREDIS
value: "false"
image: ghcr.io/vshn/appcat:v4.36.0
image: ghcr.io/vshn/appcat:v4.37.0
livenessProbe:
httpGet:
path: /healthz
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ spec:
value: "false"
- name: APPCAT_SLI_VSHNREDIS
value: "false"
image: ghcr.io/vshn/appcat:v4.36.0
image: ghcr.io/vshn/appcat:v4.37.0
livenessProbe:
httpGet:
path: /healthz
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -125,6 +125,15 @@ spec:
redisSettings:
description: RedisSettings contains additional Redis settings.
type: string
serviceLevel:
default: besteffort
description: ServiceLevel defines the service level of this
service. Either Best Effort or Guaranteed Availability
is allowed.
enum:
- besteffort
- guaranteed
type: string
version:
default: '7.0'
description: Version contains supported version of Redis.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ spec:
data:
controlNamespace: syn-appcat-control
defaultPlan: standard-1
imageTag: v4.36.0
imageTag: v4.37.0
maintenanceSA: helm-based-service-maintenance
minioChartRepository: https://charts.min.io
minioChartVersion: 5.0.13
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ spec:
emailAlertingSmtpHost: smtp.eu.mailgun.org:465
emailAlertingSmtpUsername: myuser@example.com
externalDatabaseConnectionsEnabled: 'true'
imageTag: v4.36.0
imageTag: v4.37.0
quotasEnabled: 'false'
sgNamespace: stackgres
sideCars: '{"clusterController": {"limits": {"cpu": "32m", "memory": "2Gi"},
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ spec:
emailAlertingSmtpHost: smtp.eu.mailgun.org:465
emailAlertingSmtpUsername: myuser@example.com
externalDatabaseConnectionsEnabled: 'true'
imageTag: v4.36.0
imageTag: v4.37.0
quotasEnabled: 'false'
sgNamespace: stackgres
sideCars: '{"clusterController": {"limits": {"cpu": "32m", "memory": "2Gi"},
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ spec:
data:
bucketRegion: lpg
controlNamespace: syn-appcat-control
imageTag: v4.36.0
imageTag: v4.37.0
maintenanceSA: helm-based-service-maintenance
quotasEnabled: 'false'
restoreSA: redisrestoreserviceaccount
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ spec:
env:
- name: PLANS_NAMESPACE
value: syn-appcat
image: ghcr.io/vshn/appcat:v4.36.0
image: ghcr.io/vshn/appcat:v4.37.0
livenessProbe:
httpGet:
path: /healthz
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ spec:
envFrom:
- secretRef:
name: appcat-sla-reports-creds
image: ghcr.io/vshn/appcat:v4.36.0
image: ghcr.io/vshn/appcat:v4.37.0
name: sla-reporter
resources:
limits:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ spec:
- expr: |
(sum(rate(appcat_probes_seconds_count{reason!="success", service="VSHNPostgreSQL", ha="true"}[5m]) or 0*rate(appcat_probes_seconds_count{service="VSHNPostgreSQL"}[5m])) by (service, namespace, name, organization, sla) or sum(0*rate(appcat_probes_seconds_count{service="VSHNPostgreSQL"}[5m])) by (service, namespace, name, organization, sla))
/
(sum(rate(appcat_probes_seconds_count{service="VSHNPostgreSQL", ha="true"}[5m])) by (service, namespace, name, organization, sla))
(sum(rate(appcat_probes_seconds_count{service="VSHNPostgreSQL", ha="true"}[5m])) by (service, namespace, name, organization, sla))
labels:
sloth_id: appcat-vshn-postgresql-ha-uptime
sloth_service: appcat-vshn-postgresql-ha
Expand All @@ -23,7 +23,7 @@ spec:
- expr: |
(sum(rate(appcat_probes_seconds_count{reason!="success", service="VSHNPostgreSQL", ha="true"}[30m]) or 0*rate(appcat_probes_seconds_count{service="VSHNPostgreSQL"}[30m])) by (service, namespace, name, organization, sla) or sum(0*rate(appcat_probes_seconds_count{service="VSHNPostgreSQL"}[30m])) by (service, namespace, name, organization, sla))
/
(sum(rate(appcat_probes_seconds_count{service="VSHNPostgreSQL", ha="true"}[30m])) by (service, namespace, name, organization, sla))
(sum(rate(appcat_probes_seconds_count{service="VSHNPostgreSQL", ha="true"}[30m])) by (service, namespace, name, organization, sla))
labels:
sloth_id: appcat-vshn-postgresql-ha-uptime
sloth_service: appcat-vshn-postgresql-ha
Expand All @@ -33,7 +33,7 @@ spec:
- expr: |
(sum(rate(appcat_probes_seconds_count{reason!="success", service="VSHNPostgreSQL", ha="true"}[1h]) or 0*rate(appcat_probes_seconds_count{service="VSHNPostgreSQL"}[1h])) by (service, namespace, name, organization, sla) or sum(0*rate(appcat_probes_seconds_count{service="VSHNPostgreSQL"}[1h])) by (service, namespace, name, organization, sla))
/
(sum(rate(appcat_probes_seconds_count{service="VSHNPostgreSQL", ha="true"}[1h])) by (service, namespace, name, organization, sla))
(sum(rate(appcat_probes_seconds_count{service="VSHNPostgreSQL", ha="true"}[1h])) by (service, namespace, name, organization, sla))
labels:
sloth_id: appcat-vshn-postgresql-ha-uptime
sloth_service: appcat-vshn-postgresql-ha
Expand All @@ -43,7 +43,7 @@ spec:
- expr: |
(sum(rate(appcat_probes_seconds_count{reason!="success", service="VSHNPostgreSQL", ha="true"}[2h]) or 0*rate(appcat_probes_seconds_count{service="VSHNPostgreSQL"}[2h])) by (service, namespace, name, organization, sla) or sum(0*rate(appcat_probes_seconds_count{service="VSHNPostgreSQL"}[2h])) by (service, namespace, name, organization, sla))
/
(sum(rate(appcat_probes_seconds_count{service="VSHNPostgreSQL", ha="true"}[2h])) by (service, namespace, name, organization, sla))
(sum(rate(appcat_probes_seconds_count{service="VSHNPostgreSQL", ha="true"}[2h])) by (service, namespace, name, organization, sla))
labels:
sloth_id: appcat-vshn-postgresql-ha-uptime
sloth_service: appcat-vshn-postgresql-ha
Expand All @@ -53,7 +53,7 @@ spec:
- expr: |
(sum(rate(appcat_probes_seconds_count{reason!="success", service="VSHNPostgreSQL", ha="true"}[6h]) or 0*rate(appcat_probes_seconds_count{service="VSHNPostgreSQL"}[6h])) by (service, namespace, name, organization, sla) or sum(0*rate(appcat_probes_seconds_count{service="VSHNPostgreSQL"}[6h])) by (service, namespace, name, organization, sla))
/
(sum(rate(appcat_probes_seconds_count{service="VSHNPostgreSQL", ha="true"}[6h])) by (service, namespace, name, organization, sla))
(sum(rate(appcat_probes_seconds_count{service="VSHNPostgreSQL", ha="true"}[6h])) by (service, namespace, name, organization, sla))
labels:
sloth_id: appcat-vshn-postgresql-ha-uptime
sloth_service: appcat-vshn-postgresql-ha
Expand All @@ -63,7 +63,7 @@ spec:
- expr: |
(sum(rate(appcat_probes_seconds_count{reason!="success", service="VSHNPostgreSQL", ha="true"}[1d]) or 0*rate(appcat_probes_seconds_count{service="VSHNPostgreSQL"}[1d])) by (service, namespace, name, organization, sla) or sum(0*rate(appcat_probes_seconds_count{service="VSHNPostgreSQL"}[1d])) by (service, namespace, name, organization, sla))
/
(sum(rate(appcat_probes_seconds_count{service="VSHNPostgreSQL", ha="true"}[1d])) by (service, namespace, name, organization, sla))
(sum(rate(appcat_probes_seconds_count{service="VSHNPostgreSQL", ha="true"}[1d])) by (service, namespace, name, organization, sla))
labels:
sloth_id: appcat-vshn-postgresql-ha-uptime
sloth_service: appcat-vshn-postgresql-ha
Expand All @@ -73,7 +73,7 @@ spec:
- expr: |
(sum(rate(appcat_probes_seconds_count{reason!="success", service="VSHNPostgreSQL", ha="true"}[3d]) or 0*rate(appcat_probes_seconds_count{service="VSHNPostgreSQL"}[3d])) by (service, namespace, name, organization, sla) or sum(0*rate(appcat_probes_seconds_count{service="VSHNPostgreSQL"}[3d])) by (service, namespace, name, organization, sla))
/
(sum(rate(appcat_probes_seconds_count{service="VSHNPostgreSQL", ha="true"}[3d])) by (service, namespace, name, organization, sla))
(sum(rate(appcat_probes_seconds_count{service="VSHNPostgreSQL", ha="true"}[3d])) by (service, namespace, name, organization, sla))
labels:
sloth_id: appcat-vshn-postgresql-ha-uptime
sloth_service: appcat-vshn-postgresql-ha
Expand Down
Loading

0 comments on commit 99cd295

Please sign in to comment.