Merge pull request #249 from vshn/redis-alerting

Add Redis alerting
vshn · Oct 24, 2023 · 99cd295 · 99cd295
2 parents 4d4aa06 + 060ec6e
commit 99cd295
Show file tree

Hide file tree

Showing 22 changed files with 291 additions and 42 deletions.
diff --git a/component/class/defaults.yml b/component/class/defaults.yml
@@ -39,7 +39,7 @@ parameters:
       appcat:
         registry: ghcr.io
         repository: vshn/appcat
-        tag: v4.36.0
+        tag: v4.37.0
       apiserver:
         registry: ghcr.io
         repository: vshn/appcat-apiserver

diff --git a/component/component/main.jsonnet b/component/component/main.jsonnet
@@ -171,5 +171,6 @@ local emailSecret = kube.Secret(params.services.vshn.emailAlerting.secretName) {
   [if params.services.vshn.enabled && params.services.vshn.postgres.enabled then 'sli_exporter/90_slo_vshn_postgresql']: slos.Get('vshn-postgresql'),
   [if params.services.vshn.enabled && params.services.vshn.postgres.enabled then 'sli_exporter/90_slo_vshn_postgresql_ha']: slos.Get('vshn-postgresql-ha'),
   [if params.services.vshn.enabled && params.services.vshn.redis.enabled then 'sli_exporter/90_slo_vshn_redis']: slos.Get('vshn-redis'),
+  [if params.services.vshn.enabled && params.services.vshn.redis.enabled then 'sli_exporter/90_slo_vshn_redis_ha']: slos.Get('vshn-redis-ha'),
 }
 else {}
diff --git a/component/component/slos.libsonnet b/component/component/slos.libsonnet
@@ -89,7 +89,7 @@ local prometheusRule(name) =
           events: {
             // The 0*rate(...) term makes sure that the query reports an error rate for every instance, even if that instance has never produced a single error
             error_query: 'sum(rate(appcat_probes_seconds_count{reason!="success", service="VSHNPostgreSQL", ha="true"}[{{.window}}]) or 0*rate(appcat_probes_seconds_count{service="VSHNPostgreSQL"}[{{.window}}])) by (service, namespace, name, organization, sla) or sum(0*rate(appcat_probes_seconds_count{service="VSHNPostgreSQL"}[{{.window}}])) by (service, namespace, name, organization, sla)',
-            total_query: 'sum(rate(appcat_probes_seconds_count{service="VSHNPostgreSQL",  ha="true"}[{{.window}}])) by (service, namespace, name, organization, sla)',
+            total_query: 'sum(rate(appcat_probes_seconds_count{service="VSHNPostgreSQL", ha="true"}[{{.window}}])) by (service, namespace, name, organization, sla)',
           },
         },
         alerting+: {
@@ -104,21 +104,48 @@ local prometheusRule(name) =
         },
       },
     ],
+    // redis without HA
     'vshn-redis': [
-      newSLO('uptime', 'vshn-Redis', params.slos.vshn.redis.uptime) {
+      newSLO('uptime', 'vshn-redis', params.slos.vshn.redis.uptime) {
         description: 'Uptime SLO for Redis by VSHN',
         sli: {
           events: {
             // The 0*rate(...) term makes sure that the query reports an error rate for every instance, even if that instance has never produced a single error
-            error_query: '(sum(rate(appcat_probes_seconds_count{reason!="success", service="VSHNRedis"}[{{.window}}]) or 0*rate(appcat_probes_seconds_count{service="VSHNRedis"}[{{.window}}]))  by (service, namespace, name, organization, sla) or vector(0)) - scalar(appcat:cluster:maintenance) > 0 or sum(0*rate(appcat_probes_seconds_count{service="VSHNRedis"}[{{.window}}])) by (service, namespace, name, organization, sla)',
-            total_query: 'sum(rate(appcat_probes_seconds_count{service="VSHNRedis"}[{{.window}}])) by (service, namespace, name, organization, sla)',
+            error_query: '(sum(rate(appcat_probes_seconds_count{reason!="success", service="VSHNRedis", ha="false"}[{{.window}}]) or 0*rate(appcat_probes_seconds_count{service="VSHNRedis"}[{{.window}}]))  by (service, namespace, name, organization, sla) or vector(0)) - scalar(appcat:cluster:maintenance) > 0 or sum(0*rate(appcat_probes_seconds_count{service="VSHNRedis"}[{{.window}}])) by (service, namespace, name, organization, sla)',
+            total_query: 'sum(rate(appcat_probes_seconds_count{service="VSHNRedis", ha="false"}[{{.window}}])) by (service, namespace, name, organization, sla)',
           },
         },
         alerting+: {
           name: 'SLO_AppCat_VSHNRedisUptime',
           annotations+: {
             summary: 'Probes to Redis by VSHN instance fail',
           },
+          labels+: {
+            service: 'VSHNRedis',
+            OnCall: '{{ if eq $labels.sla "guaranteed" }}true{{ else }}false{{ end }}',
+          },
+        },
+      },
+    ],
+    'vshn-redis-ha': [
+      newSLO('uptime', 'vshn-redis-ha', params.slos.vshn.redis.uptime) {
+        description: 'Uptime SLO for High Available Redis by VSHN',
+        sli: {
+          events: {
+            // The 0*rate(...) term makes sure that the query reports an error rate for every instance, even if that instance has never produced a single error
+            error_query: '(sum(rate(appcat_probes_seconds_count{reason!="success", service="VSHNRedis", ha="true"}[{{.window}}]) or 0*rate(appcat_probes_seconds_count{service="VSHNRedis"}[{{.window}}]))  by (service, namespace, name, organization, sla) or vector(0)) - scalar(appcat:cluster:maintenance) > 0 or sum(0*rate(appcat_probes_seconds_count{service="VSHNRedis"}[{{.window}}])) by (service, namespace, name, organization, sla)',
+            total_query: 'sum(rate(appcat_probes_seconds_count{service="VSHNRedis", ha="true"}[{{.window}}])) by (service, namespace, name, organization, sla)',
+          },
+        },
+        alerting+: {
+          name: 'SLO_AppCat_HAVSHNRedisUptime',
+          annotations+: {
+            summary: 'Probes to HA Redis by VSHN instance fail',
+          },
+          labels+: {
+            service: 'VSHNRedis',
+            OnCall: '{{ if eq $labels.sla "guaranteed" }}true{{ else }}false{{ end }}',
+          },
         },
       },
     ],

diff --git a/component/tests/golden/controllers/appcat/appcat/controllers/appcat/30_deployment.yaml b/component/tests/golden/controllers/appcat/appcat/controllers/appcat/30_deployment.yaml
@@ -23,7 +23,7 @@ spec:
           env:
             - name: PLANS_NAMESPACE
               value: syn-appcat
-          image: ghcr.io/vshn/appcat:v4.36.0
+          image: ghcr.io/vshn/appcat:v4.37.0
           livenessProbe:
             httpGet:
               path: /healthz

diff --git a/.../appcat/appcat/sli_exporter/apps_v1_deployment_appcat-sliexporter-controller-manager.yaml b/.../appcat/appcat/sli_exporter/apps_v1_deployment_appcat-sliexporter-controller-manager.yaml
@@ -28,7 +28,7 @@ spec:
           value: "false"
         - name: APPCAT_SLI_VSHNREDIS
           value: "false"
-        image: ghcr.io/vshn/appcat:v4.36.0
+        image: ghcr.io/vshn/appcat:v4.37.0
         livenessProbe:
           httpGet:
             path: /healthz

diff --git a/component/tests/golden/minio/appcat/appcat/21_composition_vshn_minio.yaml b/component/tests/golden/minio/appcat/appcat/21_composition_vshn_minio.yaml
@@ -25,7 +25,7 @@ spec:
         data:
           controlNamespace: syn-appcat-control
           defaultPlan: standard-1
-          imageTag: v4.36.0
+          imageTag: v4.37.0
           maintenanceSA: helm-based-service-maintenance
           minioChartRepository: https://charts.min.io
           minioChartVersion: 5.0.13

diff --git a/component/tests/golden/minio/appcat/appcat/controllers/appcat/30_deployment.yaml b/component/tests/golden/minio/appcat/appcat/controllers/appcat/30_deployment.yaml
@@ -23,7 +23,7 @@ spec:
           env:
             - name: PLANS_NAMESPACE
               value: syn-appcat
-          image: ghcr.io/vshn/appcat:v4.36.0
+          image: ghcr.io/vshn/appcat:v4.37.0
           livenessProbe:
             httpGet:
               path: /healthz

diff --git a/component/tests/golden/minio/appcat/appcat/sla_reporter/01_cronjob.yaml b/component/tests/golden/minio/appcat/appcat/sla_reporter/01_cronjob.yaml
@@ -30,7 +30,7 @@ spec:
               envFrom:
                 - secretRef:
                     name: appcat-sla-reports-creds
-              image: ghcr.io/vshn/appcat:v4.36.0
+              image: ghcr.io/vshn/appcat:v4.37.0
               name: sla-reporter
               resources:
                 limits:

diff --git a/.../appcat/appcat/sli_exporter/apps_v1_deployment_appcat-sliexporter-controller-manager.yaml b/.../appcat/appcat/sli_exporter/apps_v1_deployment_appcat-sliexporter-controller-manager.yaml
@@ -28,7 +28,7 @@ spec:
           value: "false"
         - name: APPCAT_SLI_VSHNREDIS
           value: "false"
-        image: ghcr.io/vshn/appcat:v4.36.0
+        image: ghcr.io/vshn/appcat:v4.37.0
         livenessProbe:
           httpGet:
             path: /healthz

diff --git a/.../appcat/appcat/sli_exporter/apps_v1_deployment_appcat-sliexporter-controller-manager.yaml b/.../appcat/appcat/sli_exporter/apps_v1_deployment_appcat-sliexporter-controller-manager.yaml
@@ -28,7 +28,7 @@ spec:
           value: "false"
         - name: APPCAT_SLI_VSHNREDIS
           value: "false"
-        image: ghcr.io/vshn/appcat:v4.36.0
+        image: ghcr.io/vshn/appcat:v4.37.0
         livenessProbe:
           httpGet:
             path: /healthz

diff --git a/component/tests/golden/vshn/appcat/appcat/20_xrd_vshn_redis.yaml b/component/tests/golden/vshn/appcat/appcat/20_xrd_vshn_redis.yaml
@@ -125,6 +125,15 @@ spec:
                         redisSettings:
                           description: RedisSettings contains additional Redis settings.
                           type: string
+                        serviceLevel:
+                          default: besteffort
+                          description: ServiceLevel defines the service level of this
+                            service. Either Best Effort or Guaranteed Availability
+                            is allowed.
+                          enum:
+                            - besteffort
+                            - guaranteed
+                          type: string
                         version:
                           default: '7.0'
                           description: Version contains supported version of Redis.

diff --git a/component/tests/golden/vshn/appcat/appcat/21_composition_vshn_minio.yaml b/component/tests/golden/vshn/appcat/appcat/21_composition_vshn_minio.yaml
@@ -25,7 +25,7 @@ spec:
         data:
           controlNamespace: syn-appcat-control
           defaultPlan: standard-1
-          imageTag: v4.36.0
+          imageTag: v4.37.0
           maintenanceSA: helm-based-service-maintenance
           minioChartRepository: https://charts.min.io
           minioChartVersion: 5.0.13

diff --git a/component/tests/golden/vshn/appcat/appcat/21_composition_vshn_postgres.yaml b/component/tests/golden/vshn/appcat/appcat/21_composition_vshn_postgres.yaml
@@ -33,7 +33,7 @@ spec:
           emailAlertingSmtpHost: smtp.eu.mailgun.org:465
           emailAlertingSmtpUsername: myuser@example.com
           externalDatabaseConnectionsEnabled: 'true'
-          imageTag: v4.36.0
+          imageTag: v4.37.0
           quotasEnabled: 'false'
           sgNamespace: stackgres
           sideCars: '{"clusterController": {"limits": {"cpu": "32m", "memory": "2Gi"},

diff --git a/component/tests/golden/vshn/appcat/appcat/21_composition_vshn_postgresrestore.yaml b/component/tests/golden/vshn/appcat/appcat/21_composition_vshn_postgresrestore.yaml
@@ -33,7 +33,7 @@ spec:
           emailAlertingSmtpHost: smtp.eu.mailgun.org:465
           emailAlertingSmtpUsername: myuser@example.com
           externalDatabaseConnectionsEnabled: 'true'
-          imageTag: v4.36.0
+          imageTag: v4.37.0
           quotasEnabled: 'false'
           sgNamespace: stackgres
           sideCars: '{"clusterController": {"limits": {"cpu": "32m", "memory": "2Gi"},

diff --git a/component/tests/golden/vshn/appcat/appcat/21_composition_vshn_redis.yaml b/component/tests/golden/vshn/appcat/appcat/21_composition_vshn_redis.yaml
@@ -30,7 +30,7 @@ spec:
         data:
           bucketRegion: lpg
           controlNamespace: syn-appcat-control
-          imageTag: v4.36.0
+          imageTag: v4.37.0
           maintenanceSA: helm-based-service-maintenance
           quotasEnabled: 'false'
           restoreSA: redisrestoreserviceaccount

diff --git a/component/tests/golden/vshn/appcat/appcat/controllers/appcat/30_deployment.yaml b/component/tests/golden/vshn/appcat/appcat/controllers/appcat/30_deployment.yaml
@@ -23,7 +23,7 @@ spec:
           env:
             - name: PLANS_NAMESPACE
               value: syn-appcat
-          image: ghcr.io/vshn/appcat:v4.36.0
+          image: ghcr.io/vshn/appcat:v4.37.0
           livenessProbe:
             httpGet:
               path: /healthz

diff --git a/component/tests/golden/vshn/appcat/appcat/sla_reporter/01_cronjob.yaml b/component/tests/golden/vshn/appcat/appcat/sla_reporter/01_cronjob.yaml
@@ -30,7 +30,7 @@ spec:
               envFrom:
                 - secretRef:
                     name: appcat-sla-reports-creds
-              image: ghcr.io/vshn/appcat:v4.36.0
+              image: ghcr.io/vshn/appcat:v4.37.0
               name: sla-reporter
               resources:
                 limits:

diff --git a/component/tests/golden/vshn/appcat/appcat/sli_exporter/90_slo_vshn_postgresql_ha.yaml b/component/tests/golden/vshn/appcat/appcat/sli_exporter/90_slo_vshn_postgresql_ha.yaml
@@ -13,7 +13,7 @@ spec:
         - expr: |
             (sum(rate(appcat_probes_seconds_count{reason!="success", service="VSHNPostgreSQL", ha="true"}[5m]) or 0*rate(appcat_probes_seconds_count{service="VSHNPostgreSQL"}[5m])) by (service, namespace, name, organization, sla) or sum(0*rate(appcat_probes_seconds_count{service="VSHNPostgreSQL"}[5m])) by (service, namespace, name, organization, sla))
             /
-            (sum(rate(appcat_probes_seconds_count{service="VSHNPostgreSQL",  ha="true"}[5m])) by (service, namespace, name, organization, sla))
+            (sum(rate(appcat_probes_seconds_count{service="VSHNPostgreSQL", ha="true"}[5m])) by (service, namespace, name, organization, sla))
           labels:
             sloth_id: appcat-vshn-postgresql-ha-uptime
             sloth_service: appcat-vshn-postgresql-ha
@@ -23,7 +23,7 @@ spec:
         - expr: |
             (sum(rate(appcat_probes_seconds_count{reason!="success", service="VSHNPostgreSQL", ha="true"}[30m]) or 0*rate(appcat_probes_seconds_count{service="VSHNPostgreSQL"}[30m])) by (service, namespace, name, organization, sla) or sum(0*rate(appcat_probes_seconds_count{service="VSHNPostgreSQL"}[30m])) by (service, namespace, name, organization, sla))
             /
-            (sum(rate(appcat_probes_seconds_count{service="VSHNPostgreSQL",  ha="true"}[30m])) by (service, namespace, name, organization, sla))
+            (sum(rate(appcat_probes_seconds_count{service="VSHNPostgreSQL", ha="true"}[30m])) by (service, namespace, name, organization, sla))
           labels:
             sloth_id: appcat-vshn-postgresql-ha-uptime
             sloth_service: appcat-vshn-postgresql-ha
@@ -33,7 +33,7 @@ spec:
         - expr: |
             (sum(rate(appcat_probes_seconds_count{reason!="success", service="VSHNPostgreSQL", ha="true"}[1h]) or 0*rate(appcat_probes_seconds_count{service="VSHNPostgreSQL"}[1h])) by (service, namespace, name, organization, sla) or sum(0*rate(appcat_probes_seconds_count{service="VSHNPostgreSQL"}[1h])) by (service, namespace, name, organization, sla))
             /
-            (sum(rate(appcat_probes_seconds_count{service="VSHNPostgreSQL",  ha="true"}[1h])) by (service, namespace, name, organization, sla))
+            (sum(rate(appcat_probes_seconds_count{service="VSHNPostgreSQL", ha="true"}[1h])) by (service, namespace, name, organization, sla))
           labels:
             sloth_id: appcat-vshn-postgresql-ha-uptime
             sloth_service: appcat-vshn-postgresql-ha
@@ -43,7 +43,7 @@ spec:
         - expr: |
             (sum(rate(appcat_probes_seconds_count{reason!="success", service="VSHNPostgreSQL", ha="true"}[2h]) or 0*rate(appcat_probes_seconds_count{service="VSHNPostgreSQL"}[2h])) by (service, namespace, name, organization, sla) or sum(0*rate(appcat_probes_seconds_count{service="VSHNPostgreSQL"}[2h])) by (service, namespace, name, organization, sla))
             /
-            (sum(rate(appcat_probes_seconds_count{service="VSHNPostgreSQL",  ha="true"}[2h])) by (service, namespace, name, organization, sla))
+            (sum(rate(appcat_probes_seconds_count{service="VSHNPostgreSQL", ha="true"}[2h])) by (service, namespace, name, organization, sla))
           labels:
             sloth_id: appcat-vshn-postgresql-ha-uptime
             sloth_service: appcat-vshn-postgresql-ha
@@ -53,7 +53,7 @@ spec:
         - expr: |
             (sum(rate(appcat_probes_seconds_count{reason!="success", service="VSHNPostgreSQL", ha="true"}[6h]) or 0*rate(appcat_probes_seconds_count{service="VSHNPostgreSQL"}[6h])) by (service, namespace, name, organization, sla) or sum(0*rate(appcat_probes_seconds_count{service="VSHNPostgreSQL"}[6h])) by (service, namespace, name, organization, sla))
             /
-            (sum(rate(appcat_probes_seconds_count{service="VSHNPostgreSQL",  ha="true"}[6h])) by (service, namespace, name, organization, sla))
+            (sum(rate(appcat_probes_seconds_count{service="VSHNPostgreSQL", ha="true"}[6h])) by (service, namespace, name, organization, sla))
           labels:
             sloth_id: appcat-vshn-postgresql-ha-uptime
             sloth_service: appcat-vshn-postgresql-ha
@@ -63,7 +63,7 @@ spec:
         - expr: |
             (sum(rate(appcat_probes_seconds_count{reason!="success", service="VSHNPostgreSQL", ha="true"}[1d]) or 0*rate(appcat_probes_seconds_count{service="VSHNPostgreSQL"}[1d])) by (service, namespace, name, organization, sla) or sum(0*rate(appcat_probes_seconds_count{service="VSHNPostgreSQL"}[1d])) by (service, namespace, name, organization, sla))
             /
-            (sum(rate(appcat_probes_seconds_count{service="VSHNPostgreSQL",  ha="true"}[1d])) by (service, namespace, name, organization, sla))
+            (sum(rate(appcat_probes_seconds_count{service="VSHNPostgreSQL", ha="true"}[1d])) by (service, namespace, name, organization, sla))
           labels:
             sloth_id: appcat-vshn-postgresql-ha-uptime
             sloth_service: appcat-vshn-postgresql-ha
@@ -73,7 +73,7 @@ spec:
         - expr: |
             (sum(rate(appcat_probes_seconds_count{reason!="success", service="VSHNPostgreSQL", ha="true"}[3d]) or 0*rate(appcat_probes_seconds_count{service="VSHNPostgreSQL"}[3d])) by (service, namespace, name, organization, sla) or sum(0*rate(appcat_probes_seconds_count{service="VSHNPostgreSQL"}[3d])) by (service, namespace, name, organization, sla))
             /
-            (sum(rate(appcat_probes_seconds_count{service="VSHNPostgreSQL",  ha="true"}[3d])) by (service, namespace, name, organization, sla))
+            (sum(rate(appcat_probes_seconds_count{service="VSHNPostgreSQL", ha="true"}[3d])) by (service, namespace, name, organization, sla))
           labels:
             sloth_id: appcat-vshn-postgresql-ha-uptime
             sloth_service: appcat-vshn-postgresql-ha