Skip to content

Commit

Permalink
Add Redis SLA reporting and fix non-working maintenance exclusion
Browse files Browse the repository at this point in the history
Signed-off-by: Nicolas Bigler <nicolas.bigler@vshn.ch>
  • Loading branch information
TheBigLee committed Oct 4, 2023
1 parent 60fad89 commit 1f1e0b2
Show file tree
Hide file tree
Showing 29 changed files with 87 additions and 67 deletions.
3 changes: 2 additions & 1 deletion class/defaults.yml
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@ parameters:
appcat:
registry: ghcr.io
repository: vshn/appcat
tag: v4.33.0
tag: v4.34.0
apiserver:
registry: ghcr.io
repository: vshn/appcat-apiserver
Expand Down Expand Up @@ -381,6 +381,7 @@ parameters:
bucket_region: "lpg"
grpcEndpoint: ${appcat:grpcEndpoint}
defaultPlan: standard-1
sla: 99.25
plans:
standard-512m:
size:
Expand Down
2 changes: 1 addition & 1 deletion component/main.jsonnet
Original file line number Diff line number Diff line change
Expand Up @@ -122,7 +122,7 @@ local maintenanceRule = kube._Object('monitoring.coreos.com/v1', 'PrometheusRule
name: 'appcat-cluster-maintenance',
rules: [
{
expr: 'scalar(max(max_over_time(openshift_upgrade_controller_upgradejob_state{state="active"}[10m])) or vector(0))',
expr: 'max(max_over_time(openshift_upgrade_controller_upgradejob_state{state="active"}[10m])) or vector(0)',
record: 'appcat:cluster:maintenance',
},
],
Expand Down
10 changes: 5 additions & 5 deletions component/slos.libsonnet
Original file line number Diff line number Diff line change
Expand Up @@ -66,12 +66,12 @@ local prometheusRule(name) =
sli: {
events: {
// The 0*rate(...) term makes sure that the query reports an error rate for every instance, even one that has never produced a single error.
error_query: '(sum(rate(appcat_probes_seconds_count{reason!="success", service="VSHNPostgreSQL", ha="false"}[{{.window}}]) or 0*rate(appcat_probes_seconds_count{service="VSHNPostgreSQL"}[{{.window}}])) by (service, namespace, name, organization, sla) or vector(0)) - appcat:cluster:maintenance > 0',
total_query: '(sum(rate(appcat_probes_seconds_count{service="VSHNPostgreSQL", ha="false"}[{{.window}}])) by (service, namespace, name, organization, sla) or vector(0)) - appcat:cluster:maintenance > 0',
error_query: '(sum(rate(appcat_probes_seconds_count{reason!="success", service="VSHNPostgreSQL", ha="false"}[{{.window}}]) or 0*rate(appcat_probes_seconds_count{service="VSHNPostgreSQL"}[{{.window}}])) by (service, namespace, name, organization, sla) or vector(0)) - scalar(appcat:cluster:maintenance) > 0',
total_query: '(sum(rate(appcat_probes_seconds_count{service="VSHNPostgreSQL", ha="false"}[{{.window}}])) by (service, namespace, name, organization, sla) or vector(0)) - scalar(appcat:cluster:maintenance) > 0',
},
},
alerting+: {
name: 'SLO_AppCat_VSHNPosgtreSQLUptime',
name: 'SLO_AppCat_VSHNPostgreSQLUptime',
annotations+: {
summary: 'Probes to PostgreSQL by VSHN instance fail',
},
Expand Down Expand Up @@ -110,8 +110,8 @@ local prometheusRule(name) =
sli: {
events: {
// The 0*rate(...) term makes sure that the query reports an error rate for every instance, even one that has never produced a single error.
error_query: '(sum(rate(appcat_probes_seconds_count{reason!="success", service="VSHNRedis"}[{{.window}}]) or 0*rate(appcat_probes_seconds_count{service="VSHNRedis"}[{{.window}}])) by (service, namespace, name, organization, sla) or vector(0)) - appcat:cluster:maintenance > 0',
total_query: '(sum(rate(appcat_probes_seconds_count{service="VSHNRedis"}[{{.window}}])) by (service, namespace, name, organization, sla) or vector(0)) - appcat:cluster:maintenance > 0',
error_query: '(sum(rate(appcat_probes_seconds_count{reason!="success", service="VSHNRedis"}[{{.window}}]) or 0*rate(appcat_probes_seconds_count{service="VSHNRedis"}[{{.window}}])) by (service, namespace, name, organization, sla) or vector(0)) - scalar(appcat:cluster:maintenance) > 0',
total_query: '(sum(rate(appcat_probes_seconds_count{service="VSHNRedis"}[{{.window}}])) by (service, namespace, name, organization, sla) or vector(0)) - scalar(appcat:cluster:maintenance) > 0',
},
},
alerting+: {
Expand Down
3 changes: 3 additions & 0 deletions component/vshn_redis.jsonnet
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,8 @@ local xrd = xrds.XRDFromCRD(
connectionSecretKeys=connectionSecretKeys,
) + xrds.WithPlanDefaults(redisPlans, redisParams.defaultPlan);

local promRuleRedisSLA = common.PromRuleSLA(params.services.vshn.redis.sla, 'VSHNRedis');

local restoreServiceAccount = kube.ServiceAccount('redisrestoreserviceaccount') + {
metadata+: {
namespace: params.services.controlNamespace,
Expand Down Expand Up @@ -691,5 +693,6 @@ if params.services.vshn.enabled && redisParams.enabled then {
'20_rbac_vshn_redis_resize': [ resizeClusterRole, resizeServiceAccount, resizeClusterRoleBinding ],
'20_plans_vshn_redis': plansCM,
'21_composition_vshn_redis': composition,
'22_prom_rule_sla_redis': promRuleRedisSLA,
[if isOpenshift then '21_openshift_template_redis_vshn']: osTemplate,
} else {}
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,6 @@ spec:
groups:
- name: appcat-cluster-maintenance
rules:
- expr: scalar(max(max_over_time(openshift_upgrade_controller_upgradejob_state{state="active"}[10m]))
or vector(0))
- expr: max(max_over_time(openshift_upgrade_controller_upgradejob_state{state="active"}[10m]))
or vector(0)
record: appcat:cluster:maintenance
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,6 @@ spec:
groups:
- name: appcat-cluster-maintenance
rules:
- expr: scalar(max(max_over_time(openshift_upgrade_controller_upgradejob_state{state="active"}[10m]))
or vector(0))
- expr: max(max_over_time(openshift_upgrade_controller_upgradejob_state{state="active"}[10m]))
or vector(0)
record: appcat:cluster:maintenance
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,6 @@ spec:
groups:
- name: appcat-cluster-maintenance
rules:
- expr: scalar(max(max_over_time(openshift_upgrade_controller_upgradejob_state{state="active"}[10m]))
or vector(0))
- expr: max(max_over_time(openshift_upgrade_controller_upgradejob_state{state="active"}[10m]))
or vector(0)
record: appcat:cluster:maintenance
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ spec:
env:
- name: PLANS_NAMESPACE
value: syn-appcat
image: ghcr.io/vshn/appcat:v4.33.0
image: ghcr.io/vshn/appcat:v4.34.0
livenessProbe:
httpGet:
path: /healthz
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,6 @@ spec:
groups:
- name: appcat-cluster-maintenance
rules:
- expr: scalar(max(max_over_time(openshift_upgrade_controller_upgradejob_state{state="active"}[10m]))
or vector(0))
- expr: max(max_over_time(openshift_upgrade_controller_upgradejob_state{state="active"}[10m]))
or vector(0)
record: appcat:cluster:maintenance
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ spec:
value: "false"
- name: APPCAT_SLI_VSHNREDIS
value: "false"
image: ghcr.io/vshn/appcat:v4.33.0
image: ghcr.io/vshn/appcat:v4.34.0
livenessProbe:
httpGet:
path: /healthz
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,6 @@ spec:
groups:
- name: appcat-cluster-maintenance
rules:
- expr: scalar(max(max_over_time(openshift_upgrade_controller_upgradejob_state{state="active"}[10m]))
or vector(0))
- expr: max(max_over_time(openshift_upgrade_controller_upgradejob_state{state="active"}[10m]))
or vector(0)
record: appcat:cluster:maintenance
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,6 @@ spec:
groups:
- name: appcat-cluster-maintenance
rules:
- expr: scalar(max(max_over_time(openshift_upgrade_controller_upgradejob_state{state="active"}[10m]))
or vector(0))
- expr: max(max_over_time(openshift_upgrade_controller_upgradejob_state{state="active"}[10m]))
or vector(0)
record: appcat:cluster:maintenance
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ spec:
data:
controlNamespace: syn-appcat-control
defaultPlan: standard-1
imageTag: v4.33.0
imageTag: v4.34.0
maintenanceSA: helm-based-service-maintenance
minioChartRepository: https://charts.min.io
minioChartVersion: 5.0.13
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ spec:
env:
- name: PLANS_NAMESPACE
value: syn-appcat
image: ghcr.io/vshn/appcat:v4.33.0
image: ghcr.io/vshn/appcat:v4.34.0
livenessProbe:
httpGet:
path: /healthz
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ spec:
envFrom:
- secretRef:
name: appcat-sla-reports-creds
image: ghcr.io/vshn/appcat:v4.33.0
image: ghcr.io/vshn/appcat:v4.34.0
name: sla-reporter
resources:
limits:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ spec:
value: "false"
- name: APPCAT_SLI_VSHNREDIS
value: "false"
image: ghcr.io/vshn/appcat:v4.33.0
image: ghcr.io/vshn/appcat:v4.34.0
livenessProbe:
httpGet:
path: /healthz
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,6 @@ spec:
groups:
- name: appcat-cluster-maintenance
rules:
- expr: scalar(max(max_over_time(openshift_upgrade_controller_upgradejob_state{state="active"}[10m]))
or vector(0))
- expr: max(max_over_time(openshift_upgrade_controller_upgradejob_state{state="active"}[10m]))
or vector(0)
record: appcat:cluster:maintenance
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ spec:
value: "false"
- name: APPCAT_SLI_VSHNREDIS
value: "false"
image: ghcr.io/vshn/appcat:v4.33.0
image: ghcr.io/vshn/appcat:v4.34.0
livenessProbe:
httpGet:
path: /healthz
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,6 @@ spec:
groups:
- name: appcat-cluster-maintenance
rules:
- expr: scalar(max(max_over_time(openshift_upgrade_controller_upgradejob_state{state="active"}[10m]))
or vector(0))
- expr: max(max_over_time(openshift_upgrade_controller_upgradejob_state{state="active"}[10m]))
or vector(0)
record: appcat:cluster:maintenance
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ spec:
data:
controlNamespace: syn-appcat-control
defaultPlan: standard-1
imageTag: v4.33.0
imageTag: v4.34.0
maintenanceSA: helm-based-service-maintenance
minioChartRepository: https://charts.min.io
minioChartVersion: 5.0.13
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ spec:
emailAlertingSmtpHost: smtp.eu.mailgun.org:465
emailAlertingSmtpUsername: myuser@example.com
externalDatabaseConnectionsEnabled: 'true'
imageTag: v4.33.0
imageTag: v4.34.0
quotasEnabled: 'false'
sgNamespace: stackgres
sideCars: '{"clusterController": {"limits": {"cpu": "600m", "memory": "768Mi"},
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ spec:
emailAlertingSmtpHost: smtp.eu.mailgun.org:465
emailAlertingSmtpUsername: myuser@example.com
externalDatabaseConnectionsEnabled: 'true'
imageTag: v4.33.0
imageTag: v4.34.0
quotasEnabled: 'false'
sgNamespace: stackgres
sideCars: '{"clusterController": {"limits": {"cpu": "600m", "memory": "768Mi"},
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ spec:
data:
bucketRegion: lpg
controlNamespace: syn-appcat-control
imageTag: v4.33.0
imageTag: v4.34.0
maintenanceSA: helm-based-service-maintenance
quotasEnabled: 'false'
restoreSA: redisrestoreserviceaccount
Expand Down
16 changes: 16 additions & 0 deletions tests/golden/vshn/appcat/appcat/22_prom_rule_sla_redis.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
annotations: {}
labels:
name: vshn-vshnredis-sla
name: vshn-vshnredis-sla
namespace: appcat-slos
spec:
groups:
- name: appcat-vshnredis-sla-target
rules:
- expr: vector(99.25)
labels:
service: VSHNRedis
record: sla:objective:ratio
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ spec:
env:
- name: PLANS_NAMESPACE
value: syn-appcat
image: ghcr.io/vshn/appcat:v4.33.0
image: ghcr.io/vshn/appcat:v4.34.0
livenessProbe:
httpGet:
path: /healthz
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ spec:
envFrom:
- secretRef:
name: appcat-sla-reports-creds
image: ghcr.io/vshn/appcat:v4.33.0
image: ghcr.io/vshn/appcat:v4.34.0
name: sla-reporter
resources:
limits:
Expand Down
Loading

0 comments on commit 1f1e0b2

Please sign in to comment.