diff --git a/charts/sophora-server/Chart.yaml b/charts/sophora-server/Chart.yaml
index f99ef57..4f08abc 100644
--- a/charts/sophora-server/Chart.yaml
+++ b/charts/sophora-server/Chart.yaml
@@ -15,7 +15,7 @@ type: application
 # This is the chart version. This version number should be incremented each time you make changes
 # to the chart and its templates, including the app version.
 # Versions are expected to follow Semantic Versioning (https://semver.org/)
-version: 1.7.0
+version: 1.8.0
 
 # This is the version number of the application being deployed. This version number should be
 # incremented each time you make changes to the application. Versions are not expected to
diff --git a/charts/sophora-server/alerting-runbook.md b/charts/sophora-server/alerting-runbook.md
new file mode 100644
index 0000000..fb34a07
--- /dev/null
+++ b/charts/sophora-server/alerting-runbook.md
@@ -0,0 +1,95 @@
+# Alerting Runbook
+
+This document is a reference to the alerts this Helm chart can fire.
+
+## Sophora Server: General
+
+### SophoraServerOffline
+
+**Severity:** medium
+
+**Summary:** The Sophora server is offline for more than 10 minutes.
+
+**Remediation steps:**
+
+* Check if the server is down for maintenance or incident remediation
+* Check whether the server is in a crash loop
+* Check the server logs for error messages
+* Try to restart the server
+
+### SophoraServerAPISlow
+
+**Severity:** medium
+
+**Summary:** The API of the server exhibits a response time exceeding 150ms for more than 5 minutes at the 95th percentile.
+
+**Remediation steps:**
+
+* Check if the server is experiencing a higher API call volume than usual (e.g. imports)
+* Check the server's logs for errors that could be related to a slower API response time
+* Check if the server has enough RAM and CPU at hand
+* If the server is a staging server, consider scaling the statefulset up to cover higher loads
+* Check if a newly added or modified server script is inefficient and adds an overhead to many API calls
+
+## Sophora Server: State related alerts
+
+### SophoraServerStateUnknown
+
+**Severity:** medium
+
+**Summary:** Sophora server's state is unknown
+
+**Remediation steps:**
+
+* Check the logs of the server
+
+### SophoraServerStateSynchronizationDelayed
+
+**Severity:** medium
+
+**Summary:** Sophora server's synchronization is delayed
+
+**Remediation steps:**
+
+* Check the logs of the server
+* See if the issue persists after waiting a little longer
+* Try to fix the issue by restarting the server
+* Check the logs of the primary server for any related errors
+
+### SophoraServerStateQueueTooLong
+
+**Severity:** medium
+
+**Summary:** Sophora server's queue is too long and the server is not up to date
+
+**Remediation steps:**
+
+* Check the logs of the server
+* See if the issue persists after waiting a little longer
+* Try to fix the issue by restarting the server
+* Check the logs of the primary server for any related errors
+
+### SophoraServerStateUnavailable
+
+**Severity:** high
+
+**Summary:** The Sophora server is unavailable and the cause should be investigated.
+
+**Remediation steps:**
+
+* Check the logs of the server
+* Check the logs of the primary server for any related errors
+* Restart the server
+
+### SophoraServerStateConnectionLost
+
+**Severity:** high
+
+**Summary:** The Sophora server is disconnected from its primary server and cannot receive replication events.
+
+**Remediation steps:**
+
+* Check if the primary server is running
+* Check the logs of the server
+* Check the logs of the primary server
+* Check whether there are any network issues
diff --git a/charts/sophora-server/templates/prometheusrule.yaml b/charts/sophora-server/templates/prometheusrule.yaml
new file mode 100644
index 0000000..c9aba81
--- /dev/null
+++ b/charts/sophora-server/templates/prometheusrule.yaml
@@ -0,0 +1,88 @@
+{{- /*
+Renders a PrometheusRule with one rule group. The chart's default alerts are
+included when prometheusRule.defaultRulesEnabled is set; entries from
+prometheusRule.rules are appended afterwards. Custom rules are passed through
+"tpl", so Prometheus' own template expressions in them must be escaped.
+*/ -}}
+{{- if .Values.prometheusRule.enabled }}
+{{- $defaultRulesEnabled := .Values.prometheusRule.defaultRulesEnabled }}
+apiVersion: monitoring.coreos.com/v1
+kind: PrometheusRule
+metadata:
+  name: {{ include "sophora-server.fullname" . }}
+  labels: {{- include "sophora-server.labels" . | nindent 4 }}
+spec:
+  groups:
+    - name: {{ include "sophora-server.fullname" . }}
+      rules:
+        {{- if $defaultRulesEnabled }}
+        - alert: SophoraServerOffline
+          for: 10m
+          expr: 'up{container="sophora-server", job="{{ include "sophora-server.fullname" . }}"} != 1'
+          labels:
+            severity: medium
+          annotations:
+            summary: Sophora Server offline.
+            description: The server "{{`{{ $labels.service }}`}}" is offline for more than 10 minutes.
+            runbook_url: 'https://github.com/subshell/helm-charts/blob/main/charts/sophora-server/alerting-runbook.md'
+        - alert: SophoraServerAPISlow
+          for: 5m
+          expr: 'histogram_quantile(0.95, sum(rate(sophora_server_contentmanager_call_duration_seconds_bucket{job="{{ include "sophora-server.fullname" . }}"}[1m])) by (pod, le)) > 0.15'
+          labels:
+            severity: medium
+          annotations:
+            summary: Sophora Server API is slow
+            description: The API of the server "{{`{{ $labels.pod }}`}}" exhibits a response time exceeding 150ms for more than 5 minutes at the 95th percentile.
+            runbook_url: 'https://github.com/subshell/helm-charts/blob/main/charts/sophora-server/alerting-runbook.md'
+        # -- start of rules for unready server states
+        - alert: SophoraServerStateUnknown
+          for: 5m
+          expr: 'sophora_server_state{job="{{ include "sophora-server.fullname" . }}"} == -1'
+          labels:
+            severity: medium
+          annotations:
+            summary: Sophora server's state is unknown
+            description: The Sophora server's state is unknown.
+            runbook_url: 'https://github.com/subshell/helm-charts/blob/main/charts/sophora-server/alerting-runbook.md'
+        - alert: SophoraServerStateSynchronizationDelayed
+          for: 10m
+          expr: 'sophora_server_state{job="{{ include "sophora-server.fullname" . }}"} == 3'
+          labels:
+            severity: medium
+          annotations:
+            summary: Sophora server's synchronization is delayed
+            description: The synchronization to the server "{{`{{ $labels.pod }}`}}" is delayed.
+            runbook_url: 'https://github.com/subshell/helm-charts/blob/main/charts/sophora-server/alerting-runbook.md'
+        - alert: SophoraServerStateQueueTooLong
+          for: 10m
+          expr: 'sophora_server_state{job="{{ include "sophora-server.fullname" . }}"} == 4'
+          labels:
+            severity: medium
+          annotations:
+            summary: Sophora server's queue is too long
+            description: The server "{{`{{ $labels.pod }}`}}" is not up-to-date due to a too long queue.
+            runbook_url: 'https://github.com/subshell/helm-charts/blob/main/charts/sophora-server/alerting-runbook.md'
+        - alert: SophoraServerStateUnavailable
+          for: 10m
+          expr: 'sophora_server_state{job="{{ include "sophora-server.fullname" . }}"} == 5'
+          labels:
+            severity: high
+          annotations:
+            summary: Sophora server unavailable
+            description: The server "{{`{{ $labels.pod }}`}}" is unavailable and the cause should be investigated
+            runbook_url: 'https://github.com/subshell/helm-charts/blob/main/charts/sophora-server/alerting-runbook.md'
+        - alert: SophoraServerStateConnectionLost
+          for: 10m
+          expr: 'sophora_server_state{job="{{ include "sophora-server.fullname" . }}"} == 6'
+          labels:
+            severity: high
+          annotations:
+            summary: Sophora server lost connection to primary
+            description: The server "{{`{{ $labels.pod }}`}}" is disconnected from its primary server
+            runbook_url: 'https://github.com/subshell/helm-charts/blob/main/charts/sophora-server/alerting-runbook.md'
+        # -- end of state alert rules
+        {{- end }}
+        {{- with .Values.prometheusRule.rules }}
+        {{- tpl (toYaml .) $ | nindent 8 }}
+        {{- end }}
+{{- end }}
diff --git a/charts/sophora-server/test-values.yaml b/charts/sophora-server/test-values.yaml
index a9f15f3..54cce09 100644
--- a/charts/sophora-server/test-values.yaml
+++ b/charts/sophora-server/test-values.yaml
@@ -141,3 +141,17 @@ podDisruptionBudget:
   enabled: false
   minAvailable: 1
   maxUnavailable: ""
+
+prometheusRule:
+  enabled: true
+  defaultRulesEnabled: true
+  rules:
+    - alert: Test
+      for: 11m
+      expr: 'expr'
+      labels:
+        severity: high
+      annotations:
+        summary: My summary
+        description: Test description
+        runbook_url: 'https://github.com/subshell/helm-charts/blob/main/charts/sophora-server/alerting-runbook.md'
diff --git a/charts/sophora-server/values.yaml b/charts/sophora-server/values.yaml
index 9bfd4e5..deb65d7 100644
--- a/charts/sophora-server/values.yaml
+++ b/charts/sophora-server/values.yaml
@@ -322,6 +322,14 @@ serviceMonitor:
   enabled: false
   interval: 10s
 
+prometheusRule:
+  ## @param prometheusRule.enabled Whether the PrometheusRule resource should be deployed
+  enabled: false
+  ## @param prometheusRule.defaultRulesEnabled Whether the chart's default alert rules are included in the rule group
+  defaultRulesEnabled: true
+  ## @param prometheusRule.rules Additional rules appended to the rule group (rendered through `tpl`)
+  rules: []
+
 resources:
   requests:
     cpu: 2
@@ -329,6 +337,7 @@ resources:
   limits:
     memory: 16Gi
 
+# This PDB should only be used for staging server setups
 podDisruptionBudget:
   ## @param enabled Whether the pod disruption budget resource should be deployed
   ##