diff --git a/charts/sophora-cluster-common/.helmignore b/charts/sophora-cluster-common/.helmignore new file mode 100644 index 0000000..0e8a0eb --- /dev/null +++ b/charts/sophora-cluster-common/.helmignore @@ -0,0 +1,23 @@ +# Patterns to ignore when building packages. +# This supports shell glob matching, relative path matching, and +# negation (prefixed with !). Only one pattern per line. +.DS_Store +# Common VCS dirs +.git/ +.gitignore +.bzr/ +.bzrignore +.hg/ +.hgignore +.svn/ +# Common backup files +*.swp +*.bak +*.tmp +*.orig +*~ +# Various IDEs +.project +.idea/ +*.tmproj +.vscode/ diff --git a/charts/sophora-cluster-common/Chart.yaml b/charts/sophora-cluster-common/Chart.yaml new file mode 100644 index 0000000..130b108 --- /dev/null +++ b/charts/sophora-cluster-common/Chart.yaml @@ -0,0 +1,6 @@ +apiVersion: v2 +name: sophora-cluster-common +description: A Helm chart containing some common resources useful for Sophora cloud setups +type: application +version: 1.0.0 +appVersion: "4" diff --git a/charts/sophora-cluster-common/README.md b/charts/sophora-cluster-common/README.md new file mode 100644 index 0000000..ea2d08a --- /dev/null +++ b/charts/sophora-cluster-common/README.md @@ -0,0 +1,77 @@ +# Sophora Cluster Common + +This Helm chart contains resources that are useful for a Sophora cloud-installation in general and are not tied to +one specific product. + +The available resources in this chart are described in the following and are all optional and can be configured to one's +needs. + +## Available Resources + +### PodDisruptionBudget for Cluster Servers + +This will install a PodDisruptionBudget for the Sophora Cluster Servers (primary and replicas) to prevent situations +where all servers are shut down simultaneously. The PDB for staging servers can be installed via the server Helm chart. 
+ +### "LoadBalancer" for Cluster Servers + +This is not actually a load balancer but rather a service and ingress definition always pointing to the primary Sophora +server. Typically, this is used to create a deterministic endpoint that can be entered by users to log in to Sophora. +To work out of the box, this requires that the *Server Mode Labeler* sidecar container of the servers is active (should be +by default). + +### Alerts + +This will install alerts that are not tied to one specific application but rather the general Sophora cluster state. +Look into the [alerting-runbook.md](./alerting-runbook.md) to see which alerts are available. Also check out the application's +charts to see if there are application specific alerts available. + +## Parameters + +### Common parameters + +| Name | Description | Value | +| ------------------ | ----------------------------------------- | ----- | +| `nameOverride` | String to partially override the name | `""` | +| `fullnameOverride` | String to fully override the release name | `""` | + +### Cluster Server Loadbalancer + +| Name | Description | Value | +| ------------------------------------------------------------------------- | --------------------------------------------------------- | ------------------- | +| `clusterServerLb.enabled` | whether the service and ingress should be deployed or not | `false` | +| `clusterServerLb.name` | names of the resources | `cluster-server-lb` | +| `clusterServerLb.ingress.enabled` | whether the ingress should be enabled | `true` | +| `clusterServerLb.ingress.ingressClassName` | name of the ingressClass used for the ingress | `""` | +| `clusterServerLb.ingress.annotations` | annotations for the ingress | `{}` | +| `clusterServerLb.ingress.hosts` | array with hostnames used for the ingress | `[]` | +| `clusterServerLb.service.type` | Kubernetes service type | `ClusterIP` | +| `clusterServerLb.service.selectorLabels.sophora.cloud/app` | labels used to select the primary Sophora server 
| `cluster-server` | +| `clusterServerLb.service.selectorLabels.server.sophora.cloud/server-mode` | labels used to select the primary Sophora server | `primary` | +| `clusterServerLb.service.httpPort` | the Sophora server's http port | `1196` | +| `clusterServerLb.service.jmsPort` | the Sophora server's jms port | `1197` | +| `clusterServerLb.service.publishNotReadyAddresses` | whether the service should publish not ready addresses | `true` | + +### Cluster Server Pod Disruption Budget + +| Name | Description | Value | +| --------------------------------------------------- | ------------------------------------------ | ------------------------ | +| `podDisruptionBudget.enabled` | whether the PDB should be installed or not | `false` | +| `podDisruptionBudget.name` | name of the PDB | `sophora-cluster-server` | +| `podDisruptionBudget.minAvailable` | minimum available replicas | `2` | +| `podDisruptionBudget.matchLabels.sophora.cloud/app` | selector label for the cluster servers | `cluster-server` | + +### Alerting / Prometheus Rules + +| Name | Description | Value | +| ------------------------------------- | --------------------------------------------- | ------- | +| `prometheusRules.enabled` | Whether the alerts should be installed | `false` | +| `prometheusRules.defaultRulesEnabled` | Whether the default rules should be installed | `true` | +| `prometheusRules.rules` | allows to add custom rules | `[]` | + +### Extra Deploy + +| Name | Description | Value | +| ------------- | ---------------------------------------------------------- | ----- | +| `extraDeploy` | Allows to specify custom resources that should be deployed | `[]` | + diff --git a/charts/sophora-cluster-common/alerting-runbook.md b/charts/sophora-cluster-common/alerting-runbook.md new file mode 100644 index 0000000..34ea331 --- /dev/null +++ b/charts/sophora-cluster-common/alerting-runbook.md @@ -0,0 +1,45 @@ +# Alerting Runbook + +This document is a reference to the alerts this Helm chart can fire. 
+ +## Sophora Cluster Common + +### NoPrimarySophoraServer + +**Severity:** critical + +**Summary:** The Sophora Cluster has no primary server. No operations with client tools will succeed and no further +replication will happen to other running servers, if there are any. + +**Remediation steps:** + +* Check if the Sophora cluster is down for another maintenance or incident remediation +* Check if the deployment has been uninstalled by mistake +* Check whether the server might have crashed +* Check the server logs for error messages +* Check if it would be possible to elect another cluster server to the primary. This should be done carefully to ensure no data is lost. +* Try to restart the server, if it is running but unresponsive +* Restore the server from a working backup + +### SophoraServerNotInSync + +**Severity:** high + +**Summary:** The Sophora server is not in sync. This is concluded from comparing the server's *SourceTime* with the +SourceTime of the primary server. The SourceTime is the timestamp of the latest event that occurred on the primary server. +Usually the SourceTimes of the servers should not diverge too much and stay equal when compared over a short time frame. + +**Remediation steps:** + +* Check if the primary server logged a message containing "ReplicationMaster stopped". If yes: The primary server needs to be +restarted **without electing another server to the primary**. The last part is absolutely critical to prevent data loss. As +the servers automatically switch using a shutdown hook, a workaround is to exec into the container and replace the +shutdown hook located in the `/tools/` directory with an empty executable file before restarting the server. Note that during the restart +working with Sophora will not be possible for a few minutes. If the error persists, check the logs of the primary +to find error logs hinting at the root cause of the problem. +* Check if there is a large replication queue (e.g. 
due to a large amount of imports), which would result in a short replication +delay +* Check whether the not-in-sync server is in an erroneous state and stopped receiving replication messages +* Check whether network connection issues between the server and the primary server exist +* Check the server's and the primary server's logs for errors or warnings +* Restart the server \ No newline at end of file diff --git a/charts/sophora-cluster-common/templates/_helpers.tpl b/charts/sophora-cluster-common/templates/_helpers.tpl new file mode 100644 index 0000000..0d56d21 --- /dev/null +++ b/charts/sophora-cluster-common/templates/_helpers.tpl @@ -0,0 +1,75 @@ +{{/* +Expand the name of the chart: .Values.nameOverride if set, otherwise .Chart.Name, cut to 63 characters with a trailing "-" trimmed. +*/}} +{{- define "sophora-cluster-common.name" -}} +{{- default .Chart.Name .Values.nameOverride | trunc 63 | trimSuffix "-" }} +{{- end }} + +{{/* +Create a default fully qualified app name: .Values.fullnameOverride if set, otherwise "<release>-<name>" with <name> being .Values.nameOverride or .Chart.Name. +We truncate at 63 chars because some Kubernetes name fields are limited to this (by the DNS naming spec). +If the release name already contains <name>, the release name alone is used as the full name. +*/}} +{{- define "sophora-cluster-common.fullname" -}} +{{- if .Values.fullnameOverride }} +{{- .Values.fullnameOverride | trunc 63 | trimSuffix "-" }} +{{- else }} +{{- $name := default .Chart.Name .Values.nameOverride }} +{{- if contains $name .Release.Name }} +{{- .Release.Name | trunc 63 | trimSuffix "-" }} +{{- else }} +{{- printf "%s-%s" .Release.Name $name | trunc 63 | trimSuffix "-" }} +{{- end }} +{{- end }} +{{- end }} + +{{/* +Create chart name and version as used by the chart label, i.e. "<chart>-<version>" with any "+" replaced by "_", cut to 63 characters. +*/}} +{{- define "sophora-cluster-common.chart" -}} +{{- printf "%s-%s" .Chart.Name .Chart.Version | replace "+" "_" | trunc 63 | trimSuffix "-" }} +{{- end }} + +{{/* +Common labels: the helm.sh/chart label, the selector labels, app.kubernetes.io/version (only when .Chart.AppVersion is set) and app.kubernetes.io/managed-by. +*/}} +{{- define "sophora-cluster-common.labels" -}} +helm.sh/chart: {{ include "sophora-cluster-common.chart" . }} +{{ include "sophora-cluster-common.selectorLabels" . 
}} +{{- if .Chart.AppVersion }} +app.kubernetes.io/version: {{ .Chart.AppVersion | quote }} +{{- end }} +app.kubernetes.io/managed-by: {{ .Release.Service }} +{{- end }} + +{{/* +Selector labels +*/}} +{{- define "sophora-cluster-common.selectorLabels" -}} +app.kubernetes.io/name: {{ include "sophora-cluster-common.name" . }} +app.kubernetes.io/instance: {{ .Release.Name }} +{{- end }} + +{{/* +Create the name of the service account to use. The optional "serviceAccount" values block may be absent (values.yaml of this chart declares none); the name then falls back to "default". +*/}} +{{- define "sophora-cluster-common.serviceAccountName" -}} +{{- if (.Values.serviceAccount | default dict).create }} +{{- default (include "sophora-cluster-common.fullname" .) (.Values.serviceAccount | default dict).name }} +{{- else }} +{{- default "default" (.Values.serviceAccount | default dict).name }} +{{- end }} +{{- end }} + +{{/* +Renders a value that contains template. +Usage: +{{ include "common.tplvalues.render" ( dict "value" .Values.path.to.the.Value "context" $) }} +*/}} +{{- define "common.tplvalues.render" -}} + {{- if typeIs "string" .value }} + {{- tpl .value .context }} + {{- else }} + {{- tpl (.value | toYaml) .context }} + {{- end }} +{{- end -}} diff --git a/charts/sophora-cluster-common/templates/alerts/prometheusrule.yaml b/charts/sophora-cluster-common/templates/alerts/prometheusrule.yaml new file mode 100644 index 0000000..a9a2ee2 --- /dev/null +++ b/charts/sophora-cluster-common/templates/alerts/prometheusrule.yaml @@ -0,0 +1,35 @@ +{{- if .Values.prometheusRules.enabled }} +{{- $defaultRulesEnabled := .Values.prometheusRules.defaultRulesEnabled }} +apiVersion: monitoring.coreos.com/v1 +kind: PrometheusRule +metadata: + name: {{ include "sophora-cluster-common.fullname" . }} + labels: {{- include "sophora-cluster-common.labels" . 
| nindent 4 }} +spec: + groups: + - name: {{ template "sophora-cluster-common.fullname" $ }} + rules: + {{- if $defaultRulesEnabled }} + - alert: NoPrimarySophoraServer + for: 2m + expr: '(count(sophora_server_replication_mode == 1) or vector(0)) == 0' # count() over an empty vector yields no sample, so fall back to vector(0) + labels: + severity: critical + annotations: + summary: The Sophora Cluster has no primary. + description: No primary elected in the cluster for more than 2 minutes. + runbook_url: 'https://github.com/subshell/helm-charts/blob/main/charts/sophora-cluster-common/alerting-runbook.md' + - alert: SophoraServerNotInSync + for: 2m + expr: 'scalar(max(sophora_server_source_time and sophora_server_is_primary_server == 1)) - max by (pod) (sophora_server_source_time and sophora_server_state == 2) > 60000' # scalar() removes the (empty) label set so the primary's time is subtracted from every per-pod series + labels: + severity: high + annotations: + summary: Server is not in sync + description: The server "{{`{{ $labels.pod }}`}}" is not in sync for more than 2 minutes. + runbook_url: 'https://github.com/subshell/helm-charts/blob/main/charts/sophora-cluster-common/alerting-runbook.md' + {{- end }} + {{- with .Values.prometheusRules.rules }} + {{ tpl (toYaml .) $ | nindent 8 }} + {{- end }} +{{- end }} diff --git a/charts/sophora-cluster-common/templates/extra-deploy.yaml b/charts/sophora-cluster-common/templates/extra-deploy.yaml new file mode 100644 index 0000000..9c3ee4b --- /dev/null +++ b/charts/sophora-cluster-common/templates/extra-deploy.yaml @@ -0,0 +1,5 @@ +{{- range .Values.extraDeploy }} +--- +{{ include "common.tplvalues.render" (dict "value" . "context" $) }} +{{- end }} + diff --git a/charts/sophora-cluster-common/templates/lb/ingress.yaml b/charts/sophora-cluster-common/templates/lb/ingress.yaml new file mode 100644 index 0000000..33025c2 --- /dev/null +++ b/charts/sophora-cluster-common/templates/lb/ingress.yaml @@ -0,0 +1,39 @@ +{{- if and .Values.clusterServerLb.enabled .Values.clusterServerLb.ingress.enabled }}{{- /* rendered only when the LB and its ingress are both enabled */}} +apiVersion: networking.k8s.io/v1 +kind: Ingress +metadata: + name: {{ .Values.clusterServerLb.name }} + labels: + {{- include "sophora-cluster-common.labels" . 
| nindent 4 }} + {{- with .Values.clusterServerLb.ingress.annotations }} + annotations: + {{- . | toYaml | nindent 4 }} + {{- end }} +spec: + {{- with .Values.clusterServerLb.ingress.ingressClassName }} + ingressClassName: {{ . }} + {{- end -}} + {{- with .Values.clusterServerLb.ingress.tls }} + tls: + {{- range . }} + - hosts: + {{- range .hosts }} + - {{ quote . }} + {{- end }} + secretName: {{ .secretName }} + {{- end }} + {{- end }} + rules: + {{- range .Values.clusterServerLb.ingress.hosts }} + - host: {{ quote .host }} + http: + paths: + - path: {{ .path }} + pathType: {{ .pathType | default "ImplementationSpecific" }} + backend: + service: + name: {{ $.Values.clusterServerLb.name }} + port: + number: {{ $.Values.clusterServerLb.service.httpPort }} + {{- end }} +{{- end }} \ No newline at end of file diff --git a/charts/sophora-cluster-common/templates/lb/service.yaml b/charts/sophora-cluster-common/templates/lb/service.yaml new file mode 100644 index 0000000..109acfe --- /dev/null +++ b/charts/sophora-cluster-common/templates/lb/service.yaml @@ -0,0 +1,25 @@ +{{- if .Values.clusterServerLb.enabled }} +apiVersion: v1 +kind: Service +metadata: + name: {{ .Values.clusterServerLb.name }} + labels: {{- include "sophora-cluster-common.labels" . 
| nindent 4 }} + annotations: {{- toYaml (.Values.clusterServerLb.service.annotations | default dict) | nindent 4 }}{{- /* renders {} instead of null when no annotations are configured */}} +spec: + type: {{ .Values.clusterServerLb.service.type | default "ClusterIP" }}{{- /* configurable via clusterServerLb.service.type */}} + selector: {{- toYaml .Values.clusterServerLb.service.selectorLabels | nindent 4 }} + sessionAffinity: ClientIP + publishNotReadyAddresses: {{ .Values.clusterServerLb.service.publishNotReadyAddresses }} + sessionAffinityConfig: + clientIP: + timeoutSeconds: 3600 + ports: + - port: {{ .Values.clusterServerLb.service.httpPort }} + targetPort: http + protocol: TCP + name: http + - port: {{ .Values.clusterServerLb.service.jmsPort }} + targetPort: jms + protocol: TCP + name: jms +{{- end }} \ No newline at end of file diff --git a/charts/sophora-cluster-common/templates/pdb/pdb.yaml b/charts/sophora-cluster-common/templates/pdb/pdb.yaml new file mode 100644 index 0000000..0412a36 --- /dev/null +++ b/charts/sophora-cluster-common/templates/pdb/pdb.yaml @@ -0,0 +1,14 @@ +{{- if .Values.podDisruptionBudget.enabled }} +{{- with .Values.podDisruptionBudget }} +apiVersion: policy/v1 +kind: PodDisruptionBudget +metadata: + name: {{ .name }} + labels: {{- include "sophora-cluster-common.labels" $ | nindent 4 }} +spec: + minAvailable: {{ .minAvailable }} + selector: + matchLabels: + {{- .matchLabels | toYaml | nindent 6 }} +{{- end }} +{{- end }} \ No newline at end of file diff --git a/charts/sophora-cluster-common/test-values.yaml b/charts/sophora-cluster-common/test-values.yaml new file mode 100644 index 0000000..3385df2 --- /dev/null +++ b/charts/sophora-cluster-common/test-values.yaml @@ -0,0 +1,19 @@ +clusterServerLb: + enabled: true + ingress: + ingressClassName: "nginx" + hosts: + - host: "cms.mysophora.com" + +podDisruptionBudget: + enabled: true + +prometheusRules: + enabled: true + defaultRulesEnabled: true + rules: + - alert: Foo + expr: bar_metric > 10 + +extraDeploy: + - apiVersion: subshell/v2 \ No newline at end of file diff --git a/charts/sophora-cluster-common/values.yaml b/charts/sophora-cluster-common/values.yaml new file mode 100644 
index 0000000..9cbec4b --- /dev/null +++ b/charts/sophora-cluster-common/values.yaml @@ -0,0 +1,78 @@ +## @section Common parameters +## + +## @param nameOverride String to partially override the name +nameOverride: "" + +## @param fullnameOverride String to fully override the release name +fullnameOverride: "" + +## @section Cluster Server Loadbalancer +## Parameters which configure service and ingress that point to the primary cluster server +## +clusterServerLb: + + ## @param clusterServerLb.enabled whether the service and ingress should be deployed or not + enabled: false + ## @param clusterServerLb.name names of the resources + name: cluster-server-lb + + ingress: + ## @param clusterServerLb.ingress.enabled whether the ingress should be enabled + enabled: true + ## @param clusterServerLb.ingress.ingressClassName name of the ingressClass used for the ingress + ingressClassName: "" + ## @param clusterServerLb.ingress.annotations annotations for the ingress + annotations: {} + ## @param clusterServerLb.ingress.hosts array with hostnames used for the ingress + ## e.g. 
+ ## hosts: + ## - host: "cms.mysophora.com" + hosts: [] + # - host: "" + # path: / + + service: + ## @param clusterServerLb.service.type Kubernetes service type + type: ClusterIP + selectorLabels: + ## @param clusterServerLb.service.selectorLabels.sophora.cloud/app labels used to select the primary Sophora server + sophora.cloud/app: cluster-server + ## @param clusterServerLb.service.selectorLabels.server.sophora.cloud/server-mode labels used to select the primary Sophora server + server.sophora.cloud/server-mode: primary + ## @param clusterServerLb.service.httpPort the Sophora server's http port + httpPort: 1196 + ## @param clusterServerLb.service.jmsPort the Sophora server's jms port + jmsPort: 1197 + ## @param clusterServerLb.service.publishNotReadyAddresses whether the service should publish not ready addresses + publishNotReadyAddresses: true + +## @section Cluster Server Pod Disruption Budget +## Configuration of the PodDisruptionBudget for the Sophora Cluster Servers (primary and replicas) +## +podDisruptionBudget: + ## @param podDisruptionBudget.enabled whether the PDB should be installed or not + enabled: false + ## @param podDisruptionBudget.name name of the PDB + name: sophora-cluster-server + ## @param podDisruptionBudget.minAvailable minimum available replicas + minAvailable: 2 + ## @param podDisruptionBudget.matchLabels.sophora.cloud/app selector label for the cluster servers + matchLabels: + sophora.cloud/app: cluster-server + +## @section Alerting / Prometheus Rules +## Configuration of the alerting rules +## +prometheusRules: + ## @param prometheusRules.enabled Whether the alerts should be installed + enabled: false + ## @param prometheusRules.defaultRulesEnabled Whether the default rules should be installed + defaultRulesEnabled: true + ## @param prometheusRules.rules allows to add custom rules + rules: [] + +## @section Extra Deploy +## +## @param extraDeploy Allows to specify custom resources that should be deployed +extraDeploy: [] \ No 
newline at end of file