# Patch for monitoring/templates/alert.rules (Prometheus alerting rules template).
#
# NOTE(review): line breaks and YAML indentation were reconstructed from a
# whitespace-collapsed copy of this patch -- confirm the context lines against
# the target file before applying.
#
# service_down:
#   * lowers the per-job "instances down" threshold from 10% to 5%;
#   * raises the `for` hold time from 2m to 5m;
#   * rewords the rule comment, summary and description. The expression
#     aggregates BY (job), so the resulting series only carries the `job`
#     label and `{{ $labels.instance }}` would render as an empty string.
#     The annotations now report the job and the measured percentage instead.
#
# End of file: removes the trailing blank line after ${custom_alert_rules}.
#
# NOTE(review): the `index` line below still carries the blob ids of the
# original revision of this patch; it is informational only for plain apply.
diff --git a/monitoring/templates/alert.rules b/monitoring/templates/alert.rules
index 648ec7a..31880d5 100644
--- a/monitoring/templates/alert.rules
+++ b/monitoring/templates/alert.rules
@@ -2,15 +2,15 @@
 groups:
 - name: basic
   rules:
-  # Alert for any instance that is unreachable for >2 minutes.
+  # Alert when more than 5% of a job's instances are unreachable for >5 minutes.
   - alert: service_down
-    expr: 100 * (count(up == 0) BY (job) / count(up) BY (job)) > 10
-    for: 2m
+    expr: 100 * (count(up == 0) BY (job) / count(up) BY (job)) > 5
+    for: 5m
     labels:
       severity: critical
     annotations:
-      summary: "Instance {{ $labels.instance }} down"
-      description: "{{ $labels.instance }} of job {{ $labels.job }} has been down for more than 2 minutes."
+      summary: "Instances of job {{ $labels.job }} down"
+      description: "{{ humanize $value }}% of the instances of job {{ $labels.job }} have been down for more than 5 minutes."
 
   - alert: high_load
     expr: node_load1 > 2
@@ -66,4 +66,3 @@ groups:
 ${elasticsearch_rules}
 ${elasticsearch_additional_rules}
 ${custom_alert_rules}
-