From 04477d16bf8765b175945c90db18f47e1735d561 Mon Sep 17 00:00:00 2001 From: Mahsa Soleimani Date: Mon, 8 Apr 2024 12:37:40 +0330 Subject: [PATCH] Threshold tuning (#14) * change monitoring thresholds * change hubble threshold - snappgroup --------- Co-authored-by: Mahsa --- deployment/charts/okd4-teh-1.yaml | 2 +- deployment/charts/okd4-teh-2.yaml | 2 +- deployment/charts/snappgroup-teh-1.yaml | 4 ++-- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/deployment/charts/okd4-teh-1.yaml b/deployment/charts/okd4-teh-1.yaml index b5c7a78..6027bcd 100644 --- a/deployment/charts/okd4-teh-1.yaml +++ b/deployment/charts/okd4-teh-1.yaml @@ -33,7 +33,7 @@ configs: - order: 6 name: "Monitoring" queries: - disruption: 'sum(rate(container_cpu_usage_seconds_total{job="kubelet", namespace=~".*-monitoring"}[5m])) > 50 or sum(rate(container_cpu_usage_seconds_total{job="kubelet", namespace=~".*-monitoring"}[5m])) < 30' + disruption: 'sum(rate(container_cpu_usage_seconds_total{job="kubelet", namespace=~".*-monitoring"}[5m])) > 80' outage: 'up{job=~"monitoring/default"} == 0' - order: 7 name: "Logging" diff --git a/deployment/charts/okd4-teh-2.yaml b/deployment/charts/okd4-teh-2.yaml index 9288153..9fe5fa0 100644 --- a/deployment/charts/okd4-teh-2.yaml +++ b/deployment/charts/okd4-teh-2.yaml @@ -33,7 +33,7 @@ configs: - order: 6 name: "Monitoring" queries: - disruption: 'sum(rate(container_cpu_usage_seconds_total{job="kubelet", namespace=~".*-monitoring"}[5m])) > 50 or sum(rate(container_cpu_usage_seconds_total{job="kubelet", namespace=~".*-monitoring"}[5m])) < 30' + disruption: 'sum(rate(container_cpu_usage_seconds_total{job="kubelet", namespace=~".*-monitoring"}[5m])) > 80' outage: 'up{job=~"monitoring/default"} == 0' - order: 7 name: "Logging" diff --git a/deployment/charts/snappgroup-teh-1.yaml b/deployment/charts/snappgroup-teh-1.yaml index 5efc0fc..122ddd2 100644 --- a/deployment/charts/snappgroup-teh-1.yaml +++ b/deployment/charts/snappgroup-teh-1.yaml @@ -33,7 +33,7 @@ configs: - order: 6 name: "Monitoring" queries: - disruption: 'sum(rate(container_cpu_usage_seconds_total{job="kubelet", namespace=~".*-monitoring"}[5m])) > 50 or sum(rate(container_cpu_usage_seconds_total{job="kubelet", namespace=~".*-monitoring"}[5m])) < 30' + disruption: 'sum(rate(container_cpu_usage_seconds_total{job="kubelet", namespace=~".*-monitoring"}[5m])) > 80' outage: 'up{job=~"monitoring/default"} == 0' - order: 7 name: "Logging" @@ -43,7 +43,7 @@ configs: - order: 8 name: "Traffic observability: Hubble" queries: - disruption: 'sum(rate(hubble_flows_processed_total[2m])) < 100000' + disruption: 'sum(rate(hubble_flows_processed_total[2m])) < 10000' outage: 'sum (rate(hubble_flows_processed_total[2m])) < 1000' - order: 9 name: "ArgoCD"