From 3a26f691ec1b08202991c962737841da0157f169 Mon Sep 17 00:00:00 2001 From: Mahsa Date: Fri, 8 Sep 2023 21:52:25 +0330 Subject: [PATCH 1/5] Changing teh-1 queries for test --- deployments/server/okd4-teh-1.yaml | 72 +++++++++++++----------------- 1 file changed, 30 insertions(+), 42 deletions(-) diff --git a/deployments/server/okd4-teh-1.yaml b/deployments/server/okd4-teh-1.yaml index 53edd6b..b4d77f4 100644 --- a/deployments/server/okd4-teh-1.yaml +++ b/deployments/server/okd4-teh-1.yaml @@ -6,62 +6,50 @@ configs: thanos_frontend: "http://thanos-query-frontend-http.openshift-monitoring.svc.cluster.local:9090/api/v1/query" services: - order: 1 - name: "Pass" + name: "PaaS: master-nodes" queries: - disruption: 'up{job="node-exporter"}' - outage: 'up{job="node-exporter"}' + disruption: 'count(cluster:master_nodes) <3 and count(cluster:master_nodes) > 0' + outage: 'count(cluster:master_nodes) == 0' - order: 2 - name: "IaaS" + name: "PaaS: etcd-servers" queries: - disruption: 'up{job="node-exporter"}' - outage: 'up{job="node-exporter"}' + disruption: 'sum(etcd_server_has_leader == 1) < 3 and sum(etcd_server_has_leader == 1) > 0' + outage: 'sum(etcd_server_has_leader == 1) == 0' - order: 3 - name: "Object Storage (S3)" + name: "Routes" queries: - disruption: 'up{job="node-exporter"}' - outage: 'up{job="node-exporter"}' + disruption: 'sum by (router_name) (changes(openshift_route_status{status="True"}[1h])) > 1' + outage: 'count by (router_name) (openshift_route_status{status="True"}) == 0' - order: 4 - name: "Container Registry" + name: "IaaS" queries: - disruption: 'up{job="node-exporter"}' - outage: 'up{job="node-exporter"}' + disruption: 'sum by (job, instance_id) (increase(openstack_nova_server_status{status="ACTIVE"}[5m])) > 0' + outage: 'sum(openstack_nova_up == 0) > 0 ' - order: 5 - name: "Service LoadBalancer (L4)" + name: "Object Storage (S3)" queries: - disruption: 'up{job="node-exporter"}' - outage: 'up{job="node-exporter"}' + disruption: 'rate(ceph_osd_recovery_ops[5m]) > 0 ' + outage: 'absent(ceph_health_status==1) == 0' - order: 6 - name: "Ingress (L7)" + name: "Container Registry: Harbor" queries: - disruption: 'up{job="node-exporter"}' - outage: 'up{job="node-exporter"}' + disruption: 'rate(harbor_core_http_request_total[5m]) > 100' + outage: '(avg by (cluster_id) (harbor_up)) == 0' - order: 7 - name: "Proxy" + name: "Ingress (L7)" queries: - disruption: 'up{job="node-exporter"}' - outage: 'up{job="node-exporter"}' + disruption: 'sum(changes(contour_httpproxy_valid[1h])) < sum(changes(contour_httpproxy_invalid[1h]))' + outage: '(envoy_cluster_upstream_rq_xx{envoy_cluster_name="primary"}) == 0' - order: 8 - name: "Monitoring" + name: "Monitoring: Thanos" queries: - disruption: 'up{job="node-exporter"}' - outage: 'up{job="node-exporter"}' + disruption: 'sum(thanos_status{check="healthy"}) < (count(thanos_status)/2)' + outage: 'sum(thanos_status{check="healthy"}) == 0' - order: 9 - name: "Logging" + name: "Monitoring: Prometheus" queries: - disruption: 'up{job="node-exporter"}' - outage: 'up{job="node-exporter"}' - - order: 10 - name: "Traffic observability (Hubble)" - queries: - disruption: 'up{job="node-exporter"}' - outage: 'up{job="node-exporter"}' - - order: 11 - name: "ArgoCD" - queries: - disruption: 'up{job="node-exporter"}' - outage: 'up{job="node-exporter"}' - - order: 12 - name: "ArgoWF" - queries: - disruption: 'up{job="node-exporter"}' - outage: 'up{job="node-exporter"}' + disruption: 'rate(prometheus_http_requests_total{code=~"5.."}[5m]) > 0' + outage: 'up{job="prometheus-operator"} == 0' + + + \ No newline at end of file From 8f75e15352acb24a5b60fc0ad194bb28323605e1 Mon Sep 17 00:00:00 2001 From: Mahsa Date: Sat, 9 Sep 2023 17:51:38 +0330 Subject: [PATCH 2/5] Combine some queries and add other services --- deployments/server/okd4-teh-1.yaml | 59 +++++++++++++++++++----------- 1 file changed, 37 insertions(+), 22 deletions(-) diff --git a/deployments/server/okd4-teh-1.yaml b/deployments/server/okd4-teh-1.yaml index b4d77f4..454b1ca 100644 --- a/deployments/server/okd4-teh-1.yaml +++ b/deployments/server/okd4-teh-1.yaml @@ -6,50 +6,65 @@ configs: thanos_frontend: "http://thanos-query-frontend-http.openshift-monitoring.svc.cluster.local:9090/api/v1/query" services: - order: 1 - name: "PaaS: master-nodes" + name: "PaaS" queries: - disruption: 'count(cluster:master_nodes) <3 and count(cluster:master_nodes) > 0' - outage: 'count(cluster:master_nodes) == 0' + disruption: '(count(cluster:master_nodes) > 0 and count(cluster:master_nodes) < 3) or (sum(etcd_server_has_leader == 1) > 0 and sum(etcd_server_has_leader == 1) < 3)' + outage: '(count(cluster:master_nodes) == 0) or (sum(etcd_server_has_leader == 1) == 0)' - order: 2 - name: "PaaS: etcd-servers" + name: "IaaS" queries: - disruption: 'sum(etcd_server_has_leader == 1) < 3 and sum(etcd_server_has_leader == 1) > 0' - outage: 'sum(etcd_server_has_leader == 1) == 0' + disruption: 'sum by (job, instance_id) (increase(openstack_nova_server_status{status="ACTIVE"}[5m])) > 0' + outage: 'sum(openstack_nova_up == 0) > 0 ' - order: 3 name: "Routes" queries: disruption: 'sum by (router_name) (changes(openshift_route_status{status="True"}[1h])) > 1' outage: 'count by (router_name) (openshift_route_status{status="True"}) == 0' - order: 4 - name: "IaaS" - queries: - disruption: 'sum by (job, instance_id) (increase(openstack_nova_server_status{status="ACTIVE"}[5m])) > 0' - outage: 'sum(openstack_nova_up == 0) > 0 ' - - order: 5 - name: "Object Storage (S3)" + name: "Storage" queries: disruption: 'rate(ceph_osd_recovery_ops[5m]) > 0 ' outage: 'absent(ceph_health_status==1) == 0' - - order: 6 - name: "Container Registry: Harbor" + - order: 5 + name: "Container Registry" queries: disruption: 'rate(harbor_core_http_request_total[5m]) > 100' outage: '(avg by (cluster_id) (harbor_up)) == 0' + - order: 6 + name: "Service LoadBalancer (L4)" + status: 'Available' + queries: + disruption: '' + outage: '' - order: 7 name: "Ingress (L7)" queries: disruption: 'sum(changes(contour_httpproxy_valid[1h])) < sum(changes(contour_httpproxy_invalid[1h]))' outage: '(envoy_cluster_upstream_rq_xx{envoy_cluster_name="primary"}) == 0' - order: 8 - name: "Monitoring: Thanos" + name: "Proxy" + status: 'Available' queries: - disruption: 'sum(thanos_status{check="healthy"}) < (count(thanos_status)/2)' - outage: 'sum(thanos_status{check="healthy"}) == 0' + disruption: '' + outage: '' - order: 9 - name: "Monitoring: Prometheus" + name: "Monitoring" queries: - disruption: 'rate(prometheus_http_requests_total{code=~"5.."}[5m]) > 0' - outage: 'up{job="prometheus-operator"} == 0' - + disruption: '(sum(thanos_status{check="healthy"}) < (count(thanos_status)/2)) or (rate(prometheus_http_requests_total{code=~"5.."}[5m]) > 0)' + outage: '(sum(thanos_status{check="healthy"}) == 0) or (up{job="prometheus-operator"} == 0)' + - order: 10 + name: "Logging" + queries: + disruption: 'sum(increase(logging_resource_state{status="inactive"}[5m])) < 0' + outage: 'up{namespace="snappcloud-logging"} == 0' + - order: 11 + name: "Traffic observability: Hubble" + queries: + disruption: 'sum(sum by (verdict) ((hubble_flows_processed_total) unless hubble_flows_processed_total{verdict="DROPPED"})) < sum(hubble_drop_total)' + outage: 'sum(sum by (verdict) ((hubble_flows_processed_total) unless hubble_flows_processed_total{verdict="DROPPED"})) == 0' + - order: 12 + name: "ArgoCD" + queries: + disruption: 'sum(argocd_app_info{health_status="Progressing"}) >= (count(argocd_app_info)/2)' + outage: 'argocd_cluster_info == 0' - \ No newline at end of file From f312d73578d96f4b6024f7ba09b6783a2669d3e8 Mon Sep 17 00:00:00 2001 From: Mahsa Date: Tue, 10 Oct 2023 17:13:38 +0330 Subject: [PATCH 3/5] Changes on IaaS, S3, SLB, Ingress and monitoring --- deployments/server/okd4-teh-1.yaml | 25 ++++++++++++------------- 1 file changed, 12 insertions(+), 13 deletions(-) diff --git a/deployments/server/okd4-teh-1.yaml b/deployments/server/okd4-teh-1.yaml index 454b1ca..8d0126e 100644 --- a/deployments/server/okd4-teh-1.yaml +++ b/deployments/server/okd4-teh-1.yaml @@ -13,18 +13,18 @@ configs: - order: 2 name: "IaaS" queries: - disruption: 'sum by (job, instance_id) (increase(openstack_nova_server_status{status="ACTIVE"}[5m])) > 0' - outage: 'sum(openstack_nova_up == 0) > 0 ' + disruption: '((sum(rate(openstack_cinder_volume_status{status=~"error.*"}[5m]))) and (sum(rate(openstack_nova_server_status{status="UNKNOWN"}[5m])))) > 0' + outage: '(openstack_cinder_up and openstack_nova_up) == 0' - order: 3 name: "Routes" queries: disruption: 'sum by (router_name) (changes(openshift_route_status{status="True"}[1h])) > 1' outage: 'count by (router_name) (openshift_route_status{status="True"}) == 0' - order: 4 - name: "Storage" + name: "Storage: S3" queries: - disruption: 'rate(ceph_osd_recovery_ops[5m]) > 0 ' - outage: 'absent(ceph_health_status==1) == 0' + disruption: '(sum(rate(radosgw_usage_ops_total[5m])) == 0) and (ceph_health_status == 1)' + outage: '(sum(rate(radosgw_usage_ops_total[10m])) == 0) and (ceph_health_status == 2)' - order: 5 name: "Container Registry" queries: @@ -32,17 +32,16 @@ configs: outage: '(avg by (cluster_id) (harbor_up)) == 0' - order: 6 name: "Service LoadBalancer (L4)" - status: 'Available' queries: - disruption: '' - outage: '' + disruption: 'sum by (name) (increase(health_http_requests_total{status_code!="200",name="svc-lb"}[2m])) / on(name) group_left() sum by (name) (increase(health_http_requests_total{name="svc-lb"}[2m])) * 100 > 2.5' + outage: 'sum by (name) (increase(health_http_requests_total{status_code!="200",name="svc-lb"}[2m])) / on(name) group_left() sum by (name) (increase(health_http_requests_total{name="svc-lb"}[2m])) * 100 > 10' - order: 7 name: "Ingress (L7)" queries: - disruption: 'sum(changes(contour_httpproxy_valid[1h])) < sum(changes(contour_httpproxy_invalid[1h]))' - outage: '(envoy_cluster_upstream_rq_xx{envoy_cluster_name="primary"}) == 0' + disruption: 'sum by (name) (increase(health_http_requests_total{status_code!="200",name=~"(inter-dc|public|private).*"}[2m])) / on(name) group_left() sum by (name) (increase(health_http_requests_total{name=~"(inter-dc|public|private).*"}[2m])) * 100 > 2.5' + outage: 'sum by (name) (increase(health_http_requests_total{status_code!="200",name=~"(inter-dc|public|private).*"}[2m])) / on(name) group_left() sum by (name) (increase(health_http_requests_total{name=~"(inter-dc|public|private).*"}[2m])) * 100 > 10' - order: 8 - name: "Proxy" + name: "Proxy" #TODO status: 'Available' queries: disruption: '' @@ -50,8 +49,8 @@ configs: - order: 9 name: "Monitoring" queries: - disruption: '(sum(thanos_status{check="healthy"}) < (count(thanos_status)/2)) or (rate(prometheus_http_requests_total{code=~"5.."}[5m]) > 0)' - outage: '(sum(thanos_status{check="healthy"}) == 0) or (up{job="prometheus-operator"} == 0)' + disruption: 'sum(rate(container_cpu_usage_seconds_total{job="kubelet", namespace=~".*-monitoring"}[5m])) > 50 or sum(rate(container_cpu_usage_seconds_total{job="kubelet", namespace=~".*-monitoring"}[5m])) < 30' + outage: 'up{job=~"monitoring/default"} == 0' - order: 10 name: "Logging" queries: From 703d23c3ff5ea1555aced6ebce1eff4bf439ca2a Mon Sep 17 00:00:00 2001 From: Mahsa Date: Mon, 16 Oct 2023 10:54:58 +0330 Subject: [PATCH 4/5] Change orders, edit loki and hubble queries --- deployments/server/okd4-teh-1.yaml | 31 +++++++++++++----------------- 1 file changed, 13 insertions(+), 18 deletions(-) diff --git a/deployments/server/okd4-teh-1.yaml b/deployments/server/okd4-teh-1.yaml index 8d0126e..5ff2ccd 100644 --- a/deployments/server/okd4-teh-1.yaml +++ b/deployments/server/okd4-teh-1.yaml @@ -16,52 +16,47 @@ configs: disruption: '((sum(rate(openstack_cinder_volume_status{status=~"error.*"}[5m]))) and (sum(rate(openstack_nova_server_status{status="UNKNOWN"}[5m])))) > 0' outage: '(openstack_cinder_up and openstack_nova_up) == 0' - order: 3 - name: "Routes" - queries: - disruption: 'sum by (router_name) (changes(openshift_route_status{status="True"}[1h])) > 1' - outage: 'count by (router_name) (openshift_route_status{status="True"}) == 0' - - order: 4 name: "Storage: S3" queries: disruption: '(sum(rate(radosgw_usage_ops_total[5m])) == 0) and (ceph_health_status == 1)' outage: '(sum(rate(radosgw_usage_ops_total[10m])) == 0) and (ceph_health_status == 2)' - - order: 5 - name: "Container Registry" + - order: 4 + name: "Container Registry" #TODO: change queries queries: disruption: 'rate(harbor_core_http_request_total[5m]) > 100' outage: '(avg by (cluster_id) (harbor_up)) == 0' - - order: 6 + - order: 5 name: "Service LoadBalancer (L4)" queries: disruption: 'sum by (name) (increase(health_http_requests_total{status_code!="200",name="svc-lb"}[2m])) / on(name) group_left() sum by (name) (increase(health_http_requests_total{name="svc-lb"}[2m])) * 100 > 2.5' outage: 'sum by (name) (increase(health_http_requests_total{status_code!="200",name="svc-lb"}[2m])) / on(name) group_left() sum by (name) (increase(health_http_requests_total{name="svc-lb"}[2m])) * 100 > 10' - - order: 7 + - order: 6 name: "Ingress (L7)" queries: disruption: 'sum by (name) (increase(health_http_requests_total{status_code!="200",name=~"(inter-dc|public|private).*"}[2m])) / on(name) group_left() sum by (name) (increase(health_http_requests_total{name=~"(inter-dc|public|private).*"}[2m])) * 100 > 2.5' outage: 'sum by (name) (increase(health_http_requests_total{status_code!="200",name=~"(inter-dc|public|private).*"}[2m])) / on(name) group_left() sum by (name) (increase(health_http_requests_total{name=~"(inter-dc|public|private).*"}[2m])) * 100 > 10' - - order: 8 + - order: 7 name: "Proxy" #TODO status: 'Available' queries: disruption: '' outage: '' - - order: 9 + - order: 8 name: "Monitoring" queries: disruption: 'sum(rate(container_cpu_usage_seconds_total{job="kubelet", namespace=~".*-monitoring"}[5m])) > 50 or sum(rate(container_cpu_usage_seconds_total{job="kubelet", namespace=~".*-monitoring"}[5m])) < 30' outage: 'up{job=~"monitoring/default"} == 0' - - order: 10 + - order: 9 name: "Logging" queries: - disruption: 'sum(increase(logging_resource_state{status="inactive"}[5m])) < 0' - outage: 'up{namespace="snappcloud-logging"} == 0' - - order: 11 + disruption: 'sum(rate(loki_request_duration_seconds_count{status_code!="200"}[5m])) / sum(rate(loki_request_duration_seconds_count{status_code="200"}[5m])) < 100 and sum(rate(loki_request_duration_seconds_count{status_code!="200"}[5m])) / sum(rate(loki_request_duration_seconds_count{status_code="200"}[5m])) > 350' + outage: 'sum(rate(loki_request_duration_seconds_count{status_code!="200"}[5m])) / sum(rate(loki_request_duration_seconds_count{status_code="200"}[5m])) < 76 and sum(rate(loki_request_duration_seconds_count{status_code!="200"}[5m])) / sum(rate(loki_request_duration_seconds_count{status_code="200"}[5m])) > 375' + - order: 10 name: "Traffic observability: Hubble" queries: - disruption: 'sum(sum by (verdict) ((hubble_flows_processed_total) unless hubble_flows_processed_total{verdict="DROPPED"})) < sum(hubble_drop_total)' - outage: 'sum(sum by (verdict) ((hubble_flows_processed_total) unless hubble_flows_processed_total{verdict="DROPPED"})) == 0' - - order: 12 + disruption: 'sum(rate(hubble_flows_processed_total[2m])) < 100000' + outage: 'sum (rate(hubble_flows_processed_total[2m])) < 1000' + - order: 11 name: "ArgoCD" queries: disruption: 'sum(argocd_app_info{health_status="Progressing"}) >= (count(argocd_app_info)/2)' From 2b313987b9e5d29408e0e032acdc891130feb91f Mon Sep 17 00:00:00 2001 From: Mahsa Date: Mon, 16 Oct 2023 14:33:15 +0330 Subject: [PATCH 5/5] delete proxy and registry, change all regions --- deployments/server/okd4-teh-1.yaml | 21 ++------- deployments/server/okd4-teh-2.yaml | 58 +++++++++++------------- deployments/server/snappgroup-teh-1.yaml | 50 +++++++++++++++----- 3 files changed, 70 insertions(+), 59 deletions(-) diff --git a/deployments/server/okd4-teh-1.yaml b/deployments/server/okd4-teh-1.yaml index 5ff2ccd..843a0c1 100644 --- a/deployments/server/okd4-teh-1.yaml +++ b/deployments/server/okd4-teh-1.yaml @@ -21,42 +21,31 @@ configs: disruption: '(sum(rate(radosgw_usage_ops_total[5m])) == 0) and (ceph_health_status == 1)' outage: '(sum(rate(radosgw_usage_ops_total[10m])) == 0) and (ceph_health_status == 2)' - order: 4 - name: "Container Registry" #TODO: change queries - queries: - disruption: 'rate(harbor_core_http_request_total[5m]) > 100' - outage: '(avg by (cluster_id) (harbor_up)) == 0' - - order: 5 name: "Service LoadBalancer (L4)" queries: disruption: 'sum by (name) (increase(health_http_requests_total{status_code!="200",name="svc-lb"}[2m])) / on(name) group_left() sum by (name) (increase(health_http_requests_total{name="svc-lb"}[2m])) * 100 > 2.5' outage: 'sum by (name) (increase(health_http_requests_total{status_code!="200",name="svc-lb"}[2m])) / on(name) group_left() sum by (name) (increase(health_http_requests_total{name="svc-lb"}[2m])) * 100 > 10' - - order: 6 + - order: 5 name: "Ingress (L7)" queries: disruption: 'sum by (name) (increase(health_http_requests_total{status_code!="200",name=~"(inter-dc|public|private).*"}[2m])) / on(name) group_left() sum by (name) (increase(health_http_requests_total{name=~"(inter-dc|public|private).*"}[2m])) * 100 > 2.5' outage: 'sum by (name) (increase(health_http_requests_total{status_code!="200",name=~"(inter-dc|public|private).*"}[2m])) / on(name) group_left() sum by (name) (increase(health_http_requests_total{name=~"(inter-dc|public|private).*"}[2m])) * 100 > 10' - - order: 7 - name: "Proxy" #TODO - status: 'Available' - queries: - disruption: '' - outage: '' - - order: 8 + - order: 6 name: "Monitoring" queries: disruption: 'sum(rate(container_cpu_usage_seconds_total{job="kubelet", namespace=~".*-monitoring"}[5m])) > 50 or sum(rate(container_cpu_usage_seconds_total{job="kubelet", namespace=~".*-monitoring"}[5m])) < 30' outage: 'up{job=~"monitoring/default"} == 0' - - order: 9 + - order: 7 name: "Logging" queries: disruption: 'sum(rate(loki_request_duration_seconds_count{status_code!="200"}[5m])) / sum(rate(loki_request_duration_seconds_count{status_code="200"}[5m])) < 100 and sum(rate(loki_request_duration_seconds_count{status_code!="200"}[5m])) / sum(rate(loki_request_duration_seconds_count{status_code="200"}[5m])) > 350' outage: 'sum(rate(loki_request_duration_seconds_count{status_code!="200"}[5m])) / sum(rate(loki_request_duration_seconds_count{status_code="200"}[5m])) < 76 and sum(rate(loki_request_duration_seconds_count{status_code!="200"}[5m])) / sum(rate(loki_request_duration_seconds_count{status_code="200"}[5m])) > 375' - - order: 10 + - order: 8 name: "Traffic observability: Hubble" queries: disruption: 'sum(rate(hubble_flows_processed_total[2m])) < 100000' outage: 'sum (rate(hubble_flows_processed_total[2m])) < 1000' - - order: 11 + - order: 9 name: "ArgoCD" queries: disruption: 'sum(argocd_app_info{health_status="Progressing"}) >= (count(argocd_app_info)/2)' diff --git a/deployments/server/okd4-teh-2.yaml b/deployments/server/okd4-teh-2.yaml index edc80a5..9cb8baa 100644 --- a/deployments/server/okd4-teh-2.yaml +++ b/deployments/server/okd4-teh-2.yaml @@ -6,52 +6,48 @@ configs: thanos_frontend: "http://thanos-query-frontend-http.openshift-monitoring.svc.cluster.local:9090/api/v1/query" services: - order: 1 - name: "Pass" + name: "PaaS" queries: - disruption: 'up{job="node-exporter"}' - outage: 'up{job="node-exporter"}' + disruption: '(count(cluster:master_nodes) > 0 and count(cluster:master_nodes) < 3) or (sum(etcd_server_has_leader == 1) > 0 and sum(etcd_server_has_leader == 1) < 3)' + outage: '(count(cluster:master_nodes) == 0) or (sum(etcd_server_has_leader == 1) == 0)' - order: 2 name: "IaaS" queries: - disruption: 'up{job="node-exporter"}' - outage: 'up{job="node-exporter"}' + disruption: '((sum(rate(openstack_cinder_volume_status{status=~"error.*"}[5m]))) and (sum(rate(openstack_nova_server_status{status="UNKNOWN"}[5m])))) > 0' + outage: '(openstack_cinder_up and openstack_nova_up) == 0' - order: 3 - name: "Object Storage (S3)" + name: "Storage: S3" queries: - disruption: 'up{job="node-exporter"}' - outage: 'up{job="node-exporter"}' + disruption: '(sum(rate(radosgw_usage_ops_total[5m])) == 0) and (ceph_health_status == 1)' + outage: '(sum(rate(radosgw_usage_ops_total[10m])) == 0) and (ceph_health_status == 2)' - order: 4 - name: "Container Registry" + name: "Service LoadBalancer (L4)" queries: - disruption: 'up{job="node-exporter"}' - outage: 'up{job="node-exporter"}' + disruption: 'sum by (name) (increase(health_http_requests_total{status_code!="200",name="svc-lb"}[2m])) / on(name) group_left() sum by (name) (increase(health_http_requests_total{name="svc-lb"}[2m])) * 100 > 2.5' + outage: 'sum by (name) (increase(health_http_requests_total{status_code!="200",name="svc-lb"}[2m])) / on(name) group_left() sum by (name) (increase(health_http_requests_total{name="svc-lb"}[2m])) * 100 > 10' - order: 5 - name: "Service LoadBalancer (L4)" + name: "Ingress (L7)" queries: - disruption: 'up{job="node-exporter"}' - outage: 'up{job="node-exporter"}' + disruption: 'sum by (name) (increase(health_http_requests_total{status_code!="200",name=~"(inter-dc|public|private).*"}[2m])) / on(name) group_left() sum by (name) (increase(health_http_requests_total{name=~"(inter-dc|public|private).*"}[2m])) * 100 > 2.5' + outage: 'sum by (name) (increase(health_http_requests_total{status_code!="200",name=~"(inter-dc|public|private).*"}[2m])) / on(name) group_left() sum by (name) (increase(health_http_requests_total{name=~"(inter-dc|public|private).*"}[2m])) * 100 > 10' - order: 6 - name: "Ingress (L7)" + name: "Monitoring" queries: - disruption: 'up{job="node-exporter"}' - outage: 'up{job="node-exporter"}' + disruption: 'sum(rate(container_cpu_usage_seconds_total{job="kubelet", namespace=~".*-monitoring"}[5m])) > 50 or sum(rate(container_cpu_usage_seconds_total{job="kubelet", namespace=~".*-monitoring"}[5m])) < 30' + outage: 'up{job=~"monitoring/default"} == 0' - order: 7 - name: "Proxy" + name: "Logging" queries: - disruption: 'up{job="node-exporter"}' - outage: 'up{job="node-exporter"}' + disruption: 'sum(rate(loki_request_duration_seconds_count{status_code!="200"}[5m])) / sum(rate(loki_request_duration_seconds_count{status_code="200"}[5m])) < 100 and sum(rate(loki_request_duration_seconds_count{status_code!="200"}[5m])) / sum(rate(loki_request_duration_seconds_count{status_code="200"}[5m])) > 350' + outage: 'sum(rate(loki_request_duration_seconds_count{status_code!="200"}[5m])) / sum(rate(loki_request_duration_seconds_count{status_code="200"}[5m])) < 76 and sum(rate(loki_request_duration_seconds_count{status_code!="200"}[5m])) / sum(rate(loki_request_duration_seconds_count{status_code="200"}[5m])) > 375' - order: 8 - name: "Monitoring" + name: "Traffic observability: Hubble" queries: - disruption: 'up{job="node-exporter"}' - outage: 'up{job="node-exporter"}' + disruption: 'sum(rate(hubble_flows_processed_total[2m])) < 100000' + outage: 'sum (rate(hubble_flows_processed_total[2m])) < 1000' - order: 9 - name: "Logging" + name: "ArgoCD" queries: - disruption: 'up{job="node-exporter"}' - outage: 'up{job="node-exporter"}' - - order: 10 - name: "Traffic observability (Hubble)" - queries: - disruption: 'up{job="node-exporter"}' - outage: 'up{job="node-exporter"}' + disruption: 'sum(argocd_app_info{health_status="Progressing"}) >= (count(argocd_app_info)/2)' + outage: 'argocd_cluster_info == 0' + diff --git a/deployments/server/snappgroup-teh-1.yaml b/deployments/server/snappgroup-teh-1.yaml index d7e4002..5efc0fc 100644 --- a/deployments/server/snappgroup-teh-1.yaml +++ b/deployments/server/snappgroup-teh-1.yaml @@ -6,22 +6,48 @@ configs: thanos_frontend: "http://thanos-query-frontend-http.openshift-monitoring.svc.cluster.local:9090/api/v1/query" services: - order: 1 - name: "Container Registry" + name: "PaaS" queries: - disruption: 'up{job="node-exporter"}' - outage: 'up{job="node-exporter"}' + disruption: '(count(cluster:master_nodes) > 0 and count(cluster:master_nodes) < 3) or (sum(etcd_server_has_leader == 1) > 0 and sum(etcd_server_has_leader == 1) < 3)' + outage: '(count(cluster:master_nodes) == 0) or (sum(etcd_server_has_leader == 1) == 0)' - order: 2 - name: "Service LoadBalancer (L4)" + name: "IaaS" queries: - disruption: 'up{job="node-exporter"}' - outage: 'up{job="node-exporter"}' + disruption: '((sum(rate(openstack_cinder_volume_status{status=~"error.*"}[5m]))) and (sum(rate(openstack_nova_server_status{status="UNKNOWN"}[5m])))) > 0' + outage: '(openstack_cinder_up and openstack_nova_up) == 0' - order: 3 - name: "Ingress (L7)" + name: "Storage: S3" queries: - disruption: 'up{job="node-exporter"}' - outage: 'up{job="node-exporter"}' + disruption: '(sum(rate(radosgw_usage_ops_total[5m])) == 0) and (ceph_health_status == 1)' + outage: '(sum(rate(radosgw_usage_ops_total[10m])) == 0) and (ceph_health_status == 2)' - order: 4 - name: "Proxy" + name: "Service LoadBalancer (L4)" + queries: + disruption: 'sum by (name) (increase(health_http_requests_total{status_code!="200",name="svc-lb"}[2m])) / on(name) group_left() sum by (name) (increase(health_http_requests_total{name="svc-lb"}[2m])) * 100 > 2.5' + outage: 'sum by (name) (increase(health_http_requests_total{status_code!="200",name="svc-lb"}[2m])) / on(name) group_left() sum by (name) (increase(health_http_requests_total{name="svc-lb"}[2m])) * 100 > 10' + - order: 5 + name: "Ingress (L7)" queries: - disruption: 'up{job="node-exporter"}' - outage: 'up{job="node-exporter"}' + disruption: 'sum by (name) (increase(health_http_requests_total{status_code!="200",name=~"(inter-dc|public|private).*"}[2m])) / on(name) group_left() sum by (name) (increase(health_http_requests_total{name=~"(inter-dc|public|private).*"}[2m])) * 100 > 2.5' + outage: 'sum by (name) (increase(health_http_requests_total{status_code!="200",name=~"(inter-dc|public|private).*"}[2m])) / on(name) group_left() sum by (name) (increase(health_http_requests_total{name=~"(inter-dc|public|private).*"}[2m])) * 100 > 10' + - order: 6 + name: "Monitoring" + queries: + disruption: 'sum(rate(container_cpu_usage_seconds_total{job="kubelet", namespace=~".*-monitoring"}[5m])) > 50 or sum(rate(container_cpu_usage_seconds_total{job="kubelet", namespace=~".*-monitoring"}[5m])) < 30' + outage: 'up{job=~"monitoring/default"} == 0' + - order: 7 + name: "Logging" + queries: + disruption: 'sum(rate(loki_request_duration_seconds_count{status_code!="200"}[5m])) / sum(rate(loki_request_duration_seconds_count{status_code="200"}[5m])) < 100 and sum(rate(loki_request_duration_seconds_count{status_code!="200"}[5m])) / sum(rate(loki_request_duration_seconds_count{status_code="200"}[5m])) > 350' + outage: 'sum(rate(loki_request_duration_seconds_count{status_code!="200"}[5m])) / sum(rate(loki_request_duration_seconds_count{status_code="200"}[5m])) < 76 and sum(rate(loki_request_duration_seconds_count{status_code!="200"}[5m])) / sum(rate(loki_request_duration_seconds_count{status_code="200"}[5m])) > 375' + - order: 8 + name: "Traffic observability: Hubble" + queries: + disruption: 'sum(rate(hubble_flows_processed_total[2m])) < 100000' + outage: 'sum (rate(hubble_flows_processed_total[2m])) < 1000' + - order: 9 + name: "ArgoCD" + queries: + disruption: 'sum(argocd_app_info{health_status="Progressing"}) >= (count(argocd_app_info)/2)' + outage: 'argocd_cluster_info == 0' +