# Patch summary
# -------------
# Touches three files under deployments/server/: okd4-teh-1.yaml,
# okd4-teh-2.yaml and snappgroup-teh-1.yaml.
#
# In each file the `services` list is rewritten so that the placeholder
# query 'up{job="node-exporter"}' is replaced by a dedicated
# disruption/outage PromQL expression per service. After the patch every
# file lists the same nine services, in this order: PaaS, IaaS,
# Storage: S3, Service LoadBalancer (L4), Ingress (L7), Monitoring,
# Logging, Traffic observability: Hubble, ArgoCD.
#
# NOTE(review): this patch was recovered from a copy whose line breaks had
# been collapsed. The leading YAML indentation after each diff marker was
# rebuilt with conventional 2-space nesting and is an assumption; confirm
# it against the target files before applying (or apply with whitespace
# tolerance).
#
# NOTE(review): the "Logging" queries compare the same ratio with
# `< 100 and ... > 350` (disruption) and `< 76 and ... > 375` (outage).
# Both comparisons cannot hold for a single value, so these expressions
# look like they can never return a sample. The query text is kept exactly
# as submitted; confirm the intended operator and thresholds.
#
diff --git a/deployments/server/okd4-teh-1.yaml b/deployments/server/okd4-teh-1.yaml
index 53edd6b..843a0c1 100644
--- a/deployments/server/okd4-teh-1.yaml
+++ b/deployments/server/okd4-teh-1.yaml
@@ -6,62 +6,48 @@ configs:
   thanos_frontend: "http://thanos-query-frontend-http.openshift-monitoring.svc.cluster.local:9090/api/v1/query"
 services:
   - order: 1
-    name: "Pass"
+    name: "PaaS"
     queries:
-      disruption: 'up{job="node-exporter"}'
-      outage: 'up{job="node-exporter"}'
+      disruption: '(count(cluster:master_nodes) > 0 and count(cluster:master_nodes) < 3) or (sum(etcd_server_has_leader == 1) > 0 and sum(etcd_server_has_leader == 1) < 3)'
+      outage: '(count(cluster:master_nodes) == 0) or (sum(etcd_server_has_leader == 1) == 0)'
   - order: 2
     name: "IaaS"
     queries:
-      disruption: 'up{job="node-exporter"}'
-      outage: 'up{job="node-exporter"}'
+      disruption: '((sum(rate(openstack_cinder_volume_status{status=~"error.*"}[5m]))) and (sum(rate(openstack_nova_server_status{status="UNKNOWN"}[5m])))) > 0'
+      outage: '(openstack_cinder_up and openstack_nova_up) == 0'
   - order: 3
-    name: "Object Storage (S3)"
+    name: "Storage: S3"
     queries:
-      disruption: 'up{job="node-exporter"}'
-      outage: 'up{job="node-exporter"}'
+      disruption: '(sum(rate(radosgw_usage_ops_total[5m])) == 0) and (ceph_health_status == 1)'
+      outage: '(sum(rate(radosgw_usage_ops_total[10m])) == 0) and (ceph_health_status == 2)'
   - order: 4
-    name: "Container Registry"
+    name: "Service LoadBalancer (L4)"
     queries:
-      disruption: 'up{job="node-exporter"}'
-      outage: 'up{job="node-exporter"}'
+      disruption: 'sum by (name) (increase(health_http_requests_total{status_code!="200",name="svc-lb"}[2m])) / on(name) group_left() sum by (name) (increase(health_http_requests_total{name="svc-lb"}[2m])) * 100 > 2.5'
+      outage: 'sum by (name) (increase(health_http_requests_total{status_code!="200",name="svc-lb"}[2m])) / on(name) group_left() sum by (name) (increase(health_http_requests_total{name="svc-lb"}[2m])) * 100 > 10'
   - order: 5
-    name: "Service LoadBalancer (L4)"
+    name: "Ingress (L7)"
     queries:
-      disruption: 'up{job="node-exporter"}'
-      outage: 'up{job="node-exporter"}'
+      disruption: 'sum by (name) (increase(health_http_requests_total{status_code!="200",name=~"(inter-dc|public|private).*"}[2m])) / on(name) group_left() sum by (name) (increase(health_http_requests_total{name=~"(inter-dc|public|private).*"}[2m])) * 100 > 2.5'
+      outage: 'sum by (name) (increase(health_http_requests_total{status_code!="200",name=~"(inter-dc|public|private).*"}[2m])) / on(name) group_left() sum by (name) (increase(health_http_requests_total{name=~"(inter-dc|public|private).*"}[2m])) * 100 > 10'
   - order: 6
-    name: "Ingress (L7)"
+    name: "Monitoring"
     queries:
-      disruption: 'up{job="node-exporter"}'
-      outage: 'up{job="node-exporter"}'
+      disruption: 'sum(rate(container_cpu_usage_seconds_total{job="kubelet", namespace=~".*-monitoring"}[5m])) > 50 or sum(rate(container_cpu_usage_seconds_total{job="kubelet", namespace=~".*-monitoring"}[5m])) < 30'
+      outage: 'up{job=~"monitoring/default"} == 0'
   - order: 7
-    name: "Proxy"
+    name: "Logging"
     queries:
-      disruption: 'up{job="node-exporter"}'
-      outage: 'up{job="node-exporter"}'
+      disruption: 'sum(rate(loki_request_duration_seconds_count{status_code!="200"}[5m])) / sum(rate(loki_request_duration_seconds_count{status_code="200"}[5m])) < 100 and sum(rate(loki_request_duration_seconds_count{status_code!="200"}[5m])) / sum(rate(loki_request_duration_seconds_count{status_code="200"}[5m])) > 350'
+      outage: 'sum(rate(loki_request_duration_seconds_count{status_code!="200"}[5m])) / sum(rate(loki_request_duration_seconds_count{status_code="200"}[5m])) < 76 and sum(rate(loki_request_duration_seconds_count{status_code!="200"}[5m])) / sum(rate(loki_request_duration_seconds_count{status_code="200"}[5m])) > 375'
   - order: 8
-    name: "Monitoring"
+    name: "Traffic observability: Hubble"
     queries:
-      disruption: 'up{job="node-exporter"}'
-      outage: 'up{job="node-exporter"}'
+      disruption: 'sum(rate(hubble_flows_processed_total[2m])) < 100000'
+      outage: 'sum (rate(hubble_flows_processed_total[2m])) < 1000'
   - order: 9
-    name: "Logging"
-    queries:
-      disruption: 'up{job="node-exporter"}'
-      outage: 'up{job="node-exporter"}'
-  - order: 10
-    name: "Traffic observability (Hubble)"
-    queries:
-      disruption: 'up{job="node-exporter"}'
-      outage: 'up{job="node-exporter"}'
-  - order: 11
     name: "ArgoCD"
     queries:
-      disruption: 'up{job="node-exporter"}'
-      outage: 'up{job="node-exporter"}'
-  - order: 12
-    name: "ArgoWF"
-    queries:
-      disruption: 'up{job="node-exporter"}'
-      outage: 'up{job="node-exporter"}'
+      disruption: 'sum(argocd_app_info{health_status="Progressing"}) >= (count(argocd_app_info)/2)'
+      outage: 'argocd_cluster_info == 0'
+
diff --git a/deployments/server/okd4-teh-2.yaml b/deployments/server/okd4-teh-2.yaml
index edc80a5..9cb8baa 100644
--- a/deployments/server/okd4-teh-2.yaml
+++ b/deployments/server/okd4-teh-2.yaml
@@ -6,52 +6,48 @@ configs:
   thanos_frontend: "http://thanos-query-frontend-http.openshift-monitoring.svc.cluster.local:9090/api/v1/query"
 services:
   - order: 1
-    name: "Pass"
+    name: "PaaS"
     queries:
-      disruption: 'up{job="node-exporter"}'
-      outage: 'up{job="node-exporter"}'
+      disruption: '(count(cluster:master_nodes) > 0 and count(cluster:master_nodes) < 3) or (sum(etcd_server_has_leader == 1) > 0 and sum(etcd_server_has_leader == 1) < 3)'
+      outage: '(count(cluster:master_nodes) == 0) or (sum(etcd_server_has_leader == 1) == 0)'
   - order: 2
     name: "IaaS"
     queries:
-      disruption: 'up{job="node-exporter"}'
-      outage: 'up{job="node-exporter"}'
+      disruption: '((sum(rate(openstack_cinder_volume_status{status=~"error.*"}[5m]))) and (sum(rate(openstack_nova_server_status{status="UNKNOWN"}[5m])))) > 0'
+      outage: '(openstack_cinder_up and openstack_nova_up) == 0'
   - order: 3
-    name: "Object Storage (S3)"
+    name: "Storage: S3"
     queries:
-      disruption: 'up{job="node-exporter"}'
-      outage: 'up{job="node-exporter"}'
+      disruption: '(sum(rate(radosgw_usage_ops_total[5m])) == 0) and (ceph_health_status == 1)'
+      outage: '(sum(rate(radosgw_usage_ops_total[10m])) == 0) and (ceph_health_status == 2)'
   - order: 4
-    name: "Container Registry"
+    name: "Service LoadBalancer (L4)"
     queries:
-      disruption: 'up{job="node-exporter"}'
-      outage: 'up{job="node-exporter"}'
+      disruption: 'sum by (name) (increase(health_http_requests_total{status_code!="200",name="svc-lb"}[2m])) / on(name) group_left() sum by (name) (increase(health_http_requests_total{name="svc-lb"}[2m])) * 100 > 2.5'
+      outage: 'sum by (name) (increase(health_http_requests_total{status_code!="200",name="svc-lb"}[2m])) / on(name) group_left() sum by (name) (increase(health_http_requests_total{name="svc-lb"}[2m])) * 100 > 10'
   - order: 5
-    name: "Service LoadBalancer (L4)"
+    name: "Ingress (L7)"
     queries:
-      disruption: 'up{job="node-exporter"}'
-      outage: 'up{job="node-exporter"}'
+      disruption: 'sum by (name) (increase(health_http_requests_total{status_code!="200",name=~"(inter-dc|public|private).*"}[2m])) / on(name) group_left() sum by (name) (increase(health_http_requests_total{name=~"(inter-dc|public|private).*"}[2m])) * 100 > 2.5'
+      outage: 'sum by (name) (increase(health_http_requests_total{status_code!="200",name=~"(inter-dc|public|private).*"}[2m])) / on(name) group_left() sum by (name) (increase(health_http_requests_total{name=~"(inter-dc|public|private).*"}[2m])) * 100 > 10'
   - order: 6
-    name: "Ingress (L7)"
+    name: "Monitoring"
     queries:
-      disruption: 'up{job="node-exporter"}'
-      outage: 'up{job="node-exporter"}'
+      disruption: 'sum(rate(container_cpu_usage_seconds_total{job="kubelet", namespace=~".*-monitoring"}[5m])) > 50 or sum(rate(container_cpu_usage_seconds_total{job="kubelet", namespace=~".*-monitoring"}[5m])) < 30'
+      outage: 'up{job=~"monitoring/default"} == 0'
   - order: 7
-    name: "Proxy"
+    name: "Logging"
     queries:
-      disruption: 'up{job="node-exporter"}'
-      outage: 'up{job="node-exporter"}'
+      disruption: 'sum(rate(loki_request_duration_seconds_count{status_code!="200"}[5m])) / sum(rate(loki_request_duration_seconds_count{status_code="200"}[5m])) < 100 and sum(rate(loki_request_duration_seconds_count{status_code!="200"}[5m])) / sum(rate(loki_request_duration_seconds_count{status_code="200"}[5m])) > 350'
+      outage: 'sum(rate(loki_request_duration_seconds_count{status_code!="200"}[5m])) / sum(rate(loki_request_duration_seconds_count{status_code="200"}[5m])) < 76 and sum(rate(loki_request_duration_seconds_count{status_code!="200"}[5m])) / sum(rate(loki_request_duration_seconds_count{status_code="200"}[5m])) > 375'
   - order: 8
-    name: "Monitoring"
+    name: "Traffic observability: Hubble"
     queries:
-      disruption: 'up{job="node-exporter"}'
-      outage: 'up{job="node-exporter"}'
+      disruption: 'sum(rate(hubble_flows_processed_total[2m])) < 100000'
+      outage: 'sum (rate(hubble_flows_processed_total[2m])) < 1000'
   - order: 9
-    name: "Logging"
+    name: "ArgoCD"
     queries:
-      disruption: 'up{job="node-exporter"}'
-      outage: 'up{job="node-exporter"}'
-  - order: 10
-    name: "Traffic observability (Hubble)"
-    queries:
-      disruption: 'up{job="node-exporter"}'
-      outage: 'up{job="node-exporter"}'
+      disruption: 'sum(argocd_app_info{health_status="Progressing"}) >= (count(argocd_app_info)/2)'
+      outage: 'argocd_cluster_info == 0'
+
diff --git a/deployments/server/snappgroup-teh-1.yaml b/deployments/server/snappgroup-teh-1.yaml
index d7e4002..5efc0fc 100644
--- a/deployments/server/snappgroup-teh-1.yaml
+++ b/deployments/server/snappgroup-teh-1.yaml
@@ -6,22 +6,48 @@ configs:
   thanos_frontend: "http://thanos-query-frontend-http.openshift-monitoring.svc.cluster.local:9090/api/v1/query"
 services:
   - order: 1
-    name: "Container Registry"
+    name: "PaaS"
     queries:
-      disruption: 'up{job="node-exporter"}'
-      outage: 'up{job="node-exporter"}'
+      disruption: '(count(cluster:master_nodes) > 0 and count(cluster:master_nodes) < 3) or (sum(etcd_server_has_leader == 1) > 0 and sum(etcd_server_has_leader == 1) < 3)'
+      outage: '(count(cluster:master_nodes) == 0) or (sum(etcd_server_has_leader == 1) == 0)'
   - order: 2
-    name: "Service LoadBalancer (L4)"
+    name: "IaaS"
     queries:
-      disruption: 'up{job="node-exporter"}'
-      outage: 'up{job="node-exporter"}'
+      disruption: '((sum(rate(openstack_cinder_volume_status{status=~"error.*"}[5m]))) and (sum(rate(openstack_nova_server_status{status="UNKNOWN"}[5m])))) > 0'
+      outage: '(openstack_cinder_up and openstack_nova_up) == 0'
   - order: 3
-    name: "Ingress (L7)"
+    name: "Storage: S3"
     queries:
-      disruption: 'up{job="node-exporter"}'
-      outage: 'up{job="node-exporter"}'
+      disruption: '(sum(rate(radosgw_usage_ops_total[5m])) == 0) and (ceph_health_status == 1)'
+      outage: '(sum(rate(radosgw_usage_ops_total[10m])) == 0) and (ceph_health_status == 2)'
   - order: 4
-    name: "Proxy"
+    name: "Service LoadBalancer (L4)"
+    queries:
+      disruption: 'sum by (name) (increase(health_http_requests_total{status_code!="200",name="svc-lb"}[2m])) / on(name) group_left() sum by (name) (increase(health_http_requests_total{name="svc-lb"}[2m])) * 100 > 2.5'
+      outage: 'sum by (name) (increase(health_http_requests_total{status_code!="200",name="svc-lb"}[2m])) / on(name) group_left() sum by (name) (increase(health_http_requests_total{name="svc-lb"}[2m])) * 100 > 10'
+  - order: 5
+    name: "Ingress (L7)"
     queries:
-      disruption: 'up{job="node-exporter"}'
-      outage: 'up{job="node-exporter"}'
+      disruption: 'sum by (name) (increase(health_http_requests_total{status_code!="200",name=~"(inter-dc|public|private).*"}[2m])) / on(name) group_left() sum by (name) (increase(health_http_requests_total{name=~"(inter-dc|public|private).*"}[2m])) * 100 > 2.5'
+      outage: 'sum by (name) (increase(health_http_requests_total{status_code!="200",name=~"(inter-dc|public|private).*"}[2m])) / on(name) group_left() sum by (name) (increase(health_http_requests_total{name=~"(inter-dc|public|private).*"}[2m])) * 100 > 10'
+  - order: 6
+    name: "Monitoring"
+    queries:
+      disruption: 'sum(rate(container_cpu_usage_seconds_total{job="kubelet", namespace=~".*-monitoring"}[5m])) > 50 or sum(rate(container_cpu_usage_seconds_total{job="kubelet", namespace=~".*-monitoring"}[5m])) < 30'
+      outage: 'up{job=~"monitoring/default"} == 0'
+  - order: 7
+    name: "Logging"
+    queries:
+      disruption: 'sum(rate(loki_request_duration_seconds_count{status_code!="200"}[5m])) / sum(rate(loki_request_duration_seconds_count{status_code="200"}[5m])) < 100 and sum(rate(loki_request_duration_seconds_count{status_code!="200"}[5m])) / sum(rate(loki_request_duration_seconds_count{status_code="200"}[5m])) > 350'
+      outage: 'sum(rate(loki_request_duration_seconds_count{status_code!="200"}[5m])) / sum(rate(loki_request_duration_seconds_count{status_code="200"}[5m])) < 76 and sum(rate(loki_request_duration_seconds_count{status_code!="200"}[5m])) / sum(rate(loki_request_duration_seconds_count{status_code="200"}[5m])) > 375'
+  - order: 8
+    name: "Traffic observability: Hubble"
+    queries:
+      disruption: 'sum(rate(hubble_flows_processed_total[2m])) < 100000'
+      outage: 'sum (rate(hubble_flows_processed_total[2m])) < 1000'
+  - order: 9
+    name: "ArgoCD"
+    queries:
+      disruption: 'sum(argocd_app_info{health_status="Progressing"}) >= (count(argocd_app_info)/2)'
+      outage: 'argocd_cluster_info == 0'
+