diff --git a/deployments/server/okd4-teh-1.yaml b/deployments/server/okd4-teh-1.yaml
index b4d77f4..454b1ca 100644
--- a/deployments/server/okd4-teh-1.yaml
+++ b/deployments/server/okd4-teh-1.yaml
@@ -6,50 +6,66 @@ configs:
   thanos_frontend: "http://thanos-query-frontend-http.openshift-monitoring.svc.cluster.local:9090/api/v1/query"
 services:
   - order: 1
-    name: "PaaS: master-nodes"
+    name: "PaaS"
     queries:
-      disruption: 'count(cluster:master_nodes) <3 and count(cluster:master_nodes) > 0'
-      outage: 'count(cluster:master_nodes) == 0'
+      disruption: '(count(cluster:master_nodes) > 0 and count(cluster:master_nodes) < 3) or (sum(etcd_server_has_leader == 1) > 0 and sum(etcd_server_has_leader == 1) < 3)'
+      outage: '(count(cluster:master_nodes) == 0) or (sum(etcd_server_has_leader == 1) == 0)'
   - order: 2
-    name: "PaaS: etcd-servers"
+    name: "IaaS"
     queries:
-      disruption: 'sum(etcd_server_has_leader == 1) < 3 and sum(etcd_server_has_leader == 1) > 0'
-      outage: 'sum(etcd_server_has_leader == 1) == 0'
+      disruption: 'sum by (job, instance_id) (increase(openstack_nova_server_status{status="ACTIVE"}[5m])) > 0'
+      outage: 'sum(openstack_nova_up == 0) > 0 '
   - order: 3
     name: "Routes"
     queries:
       disruption: 'sum by (router_name) (changes(openshift_route_status{status="True"}[1h])) > 1'
       outage: 'count by (router_name) (openshift_route_status{status="True"}) == 0'
   - order: 4
-    name: "IaaS"
-    queries:
-      disruption: 'sum by (job, instance_id) (increase(openstack_nova_server_status{status="ACTIVE"}[5m])) > 0'
-      outage: 'sum(openstack_nova_up == 0) > 0 '
-  - order: 5
-    name: "Object Storage (S3)"
+    name: "Storage"
     queries:
       disruption: 'rate(ceph_osd_recovery_ops[5m]) > 0 '
       outage: 'absent(ceph_health_status==1) == 0'
-  - order: 6
-    name: "Container Registry: Harbor"
+  - order: 5
+    name: "Container Registry"
     queries:
       disruption: 'rate(harbor_core_http_request_total[5m]) > 100'
       outage: '(avg by (cluster_id) (harbor_up)) == 0'
+  - order: 6
+    name: "Service LoadBalancer (L4)"
+    status: 'Available'
+    queries:
+      disruption: ''
+      outage: ''
   - order: 7
     name: "Ingress (L7)"
     queries:
       disruption: 'sum(changes(contour_httpproxy_valid[1h])) < sum(changes(contour_httpproxy_invalid[1h]))'
       outage: '(envoy_cluster_upstream_rq_xx{envoy_cluster_name="primary"}) == 0'
   - order: 8
-    name: "Monitoring: Thanos"
+    name: "Proxy"
+    status: 'Available'
     queries:
-      disruption: 'sum(thanos_status{check="healthy"}) < (count(thanos_status)/2)'
-      outage: 'sum(thanos_status{check="healthy"}) == 0'
+      disruption: ''
+      outage: ''
   - order: 9
-    name: "Monitoring: Prometheus"
+    name: "Monitoring"
     queries:
-      disruption: 'rate(prometheus_http_requests_total{code=~"5.."}[5m]) > 0'
-      outage: 'up{job="prometheus-operator"} == 0'
-
+      disruption: '(sum(thanos_status{check="healthy"}) < (count(thanos_status)/2)) or (rate(prometheus_http_requests_total{code=~"5.."}[5m]) > 0)'
+      outage: '(sum(thanos_status{check="healthy"}) == 0) or (up{job="prometheus-operator"} == 0)'
+  - order: 10
+    name: "Logging"
+    queries:
+      # increase() is never negative, so flag any growth in inactive resources (> 0).
+      disruption: 'sum(increase(logging_resource_state{status="inactive"}[5m])) > 0'
+      outage: 'up{namespace="snappcloud-logging"} == 0'
+  - order: 11
+    name: "Traffic observability: Hubble"
+    queries:
+      disruption: 'sum(sum by (verdict) ((hubble_flows_processed_total) unless hubble_flows_processed_total{verdict="DROPPED"})) < sum(hubble_drop_total)'
+      outage: 'sum(sum by (verdict) ((hubble_flows_processed_total) unless hubble_flows_processed_total{verdict="DROPPED"})) == 0'
+  - order: 12
+    name: "ArgoCD"
+    queries:
+      disruption: 'sum(argocd_app_info{health_status="Progressing"}) >= (count(argocd_app_info)/2)'
+      outage: 'argocd_cluster_info == 0'
 
-  
\ No newline at end of file