Skip to content

Commit

Permalink
Combine some queries and add other services
Browse files Browse the repository at this point in the history
  • Loading branch information
Mahsa committed Sep 9, 2023
1 parent 3a26f69 commit 8f75e15
Showing 1 changed file with 37 additions and 22 deletions.
59 changes: 37 additions & 22 deletions deployments/server/okd4-teh-1.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -6,50 +6,65 @@ configs:
thanos_frontend: "http://thanos-query-frontend-http.openshift-monitoring.svc.cluster.local:9090/api/v1/query"
services:
# Status entry 1 (platform control plane).
# NOTE(review): `name`, `disruption` and `outage` each appear twice below —
# presumably the removed and added sides of the commit diff rendered without
# +/- markers; confirm which set is current before reusing this text.
# The combined disruption query matches when either count(cluster:master_nodes)
# or sum(etcd_server_has_leader == 1) is greater than 0 and less than 3; the
# combined outage query compares the same two aggregates with 0.
# NOTE(review): count()/sum() over an empty vector return no series rather
# than 0, so the `== 0` outage checks may never match — verify.
- order: 1
name: "PaaS: master-nodes"
name: "PaaS"
queries:
disruption: 'count(cluster:master_nodes) <3 and count(cluster:master_nodes) > 0'
outage: 'count(cluster:master_nodes) == 0'
disruption: '(count(cluster:master_nodes) > 0 and count(cluster:master_nodes) < 3) or (sum(etcd_server_has_leader == 1) > 0 and sum(etcd_server_has_leader == 1) < 3)'
outage: '(count(cluster:master_nodes) == 0) or (sum(etcd_server_has_leader == 1) == 0)'
# Status entry 2.
# NOTE(review): two names ("PaaS: etcd-servers" and "IaaS") and two pairs of
# queries appear below — presumably old and new diff sides; confirm.
# The IaaS disruption query sums, per (job, instance_id), the 5m increase() of
# openstack_nova_server_status{status="ACTIVE"} and matches when it is > 0.
# NOTE(review): the IaaS outage string ends with a space inside the quotes.
- order: 2
name: "PaaS: etcd-servers"
name: "IaaS"
queries:
disruption: 'sum(etcd_server_has_leader == 1) < 3 and sum(etcd_server_has_leader == 1) > 0'
outage: 'sum(etcd_server_has_leader == 1) == 0'
disruption: 'sum by (job, instance_id) (increase(openstack_nova_server_status{status="ACTIVE"}[5m])) > 0'
outage: 'sum(openstack_nova_up == 0) > 0 '
# Routes status entry.
# disruption: per router_name, the summed 1h changes() of
#   openshift_route_status{status="True"} is greater than 1.
# outage: per router_name, the count of those series equals 0.
- order: 3
  name: "Routes"
  queries:
    disruption: 'sum by (router_name) (changes(openshift_route_status{status="True"}[1h])) > 1'
    outage: 'count by (router_name) (openshift_route_status{status="True"}) == 0'
# Status entry 4.
# NOTE(review): the lines below contain an "IaaS" entry, a second `- order: 5`
# marker and two names ("Object Storage (S3)" / "Storage") — presumably diff
# residue from renumbering and renaming; confirm which lines are current.
# The final disruption query matches when the 5m rate() of
# ceph_osd_recovery_ops is > 0 (its string ends with a space inside the quotes).
# NOTE(review): absent() yields either an empty result or a single sample of
# value 1, so `absent(...) == 0` looks like it can never match — verify the
# intended outage condition.
- order: 4
name: "IaaS"
queries:
disruption: 'sum by (job, instance_id) (increase(openstack_nova_server_status{status="ACTIVE"}[5m])) > 0'
outage: 'sum(openstack_nova_up == 0) > 0 '
- order: 5
name: "Object Storage (S3)"
name: "Storage"
queries:
disruption: 'rate(ceph_osd_recovery_ops[5m]) > 0 '
outage: 'absent(ceph_health_status==1) == 0'
# Container registry status entry.
# NOTE(review): two `- order:` markers (6 and 5) and two names appear below —
# presumably the old and new diff sides; confirm which is current.
# disruption: 5m rate() of harbor_core_http_request_total is above 100.
# outage: avg of harbor_up per cluster_id equals 0.
- order: 6
name: "Container Registry: Harbor"
- order: 5
name: "Container Registry"
queries:
disruption: 'rate(harbor_core_http_request_total[5m]) > 100'
outage: '(avg by (cluster_id) (harbor_up)) == 0'
# Service LoadBalancer (L4) status entry.
# Carries a fixed `status` value and empty query strings.
# NOTE(review): presumably the fixed status is shown because no query is
# defined — confirm against the consumer.
- order: 6
  name: "Service LoadBalancer (L4)"
  status: 'Available'
  queries:
    disruption: ''
    outage: ''
# Ingress (L7) status entry.
# disruption: the summed 1h changes() of contour_httpproxy_valid is lower than
#   the summed 1h changes() of contour_httpproxy_invalid.
# outage: envoy_cluster_upstream_rq_xx for envoy_cluster_name="primary" is 0.
- order: 7
  name: "Ingress (L7)"
  queries:
    disruption: 'sum(changes(contour_httpproxy_valid[1h])) < sum(changes(contour_httpproxy_invalid[1h]))'
    outage: '(envoy_cluster_upstream_rq_xx{envoy_cluster_name="primary"}) == 0'
# Status entry 8.
# NOTE(review): two names ("Monitoring: Thanos" / "Proxy") and two pairs of
# queries appear below — presumably old and new diff sides; confirm.
# The "Proxy" lines carry a fixed `status` value and empty query strings.
- order: 8
name: "Monitoring: Thanos"
name: "Proxy"
status: 'Available'
queries:
disruption: 'sum(thanos_status{check="healthy"}) < (count(thanos_status)/2)'
outage: 'sum(thanos_status{check="healthy"}) == 0'
disruption: ''
outage: ''
# Monitoring status entry.
# NOTE(review): two names and two pairs of queries appear below — presumably
# old ("Monitoring: Prometheus") and new ("Monitoring") diff sides; confirm.
# The combined disruption query matches when healthy thanos_status sums to less
# than half of count(thanos_status), or when the 5m rate() of
# prometheus_http_requests_total with a 5xx code is > 0.
# The combined outage query matches when the healthy thanos_status sum is 0 or
# up{job="prometheus-operator"} is 0.
- order: 9
name: "Monitoring: Prometheus"
name: "Monitoring"
queries:
disruption: 'rate(prometheus_http_requests_total{code=~"5.."}[5m]) > 0'
outage: 'up{job="prometheus-operator"} == 0'

disruption: '(sum(thanos_status{check="healthy"}) < (count(thanos_status)/2)) or (rate(prometheus_http_requests_total{code=~"5.."}[5m]) > 0)'
outage: '(sum(thanos_status{check="healthy"}) == 0) or (up{job="prometheus-operator"} == 0)'
# Logging status entry.
# disruption: the summed 5m increase() of
#   logging_resource_state{status="inactive"} is greater than 0. increase() is
#   never negative, so a `< 0` comparison could not match; `> 0` is used.
# outage: up{namespace="snappcloud-logging"} equals 0.
- order: 10
  name: "Logging"
  queries:
    disruption: 'sum(increase(logging_resource_state{status="inactive"}[5m])) > 0'
    outage: 'up{namespace="snappcloud-logging"} == 0'
# Hubble traffic-observability status entry.
# Both queries sum hubble_flows_processed_total with the verdict="DROPPED"
# series excluded via `unless`.
# disruption: that sum is lower than sum(hubble_drop_total).
# outage: that sum equals 0.
# The folded scalars (>-) join their lines with single spaces, giving
# one-line query strings.
- order: 11
  name: "Traffic observability: Hubble"
  queries:
    disruption: >-
      sum(sum by (verdict) ((hubble_flows_processed_total)
      unless hubble_flows_processed_total{verdict="DROPPED"}))
      < sum(hubble_drop_total)
    outage: >-
      sum(sum by (verdict) ((hubble_flows_processed_total)
      unless hubble_flows_processed_total{verdict="DROPPED"}))
      == 0
# ArgoCD status entry.
# disruption: the sum of argocd_app_info{health_status="Progressing"} is at
#   least half of count(argocd_app_info).
# outage: argocd_cluster_info equals 0.
# NOTE(review): argocd_cluster_info looks like an info-style metric that may
# always be 1; argocd_cluster_connection_status may be the intended signal —
# confirm against the Argo CD metrics documentation.
- order: 12
  name: "ArgoCD"
  queries:
    disruption: 'sum(argocd_app_info{health_status="Progressing"}) >= (count(argocd_app_info)/2)'
    outage: 'argocd_cluster_info == 0'


0 comments on commit 8f75e15

Please sign in to comment.