Skip to content

Commit

Permalink
Merge pull request #3 from snapp-incubator/cld4638-queries
Browse files (browse the repository at this point in the history)
New queries for status page on all regions
  • Loading branch information
divergentluna authored Oct 16, 2023
2 parents 4ccc420 + 2b31398 commit 9005990
Show file tree
Hide file tree
Showing 3 changed files with 91 additions and 83 deletions.
66 changes: 26 additions & 40 deletions deployments/server/okd4-teh-1.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -6,62 +6,48 @@ configs:
thanos_frontend: "http://thanos-query-frontend-http.openshift-monitoring.svc.cluster.local:9090/api/v1/query"
# Status-page service list for okd4-teh-1.yaml as rendered in the commit diff.
# The view carries no +/- markers, so replaced and replacement lines sit next
# to each other; the repeated up{job="node-exporter"} queries look like the
# pre-change placeholders (NOTE(review): confirm against the raw diff).
services:
# PaaS: disruption when count(cluster:master_nodes) or
# sum(etcd_server_has_leader == 1) is above 0 and below 3; outage compares the
# same aggregates with 0.
# NOTE(review): PromQL aggregations over an empty vector return no series, so
# the "== 0" outage branches may never match - confirm.
- order: 1
name: "Pass"
name: "PaaS"
queries:
disruption: 'up{job="node-exporter"}'
outage: 'up{job="node-exporter"}'
disruption: '(count(cluster:master_nodes) > 0 and count(cluster:master_nodes) < 3) or (sum(etcd_server_has_leader == 1) > 0 and sum(etcd_server_has_leader == 1) < 3)'
outage: '(count(cluster:master_nodes) == 0) or (sum(etcd_server_has_leader == 1) == 0)'
# IaaS: disruption pairs the 5m rate of cinder volumes with status error.* and
# the 5m rate of nova servers with status UNKNOWN, compared with > 0; outage
# pairs openstack_cinder_up with openstack_nova_up and compares with 0.
# NOTE(review): "and" keeps the left-hand sample values, so only the cinder
# side is actually compared against the threshold - confirm this is intended.
- order: 2
name: "IaaS"
queries:
disruption: 'up{job="node-exporter"}'
outage: 'up{job="node-exporter"}'
disruption: '((sum(rate(openstack_cinder_volume_status{status=~"error.*"}[5m]))) and (sum(rate(openstack_nova_server_status{status="UNKNOWN"}[5m])))) > 0'
outage: '(openstack_cinder_up and openstack_nova_up) == 0'
# Storage: S3: disruption when the summed 5m rate of radosgw_usage_ops_total is
# 0 and ceph_health_status == 1; outage uses a 10m window and
# ceph_health_status == 2.
# NOTE(review): the left side has no labels after sum(); without "and on()" it
# only matches if ceph_health_status is also label-free - confirm.
- order: 3
name: "Object Storage (S3)"
name: "Storage: S3"
queries:
disruption: 'up{job="node-exporter"}'
outage: 'up{job="node-exporter"}'
disruption: '(sum(rate(radosgw_usage_ops_total[5m])) == 0) and (ceph_health_status == 1)'
outage: '(sum(rate(radosgw_usage_ops_total[10m])) == 0) and (ceph_health_status == 2)'
# Service LoadBalancer (L4): percentage of health_http_requests_total increase
# over 2m with status_code != "200" for name="svc-lb"; disruption above 2.5,
# outage above 10.
- order: 4
name: "Container Registry"
name: "Service LoadBalancer (L4)"
queries:
disruption: 'up{job="node-exporter"}'
outage: 'up{job="node-exporter"}'
disruption: 'sum by (name) (increase(health_http_requests_total{status_code!="200",name="svc-lb"}[2m])) / on(name) group_left() sum by (name) (increase(health_http_requests_total{name="svc-lb"}[2m])) * 100 > 2.5'
outage: 'sum by (name) (increase(health_http_requests_total{status_code!="200",name="svc-lb"}[2m])) / on(name) group_left() sum by (name) (increase(health_http_requests_total{name="svc-lb"}[2m])) * 100 > 10'
# Ingress (L7): same non-200 percentage per name, for names matching
# (inter-dc|public|private).*; disruption above 2.5, outage above 10.
- order: 5
name: "Service LoadBalancer (L4)"
name: "Ingress (L7)"
queries:
disruption: 'up{job="node-exporter"}'
outage: 'up{job="node-exporter"}'
disruption: 'sum by (name) (increase(health_http_requests_total{status_code!="200",name=~"(inter-dc|public|private).*"}[2m])) / on(name) group_left() sum by (name) (increase(health_http_requests_total{name=~"(inter-dc|public|private).*"}[2m])) * 100 > 2.5'
outage: 'sum by (name) (increase(health_http_requests_total{status_code!="200",name=~"(inter-dc|public|private).*"}[2m])) / on(name) group_left() sum by (name) (increase(health_http_requests_total{name=~"(inter-dc|public|private).*"}[2m])) * 100 > 10'
# Monitoring: disruption when the summed 5m rate of
# container_cpu_usage_seconds_total (job="kubelet", namespace =~ .*-monitoring)
# is above 50 or below 30; outage when up{job=~"monitoring/default"} is 0.
- order: 6
name: "Ingress (L7)"
name: "Monitoring"
queries:
disruption: 'up{job="node-exporter"}'
outage: 'up{job="node-exporter"}'
disruption: 'sum(rate(container_cpu_usage_seconds_total{job="kubelet", namespace=~".*-monitoring"}[5m])) > 50 or sum(rate(container_cpu_usage_seconds_total{job="kubelet", namespace=~".*-monitoring"}[5m])) < 30'
outage: 'up{job=~"monitoring/default"} == 0'
# Logging: ratio of the 5m rate of loki_request_duration_seconds_count with
# status_code != "200" to the rate with status_code = "200".
# NOTE(review): both sides of "and" test the same ratio (< 100 and > 350 for
# disruption, < 76 and > 375 for outage); one value cannot satisfy both, so
# these look like they can never match - confirm the operator and thresholds.
- order: 7
name: "Proxy"
name: "Logging"
queries:
disruption: 'up{job="node-exporter"}'
outage: 'up{job="node-exporter"}'
disruption: 'sum(rate(loki_request_duration_seconds_count{status_code!="200"}[5m])) / sum(rate(loki_request_duration_seconds_count{status_code="200"}[5m])) < 100 and sum(rate(loki_request_duration_seconds_count{status_code!="200"}[5m])) / sum(rate(loki_request_duration_seconds_count{status_code="200"}[5m])) > 350'
outage: 'sum(rate(loki_request_duration_seconds_count{status_code!="200"}[5m])) / sum(rate(loki_request_duration_seconds_count{status_code="200"}[5m])) < 76 and sum(rate(loki_request_duration_seconds_count{status_code!="200"}[5m])) / sum(rate(loki_request_duration_seconds_count{status_code="200"}[5m])) > 375'
# Traffic observability: Hubble: summed 2m rate of hubble_flows_processed_total;
# disruption below 100000, outage below 1000.
- order: 8
name: "Monitoring"
name: "Traffic observability: Hubble"
queries:
disruption: 'up{job="node-exporter"}'
outage: 'up{job="node-exporter"}'
disruption: 'sum(rate(hubble_flows_processed_total[2m])) < 100000'
outage: 'sum (rate(hubble_flows_processed_total[2m])) < 1000'
# Order 9 onward: the lines below appear to mix the replaced entries (orders
# 9-12 with placeholder queries) with the ArgoCD entry, whose queries come
# last: disruption when the sum of argocd_app_info{health_status="Progressing"}
# is at least half of count(argocd_app_info); outage when
# argocd_cluster_info == 0. NOTE(review): confirm against the raw diff.
- order: 9
name: "Logging"
queries:
disruption: 'up{job="node-exporter"}'
outage: 'up{job="node-exporter"}'
- order: 10
name: "Traffic observability (Hubble)"
queries:
disruption: 'up{job="node-exporter"}'
outage: 'up{job="node-exporter"}'
- order: 11
name: "ArgoCD"
queries:
disruption: 'up{job="node-exporter"}'
outage: 'up{job="node-exporter"}'
- order: 12
name: "ArgoWF"
queries:
disruption: 'up{job="node-exporter"}'
outage: 'up{job="node-exporter"}'
disruption: 'sum(argocd_app_info{health_status="Progressing"}) >= (count(argocd_app_info)/2)'
outage: 'argocd_cluster_info == 0'

58 changes: 27 additions & 31 deletions deployments/server/okd4-teh-2.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -6,52 +6,48 @@ configs:
thanos_frontend: "http://thanos-query-frontend-http.openshift-monitoring.svc.cluster.local:9090/api/v1/query"
# Status-page service list for okd4-teh-2.yaml as rendered in the commit diff.
# The view carries no +/- markers, so replaced and replacement lines sit next
# to each other; the repeated up{job="node-exporter"} queries look like the
# pre-change placeholders (NOTE(review): confirm against the raw diff).
services:
# PaaS: disruption when count(cluster:master_nodes) or
# sum(etcd_server_has_leader == 1) is above 0 and below 3; outage compares the
# same aggregates with 0.
# NOTE(review): PromQL aggregations over an empty vector return no series, so
# the "== 0" outage branches may never match - confirm.
- order: 1
name: "Pass"
name: "PaaS"
queries:
disruption: 'up{job="node-exporter"}'
outage: 'up{job="node-exporter"}'
disruption: '(count(cluster:master_nodes) > 0 and count(cluster:master_nodes) < 3) or (sum(etcd_server_has_leader == 1) > 0 and sum(etcd_server_has_leader == 1) < 3)'
outage: '(count(cluster:master_nodes) == 0) or (sum(etcd_server_has_leader == 1) == 0)'
# IaaS: disruption pairs the 5m rate of cinder volumes with status error.* and
# the 5m rate of nova servers with status UNKNOWN, compared with > 0; outage
# pairs openstack_cinder_up with openstack_nova_up and compares with 0.
# NOTE(review): "and" keeps the left-hand sample values, so only the cinder
# side is actually compared against the threshold - confirm this is intended.
- order: 2
name: "IaaS"
queries:
disruption: 'up{job="node-exporter"}'
outage: 'up{job="node-exporter"}'
disruption: '((sum(rate(openstack_cinder_volume_status{status=~"error.*"}[5m]))) and (sum(rate(openstack_nova_server_status{status="UNKNOWN"}[5m])))) > 0'
outage: '(openstack_cinder_up and openstack_nova_up) == 0'
# Storage: S3: disruption when the summed 5m rate of radosgw_usage_ops_total is
# 0 and ceph_health_status == 1; outage uses a 10m window and
# ceph_health_status == 2.
# NOTE(review): the left side has no labels after sum(); without "and on()" it
# only matches if ceph_health_status is also label-free - confirm.
- order: 3
name: "Object Storage (S3)"
name: "Storage: S3"
queries:
disruption: 'up{job="node-exporter"}'
outage: 'up{job="node-exporter"}'
disruption: '(sum(rate(radosgw_usage_ops_total[5m])) == 0) and (ceph_health_status == 1)'
outage: '(sum(rate(radosgw_usage_ops_total[10m])) == 0) and (ceph_health_status == 2)'
# Service LoadBalancer (L4): percentage of health_http_requests_total increase
# over 2m with status_code != "200" for name="svc-lb"; disruption above 2.5,
# outage above 10.
- order: 4
name: "Container Registry"
name: "Service LoadBalancer (L4)"
queries:
disruption: 'up{job="node-exporter"}'
outage: 'up{job="node-exporter"}'
disruption: 'sum by (name) (increase(health_http_requests_total{status_code!="200",name="svc-lb"}[2m])) / on(name) group_left() sum by (name) (increase(health_http_requests_total{name="svc-lb"}[2m])) * 100 > 2.5'
outage: 'sum by (name) (increase(health_http_requests_total{status_code!="200",name="svc-lb"}[2m])) / on(name) group_left() sum by (name) (increase(health_http_requests_total{name="svc-lb"}[2m])) * 100 > 10'
# Ingress (L7): same non-200 percentage per name, for names matching
# (inter-dc|public|private).*; disruption above 2.5, outage above 10.
- order: 5
name: "Service LoadBalancer (L4)"
name: "Ingress (L7)"
queries:
disruption: 'up{job="node-exporter"}'
outage: 'up{job="node-exporter"}'
disruption: 'sum by (name) (increase(health_http_requests_total{status_code!="200",name=~"(inter-dc|public|private).*"}[2m])) / on(name) group_left() sum by (name) (increase(health_http_requests_total{name=~"(inter-dc|public|private).*"}[2m])) * 100 > 2.5'
outage: 'sum by (name) (increase(health_http_requests_total{status_code!="200",name=~"(inter-dc|public|private).*"}[2m])) / on(name) group_left() sum by (name) (increase(health_http_requests_total{name=~"(inter-dc|public|private).*"}[2m])) * 100 > 10'
# Monitoring: disruption when the summed 5m rate of
# container_cpu_usage_seconds_total (job="kubelet", namespace =~ .*-monitoring)
# is above 50 or below 30; outage when up{job=~"monitoring/default"} is 0.
- order: 6
name: "Ingress (L7)"
name: "Monitoring"
queries:
disruption: 'up{job="node-exporter"}'
outage: 'up{job="node-exporter"}'
disruption: 'sum(rate(container_cpu_usage_seconds_total{job="kubelet", namespace=~".*-monitoring"}[5m])) > 50 or sum(rate(container_cpu_usage_seconds_total{job="kubelet", namespace=~".*-monitoring"}[5m])) < 30'
outage: 'up{job=~"monitoring/default"} == 0'
# Logging: ratio of the 5m rate of loki_request_duration_seconds_count with
# status_code != "200" to the rate with status_code = "200".
# NOTE(review): both sides of "and" test the same ratio (< 100 and > 350 for
# disruption, < 76 and > 375 for outage); one value cannot satisfy both, so
# these look like they can never match - confirm the operator and thresholds.
- order: 7
name: "Proxy"
name: "Logging"
queries:
disruption: 'up{job="node-exporter"}'
outage: 'up{job="node-exporter"}'
disruption: 'sum(rate(loki_request_duration_seconds_count{status_code!="200"}[5m])) / sum(rate(loki_request_duration_seconds_count{status_code="200"}[5m])) < 100 and sum(rate(loki_request_duration_seconds_count{status_code!="200"}[5m])) / sum(rate(loki_request_duration_seconds_count{status_code="200"}[5m])) > 350'
outage: 'sum(rate(loki_request_duration_seconds_count{status_code!="200"}[5m])) / sum(rate(loki_request_duration_seconds_count{status_code="200"}[5m])) < 76 and sum(rate(loki_request_duration_seconds_count{status_code!="200"}[5m])) / sum(rate(loki_request_duration_seconds_count{status_code="200"}[5m])) > 375'
# Traffic observability: Hubble: summed 2m rate of hubble_flows_processed_total;
# disruption below 100000, outage below 1000.
- order: 8
name: "Monitoring"
name: "Traffic observability: Hubble"
queries:
disruption: 'up{job="node-exporter"}'
outage: 'up{job="node-exporter"}'
disruption: 'sum(rate(hubble_flows_processed_total[2m])) < 100000'
outage: 'sum (rate(hubble_flows_processed_total[2m])) < 1000'
# Order 9 onward: the lines below appear to mix the replaced entries (orders
# 9-10 with placeholder queries) with the ArgoCD entry, whose queries come
# last: disruption when the sum of argocd_app_info{health_status="Progressing"}
# is at least half of count(argocd_app_info); outage when
# argocd_cluster_info == 0. NOTE(review): confirm against the raw diff.
- order: 9
name: "Logging"
name: "ArgoCD"
queries:
disruption: 'up{job="node-exporter"}'
outage: 'up{job="node-exporter"}'
- order: 10
name: "Traffic observability (Hubble)"
queries:
disruption: 'up{job="node-exporter"}'
outage: 'up{job="node-exporter"}'
disruption: 'sum(argocd_app_info{health_status="Progressing"}) >= (count(argocd_app_info)/2)'
outage: 'argocd_cluster_info == 0'

50 changes: 38 additions & 12 deletions deployments/server/snappgroup-teh-1.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -6,22 +6,48 @@ configs:
thanos_frontend: "http://thanos-query-frontend-http.openshift-monitoring.svc.cluster.local:9090/api/v1/query"
# Status-page service list for snappgroup-teh-1.yaml as rendered in the commit
# diff. The view carries no +/- markers, so replaced and replacement lines sit
# next to each other; the repeated up{job="node-exporter"} queries look like
# the pre-change placeholders (NOTE(review): confirm against the raw diff).
services:
# PaaS: disruption when count(cluster:master_nodes) or
# sum(etcd_server_has_leader == 1) is above 0 and below 3; outage compares the
# same aggregates with 0.
# NOTE(review): PromQL aggregations over an empty vector return no series, so
# the "== 0" outage branches may never match - confirm.
- order: 1
name: "Container Registry"
name: "PaaS"
queries:
disruption: 'up{job="node-exporter"}'
outage: 'up{job="node-exporter"}'
disruption: '(count(cluster:master_nodes) > 0 and count(cluster:master_nodes) < 3) or (sum(etcd_server_has_leader == 1) > 0 and sum(etcd_server_has_leader == 1) < 3)'
outage: '(count(cluster:master_nodes) == 0) or (sum(etcd_server_has_leader == 1) == 0)'
# IaaS: disruption pairs the 5m rate of cinder volumes with status error.* and
# the 5m rate of nova servers with status UNKNOWN, compared with > 0; outage
# pairs openstack_cinder_up with openstack_nova_up and compares with 0.
# NOTE(review): "and" keeps the left-hand sample values, so only the cinder
# side is actually compared against the threshold - confirm this is intended.
- order: 2
name: "Service LoadBalancer (L4)"
name: "IaaS"
queries:
disruption: 'up{job="node-exporter"}'
outage: 'up{job="node-exporter"}'
disruption: '((sum(rate(openstack_cinder_volume_status{status=~"error.*"}[5m]))) and (sum(rate(openstack_nova_server_status{status="UNKNOWN"}[5m])))) > 0'
outage: '(openstack_cinder_up and openstack_nova_up) == 0'
# Storage: S3: disruption when the summed 5m rate of radosgw_usage_ops_total is
# 0 and ceph_health_status == 1; outage uses a 10m window and
# ceph_health_status == 2.
# NOTE(review): the left side has no labels after sum(); without "and on()" it
# only matches if ceph_health_status is also label-free - confirm.
- order: 3
name: "Ingress (L7)"
name: "Storage: S3"
queries:
disruption: 'up{job="node-exporter"}'
outage: 'up{job="node-exporter"}'
disruption: '(sum(rate(radosgw_usage_ops_total[5m])) == 0) and (ceph_health_status == 1)'
outage: '(sum(rate(radosgw_usage_ops_total[10m])) == 0) and (ceph_health_status == 2)'
# Service LoadBalancer (L4): percentage of health_http_requests_total increase
# over 2m with status_code != "200" for name="svc-lb"; disruption above 2.5,
# outage above 10.
- order: 4
name: "Proxy"
name: "Service LoadBalancer (L4)"
queries:
disruption: 'sum by (name) (increase(health_http_requests_total{status_code!="200",name="svc-lb"}[2m])) / on(name) group_left() sum by (name) (increase(health_http_requests_total{name="svc-lb"}[2m])) * 100 > 2.5'
outage: 'sum by (name) (increase(health_http_requests_total{status_code!="200",name="svc-lb"}[2m])) / on(name) group_left() sum by (name) (increase(health_http_requests_total{name="svc-lb"}[2m])) * 100 > 10'
# Ingress (L7): same non-200 percentage per name, for names matching
# (inter-dc|public|private).*; disruption above 2.5, outage above 10.
# The placeholder pair below presumably belongs to the replaced order-4 entry
# (NOTE(review): confirm against the raw diff).
- order: 5
name: "Ingress (L7)"
queries:
disruption: 'up{job="node-exporter"}'
outage: 'up{job="node-exporter"}'
disruption: 'sum by (name) (increase(health_http_requests_total{status_code!="200",name=~"(inter-dc|public|private).*"}[2m])) / on(name) group_left() sum by (name) (increase(health_http_requests_total{name=~"(inter-dc|public|private).*"}[2m])) * 100 > 2.5'
outage: 'sum by (name) (increase(health_http_requests_total{status_code!="200",name=~"(inter-dc|public|private).*"}[2m])) / on(name) group_left() sum by (name) (increase(health_http_requests_total{name=~"(inter-dc|public|private).*"}[2m])) * 100 > 10'
# Monitoring: disruption when the summed 5m rate of
# container_cpu_usage_seconds_total (job="kubelet", namespace =~ .*-monitoring)
# is above 50 or below 30; outage when up{job=~"monitoring/default"} is 0.
- order: 6
name: "Monitoring"
queries:
disruption: 'sum(rate(container_cpu_usage_seconds_total{job="kubelet", namespace=~".*-monitoring"}[5m])) > 50 or sum(rate(container_cpu_usage_seconds_total{job="kubelet", namespace=~".*-monitoring"}[5m])) < 30'
outage: 'up{job=~"monitoring/default"} == 0'
# Logging: ratio of the 5m rate of loki_request_duration_seconds_count with
# status_code != "200" to the rate with status_code = "200".
# NOTE(review): both sides of "and" test the same ratio (< 100 and > 350 for
# disruption, < 76 and > 375 for outage); one value cannot satisfy both, so
# these look like they can never match - confirm the operator and thresholds.
- order: 7
name: "Logging"
queries:
disruption: 'sum(rate(loki_request_duration_seconds_count{status_code!="200"}[5m])) / sum(rate(loki_request_duration_seconds_count{status_code="200"}[5m])) < 100 and sum(rate(loki_request_duration_seconds_count{status_code!="200"}[5m])) / sum(rate(loki_request_duration_seconds_count{status_code="200"}[5m])) > 350'
outage: 'sum(rate(loki_request_duration_seconds_count{status_code!="200"}[5m])) / sum(rate(loki_request_duration_seconds_count{status_code="200"}[5m])) < 76 and sum(rate(loki_request_duration_seconds_count{status_code!="200"}[5m])) / sum(rate(loki_request_duration_seconds_count{status_code="200"}[5m])) > 375'
# Traffic observability: Hubble: summed 2m rate of hubble_flows_processed_total;
# disruption below 100000, outage below 1000.
- order: 8
name: "Traffic observability: Hubble"
queries:
disruption: 'sum(rate(hubble_flows_processed_total[2m])) < 100000'
outage: 'sum (rate(hubble_flows_processed_total[2m])) < 1000'
# ArgoCD: disruption when the sum of
# argocd_app_info{health_status="Progressing"} is at least half of
# count(argocd_app_info); outage when argocd_cluster_info == 0.
- order: 9
name: "ArgoCD"
queries:
disruption: 'sum(argocd_app_info{health_status="Progressing"}) >= (count(argocd_app_info)/2)'
outage: 'argocd_cluster_info == 0'

0 comments on commit 9005990

Please sign in to comment.