From a9b422c33651f93bc607d4430e40487c58899642 Mon Sep 17 00:00:00 2001 From: Robert Lucian Chiriac Date: Wed, 28 Jul 2021 22:25:33 +0300 Subject: [PATCH 1/5] Reduce labels from kube-state-metrics and kubelet exporters --- manager/manifests/prometheus-kube-state-metrics.yaml | 11 ++++++++++- manager/manifests/prometheus-kubelet-exporter.yaml | 12 ++++++++++++ 2 files changed, 22 insertions(+), 1 deletion(-) diff --git a/manager/manifests/prometheus-kube-state-metrics.yaml b/manager/manifests/prometheus-kube-state-metrics.yaml index 89da6c4842..a42e78a7f2 100644 --- a/manager/manifests/prometheus-kube-state-metrics.yaml +++ b/manager/manifests/prometheus-kube-state-metrics.yaml @@ -266,7 +266,16 @@ spec: job_status_active\ )" - action: labelkeep - regex: (__name__|exported_pod|exported_container|job_name|resource|deployment) + regex: (__name__|pod|job_name|resource|deployment) + - action: drop + regex: (node-exporter|aws-node|fluent-bit|kube-proxy)-(.+) + sourceLabels: + - pod + relabelings: + - sourceLabels: + - exported_pod + action: replace + targetLabel: pod namespaceSelector: any: true selector: diff --git a/manager/manifests/prometheus-kubelet-exporter.yaml b/manager/manifests/prometheus-kubelet-exporter.yaml index 87855746a0..ad8f9fe05a 100644 --- a/manager/manifests/prometheus-kubelet-exporter.yaml +++ b/manager/manifests/prometheus-kubelet-exporter.yaml @@ -49,6 +49,18 @@ spec: )" - action: labelkeep regex: (__name__|pod|container|name) + - action: keep + regex: () + sourceLabels: + - container + - action: keep + regex: () + sourceLabels: + - name + - action: drop + regex: (node-exporter|aws-node|fluent-bit|kube-proxy)-(.+) + sourceLabels: + - pod path: /metrics/cadvisor port: https-metrics relabelings: From fad52c13945d3ed7efd4cc99e338a88c495914ca Mon Sep 17 00:00:00 2001 From: Robert Lucian Chiriac Date: Wed, 28 Jul 2021 22:29:00 +0300 Subject: [PATCH 2/5] Relabel exported_pod to pod --- manager/manifests/prometheus-dcgm-exporter.yaml | 7 ++++++- 1 file 
changed, 6 insertions(+), 1 deletion(-) diff --git a/manager/manifests/prometheus-dcgm-exporter.yaml b/manager/manifests/prometheus-dcgm-exporter.yaml index 96d82a5644..f3eb4232f7 100644 --- a/manager/manifests/prometheus-dcgm-exporter.yaml +++ b/manager/manifests/prometheus-dcgm-exporter.yaml @@ -126,7 +126,12 @@ spec: FB_FREE\ )" - action: labelkeep - regex: (__name__|exported_pod) + regex: (__name__|pod) + relabelings: + - sourceLabels: + - exported_pod + action: replace + targetLabel: pod namespaceSelector: any: true selector: From 90db24e52234a3aa403a55b09f97ae5bb322f924 Mon Sep 17 00:00:00 2001 From: Robert Lucian Chiriac Date: Wed, 28 Jul 2021 22:29:28 +0300 Subject: [PATCH 3/5] Update the Grafana dashboards --- .../grafana/grafana-dashboard-async.yaml | 32 +++++++++---------- .../grafana/grafana-dashboard-batch.yaml | 32 +++++++++---------- .../grafana/grafana-dashboard-cluster.yaml | 2 +- .../grafana/grafana-dashboard-realtime.yaml | 32 +++++++++---------- .../grafana/grafana-dashboard-task.yaml | 32 +++++++++---------- 5 files changed, 65 insertions(+), 65 deletions(-) diff --git a/manager/manifests/grafana/grafana-dashboard-async.yaml b/manager/manifests/grafana/grafana-dashboard-async.yaml index a6c45a1186..5bfe81070a 100644 --- a/manager/manifests/grafana/grafana-dashboard-async.yaml +++ b/manager/manifests/grafana/grafana-dashboard-async.yaml @@ -1086,7 +1086,7 @@ data: "targets": [ { "exemplar": false, - "expr": "sum(rate(container_cpu_usage_seconds_total{pod=~\"api-$api_name.+\", container!=\"POD\", name!=\"\"}[1m]))", + "expr": "sum(rate(container_cpu_usage_seconds_total{pod=~\"api-$api_name.+\"}[1m]))", "format": "time_series", "instant": false, "interval": "", @@ -1095,7 +1095,7 @@ data: }, { "exemplar": true, - "expr": "sum(kube_pod_container_resource_requests{exported_pod=~\"api-$api_name.+\", resource=\"cpu\"})", + "expr": "sum(kube_pod_container_resource_requests{pod=~\"api-$api_name.+\", resource=\"cpu\"})", "hide": false, "interval": "", 
"legendFormat": "Total CPU Request", @@ -1190,7 +1190,7 @@ data: "targets": [ { "exemplar": false, - "expr": "sum(sum_over_time(container_memory_working_set_bytes{pod=~\"api-$api_name.+\", name!=\"\", container!=\"POD\"}[1m])) /\navg(count_over_time(container_memory_working_set_bytes{pod=~\"api-$api_name.+\", name!=\"\", container=\"api\"}[1m])) / 1024^2", + "expr": "sum(sum_over_time(container_memory_working_set_bytes{pod=~\"api-$api_name.+\"}[1m])) /\navg(count_over_time(container_memory_working_set_bytes{pod=~\"api-$api_name.+\", name!=\"\", container=\"api\"}[1m])) / 1024^2", "format": "time_series", "instant": false, "interval": "", @@ -1199,7 +1199,7 @@ data: }, { "exemplar": true, - "expr": "sum(kube_pod_container_resource_requests{exported_pod=~\"api-$api_name.+\", resource=\"memory\"}) / 1024^2", + "expr": "sum(kube_pod_container_resource_requests{pod=~\"api-$api_name.+\", resource=\"memory\"}) / 1024^2", "hide": false, "interval": "", "legendFormat": "Total Memory Request", @@ -1294,14 +1294,14 @@ data: "steppedLine": false, "targets": [ { - "expr": "sum(DCGM_FI_DEV_GPU_UTIL{exported_pod=~\"api-$api_name.+\"}) / 100", + "expr": "sum(DCGM_FI_DEV_GPU_UTIL{pod=~\"api-$api_name.+\"}) / 100", "hide": false, "interval": "", "legendFormat": "Total GPU Usage", "refId": "GPU Usage" }, { - "expr": "count(DCGM_FI_DEV_GPU_UTIL{exported_pod=~\"api-$api_name.+\"})", + "expr": "count(DCGM_FI_DEV_GPU_UTIL{pod=~\"api-$api_name.+\"})", "hide": false, "interval": "", "legendFormat": "Total GPU Capacity", @@ -1395,7 +1395,7 @@ data: "steppedLine": false, "targets": [ { - "expr": "sum(DCGM_FI_DEV_FB_USED{exported_pod=~\"api-$api_name.+\"})", + "expr": "sum(DCGM_FI_DEV_FB_USED{pod=~\"api-$api_name.+\"})", "hide": false, "interval": "", "legendFormat": "Total Used GPU Memory", @@ -1403,7 +1403,7 @@ data: }, { "exemplar": false, - "expr": "sum(DCGM_FI_DEV_FB_FREE{exported_pod=~\"api-$api_name.+\"}) + sum(DCGM_FI_DEV_FB_USED{exported_pod=~\"api-$api_name.+\"})", + "expr": 
"sum(DCGM_FI_DEV_FB_FREE{pod=~\"api-$api_name.+\"}) + sum(DCGM_FI_DEV_FB_USED{pod=~\"api-$api_name.+\"})", "format": "time_series", "instant": false, "interval": "", @@ -1515,7 +1515,7 @@ data: "targets": [ { "exemplar": false, - "expr": "sum(rate(container_cpu_usage_seconds_total{pod=~\"api-$api_name.+\", container!=\"POD\", name!=\"\"}[1m]))\n/\nsum(kube_pod_info{exported_pod=~\"api-$api_name.+\"})", + "expr": "sum(rate(container_cpu_usage_seconds_total{pod=~\"api-$api_name.+\"})", "format": "time_series", "instant": false, "interval": "", @@ -1524,7 +1524,7 @@ data: }, { "exemplar": true, - "expr": "sum(kube_pod_container_resource_requests{exported_pod=~\"api-$api_name.+\", resource=\"cpu\"})\n/\nsum(kube_pod_info{exported_pod=~\"api-$api_name.+\"})", + "expr": "sum(kube_pod_container_resource_requests{pod=~\"api-$api_name.+\", resource=\"cpu\"})\n/\nsum(kube_pod_info{pod=~\"api-$api_name.+\"})", "hide": false, "interval": "", "legendFormat": "Avg CPU Request", @@ -1621,7 +1621,7 @@ data: "targets": [ { "exemplar": false, - "expr": "sum(sum_over_time(container_memory_working_set_bytes{pod=~\"api-$api_name.+\", name!=\"\", container!=\"POD\"}[1m]))\n/\navg(count_over_time(container_memory_working_set_bytes{pod=~\"api-$api_name.+\", name!=\"\", container=\"api\"}[1m])) / 1024^2\n/\nsum(kube_pod_info{exported_pod=~\"api-$api_name.+\"})", + "expr": "sum(sum_over_time(container_memory_working_set_bytes{pod=~\"api-$api_name.+\"}[1m]))\n/\navg(count_over_time(container_memory_working_set_bytes{pod=~\"api-$api_name.+\", name!=\"\", container=\"api\"}[1m])) / 1024^2\n/\nsum(kube_pod_info{pod=~\"api-$api_name.+\"})", "format": "time_series", "instant": false, "interval": "", @@ -1630,7 +1630,7 @@ data: }, { "exemplar": true, - "expr": "sum(kube_pod_container_resource_requests{exported_pod=~\"api-$api_name.+\", resource=\"memory\"}) / 1024^2\n/\nsum(kube_pod_info{exported_pod=~\"api-$api_name.+\"})", + "expr": 
"sum(kube_pod_container_resource_requests{pod=~\"api-$api_name.+\", resource=\"memory\"}) / 1024^2\n/\nsum(kube_pod_info{pod=~\"api-$api_name.+\"})", "hide": false, "interval": "", "legendFormat": "Avg Memory Request", @@ -1726,14 +1726,14 @@ data: "steppedLine": false, "targets": [ { - "expr": "sum(DCGM_FI_DEV_GPU_UTIL{exported_pod=~\"api-$api_name.+\"}) / 100\n/\ncount(DCGM_FI_DEV_GPU_UTIL{exported_pod=~\"api-$api_name.+\"})", + "expr": "sum(DCGM_FI_DEV_GPU_UTIL{pod=~\"api-$api_name.+\"}) / 100\n/\ncount(DCGM_FI_DEV_GPU_UTIL{pod=~\"api-$api_name.+\"})", "hide": false, "interval": "", "legendFormat": "Avg GPU Usage", "refId": "GPU Usage" }, { - "expr": "count(DCGM_FI_DEV_GPU_UTIL{exported_pod=~\"api-$api_name.+\"})\n/\ncount(count(DCGM_FI_DEV_GPU_UTIL{exported_pod=~\"api-$api_name.+\"}) by (exported_pod))", + "expr": "count(DCGM_FI_DEV_GPU_UTIL{pod=~\"api-$api_name.+\"})\n/\ncount(count(DCGM_FI_DEV_GPU_UTIL{pod=~\"api-$api_name.+\"}) by (pod))", "hide": false, "interval": "", "legendFormat": "Avg GPU Capacity", @@ -1829,7 +1829,7 @@ data: "steppedLine": false, "targets": [ { - "expr": "sum(DCGM_FI_DEV_FB_USED{exported_pod=~\"api-$api_name.+\"})\n/\ncount(DCGM_FI_DEV_FB_USED{exported_pod=~\"api-$api_name.+\"})", + "expr": "sum(DCGM_FI_DEV_FB_USED{pod=~\"api-$api_name.+\"})\n/\ncount(DCGM_FI_DEV_FB_USED{pod=~\"api-$api_name.+\"})", "hide": false, "interval": "", "legendFormat": "Avg Used GPU Memory", @@ -1837,7 +1837,7 @@ data: }, { "exemplar": false, - "expr": "(sum(DCGM_FI_DEV_FB_FREE{exported_pod=~\"api-$api_name.+\"}) + sum(DCGM_FI_DEV_FB_USED{exported_pod=~\"api-$api_name.+\"}))\n/\ncount(DCGM_FI_DEV_FB_USED{exported_pod=~\"api-$api_name.+\"})", + "expr": "(sum(DCGM_FI_DEV_FB_FREE{pod=~\"api-$api_name.+\"}) + sum(DCGM_FI_DEV_FB_USED{pod=~\"api-$api_name.+\"}))\n/\ncount(DCGM_FI_DEV_FB_USED{pod=~\"api-$api_name.+\"})", "format": "time_series", "instant": false, "interval": "", diff --git a/manager/manifests/grafana/grafana-dashboard-batch.yaml 
b/manager/manifests/grafana/grafana-dashboard-batch.yaml index da99fecb12..413dc5325e 100644 --- a/manager/manifests/grafana/grafana-dashboard-batch.yaml +++ b/manager/manifests/grafana/grafana-dashboard-batch.yaml @@ -522,7 +522,7 @@ data: "targets": [ { "exemplar": false, - "expr": "sum(rate(container_cpu_usage_seconds_total{pod=~\"$api_name.+\", container!=\"POD\", name!=\"\"}[1m]))", + "expr": "sum(rate(container_cpu_usage_seconds_total{pod=~\"$api_name.+\"}[1m]))", "format": "time_series", "instant": false, "interval": "", @@ -531,7 +531,7 @@ data: }, { "exemplar": true, - "expr": "sum(kube_pod_container_resource_requests{exported_pod=~\"$api_name.+\", resource=\"cpu\"})", + "expr": "sum(kube_pod_container_resource_requests{pod=~\"$api_name.+\", resource=\"cpu\"})", "hide": false, "interval": "", "legendFormat": "Total CPU Request", @@ -628,7 +628,7 @@ data: "targets": [ { "exemplar": false, - "expr": "sum(sum_over_time(container_memory_working_set_bytes{pod=~\"$api_name.+\", name!=\"\", container!=\"POD\"}[1m]))\n/\navg(count_over_time(container_memory_working_set_bytes{pod=~\"$api_name.+\", name!=\"\", container=\"api\"}[1m])) / 1024^2", + "expr": "sum(sum_over_time(container_memory_working_set_bytes{pod=~\"$api_name.+\"}[1m]))\n/\navg(count_over_time(container_memory_working_set_bytes{pod=~\"$api_name.+\", name!=\"\", container=\"api\"}[1m])) / 1024^2", "format": "time_series", "instant": false, "interval": "", @@ -637,7 +637,7 @@ data: }, { "exemplar": true, - "expr": "sum(kube_pod_container_resource_requests{exported_pod=~\"$api_name.+\", resource=\"memory\"}) / 1024^2", + "expr": "sum(kube_pod_container_resource_requests{pod=~\"$api_name.+\", resource=\"memory\"}) / 1024^2", "hide": false, "interval": "", "legendFormat": "Total Memory Request", @@ -734,14 +734,14 @@ data: "steppedLine": false, "targets": [ { - "expr": "sum(DCGM_FI_DEV_GPU_UTIL{exported_pod=~\"$api_name.+\"}) / 100", + "expr": "sum(DCGM_FI_DEV_GPU_UTIL{pod=~\"$api_name.+\"}) / 100", 
"hide": false, "interval": "", "legendFormat": "Total GPU Usage", "refId": "GPU Usage" }, { - "expr": "count(DCGM_FI_DEV_GPU_UTIL{exported_pod=~\"$api_name.+\"})", + "expr": "count(DCGM_FI_DEV_GPU_UTIL{pod=~\"$api_name.+\"})", "hide": false, "interval": "", "legendFormat": "Total GPU Capacity", @@ -837,7 +837,7 @@ data: "steppedLine": false, "targets": [ { - "expr": "sum(DCGM_FI_DEV_FB_USED{exported_pod=~\"$api_name.+\"})", + "expr": "sum(DCGM_FI_DEV_FB_USED{pod=~\"$api_name.+\"})", "hide": false, "interval": "", "legendFormat": "Total Used GPU Memory", @@ -845,7 +845,7 @@ data: }, { "exemplar": false, - "expr": "sum(DCGM_FI_DEV_FB_FREE{exported_pod=~\"$api_name.+\"}) + sum(DCGM_FI_DEV_FB_USED{exported_pod=~\"$api_name.+\"})", + "expr": "sum(DCGM_FI_DEV_FB_FREE{pod=~\"$api_name.+\"}) + sum(DCGM_FI_DEV_FB_USED{pod=~\"$api_name.+\"})", "format": "time_series", "instant": false, "interval": "", @@ -963,7 +963,7 @@ data: "targets": [ { "exemplar": false, - "expr": "sum(rate(container_cpu_usage_seconds_total{pod=~\"$api_name.+\", container!=\"POD\", name!=\"\"}[1m]))\n/\nsum(kube_pod_info{exported_pod=~\"$api_name.+\"})", + "expr": "sum(rate(container_cpu_usage_seconds_total{pod=~\"$api_name.+\"}[1m]))\n/\nsum(kube_pod_info{pod=~\"$api_name.+\"})", "format": "time_series", "instant": false, "interval": "", @@ -972,7 +972,7 @@ data: }, { "exemplar": true, - "expr": "sum(kube_pod_container_resource_requests{exported_pod=~\"$api_name.+\", resource=\"cpu\"})\n/\nsum(kube_pod_info{exported_pod=~\"$api_name.+\"})", + "expr": "sum(kube_pod_container_resource_requests{pod=~\"$api_name.+\", resource=\"cpu\"})\n/\nsum(kube_pod_info{pod=~\"$api_name.+\"})", "hide": false, "interval": "", "legendFormat": "Avg CPU Request", @@ -1071,7 +1071,7 @@ data: "targets": [ { "exemplar": false, - "expr": "sum(sum_over_time(container_memory_working_set_bytes{pod=~\"$api_name.+\", name!=\"\", 
container!=\"POD\"}[1m]))\n/\navg(count_over_time(container_memory_working_set_bytes{pod=~\"$api_name.+\", name!=\"\", container=\"api\"}[1m])) / 1024^2\n/\nsum(kube_pod_info{exported_pod=~\"$api_name.+\"})", + "expr": "sum(sum_over_time(container_memory_working_set_bytes{pod=~\"$api_name.+\"}[1m]))\n/\navg(count_over_time(container_memory_working_set_bytes{pod=~\"$api_name.+\", name!=\"\", container=\"api\"}[1m])) / 1024^2\n/\nsum(kube_pod_info{pod=~\"$api_name.+\"})", "format": "time_series", "instant": false, "interval": "", @@ -1080,7 +1080,7 @@ data: }, { "exemplar": true, - "expr": "sum(kube_pod_container_resource{exported_pod=~\"$api_name.+\", resource=\"memory\"}) / 1024^2\n/\nsum(kube_pod_info{exported_pod=~\"$api_name.+\"})", + "expr": "sum(kube_pod_container_resource_requests{pod=~\"$api_name.+\", resource=\"memory\"}) / 1024^2\n/\nsum(kube_pod_info{pod=~\"$api_name.+\"})", "hide": false, "interval": "", "legendFormat": "Avg Memory Request", @@ -1179,7 +1179,7 @@ data: "steppedLine": false, "targets": [ { - "expr": "sum(DCGM_FI_DEV_GPU_UTIL{exported_pod=~\"$api_name.+\"}) / 100\n/\ncount(DCGM_FI_DEV_GPU_UTIL{exported_pod=~\"$api_name.+\"})", + "expr": "sum(DCGM_FI_DEV_GPU_UTIL{pod=~\"$api_name.+\"}) / 100\n/\ncount(DCGM_FI_DEV_GPU_UTIL{pod=~\"$api_name.+\"})", "hide": false, "instant": false, "interval": "", @@ -1187,7 +1187,7 @@ data: "refId": "GPU Usage" }, { - "expr": "count(DCGM_FI_DEV_GPU_UTIL{exported_pod=~\"$api_name.+\"})\n/\ncount(count(DCGM_FI_DEV_GPU_UTIL{exported_pod=~\"$api_name.+\"}) by (exported_pod))", + "expr": "count(DCGM_FI_DEV_GPU_UTIL{pod=~\"$api_name.+\"})\n/\ncount(count(DCGM_FI_DEV_GPU_UTIL{pod=~\"$api_name.+\"}) by (pod))", "hide": false, "instant": false, "interval": "", @@ -1286,7 +1286,7 @@ data: "steppedLine": false, "targets": [ { - "expr": "sum(DCGM_FI_DEV_FB_USED{exported_pod=~\"$api_name.+\"})\n/\ncount(DCGM_FI_DEV_FB_USED{exported_pod=~\"$api_name.+\"})", + "expr":
"sum(DCGM_FI_DEV_FB_USED{pod=~\"$api_name.+\"})\n/\ncount(DCGM_FI_DEV_FB_USED{pod=~\"$api_name.+\"})", "hide": false, "interval": "", "legendFormat": "Avg Used GPU Memory", @@ -1294,7 +1294,7 @@ data: }, { "exemplar": false, - "expr": "(sum(DCGM_FI_DEV_FB_FREE{exported_pod=~\"$api_name.+\"}) + sum(DCGM_FI_DEV_FB_USED{exported_pod=~\"$api_name.+\"}))\n/\ncount(DCGM_FI_DEV_FB_USED{exported_pod=~\"$api_name.+\"})", + "expr": "(sum(DCGM_FI_DEV_FB_FREE{pod=~\"$api_name.+\"}) + sum(DCGM_FI_DEV_FB_USED{pod=~\"$api_name.+\"}))\n/\ncount(DCGM_FI_DEV_FB_USED{pod=~\"$api_name.+\"})", "format": "time_series", "instant": false, "interval": "", diff --git a/manager/manifests/grafana/grafana-dashboard-cluster.yaml b/manager/manifests/grafana/grafana-dashboard-cluster.yaml index 0e819f7e16..c14d088ad1 100644 --- a/manager/manifests/grafana/grafana-dashboard-cluster.yaml +++ b/manager/manifests/grafana/grafana-dashboard-cluster.yaml @@ -213,7 +213,7 @@ data: "targets": [ { "exemplar": true, - "expr": "sum(kube_pod_info{exported_pod!~\"(aws-node|grafana|autoscaler|cluster-autoscaler|coredns|event-exporter|fluent-bit|kube-proxy|k8s-neuron-scheduler|kube-state-metrics|metrics-server|node-exporter|operator|operator-controller-manager|prometheus-operator|prometheus-prometheus|prometheus-statsd-exporter|dcgm-exporter|ingressgateway|istiod|activator|enqueuer|gateway|nvidia-device-plugin-daemonset|neuron-device-plugin-daemonset)-(.+)\"})", + "expr": "sum(kube_pod_info{pod!~\"(aws-node|grafana|autoscaler|cluster-autoscaler|coredns|event-exporter|fluent-bit|kube-proxy|k8s-neuron-scheduler|kube-state-metrics|metrics-server|node-exporter|operator|operator-controller-manager|prometheus-operator|prometheus-prometheus|prometheus-statsd-exporter|dcgm-exporter|ingressgateway|istiod|activator|enqueuer|gateway|nvidia-device-plugin-daemonset|neuron-device-plugin-daemonset)-(.+)\"})", "format": "time_series", "interval": "", "intervalFactor": 2, diff --git 
a/manager/manifests/grafana/grafana-dashboard-realtime.yaml b/manager/manifests/grafana/grafana-dashboard-realtime.yaml index 97e91cc318..37b942add1 100644 --- a/manager/manifests/grafana/grafana-dashboard-realtime.yaml +++ b/manager/manifests/grafana/grafana-dashboard-realtime.yaml @@ -1193,7 +1193,7 @@ data: "targets": [ { "exemplar": false, - "expr": "sum(rate(container_cpu_usage_seconds_total{pod=~\"api-$api_name.+\", container!=\"POD\", name!=\"\"}[1m]))", + "expr": "sum(rate(container_cpu_usage_seconds_total{pod=~\"api-$api_name.+\"}[1m]))", "format": "time_series", "instant": false, "interval": "", @@ -1202,7 +1202,7 @@ data: }, { "exemplar": true, - "expr": "sum(kube_pod_container_resource_requests{exported_pod=~\"api-$api_name.+\", resource=\"cpu\"})", + "expr": "sum(kube_pod_container_resource_requests{pod=~\"api-$api_name.+\", resource=\"cpu\"})", "hide": false, "interval": "", "legendFormat": "Total CPU Request", @@ -1299,7 +1299,7 @@ data: "targets": [ { "exemplar": false, - "expr": "sum(sum_over_time(container_memory_working_set_bytes{pod=~\"api-$api_name.+\", name!=\"\", container!=\"POD\"}[1m])) /\navg(count_over_time(container_memory_working_set_bytes{pod=~\"api-$api_name.+\", name!=\"\", container=\"api\"}[1m])) / 1024^2", + "expr": "sum(sum_over_time(container_memory_working_set_bytes{pod=~\"api-$api_name.+\"}[1m])) /\navg(count_over_time(container_memory_working_set_bytes{pod=~\"api-$api_name.+\", name!=\"\", container=\"api\"}[1m])) / 1024^2", "format": "time_series", "instant": false, "interval": "", @@ -1308,7 +1308,7 @@ data: }, { "exemplar": true, - "expr": "sum(kube_pod_container_resource_requests{exported_pod=~\"api-$api_name.+\", resource=\"memory\"}) / 1024^2", + "expr": "sum(kube_pod_container_resource_requests{pod=~\"api-$api_name.+\", resource=\"memory\"}) / 1024^2", "hide": false, "interval": "", "legendFormat": "Total Memory Request", @@ -1406,7 +1406,7 @@ data: "targets": [ { "exemplar": true, - "expr": 
"sum(DCGM_FI_DEV_GPU_UTIL{exported_pod=~\"api-$api_name.+\"}) / 100", + "expr": "sum(DCGM_FI_DEV_GPU_UTIL{pod=~\"api-$api_name.+\"}) / 100", "hide": false, "interval": "", "legendFormat": "Total GPU Usage", @@ -1414,7 +1414,7 @@ data: }, { "exemplar": true, - "expr": "count(DCGM_FI_DEV_GPU_UTIL{exported_pod=~\"api-$api_name.+\"})", + "expr": "count(DCGM_FI_DEV_GPU_UTIL{pod=~\"api-$api_name.+\"})", "hide": false, "interval": "", "legendFormat": "Total GPU Capacity", @@ -1511,7 +1511,7 @@ data: "targets": [ { "exemplar": true, - "expr": "sum(DCGM_FI_DEV_FB_USED{exported_pod=~\"api-$api_name.+\"})", + "expr": "sum(DCGM_FI_DEV_FB_USED{pod=~\"api-$api_name.+\"})", "hide": false, "interval": "", "legendFormat": "Total Used GPU Memory", @@ -1519,7 +1519,7 @@ data: }, { "exemplar": false, - "expr": "sum(DCGM_FI_DEV_FB_FREE{exported_pod=~\"api-$api_name.+\"}) + sum(DCGM_FI_DEV_FB_USED{exported_pod=~\"api-$api_name.+\"})", + "expr": "sum(DCGM_FI_DEV_FB_FREE{pod=~\"api-$api_name.+\"}) + sum(DCGM_FI_DEV_FB_USED{pod=~\"api-$api_name.+\"})", "format": "time_series", "instant": false, "interval": "", @@ -1633,7 +1633,7 @@ data: "targets": [ { "exemplar": false, - "expr": "sum(rate(container_cpu_usage_seconds_total{pod=~\"api-$api_name.+\", container!=\"POD\", name!=\"\"}[1m]))\n/\nsum(kube_pod_info{exported_pod=~\"api-$api_name.+\"})", + "expr": "sum(rate(container_cpu_usage_seconds_total{pod=~\"api-$api_name.+\"}[1m]))\n/\nsum(kube_pod_info{pod=~\"api-$api_name.+\"})", "format": "time_series", "instant": false, "interval": "", @@ -1642,7 +1642,7 @@ data: }, { "exemplar": true, - "expr": "sum(kube_pod_container_resource_requests{exported_pod=~\"api-$api_name.+\", resource=\"cpu\"})\n/\nsum(kube_pod_info{exported_pod=~\"api-$api_name.+\"})", + "expr": "sum(kube_pod_container_resource_requests{pod=~\"api-$api_name.+\", resource=\"cpu\"})\n/\nsum(kube_pod_info{pod=~\"api-$api_name.+\"})", "hide": false, "interval": "", "legendFormat": "Avg CPU Request", @@ -1741,7 +1741,7 @@ data: 
"targets": [ { "exemplar": false, - "expr": "sum(sum_over_time(container_memory_working_set_bytes{pod=~\"api-$api_name.+\", name!=\"\", container!=\"POD\"}[1m]))\n/\navg(count_over_time(container_memory_working_set_bytes{pod=~\"api-$api_name.+\", name!=\"\", container=\"api\"}[1m])) / 1024^2\n/\nsum(kube_pod_info{exported_pod=~\"api-$api_name.+\"})", + "expr": "sum(sum_over_time(container_memory_working_set_bytes{pod=~\"api-$api_name.+\"}[1m]))\n/\navg(count_over_time(container_memory_working_set_bytes{pod=~\"api-$api_name.+\", name!=\"\", container=\"api\"}[1m])) / 1024^2\n/\nsum(kube_pod_info{pod=~\"api-$api_name.+\"})", "format": "time_series", "instant": false, "interval": "", @@ -1750,7 +1750,7 @@ data: }, { "exemplar": true, - "expr": "sum(kube_pod_container_resource_requests{exported_pod=~\"api-$api_name.+\", resource=\"memory\"}) / 1024^2\n/\nsum(kube_pod_info{exported_pod=~\"api-$api_name.+\"})", + "expr": "sum(kube_pod_container_resource_requests{pod=~\"api-$api_name.+\", resource=\"memory\"}) / 1024^2\n/\nsum(kube_pod_info{pod=~\"api-$api_name.+\"})", "hide": false, "interval": "", "legendFormat": "Avg Memory Request", @@ -1848,14 +1848,14 @@ data: "steppedLine": false, "targets": [ { - "expr": "sum(DCGM_FI_DEV_GPU_UTIL{exported_pod=~\"api-$api_name.+\"}) / 100\n/\ncount(DCGM_FI_DEV_GPU_UTIL{exported_pod=~\"api-$api_name.+\"})", + "expr": "sum(DCGM_FI_DEV_GPU_UTIL{pod=~\"api-$api_name.+\"}) / 100\n/\ncount(DCGM_FI_DEV_GPU_UTIL{pod=~\"api-$api_name.+\"})", "hide": false, "interval": "", "legendFormat": "Avg GPU Usage", "refId": "GPU Usage" }, { - "expr": "count(DCGM_FI_DEV_GPU_UTIL{exported_pod=~\"api-$api_name.+\"})\n/\ncount(count(DCGM_FI_DEV_GPU_UTIL{exported_pod=~\"api-$api_name.+\"}) by (exported_pod))", + "expr": "count(DCGM_FI_DEV_GPU_UTIL{pod=~\"api-$api_name.+\"})\n/\ncount(count(DCGM_FI_DEV_GPU_UTIL{pod=~\"api-$api_name.+\"}) by (pod))", "hide": false, "interval": "", "legendFormat": "Avg GPU Capacity", @@ -1953,7 +1953,7 @@ data: "steppedLine": 
false, "targets": [ { - "expr": "sum(DCGM_FI_DEV_FB_USED{exported_pod=~\"api-$api_name.+\"})\n/\ncount(DCGM_FI_DEV_FB_USED{exported_pod=~\"api-$api_name.+\"})", + "expr": "sum(DCGM_FI_DEV_FB_USED{pod=~\"api-$api_name.+\"})\n/\ncount(DCGM_FI_DEV_FB_USED{pod=~\"api-$api_name.+\"})", "hide": false, "interval": "", "legendFormat": "Avg Used GPU Memory", @@ -1961,7 +1961,7 @@ data: }, { "exemplar": false, - "expr": "(sum(DCGM_FI_DEV_FB_FREE{exported_pod=~\"api-$api_name.+\"}) + sum(DCGM_FI_DEV_FB_USED{exported_pod=~\"api-$api_name.+\"}))\n/\ncount(DCGM_FI_DEV_FB_USED{exported_pod=~\"api-$api_name.+\"})", + "expr": "(sum(DCGM_FI_DEV_FB_FREE{pod=~\"api-$api_name.+\"}) + sum(DCGM_FI_DEV_FB_USED{pod=~\"api-$api_name.+\"}))\n/\ncount(DCGM_FI_DEV_FB_USED{pod=~\"api-$api_name.+\"})", "format": "time_series", "instant": false, "interval": "", diff --git a/manager/manifests/grafana/grafana-dashboard-task.yaml b/manager/manifests/grafana/grafana-dashboard-task.yaml index 1305d94407..070aaa1813 100644 --- a/manager/manifests/grafana/grafana-dashboard-task.yaml +++ b/manager/manifests/grafana/grafana-dashboard-task.yaml @@ -512,7 +512,7 @@ data: "targets": [ { "exemplar": false, - "expr": "sum(rate(container_cpu_usage_seconds_total{pod=~\"$api_name.+\", container!=\"POD\", name!=\"\"}[1m]))", + "expr": "sum(rate(container_cpu_usage_seconds_total{pod=~\"$api_name.+\"}[1m]))", "format": "time_series", "instant": false, "interval": "", @@ -521,7 +521,7 @@ data: }, { "exemplar": true, - "expr": "sum(kube_pod_container_resource_requests{exported_pod=~\"$api_name.+\", resource=\"cpu\"})", + "expr": "sum(kube_pod_container_resource_requests{pod=~\"$api_name.+\", resource=\"cpu\"})", "hide": false, "interval": "", "legendFormat": "Total CPU Request", @@ -618,7 +618,7 @@ data: "targets": [ { "exemplar": false, - "expr": "sum(sum_over_time(container_memory_working_set_bytes{pod=~\"$api_name.+\", name!=\"\", 
container!=\"\"}[1m]))\n/\navg(count_over_time(container_memory_working_set_bytes{pod=~\"$api_name.+\", name!=\"\", container!=\"\"}[1m])) / 1024^2", + "expr": "sum(sum_over_time(container_memory_working_set_bytes{pod=~\"$api_name.+\"}[1m]))\n/\navg(count_over_time(container_memory_working_set_bytes{pod=~\"$api_name.+\"}[1m])) / 1024^2", "format": "time_series", "instant": false, "interval": "", @@ -627,7 +627,7 @@ data: }, { "exemplar": true, - "expr": "sum(kube_pod_container_resource_requests{exported_pod=~\"$api_name.+\", resource=\"memory\"}) / 1024^2", + "expr": "sum(kube_pod_container_resource_requests{pod=~\"$api_name.+\", resource=\"memory\"}) / 1024^2", "hide": false, "interval": "", "legendFormat": "Total Memory Request", @@ -724,14 +724,14 @@ data: "steppedLine": false, "targets": [ { - "expr": "sum(DCGM_FI_DEV_GPU_UTIL{exported_pod=~\"$api_name.+\"}) / 100", + "expr": "sum(DCGM_FI_DEV_GPU_UTIL{pod=~\"$api_name.+\"}) / 100", "hide": false, "interval": "", "legendFormat": "Total GPU Usage", "refId": "GPU Usage" }, { - "expr": "count(DCGM_FI_DEV_GPU_UTIL{exported_pod=~\"$api_name.+\"})", + "expr": "count(DCGM_FI_DEV_GPU_UTIL{pod=~\"$api_name.+\"})", "hide": false, "interval": "", "legendFormat": "Total GPU Capacity", @@ -827,7 +827,7 @@ data: "steppedLine": false, "targets": [ { - "expr": "sum(DCGM_FI_DEV_FB_USED{exported_pod=~\"$api_name.+\"})", + "expr": "sum(DCGM_FI_DEV_FB_USED{pod=~\"$api_name.+\"})", "hide": false, "interval": "", "legendFormat": "Total Used GPU Memory", @@ -835,7 +835,7 @@ data: }, { "exemplar": false, - "expr": "sum(DCGM_FI_DEV_FB_FREE{exported_pod=~\"$api_name.+\"}) + sum(DCGM_FI_DEV_FB_USED{exported_pod=~\"$api_name.+\"})", + "expr": "sum(DCGM_FI_DEV_FB_FREE{pod=~\"$api_name.+\"}) + sum(DCGM_FI_DEV_FB_USED{pod=~\"$api_name.+\"})", "format": "time_series", "instant": false, "interval": "", @@ -949,7 +949,7 @@ data: "targets": [ { "exemplar": false, - "expr": "sum(rate(container_cpu_usage_seconds_total{pod=~\"$api_name.+\", 
container!=\"POD\", name!=\"\"}[1m]))\n/\nsum(kube_pod_info{exported_pod=~\"$api_name.+\"})", + "expr": "sum(rate(container_cpu_usage_seconds_total{pod=~\"$api_name.+\"}[1m]))\n/\nsum(kube_pod_info{pod=~\"$api_name.+\"})", "format": "time_series", "instant": false, "interval": "", @@ -958,7 +958,7 @@ data: }, { "exemplar": true, - "expr": "sum(kube_pod_container_resource_requests{exported_pod=~\"$api_name.+\", resource=\"cpu\"})\n/\nsum(kube_pod_info{exported_pod=~\"$api_name.+\"})", + "expr": "sum(kube_pod_container_resource_requests{pod=~\"$api_name.+\", resource=\"cpu\"})\n/\nsum(kube_pod_info{pod=~\"$api_name.+\"})", "hide": false, "interval": "", "legendFormat": "Avg CPU Request", @@ -1057,7 +1057,7 @@ data: "targets": [ { "exemplar": false, - "expr": "sum(sum_over_time(container_memory_working_set_bytes{pod=~\"$api_name.+\", name!=\"\", container!=\"\"}[1m]))\n/\navg(count_over_time(container_memory_working_set_bytes{pod=~\"$api_name.+\", name!=\"\", container!=\"\"}[1m])) / 1024^2\n/\nsum(kube_pod_info{exported_pod=~\"$api_name.+\"})", + "expr": "sum(sum_over_time(container_memory_working_set_bytes{pod=~\"$api_name.+\"}[1m]))\n/\navg(count_over_time(container_memory_working_set_bytes{pod=~\"$api_name.+\"}[1m])) / 1024^2\n/\nsum(kube_pod_info{pod=~\"$api_name.+\"})", "format": "time_series", "instant": false, "interval": "", @@ -1066,7 +1066,7 @@ data: }, { "exemplar": true, - "expr": "sum(kube_pod_container_resource{exported_pod=~\"$api_name.+\", resource=\"memory\"}) / 1024^2\n/\nsum(kube_pod_info{exported_pod=~\"$api_name.+\"})", + "expr": "sum(kube_pod_container_resource_requests{pod=~\"$api_name.+\", resource=\"memory\"}) / 1024^2\n/\nsum(kube_pod_info{pod=~\"$api_name.+\"})", "hide": false, "interval": "", "legendFormat": "Avg Memory Request", @@ -1165,7 +1165,7 @@ data: "steppedLine": false, "targets": [ { - "expr": "sum(DCGM_FI_DEV_GPU_UTIL{exported_pod=~\"$api_name.+\"}) / 100\n/\ncount(DCGM_FI_DEV_GPU_UTIL{exported_pod=~\"$api_name.+\"})", + "expr":
"sum(DCGM_FI_DEV_GPU_UTIL{pod=~\"$api_name.+\"}) / 100\n/\ncount(DCGM_FI_DEV_GPU_UTIL{pod=~\"$api_name.+\"})", "hide": false, "instant": false, "interval": "", @@ -1173,7 +1173,7 @@ data: "refId": "GPU Usage" }, { - "expr": "count(DCGM_FI_DEV_GPU_UTIL{exported_pod=~\"$api_name.+\"})\n/\ncount(count(DCGM_FI_DEV_GPU_UTIL{exported_pod=~\"$api_name.+\"}) by (exported_pod))", + "expr": "count(DCGM_FI_DEV_GPU_UTIL{pod=~\"$api_name.+\"})\n/\ncount(count(DCGM_FI_DEV_GPU_UTIL{pod=~\"$api_name.+\"}) by (pod))", "hide": false, "instant": false, "interval": "", @@ -1272,7 +1272,7 @@ data: "steppedLine": false, "targets": [ { - "expr": "sum(DCGM_FI_DEV_FB_USED{exported_pod=~\"$api_name.+\"})\n/\ncount(DCGM_FI_DEV_FB_USED{exported_pod=~\"$api_name.+\"})", + "expr": "sum(DCGM_FI_DEV_FB_USED{pod=~\"$api_name.+\"})\n/\ncount(DCGM_FI_DEV_FB_USED{pod=~\"$api_name.+\"})", "hide": false, "interval": "", "legendFormat": "Avg Used GPU Memory", @@ -1280,7 +1280,7 @@ data: }, { "exemplar": false, - "expr": "(sum(DCGM_FI_DEV_FB_FREE{exported_pod=~\"$api_name.+\"}) + sum(DCGM_FI_DEV_FB_USED{exported_pod=~\"$api_name.+\"}))\n/\ncount(DCGM_FI_DEV_FB_USED{exported_pod=~\"$api_name.+\"})", + "expr": "(sum(DCGM_FI_DEV_FB_FREE{pod=~\"$api_name.+\"}) + sum(DCGM_FI_DEV_FB_USED{pod=~\"$api_name.+\"}))\n/\ncount(DCGM_FI_DEV_FB_USED{pod=~\"$api_name.+\"})", "format": "time_series", "instant": false, "interval": "", From 79068ac074304884a961a77b025381bdc0cf12cb Mon Sep 17 00:00:00 2001 From: Robert Lucian Chiriac Date: Wed, 28 Jul 2021 22:36:28 +0300 Subject: [PATCH 4/5] Update prometheus.md --- dev/prometheus.md | 15 +++++---------- 1 file changed, 5 insertions(+), 10 deletions(-) diff --git a/dev/prometheus.md b/dev/prometheus.md index a9b90b03c1..10f81130e2 100644 --- a/dev/prometheus.md +++ b/dev/prometheus.md @@ -51,21 +51,16 @@ The following is a list of metrics that are currently in use. #### Kubelet metrics 1. container_cpu_usage_seconds_total with the following labels: 1. pod - 1. 
container - 1. name 1. container_memory_working_set_bytes with the following labels: 1. pod - 1. name - 1. container #### Kube-state-metrics metrics 1. kube_pod_container_resource_requests with the following labels: - 1. exported_pod + 1. pod 1. resource - 1. exported_container (required for not dropping the values for each container of each pod) 1. kube_pod_info with the following labels: - 1. exported_pod + 1. pod 1. kube_deployment_status_replicas_available with the following labels: 1. deployment 1. kube_job_status_active with the following labels: @@ -74,11 +69,11 @@ The following is a list of metrics that are currently in use. #### DCGM metrics 1. DCGM_FI_DEV_GPU_UTIL with the following labels: - 1. exported_pod + 1. pod 1. DCGM_FI_DEV_FB_USED with the following labels: - 1. exported_pod + 1. pod 1. DCGM_FI_DEV_FB_FREE with the following labels: - 1. exported_pod + 1. pod #### Node metrics From b00b54f5dc861d41ed9699fd2858d674cf3dbea2 Mon Sep 17 00:00:00 2001 From: Robert Lucian Chiriac Date: Wed, 28 Jul 2021 22:39:51 +0300 Subject: [PATCH 5/5] Make lint --- manager/manifests/prometheus-dcgm-exporter.yaml | 2 +- manager/manifests/prometheus-kube-state-metrics.yaml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/manager/manifests/prometheus-dcgm-exporter.yaml b/manager/manifests/prometheus-dcgm-exporter.yaml index f3eb4232f7..d5f2c6a774 100644 --- a/manager/manifests/prometheus-dcgm-exporter.yaml +++ b/manager/manifests/prometheus-dcgm-exporter.yaml @@ -128,7 +128,7 @@ spec: - action: labelkeep regex: (__name__|pod) relabelings: - - sourceLabels: + - sourceLabels: - exported_pod action: replace targetLabel: pod diff --git a/manager/manifests/prometheus-kube-state-metrics.yaml b/manager/manifests/prometheus-kube-state-metrics.yaml index a42e78a7f2..8b1b1412ff 100644 --- a/manager/manifests/prometheus-kube-state-metrics.yaml +++ b/manager/manifests/prometheus-kube-state-metrics.yaml @@ -272,7 +272,7 @@ spec: sourceLabels: - pod relabelings: 
- - sourceLabels: + - sourceLabels: - exported_pod action: replace targetLabel: pod