diff --git a/dev/prometheus.md b/dev/prometheus.md index a9b90b03c1..10f81130e2 100644 --- a/dev/prometheus.md +++ b/dev/prometheus.md @@ -51,21 +51,16 @@ The following is a list of metrics that are currently in use. #### Kubelet metrics 1. container_cpu_usage_seconds_total with the following labels: 1. pod - 1. container - 1. name 1. container_memory_working_set_bytes with the following labels: 1. pod - 1. name - 1. container #### Kube-state-metrics metrics 1. kube_pod_container_resource_requests with the following labels: - 1. exported_pod + 1. pod 1. resource - 1. exported_container (required for not dropping the values for each container of each pod) 1. kube_pod_info with the following labels: - 1. exported_pod + 1. pod 1. kube_deployment_status_replicas_available with the following labels: 1. deployment 1. kube_job_status_active with the following labels: @@ -74,11 +69,11 @@ The following is a list of metrics that are currently in use. #### DCGM metrics 1. DCGM_FI_DEV_GPU_UTIL with the following labels: - 1. exported_pod + 1. pod 1. DCGM_FI_DEV_FB_USED with the following labels: - 1. exported_pod + 1. pod 1. DCGM_FI_DEV_FB_FREE with the following labels: - 1. exported_pod + 1. 
pod #### Node metrics diff --git a/manager/manifests/grafana/grafana-dashboard-async.yaml b/manager/manifests/grafana/grafana-dashboard-async.yaml index a6c45a1186..5bfe81070a 100644 --- a/manager/manifests/grafana/grafana-dashboard-async.yaml +++ b/manager/manifests/grafana/grafana-dashboard-async.yaml @@ -1086,7 +1086,7 @@ data: "targets": [ { "exemplar": false, - "expr": "sum(rate(container_cpu_usage_seconds_total{pod=~\"api-$api_name.+\", container!=\"POD\", name!=\"\"}[1m]))", + "expr": "sum(rate(container_cpu_usage_seconds_total{pod=~\"api-$api_name.+\"}[1m]))", "format": "time_series", "instant": false, "interval": "", @@ -1095,7 +1095,7 @@ data: }, { "exemplar": true, - "expr": "sum(kube_pod_container_resource_requests{exported_pod=~\"api-$api_name.+\", resource=\"cpu\"})", + "expr": "sum(kube_pod_container_resource_requests{pod=~\"api-$api_name.+\", resource=\"cpu\"})", "hide": false, "interval": "", "legendFormat": "Total CPU Request", @@ -1190,7 +1190,7 @@ data: "targets": [ { "exemplar": false, - "expr": "sum(sum_over_time(container_memory_working_set_bytes{pod=~\"api-$api_name.+\", name!=\"\", container!=\"POD\"}[1m])) /\navg(count_over_time(container_memory_working_set_bytes{pod=~\"api-$api_name.+\", name!=\"\", container=\"api\"}[1m])) / 1024^2", + "expr": "sum(sum_over_time(container_memory_working_set_bytes{pod=~\"api-$api_name.+\"}[1m])) /\navg(count_over_time(container_memory_working_set_bytes{pod=~\"api-$api_name.+\"}[1m])) / 1024^2", "format": "time_series", "instant": false, "interval": "", @@ -1199,7 +1199,7 @@ data: }, { "exemplar": true, - "expr": "sum(kube_pod_container_resource_requests{exported_pod=~\"api-$api_name.+\", resource=\"memory\"}) / 1024^2", + "expr": "sum(kube_pod_container_resource_requests{pod=~\"api-$api_name.+\", resource=\"memory\"}) / 1024^2", "hide": false, "interval": "", "legendFormat": "Total Memory Request", @@ -1294,14 +1294,14 @@ data: "steppedLine": false, "targets": [ { - "expr": 
"sum(DCGM_FI_DEV_GPU_UTIL{exported_pod=~\"api-$api_name.+\"}) / 100", + "expr": "sum(DCGM_FI_DEV_GPU_UTIL{pod=~\"api-$api_name.+\"}) / 100", "hide": false, "interval": "", "legendFormat": "Total GPU Usage", "refId": "GPU Usage" }, { - "expr": "count(DCGM_FI_DEV_GPU_UTIL{exported_pod=~\"api-$api_name.+\"})", + "expr": "count(DCGM_FI_DEV_GPU_UTIL{pod=~\"api-$api_name.+\"})", "hide": false, "interval": "", "legendFormat": "Total GPU Capacity", @@ -1395,7 +1395,7 @@ data: "steppedLine": false, "targets": [ { - "expr": "sum(DCGM_FI_DEV_FB_USED{exported_pod=~\"api-$api_name.+\"})", + "expr": "sum(DCGM_FI_DEV_FB_USED{pod=~\"api-$api_name.+\"})", "hide": false, "interval": "", "legendFormat": "Total Used GPU Memory", @@ -1403,7 +1403,7 @@ data: }, { "exemplar": false, - "expr": "sum(DCGM_FI_DEV_FB_FREE{exported_pod=~\"api-$api_name.+\"}) + sum(DCGM_FI_DEV_FB_USED{exported_pod=~\"api-$api_name.+\"})", + "expr": "sum(DCGM_FI_DEV_FB_FREE{pod=~\"api-$api_name.+\"}) + sum(DCGM_FI_DEV_FB_USED{pod=~\"api-$api_name.+\"})", "format": "time_series", "instant": false, "interval": "", @@ -1515,7 +1515,7 @@ data: "targets": [ { "exemplar": false, - "expr": "sum(rate(container_cpu_usage_seconds_total{pod=~\"api-$api_name.+\", container!=\"POD\", name!=\"\"}[1m]))\n/\nsum(kube_pod_info{exported_pod=~\"api-$api_name.+\"})", + "expr": "sum(rate(container_cpu_usage_seconds_total{pod=~\"api-$api_name.+\"}[1m]))\n/\nsum(kube_pod_info{pod=~\"api-$api_name.+\"})", "format": "time_series", "instant": false, "interval": "", @@ -1524,7 +1524,7 @@ data: }, { "exemplar": true, - "expr": "sum(kube_pod_container_resource_requests{exported_pod=~\"api-$api_name.+\", resource=\"cpu\"})\n/\nsum(kube_pod_info{exported_pod=~\"api-$api_name.+\"})", + "expr": "sum(kube_pod_container_resource_requests{pod=~\"api-$api_name.+\", resource=\"cpu\"})\n/\nsum(kube_pod_info{pod=~\"api-$api_name.+\"})", "hide": false, "interval": "", "legendFormat": "Avg CPU Request", @@ -1621,7 +1621,7 @@ data: "targets": [ { "exemplar": false, - "expr": 
"sum(sum_over_time(container_memory_working_set_bytes{pod=~\"api-$api_name.+\", name!=\"\", container!=\"POD\"}[1m]))\n/\navg(count_over_time(container_memory_working_set_bytes{pod=~\"api-$api_name.+\", name!=\"\", container=\"api\"}[1m])) / 1024^2\n/\nsum(kube_pod_info{exported_pod=~\"api-$api_name.+\"})", + "expr": "sum(sum_over_time(container_memory_working_set_bytes{pod=~\"api-$api_name.+\"}[1m]))\n/\navg(count_over_time(container_memory_working_set_bytes{pod=~\"api-$api_name.+\"}[1m])) / 1024^2\n/\nsum(kube_pod_info{pod=~\"api-$api_name.+\"})", "format": "time_series", "instant": false, "interval": "", @@ -1630,7 +1630,7 @@ data: }, { "exemplar": true, - "expr": "sum(kube_pod_container_resource_requests{exported_pod=~\"api-$api_name.+\", resource=\"memory\"}) / 1024^2\n/\nsum(kube_pod_info{exported_pod=~\"api-$api_name.+\"})", + "expr": "sum(kube_pod_container_resource_requests{pod=~\"api-$api_name.+\", resource=\"memory\"}) / 1024^2\n/\nsum(kube_pod_info{pod=~\"api-$api_name.+\"})", "hide": false, "interval": "", "legendFormat": "Avg Memory Request", @@ -1726,14 +1726,14 @@ data: "steppedLine": false, "targets": [ { - "expr": "sum(DCGM_FI_DEV_GPU_UTIL{exported_pod=~\"api-$api_name.+\"}) / 100\n/\ncount(DCGM_FI_DEV_GPU_UTIL{exported_pod=~\"api-$api_name.+\"})", + "expr": "sum(DCGM_FI_DEV_GPU_UTIL{pod=~\"api-$api_name.+\"}) / 100\n/\ncount(DCGM_FI_DEV_GPU_UTIL{pod=~\"api-$api_name.+\"})", "hide": false, "interval": "", "legendFormat": "Avg GPU Usage", "refId": "GPU Usage" }, { - "expr": "count(DCGM_FI_DEV_GPU_UTIL{exported_pod=~\"api-$api_name.+\"})\n/\ncount(count(DCGM_FI_DEV_GPU_UTIL{exported_pod=~\"api-$api_name.+\"}) by (exported_pod))", + "expr": "count(DCGM_FI_DEV_GPU_UTIL{pod=~\"api-$api_name.+\"})\n/\ncount(count(DCGM_FI_DEV_GPU_UTIL{pod=~\"api-$api_name.+\"}) by (pod))", "hide": false, "interval": "", "legendFormat": "Avg GPU Capacity", @@ -1829,7 +1829,7 @@ data: "steppedLine": false, "targets": [ { - "expr": 
"sum(DCGM_FI_DEV_FB_USED{exported_pod=~\"api-$api_name.+\"})\n/\ncount(DCGM_FI_DEV_FB_USED{exported_pod=~\"api-$api_name.+\"})", + "expr": "sum(DCGM_FI_DEV_FB_USED{pod=~\"api-$api_name.+\"})\n/\ncount(DCGM_FI_DEV_FB_USED{pod=~\"api-$api_name.+\"})", "hide": false, "interval": "", "legendFormat": "Avg Used GPU Memory", @@ -1837,7 +1837,7 @@ data: }, { "exemplar": false, - "expr": "(sum(DCGM_FI_DEV_FB_FREE{exported_pod=~\"api-$api_name.+\"}) + sum(DCGM_FI_DEV_FB_USED{exported_pod=~\"api-$api_name.+\"}))\n/\ncount(DCGM_FI_DEV_FB_USED{exported_pod=~\"api-$api_name.+\"})", + "expr": "(sum(DCGM_FI_DEV_FB_FREE{pod=~\"api-$api_name.+\"}) + sum(DCGM_FI_DEV_FB_USED{pod=~\"api-$api_name.+\"}))\n/\ncount(DCGM_FI_DEV_FB_USED{pod=~\"api-$api_name.+\"})", "format": "time_series", "instant": false, "interval": "", diff --git a/manager/manifests/grafana/grafana-dashboard-batch.yaml b/manager/manifests/grafana/grafana-dashboard-batch.yaml index da99fecb12..413dc5325e 100644 --- a/manager/manifests/grafana/grafana-dashboard-batch.yaml +++ b/manager/manifests/grafana/grafana-dashboard-batch.yaml @@ -522,7 +522,7 @@ data: "targets": [ { "exemplar": false, - "expr": "sum(rate(container_cpu_usage_seconds_total{pod=~\"$api_name.+\", container!=\"POD\", name!=\"\"}[1m]))", + "expr": "sum(rate(container_cpu_usage_seconds_total{pod=~\"$api_name.+\"}[1m]))", "format": "time_series", "instant": false, "interval": "", @@ -531,7 +531,7 @@ data: }, { "exemplar": true, - "expr": "sum(kube_pod_container_resource_requests{exported_pod=~\"$api_name.+\", resource=\"cpu\"})", + "expr": "sum(kube_pod_container_resource_requests{pod=~\"$api_name.+\", resource=\"cpu\"})", "hide": false, "interval": "", "legendFormat": "Total CPU Request", @@ -628,7 +628,7 @@ data: "targets": [ { "exemplar": false, - "expr": "sum(sum_over_time(container_memory_working_set_bytes{pod=~\"$api_name.+\", name!=\"\", container!=\"POD\"}[1m]))\n/\navg(count_over_time(container_memory_working_set_bytes{pod=~\"$api_name.+\", 
name!=\"\", container=\"api\"}[1m])) / 1024^2", + "expr": "sum(sum_over_time(container_memory_working_set_bytes{pod=~\"$api_name.+\"}[1m]))\n/\navg(count_over_time(container_memory_working_set_bytes{pod=~\"$api_name.+\"}[1m])) / 1024^2", "format": "time_series", "instant": false, "interval": "", @@ -637,7 +637,7 @@ data: }, { "exemplar": true, - "expr": "sum(kube_pod_container_resource_requests{exported_pod=~\"$api_name.+\", resource=\"memory\"}) / 1024^2", + "expr": "sum(kube_pod_container_resource_requests{pod=~\"$api_name.+\", resource=\"memory\"}) / 1024^2", "hide": false, "interval": "", "legendFormat": "Total Memory Request", @@ -734,14 +734,14 @@ data: "steppedLine": false, "targets": [ { - "expr": "sum(DCGM_FI_DEV_GPU_UTIL{exported_pod=~\"$api_name.+\"}) / 100", + "expr": "sum(DCGM_FI_DEV_GPU_UTIL{pod=~\"$api_name.+\"}) / 100", "hide": false, "interval": "", "legendFormat": "Total GPU Usage", "refId": "GPU Usage" }, { - "expr": "count(DCGM_FI_DEV_GPU_UTIL{exported_pod=~\"$api_name.+\"})", + "expr": "count(DCGM_FI_DEV_GPU_UTIL{pod=~\"$api_name.+\"})", "hide": false, "interval": "", "legendFormat": "Total GPU Capacity", @@ -837,7 +837,7 @@ data: "steppedLine": false, "targets": [ { - "expr": "sum(DCGM_FI_DEV_FB_USED{exported_pod=~\"$api_name.+\"})", + "expr": "sum(DCGM_FI_DEV_FB_USED{pod=~\"$api_name.+\"})", "hide": false, "interval": "", "legendFormat": "Total Used GPU Memory", @@ -845,7 +845,7 @@ data: }, { "exemplar": false, - "expr": "sum(DCGM_FI_DEV_FB_FREE{exported_pod=~\"$api_name.+\"}) + sum(DCGM_FI_DEV_FB_USED{exported_pod=~\"$api_name.+\"})", + "expr": "sum(DCGM_FI_DEV_FB_FREE{pod=~\"$api_name.+\"}) + sum(DCGM_FI_DEV_FB_USED{pod=~\"$api_name.+\"})", "format": "time_series", "instant": false, "interval": "", @@ -963,7 +963,7 @@ data: "targets": [ { "exemplar": false, - "expr": "sum(rate(container_cpu_usage_seconds_total{pod=~\"$api_name.+\", container!=\"POD\", 
name!=\"\"}[1m]))\n/\nsum(kube_pod_info{exported_pod=~\"$api_name.+\"})", + "expr": "sum(rate(container_cpu_usage_seconds_total{pod=~\"$api_name.+\"}[1m]))\n/\nsum(kube_pod_info{pod=~\"$api_name.+\"})", "format": "time_series", "instant": false, "interval": "", @@ -972,7 +972,7 @@ data: }, { "exemplar": true, - "expr": "sum(kube_pod_container_resource_requests{exported_pod=~\"$api_name.+\", resource=\"cpu\"})\n/\nsum(kube_pod_info{exported_pod=~\"$api_name.+\"})", + "expr": "sum(kube_pod_container_resource_requests{pod=~\"$api_name.+\", resource=\"cpu\"})\n/\nsum(kube_pod_info{pod=~\"$api_name.+\"})", "hide": false, "interval": "", "legendFormat": "Avg CPU Request", @@ -1071,7 +1071,7 @@ data: "targets": [ { "exemplar": false, - "expr": "sum(sum_over_time(container_memory_working_set_bytes{pod=~\"$api_name.+\", name!=\"\", container!=\"POD\"}[1m]))\n/\navg(count_over_time(container_memory_working_set_bytes{pod=~\"$api_name.+\", name!=\"\", container=\"api\"}[1m])) / 1024^2\n/\nsum(kube_pod_info{exported_pod=~\"$api_name.+\"})", + "expr": "sum(sum_over_time(container_memory_working_set_bytes{pod=~\"$api_name.+\"}[1m]))\n/\navg(count_over_time(container_memory_working_set_bytes{pod=~\"$api_name.+\"}[1m])) / 1024^2\n/\nsum(kube_pod_info{pod=~\"$api_name.+\"})", "format": "time_series", "instant": false, "interval": "", @@ -1080,7 +1080,7 @@ data: }, { "exemplar": true, - "expr": "sum(kube_pod_container_resource{exported_pod=~\"$api_name.+\", resource=\"memory\"}) / 1024^2\n/\nsum(kube_pod_info{exported_pod=~\"$api_name.+\"})", + "expr": "sum(kube_pod_container_resource_requests{pod=~\"$api_name.+\", resource=\"memory\"}) / 1024^2\n/\nsum(kube_pod_info{pod=~\"$api_name.+\"})", "hide": false, "interval": "", "legendFormat": "Avg Memory Request", @@ -1179,7 +1179,7 @@ data: "steppedLine": false, "targets": [ { - "expr": "sum(DCGM_FI_DEV_GPU_UTIL{exported_pod=~\"$api_name.+\"}) / 100\n/\ncount(DCGM_FI_DEV_GPU_UTIL{exported_pod=~\"$api_name.+\"})", + 
"expr": "sum(DCGM_FI_DEV_GPU_UTIL{pod=~\"$api_name.+\"}) / 100\n/\ncount(DCGM_FI_DEV_GPU_UTIL{pod=~\"$api_name.+\"})", "hide": false, "instant": false, "interval": "", @@ -1187,7 +1187,7 @@ data: "refId": "GPU Usage" }, { - "expr": "count(DCGM_FI_DEV_GPU_UTIL{exported_pod=~\"$api_name.+\"})\n/\ncount(count(DCGM_FI_DEV_GPU_UTIL{exported_pod=~\"$api_name.+\"}) by (exported_pod))", + "expr": "count(DCGM_FI_DEV_GPU_UTIL{pod=~\"$api_name.+\"})\n/\ncount(count(DCGM_FI_DEV_GPU_UTIL{pod=~\"$api_name.+\"}) by (pod))", "hide": false, "instant": false, "interval": "", @@ -1286,7 +1286,7 @@ data: "steppedLine": false, "targets": [ { - "expr": "sum(DCGM_FI_DEV_FB_USED{exported_pod=~\"$api_name.+\"})\n/\ncount(DCGM_FI_DEV_FB_USED{exported_pod=~\"$api_name.+\"})", + "expr": "sum(DCGM_FI_DEV_FB_USED{pod=~\"$api_name.+\"})\n/\ncount(DCGM_FI_DEV_FB_USED{pod=~\"$api_name.+\"})", "hide": false, "interval": "", "legendFormat": "Avg Used GPU Memory", @@ -1294,7 +1294,7 @@ data: }, { "exemplar": false, - "expr": "(sum(DCGM_FI_DEV_FB_FREE{exported_pod=~\"$api_name.+\"}) + sum(DCGM_FI_DEV_FB_USED{exported_pod=~\"$api_name.+\"}))\n/\ncount(DCGM_FI_DEV_FB_USED{exported_pod=~\"$api_name.+\"})", + "expr": "(sum(DCGM_FI_DEV_FB_FREE{pod=~\"$api_name.+\"}) + sum(DCGM_FI_DEV_FB_USED{pod=~\"$api_name.+\"}))\n/\ncount(DCGM_FI_DEV_FB_USED{pod=~\"$api_name.+\"})", "format": "time_series", "instant": false, "interval": "", diff --git a/manager/manifests/grafana/grafana-dashboard-cluster.yaml b/manager/manifests/grafana/grafana-dashboard-cluster.yaml index 0e819f7e16..c14d088ad1 100644 --- a/manager/manifests/grafana/grafana-dashboard-cluster.yaml +++ b/manager/manifests/grafana/grafana-dashboard-cluster.yaml @@ -213,7 +213,7 @@ data: "targets": [ { "exemplar": true, - "expr": 
"sum(kube_pod_info{exported_pod!~\"(aws-node|grafana|autoscaler|cluster-autoscaler|coredns|event-exporter|fluent-bit|kube-proxy|k8s-neuron-scheduler|kube-state-metrics|metrics-server|node-exporter|operator|operator-controller-manager|prometheus-operator|prometheus-prometheus|prometheus-statsd-exporter|dcgm-exporter|ingressgateway|istiod|activator|enqueuer|gateway|nvidia-device-plugin-daemonset|neuron-device-plugin-daemonset)-(.+)\"})", + "expr": "sum(kube_pod_info{pod!~\"(aws-node|grafana|autoscaler|cluster-autoscaler|coredns|event-exporter|fluent-bit|kube-proxy|k8s-neuron-scheduler|kube-state-metrics|metrics-server|node-exporter|operator|operator-controller-manager|prometheus-operator|prometheus-prometheus|prometheus-statsd-exporter|dcgm-exporter|ingressgateway|istiod|activator|enqueuer|gateway|nvidia-device-plugin-daemonset|neuron-device-plugin-daemonset)-(.+)\"})", "format": "time_series", "interval": "", "intervalFactor": 2, diff --git a/manager/manifests/grafana/grafana-dashboard-realtime.yaml b/manager/manifests/grafana/grafana-dashboard-realtime.yaml index 97e91cc318..37b942add1 100644 --- a/manager/manifests/grafana/grafana-dashboard-realtime.yaml +++ b/manager/manifests/grafana/grafana-dashboard-realtime.yaml @@ -1193,7 +1193,7 @@ data: "targets": [ { "exemplar": false, - "expr": "sum(rate(container_cpu_usage_seconds_total{pod=~\"api-$api_name.+\", container!=\"POD\", name!=\"\"}[1m]))", + "expr": "sum(rate(container_cpu_usage_seconds_total{pod=~\"api-$api_name.+\"}[1m]))", "format": "time_series", "instant": false, "interval": "", @@ -1202,7 +1202,7 @@ data: }, { "exemplar": true, - "expr": "sum(kube_pod_container_resource_requests{exported_pod=~\"api-$api_name.+\", resource=\"cpu\"})", + "expr": "sum(kube_pod_container_resource_requests{pod=~\"api-$api_name.+\", resource=\"cpu\"})", "hide": false, "interval": "", "legendFormat": "Total CPU Request", @@ -1299,7 +1299,7 @@ data: "targets": [ { "exemplar": false, - "expr": 
"sum(sum_over_time(container_memory_working_set_bytes{pod=~\"api-$api_name.+\", name!=\"\", container!=\"POD\"}[1m])) /\navg(count_over_time(container_memory_working_set_bytes{pod=~\"api-$api_name.+\", name!=\"\", container=\"api\"}[1m])) / 1024^2", + "expr": "sum(sum_over_time(container_memory_working_set_bytes{pod=~\"api-$api_name.+\"}[1m])) /\navg(count_over_time(container_memory_working_set_bytes{pod=~\"api-$api_name.+\"}[1m])) / 1024^2", "format": "time_series", "instant": false, "interval": "", @@ -1308,7 +1308,7 @@ data: }, { "exemplar": true, - "expr": "sum(kube_pod_container_resource_requests{exported_pod=~\"api-$api_name.+\", resource=\"memory\"}) / 1024^2", + "expr": "sum(kube_pod_container_resource_requests{pod=~\"api-$api_name.+\", resource=\"memory\"}) / 1024^2", "hide": false, "interval": "", "legendFormat": "Total Memory Request", @@ -1406,7 +1406,7 @@ data: "targets": [ { "exemplar": true, - "expr": "sum(DCGM_FI_DEV_GPU_UTIL{exported_pod=~\"api-$api_name.+\"}) / 100", + "expr": "sum(DCGM_FI_DEV_GPU_UTIL{pod=~\"api-$api_name.+\"}) / 100", "hide": false, "interval": "", "legendFormat": "Total GPU Usage", @@ -1414,7 +1414,7 @@ data: }, { "exemplar": true, - "expr": "count(DCGM_FI_DEV_GPU_UTIL{exported_pod=~\"api-$api_name.+\"})", + "expr": "count(DCGM_FI_DEV_GPU_UTIL{pod=~\"api-$api_name.+\"})", "hide": false, "interval": "", "legendFormat": "Total GPU Capacity", @@ -1511,7 +1511,7 @@ data: "targets": [ { "exemplar": true, - "expr": "sum(DCGM_FI_DEV_FB_USED{exported_pod=~\"api-$api_name.+\"})", + "expr": "sum(DCGM_FI_DEV_FB_USED{pod=~\"api-$api_name.+\"})", "hide": false, "interval": "", "legendFormat": "Total Used GPU Memory", @@ -1519,7 +1519,7 @@ data: }, { "exemplar": false, - "expr": "sum(DCGM_FI_DEV_FB_FREE{exported_pod=~\"api-$api_name.+\"}) + sum(DCGM_FI_DEV_FB_USED{exported_pod=~\"api-$api_name.+\"})", + "expr": "sum(DCGM_FI_DEV_FB_FREE{pod=~\"api-$api_name.+\"}) + 
sum(DCGM_FI_DEV_FB_USED{pod=~\"api-$api_name.+\"})", "format": "time_series", "instant": false, "interval": "", @@ -1633,7 +1633,7 @@ data: "targets": [ { "exemplar": false, - "expr": "sum(rate(container_cpu_usage_seconds_total{pod=~\"api-$api_name.+\", container!=\"POD\", name!=\"\"}[1m]))\n/\nsum(kube_pod_info{exported_pod=~\"api-$api_name.+\"})", + "expr": "sum(rate(container_cpu_usage_seconds_total{pod=~\"api-$api_name.+\"}[1m]))\n/\nsum(kube_pod_info{pod=~\"api-$api_name.+\"})", "format": "time_series", "instant": false, "interval": "", @@ -1642,7 +1642,7 @@ data: }, { "exemplar": true, - "expr": "sum(kube_pod_container_resource_requests{exported_pod=~\"api-$api_name.+\", resource=\"cpu\"})\n/\nsum(kube_pod_info{exported_pod=~\"api-$api_name.+\"})", + "expr": "sum(kube_pod_container_resource_requests{pod=~\"api-$api_name.+\", resource=\"cpu\"})\n/\nsum(kube_pod_info{pod=~\"api-$api_name.+\"})", "hide": false, "interval": "", "legendFormat": "Avg CPU Request", @@ -1741,7 +1741,7 @@ data: "targets": [ { "exemplar": false, - "expr": "sum(sum_over_time(container_memory_working_set_bytes{pod=~\"api-$api_name.+\", name!=\"\", container!=\"POD\"}[1m]))\n/\navg(count_over_time(container_memory_working_set_bytes{pod=~\"api-$api_name.+\", name!=\"\", container=\"api\"}[1m])) / 1024^2\n/\nsum(kube_pod_info{exported_pod=~\"api-$api_name.+\"})", + "expr": "sum(sum_over_time(container_memory_working_set_bytes{pod=~\"api-$api_name.+\"}[1m]))\n/\navg(count_over_time(container_memory_working_set_bytes{pod=~\"api-$api_name.+\"}[1m])) / 1024^2\n/\nsum(kube_pod_info{pod=~\"api-$api_name.+\"})", "format": "time_series", "instant": false, "interval": "", @@ -1750,7 +1750,7 @@ data: }, { "exemplar": true, - "expr": "sum(kube_pod_container_resource_requests{exported_pod=~\"api-$api_name.+\", resource=\"memory\"}) / 1024^2\n/\nsum(kube_pod_info{exported_pod=~\"api-$api_name.+\"})", + "expr": 
"sum(kube_pod_container_resource_requests{pod=~\"api-$api_name.+\", resource=\"memory\"}) / 1024^2\n/\nsum(kube_pod_info{pod=~\"api-$api_name.+\"})", "hide": false, "interval": "", "legendFormat": "Avg Memory Request", @@ -1848,14 +1848,14 @@ data: "steppedLine": false, "targets": [ { - "expr": "sum(DCGM_FI_DEV_GPU_UTIL{exported_pod=~\"api-$api_name.+\"}) / 100\n/\ncount(DCGM_FI_DEV_GPU_UTIL{exported_pod=~\"api-$api_name.+\"})", + "expr": "sum(DCGM_FI_DEV_GPU_UTIL{pod=~\"api-$api_name.+\"}) / 100\n/\ncount(DCGM_FI_DEV_GPU_UTIL{pod=~\"api-$api_name.+\"})", "hide": false, "interval": "", "legendFormat": "Avg GPU Usage", "refId": "GPU Usage" }, { - "expr": "count(DCGM_FI_DEV_GPU_UTIL{exported_pod=~\"api-$api_name.+\"})\n/\ncount(count(DCGM_FI_DEV_GPU_UTIL{exported_pod=~\"api-$api_name.+\"}) by (exported_pod))", + "expr": "count(DCGM_FI_DEV_GPU_UTIL{pod=~\"api-$api_name.+\"})\n/\ncount(count(DCGM_FI_DEV_GPU_UTIL{pod=~\"api-$api_name.+\"}) by (pod))", "hide": false, "interval": "", "legendFormat": "Avg GPU Capacity", @@ -1953,7 +1953,7 @@ data: "steppedLine": false, "targets": [ { - "expr": "sum(DCGM_FI_DEV_FB_USED{exported_pod=~\"api-$api_name.+\"})\n/\ncount(DCGM_FI_DEV_FB_USED{exported_pod=~\"api-$api_name.+\"})", + "expr": "sum(DCGM_FI_DEV_FB_USED{pod=~\"api-$api_name.+\"})\n/\ncount(DCGM_FI_DEV_FB_USED{pod=~\"api-$api_name.+\"})", "hide": false, "interval": "", "legendFormat": "Avg Used GPU Memory", @@ -1961,7 +1961,7 @@ data: }, { "exemplar": false, - "expr": "(sum(DCGM_FI_DEV_FB_FREE{exported_pod=~\"api-$api_name.+\"}) + sum(DCGM_FI_DEV_FB_USED{exported_pod=~\"api-$api_name.+\"}))\n/\ncount(DCGM_FI_DEV_FB_USED{exported_pod=~\"api-$api_name.+\"})", + "expr": "(sum(DCGM_FI_DEV_FB_FREE{pod=~\"api-$api_name.+\"}) + sum(DCGM_FI_DEV_FB_USED{pod=~\"api-$api_name.+\"}))\n/\ncount(DCGM_FI_DEV_FB_USED{pod=~\"api-$api_name.+\"})", "format": "time_series", "instant": false, "interval": "", diff --git a/manager/manifests/grafana/grafana-dashboard-task.yaml 
b/manager/manifests/grafana/grafana-dashboard-task.yaml index 1305d94407..070aaa1813 100644 --- a/manager/manifests/grafana/grafana-dashboard-task.yaml +++ b/manager/manifests/grafana/grafana-dashboard-task.yaml @@ -512,7 +512,7 @@ data: "targets": [ { "exemplar": false, - "expr": "sum(rate(container_cpu_usage_seconds_total{pod=~\"$api_name.+\", container!=\"POD\", name!=\"\"}[1m]))", + "expr": "sum(rate(container_cpu_usage_seconds_total{pod=~\"$api_name.+\"}[1m]))", "format": "time_series", "instant": false, "interval": "", @@ -521,7 +521,7 @@ data: }, { "exemplar": true, - "expr": "sum(kube_pod_container_resource_requests{exported_pod=~\"$api_name.+\", resource=\"cpu\"})", + "expr": "sum(kube_pod_container_resource_requests{pod=~\"$api_name.+\", resource=\"cpu\"})", "hide": false, "interval": "", "legendFormat": "Total CPU Request", @@ -618,7 +618,7 @@ data: "targets": [ { "exemplar": false, - "expr": "sum(sum_over_time(container_memory_working_set_bytes{pod=~\"$api_name.+\", name!=\"\", container!=\"\"}[1m]))\n/\navg(count_over_time(container_memory_working_set_bytes{pod=~\"$api_name.+\", name!=\"\", container!=\"\"}[1m])) / 1024^2", + "expr": "sum(sum_over_time(container_memory_working_set_bytes{pod=~\"$api_name.+\"}[1m]))\n/\navg(count_over_time(container_memory_working_set_bytes{pod=~\"$api_name.+\"}[1m])) / 1024^2", "format": "time_series", "instant": false, "interval": "", @@ -627,7 +627,7 @@ data: }, { "exemplar": true, - "expr": "sum(kube_pod_container_resource_requests{exported_pod=~\"$api_name.+\", resource=\"memory\"}) / 1024^2", + "expr": "sum(kube_pod_container_resource_requests{pod=~\"$api_name.+\", resource=\"memory\"}) / 1024^2", "hide": false, "interval": "", "legendFormat": "Total Memory Request", @@ -724,14 +724,14 @@ data: "steppedLine": false, "targets": [ { - "expr": "sum(DCGM_FI_DEV_GPU_UTIL{exported_pod=~\"$api_name.+\"}) / 100", + "expr": "sum(DCGM_FI_DEV_GPU_UTIL{pod=~\"$api_name.+\"}) / 100", "hide": false, "interval": "", 
"legendFormat": "Total GPU Usage", "refId": "GPU Usage" }, { - "expr": "count(DCGM_FI_DEV_GPU_UTIL{exported_pod=~\"$api_name.+\"})", + "expr": "count(DCGM_FI_DEV_GPU_UTIL{pod=~\"$api_name.+\"})", "hide": false, "interval": "", "legendFormat": "Total GPU Capacity", @@ -827,7 +827,7 @@ data: "steppedLine": false, "targets": [ { - "expr": "sum(DCGM_FI_DEV_FB_USED{exported_pod=~\"$api_name.+\"})", + "expr": "sum(DCGM_FI_DEV_FB_USED{pod=~\"$api_name.+\"})", "hide": false, "interval": "", "legendFormat": "Total Used GPU Memory", @@ -835,7 +835,7 @@ data: }, { "exemplar": false, - "expr": "sum(DCGM_FI_DEV_FB_FREE{exported_pod=~\"$api_name.+\"}) + sum(DCGM_FI_DEV_FB_USED{exported_pod=~\"$api_name.+\"})", + "expr": "sum(DCGM_FI_DEV_FB_FREE{pod=~\"$api_name.+\"}) + sum(DCGM_FI_DEV_FB_USED{pod=~\"$api_name.+\"})", "format": "time_series", "instant": false, "interval": "", @@ -949,7 +949,7 @@ data: "targets": [ { "exemplar": false, - "expr": "sum(rate(container_cpu_usage_seconds_total{pod=~\"$api_name.+\", container!=\"POD\", name!=\"\"}[1m]))\n/\nsum(kube_pod_info{exported_pod=~\"$api_name.+\"})", + "expr": "sum(rate(container_cpu_usage_seconds_total{pod=~\"$api_name.+\"}[1m]))\n/\nsum(kube_pod_info{pod=~\"$api_name.+\"})", "format": "time_series", "instant": false, "interval": "", @@ -958,7 +958,7 @@ data: }, { "exemplar": true, - "expr": "sum(kube_pod_container_resource_requests{exported_pod=~\"$api_name.+\", resource=\"cpu\"})\n/\nsum(kube_pod_info{exported_pod=~\"$api_name.+\"})", + "expr": "sum(kube_pod_container_resource_requests{pod=~\"$api_name.+\", resource=\"cpu\"})\n/\nsum(kube_pod_info{pod=~\"$api_name.+\"})", "hide": false, "interval": "", "legendFormat": "Avg CPU Request", @@ -1057,7 +1057,7 @@ data: "targets": [ { "exemplar": false, - "expr": "sum(sum_over_time(container_memory_working_set_bytes{pod=~\"$api_name.+\", name!=\"\", container!=\"\"}[1m]))\n/\navg(count_over_time(container_memory_working_set_bytes{pod=~\"$api_name.+\", name!=\"\", 
container!=\"\"}[1m])) / 1024^2\n/\nsum(kube_pod_info{exported_pod=~\"$api_name.+\"})", + "expr": "sum(sum_over_time(container_memory_working_set_bytes{pod=~\"$api_name.+\"}[1m]))\n/\navg(count_over_time(container_memory_working_set_bytes{pod=~\"$api_name.+\"}[1m])) / 1024^2\n/\nsum(kube_pod_info{pod=~\"$api_name.+\"})", "format": "time_series", "instant": false, "interval": "", @@ -1066,7 +1066,7 @@ data: }, { "exemplar": true, - "expr": "sum(kube_pod_container_resource{exported_pod=~\"$api_name.+\", resource=\"memory\"}) / 1024^2\n/\nsum(kube_pod_info{exported_pod=~\"$api_name.+\"})", + "expr": "sum(kube_pod_container_resource_requests{pod=~\"$api_name.+\", resource=\"memory\"}) / 1024^2\n/\nsum(kube_pod_info{pod=~\"$api_name.+\"})", "hide": false, "interval": "", "legendFormat": "Avg Memory Request", @@ -1165,7 +1165,7 @@ data: "steppedLine": false, "targets": [ { - "expr": "sum(DCGM_FI_DEV_GPU_UTIL{exported_pod=~\"$api_name.+\"}) / 100\n/\ncount(DCGM_FI_DEV_GPU_UTIL{exported_pod=~\"$api_name.+\"})", + "expr": "sum(DCGM_FI_DEV_GPU_UTIL{pod=~\"$api_name.+\"}) / 100\n/\ncount(DCGM_FI_DEV_GPU_UTIL{pod=~\"$api_name.+\"})", "hide": false, "instant": false, "interval": "", @@ -1173,7 +1173,7 @@ data: "refId": "GPU Usage" }, { - "expr": "count(DCGM_FI_DEV_GPU_UTIL{exported_pod=~\"$api_name.+\"})\n/\ncount(count(DCGM_FI_DEV_GPU_UTIL{exported_pod=~\"$api_name.+\"}) by (exported_pod))", + "expr": "count(DCGM_FI_DEV_GPU_UTIL{pod=~\"$api_name.+\"})\n/\ncount(count(DCGM_FI_DEV_GPU_UTIL{pod=~\"$api_name.+\"}) by (pod))", "hide": false, "instant": false, "interval": "", @@ -1272,7 +1272,7 @@ data: "steppedLine": false, "targets": [ { - "expr": "sum(DCGM_FI_DEV_FB_USED{exported_pod=~\"$api_name.+\"})\n/\ncount(DCGM_FI_DEV_FB_USED{exported_pod=~\"$api_name.+\"})", + "expr": "sum(DCGM_FI_DEV_FB_USED{pod=~\"$api_name.+\"})\n/\ncount(DCGM_FI_DEV_FB_USED{pod=~\"$api_name.+\"})", "hide": false, "interval": "", "legendFormat": "Avg Used GPU Memory", @@ -1280,7 +1280,7 @@ data: }, { "exemplar": 
false, - "expr": "(sum(DCGM_FI_DEV_FB_FREE{exported_pod=~\"$api_name.+\"}) + sum(DCGM_FI_DEV_FB_USED{exported_pod=~\"$api_name.+\"}))\n/\ncount(DCGM_FI_DEV_FB_USED{exported_pod=~\"$api_name.+\"})", + "expr": "(sum(DCGM_FI_DEV_FB_FREE{pod=~\"$api_name.+\"}) + sum(DCGM_FI_DEV_FB_USED{pod=~\"$api_name.+\"}))\n/\ncount(DCGM_FI_DEV_FB_USED{pod=~\"$api_name.+\"})", "format": "time_series", "instant": false, "interval": "", diff --git a/manager/manifests/prometheus-dcgm-exporter.yaml b/manager/manifests/prometheus-dcgm-exporter.yaml index 96d82a5644..d5f2c6a774 100644 --- a/manager/manifests/prometheus-dcgm-exporter.yaml +++ b/manager/manifests/prometheus-dcgm-exporter.yaml @@ -126,7 +126,12 @@ spec: FB_FREE\ )" - action: labelkeep - regex: (__name__|exported_pod) + regex: (__name__|pod) + relabelings: + - sourceLabels: + - exported_pod + action: replace + targetLabel: pod namespaceSelector: any: true selector: diff --git a/manager/manifests/prometheus-kube-state-metrics.yaml b/manager/manifests/prometheus-kube-state-metrics.yaml index 89da6c4842..8b1b1412ff 100644 --- a/manager/manifests/prometheus-kube-state-metrics.yaml +++ b/manager/manifests/prometheus-kube-state-metrics.yaml @@ -266,7 +266,16 @@ spec: job_status_active\ )" - action: labelkeep - regex: (__name__|exported_pod|exported_container|job_name|resource|deployment) + regex: (__name__|pod|job_name|resource|deployment) + - action: drop + regex: (node-exporter|aws-node|fluent-bit|kube-proxy)-(.+) + sourceLabels: + - pod + relabelings: + - sourceLabels: + - exported_pod + action: replace + targetLabel: pod namespaceSelector: any: true selector: diff --git a/manager/manifests/prometheus-kubelet-exporter.yaml b/manager/manifests/prometheus-kubelet-exporter.yaml index 87855746a0..ad8f9fe05a 100644 --- a/manager/manifests/prometheus-kubelet-exporter.yaml +++ b/manager/manifests/prometheus-kubelet-exporter.yaml @@ -49,6 +49,18 @@ spec: )" - action: labelkeep regex: (__name__|pod|container|name) + - action: keep + 
regex: () + sourceLabels: + - container + - action: keep + regex: () + sourceLabels: + - name + - action: drop + regex: (node-exporter|aws-node|fluent-bit|kube-proxy)-(.+) + sourceLabels: + - pod path: /metrics/cadvisor port: https-metrics relabelings: