Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Reduce prometheus memory usage by dropping more labels #2378

Open
wants to merge 7 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 5 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 5 additions & 10 deletions dev/prometheus.md
Original file line number Diff line number Diff line change
Expand Up @@ -51,21 +51,16 @@ The following is a list of metrics that are currently in use.
#### Kubelet metrics
1. container_cpu_usage_seconds_total with the following labels:
1. pod
1. container
1. name
1. container_memory_working_set_bytes with the following labels:
1. pod
1. name
1. container

#### Kube-state-metrics metrics

1. kube_pod_container_resource_requests with the following labels:
1. exported_pod
1. pod
1. resource
1. exported_container (required for not dropping the values for each container of each pod)
1. kube_pod_info with the following labels:
1. exported_pod
1. pod
1. kube_deployment_status_replicas_available with the following labels:
1. deployment
1. kube_job_status_active with the following labels:
Expand All @@ -74,11 +69,11 @@ The following is a list of metrics that are currently in use.
#### DCGM metrics

1. DCGM_FI_DEV_GPU_UTIL with the following labels:
1. exported_pod
1. pod
1. DCGM_FI_DEV_FB_USED with the following labels:
1. exported_pod
1. pod
1. DCGM_FI_DEV_FB_FREE with the following labels:
1. exported_pod
1. pod

#### Node metrics

Expand Down
32 changes: 16 additions & 16 deletions manager/manifests/grafana/grafana-dashboard-async.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -1086,7 +1086,7 @@ data:
"targets": [
{
"exemplar": false,
"expr": "sum(rate(container_cpu_usage_seconds_total{pod=~\"api-$api_name.+\", container!=\"POD\", name!=\"\"}[1m]))",
"expr": "sum(rate(container_cpu_usage_seconds_total{pod=~\"api-$api_name.+\"}[1m]))",
"format": "time_series",
"instant": false,
"interval": "",
Expand All @@ -1095,7 +1095,7 @@ data:
},
{
"exemplar": true,
"expr": "sum(kube_pod_container_resource_requests{exported_pod=~\"api-$api_name.+\", resource=\"cpu\"})",
"expr": "sum(kube_pod_container_resource_requests{pod=~\"api-$api_name.+\", resource=\"cpu\"})",
"hide": false,
"interval": "",
"legendFormat": "Total CPU Request",
Expand Down Expand Up @@ -1190,7 +1190,7 @@ data:
"targets": [
{
"exemplar": false,
"expr": "sum(sum_over_time(container_memory_working_set_bytes{pod=~\"api-$api_name.+\", name!=\"\", container!=\"POD\"}[1m])) /\navg(count_over_time(container_memory_working_set_bytes{pod=~\"api-$api_name.+\", name!=\"\", container=\"api\"}[1m])) / 1024^2",
"expr": "sum(sum_over_time(container_memory_working_set_bytes{pod=~\"api-$api_name.+\"}[1m])) /\navg(count_over_time(container_memory_working_set_bytes{pod=~\"api-$api_name.+\", container=\"api\"}[1m])) / 1024^2",
"format": "time_series",
"instant": false,
"interval": "",
Expand All @@ -1199,7 +1199,7 @@ data:
},
{
"exemplar": true,
"expr": "sum(kube_pod_container_resource_requests{exported_pod=~\"api-$api_name.+\", resource=\"memory\"}) / 1024^2",
"expr": "sum(kube_pod_container_resource_requests{pod=~\"api-$api_name.+\", resource=\"memory\"}) / 1024^2",
"hide": false,
"interval": "",
"legendFormat": "Total Memory Request",
Expand Down Expand Up @@ -1294,14 +1294,14 @@ data:
"steppedLine": false,
"targets": [
{
"expr": "sum(DCGM_FI_DEV_GPU_UTIL{exported_pod=~\"api-$api_name.+\"}) / 100",
"expr": "sum(DCGM_FI_DEV_GPU_UTIL{pod=~\"api-$api_name.+\"}) / 100",
"hide": false,
"interval": "",
"legendFormat": "Total GPU Usage",
"refId": "GPU Usage"
},
{
"expr": "count(DCGM_FI_DEV_GPU_UTIL{exported_pod=~\"api-$api_name.+\"})",
"expr": "count(DCGM_FI_DEV_GPU_UTIL{pod=~\"api-$api_name.+\"})",
"hide": false,
"interval": "",
"legendFormat": "Total GPU Capacity",
Expand Down Expand Up @@ -1395,15 +1395,15 @@ data:
"steppedLine": false,
"targets": [
{
"expr": "sum(DCGM_FI_DEV_FB_USED{exported_pod=~\"api-$api_name.+\"})",
"expr": "sum(DCGM_FI_DEV_FB_USED{pod=~\"api-$api_name.+\"})",
"hide": false,
"interval": "",
"legendFormat": "Total Used GPU Memory",
"refId": "GPU Used Memory"
},
{
"exemplar": false,
"expr": "sum(DCGM_FI_DEV_FB_FREE{exported_pod=~\"api-$api_name.+\"}) + sum(DCGM_FI_DEV_FB_USED{exported_pod=~\"api-$api_name.+\"})",
"expr": "sum(DCGM_FI_DEV_FB_FREE{pod=~\"api-$api_name.+\"}) + sum(DCGM_FI_DEV_FB_USED{pod=~\"api-$api_name.+\"})",
"format": "time_series",
"instant": false,
"interval": "",
Expand Down Expand Up @@ -1515,7 +1515,7 @@ data:
"targets": [
{
"exemplar": false,
"expr": "sum(rate(container_cpu_usage_seconds_total{pod=~\"api-$api_name.+\", container!=\"POD\", name!=\"\"}[1m]))\n/\nsum(kube_pod_info{exported_pod=~\"api-$api_name.+\"})",
"expr": "sum(rate(container_cpu_usage_seconds_total{pod=~\"api-$api_name.+\"}[1m]))\n/\nsum(kube_pod_info{pod=~\"api-$api_name.+\"})",
"format": "time_series",
"instant": false,
"interval": "",
Expand All @@ -1524,7 +1524,7 @@ data:
},
{
"exemplar": true,
"expr": "sum(kube_pod_container_resource_requests{exported_pod=~\"api-$api_name.+\", resource=\"cpu\"})\n/\nsum(kube_pod_info{exported_pod=~\"api-$api_name.+\"})",
"expr": "sum(kube_pod_container_resource_requests{pod=~\"api-$api_name.+\", resource=\"cpu\"})\n/\nsum(kube_pod_info{pod=~\"api-$api_name.+\"})",
"hide": false,
"interval": "",
"legendFormat": "Avg CPU Request",
Expand Down Expand Up @@ -1621,7 +1621,7 @@ data:
"targets": [
{
"exemplar": false,
"expr": "sum(sum_over_time(container_memory_working_set_bytes{pod=~\"api-$api_name.+\", name!=\"\", container!=\"POD\"}[1m]))\n/\navg(count_over_time(container_memory_working_set_bytes{pod=~\"api-$api_name.+\", name!=\"\", container=\"api\"}[1m])) / 1024^2\n/\nsum(kube_pod_info{exported_pod=~\"api-$api_name.+\"})",
"expr": "sum(sum_over_time(container_memory_working_set_bytes{pod=~\"api-$api_name.+\"}[1m]))\n/\navg(count_over_time(container_memory_working_set_bytes{pod=~\"api-$api_name.+\", container=\"api\"}[1m])) / 1024^2\n/\nsum(kube_pod_info{pod=~\"api-$api_name.+\"})",
"format": "time_series",
"instant": false,
"interval": "",
Expand All @@ -1630,7 +1630,7 @@ data:
},
{
"exemplar": true,
"expr": "sum(kube_pod_container_resource_requests{exported_pod=~\"api-$api_name.+\", resource=\"memory\"}) / 1024^2\n/\nsum(kube_pod_info{exported_pod=~\"api-$api_name.+\"})",
"expr": "sum(kube_pod_container_resource_requests{pod=~\"api-$api_name.+\", resource=\"memory\"}) / 1024^2\n/\nsum(kube_pod_info{pod=~\"api-$api_name.+\"})",
"hide": false,
"interval": "",
"legendFormat": "Avg Memory Request",
Expand Down Expand Up @@ -1726,14 +1726,14 @@ data:
"steppedLine": false,
"targets": [
{
"expr": "sum(DCGM_FI_DEV_GPU_UTIL{exported_pod=~\"api-$api_name.+\"}) / 100\n/\ncount(DCGM_FI_DEV_GPU_UTIL{exported_pod=~\"api-$api_name.+\"})",
"expr": "sum(DCGM_FI_DEV_GPU_UTIL{pod=~\"api-$api_name.+\"}) / 100\n/\ncount(DCGM_FI_DEV_GPU_UTIL{pod=~\"api-$api_name.+\"})",
"hide": false,
"interval": "",
"legendFormat": "Avg GPU Usage",
"refId": "GPU Usage"
},
{
"expr": "count(DCGM_FI_DEV_GPU_UTIL{exported_pod=~\"api-$api_name.+\"})\n/\ncount(count(DCGM_FI_DEV_GPU_UTIL{exported_pod=~\"api-$api_name.+\"}) by (exported_pod))",
"expr": "count(DCGM_FI_DEV_GPU_UTIL{pod=~\"api-$api_name.+\"})\n/\ncount(count(DCGM_FI_DEV_GPU_UTIL{pod=~\"api-$api_name.+\"}) by (pod))",
"hide": false,
"interval": "",
"legendFormat": "Avg GPU Capacity",
Expand Down Expand Up @@ -1829,15 +1829,15 @@ data:
"steppedLine": false,
"targets": [
{
"expr": "sum(DCGM_FI_DEV_FB_USED{exported_pod=~\"api-$api_name.+\"})\n/\ncount(DCGM_FI_DEV_FB_USED{exported_pod=~\"api-$api_name.+\"})",
"expr": "sum(DCGM_FI_DEV_FB_USED{pod=~\"api-$api_name.+\"})\n/\ncount(DCGM_FI_DEV_FB_USED{pod=~\"api-$api_name.+\"})",
"hide": false,
"interval": "",
"legendFormat": "Avg Used GPU Memory",
"refId": "GPU Used Memory"
},
{
"exemplar": false,
"expr": "(sum(DCGM_FI_DEV_FB_FREE{exported_pod=~\"api-$api_name.+\"}) + sum(DCGM_FI_DEV_FB_USED{exported_pod=~\"api-$api_name.+\"}))\n/\ncount(DCGM_FI_DEV_FB_USED{exported_pod=~\"api-$api_name.+\"})",
"expr": "(sum(DCGM_FI_DEV_FB_FREE{pod=~\"api-$api_name.+\"}) + sum(DCGM_FI_DEV_FB_USED{pod=~\"api-$api_name.+\"}))\n/\ncount(DCGM_FI_DEV_FB_USED{pod=~\"api-$api_name.+\"})",
"format": "time_series",
"instant": false,
"interval": "",
Expand Down
32 changes: 16 additions & 16 deletions manager/manifests/grafana/grafana-dashboard-batch.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -522,7 +522,7 @@ data:
"targets": [
{
"exemplar": false,
"expr": "sum(rate(container_cpu_usage_seconds_total{pod=~\"$api_name.+\", container!=\"POD\", name!=\"\"}[1m]))",
"expr": "sum(rate(container_cpu_usage_seconds_total{pod=~\"$api_name.+\"}[1m]))",
"format": "time_series",
"instant": false,
"interval": "",
Expand All @@ -531,7 +531,7 @@ data:
},
{
"exemplar": true,
"expr": "sum(kube_pod_container_resource_requests{exported_pod=~\"$api_name.+\", resource=\"cpu\"})",
"expr": "sum(kube_pod_container_resource_requests{pod=~\"$api_name.+\", resource=\"cpu\"})",
"hide": false,
"interval": "",
"legendFormat": "Total CPU Request",
Expand Down Expand Up @@ -628,7 +628,7 @@ data:
"targets": [
{
"exemplar": false,
"expr": "sum(sum_over_time(container_memory_working_set_bytes{pod=~\"$api_name.+\", name!=\"\", container!=\"POD\"}[1m]))\n/\navg(count_over_time(container_memory_working_set_bytes{pod=~\"$api_name.+\", name!=\"\", container=\"api\"}[1m])) / 1024^2",
"expr": "sum(sum_over_time(container_memory_working_set_bytes{pod=~\"$api_name.+\"}[1m]))\n/\navg(count_over_time(container_memory_working_set_bytes{pod=~\"$api_name.+\", container=\"api\"}[1m])) / 1024^2",
"format": "time_series",
"instant": false,
"interval": "",
Expand All @@ -637,7 +637,7 @@ data:
},
{
"exemplar": true,
"expr": "sum(kube_pod_container_resource_requests{exported_pod=~\"$api_name.+\", resource=\"memory\"}) / 1024^2",
"expr": "sum(kube_pod_container_resource_requests{pod=~\"$api_name.+\", resource=\"memory\"}) / 1024^2",
"hide": false,
"interval": "",
"legendFormat": "Total Memory Request",
Expand Down Expand Up @@ -734,14 +734,14 @@ data:
"steppedLine": false,
"targets": [
{
"expr": "sum(DCGM_FI_DEV_GPU_UTIL{exported_pod=~\"$api_name.+\"}) / 100",
"expr": "sum(DCGM_FI_DEV_GPU_UTIL{pod=~\"$api_name.+\"}) / 100",
"hide": false,
"interval": "",
"legendFormat": "Total GPU Usage",
"refId": "GPU Usage"
},
{
"expr": "count(DCGM_FI_DEV_GPU_UTIL{exported_pod=~\"$api_name.+\"})",
"expr": "count(DCGM_FI_DEV_GPU_UTIL{pod=~\"$api_name.+\"})",
"hide": false,
"interval": "",
"legendFormat": "Total GPU Capacity",
Expand Down Expand Up @@ -837,15 +837,15 @@ data:
"steppedLine": false,
"targets": [
{
"expr": "sum(DCGM_FI_DEV_FB_USED{exported_pod=~\"$api_name.+\"})",
"expr": "sum(DCGM_FI_DEV_FB_USED{pod=~\"$api_name.+\"})",
"hide": false,
"interval": "",
"legendFormat": "Total Used GPU Memory",
"refId": "GPU Used Memory"
},
{
"exemplar": false,
"expr": "sum(DCGM_FI_DEV_FB_FREE{exported_pod=~\"$api_name.+\"}) + sum(DCGM_FI_DEV_FB_USED{exported_pod=~\"$api_name.+\"})",
"expr": "sum(DCGM_FI_DEV_FB_FREE{pod=~\"$api_name.+\"}) + sum(DCGM_FI_DEV_FB_USED{pod=~\"$api_name.+\"})",
"format": "time_series",
"instant": false,
"interval": "",
Expand Down Expand Up @@ -963,7 +963,7 @@ data:
"targets": [
{
"exemplar": false,
"expr": "sum(rate(container_cpu_usage_seconds_total{pod=~\"$api_name.+\", container!=\"POD\", name!=\"\"}[1m]))\n/\nsum(kube_pod_info{exported_pod=~\"$api_name.+\"})",
"expr": "sum(rate(container_cpu_usage_seconds_total{pod=~\"$api_name.+\"}[1m]))\n/\nsum(kube_pod_info{pod=~\"$api_name.+\"})",
"format": "time_series",
"instant": false,
"interval": "",
Expand All @@ -972,7 +972,7 @@ data:
},
{
"exemplar": true,
"expr": "sum(kube_pod_container_resource_requests{exported_pod=~\"$api_name.+\", resource=\"cpu\"})\n/\nsum(kube_pod_info{exported_pod=~\"$api_name.+\"})",
"expr": "sum(kube_pod_container_resource_requests{pod=~\"$api_name.+\", resource=\"cpu\"})\n/\nsum(kube_pod_info{pod=~\"$api_name.+\"})",
"hide": false,
"interval": "",
"legendFormat": "Avg CPU Request",
Expand Down Expand Up @@ -1071,7 +1071,7 @@ data:
"targets": [
{
"exemplar": false,
"expr": "sum(sum_over_time(container_memory_working_set_bytes{pod=~\"$api_name.+\", name!=\"\", container!=\"POD\"}[1m]))\n/\navg(count_over_time(container_memory_working_set_bytes{pod=~\"$api_name.+\", name!=\"\", container=\"api\"}[1m])) / 1024^2\n/\nsum(kube_pod_info{exported_pod=~\"$api_name.+\"})",
"expr": "sum(sum_over_time(container_memory_working_set_bytes{pod=~\"$api_name.+\"}[1m]))\n/\navg(count_over_time(container_memory_working_set_bytes{pod=~\"$api_name.+\", container=\"api\"}[1m])) / 1024^2\n/\nsum(kube_pod_info{pod=~\"$api_name.+\"})",
"format": "time_series",
"instant": false,
"interval": "",
Expand All @@ -1080,7 +1080,7 @@ data:
},
{
"exemplar": true,
"expr": "sum(kube_pod_container_resource{exported_pod=~\"$api_name.+\", resource=\"memory\"}) / 1024^2\n/\nsum(kube_pod_info{exported_pod=~\"$api_name.+\"})",
"expr": "sum(kube_pod_container_resource_requests{pod=~\"$api_name.+\", resource=\"memory\"}) / 1024^2\n/\nsum(kube_pod_info{pod=~\"$api_name.+\"})",
"hide": false,
"interval": "",
"legendFormat": "Avg Memory Request",
Expand Down Expand Up @@ -1179,15 +1179,15 @@ data:
"steppedLine": false,
"targets": [
{
"expr": "sum(DCGM_FI_DEV_GPU_UTIL{exported_pod=~\"$api_name.+\"}) / 100\n/\ncount(DCGM_FI_DEV_GPU_UTIL{exported_pod=~\"$api_name.+\"})",
"expr": "sum(DCGM_FI_DEV_GPU_UTIL{pod=~\"$api_name.+\"}) / 100\n/\ncount(DCGM_FI_DEV_GPU_UTIL{pod=~\"$api_name.+\"})",
"hide": false,
"instant": false,
"interval": "",
"legendFormat": "Avg GPU Usage",
"refId": "GPU Usage"
},
{
"expr": "count(DCGM_FI_DEV_GPU_UTIL{exported_pod=~\"$api_name.+\"})\n/\ncount(count(DCGM_FI_DEV_GPU_UTIL{exported_pod=~\"$api_name.+\"}) by (exported_pod))",
"expr": "count(DCGM_FI_DEV_GPU_UTIL{pod=~\"$api_name.+\"})\n/\ncount(count(DCGM_FI_DEV_GPU_UTIL{pod=~\"$api_name.+\"}) by (pod))",
"hide": false,
"instant": false,
"interval": "",
Expand Down Expand Up @@ -1286,15 +1286,15 @@ data:
"steppedLine": false,
"targets": [
{
"expr": "sum(DCGM_FI_DEV_FB_USED{exported_pod=~\"$api_name.+\"})\n/\ncount(DCGM_FI_DEV_FB_USED{exported_pod=~\"$api_name.+\"})",
"expr": "sum(DCGM_FI_DEV_FB_USED{pod=~\"$api_name.+\"})\n/\ncount(DCGM_FI_DEV_FB_USED{pod=~\"$api_name.+\"})",
"hide": false,
"interval": "",
"legendFormat": "Avg Used GPU Memory",
"refId": "GPU Used Memory"
},
{
"exemplar": false,
"expr": "(sum(DCGM_FI_DEV_FB_FREE{exported_pod=~\"$api_name.+\"}) + sum(DCGM_FI_DEV_FB_USED{exported_pod=~\"$api_name.+\"}))\n/\ncount(DCGM_FI_DEV_FB_USED{exported_pod=~\"$api_name.+\"})",
"expr": "(sum(DCGM_FI_DEV_FB_FREE{pod=~\"$api_name.+\"}) + sum(DCGM_FI_DEV_FB_USED{pod=~\"$api_name.+\"}))\n/\ncount(DCGM_FI_DEV_FB_USED{pod=~\"$api_name.+\"})",
"format": "time_series",
"instant": false,
"interval": "",
Expand Down
2 changes: 1 addition & 1 deletion manager/manifests/grafana/grafana-dashboard-cluster.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -213,7 +213,7 @@ data:
"targets": [
{
"exemplar": true,
"expr": "sum(kube_pod_info{exported_pod!~\"(aws-node|grafana|autoscaler|cluster-autoscaler|coredns|event-exporter|fluent-bit|kube-proxy|k8s-neuron-scheduler|kube-state-metrics|metrics-server|node-exporter|operator|operator-controller-manager|prometheus-operator|prometheus-prometheus|prometheus-statsd-exporter|dcgm-exporter|ingressgateway|istiod|activator|enqueuer|gateway|nvidia-device-plugin-daemonset|neuron-device-plugin-daemonset)-(.+)\"})",
"expr": "sum(kube_pod_info{pod!~\"(aws-node|grafana|autoscaler|cluster-autoscaler|coredns|event-exporter|fluent-bit|kube-proxy|k8s-neuron-scheduler|kube-state-metrics|metrics-server|node-exporter|operator|operator-controller-manager|prometheus-operator|prometheus-prometheus|prometheus-statsd-exporter|dcgm-exporter|ingressgateway|istiod|activator|enqueuer|gateway|nvidia-device-plugin-daemonset|neuron-device-plugin-daemonset)-(.+)\"})",
"format": "time_series",
"interval": "",
"intervalFactor": 2,
Expand Down
Loading