From 2624aa16be3bf5da2d76389a1d41c73527b3be2d Mon Sep 17 00:00:00 2001 From: Geoffrey Beausire Date: Fri, 5 Apr 2024 14:18:36 +0200 Subject: [PATCH] Fix bugs on metrics (#61) * Enable auto discovery of metrics Before they had to be specified in the main. With promauto, they can be defined in only one place * Add a metric to export state as label it is an alternative to using the value. * Fix leak of disruptions and nodes Over time the budget might target new nodes but we only cleaned nodes or disruption when a budget is deleted Now, we wipe the list of node/disruptions metrics before repopulating again --- cmd/main.go | 8 --- internal/controller/budget.go | 4 ++ internal/controller/metrics.go | 49 ++++++++++++------- .../controller/nodedisruption_controller.go | 14 +++++- 4 files changed, 46 insertions(+), 29 deletions(-) diff --git a/cmd/main.go b/cmd/main.go index 4047669..855801e 100644 --- a/cmd/main.go +++ b/cmd/main.go @@ -34,7 +34,6 @@ import ( nodedisruptionv1alpha1 "github.com/criteo/node-disruption-controller/api/v1alpha1" "github.com/criteo/node-disruption-controller/internal/controller" - "sigs.k8s.io/controller-runtime/pkg/metrics" //+kubebuilder:scaffold:imports ) @@ -48,13 +47,6 @@ func init() { utilruntime.Must(nodedisruptionv1alpha1.AddToScheme(scheme)) //+kubebuilder:scaffold:scheme - - metrics.Registry.MustRegister( - controller.NodeDisruptionState, - controller.NodeDisruptionCreated, - controller.NodeDisruptionDeadline, - controller.NodeDisruptionImpactedNodes, - ) } func main() { diff --git a/internal/controller/budget.go b/internal/controller/budget.go index f104ed0..67e11eb 100644 --- a/internal/controller/budget.go +++ b/internal/controller/budget.go @@ -38,9 +38,13 @@ func PruneBudgetStatusMetrics(ref nodedisruptionv1alpha1.NamespacedName) { } func UpdateBudgetStatusMetrics(ref nodedisruptionv1alpha1.NamespacedName, status nodedisruptionv1alpha1.DisruptionBudgetStatus) { + // delete before updating to avoid leaking metrics/nodes over time + DisruptionBudgetWatchedNodes.DeletePartialMatch(prometheus.Labels{"budget_disruption_namespace": ref.Namespace, "budget_disruption_name": ref.Name, "budget_disruption_kind": ref.Kind}) for _, node_name := range status.WatchedNodes { DisruptionBudgetWatchedNodes.WithLabelValues(ref.Namespace, ref.Name, ref.Kind, node_name).Set(1) } + // delete before updating to avoid leaking metrics/disruptions over time + DisruptionBudgetDisruptions.DeletePartialMatch(prometheus.Labels{"budget_disruption_namespace": ref.Namespace, "budget_disruption_name": ref.Name, "budget_disruption_kind": ref.Kind}) for _, disruption := range status.Disruptions { nd_state := 0 state := nodedisruptionv1alpha1.NodeDisruptionState(disruption.State) diff --git a/internal/controller/metrics.go b/internal/controller/metrics.go index 60213b7..80597be 100644 --- a/internal/controller/metrics.go +++ b/internal/controller/metrics.go @@ -1,6 +1,10 @@ package controller -import "github.com/prometheus/client_golang/prometheus" +import ( + "github.com/prometheus/client_golang/prometheus" + "github.com/prometheus/client_golang/prometheus/promauto" + "sigs.k8s.io/controller-runtime/pkg/metrics" +) const ( METIC_PREFIX = "node_disruption_controller_" @@ -8,42 +12,49 @@ const ( var ( // NODE DISRUPTION METRICS - NodeDisruptionGrantedTotal = prometheus.NewCounterVec( + NodeDisruptionGrantedTotal = promauto.With(metrics.Registry).NewCounterVec( prometheus.CounterOpts{ Name: METIC_PREFIX + "node_disruption_granted_total", Help: "Total number of granted node disruptions", }, []string{}, ) - NodeDisruptionRejectedTotal = prometheus.NewCounterVec( + NodeDisruptionRejectedTotal = promauto.With(metrics.Registry).NewCounterVec( prometheus.CounterOpts{ Name: METIC_PREFIX + "node_disruption_rejected_total", Help: "Total number of rejected node disruptions", }, []string{}, ) - NodeDisruptionState = prometheus.NewGaugeVec( + NodeDisruptionStateAsValue = promauto.With(metrics.Registry).NewGaugeVec( prometheus.GaugeOpts{ - Name: METIC_PREFIX + "node_disruption_state", + Name: METIC_PREFIX + "node_disruption_state_value", Help: "State of node disruption: pending=0, rejected=-1, accepted=1", }, []string{"node_disruption_name"}, ) - NodeDisruptionCreated = prometheus.NewGaugeVec( + NodeDisruptionStateAsLabel = promauto.With(metrics.Registry).NewGaugeVec( + prometheus.GaugeOpts{ + Name: METIC_PREFIX + "node_disruption_state_label", + Help: "State of node disruption: 0 not in this state; 1 is in state", + }, + []string{"node_disruption_name", "state"}, + ) + NodeDisruptionCreated = promauto.With(metrics.Registry).NewGaugeVec( prometheus.GaugeOpts{ Name: METIC_PREFIX + "node_disruption_created", Help: "Date of create of the node disruption", }, []string{"node_disruption_name"}, ) - NodeDisruptionDeadline = prometheus.NewGaugeVec( + NodeDisruptionDeadline = promauto.With(metrics.Registry).NewGaugeVec( prometheus.GaugeOpts{ Name: METIC_PREFIX + "node_disruption_deadline", Help: "Date of the deadline of the node disruption (0 if unset)", }, []string{"node_disruption_name"}, ) - NodeDisruptionImpactedNodes = prometheus.NewGaugeVec( + NodeDisruptionImpactedNodes = promauto.With(metrics.Registry).NewGaugeVec( prometheus.GaugeOpts{ Name: METIC_PREFIX + "node_disruption_impacted_node", Help: "high cardinality: create a metric for each node impacted by a given node disruption", @@ -51,77 +62,77 @@ var ( []string{"node_disruption_name", "node_name"}, ) // DISRUPTION BUDGET METRICS - DisruptionBudgetCheckHealthHookStatusCodeTotal = prometheus.NewCounterVec( + DisruptionBudgetCheckHealthHookStatusCodeTotal = promauto.With(metrics.Registry).NewCounterVec( prometheus.CounterOpts{ Name: METIC_PREFIX + "disruption_budget_health_hook_status_code_total", Help: "Total number of request by HTTP status code", }, []string{"disruption_budget_namespace", "disruption_budget_name", "disruption_budget_kind", "status_code"}, ) - DisruptionBudgetCheckHealthHookErrorTotal = prometheus.NewCounterVec( + DisruptionBudgetCheckHealthHookErrorTotal = promauto.With(metrics.Registry).NewCounterVec( prometheus.CounterOpts{ Name: METIC_PREFIX + "disruption_budget_health_hook_error_total", Help: "Total number of connection/response errors while requesting health hook", }, []string{"disruption_budget_namespace", "disruption_budget_name", "disruption_budget_kind"}, ) - DisruptionBudgetRejectedTotal = prometheus.NewCounterVec( + DisruptionBudgetRejectedTotal = promauto.With(metrics.Registry).NewCounterVec( prometheus.CounterOpts{ Name: METIC_PREFIX + "disruption_budget_rejected_total", Help: "Total number of rejected node disruption by the disruption budget", }, []string{"disruption_budget_namespace", "disruption_budget_name", "disruption_budget_kind"}, ) - DisruptionBudgetGrantedTotal = prometheus.NewCounterVec( + DisruptionBudgetGrantedTotal = promauto.With(metrics.Registry).NewCounterVec( prometheus.CounterOpts{ Name: METIC_PREFIX + "disruption_budget_granted_total", Help: "Total number of granted node disruption by the disruption budget", }, []string{"disruption_budget_namespace", "disruption_budget_name", "disruption_budget_kind"}, ) - DisruptionBudgetMaxDisruptions = prometheus.NewGaugeVec( + DisruptionBudgetMaxDisruptions = promauto.With(metrics.Registry).NewGaugeVec( prometheus.GaugeOpts{ Name: METIC_PREFIX + "disruption_budget_max_disruptions", Help: "Reflect the MaxDisruptions fields from budget Spec", }, []string{"disruption_budget_namespace", "disruption_budget_name", "disruption_budget_kind"}, ) - DisruptionBudgetCurrentDisruptions = prometheus.NewGaugeVec( + DisruptionBudgetCurrentDisruptions = promauto.With(metrics.Registry).NewGaugeVec( prometheus.GaugeOpts{ Name: METIC_PREFIX + "disruption_budget_current_disruptions", Help: "Reflect the CurrentDisruptions fields from budget Status", }, []string{"disruption_budget_namespace", "disruption_budget_name", "disruption_budget_kind"}, ) - DisruptionBudgetDisruptionsAllowed = prometheus.NewGaugeVec( + DisruptionBudgetDisruptionsAllowed = promauto.With(metrics.Registry).NewGaugeVec( prometheus.GaugeOpts{ Name: METIC_PREFIX + "disruption_budget_disruptions_allowed", Help: "Reflect the DisruptionsAllowed fields from budget Status", }, []string{"disruption_budget_namespace", "disruption_budget_name", "disruption_budget_kind"}, ) - DisruptionBudgetMaxDisruptedNodes = prometheus.NewGaugeVec( + DisruptionBudgetMaxDisruptedNodes = promauto.With(metrics.Registry).NewGaugeVec( prometheus.GaugeOpts{ Name: METIC_PREFIX + "disruption_budget_max_disrupted_nodes", Help: "Reflect the MaxDisruptedNodes fields from budget Spec", }, []string{"disruption_budget_namespace", "disruption_budget_name", "disruption_budget_kind"}, ) - DisruptionBudgetMinUndisruptedNodes = prometheus.NewGaugeVec( + DisruptionBudgetMinUndisruptedNodes = promauto.With(metrics.Registry).NewGaugeVec( prometheus.GaugeOpts{ Name: METIC_PREFIX + "disruption_budget_min_undisrupted_nodes", Help: "Reflect the MinUndisruptedNodes fields from budget Spec", }, []string{"disruption_budget_namespace", "disruption_budget_name", "disruption_budget_kind"}, ) - DisruptionBudgetWatchedNodes = prometheus.NewGaugeVec( + DisruptionBudgetWatchedNodes = promauto.With(metrics.Registry).NewGaugeVec( prometheus.GaugeOpts{ Name: METIC_PREFIX + "node_disruption_watched_nodes", Help: "high cardinality: create a metric for each node watched by a budget", }, []string{"disruption_budget_namespace", "disruption_budget_name", "disruption_budget_kind", "node_name"}, ) - DisruptionBudgetDisruptions = prometheus.NewGaugeVec( + DisruptionBudgetDisruptions = promauto.With(metrics.Registry).NewGaugeVec( prometheus.GaugeOpts{ Name: METIC_PREFIX + "budget_disruption_disruptions", Help: "high cardinality: create a metric for each disruption by a budget", diff --git a/internal/controller/nodedisruption_controller.go b/internal/controller/nodedisruption_controller.go index 36727d0..fa9da74 100644 --- a/internal/controller/nodedisruption_controller.go +++ b/internal/controller/nodedisruption_controller.go @@ -111,7 +111,8 @@ func (r *NodeDisruptionReconciler) Reconcile(ctx context.Context, req ctrl.Reque // PruneNodeDisruptionMetric remove metrics for a Node Disruption that don't exist anymore func PruneNodeDisruptionMetrics(nd_name string) { - NodeDisruptionState.DeletePartialMatch(prometheus.Labels{"node_disruption_name": nd_name}) + NodeDisruptionStateAsValue.DeletePartialMatch(prometheus.Labels{"node_disruption_name": nd_name}) + NodeDisruptionStateAsLabel.DeletePartialMatch(prometheus.Labels{"node_disruption_name": nd_name}) NodeDisruptionCreated.DeletePartialMatch(prometheus.Labels{"node_disruption_name": nd_name}) NodeDisruptionDeadline.DeletePartialMatch(prometheus.Labels{"node_disruption_name": nd_name}) NodeDisruptionImpactedNodes.DeletePartialMatch(prometheus.Labels{"node_disruption_name": nd_name}) @@ -122,12 +123,21 @@ func UpdateNodeDisruptionMetrics(nd *nodedisruptionv1alpha1.NodeDisruption) { nd_state := 0 if nd.Status.State == nodedisruptionv1alpha1.Pending { nd_state = 0 + NodeDisruptionStateAsLabel.WithLabelValues(nd.Name, string(nodedisruptionv1alpha1.Pending)).Set(1) + NodeDisruptionStateAsLabel.WithLabelValues(nd.Name, string(nodedisruptionv1alpha1.Granted)).Set(0) + NodeDisruptionStateAsLabel.WithLabelValues(nd.Name, string(nodedisruptionv1alpha1.Rejected)).Set(0) } else if nd.Status.State == nodedisruptionv1alpha1.Rejected { nd_state = -1 + NodeDisruptionStateAsLabel.WithLabelValues(nd.Name, string(nodedisruptionv1alpha1.Pending)).Set(0) + NodeDisruptionStateAsLabel.WithLabelValues(nd.Name, string(nodedisruptionv1alpha1.Rejected)).Set(1) + NodeDisruptionStateAsLabel.WithLabelValues(nd.Name, string(nodedisruptionv1alpha1.Granted)).Set(0) } else if nd.Status.State == nodedisruptionv1alpha1.Granted { nd_state = 1 + NodeDisruptionStateAsLabel.WithLabelValues(nd.Name, string(nodedisruptionv1alpha1.Pending)).Set(0) + NodeDisruptionStateAsLabel.WithLabelValues(nd.Name, string(nodedisruptionv1alpha1.Rejected)).Set(0) + NodeDisruptionStateAsLabel.WithLabelValues(nd.Name, string(nodedisruptionv1alpha1.Granted)).Set(1) } - NodeDisruptionState.WithLabelValues(nd.Name).Set(float64(nd_state)) + NodeDisruptionStateAsValue.WithLabelValues(nd.Name).Set(float64(nd_state)) NodeDisruptionCreated.WithLabelValues(nd.Name).Set(float64(nd.CreationTimestamp.Unix())) // Deadline might not be set so it will be 0 but timestamp in Go are not Unix epoch // so converting a 0 timestamp will not result in epoch 0. We override this to have nice values