From 2846ae914c57f9622653e6a6068548c4db4f2ead Mon Sep 17 00:00:00 2001 From: "g.beausire" Date: Wed, 27 Mar 2024 18:29:34 +0100 Subject: [PATCH] Add metrics to reflect the state of budgets --- .../applicationdisruptionbudget_controller.go | 20 ++++++++ internal/controller/budget.go | 34 +++++++++++++ internal/controller/metrics.go | 51 ++++++++++++++++++- .../controller/nodedisruption_controller.go | 9 ++-- .../nodedisruptionbudget_controller.go | 27 +++++++++- 5 files changed, 135 insertions(+), 6 deletions(-) diff --git a/internal/controller/applicationdisruptionbudget_controller.go b/internal/controller/applicationdisruptionbudget_controller.go index d701ae8..e1b5835 100644 --- a/internal/controller/applicationdisruptionbudget_controller.go +++ b/internal/controller/applicationdisruptionbudget_controller.go @@ -41,6 +41,7 @@ import ( nodedisruptionv1alpha1 "github.com/criteo/node-disruption-controller/api/v1alpha1" "github.com/criteo/node-disruption-controller/pkg/resolver" + "github.com/prometheus/client_golang/prometheus" ) // ApplicationDisruptionBudgetReconciler reconciles a ApplicationDisruptionBudget object @@ -68,15 +69,22 @@ func (r *ApplicationDisruptionBudgetReconciler) Reconcile(ctx context.Context, r logger := log.FromContext(ctx) adb := &nodedisruptionv1alpha1.ApplicationDisruptionBudget{} err := r.Client.Get(ctx, req.NamespacedName, adb) + ref := nodedisruptionv1alpha1.NamespacedName{ + Namespace: req.Namespace, + Name: req.Name, + Kind: "ApplicationDisruptionBudget", + } if err != nil { if errors.IsNotFound(err) { // If the resource was not found, nothing has to be done + PruneADBMetrics(ref) return ctrl.Result{}, nil } return ctrl.Result{}, err } + UpdateADBMetrics(ref, adb) logger.Info("Start reconcile of adb", "version", adb.ResourceVersion) resolver := ApplicationDisruptionBudgetResolver{ @@ -98,6 +106,18 @@ func (r *ApplicationDisruptionBudgetReconciler) Reconcile(ctx context.Context, r return ctrl.Result{}, err } +// 
PruneADBMetrics removes the metrics of an ADB that no longer exists
+func PruneADBMetrics(ref nodedisruptionv1alpha1.NamespacedName) {
+	DisruptionBudgetMaxDisruptions.DeletePartialMatch(prometheus.Labels{"disruption_budget_namespace": ref.Namespace, "disruption_budget_name": ref.Name, "disruption_budget_kind": ref.Kind})
+	PruneBudgetStatusMetrics(ref)
+}
+
+// UpdateADBMetrics updates the metrics of an ADB
+func UpdateADBMetrics(ref nodedisruptionv1alpha1.NamespacedName, adb *nodedisruptionv1alpha1.ApplicationDisruptionBudget) {
+	DisruptionBudgetMaxDisruptions.WithLabelValues(ref.Namespace, ref.Name, ref.Kind).Set(float64(adb.Spec.MaxDisruptions))
+	UpdateBudgetStatusMetrics(ref, adb.Status)
+}
+
 // MapFuncBuilder returns a MapFunc that is used to dispatch reconcile requests to
 // budgets when an event is triggered by one of their matching object
 func (r *ApplicationDisruptionBudgetReconciler) MapFuncBuilder() handler.MapFunc {
diff --git a/internal/controller/budget.go b/internal/controller/budget.go
index 0ee8cc6..4038226 100644
--- a/internal/controller/budget.go
+++ b/internal/controller/budget.go
@@ -5,6 +5,7 @@ import (
 
 	nodedisruptionv1alpha1 "github.com/criteo/node-disruption-controller/api/v1alpha1"
 	"github.com/criteo/node-disruption-controller/pkg/resolver"
+	"github.com/prometheus/client_golang/prometheus"
 	"sigs.k8s.io/controller-runtime/pkg/client"
 )
 
@@ -25,6 +26,39 @@ type Budget interface {
 	GetNamespacedName() nodedisruptionv1alpha1.NamespacedName
 }
 
+// PruneBudgetStatusMetrics removes the status metrics of a Disruption Budget that no longer exists
+func PruneBudgetStatusMetrics(ref nodedisruptionv1alpha1.NamespacedName) {
+	DisruptionBudgetDisruptions.DeletePartialMatch(prometheus.Labels{"disruption_budget_namespace": ref.Namespace, "disruption_budget_name": ref.Name, "disruption_budget_kind": ref.Kind})
+	DisruptionBudgetWatchedNodes.DeletePartialMatch(prometheus.Labels{"disruption_budget_namespace": ref.Namespace, "disruption_budget_name": ref.Name, "disruption_budget_kind": ref.Kind})
+	DisruptionBudgetDisruptionsAllowed.DeletePartialMatch(prometheus.Labels{"disruption_budget_namespace": ref.Namespace, "disruption_budget_name": ref.Name, "disruption_budget_kind": ref.Kind})
+	DisruptionBudgetCurrentDisruptions.DeletePartialMatch(prometheus.Labels{"disruption_budget_namespace": ref.Namespace, "disruption_budget_name": ref.Name, "disruption_budget_kind": ref.Kind})
+	DisruptionBudgetRejectedTotal.DeletePartialMatch(prometheus.Labels{"disruption_budget_namespace": ref.Namespace, "disruption_budget_name": ref.Name, "disruption_budget_kind": ref.Kind})
+	DisruptionBudgetGrantedTotal.DeletePartialMatch(prometheus.Labels{"disruption_budget_namespace": ref.Namespace, "disruption_budget_name": ref.Name, "disruption_budget_kind": ref.Kind})
+	DisruptionBudgetCheckHealthHookStatusCodeTotal.DeletePartialMatch(prometheus.Labels{"disruption_budget_namespace": ref.Namespace, "disruption_budget_name": ref.Name, "disruption_budget_kind": ref.Kind})
+	DisruptionBudgetCheckHealthHookErrorTotal.DeletePartialMatch(prometheus.Labels{"disruption_budget_namespace": ref.Namespace, "disruption_budget_name": ref.Name, "disruption_budget_kind": ref.Kind})
+}
+
+// UpdateBudgetStatusMetrics refreshes the status metrics of a Disruption Budget, first dropping the series of nodes and disruptions that left its status
+func UpdateBudgetStatusMetrics(ref nodedisruptionv1alpha1.NamespacedName, status nodedisruptionv1alpha1.DisruptionBudgetStatus) {
+	DisruptionBudgetWatchedNodes.DeletePartialMatch(prometheus.Labels{"disruption_budget_namespace": ref.Namespace, "disruption_budget_name": ref.Name, "disruption_budget_kind": ref.Kind})
+	DisruptionBudgetDisruptions.DeletePartialMatch(prometheus.Labels{"disruption_budget_namespace": ref.Namespace, "disruption_budget_name": ref.Name, "disruption_budget_kind": ref.Kind})
+	for _, node_name := range status.WatchedNodes {
+		DisruptionBudgetWatchedNodes.WithLabelValues(ref.Namespace, ref.Name, ref.Kind, node_name).Set(1)
+	}
+	for _, disruption := range status.Disruptions {
+		nd_state := 0
+		switch nodedisruptionv1alpha1.NodeDisruptionState(disruption.State) {
+		case nodedisruptionv1alpha1.Rejected:
+			nd_state = -1
+		case nodedisruptionv1alpha1.Granted:
+			nd_state = 1
+		}
+		DisruptionBudgetDisruptions.WithLabelValues(ref.Namespace, ref.Name, ref.Kind, disruption.Name).Set(float64(nd_state))
+	}
+	DisruptionBudgetDisruptionsAllowed.WithLabelValues(ref.Namespace, ref.Name, ref.Kind).Set(float64(status.DisruptionsAllowed))
+	DisruptionBudgetCurrentDisruptions.WithLabelValues(ref.Namespace, ref.Name, ref.Kind).Set(float64(status.CurrentDisruptions))
+}
+
 // GetAllBudgetsInSync fetch all the budgets from Kubernetes and synchronise them
 func GetAllBudgetsInSync(ctx context.Context, k8sClient client.Client) ([]Budget, error) {
 	opts := []client.ListOption{}
diff --git a/internal/controller/metrics.go b/internal/controller/metrics.go
index 16361c6..60213b7 100644
--- a/internal/controller/metrics.go
+++ b/internal/controller/metrics.go
@@ -50,7 +50,7 @@ var (
 		},
 		[]string{"node_disruption_name", "node_name"},
 	)
-	// APPLICATION DISRUPTION BUDGET METRICS
+	// DISRUPTION BUDGET METRICS
 	DisruptionBudgetCheckHealthHookStatusCodeTotal = prometheus.NewCounterVec(
 		prometheus.CounterOpts{
 			Name: METIC_PREFIX + "disruption_budget_health_hook_status_code_total",
@@ -79,4 +79,53 @@ var (
 		},
 		[]string{"disruption_budget_namespace", "disruption_budget_name", "disruption_budget_kind"},
 	)
+	DisruptionBudgetMaxDisruptions = prometheus.NewGaugeVec(
+		prometheus.GaugeOpts{
+			Name: METIC_PREFIX + "disruption_budget_max_disruptions",
+			Help: "Reflect the MaxDisruptions fields from budget Spec",
+		},
+		[]string{"disruption_budget_namespace", "disruption_budget_name", "disruption_budget_kind"},
+	)
+	DisruptionBudgetCurrentDisruptions = prometheus.NewGaugeVec(
+		prometheus.GaugeOpts{
+			Name: METIC_PREFIX + "disruption_budget_current_disruptions",
+			Help: "Reflect the CurrentDisruptions fields from budget Status",
+		},
+		[]string{"disruption_budget_namespace", "disruption_budget_name", "disruption_budget_kind"},
+	)
+	DisruptionBudgetDisruptionsAllowed = prometheus.NewGaugeVec(
+		prometheus.GaugeOpts{
+			Name: METIC_PREFIX + "disruption_budget_disruptions_allowed",
+			Help: "Reflect the DisruptionsAllowed fields from budget Status",
+		},
+		[]string{"disruption_budget_namespace", "disruption_budget_name", "disruption_budget_kind"},
+	)
+	DisruptionBudgetMaxDisruptedNodes = prometheus.NewGaugeVec(
+		prometheus.GaugeOpts{
+			Name: METIC_PREFIX + "disruption_budget_max_disrupted_nodes",
+			Help: "Reflect the MaxDisruptedNodes fields from budget Spec",
+		},
+		[]string{"disruption_budget_namespace", "disruption_budget_name", "disruption_budget_kind"},
+	)
+	DisruptionBudgetMinUndisruptedNodes = prometheus.NewGaugeVec(
+		prometheus.GaugeOpts{
+			Name: METIC_PREFIX + "disruption_budget_min_undisrupted_nodes",
+			Help: "Reflect the MinUndisruptedNodes fields from budget Spec",
+		},
+		[]string{"disruption_budget_namespace", "disruption_budget_name", "disruption_budget_kind"},
+	)
+	DisruptionBudgetWatchedNodes = prometheus.NewGaugeVec(
+		prometheus.GaugeOpts{
+			Name: METIC_PREFIX + "disruption_budget_watched_nodes",
+			Help: "high cardinality: create a metric for each node watched by a budget",
+		},
+		[]string{"disruption_budget_namespace", "disruption_budget_name", "disruption_budget_kind", "node_name"},
+	)
+	DisruptionBudgetDisruptions = prometheus.NewGaugeVec(
+		prometheus.GaugeOpts{
+			Name: METIC_PREFIX + "disruption_budget_disruptions",
+			Help: "high cardinality: create a metric for each disruption by a budget",
+		},
+		[]string{"disruption_budget_namespace", "disruption_budget_name", "disruption_budget_kind", "node_disruption_name"},
+	)
 )
diff --git a/internal/controller/nodedisruption_controller.go b/internal/controller/nodedisruption_controller.go
index 8fcc7cc..1260a1a 100644
--- a/internal/controller/nodedisruption_controller.go
+++ b/internal/controller/nodedisruption_controller.go
@@ -73,14 +73,14 @@ func (r *NodeDisruptionReconciler) Reconcile(ctx context.Context, req ctrl.Reque
 
 	if err != nil {
 		if errors.IsNotFound(err) {
-			PruneNodeDisruptionMetric(req.NamespacedName.Name)
+			PruneNodeDisruptionMetrics(req.NamespacedName.Name)
 			// If the ressource was not found, nothing has to be done
 			return clusterResult, nil
 		}
 		return clusterResult, err
 	}
 	logger.Info("Updating metrics")
-	UpdateNodeDisruptionMetric(nd)
+	UpdateNodeDisruptionMetrics(nd)
 
 	logger.Info("Start reconcile of NodeDisruption", "state", nd.Status.State, "retryDate", nd.Status.NextRetryDate.Time)
 	if time.Now().Before(nd.Status.NextRetryDate.Time) {
@@ -110,14 +110,15 @@ func (r *NodeDisruptionReconciler) Reconcile(ctx context.Context, req ctrl.Reque
 }
 
 // PruneNodeDisruptionMetric remove metrics for a Node Disruption that don't exist anymore
-func PruneNodeDisruptionMetric(nd_name string) {
+func PruneNodeDisruptionMetrics(nd_name string) {
 	NodeDisruptionState.DeletePartialMatch(prometheus.Labels{"node_disruption_name": nd_name})
 	NodeDisruptionCreated.DeletePartialMatch(prometheus.Labels{"node_disruption_name": nd_name})
 	NodeDisruptionDeadline.DeletePartialMatch(prometheus.Labels{"node_disruption_name": nd_name})
 	NodeDisruptionImpactedNodes.DeletePartialMatch(prometheus.Labels{"node_disruption_name": nd_name})
 }
 
-func UpdateNodeDisruptionMetric(nd *nodedisruptionv1alpha1.NodeDisruption) {
+// UpdateNodeDisruptionMetrics updates the metrics of a Node Disruption
+func UpdateNodeDisruptionMetrics(nd *nodedisruptionv1alpha1.NodeDisruption) {
 	nd_state := 0
 	if nd.Status.State == nodedisruptionv1alpha1.Pending {
 		nd_state = 0
diff --git a/internal/controller/nodedisruptionbudget_controller.go b/internal/controller/nodedisruptionbudget_controller.go
index 8c1ef60..59cf484 100644
--- a/internal/controller/nodedisruptionbudget_controller.go
+++ b/internal/controller/nodedisruptionbudget_controller.go
@@ -35,6 +35,7 @@ import (
 
 	nodedisruptionv1alpha1 "github.com/criteo/node-disruption-controller/api/v1alpha1"
 	"github.com/criteo/node-disruption-controller/pkg/resolver"
+	"github.com/prometheus/client_golang/prometheus"
 )
 
 // NodeDisruptionBudgetReconciler reconciles a NodeDisruptionBudget object
@@ -57,17 +58,27 @@ type NodeDisruptionBudgetReconciler struct {
 // For more details, check Reconcile and its Result here:
 // -
https://pkg.go.dev/sigs.k8s.io/controller-runtime@v0.15.0/pkg/reconcile
 func (r *NodeDisruptionBudgetReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) {
+	logger := log.FromContext(ctx)
 	ndb := &nodedisruptionv1alpha1.NodeDisruptionBudget{}
 	err := r.Client.Get(ctx, req.NamespacedName, ndb)
+	ref := nodedisruptionv1alpha1.NamespacedName{
+		Namespace: req.Namespace,
+		Name:      req.Name,
+		Kind:      "NodeDisruptionBudget",
+	}
 
 	if err != nil {
 		if errors.IsNotFound(err) {
 			// If the resource was not found, nothing has to be done
+			PruneNDBMetrics(ref)
 			return ctrl.Result{}, nil
 		}
 		return ctrl.Result{}, err
 	}
 
+	UpdateNDBMetrics(ref, ndb)
+	logger.Info("Start reconcile of NDB", "version", ndb.ResourceVersion)
+
 	resolver := NodeDisruptionBudgetResolver{
 		NodeDisruptionBudget: ndb.DeepCopy(),
 		Client:               r.Client,
@@ -86,10 +97,24 @@ func (r *NodeDisruptionBudgetReconciler) Reconcile(ctx context.Context, req ctrl
 	return ctrl.Result{}, err
 }
 
+// PruneNDBMetrics removes the metrics of an NDB that no longer exists
+func PruneNDBMetrics(ref nodedisruptionv1alpha1.NamespacedName) {
+	DisruptionBudgetMaxDisruptedNodes.DeletePartialMatch(prometheus.Labels{"disruption_budget_namespace": ref.Namespace, "disruption_budget_name": ref.Name, "disruption_budget_kind": ref.Kind})
+	DisruptionBudgetMinUndisruptedNodes.DeletePartialMatch(prometheus.Labels{"disruption_budget_namespace": ref.Namespace, "disruption_budget_name": ref.Name, "disruption_budget_kind": ref.Kind})
+	PruneBudgetStatusMetrics(ref)
+}
+
+// UpdateNDBMetrics updates the metrics of an NDB
+func UpdateNDBMetrics(ref nodedisruptionv1alpha1.NamespacedName, ndb *nodedisruptionv1alpha1.NodeDisruptionBudget) {
+	DisruptionBudgetMaxDisruptedNodes.WithLabelValues(ref.Namespace, ref.Name, ref.Kind).Set(float64(ndb.Spec.MaxDisruptedNodes))
+	DisruptionBudgetMinUndisruptedNodes.WithLabelValues(ref.Namespace, ref.Name, ref.Kind).Set(float64(ndb.Spec.MinUndisruptedNodes))
+	UpdateBudgetStatusMetrics(ref, ndb.Status)
+}
+
 // MapFuncBuilder returns a MapFunc that is used to dispatch reconcile requests to
 // budgets when an event is triggered by one of their matching object
 func (r *NodeDisruptionBudgetReconciler) MapFuncBuilder() handler.MapFunc {
-	// Look for all ADBs in the namespace, then see if they match the object
+	// Look for all NDBs in the namespace, then see if they match the object
 	return func(ctx context.Context, object client.Object) (requests []reconcile.Request) {
 		ndbs := nodedisruptionv1alpha1.NodeDisruptionBudgetList{}
 		err := r.Client.List(ctx, &ndbs, &client.ListOptions{Namespace: object.GetNamespace()})