Skip to content

Commit

Permalink
Add metrics to reflect the state of budgets
Browse files Browse the repository at this point in the history
  • Loading branch information
geobeau committed Mar 27, 2024
1 parent bca75cb commit 2846ae9
Show file tree
Hide file tree
Showing 5 changed files with 135 additions and 6 deletions.
20 changes: 20 additions & 0 deletions internal/controller/applicationdisruptionbudget_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,7 @@ import (

nodedisruptionv1alpha1 "github.com/criteo/node-disruption-controller/api/v1alpha1"
"github.com/criteo/node-disruption-controller/pkg/resolver"
"github.com/prometheus/client_golang/prometheus"
)

// ApplicationDisruptionBudgetReconciler reconciles a ApplicationDisruptionBudget object
Expand Down Expand Up @@ -68,15 +69,22 @@ func (r *ApplicationDisruptionBudgetReconciler) Reconcile(ctx context.Context, r
logger := log.FromContext(ctx)
adb := &nodedisruptionv1alpha1.ApplicationDisruptionBudget{}
err := r.Client.Get(ctx, req.NamespacedName, adb)
ref := nodedisruptionv1alpha1.NamespacedName{
Namespace: req.Namespace,
Name: req.Name,
Kind: "ApplicationDisruptionBudget",
}

if err != nil {
if errors.IsNotFound(err) {
// The resource no longer exists: drop its metrics, nothing else has to be done
PruneADBMetrics(ref)
return ctrl.Result{}, nil
}
return ctrl.Result{}, err
}

UpdateADBMetrics(ref, adb)
logger.Info("Start reconcile of adb", "version", adb.ResourceVersion)

resolver := ApplicationDisruptionBudgetResolver{
Expand All @@ -98,6 +106,18 @@ func (r *ApplicationDisruptionBudgetReconciler) Reconcile(ctx context.Context, r
return ctrl.Result{}, err
}

// PruneADBMetrics removes the max-disruptions gauge of an
// ApplicationDisruptionBudget that doesn't exist anymore, then delegates to
// PruneBudgetStatusMetrics for the metrics shared by every budget kind.
//
// ref identifies the budget by namespace, name and kind. The label names used
// for matching must be the variable label names the gauge vector is declared
// with ("disruption_budget_*"): DeletePartialMatch deletes nothing when it is
// given a label name the vector does not have.
func PruneADBMetrics(ref nodedisruptionv1alpha1.NamespacedName) {
	DisruptionBudgetMaxDisruptions.DeletePartialMatch(prometheus.Labels{
		"disruption_budget_namespace": ref.Namespace,
		"disruption_budget_name":      ref.Name,
		"disruption_budget_kind":      ref.Kind,
	})
	PruneBudgetStatusMetrics(ref)
}

// UpdateADBMetrics publishes the metrics of an ApplicationDisruptionBudget:
// the MaxDisruptions value from its Spec, then the gauges derived from its
// Status (via UpdateBudgetStatusMetrics).
func UpdateADBMetrics(ref nodedisruptionv1alpha1.NamespacedName, adb *nodedisruptionv1alpha1.ApplicationDisruptionBudget) {
	maxDisruptionsGauge := DisruptionBudgetMaxDisruptions.WithLabelValues(ref.Namespace, ref.Name, ref.Kind)
	maxDisruptionsGauge.Set(float64(adb.Spec.MaxDisruptions))

	UpdateBudgetStatusMetrics(ref, adb.Status)
}

// MapFuncBuilder returns a MapFunc that is used to dispatch reconcile requests to
// budgets when an event is triggered by one of their matching object
func (r *ApplicationDisruptionBudgetReconciler) MapFuncBuilder() handler.MapFunc {
Expand Down
34 changes: 34 additions & 0 deletions internal/controller/budget.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ import (

nodedisruptionv1alpha1 "github.com/criteo/node-disruption-controller/api/v1alpha1"
"github.com/criteo/node-disruption-controller/pkg/resolver"
"github.com/prometheus/client_golang/prometheus"
"sigs.k8s.io/controller-runtime/pkg/client"
)

Expand All @@ -25,6 +26,39 @@ type Budget interface {
GetNamespacedName() nodedisruptionv1alpha1.NamespacedName
}

// PruneBudgetStatusMetrics removes the status gauges and the counters exported
// for a disruption budget that doesn't exist anymore.
//
// ref identifies the budget by namespace, name and kind. Series are selected
// with a partial label match, so the per-node and per-disruption series of the
// budget are removed as well. The label names must be the variable label names
// the vectors are declared with ("disruption_budget_*"): DeletePartialMatch
// deletes nothing when it is given a label name the vector does not have.
func PruneBudgetStatusMetrics(ref nodedisruptionv1alpha1.NamespacedName) {
	budgetLabels := prometheus.Labels{
		"disruption_budget_namespace": ref.Namespace,
		"disruption_budget_name":      ref.Name,
		"disruption_budget_kind":      ref.Kind,
	}

	// Gauges reflecting the budget Status.
	DisruptionBudgetDisruptions.DeletePartialMatch(budgetLabels)
	DisruptionBudgetWatchedNodes.DeletePartialMatch(budgetLabels)
	DisruptionBudgetDisruptionsAllowed.DeletePartialMatch(budgetLabels)
	DisruptionBudgetCurrentDisruptions.DeletePartialMatch(budgetLabels)

	// Counters accumulated while the budget was being evaluated.
	DisruptionBudgetRejectedTotal.DeletePartialMatch(budgetLabels)
	DisruptionBudgetGrantedTotal.DeletePartialMatch(budgetLabels)
	DisruptionBudgetCheckHealthHookStatusCodeTotal.DeletePartialMatch(budgetLabels)
	DisruptionBudgetCheckHealthHookErrorTotal.DeletePartialMatch(budgetLabels)
}

// UpdateBudgetStatusMetrics refreshes the gauges derived from the Status of a
// disruption budget.
//
// ref identifies the budget by namespace, name and kind; status is the Status
// to publish. One series is exported per watched node (value 1) and one per
// disruption, whose value encodes the disruption state: -1 for Rejected, 1 for
// Granted and 0 for Pending or any other state. DisruptionsAllowed and
// CurrentDisruptions are exported as-is.
func UpdateBudgetStatusMetrics(ref nodedisruptionv1alpha1.NamespacedName, status nodedisruptionv1alpha1.DisruptionBudgetStatus) {
	budgetLabels := prometheus.Labels{
		"disruption_budget_namespace": ref.Namespace,
		"disruption_budget_name":      ref.Name,
		"disruption_budget_kind":      ref.Kind,
	}

	// The per-node and per-disruption series are rebuilt from scratch: without
	// the delete, a node or disruption that left the Status would keep being
	// exported with its last value.
	DisruptionBudgetWatchedNodes.DeletePartialMatch(budgetLabels)
	for _, nodeName := range status.WatchedNodes {
		DisruptionBudgetWatchedNodes.WithLabelValues(ref.Namespace, ref.Name, ref.Kind, nodeName).Set(1)
	}

	DisruptionBudgetDisruptions.DeletePartialMatch(budgetLabels)
	for _, disruption := range status.Disruptions {
		var stateValue float64
		switch nodedisruptionv1alpha1.NodeDisruptionState(disruption.State) {
		case nodedisruptionv1alpha1.Rejected:
			stateValue = -1
		case nodedisruptionv1alpha1.Granted:
			stateValue = 1
		default:
			// Pending, and any state this function does not know about.
			stateValue = 0
		}
		DisruptionBudgetDisruptions.WithLabelValues(ref.Namespace, ref.Name, ref.Kind, disruption.Name).Set(stateValue)
	}

	DisruptionBudgetDisruptionsAllowed.WithLabelValues(ref.Namespace, ref.Name, ref.Kind).Set(float64(status.DisruptionsAllowed))
	DisruptionBudgetCurrentDisruptions.WithLabelValues(ref.Namespace, ref.Name, ref.Kind).Set(float64(status.CurrentDisruptions))
}

// GetAllBudgetsInSync fetch all the budgets from Kubernetes and synchronise them
func GetAllBudgetsInSync(ctx context.Context, k8sClient client.Client) ([]Budget, error) {
opts := []client.ListOption{}
Expand Down
51 changes: 50 additions & 1 deletion internal/controller/metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,7 @@ var (
},
[]string{"node_disruption_name", "node_name"},
)
// APPLICATION DISRUPTION BUDGET METRICS
// DISRUPTION BUDGET METRICS
DisruptionBudgetCheckHealthHookStatusCodeTotal = prometheus.NewCounterVec(
prometheus.CounterOpts{
Name: METIC_PREFIX + "disruption_budget_health_hook_status_code_total",
Expand Down Expand Up @@ -79,4 +79,53 @@ var (
},
[]string{"disruption_budget_namespace", "disruption_budget_name", "disruption_budget_kind"},
)
DisruptionBudgetMaxDisruptions = prometheus.NewGaugeVec(
prometheus.GaugeOpts{
Name: METIC_PREFIX + "disruption_budget_max_disruptions",
Help: "Reflect the MaxDisruptions fields from budget Spec",
},
[]string{"disruption_budget_namespace", "disruption_budget_name", "disruption_budget_kind"},
)
DisruptionBudgetCurrentDisruptions = prometheus.NewGaugeVec(
prometheus.GaugeOpts{
Name: METIC_PREFIX + "disruption_budget_current_disruptions",
Help: "Reflect the CurrentDisruptions fields from budget Status",
},
[]string{"disruption_budget_namespace", "disruption_budget_name", "disruption_budget_kind"},
)
DisruptionBudgetDisruptionsAllowed = prometheus.NewGaugeVec(
prometheus.GaugeOpts{
Name: METIC_PREFIX + "disruption_budget_disruptions_allowed",
Help: "Reflect the DisruptionsAllowed fields from budget Status",
},
[]string{"disruption_budget_namespace", "disruption_budget_name", "disruption_budget_kind"},
)
DisruptionBudgetMaxDisruptedNodes = prometheus.NewGaugeVec(
prometheus.GaugeOpts{
Name: METIC_PREFIX + "disruption_budget_max_disrupted_nodes",
Help: "Reflect the MaxDisruptedNodes fields from budget Spec",
},
[]string{"disruption_budget_namespace", "disruption_budget_name", "disruption_budget_kind"},
)
DisruptionBudgetMinUndisruptedNodes = prometheus.NewGaugeVec(
prometheus.GaugeOpts{
Name: METIC_PREFIX + "disruption_budget_min_undisrupted_nodes",
Help: "Reflect the MinUndisruptedNodes fields from budget Spec",
},
[]string{"disruption_budget_namespace", "disruption_budget_name", "disruption_budget_kind"},
)
DisruptionBudgetWatchedNodes = prometheus.NewGaugeVec(
prometheus.GaugeOpts{
Name: METIC_PREFIX + "node_disruption_watched_nodes",
Help: "high cardinality: create a metric for each node watched by a budget",
},
[]string{"disruption_budget_namespace", "disruption_budget_name", "disruption_budget_kind", "node_name"},
)
DisruptionBudgetDisruptions = prometheus.NewGaugeVec(
prometheus.GaugeOpts{
Name: METIC_PREFIX + "budget_disruption_disruptions",
Help: "high cardinality: create a metric for each disruption by a budget",
},
[]string{"disruption_budget_namespace", "disruption_budget_name", "disruption_budget_kind", "node_disruption_name"},
)
)
9 changes: 5 additions & 4 deletions internal/controller/nodedisruption_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -73,14 +73,14 @@ func (r *NodeDisruptionReconciler) Reconcile(ctx context.Context, req ctrl.Reque

if err != nil {
if errors.IsNotFound(err) {
PruneNodeDisruptionMetric(req.NamespacedName.Name)
PruneNodeDisruptionMetrics(req.NamespacedName.Name)
// If the resource was not found, nothing has to be done
return clusterResult, nil
}
return clusterResult, err
}
logger.Info("Updating metrics")
UpdateNodeDisruptionMetric(nd)
UpdateNodeDisruptionMetrics(nd)

logger.Info("Start reconcile of NodeDisruption", "state", nd.Status.State, "retryDate", nd.Status.NextRetryDate.Time)
if time.Now().Before(nd.Status.NextRetryDate.Time) {
Expand Down Expand Up @@ -110,14 +110,15 @@ func (r *NodeDisruptionReconciler) Reconcile(ctx context.Context, req ctrl.Reque
}

// PruneNodeDisruptionMetrics removes metrics for a Node Disruption that doesn't exist anymore
func PruneNodeDisruptionMetric(nd_name string) {
func PruneNodeDisruptionMetrics(nd_name string) {
NodeDisruptionState.DeletePartialMatch(prometheus.Labels{"node_disruption_name": nd_name})
NodeDisruptionCreated.DeletePartialMatch(prometheus.Labels{"node_disruption_name": nd_name})
NodeDisruptionDeadline.DeletePartialMatch(prometheus.Labels{"node_disruption_name": nd_name})
NodeDisruptionImpactedNodes.DeletePartialMatch(prometheus.Labels{"node_disruption_name": nd_name})
}

func UpdateNodeDisruptionMetric(nd *nodedisruptionv1alpha1.NodeDisruption) {
// UpdateNodeDisruptionMetrics update metrics for a Node Disruption
func UpdateNodeDisruptionMetrics(nd *nodedisruptionv1alpha1.NodeDisruption) {
nd_state := 0
if nd.Status.State == nodedisruptionv1alpha1.Pending {
nd_state = 0
Expand Down
27 changes: 26 additions & 1 deletion internal/controller/nodedisruptionbudget_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@ import (

nodedisruptionv1alpha1 "github.com/criteo/node-disruption-controller/api/v1alpha1"
"github.com/criteo/node-disruption-controller/pkg/resolver"
"github.com/prometheus/client_golang/prometheus"
)

// NodeDisruptionBudgetReconciler reconciles a NodeDisruptionBudget object
Expand All @@ -57,17 +58,27 @@ type NodeDisruptionBudgetReconciler struct {
// For more details, check Reconcile and its Result here:
// - https://pkg.go.dev/sigs.k8s.io/controller-runtime@v0.15.0/pkg/reconcile
func (r *NodeDisruptionBudgetReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) {
logger := log.FromContext(ctx)
ndb := &nodedisruptionv1alpha1.NodeDisruptionBudget{}
err := r.Client.Get(ctx, req.NamespacedName, ndb)
ref := nodedisruptionv1alpha1.NamespacedName{
Namespace: req.Namespace,
Name: req.Name,
Kind: "NodeDisruptionBudget",
}

if err != nil {
if errors.IsNotFound(err) {
// The resource no longer exists: drop its metrics, nothing else has to be done
PruneNDBMetrics(ref)
return ctrl.Result{}, nil
}
return ctrl.Result{}, err
}

UpdateNDBMetrics(ref, ndb)
logger.Info("Start reconcile of NDB", "version", ndb.ResourceVersion)

resolver := NodeDisruptionBudgetResolver{
NodeDisruptionBudget: ndb.DeepCopy(),
Client: r.Client,
Expand All @@ -86,10 +97,24 @@ func (r *NodeDisruptionBudgetReconciler) Reconcile(ctx context.Context, req ctrl
return ctrl.Result{}, err
}

// PruneNDBMetrics removes the Spec gauges (max disrupted nodes and min
// undisrupted nodes) of a NodeDisruptionBudget that doesn't exist anymore, then
// delegates to PruneBudgetStatusMetrics for the metrics shared by every budget
// kind.
//
// ref identifies the budget by namespace, name and kind. The label names used
// for matching must be the variable label names the gauge vectors are declared
// with ("disruption_budget_*"): DeletePartialMatch deletes nothing when it is
// given a label name the vector does not have.
func PruneNDBMetrics(ref nodedisruptionv1alpha1.NamespacedName) {
	budgetLabels := prometheus.Labels{
		"disruption_budget_namespace": ref.Namespace,
		"disruption_budget_name":      ref.Name,
		"disruption_budget_kind":      ref.Kind,
	}

	DisruptionBudgetMaxDisruptedNodes.DeletePartialMatch(budgetLabels)
	DisruptionBudgetMinUndisruptedNodes.DeletePartialMatch(budgetLabels)
	PruneBudgetStatusMetrics(ref)
}

// UpdateNDBMetrics publishes the metrics of a NodeDisruptionBudget: the
// MaxDisruptedNodes and MinUndisruptedNodes values from its Spec, then the
// gauges derived from its Status (via UpdateBudgetStatusMetrics).
func UpdateNDBMetrics(ref nodedisruptionv1alpha1.NamespacedName, ndb *nodedisruptionv1alpha1.NodeDisruptionBudget) {
	maxDisruptedGauge := DisruptionBudgetMaxDisruptedNodes.WithLabelValues(ref.Namespace, ref.Name, ref.Kind)
	maxDisruptedGauge.Set(float64(ndb.Spec.MaxDisruptedNodes))

	minUndisruptedGauge := DisruptionBudgetMinUndisruptedNodes.WithLabelValues(ref.Namespace, ref.Name, ref.Kind)
	minUndisruptedGauge.Set(float64(ndb.Spec.MinUndisruptedNodes))

	UpdateBudgetStatusMetrics(ref, ndb.Status)
}

// MapFuncBuilder returns a MapFunc that is used to dispatch reconcile requests to
// budgets when an event is triggered by one of their matching object
func (r *NodeDisruptionBudgetReconciler) MapFuncBuilder() handler.MapFunc {
// Look for all ADBs in the namespace, then see if they match the object
// Look for all NDBs in the namespace, then see if they match the object
return func(ctx context.Context, object client.Object) (requests []reconcile.Request) {
ndbs := nodedisruptionv1alpha1.NodeDisruptionBudgetList{}
err := r.Client.List(ctx, &ndbs, &client.ListOptions{Namespace: object.GetNamespace()})
Expand Down

0 comments on commit 2846ae9

Please sign in to comment.