Skip to content

Commit

Permalink
Add metrics for granted/rejected disruption budgets
Browse files Browse the repository at this point in the history
Co-authored-by: Fatma Bouzghaia <f.bouzghaia@criteo.com>
  • Loading branch information
geobeau and Fatma Bouzghaia committed Mar 27, 2024
1 parent 8c68d47 commit bca75cb
Show file tree
Hide file tree
Showing 3 changed files with 100 additions and 54 deletions.
10 changes: 8 additions & 2 deletions internal/controller/applicationdisruptionbudget_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ import (
"io"
"net/http"
"reflect"
"strconv"

"k8s.io/apimachinery/pkg/api/errors"

Expand Down Expand Up @@ -76,7 +77,7 @@ func (r *ApplicationDisruptionBudgetReconciler) Reconcile(ctx context.Context, r
return ctrl.Result{}, err
}

logger.Info("Start reconcile of ADB", "version", adb.ResourceVersion)
logger.Info("Start reconcile of adb", "version", adb.ResourceVersion)

resolver := ApplicationDisruptionBudgetResolver{
ApplicationDisruptionBudget: adb.DeepCopy(),
Expand Down Expand Up @@ -107,7 +108,7 @@ func (r *ApplicationDisruptionBudgetReconciler) MapFuncBuilder() handler.MapFunc
if err != nil {
// We cannot return an error so at least it should be logged
logger := log.FromContext(context.Background())
logger.Error(err, "Could not list ADBs in watch function")
logger.Error(err, "Could not list adbs in watch function")
return requests
}

Expand Down Expand Up @@ -231,6 +232,7 @@ func (r *ApplicationDisruptionBudgetResolver) CallHealthHook(ctx context.Context

req, err := http.NewRequestWithContext(ctx, http.MethodPost, r.ApplicationDisruptionBudget.Spec.HealthHook.URL, bytes.NewReader(data))
if err != nil {
DisruptionBudgetCheckHealthHookErrorTotal.WithLabelValues(nd.Namespace, nd.Name, nd.Kind).Inc()
return err
}

Expand All @@ -240,14 +242,18 @@ func (r *ApplicationDisruptionBudgetResolver) CallHealthHook(ctx context.Context

resp, err := client.Do(req)
if err != nil {
DisruptionBudgetCheckHealthHookErrorTotal.WithLabelValues(nd.Namespace, nd.Name, nd.Kind).Inc()
return err
}

body, err := io.ReadAll(resp.Body)
if err != nil {
DisruptionBudgetCheckHealthHookErrorTotal.WithLabelValues(nd.Namespace, nd.Name, nd.Kind).Inc()
return err
}

DisruptionBudgetCheckHealthHookStatusCodeTotal.WithLabelValues(nd.Namespace, nd.Name, nd.Kind, strconv.Itoa(resp.StatusCode)).Inc()

if resp.StatusCode >= 200 && resp.StatusCode < 300 {
return nil
}
Expand Down
82 changes: 82 additions & 0 deletions internal/controller/metrics.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,82 @@
package controller

import "github.com/prometheus/client_golang/prometheus"

const (
// METIC_PREFIX is the common prefix prepended to the name of each Prometheus
// metric declared in this package.
// NOTE(review): the identifier looks like a typo of "METRIC_PREFIX"; it is
// left unchanged here because it is exported and may be referenced elsewhere.
METIC_PREFIX = "node_disruption_controller_"
)

// Prometheus collectors exposed by the controller package. Every metric name
// is built from METIC_PREFIX followed by a metric-specific suffix.
var (
// NODE DISRUPTION METRICS

// NodeDisruptionGrantedTotal is a counter of granted node disruptions.
// It is declared without any label.
NodeDisruptionGrantedTotal = prometheus.NewCounterVec(
prometheus.CounterOpts{
Name: METIC_PREFIX + "node_disruption_granted_total",
Help: "Total number of granted node disruptions",
},
[]string{},
)
// NodeDisruptionRejectedTotal is a counter of rejected node disruptions.
// It is declared without any label.
NodeDisruptionRejectedTotal = prometheus.NewCounterVec(
prometheus.CounterOpts{
Name: METIC_PREFIX + "node_disruption_rejected_total",
Help: "Total number of rejected node disruptions",
},
[]string{},
)
// NodeDisruptionState is a gauge, labelled by node disruption name, that
// encodes the state of a node disruption as documented in its help text:
// pending=0, rejected=-1, accepted=1.
NodeDisruptionState = prometheus.NewGaugeVec(
prometheus.GaugeOpts{
Name: METIC_PREFIX + "node_disruption_state",
Help: "State of node disruption: pending=0, rejected=-1, accepted=1",
},
[]string{"node_disruption_name"},
)
// NodeDisruptionCreated is a gauge, labelled by node disruption name, holding
// the creation date of the node disruption.
// NOTE(review): the unit is not visible in this file — presumably a Unix
// timestamp; confirm where the gauge is set.
NodeDisruptionCreated = prometheus.NewGaugeVec(
prometheus.GaugeOpts{
Name: METIC_PREFIX + "node_disruption_created",
Help: "Date of create of the node disruption",
},
[]string{"node_disruption_name"},
)
// NodeDisruptionDeadline is a gauge, labelled by node disruption name, holding
// the deadline of the node disruption; per its help text it is 0 when no
// deadline is set.
// NOTE(review): the unit is not visible in this file — presumably a Unix
// timestamp; confirm where the gauge is set.
NodeDisruptionDeadline = prometheus.NewGaugeVec(
prometheus.GaugeOpts{
Name: METIC_PREFIX + "node_disruption_deadline",
Help: "Date of the deadline of the node disruption (0 if unset)",
},
[]string{"node_disruption_name"},
)
// NodeDisruptionImpactedNodes is a gauge labelled by both node disruption name
// and node name, i.e. one series per (disruption, node) pair. Its help text
// flags it as high cardinality.
NodeDisruptionImpactedNodes = prometheus.NewGaugeVec(
prometheus.GaugeOpts{
Name: METIC_PREFIX + "node_disruption_impacted_node",
Help: "high cardinality: create a metric for each node impacted by a given node disruption",
},
[]string{"node_disruption_name", "node_name"},
)
// APPLICATION DISRUPTION BUDGET METRICS

// DisruptionBudgetCheckHealthHookStatusCodeTotal is a counter of health hook
// requests, labelled by the budget's namespace, name and kind, and by the
// HTTP status code of the response.
DisruptionBudgetCheckHealthHookStatusCodeTotal = prometheus.NewCounterVec(
prometheus.CounterOpts{
Name: METIC_PREFIX + "disruption_budget_health_hook_status_code_total",
Help: "Total number of request by HTTP status code",
},
[]string{"disruption_budget_namespace", "disruption_budget_name", "disruption_budget_kind", "status_code"},
)
// DisruptionBudgetCheckHealthHookErrorTotal is a counter of connection or
// response errors encountered while requesting a health hook, labelled by the
// budget's namespace, name and kind.
DisruptionBudgetCheckHealthHookErrorTotal = prometheus.NewCounterVec(
prometheus.CounterOpts{
Name: METIC_PREFIX + "disruption_budget_health_hook_error_total",
Help: "Total number of connection/response errors while requesting health hook",
},
[]string{"disruption_budget_namespace", "disruption_budget_name", "disruption_budget_kind"},
)
// DisruptionBudgetRejectedTotal is a counter of node disruptions rejected by a
// disruption budget, labelled by the budget's namespace, name and kind.
DisruptionBudgetRejectedTotal = prometheus.NewCounterVec(
prometheus.CounterOpts{
Name: METIC_PREFIX + "disruption_budget_rejected_total",
Help: "Total number of rejected node disruption by the disruption budget",
},
[]string{"disruption_budget_namespace", "disruption_budget_name", "disruption_budget_kind"},
)
// DisruptionBudgetGrantedTotal is a counter of node disruptions granted by a
// disruption budget, labelled by the budget's namespace, name and kind.
DisruptionBudgetGrantedTotal = prometheus.NewCounterVec(
prometheus.CounterOpts{
Name: METIC_PREFIX + "disruption_budget_granted_total",
Help: "Total number of granted node disruption by the disruption budget",
},
[]string{"disruption_budget_namespace", "disruption_budget_name", "disruption_budget_kind"},
)
)
62 changes: 10 additions & 52 deletions internal/controller/nodedisruption_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -47,55 +47,6 @@ type NodeDisruptionReconcilerConfig struct {
RejectOverlappingDisruption bool
}

const (
// METIC_PREFIX is the common prefix prepended to the name of each Prometheus
// metric declared below.
// NOTE(review): the identifier looks like a typo of "METRIC_PREFIX"; it is
// left unchanged here because it is exported and may be referenced elsewhere.
METIC_PREFIX = "node_disruption_controller_"
)

// Prometheus collectors describing node disruptions. Every metric name is
// built from METIC_PREFIX followed by a metric-specific suffix.
var (
// NodeDisruptionGrantedTotal is a counter of granted node disruptions.
// It is declared without any label.
NodeDisruptionGrantedTotal = prometheus.NewCounterVec(
prometheus.CounterOpts{
Name: METIC_PREFIX + "node_disruption_granted_total",
Help: "Total number of granted node disruptions",
},
[]string{},
)
// NodeDisruptionRejectedTotal is a counter of rejected node disruptions.
// It is declared without any label.
NodeDisruptionRejectedTotal = prometheus.NewCounterVec(
prometheus.CounterOpts{
Name: METIC_PREFIX + "node_disruption_rejected_total",
Help: "Total number of rejected node disruptions",
},
[]string{},
)
// NodeDisruptionState is a gauge, labelled by node disruption name, that
// encodes the state of a node disruption as documented in its help text:
// pending=0, rejected=-1, accepted=1.
NodeDisruptionState = prometheus.NewGaugeVec(
prometheus.GaugeOpts{
Name: METIC_PREFIX + "node_disruption_state",
Help: "State of node disruption: pending=0, rejected=-1, accepted=1",
},
[]string{"node_disruption_name"},
)
// NodeDisruptionCreated is a gauge, labelled by node disruption name, holding
// the creation date of the node disruption.
// NOTE(review): the unit is not visible here — presumably a Unix timestamp;
// confirm where the gauge is set.
NodeDisruptionCreated = prometheus.NewGaugeVec(
prometheus.GaugeOpts{
Name: METIC_PREFIX + "node_disruption_created",
Help: "Date of create of the node disruption",
},
[]string{"node_disruption_name"},
)
// NodeDisruptionDeadline is a gauge, labelled by node disruption name, holding
// the deadline of the node disruption; per its help text it is 0 when no
// deadline is set.
// NOTE(review): the unit is not visible here — presumably a Unix timestamp;
// confirm where the gauge is set.
NodeDisruptionDeadline = prometheus.NewGaugeVec(
prometheus.GaugeOpts{
Name: METIC_PREFIX + "node_disruption_deadline",
Help: "Date of the deadline of the node disruption (0 if unset)",
},
[]string{"node_disruption_name"},
)
// NodeDisruptionImpactedNodes is a gauge labelled by both node disruption name
// and node name, i.e. one series per (disruption, node) pair. Its help text
// flags it as high cardinality.
NodeDisruptionImpactedNodes = prometheus.NewGaugeVec(
prometheus.GaugeOpts{
Name: METIC_PREFIX + "node_disruption_impacted_node",
Help: "high cardinality: create a metric for each node impacted by a given node disruption",
},
[]string{"node_disruption_name", "node_name"},
)
)

// NodeDisruptionReconciler reconciles NodeDisruptions
type NodeDisruptionReconciler struct {
client.Client
Expand Down Expand Up @@ -380,13 +331,15 @@ func (ndr *SingleNodeDisruptionReconciler) ValidateWithBudgetConstraints(ctx con

if !budget.TolerateDisruption(disruptedNodes) {
anyFailed = true
ref := budget.GetNamespacedName()
status := nodedisruptionv1alpha1.DisruptedBudgetStatus{
Reference: budget.GetNamespacedName(),
Reference: ref,
Reason: "No more disruption allowed",
Ok: false,
}
statuses = append(statuses, status)
logger.Info("Disruption rejected because: ", "status", status)
DisruptionBudgetRejectedTotal.WithLabelValues(ref.Namespace, ref.Name, ref.Kind).Inc()
break
}
impactedBudgets = append(impactedBudgets, budget)
Expand All @@ -398,15 +351,17 @@ func (ndr *SingleNodeDisruptionReconciler) ValidateWithBudgetConstraints(ctx con

for _, budget := range impactedBudgets {
err := budget.CheckHealth(ctx)
ref := budget.GetNamespacedName()
if err != nil {
anyFailed = true
status := nodedisruptionv1alpha1.DisruptedBudgetStatus{
Reference: budget.GetNamespacedName(),
Reference: ref,
Reason: fmt.Sprintf("Unhealthy: %s", err),
Ok: false,
}
statuses = append(statuses, status)
logger.Info("Disruption rejected because: ", "status", status)
DisruptionBudgetRejectedTotal.WithLabelValues(ref.Namespace, ref.Name, ref.Kind).Inc()
break
}
}
Expand All @@ -417,17 +372,20 @@ func (ndr *SingleNodeDisruptionReconciler) ValidateWithBudgetConstraints(ctx con

for _, budget := range impactedBudgets {
err := budget.CallHealthHook(ctx, ndr.NodeDisruption)
ref := budget.GetNamespacedName()
if err != nil {
anyFailed = true
status := nodedisruptionv1alpha1.DisruptedBudgetStatus{
Reference: budget.GetNamespacedName(),
Reference: ref,
Reason: fmt.Sprintf("Unhealthy: %s", err),
Ok: false,
}
statuses = append(statuses, status)
logger.Info("Disruption rejected because: ", "status", status)
DisruptionBudgetRejectedTotal.WithLabelValues(ref.Namespace, ref.Name, ref.Kind).Inc()
break
}
DisruptionBudgetGrantedTotal.WithLabelValues(ref.Namespace, ref.Name, ref.Kind).Inc()
statuses = append(statuses, nodedisruptionv1alpha1.DisruptedBudgetStatus{
Reference: budget.GetNamespacedName(),
Reason: "",
Expand Down

0 comments on commit bca75cb

Please sign in to comment.