diff --git a/internal/controller/applicationdisruptionbudget_controller.go b/internal/controller/applicationdisruptionbudget_controller.go index 83c78ff..d701ae8 100644 --- a/internal/controller/applicationdisruptionbudget_controller.go +++ b/internal/controller/applicationdisruptionbudget_controller.go @@ -24,6 +24,7 @@ import ( "io" "net/http" "reflect" + "strconv" "k8s.io/apimachinery/pkg/api/errors" @@ -76,7 +77,7 @@ func (r *ApplicationDisruptionBudgetReconciler) Reconcile(ctx context.Context, r return ctrl.Result{}, err } - logger.Info("Start reconcile of ADB", "version", adb.ResourceVersion) + logger.Info("Start reconcile of adb", "version", adb.ResourceVersion) resolver := ApplicationDisruptionBudgetResolver{ ApplicationDisruptionBudget: adb.DeepCopy(), @@ -107,7 +108,7 @@ func (r *ApplicationDisruptionBudgetReconciler) MapFuncBuilder() handler.MapFunc if err != nil { // We cannot return an error so at least it should be logged logger := log.FromContext(context.Background()) - logger.Error(err, "Could not list ADBs in watch function") + logger.Error(err, "Could not list adbs in watch function") return requests } @@ -231,6 +232,7 @@ func (r *ApplicationDisruptionBudgetResolver) CallHealthHook(ctx context.Context req, err := http.NewRequestWithContext(ctx, http.MethodPost, r.ApplicationDisruptionBudget.Spec.HealthHook.URL, bytes.NewReader(data)) if err != nil { + DisruptionBudgetCheckHealthHookErrorTotal.WithLabelValues(nd.Namespace, nd.Name, nd.Kind).Inc() return err } @@ -240,14 +242,18 @@ func (r *ApplicationDisruptionBudgetResolver) CallHealthHook(ctx context.Context resp, err := client.Do(req) if err != nil { + DisruptionBudgetCheckHealthHookErrorTotal.WithLabelValues(nd.Namespace, nd.Name, nd.Kind).Inc() return err } body, err := io.ReadAll(resp.Body) if err != nil { + DisruptionBudgetCheckHealthHookErrorTotal.WithLabelValues(nd.Namespace, nd.Name, nd.Kind).Inc() return err } + DisruptionBudgetCheckHealthHookStatusCodeTotal.WithLabelValues(nd.Namespace, nd.Name, nd.Kind, strconv.Itoa(resp.StatusCode)).Inc() + if resp.StatusCode >= 200 && resp.StatusCode < 300 { return nil } diff --git a/internal/controller/metrics.go b/internal/controller/metrics.go new file mode 100644 index 0000000..16361c6 --- /dev/null +++ b/internal/controller/metrics.go @@ -0,0 +1,82 @@ +package controller + +import "github.com/prometheus/client_golang/prometheus" + +const ( + METIC_PREFIX = "node_disruption_controller_" +) + +var ( + // NODE DISRUPTION METRICS + NodeDisruptionGrantedTotal = prometheus.NewCounterVec( + prometheus.CounterOpts{ + Name: METIC_PREFIX + "node_disruption_granted_total", + Help: "Total number of granted node disruptions", + }, + []string{}, + ) + NodeDisruptionRejectedTotal = prometheus.NewCounterVec( + prometheus.CounterOpts{ + Name: METIC_PREFIX + "node_disruption_rejected_total", + Help: "Total number of rejected node disruptions", + }, + []string{}, + ) + NodeDisruptionState = prometheus.NewGaugeVec( + prometheus.GaugeOpts{ + Name: METIC_PREFIX + "node_disruption_state", + Help: "State of node disruption: pending=0, rejected=-1, accepted=1", + }, + []string{"node_disruption_name"}, + ) + NodeDisruptionCreated = prometheus.NewGaugeVec( + prometheus.GaugeOpts{ + Name: METIC_PREFIX + "node_disruption_created", + Help: "Date of create of the node disruption", + }, + []string{"node_disruption_name"}, + ) + NodeDisruptionDeadline = prometheus.NewGaugeVec( + prometheus.GaugeOpts{ + Name: METIC_PREFIX + "node_disruption_deadline", + Help: "Date of the deadline of the node disruption (0 if unset)", + }, + []string{"node_disruption_name"}, + ) + NodeDisruptionImpactedNodes = prometheus.NewGaugeVec( + prometheus.GaugeOpts{ + Name: METIC_PREFIX + "node_disruption_impacted_node", + Help: "high cardinality: create a metric for each node impacted by a given node disruption", + }, + []string{"node_disruption_name", "node_name"}, + ) + // APPLICATION DISRUPTION BUDGET METRICS + DisruptionBudgetCheckHealthHookStatusCodeTotal = prometheus.NewCounterVec( + prometheus.CounterOpts{ + Name: METIC_PREFIX + "disruption_budget_health_hook_status_code_total", + Help: "Total number of request by HTTP status code", + }, + []string{"disruption_budget_namespace", "disruption_budget_name", "disruption_budget_kind", "status_code"}, + ) + DisruptionBudgetCheckHealthHookErrorTotal = prometheus.NewCounterVec( + prometheus.CounterOpts{ + Name: METIC_PREFIX + "disruption_budget_health_hook_error_total", + Help: "Total number of connection/response errors while requesting health hook", + }, + []string{"disruption_budget_namespace", "disruption_budget_name", "disruption_budget_kind"}, + ) + DisruptionBudgetRejectedTotal = prometheus.NewCounterVec( + prometheus.CounterOpts{ + Name: METIC_PREFIX + "disruption_budget_rejected_total", + Help: "Total number of rejected node disruption by the disruption budget", + }, + []string{"disruption_budget_namespace", "disruption_budget_name", "disruption_budget_kind"}, + ) + DisruptionBudgetGrantedTotal = prometheus.NewCounterVec( + prometheus.CounterOpts{ + Name: METIC_PREFIX + "disruption_budget_granted_total", + Help: "Total number of granted node disruption by the disruption budget", + }, + []string{"disruption_budget_namespace", "disruption_budget_name", "disruption_budget_kind"}, + ) +) diff --git a/internal/controller/nodedisruption_controller.go b/internal/controller/nodedisruption_controller.go index b046ff9..8fcc7cc 100644 --- a/internal/controller/nodedisruption_controller.go +++ b/internal/controller/nodedisruption_controller.go @@ -47,55 +47,6 @@ type NodeDisruptionReconcilerConfig struct { RejectOverlappingDisruption bool } -const ( - METIC_PREFIX = "node_disruption_controller_" -) - -var ( - NodeDisruptionGrantedTotal = prometheus.NewCounterVec( - prometheus.CounterOpts{ - Name: METIC_PREFIX + "node_disruption_granted_total", - Help: "Total number of granted node disruptions", - }, - []string{}, - ) - NodeDisruptionRejectedTotal = prometheus.NewCounterVec( - prometheus.CounterOpts{ - Name: METIC_PREFIX + "node_disruption_rejected_total", - Help: "Total number of rejected node disruptions", - }, - []string{}, - ) - NodeDisruptionState = prometheus.NewGaugeVec( - prometheus.GaugeOpts{ - Name: METIC_PREFIX + "node_disruption_state", - Help: "State of node disruption: pending=0, rejected=-1, accepted=1", - }, - []string{"node_disruption_name"}, - ) - NodeDisruptionCreated = prometheus.NewGaugeVec( - prometheus.GaugeOpts{ - Name: METIC_PREFIX + "node_disruption_created", - Help: "Date of create of the node disruption", - }, - []string{"node_disruption_name"}, - ) - NodeDisruptionDeadline = prometheus.NewGaugeVec( - prometheus.GaugeOpts{ - Name: METIC_PREFIX + "node_disruption_deadline", - Help: "Date of the deadline of the node disruption (0 if unset)", - }, - []string{"node_disruption_name"}, - ) - NodeDisruptionImpactedNodes = prometheus.NewGaugeVec( - prometheus.GaugeOpts{ - Name: METIC_PREFIX + "node_disruption_impacted_node", - Help: "high cardinality: create a metric for each node impacted by a given node disruption", - }, - []string{"node_disruption_name", "node_name"}, - ) -) - // NodeDisruptionReconciler reconciles NodeDisruptions type NodeDisruptionReconciler struct { client.Client @@ -380,13 +331,15 @@ func (ndr *SingleNodeDisruptionReconciler) ValidateWithBudgetConstraints(ctx con if !budget.TolerateDisruption(disruptedNodes) { anyFailed = true + ref := budget.GetNamespacedName() status := nodedisruptionv1alpha1.DisruptedBudgetStatus{ - Reference: budget.GetNamespacedName(), + Reference: ref, Reason: "No more disruption allowed", Ok: false, } statuses = append(statuses, status) logger.Info("Disruption rejected because: ", "status", status) + DisruptionBudgetRejectedTotal.WithLabelValues(ref.Namespace, ref.Name, ref.Kind).Inc() break } impactedBudgets = append(impactedBudgets, budget) @@ -398,15 +351,17 @@ func (ndr *SingleNodeDisruptionReconciler) ValidateWithBudgetConstraints(ctx con for _, budget := range impactedBudgets { err := budget.CheckHealth(ctx) + ref := budget.GetNamespacedName() if err != nil { anyFailed = true status := nodedisruptionv1alpha1.DisruptedBudgetStatus{ - Reference: budget.GetNamespacedName(), + Reference: ref, Reason: fmt.Sprintf("Unhealthy: %s", err), Ok: false, } statuses = append(statuses, status) logger.Info("Disruption rejected because: ", "status", status) + DisruptionBudgetRejectedTotal.WithLabelValues(ref.Namespace, ref.Name, ref.Kind).Inc() break } } @@ -417,17 +372,20 @@ func (ndr *SingleNodeDisruptionReconciler) ValidateWithBudgetConstraints(ctx con for _, budget := range impactedBudgets { err := budget.CallHealthHook(ctx, ndr.NodeDisruption) + ref := budget.GetNamespacedName() if err != nil { anyFailed = true status := nodedisruptionv1alpha1.DisruptedBudgetStatus{ - Reference: budget.GetNamespacedName(), + Reference: ref, Reason: fmt.Sprintf("Unhealthy: %s", err), Ok: false, } statuses = append(statuses, status) logger.Info("Disruption rejected because: ", "status", status) + DisruptionBudgetRejectedTotal.WithLabelValues(ref.Namespace, ref.Name, ref.Kind).Inc() break } + DisruptionBudgetGrantedTotal.WithLabelValues(ref.Namespace, ref.Name, ref.Kind).Inc() statuses = append(statuses, nodedisruptionv1alpha1.DisruptedBudgetStatus{ Reference: budget.GetNamespacedName(), Reason: "",