Skip to content

Commit

Permalink
Fix bugs on metrics (#61)
Browse files Browse the repository at this point in the history
* Enable auto discovery of metrics

Before they had to be specified in the main. With
promauto, they can be defined in only one place

* Add a metric to export state as label

it is an alternative to using the value.

* Fix leak of disruptions and nodes

Over time the budget might target new nodes but we
only cleaned nodes or disruption when a budget is deleted
Now, we wipe the list of node/disruptions metrics before
repopulating again
  • Loading branch information
geobeau authored Apr 5, 2024
1 parent f6940f9 commit 2624aa1
Show file tree
Hide file tree
Showing 4 changed files with 46 additions and 29 deletions.
8 changes: 0 additions & 8 deletions cmd/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,6 @@ import (

nodedisruptionv1alpha1 "github.com/criteo/node-disruption-controller/api/v1alpha1"
"github.com/criteo/node-disruption-controller/internal/controller"
"sigs.k8s.io/controller-runtime/pkg/metrics"
//+kubebuilder:scaffold:imports
)

Expand All @@ -48,13 +47,6 @@ func init() {

utilruntime.Must(nodedisruptionv1alpha1.AddToScheme(scheme))
//+kubebuilder:scaffold:scheme

metrics.Registry.MustRegister(
controller.NodeDisruptionState,
controller.NodeDisruptionCreated,
controller.NodeDisruptionDeadline,
controller.NodeDisruptionImpactedNodes,
)
}

func main() {
Expand Down
4 changes: 4 additions & 0 deletions internal/controller/budget.go
Original file line number Diff line number Diff line change
Expand Up @@ -38,9 +38,13 @@ func PruneBudgetStatusMetrics(ref nodedisruptionv1alpha1.NamespacedName) {
}

func UpdateBudgetStatusMetrics(ref nodedisruptionv1alpha1.NamespacedName, status nodedisruptionv1alpha1.DisruptionBudgetStatus) {
// delete before updating to avoid leaking metrics/nodes over time
DisruptionBudgetWatchedNodes.DeletePartialMatch(prometheus.Labels{"budget_disruption_namespace": ref.Namespace, "budget_disruption_name": ref.Name, "budget_disruption_kind": ref.Kind})
for _, node_name := range status.WatchedNodes {
DisruptionBudgetWatchedNodes.WithLabelValues(ref.Namespace, ref.Name, ref.Kind, node_name).Set(1)
}
// delete before updating to avoid leaking metrics/disruptions over time
DisruptionBudgetDisruptions.DeletePartialMatch(prometheus.Labels{"budget_disruption_namespace": ref.Namespace, "budget_disruption_name": ref.Name, "budget_disruption_kind": ref.Kind})
for _, disruption := range status.Disruptions {
nd_state := 0
state := nodedisruptionv1alpha1.NodeDisruptionState(disruption.State)
Expand Down
49 changes: 30 additions & 19 deletions internal/controller/metrics.go
Original file line number Diff line number Diff line change
@@ -1,127 +1,138 @@
package controller

import "github.com/prometheus/client_golang/prometheus"
import (
"github.com/prometheus/client_golang/prometheus"
"github.com/prometheus/client_golang/prometheus/promauto"
"sigs.k8s.io/controller-runtime/pkg/metrics"
)

const (
METIC_PREFIX = "node_disruption_controller_"
)

var (
// NODE DISRUPTION METRICS
NodeDisruptionGrantedTotal = prometheus.NewCounterVec(
NodeDisruptionGrantedTotal = promauto.With(metrics.Registry).NewCounterVec(
prometheus.CounterOpts{
Name: METIC_PREFIX + "node_disruption_granted_total",
Help: "Total number of granted node disruptions",
},
[]string{},
)
NodeDisruptionRejectedTotal = prometheus.NewCounterVec(
NodeDisruptionRejectedTotal = promauto.With(metrics.Registry).NewCounterVec(
prometheus.CounterOpts{
Name: METIC_PREFIX + "node_disruption_rejected_total",
Help: "Total number of rejected node disruptions",
},
[]string{},
)
NodeDisruptionState = prometheus.NewGaugeVec(
NodeDisruptionStateAsValue = promauto.With(metrics.Registry).NewGaugeVec(
prometheus.GaugeOpts{
Name: METIC_PREFIX + "node_disruption_state",
Name: METIC_PREFIX + "node_disruption_state_value",
Help: "State of node disruption: pending=0, rejected=-1, accepted=1",
},
[]string{"node_disruption_name"},
)
NodeDisruptionCreated = prometheus.NewGaugeVec(
NodeDisruptionStateAsLabel = promauto.With(metrics.Registry).NewGaugeVec(
prometheus.GaugeOpts{
Name: METIC_PREFIX + "node_disruption_state_label",
Help: "State of node disruption: 0 not in this state; 1 is in state",
},
[]string{"node_disruption_name", "state"},
)
NodeDisruptionCreated = promauto.With(metrics.Registry).NewGaugeVec(
prometheus.GaugeOpts{
Name: METIC_PREFIX + "node_disruption_created",
Help: "Date of create of the node disruption",
},
[]string{"node_disruption_name"},
)
NodeDisruptionDeadline = prometheus.NewGaugeVec(
NodeDisruptionDeadline = promauto.With(metrics.Registry).NewGaugeVec(
prometheus.GaugeOpts{
Name: METIC_PREFIX + "node_disruption_deadline",
Help: "Date of the deadline of the node disruption (0 if unset)",
},
[]string{"node_disruption_name"},
)
NodeDisruptionImpactedNodes = prometheus.NewGaugeVec(
NodeDisruptionImpactedNodes = promauto.With(metrics.Registry).NewGaugeVec(
prometheus.GaugeOpts{
Name: METIC_PREFIX + "node_disruption_impacted_node",
Help: "high cardinality: create a metric for each node impacted by a given node disruption",
},
[]string{"node_disruption_name", "node_name"},
)
// DISRUPTION BUDGET METRICS
DisruptionBudgetCheckHealthHookStatusCodeTotal = prometheus.NewCounterVec(
DisruptionBudgetCheckHealthHookStatusCodeTotal = promauto.With(metrics.Registry).NewCounterVec(
prometheus.CounterOpts{
Name: METIC_PREFIX + "disruption_budget_health_hook_status_code_total",
Help: "Total number of request by HTTP status code",
},
[]string{"disruption_budget_namespace", "disruption_budget_name", "disruption_budget_kind", "status_code"},
)
DisruptionBudgetCheckHealthHookErrorTotal = prometheus.NewCounterVec(
DisruptionBudgetCheckHealthHookErrorTotal = promauto.With(metrics.Registry).NewCounterVec(
prometheus.CounterOpts{
Name: METIC_PREFIX + "disruption_budget_health_hook_error_total",
Help: "Total number of connection/response errors while requesting health hook",
},
[]string{"disruption_budget_namespace", "disruption_budget_name", "disruption_budget_kind"},
)
DisruptionBudgetRejectedTotal = prometheus.NewCounterVec(
DisruptionBudgetRejectedTotal = promauto.With(metrics.Registry).NewCounterVec(
prometheus.CounterOpts{
Name: METIC_PREFIX + "disruption_budget_rejected_total",
Help: "Total number of rejected node disruption by the disruption budget",
},
[]string{"disruption_budget_namespace", "disruption_budget_name", "disruption_budget_kind"},
)
DisruptionBudgetGrantedTotal = prometheus.NewCounterVec(
DisruptionBudgetGrantedTotal = promauto.With(metrics.Registry).NewCounterVec(
prometheus.CounterOpts{
Name: METIC_PREFIX + "disruption_budget_granted_total",
Help: "Total number of granted node disruption by the disruption budget",
},
[]string{"disruption_budget_namespace", "disruption_budget_name", "disruption_budget_kind"},
)
DisruptionBudgetMaxDisruptions = prometheus.NewGaugeVec(
DisruptionBudgetMaxDisruptions = promauto.With(metrics.Registry).NewGaugeVec(
prometheus.GaugeOpts{
Name: METIC_PREFIX + "disruption_budget_max_disruptions",
Help: "Reflect the MaxDisruptions fields from budget Spec",
},
[]string{"disruption_budget_namespace", "disruption_budget_name", "disruption_budget_kind"},
)
DisruptionBudgetCurrentDisruptions = prometheus.NewGaugeVec(
DisruptionBudgetCurrentDisruptions = promauto.With(metrics.Registry).NewGaugeVec(
prometheus.GaugeOpts{
Name: METIC_PREFIX + "disruption_budget_current_disruptions",
Help: "Reflect the CurrentDisruptions fields from budget Status",
},
[]string{"disruption_budget_namespace", "disruption_budget_name", "disruption_budget_kind"},
)
DisruptionBudgetDisruptionsAllowed = prometheus.NewGaugeVec(
DisruptionBudgetDisruptionsAllowed = promauto.With(metrics.Registry).NewGaugeVec(
prometheus.GaugeOpts{
Name: METIC_PREFIX + "disruption_budget_disruptions_allowed",
Help: "Reflect the DisruptionsAllowed fields from budget Status",
},
[]string{"disruption_budget_namespace", "disruption_budget_name", "disruption_budget_kind"},
)
DisruptionBudgetMaxDisruptedNodes = prometheus.NewGaugeVec(
DisruptionBudgetMaxDisruptedNodes = promauto.With(metrics.Registry).NewGaugeVec(
prometheus.GaugeOpts{
Name: METIC_PREFIX + "disruption_budget_max_disrupted_nodes",
Help: "Reflect the MaxDisruptedNodes fields from budget Spec",
},
[]string{"disruption_budget_namespace", "disruption_budget_name", "disruption_budget_kind"},
)
DisruptionBudgetMinUndisruptedNodes = prometheus.NewGaugeVec(
DisruptionBudgetMinUndisruptedNodes = promauto.With(metrics.Registry).NewGaugeVec(
prometheus.GaugeOpts{
Name: METIC_PREFIX + "disruption_budget_min_undisrupted_nodes",
Help: "Reflect the MinUndisruptedNodes fields from budget Spec",
},
[]string{"disruption_budget_namespace", "disruption_budget_name", "disruption_budget_kind"},
)
DisruptionBudgetWatchedNodes = prometheus.NewGaugeVec(
DisruptionBudgetWatchedNodes = promauto.With(metrics.Registry).NewGaugeVec(
prometheus.GaugeOpts{
Name: METIC_PREFIX + "node_disruption_watched_nodes",
Help: "high cardinality: create a metric for each node watched by a budget",
},
[]string{"disruption_budget_namespace", "disruption_budget_name", "disruption_budget_kind", "node_name"},
)
DisruptionBudgetDisruptions = prometheus.NewGaugeVec(
DisruptionBudgetDisruptions = promauto.With(metrics.Registry).NewGaugeVec(
prometheus.GaugeOpts{
Name: METIC_PREFIX + "budget_disruption_disruptions",
Help: "high cardinality: create a metric for each disruption by a budget",
Expand Down
14 changes: 12 additions & 2 deletions internal/controller/nodedisruption_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -111,7 +111,8 @@ func (r *NodeDisruptionReconciler) Reconcile(ctx context.Context, req ctrl.Reque

// PruneNodeDisruptionMetric remove metrics for a Node Disruption that don't exist anymore
func PruneNodeDisruptionMetrics(nd_name string) {
NodeDisruptionState.DeletePartialMatch(prometheus.Labels{"node_disruption_name": nd_name})
NodeDisruptionStateAsValue.DeletePartialMatch(prometheus.Labels{"node_disruption_name": nd_name})
NodeDisruptionStateAsLabel.DeletePartialMatch(prometheus.Labels{"node_disruption_name": nd_name})
NodeDisruptionCreated.DeletePartialMatch(prometheus.Labels{"node_disruption_name": nd_name})
NodeDisruptionDeadline.DeletePartialMatch(prometheus.Labels{"node_disruption_name": nd_name})
NodeDisruptionImpactedNodes.DeletePartialMatch(prometheus.Labels{"node_disruption_name": nd_name})
Expand All @@ -122,12 +123,21 @@ func UpdateNodeDisruptionMetrics(nd *nodedisruptionv1alpha1.NodeDisruption) {
nd_state := 0
if nd.Status.State == nodedisruptionv1alpha1.Pending {
nd_state = 0
NodeDisruptionStateAsLabel.WithLabelValues(nd.Name, string(nodedisruptionv1alpha1.Pending)).Set(1)
NodeDisruptionStateAsLabel.WithLabelValues(nd.Name, string(nodedisruptionv1alpha1.Granted)).Set(0)
NodeDisruptionStateAsLabel.WithLabelValues(nd.Name, string(nodedisruptionv1alpha1.Rejected)).Set(0)
} else if nd.Status.State == nodedisruptionv1alpha1.Rejected {
nd_state = -1
NodeDisruptionStateAsLabel.WithLabelValues(nd.Name, string(nodedisruptionv1alpha1.Pending)).Set(0)
NodeDisruptionStateAsLabel.WithLabelValues(nd.Name, string(nodedisruptionv1alpha1.Rejected)).Set(1)
NodeDisruptionStateAsLabel.WithLabelValues(nd.Name, string(nodedisruptionv1alpha1.Granted)).Set(0)
} else if nd.Status.State == nodedisruptionv1alpha1.Granted {
nd_state = 1
NodeDisruptionStateAsLabel.WithLabelValues(nd.Name, string(nodedisruptionv1alpha1.Pending)).Set(0)
NodeDisruptionStateAsLabel.WithLabelValues(nd.Name, string(nodedisruptionv1alpha1.Rejected)).Set(0)
NodeDisruptionStateAsLabel.WithLabelValues(nd.Name, string(nodedisruptionv1alpha1.Granted)).Set(1)
}
NodeDisruptionState.WithLabelValues(nd.Name).Set(float64(nd_state))
NodeDisruptionStateAsValue.WithLabelValues(nd.Name).Set(float64(nd_state))
NodeDisruptionCreated.WithLabelValues(nd.Name).Set(float64(nd.CreationTimestamp.Unix()))
// Deadline might not be set so it will be 0 but timestamp in Go are not Unix epoch
// so converting a 0 timestamp will not result in epoch 0. We override this to have nice values
Expand Down

0 comments on commit 2624aa1

Please sign in to comment.