Skip to content

Commit

Permalink
Several metrics improvements (#54)
Browse files Browse the repository at this point in the history
* Fix incorrect state in log

We were logging the old version of the state; now
the state logged is the one being persisted to Kubernetes.

* Change the values of the node disruption state metric

-1 is more explicit for rejection

* Prefix metrics with the name of the project

Make it easier to filter and pull metrics in Prometheus

* Add metrics to get the total count of granted/rejected state

These metrics are incremented every time we grant or reject a node disruption.
  • Loading branch information
geobeau authored Mar 15, 2024
1 parent bc3ac72 commit 9ecf030
Showing 1 changed file with 32 additions and 9 deletions.
41 changes: 32 additions & 9 deletions internal/controller/nodedisruption_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -47,31 +47,49 @@ type NodeDisruptionReconcilerConfig struct {
RejectOverlappingDisruption bool
}

// METIC_PREFIX is the common prefix concatenated in front of the Prometheus
// metric names declared in this file, so that every metric exported by the
// controller starts with "node_disruption_controller_".
//
// NOTE(review): the identifier looks like a typo for METRIC_PREFIX, and Go
// convention prefers MixedCaps for constants. Renaming it requires updating
// every use of the constant in the same change.
const (
METIC_PREFIX = "node_disruption_controller_"
)

var (
NodeDisruptionGrantedTotal = prometheus.NewCounterVec(
prometheus.CounterOpts{
Name: METIC_PREFIX + "node_disruption_granted_total",
Help: "Total number of granted node disruptions",
},
[]string{},
)
NodeDisruptionRejectedTotal = prometheus.NewCounterVec(
prometheus.CounterOpts{
Name: METIC_PREFIX + "node_disruption_rejected_total",
Help: "Total number of rejected node disruptions",
},
[]string{},
)
NodeDisruptionState = prometheus.NewGaugeVec(
prometheus.GaugeOpts{
Name: "node_disruption_state",
Help: "State of node disruption: pending=0, rejected=1, accepted=2",
Name: METIC_PREFIX + "node_disruption_state",
Help: "State of node disruption: pending=0, rejected=-1, accepted=1",
},
[]string{"node_disruption_name"},
)
NodeDisruptionCreated = prometheus.NewGaugeVec(
prometheus.GaugeOpts{
Name: "node_disruption_created",
Name: METIC_PREFIX + "node_disruption_created",
Help: "Date of create of the node disruption",
},
[]string{"node_disruption_name"},
)
NodeDisruptionDeadline = prometheus.NewGaugeVec(
prometheus.GaugeOpts{
Name: "node_disruption_deadline",
Name: METIC_PREFIX + "node_disruption_deadline",
Help: "Date of the deadline of the node disruption (0 if unset)",
},
[]string{"node_disruption_name"},
)
NodeDisruptionImpactedNodes = prometheus.NewGaugeVec(
prometheus.GaugeOpts{
Name: "node_disruption_impacted_node",
Name: METIC_PREFIX + "node_disruption_impacted_node",
Help: "high cardinality: create a metric for each node impacted by a given node disruption",
},
[]string{"node_disruption_name", "node_name"},
Expand Down Expand Up @@ -133,10 +151,10 @@ func (r *NodeDisruptionReconciler) Reconcile(ctx context.Context, req ctrl.Reque
}

if !reflect.DeepEqual(nd.Status, reconciler.NodeDisruption.Status) {
logger.Info("Updating Status, done with", "state", nd.Status.State)
logger.Info("Updating Status, done with", "state", reconciler.NodeDisruption.Status.State)
return clusterResult, reconciler.UpdateStatus(ctx)
}
logger.Info("Reconciliation successful", "state", nd.Status.State)
logger.Info("Reconciliation successful", "state", reconciler.NodeDisruption.Status.State)
return clusterResult, nil
}

Expand All @@ -153,9 +171,9 @@ func UpdateNodeDisruptionMetric(nd *nodedisruptionv1alpha1.NodeDisruption) {
if nd.Status.State == nodedisruptionv1alpha1.Pending {
nd_state = 0
} else if nd.Status.State == nodedisruptionv1alpha1.Rejected {
nd_state = 1
nd_state = -1
} else if nd.Status.State == nodedisruptionv1alpha1.Granted {
nd_state = 2
nd_state = 1
}
NodeDisruptionState.WithLabelValues(nd.Name).Set(float64(nd_state))
NodeDisruptionCreated.WithLabelValues(nd.Name).Set(float64(nd.CreationTimestamp.Unix()))
Expand Down Expand Up @@ -214,6 +232,11 @@ func (ndr *SingleNodeDisruptionReconciler) TryTransitionState(ctx context.Contex
if err != nil {
return err
}
if ndr.NodeDisruption.Status.State == nodedisruptionv1alpha1.Granted {
NodeDisruptionGrantedTotal.WithLabelValues().Inc()
} else if ndr.NodeDisruption.Status.State == nodedisruptionv1alpha1.Rejected {
NodeDisruptionRejectedTotal.WithLabelValues().Inc()
}
}
// If the disruption is not Pending nor unknown, the state is final
return nil
Expand Down

0 comments on commit 9ecf030

Please sign in to comment.