From 4efbdb143d04cf0956be92d2122ff5b4bbc4b503 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Przemys=C5=82aw=20Golicz?= Date: Fri, 25 Oct 2024 08:55:06 +0200 Subject: [PATCH 1/4] Periodically update runtime state metrics --- cmd/main.go | 48 +++++++++++++++---- internal/controller/metrics/metrics.go | 14 +++--- internal/controller/metrics/mocks/Metrics.go | 6 +-- .../runtime/fsm/runtime_fsm_apply_crb_test.go | 2 +- .../fsm/runtime_fsm_create_kubeconfig_test.go | 2 +- .../runtime/fsm/runtime_fsm_initialise.go | 2 +- .../fsm/runtime_fsm_initialise_test.go | 2 +- .../fsm/runtime_fsm_persist_shoot_test.go | 2 +- internal/controller/runtime/suite_test.go | 2 +- 9 files changed, 55 insertions(+), 25 deletions(-) diff --git a/cmd/main.go b/cmd/main.go index 808cdc99..6a20c069 100644 --- a/cmd/main.go +++ b/cmd/main.go @@ -69,6 +69,7 @@ const defaultExpirationTime = 24 * time.Hour const defaultGardenerRequestTimeout = 60 * time.Second const defaultControlPlaneRequeueDuration = 10 * time.Second const defaultGardenerRequeueDuration = 15 * time.Second +const RuntimeMetricRefreshPeriod = 30 * time.Second func main() { var metricsAddr string @@ -183,15 +184,6 @@ func main() { os.Exit(1) } - // refresh runtime metrics - metrics.ResetRuntimeMetrics() - var runtimeList infrastructuremanagerv1.RuntimeList - if err = mgr.GetClient().List(context.TODO(), &runtimeList); err != nil { - for _, rt := range runtimeList.Items { - metrics.SetRuntimeStates(rt) - } - } - cfg := fsm.RCCfg{ GardenerRequeueDuration: defaultGardenerRequeueDuration, ControlPlaneRequeueDuration: defaultControlPlaneRequeueDuration, @@ -229,11 +221,30 @@ func main() { os.Exit(1) } + refreshRuntimeMetrics := func() { + logger.Info("Refreshing runtime CR metrics") + metrics.ResetRuntimeMetrics() + var RuntimeList infrastructuremanagerv1.RuntimeList + if err = mgr.GetClient().List(context.TODO(), &RuntimeList); err == nil { + for _, rt := range RuntimeList.Items { + metrics.SetRuntimeStates(rt) + } + } + } + + quitChannel := startRuntimeMetricsRefresher(refreshRuntimeMetrics) + defer func() { + logger.Info("Stopping metric refresh process goroutine") + if quitChannel != nil { + quitChannel <- true + } + }() + setupLog.Info("Starting Manager", "kubeconfigExpirationTime", expirationTime, "kubeconfigRotationPeriod", rotationPeriod) if err := mgr.Start(ctrl.SetupSignalHandler()); err != nil { setupLog.Error(err, "problem running manager") - os.Exit(1) + os.Exit(1) //nolint:gocritic } } @@ -313,3 +324,20 @@ func validateAuditLogDataMap(data map[string]map[string]auditlogging.AuditLogDat return nil } + +func startRuntimeMetricsRefresher(refreshFunc func()) chan bool { + quitChannel := make(chan bool) + go func() { + for { + time.Sleep(RuntimeMetricRefreshPeriod) + select { + case <-quitChannel: + // println("Received signal to stop metric refresh process") + return + default: + refreshFunc() + } + } + }() + return quitChannel +} diff --git a/internal/controller/metrics/metrics.go b/internal/controller/metrics/metrics.go index 4daa325c..63c64b3e 100644 --- a/internal/controller/metrics/metrics.go +++ b/internal/controller/metrics/metrics.go @@ -12,6 +12,7 @@ import ( const ( runtimeIDKeyName = "runtimeId" + runtimeNameKeyName = "runtimeName" shootNameIDKeyName = "shootName" rotationDuration = "rotationDuration" expirationDuration = "expirationDuration" @@ -33,7 +34,7 @@ const ( //go:generate mockery --name=Metrics type Metrics interface { SetRuntimeStates(runtime v1.Runtime) - CleanUpRuntimeGauge(runtimeID string) + CleanUpRuntimeGauge(runtimeID, runtimeName string) ResetRuntimeMetrics() IncRuntimeFSMStopCounter() SetGardenerClusterStates(cluster v1.GardenerCluster) @@ -68,7 +69,7 @@ func NewMetrics() Metrics { Subsystem: componentName, Name: RuntimeStateMetricName, Help: "Exposes current Status.state for Runtime CRs", - }, []string{runtimeIDKeyName, shootNameIDKeyName, provider, state, message}), + }, []string{runtimeIDKeyName, runtimeNameKeyName, shootNameIDKeyName, provider, state, message}), runtimeFSMUnexpectedStopsCnt: prometheus.NewCounter( prometheus.CounterOpts{ Name: RuntimeFSMStopMetricName, @@ -90,14 +91,15 @@ func (m metricsImpl) SetRuntimeStates(runtime v1.Runtime) { reason = runtime.Status.Conditions[size-1].Message } - m.CleanUpRuntimeGauge(runtimeID) - m.runtimeStateGauge.WithLabelValues(runtimeID, runtime.Spec.Shoot.Name, runtime.Spec.Shoot.Provider.Type, string(runtime.Status.State), reason).Set(1) + m.CleanUpRuntimeGauge(runtimeID, runtime.Name) + m.runtimeStateGauge.WithLabelValues(runtimeID, runtime.Name, runtime.Spec.Shoot.Name, runtime.Spec.Shoot.Provider.Type, string(runtime.Status.State), reason).Set(1) } } -func (m metricsImpl) CleanUpRuntimeGauge(runtimeID string) { +func (m metricsImpl) CleanUpRuntimeGauge(runtimeID, runtimeName string) { m.runtimeStateGauge.DeletePartialMatch(prometheus.Labels{ - runtimeIDKeyName: runtimeID, + runtimeIDKeyName: runtimeID, + runtimeNameKeyName: runtimeName, }) } diff --git a/internal/controller/metrics/mocks/Metrics.go b/internal/controller/metrics/mocks/Metrics.go index c144ac51..ffd12e7d 100644 --- a/internal/controller/metrics/mocks/Metrics.go +++ b/internal/controller/metrics/mocks/Metrics.go @@ -27,9 +27,9 @@ func (_m *Metrics) CleanUpKubeconfigExpiration(runtimeID string) { _m.Called(runtimeID) } -// CleanUpRuntimeGauge provides a mock function with given fields: runtimeID -func (_m *Metrics) CleanUpRuntimeGauge(runtimeID string) { - _m.Called(runtimeID) +// CleanUpRuntimeGauge provides a mock function with given fields: runtimeID, runtimeName +func (_m *Metrics) CleanUpRuntimeGauge(runtimeID string, runtimeName string) { + _m.Called(runtimeID, runtimeName) } // IncRuntimeFSMStopCounter provides a mock function with given fields: diff --git a/internal/controller/runtime/fsm/runtime_fsm_apply_crb_test.go b/internal/controller/runtime/fsm/runtime_fsm_apply_crb_test.go index 1d060e64..a4d803b6 100644 --- a/internal/controller/runtime/fsm/runtime_fsm_apply_crb_test.go +++ b/internal/controller/runtime/fsm/runtime_fsm_apply_crb_test.go @@ -25,7 +25,7 @@ var _ = Describe(`runtime_fsm_apply_crb`, Label("applyCRB"), func() { withMockedMetrics := func() fakeFSMOpt { m := &mocks.Metrics{} m.On("SetRuntimeStates", mock.Anything).Return() - m.On("CleanUpRuntimeGauge", mock.Anything).Return() + m.On("CleanUpRuntimeGauge", mock.Anything, mock.Anything).Return() m.On("IncRuntimeFSMStopCounter").Return() return withMetrics(m) } diff --git a/internal/controller/runtime/fsm/runtime_fsm_create_kubeconfig_test.go b/internal/controller/runtime/fsm/runtime_fsm_create_kubeconfig_test.go index a50a3949..3cfc8f86 100644 --- a/internal/controller/runtime/fsm/runtime_fsm_create_kubeconfig_test.go +++ b/internal/controller/runtime/fsm/runtime_fsm_create_kubeconfig_test.go @@ -39,7 +39,7 @@ var _ = Describe("KIM sFnCreateKubeconfig", func() { withMockedMetrics := func() fakeFSMOpt { m := &mocks.Metrics{} m.On("SetRuntimeStates", mock.Anything).Return() - m.On("CleanUpRuntimeGauge", mock.Anything).Return() + m.On("CleanUpRuntimeGauge", mock.Anything, mock.Anything).Return() m.On("IncRuntimeFSMStopCounter").Return() return withMetrics(m) } diff --git a/internal/controller/runtime/fsm/runtime_fsm_initialise.go b/internal/controller/runtime/fsm/runtime_fsm_initialise.go index 708d0de0..bcd46dff 100644 --- a/internal/controller/runtime/fsm/runtime_fsm_initialise.go +++ b/internal/controller/runtime/fsm/runtime_fsm_initialise.go @@ -100,6 +100,6 @@ func removeFinalizerAndStop(ctx context.Context, m *fsm, s *systemState) (stateF } // remove from metrics - m.Metrics.CleanUpRuntimeGauge(runtimeID) + m.Metrics.CleanUpRuntimeGauge(runtimeID, s.instance.Name) return stop() } diff --git a/internal/controller/runtime/fsm/runtime_fsm_initialise_test.go b/internal/controller/runtime/fsm/runtime_fsm_initialise_test.go index e80d84ef..b53705d2 100644 --- a/internal/controller/runtime/fsm/runtime_fsm_initialise_test.go +++ b/internal/controller/runtime/fsm/runtime_fsm_initialise_test.go @@ -40,7 +40,7 @@ var _ = Describe("KIM sFnInitialise", func() { withMockedMetrics := func() fakeFSMOpt { m := &mocks.Metrics{} m.On("SetRuntimeStates", mock.Anything).Return() - m.On("CleanUpRuntimeGauge", mock.Anything).Return() + m.On("CleanUpRuntimeGauge", mock.Anything, mock.Anything).Return() m.On("IncRuntimeFSMStopCounter").Return() return withMetrics(m) } diff --git a/internal/controller/runtime/fsm/runtime_fsm_persist_shoot_test.go b/internal/controller/runtime/fsm/runtime_fsm_persist_shoot_test.go index 82641563..bd548efb 100644 --- a/internal/controller/runtime/fsm/runtime_fsm_persist_shoot_test.go +++ b/internal/controller/runtime/fsm/runtime_fsm_persist_shoot_test.go @@ -28,7 +28,7 @@ var _ = Describe("KIM sFnPersist", func() { withMockedMetrics := func() fakeFSMOpt { m := &mocks.Metrics{} m.On("SetRuntimeStates", mock.Anything).Return() - m.On("CleanUpRuntimeGauge", mock.Anything).Return() + m.On("CleanUpRuntimeGauge", mock.Anything, mock.Anything).Return() m.On("IncRuntimeFSMStopCounter").Return() return withMetrics(m) } diff --git a/internal/controller/runtime/suite_test.go b/internal/controller/runtime/suite_test.go index 3b107e19..a9219f64 100644 --- a/internal/controller/runtime/suite_test.go +++ b/internal/controller/runtime/suite_test.go @@ -119,7 +119,7 @@ var _ = BeforeSuite(func() { mm := &mocks.Metrics{} mm.On("SetRuntimeStates", mock.Anything).Return() mm.On("IncRuntimeFSMStopCounter").Return() - mm.On("CleanUpRuntimeGauge", mock.Anything).Return() + mm.On("CleanUpRuntimeGauge", mock.Anything, mock.Anything).Return() fsmCfg := fsm.RCCfg{ Finalizer: infrastructuremanagerv1.Finalizer, From 8264618247db5f3ac1b65f5174dca45d9df958d7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Przemys=C5=82aw=20Golicz?= Date: Fri, 25 Oct 2024 10:51:05 +0200 Subject: [PATCH 2/4] remove periodical sync of Runtime metrics --- cmd/main.go | 42 +++++++----------------------------------- 1 file changed, 7 insertions(+), 35 deletions(-) diff --git a/cmd/main.go b/cmd/main.go index 6a20c069..e613ec4e 100644 --- a/cmd/main.go +++ b/cmd/main.go @@ -69,7 +69,6 @@ const defaultExpirationTime = 24 * time.Hour const defaultGardenerRequestTimeout = 60 * time.Second const defaultControlPlaneRequeueDuration = 10 * time.Second const defaultGardenerRequeueDuration = 15 * time.Second -const RuntimeMetricRefreshPeriod = 30 * time.Second func main() { var metricsAddr string @@ -221,30 +220,20 @@ func main() { os.Exit(1) } - refreshRuntimeMetrics := func() { - logger.Info("Refreshing runtime CR metrics") - metrics.ResetRuntimeMetrics() - var RuntimeList infrastructuremanagerv1.RuntimeList - if err = mgr.GetClient().List(context.TODO(), &RuntimeList); err == nil { - for _, rt := range RuntimeList.Items { - metrics.SetRuntimeStates(rt) - } + logger.Info("Refreshing runtime CR metrics") + metrics.ResetRuntimeMetrics() + var RuntimeList infrastructuremanagerv1.RuntimeList + if err = mgr.GetClient().List(context.TODO(), &RuntimeList); err == nil { + for _, rt := range RuntimeList.Items { + metrics.SetRuntimeStates(rt) } } - quitChannel := startRuntimeMetricsRefresher(refreshRuntimeMetrics) - defer func() { - logger.Info("Stopping metric refresh process goroutine") - if quitChannel != nil { - quitChannel <- true - } - }() - setupLog.Info("Starting Manager", "kubeconfigExpirationTime", expirationTime, "kubeconfigRotationPeriod", rotationPeriod) if err := mgr.Start(ctrl.SetupSignalHandler()); err != nil { setupLog.Error(err, "problem running manager") - os.Exit(1) //nolint:gocritic + os.Exit(1) } } @@ -324,20 +313,3 @@ func validateAuditLogDataMap(data map[string]map[string]auditlogging.AuditLogDat return nil } - -func startRuntimeMetricsRefresher(refreshFunc func()) chan bool { - quitChannel := make(chan bool) - go func() { - for { - time.Sleep(RuntimeMetricRefreshPeriod) - select { - case <-quitChannel: - // println("Received signal to stop metric refresh process") - return - default: - refreshFunc() - } - } - }() - return quitChannel -} From cc0e9f11d6d05d62c1e93a84a92638c7dd57d68e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Przemys=C5=82aw=20Golicz?= Date: Fri, 25 Oct 2024 12:30:38 +0200 Subject: [PATCH 3/4] Using separate Client to refresh Runtime metrics on startup --- cmd/main.go | 41 ++++++++++++++++++++++++++++++++--------- 1 file changed, 32 insertions(+), 9 deletions(-) diff --git a/cmd/main.go b/cmd/main.go index e613ec4e..d62d19ac 100644 --- a/cmd/main.go +++ b/cmd/main.go @@ -21,7 +21,9 @@ import ( "encoding/json" "flag" "fmt" + "github.com/go-logr/logr" "io" + "k8s.io/client-go/rest" "os" "time" @@ -106,7 +108,9 @@ func main() { logger := zap.New(zap.UseFlagOptions(&opts)) ctrl.SetLogger(logger) - mgr, err := ctrl.NewManager(ctrl.GetConfigOrDie(), ctrl.Options{ + restConfig := ctrl.GetConfigOrDie() + + mgr, err := ctrl.NewManager(restConfig, ctrl.Options{ Metrics: metricsserver.Options{ BindAddress: metricsAddr, }, @@ -220,14 +224,7 @@ func main() { os.Exit(1) } - logger.Info("Refreshing runtime CR metrics") - metrics.ResetRuntimeMetrics() - var RuntimeList infrastructuremanagerv1.RuntimeList - if err = mgr.GetClient().List(context.TODO(), &RuntimeList); err == nil { - for _, rt := range RuntimeList.Items { - metrics.SetRuntimeStates(rt) - } - } + refreshRuntimeMetrics(restConfig, logger, metrics) setupLog.Info("Starting Manager", "kubeconfigExpirationTime", expirationTime, "kubeconfigRotationPeriod", rotationPeriod) @@ -313,3 +310,29 @@ func validateAuditLogDataMap(data map[string]map[string]auditlogging.AuditLogDat return nil } + +func refreshRuntimeMetrics(restConfig *rest.Config, logger logr.Logger, metrics metrics.Metrics) { + k8sClient, err := client.New(restConfig, client.Options{}) + if err != nil { + setupLog.Error(err, "Unable to set up client for refreshing runtime CR metrics") + os.Exit(1) + } + + err = infrastructuremanagerv1.AddToScheme(k8sClient.Scheme()) + if err != nil { + setupLog.Error(err, "unable to set up client") + os.Exit(1) + } + + logger.Info("Refreshing runtime CR metrics") + metrics.ResetRuntimeMetrics() + rl := infrastructuremanagerv1.RuntimeList{} + if err = k8sClient.List(context.Background(), &rl, &client.ListOptions{Namespace: "kcp-system"}); err != nil { + setupLog.Error(err, "error while listing unable to list runtimes") + os.Exit(1) + } + + for _, rt := range rl.Items { + metrics.SetRuntimeStates(rt) + } +} From a1b06ed58eb381bb10f500ee57cff751097cc316 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Przemys=C5=82aw=20Golicz?= Date: Fri, 25 Oct 2024 12:33:15 +0200 Subject: [PATCH 4/4] fix to make linter happy --- cmd/main.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cmd/main.go b/cmd/main.go index d62d19ac..5fd3d0af 100644 --- a/cmd/main.go +++ b/cmd/main.go @@ -21,15 +21,14 @@ import ( "encoding/json" "flag" "fmt" - "github.com/go-logr/logr" "io" - "k8s.io/client-go/rest" "os" "time" "github.com/gardener/gardener/pkg/apis/core/v1beta1" gardener_apis "github.com/gardener/gardener/pkg/client/core/clientset/versioned/typed/core/v1beta1" gardener_oidc "github.com/gardener/oidc-webhook-authenticator/apis/authentication/v1alpha1" + "github.com/go-logr/logr" validator "github.com/go-playground/validator/v10" infrastructuremanagerv1 "github.com/kyma-project/infrastructure-manager/api/v1" "github.com/kyma-project/infrastructure-manager/internal/auditlogging" @@ -46,6 +45,7 @@ import ( utilruntime "k8s.io/apimachinery/pkg/util/runtime" clientgoscheme "k8s.io/client-go/kubernetes/scheme" _ "k8s.io/client-go/plugin/pkg/client/auth" + "k8s.io/client-go/rest" ctrl "sigs.k8s.io/controller-runtime" "sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/controller-runtime/pkg/healthz"