From d3901a226972f5809f380464a2041eb1ab63c69f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?L=C3=A9na=C3=AFc=20Huard?= Date: Thu, 8 Aug 2024 15:45:54 +0200 Subject: [PATCH] Fix a cluster-agent crash at startup (#28282) --- .../subcommands/start/command.go | 2 +- flakes.yaml | 4 --- .../apiserver/controllers/controllers.go | 31 ++++++++++--------- 3 files changed, 18 insertions(+), 19 deletions(-) diff --git a/cmd/cluster-agent/subcommands/start/command.go b/cmd/cluster-agent/subcommands/start/command.go index 4821d08af215f..4b544bc84726a 100644 --- a/cmd/cluster-agent/subcommands/start/command.go +++ b/cmd/cluster-agent/subcommands/start/command.go @@ -324,7 +324,7 @@ func start(log log.Component, DatadogClient: dc, } - if aggErr := controllers.StartControllers(ctx); aggErr != nil { + if aggErr := controllers.StartControllers(&ctx); aggErr != nil { for _, err := range aggErr.Errors() { pkglog.Warnf("Error while starting controller: %v", err) } diff --git a/flakes.yaml b/flakes.yaml index ed2c02bcf1b29..6887638a60fc5 100644 --- a/flakes.yaml +++ b/flakes.yaml @@ -12,10 +12,6 @@ test/new-e2e/tests/containers: - TestEKSSuite/TestCPU/metric___container.cpu.usage{^kube_deployment:stress-ng$,^kube_namespace:workload-cpustress$} - TestKindSuite/TestCPU/metric___container.cpu.usage{^kube_deployment:stress-ng$,^kube_namespace:workload-cpustress$} - TestECSSuite - - TestEKSSuite/Test00UpAndRunning/agent_pods_are_ready_and_not_restarting - - TestEKSSuite/TestZZUpAndRunning/agent_pods_are_ready_and_not_restarting - - TestKindSuite/Test00UpAndRunning/agent_pods_are_ready_and_not_restarting - - TestKindSuite/TestZZUpAndRunning/agent_pods_are_ready_and_not_restarting test/new-e2e/tests/installer: - TestPackages/upgrade_scenario_ubuntu_22_04_x86_64/TestUpgradeSuccessful diff --git a/pkg/util/kubernetes/apiserver/controllers/controllers.go b/pkg/util/kubernetes/apiserver/controllers/controllers.go index de14e644fec13..fe51616dc53a4 100644 --- a/pkg/util/kubernetes/apiserver/controllers/controllers.go +++ b/pkg/util/kubernetes/apiserver/controllers/controllers.go @@ -33,7 +33,7 @@ const autoscalerNowHandleMsgEvent = "Autoscaler is now handled by the Cluster-Ag var errIsEmpty = errors.New("entity is empty") //nolint:revive -type startFunc func(ControllerContext, chan error) +type startFunc func(*ControllerContext, chan error) type controllerFuncs struct { enabled func() bool @@ -64,6 +64,7 @@ var controllerCatalog = map[controllerName]controllerFuncs{ // ControllerContext holds all the attributes needed by the controllers type ControllerContext struct { informers map[apiserver.InformerName]cache.SharedInformer + informersMutex sync.Mutex InformerFactory informers.SharedInformerFactory DynamicClient dynamic.Interface DynamicInformerFactory dynamicinformer.DynamicSharedInformerFactory @@ -77,7 +78,7 @@ type ControllerContext struct { // StartControllers runs the enabled Kubernetes controllers for the Datadog Cluster Agent. This is // only called once, when we have confirmed we could correctly connect to the API server. -func StartControllers(ctx ControllerContext) k8serrors.Aggregate { +func StartControllers(ctx *ControllerContext) k8serrors.Aggregate { ctx.informers = make(map[apiserver.InformerName]cache.SharedInformer) var wg sync.WaitGroup @@ -126,9 +127,7 @@ func StartControllers(ctx ControllerContext) k8serrors.Aggregate { // startMetadataController starts the informers needed for metadata collection. // The synchronization of the informers is handled by the controller. -// -//nolint:revive // TODO(CAPP) Fix revive linter -func startMetadataController(ctx ControllerContext, c chan error) { +func startMetadataController(ctx *ControllerContext, _ chan error) { metaController := newMetadataController( ctx.InformerFactory.Core().V1().Endpoints(), ctx.WorkloadMeta, @@ -138,7 +137,7 @@ func startMetadataController(ctx ControllerContext, c chan error) { // startAutoscalersController starts the informers needed for autoscaling. // The synchronization of the informers is handled by the controller. -func startAutoscalersController(ctx ControllerContext, c chan error) { +func startAutoscalersController(ctx *ControllerContext, c chan error) { var err error if ctx.DatadogClient == nil { c <- fmt.Errorf("datadog client is nil") @@ -166,15 +165,19 @@ func startAutoscalersController(ctx ControllerContext, c chan error) { } // registerServicesInformer registers the services informer. -// -//nolint:revive // TODO(CAPP) Fix revive linter -func registerServicesInformer(ctx ControllerContext, c chan error) { - ctx.informers[servicesInformer] = ctx.InformerFactory.Core().V1().Services().Informer() +func registerServicesInformer(ctx *ControllerContext, _ chan error) { + informer := ctx.InformerFactory.Core().V1().Services().Informer() + + ctx.informersMutex.Lock() + ctx.informers[servicesInformer] = informer + ctx.informersMutex.Unlock() } // registerEndpointsInformer registers the endpoints informer. -// -//nolint:revive // TODO(CAPP) Fix revive linter -func registerEndpointsInformer(ctx ControllerContext, c chan error) { - ctx.informers[endpointsInformer] = ctx.InformerFactory.Core().V1().Endpoints().Informer() +func registerEndpointsInformer(ctx *ControllerContext, _ chan error) { + informer := ctx.InformerFactory.Core().V1().Endpoints().Informer() + + ctx.informersMutex.Lock() + ctx.informers[endpointsInformer] = informer + ctx.informersMutex.Unlock() }