diff --git a/hack/runtime-migrator/go.mod b/hack/runtime-migrator/go.mod index 1ae29a43..c1edbd06 100644 --- a/hack/runtime-migrator/go.mod +++ b/hack/runtime-migrator/go.mod @@ -40,6 +40,8 @@ require ( github.com/google/go-cmp v0.6.0 // indirect github.com/google/gofuzz v1.2.0 // indirect github.com/google/uuid v1.6.0 // indirect + github.com/hashicorp/errwrap v1.1.0 // indirect + github.com/hashicorp/go-multierror v1.1.1 // indirect github.com/imdario/mergo v0.3.16 // indirect github.com/josharian/intern v1.0.0 // indirect github.com/json-iterator/go v1.1.12 // indirect diff --git a/hack/runtime-migrator/go.sum b/hack/runtime-migrator/go.sum index 3f593cd4..60b0a4e3 100644 --- a/hack/runtime-migrator/go.sum +++ b/hack/runtime-migrator/go.sum @@ -60,6 +60,11 @@ github.com/google/pprof v0.0.0-20240827171923-fa2c70bbbfe5 h1:5iH8iuqE5apketRbSF github.com/google/pprof v0.0.0-20240827171923-fa2c70bbbfe5/go.mod h1:vavhavw2zAxS5dIdcRluK6cSGGPlZynqzFM8NdvU144= github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0= github.com/google/uuid v1.6.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo= +github.com/hashicorp/errwrap v1.0.0/go.mod h1:YH+1FKiLXxHSkmPseP+kNlulaMuP3n2brvKWEqk/Jc4= +github.com/hashicorp/errwrap v1.1.0 h1:OxrOeh75EUXMY8TBjag2fzXGZ40LB6IKw45YeGUDY2I= +github.com/hashicorp/errwrap v1.1.0/go.mod h1:YH+1FKiLXxHSkmPseP+kNlulaMuP3n2brvKWEqk/Jc4= +github.com/hashicorp/go-multierror v1.1.1 h1:H5DkEtf6CXdFp0N0Em5UCwQpXMWke8IA0+lD48awMYo= +github.com/hashicorp/go-multierror v1.1.1/go.mod h1:iw975J/qwKPdAO1clOe2L8331t/9/fmwbPZ6JB6eMoM= github.com/imdario/mergo v0.3.16 h1:wwQJbIsHYGMUyLSPrEq1CT16AhnhNJQ51+4fdHUnCl4= github.com/imdario/mergo v0.3.16/go.mod h1:WBLT9ZmE3lPoWsEzCh9LPo3TiwVN+ZKEjmz+hD27ysY= github.com/josharian/intern v1.0.0 h1:vlS4z54oSdjm0bgjRigI+G1HpF+tI+9rE5LLzOg8HmY= @@ -106,6 +111,8 @@ github.com/yuin/goldmark v1.1.27/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9de github.com/yuin/goldmark v1.2.1/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74= go.uber.org/goleak v1.3.0 h1:2K3zAYmnTNqV73imy9J1T3WC+gmCePx2hEGkimedGto= go.uber.org/goleak v1.3.0/go.mod h1:CoHD4mav9JJNrW/WLlf7HGZPjdw8EucARQHekz1X6bE= +go.uber.org/mock v0.4.0 h1:VcM4ZOtdbR4f6VXfiOpwpVJDL6lCReaZ6mw31wqh7KU= +go.uber.org/mock v0.4.0/go.mod h1:a6FSlNadKUHUa9IP5Vyt1zh4fC7uAwxMutEAscFbkZc= go.uber.org/multierr v1.11.0 h1:blXXJkSxSSfBVBlC76pxqeO+LN3aDfLQo+309xJstO0= go.uber.org/multierr v1.11.0/go.mod h1:20+QtiLqy0Nd6FdQB9TLXag12DsQkrbs3htMFfDN80Y= go.uber.org/zap v1.27.0 h1:aJMhYGrd5QSmlpLMr2MftRKl7t8J8PTZPA732ud/XR8= diff --git a/internal/controller/runtime/fsm/runtime_fsm_apply_clusterrolebindings.go b/internal/controller/runtime/fsm/runtime_fsm_apply_clusterrolebindings.go index ce92b36d..2b63b63f 100644 --- a/internal/controller/runtime/fsm/runtime_fsm_apply_clusterrolebindings.go +++ b/internal/controller/runtime/fsm/runtime_fsm_apply_clusterrolebindings.go @@ -35,7 +35,8 @@ func sFnApplyClusterRoleBindings(ctx context.Context, m *fsm, s *systemState) (s var crbList rbacv1.ClusterRoleBindingList if err := shootAdminClient.List(ctx, &crbList); err != nil { updateCRBApplyFailed(&s.instance) - return updateStatusAndStopWithError(err) + m.log.Info("Cannot list Cluster Role Bindings on shoot, scheduling for retry", "RuntimeCR", s.instance.Name, "shoot", s.shoot.Name) + return requeue() } removed := getRemoved(crbList.Items, s.instance.Spec.Security.Administrators) @@ -47,7 +48,8 @@ func sFnApplyClusterRoleBindings(ctx context.Context, m *fsm, s *systemState) (s } { if err := fn(); err != nil { updateCRBApplyFailed(&s.instance) - return updateStatusAndStopWithError(err) + m.log.Info("Cannot setup Cluster Role Bindings on shoot, scheduling for retry", "RuntimeCR", s.instance.Name, "shoot", s.shoot.Name) + return requeue() } } diff --git a/internal/controller/runtime/fsm/runtime_fsm_select_shoot_processing.go b/internal/controller/runtime/fsm/runtime_fsm_select_shoot_processing.go index 9d6788be..0de3f10b 100644 --- a/internal/controller/runtime/fsm/runtime_fsm_select_shoot_processing.go +++ b/internal/controller/runtime/fsm/runtime_fsm_select_shoot_processing.go @@ -10,8 +10,6 @@ import ( ctrl "sigs.k8s.io/controller-runtime" ) -type ErrReason string - func sFnSelectShootProcessing(_ context.Context, m *fsm, s *systemState) (stateFn, *ctrl.Result, error) { m.log.Info("Select shoot processing state") diff --git a/internal/controller/runtime/fsm/runtime_fsm_waiting_for_shoot_reconcile.go b/internal/controller/runtime/fsm/runtime_fsm_waiting_for_shoot_reconcile.go index 10b47e40..788e60d1 100644 --- a/internal/controller/runtime/fsm/runtime_fsm_waiting_for_shoot_reconcile.go +++ b/internal/controller/runtime/fsm/runtime_fsm_waiting_for_shoot_reconcile.go @@ -6,6 +6,7 @@ import ( gardener "github.com/gardener/gardener/pkg/apis/core/v1beta1" imv1 "github.com/kyma-project/infrastructure-manager/api/v1" + imgardenerhandler "github.com/kyma-project/infrastructure-manager/pkg/gardener" ctrl "sigs.k8s.io/controller-runtime" ) @@ -25,10 +26,17 @@ func sFnWaitForShootReconcile(_ context.Context, m *fsm, s *systemState) (stateF return updateStatusAndRequeueAfter(m.RCCfg.GardenerRequeueDuration) case gardener.LastOperationStateFailed: - var reason ErrReason + lastErrors := s.shoot.Status.LastErrors + reason := imgardenerhandler.ToErrReason(lastErrors...) - if len(s.shoot.Status.LastErrors) > 0 { - reason = gardenerErrCodesToErrReason(s.shoot.Status.LastErrors...) + if imgardenerhandler.IsRetryable(lastErrors) { + m.log.Info(fmt.Sprintf("Retryable gardener errors during cluster provisioning for Shoot %s, reason: %s, scheduling for retry", s.shoot.Name, reason)) + s.instance.UpdateStatePending( + imv1.ConditionTypeRuntimeProvisioned, + imv1.ConditionReasonShootCreationPending, + "Unknown", + "Retryable gardener errors during cluster reconcile") + return updateStatusAndRequeueAfter(m.RCCfg.GardenerRequeueDuration) } msg := fmt.Sprintf("error during cluster processing: reconcilation failed for shoot %s, reason: %s, exiting with no retry", s.shoot.Name, reason) diff --git a/internal/controller/runtime/fsm/runtime_fsm_waiting_shoot_creation.go b/internal/controller/runtime/fsm/runtime_fsm_waiting_shoot_creation.go index 1acd3e1b..2c0036fc 100644 --- a/internal/controller/runtime/fsm/runtime_fsm_waiting_shoot_creation.go +++ b/internal/controller/runtime/fsm/runtime_fsm_waiting_shoot_creation.go @@ -3,11 +3,10 @@ package fsm import ( "context" "fmt" - "strings" gardener "github.com/gardener/gardener/pkg/apis/core/v1beta1" - gardenerhelper "github.com/gardener/gardener/pkg/apis/core/v1beta1/helper" imv1 "github.com/kyma-project/infrastructure-manager/api/v1" + imgardenerhandler "github.com/kyma-project/infrastructure-manager/pkg/gardener" ctrl "sigs.k8s.io/controller-runtime" ) @@ -43,15 +42,19 @@ func sFnWaitForShootCreation(_ context.Context, m *fsm, s *systemState) (stateFn return updateStatusAndRequeueAfter(m.RCCfg.GardenerRequeueDuration) case gardener.LastOperationStateFailed: - if gardenerhelper.HasErrorCode(s.shoot.Status.LastErrors, gardener.ErrorInfraRateLimitsExceeded) { - m.log.Info(fmt.Sprintf("Error during cluster provisioning: Rate limits exceeded for Shoot %s, scheduling for retry", s.shoot.Name)) + lastErrors := s.shoot.Status.LastErrors + reason := imgardenerhandler.ToErrReason(lastErrors...) + + if imgardenerhandler.IsRetryable(lastErrors) { + m.log.Info(fmt.Sprintf("Retryable gardener errors during cluster provisioning for Shoot %s, reason: %s, scheduling for retry", s.shoot.Name, reason)) + s.instance.UpdateStatePending( + imv1.ConditionTypeRuntimeProvisioned, + imv1.ConditionReasonShootCreationPending, + "Unknown", + "Retryable gardener errors during cluster provisioning") return updateStatusAndRequeueAfter(m.RCCfg.GardenerRequeueDuration) } - // also handle other retryable errors here - // ErrorRetryableConfigurationProblem - // ErrorRetryableInfraDependencies - msg := fmt.Sprintf("Provisioning failed for shoot: %s ! Last state: %s, Description: %s", s.shoot.Name, s.shoot.Status.LastOperation.State, s.shoot.Status.LastOperation.Description) m.log.Info(msg) @@ -78,20 +81,3 @@ func sFnWaitForShootCreation(_ context.Context, m *fsm, s *systemState) (stateFn return stopWithMetrics() } } - -func gardenerErrCodesToErrReason(lastErrors ...gardener.LastError) ErrReason { - var codes []gardener.ErrorCode - var vals []string - - for _, e := range lastErrors { - if len(e.Codes) > 0 { - codes = append(codes, e.Codes...) - } - } - - for _, code := range codes { - vals = append(vals, string(code)) - } - - return ErrReason(strings.Join(vals, ", ")) -} diff --git a/pkg/gardener/gardener_error_handler.go b/pkg/gardener/gardener_error_handler.go new file mode 100644 index 00000000..0ae41da9 --- /dev/null +++ b/pkg/gardener/gardener_error_handler.go @@ -0,0 +1,45 @@ +package gardener + +import ( + "fmt" + "strconv" + "strings" + + gardener "github.com/gardener/gardener/pkg/apis/core/v1beta1" + gardenerhelper "github.com/gardener/gardener/pkg/apis/core/v1beta1/helper" +) + +type ErrReason string + +func IsRetryable(lastErrors []gardener.LastError) bool { + if len(lastErrors) > 0 && + !gardenerhelper.HasNonRetryableErrorCode(lastErrors...) { + return true + } + return false +} + +func ToErrReason(lastErrors ...gardener.LastError) ErrReason { + var codes []gardener.ErrorCode + var vals []string + + for _, e := range lastErrors { + if len(e.Codes) > 0 { + codes = append(codes, e.Codes...) + } + } + + for _, code := range codes { + vals = append(vals, string(code)) + } + + return ErrReason(strings.Join(vals, ", ")) +} + +func CombineErrorDescriptions(lastErrors []gardener.LastError) string { + var descriptions string + for i, lastError := range lastErrors { + descriptions += fmt.Sprint(strconv.Itoa(i+1), ") ", lastError.Description, " ") + } + return descriptions +} diff --git a/pkg/gardener/gardener_error_handler_test.go b/pkg/gardener/gardener_error_handler_test.go new file mode 100644 index 00000000..ac83ab22 --- /dev/null +++ b/pkg/gardener/gardener_error_handler_test.go @@ -0,0 +1,152 @@ +package gardener + +import ( + "testing" + + gardener "github.com/gardener/gardener/pkg/apis/core/v1beta1" + "github.com/stretchr/testify/assert" +) + +func TestGardenerErrorHandler(t *testing.T) { + t.Run("Should combine error descriptions", func(t *testing.T) { + // given + lastErrors := []gardener.LastError{ + { + Description: "First description", + }, + { + Description: "Second description", + }, + } + + // when + combinedDescritpion := CombineErrorDescriptions(lastErrors) + + // then + assert.Equal(t, "1) First description 2) Second description ", combinedDescritpion) + }) + + for _, testCase := range []struct { + name string + lastErrors []gardener.LastError + expectedRetryable bool + }{ + { + name: "Should return true for retryable gardener errors", + lastErrors: fixRetryableErrors(), + expectedRetryable: true, + }, + { + name: "Should return false for retryable gardener errors", + lastErrors: fixNonRetryableErrors(), + expectedRetryable: false, + }, + { + name: "Should return false for mixture of retryable and non-retryable gardener errors", + lastErrors: fixMixtureOfErrors(), + expectedRetryable: false, + }, + { + name: "Should return false empty list", + lastErrors: []gardener.LastError{}, + expectedRetryable: false, + }, + } { + t.Run(testCase.name, func(t *testing.T) { + // given + + // when + retryable := IsRetryable(testCase.lastErrors) + + // then + assert.Equal(t, testCase.expectedRetryable, retryable) + }) + } +} + +func fixRetryableErrors() []gardener.LastError { + return []gardener.LastError{ + { + Description: "First description - retryable", + Codes: []gardener.ErrorCode{ + gardener.ErrorRetryableConfigurationProblem, + }, + }, + { + Description: "Second description - retryable", + Codes: []gardener.ErrorCode{ + gardener.ErrorRetryableInfraDependencies, + }, + }, + } +} + +func fixNonRetryableErrors() []gardener.LastError { + return []gardener.LastError{ + { + Description: "First description - non-retryable", + Codes: []gardener.ErrorCode{ + gardener.ErrorInfraDependencies, + }, + }, + { + Description: "Second description - non-retryable", + Codes: []gardener.ErrorCode{ + gardener.ErrorInfraQuotaExceeded, + }, + }, + { + Description: "Third description - non-retryable", + Codes: []gardener.ErrorCode{ + gardener.ErrorInfraUnauthenticated, + }, + }, + { + Description: "Fourth description - non-retryable", + Codes: []gardener.ErrorCode{ + gardener.ErrorInfraUnauthorized, + }, + }, + { + Description: "Fifth description - non-retryable", + Codes: []gardener.ErrorCode{ + gardener.ErrorInfraRateLimitsExceeded, + }, + }, + { + Description: "Sixth description - non-retryable", + Codes: []gardener.ErrorCode{ + gardener.ErrorConfigurationProblem, + }, + }, + { + Description: "Seventh description - non-retryable", + Codes: []gardener.ErrorCode{ + gardener.ErrorProblematicWebhook, + }, + }, + } +} + +func fixMixtureOfErrors() []gardener.LastError { + return []gardener.LastError{ + { + Description: "First description - non-retryable", + Codes: []gardener.ErrorCode{ + gardener.ErrorInfraDependencies, + }, + }, + { + Description: "Second description - retryable", + Codes: []gardener.ErrorCode{ + gardener.ErrorRetryableConfigurationProblem, + }, + }, + { + Description: "Third description - non-retryable", + Codes: []gardener.ErrorCode{ + gardener.ErrorInfraQuotaExceeded, + }, + }, + } +}