Skip to content

Commit

Permalink
Merge pull request #480 from Disper/retryable_improvements
Browse files Browse the repository at this point in the history
Improves gardener errors handling for those which can be retried
  • Loading branch information
kyma-bot authored Nov 18, 2024
2 parents 67028fc + cbd6dbf commit de36327
Show file tree
Hide file tree
Showing 8 changed files with 232 additions and 32 deletions.
2 changes: 2 additions & 0 deletions hack/runtime-migrator/go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,8 @@ require (
github.com/google/go-cmp v0.6.0 // indirect
github.com/google/gofuzz v1.2.0 // indirect
github.com/google/uuid v1.6.0 // indirect
github.com/hashicorp/errwrap v1.1.0 // indirect
github.com/hashicorp/go-multierror v1.1.1 // indirect
github.com/imdario/mergo v0.3.16 // indirect
github.com/josharian/intern v1.0.0 // indirect
github.com/json-iterator/go v1.1.12 // indirect
Expand Down
7 changes: 7 additions & 0 deletions hack/runtime-migrator/go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,11 @@ github.com/google/pprof v0.0.0-20240827171923-fa2c70bbbfe5 h1:5iH8iuqE5apketRbSF
github.com/google/pprof v0.0.0-20240827171923-fa2c70bbbfe5/go.mod h1:vavhavw2zAxS5dIdcRluK6cSGGPlZynqzFM8NdvU144=
github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0=
github.com/google/uuid v1.6.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo=
github.com/hashicorp/errwrap v1.0.0/go.mod h1:YH+1FKiLXxHSkmPseP+kNlulaMuP3n2brvKWEqk/Jc4=
github.com/hashicorp/errwrap v1.1.0 h1:OxrOeh75EUXMY8TBjag2fzXGZ40LB6IKw45YeGUDY2I=
github.com/hashicorp/errwrap v1.1.0/go.mod h1:YH+1FKiLXxHSkmPseP+kNlulaMuP3n2brvKWEqk/Jc4=
github.com/hashicorp/go-multierror v1.1.1 h1:H5DkEtf6CXdFp0N0Em5UCwQpXMWke8IA0+lD48awMYo=
github.com/hashicorp/go-multierror v1.1.1/go.mod h1:iw975J/qwKPdAO1clOe2L8331t/9/fmwbPZ6JB6eMoM=
github.com/imdario/mergo v0.3.16 h1:wwQJbIsHYGMUyLSPrEq1CT16AhnhNJQ51+4fdHUnCl4=
github.com/imdario/mergo v0.3.16/go.mod h1:WBLT9ZmE3lPoWsEzCh9LPo3TiwVN+ZKEjmz+hD27ysY=
github.com/josharian/intern v1.0.0 h1:vlS4z54oSdjm0bgjRigI+G1HpF+tI+9rE5LLzOg8HmY=
Expand Down Expand Up @@ -106,6 +111,8 @@ github.com/yuin/goldmark v1.1.27/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9de
github.com/yuin/goldmark v1.2.1/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74=
go.uber.org/goleak v1.3.0 h1:2K3zAYmnTNqV73imy9J1T3WC+gmCePx2hEGkimedGto=
go.uber.org/goleak v1.3.0/go.mod h1:CoHD4mav9JJNrW/WLlf7HGZPjdw8EucARQHekz1X6bE=
go.uber.org/mock v0.4.0 h1:VcM4ZOtdbR4f6VXfiOpwpVJDL6lCReaZ6mw31wqh7KU=
go.uber.org/mock v0.4.0/go.mod h1:a6FSlNadKUHUa9IP5Vyt1zh4fC7uAwxMutEAscFbkZc=
go.uber.org/multierr v1.11.0 h1:blXXJkSxSSfBVBlC76pxqeO+LN3aDfLQo+309xJstO0=
go.uber.org/multierr v1.11.0/go.mod h1:20+QtiLqy0Nd6FdQB9TLXag12DsQkrbs3htMFfDN80Y=
go.uber.org/zap v1.27.0 h1:aJMhYGrd5QSmlpLMr2MftRKl7t8J8PTZPA732ud/XR8=
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,8 @@ func sFnApplyClusterRoleBindings(ctx context.Context, m *fsm, s *systemState) (s
var crbList rbacv1.ClusterRoleBindingList
if err := shootAdminClient.List(ctx, &crbList); err != nil {
updateCRBApplyFailed(&s.instance)
return updateStatusAndStopWithError(err)
m.log.Info("Cannot list Cluster Role Bindings on shoot, scheduling for retry", "RuntimeCR", s.instance.Name, "shoot", s.shoot.Name)
return requeue()
}

removed := getRemoved(crbList.Items, s.instance.Spec.Security.Administrators)
Expand All @@ -47,7 +48,8 @@ func sFnApplyClusterRoleBindings(ctx context.Context, m *fsm, s *systemState) (s
} {
if err := fn(); err != nil {
updateCRBApplyFailed(&s.instance)
return updateStatusAndStopWithError(err)
m.log.Info("Cannot setup Cluster Role Bindings on shoot, scheduling for retry", "RuntimeCR", s.instance.Name, "shoot", s.shoot.Name)
return requeue()
}
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -10,8 +10,6 @@ import (
ctrl "sigs.k8s.io/controller-runtime"
)

type ErrReason string

func sFnSelectShootProcessing(_ context.Context, m *fsm, s *systemState) (stateFn, *ctrl.Result, error) {
m.log.Info("Select shoot processing state")

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ import (

gardener "github.com/gardener/gardener/pkg/apis/core/v1beta1"
imv1 "github.com/kyma-project/infrastructure-manager/api/v1"
imgardenerhandler "github.com/kyma-project/infrastructure-manager/pkg/gardener"
ctrl "sigs.k8s.io/controller-runtime"
)

Expand All @@ -25,10 +26,17 @@ func sFnWaitForShootReconcile(_ context.Context, m *fsm, s *systemState) (stateF
return updateStatusAndRequeueAfter(m.RCCfg.GardenerRequeueDuration)

case gardener.LastOperationStateFailed:
var reason ErrReason
lastErrors := s.shoot.Status.LastErrors
reason := imgardenerhandler.ToErrReason(lastErrors...)

if len(s.shoot.Status.LastErrors) > 0 {
reason = gardenerErrCodesToErrReason(s.shoot.Status.LastErrors...)
if imgardenerhandler.IsRetryable(lastErrors) {
m.log.Info(fmt.Sprintf("Retryable gardener errors during cluster provisioning for Shoot %s, reason: %s, scheduling for retry", s.shoot.Name, reason))
s.instance.UpdateStatePending(
imv1.ConditionTypeRuntimeProvisioned,
imv1.ConditionReasonShootCreationPending,
"Unknown",
"Retryable gardener errors during cluster reconcile")
return updateStatusAndRequeueAfter(m.RCCfg.GardenerRequeueDuration)
}

msg := fmt.Sprintf("error during cluster processing: reconcilation failed for shoot %s, reason: %s, exiting with no retry", s.shoot.Name, reason)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,11 +3,10 @@ package fsm
import (
"context"
"fmt"
"strings"

gardener "github.com/gardener/gardener/pkg/apis/core/v1beta1"
gardenerhelper "github.com/gardener/gardener/pkg/apis/core/v1beta1/helper"
imv1 "github.com/kyma-project/infrastructure-manager/api/v1"
imgardenerhandler "github.com/kyma-project/infrastructure-manager/pkg/gardener"
ctrl "sigs.k8s.io/controller-runtime"
)

Expand Down Expand Up @@ -43,15 +42,19 @@ func sFnWaitForShootCreation(_ context.Context, m *fsm, s *systemState) (stateFn
return updateStatusAndRequeueAfter(m.RCCfg.GardenerRequeueDuration)

case gardener.LastOperationStateFailed:
if gardenerhelper.HasErrorCode(s.shoot.Status.LastErrors, gardener.ErrorInfraRateLimitsExceeded) {
m.log.Info(fmt.Sprintf("Error during cluster provisioning: Rate limits exceeded for Shoot %s, scheduling for retry", s.shoot.Name))
lastErrors := s.shoot.Status.LastErrors
reason := imgardenerhandler.ToErrReason(lastErrors...)

if imgardenerhandler.IsRetryable(lastErrors) {
m.log.Info(fmt.Sprintf("Retryable gardener errors during cluster provisioning for Shoot %s, reason: %s, scheduling for retry", s.shoot.Name, reason))
s.instance.UpdateStatePending(
imv1.ConditionTypeRuntimeProvisioned,
imv1.ConditionReasonShootCreationPending,
"Unknown",
"Retryable gardener errors during cluster provisioning")
return updateStatusAndRequeueAfter(m.RCCfg.GardenerRequeueDuration)
}

// also handle other retryable errors here
// ErrorRetryableConfigurationProblem
// ErrorRetryableInfraDependencies

msg := fmt.Sprintf("Provisioning failed for shoot: %s ! Last state: %s, Description: %s", s.shoot.Name, s.shoot.Status.LastOperation.State, s.shoot.Status.LastOperation.Description)
m.log.Info(msg)

Expand All @@ -78,20 +81,3 @@ func sFnWaitForShootCreation(_ context.Context, m *fsm, s *systemState) (stateFn
return stopWithMetrics()
}
}

func gardenerErrCodesToErrReason(lastErrors ...gardener.LastError) ErrReason {
var codes []gardener.ErrorCode
var vals []string

for _, e := range lastErrors {
if len(e.Codes) > 0 {
codes = append(codes, e.Codes...)
}
}

for _, code := range codes {
vals = append(vals, string(code))
}

return ErrReason(strings.Join(vals, ", "))
}
45 changes: 45 additions & 0 deletions pkg/gardener/gardener_error_handler.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
package gardener

import (
"fmt"
"strconv"
"strings"

gardener "github.com/gardener/gardener/pkg/apis/core/v1beta1"
gardenerhelper "github.com/gardener/gardener/pkg/apis/core/v1beta1/helper"
)

type ErrReason string

func IsRetryable(lastErrors []gardener.LastError) bool {
if len(lastErrors) > 0 &&
!gardenerhelper.HasNonRetryableErrorCode(lastErrors...) {
return true
}
return false
}

func ToErrReason(lastErrors ...gardener.LastError) ErrReason {
var codes []gardener.ErrorCode
var vals []string

for _, e := range lastErrors {
if len(e.Codes) > 0 {
codes = append(codes, e.Codes...)
}
}

for _, code := range codes {
vals = append(vals, string(code))
}

return ErrReason(strings.Join(vals, ", "))
}

func CombineErrorDescriptions(lastErrors []gardener.LastError) string {
var descriptions string
for i, lastError := range lastErrors {
descriptions += fmt.Sprint(strconv.Itoa(i+1), ") ", lastError.Description, " ")
}
return descriptions
}
152 changes: 152 additions & 0 deletions pkg/gardener/gardener_error_handler_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,152 @@
package gardener

import (
"testing"

gardener "github.com/gardener/gardener/pkg/apis/core/v1beta1"
"github.com/stretchr/testify/assert"
)

func TestGardenerErrorHandler(t *testing.T) {
t.Run("Should combine error descriptions", func(t *testing.T) {
// given
lastErrors := []gardener.LastError{
{
Description: "First description",
},
{
Description: "Second description",
},
}

// when
combinedDescritpion := CombineErrorDescriptions(lastErrors)

// then
assert.Equal(t, "1) First description 2) Second description ", combinedDescritpion)
})

for _, testCase := range []struct {
name string
lastErrors []gardener.LastError
expectedRetryable bool
}{
{
name: "Should return true for retryable gardener errors",
lastErrors: fixRetryableErrors(),
expectedRetryable: true,
},
{
name: "Should return false for retryable gardener errors",
lastErrors: fixNonRetryableErrors(),
expectedRetryable: false,
},
{
name: "Should return false for mixture of retryable and non-retryable gardener errors",
lastErrors: fixMixtureOfErrors(),
expectedRetryable: false,
},
{
name: "Should return false empty list",
lastErrors: []gardener.LastError{},
expectedRetryable: false,
},
} {
t.Run(testCase.name, func(t *testing.T) {
// given

// when
retryable := IsRetryable(testCase.lastErrors)

// then
assert.Equal(t, testCase.expectedRetryable, retryable)
})
}
}

func fixRetryableErrors() []gardener.LastError {
return []gardener.LastError{
{
Description: "First description - retryable",
Codes: []gardener.ErrorCode{
gardener.ErrorRetryableConfigurationProblem,
},
},
{
Description: "Second description - retryable",
Codes: []gardener.ErrorCode{
gardener.ErrorRetryableInfraDependencies,
},
},
}
}

func fixNonRetryableErrors() []gardener.LastError {
return []gardener.LastError{
{
Description: "First description - non-retryable",
Codes: []gardener.ErrorCode{
gardener.ErrorInfraDependencies,
},
},
{
Description: "Second description - non-retryable",
Codes: []gardener.ErrorCode{
gardener.ErrorInfraQuotaExceeded,
},
},
{
Description: "Third description - non-retryable",
Codes: []gardener.ErrorCode{
gardener.ErrorInfraUnauthenticated,
},
},
{
Description: "Fourth description - non-retryable",
Codes: []gardener.ErrorCode{
gardener.ErrorInfraUnauthorized,
},
},
{
Description: "Fifth description - non-retryable",
Codes: []gardener.ErrorCode{
gardener.ErrorInfraRateLimitsExceeded,
},
},
{
Description: "Sixth description - non-retryable",
Codes: []gardener.ErrorCode{
gardener.ErrorConfigurationProblem,
},
},
{
Description: "Seventh description - non-retryable",
Codes: []gardener.ErrorCode{
gardener.ErrorProblematicWebhook,
},
},
}
}

func fixMixtureOfErrors() []gardener.LastError {
return []gardener.LastError{
{
Description: "First description - non-retryable",
Codes: []gardener.ErrorCode{
gardener.ErrorInfraDependencies,
},
},
{
Description: "Second description - retryable",
Codes: []gardener.ErrorCode{
gardener.ErrorRetryableConfigurationProblem,
},
},
{
Description: "Third description - non-retryable",
Codes: []gardener.ErrorCode{
gardener.ErrorInfraQuotaExceeded,
},
},
}
}

0 comments on commit de36327

Please sign in to comment.