From a28fcd42853751395c46b4ac3a307ebca9bb7757 Mon Sep 17 00:00:00 2001 From: pducolin <45568537+pducolin@users.noreply.github.com> Date: Thu, 12 Sep 2024 08:19:45 +0200 Subject: [PATCH] [e2e] recreate stack on fakeintake timeout (#29253) --- .../pkg/utils/infra/retriable_errors.go | 16 +++++-- test/new-e2e/pkg/utils/infra/stack_manager.go | 19 +++++--- .../pkg/utils/infra/stack_manager_test.go | 48 ++++++++++++++++++- .../system-probe/system-probe-test-env.go | 2 +- 4 files changed, 72 insertions(+), 13 deletions(-) diff --git a/test/new-e2e/pkg/utils/infra/retriable_errors.go b/test/new-e2e/pkg/utils/infra/retriable_errors.go index b8d5c27b53195..7d28f17006460 100644 --- a/test/new-e2e/pkg/utils/infra/retriable_errors.go +++ b/test/new-e2e/pkg/utils/infra/retriable_errors.go @@ -24,22 +24,30 @@ func getKnownErrors() []knownError { // Add here errors that are known to be flakes and that should be retried return []knownError{ { - errorMessage: "i/o timeout", + errorMessage: `i\/o timeout`, retryType: ReCreate, }, { // https://datadoghq.atlassian.net/browse/ADXT-1 - errorMessage: "failed attempts: dial tcp :22: connect: connection refused", + errorMessage: `failed attempts: dial tcp :22: connect: connection refused`, retryType: ReCreate, }, { // https://datadoghq.atlassian.net/browse/ADXT-295 - errorMessage: "Resource provider reported that the resource did not exist while updating", + errorMessage: `Resource provider reported that the resource did not exist while updating`, retryType: ReCreate, }, { // https://datadoghq.atlassian.net/browse/ADXT-558 - errorMessage: "Process exited with status 2: running \" sudo cloud-init status --wait\"", + errorMessage: `Process exited with status 2: running " sudo cloud-init status --wait"`, + retryType: ReCreate, + }, + { + errorMessage: `waiting for ECS Service .+fakeintake-ecs.+ create: timeout while waiting for state to become 'tfSTABLE'`, + retryType: ReCreate, + }, + { + errorMessage: `error while waiting for fakeintake`, retryType: ReCreate, }, } diff --git a/test/new-e2e/pkg/utils/infra/stack_manager.go b/test/new-e2e/pkg/utils/infra/stack_manager.go index 2a580fd651d7d..15d9e44c2139b 100644 --- a/test/new-e2e/pkg/utils/infra/stack_manager.go +++ b/test/new-e2e/pkg/utils/infra/stack_manager.go @@ -12,6 +12,7 @@ import ( "fmt" "io" "os" + "regexp" "runtime" "strings" "sync" @@ -56,16 +57,16 @@ var ( initStackManager sync.Once ) -// RetryStrategy is a function that given the current error and the number of retries, returns the type of retry to perform and a list of options to modify the configuration -type RetryStrategy func(error, int) (RetryType, []GetStackOption) +// RetryStrategyFromFn is a function that given the current error and the number of retries, returns the type of retry to perform and a list of options to modify the configuration +type RetryStrategyFromFn func(error, int) (RetryType, []GetStackOption) // StackManager handles type StackManager struct { stacks *safeStackMap knownErrors []knownError - // RetryStrategy defines how to handle retries. By default points to StackManager.getRetryStrategyFrom but can be overridden - RetryStrategy RetryStrategy + // GetRetryStrategyFrom defines how to handle retries. By default points to StackManager.getRetryStrategyFrom but can be overridden + GetRetryStrategyFrom RetryStrategyFromFn } type safeStackMap struct { @@ -120,7 +121,7 @@ func newStackManager() (*StackManager, error) { stacks: newSafeStackMap(), knownErrors: getKnownErrors(), } - sm.RetryStrategy = sm.getRetryStrategyFrom + sm.GetRetryStrategyFrom = sm.getRetryStrategyFrom return sm, nil } @@ -523,7 +524,7 @@ func (sm *StackManager) getStack(ctx context.Context, name string, deployFunc pu } } - retryStrategy, changedOpts := sm.RetryStrategy(upError, upCount) + retryStrategy, changedOpts := sm.GetRetryStrategyFrom(upError, upCount) sendEventToDatadog(params.DatadogEventSender, fmt.Sprintf("[E2E] Stack %s : error on Pulumi stack up", name), upError.Error(), []string{"operation:up", "result:fail", fmt.Sprintf("retry:%s", retryStrategy), fmt.Sprintf("stack:%s", stack.Name()), fmt.Sprintf("retries:%d", upCount)}) switch retryStrategy { @@ -619,7 +620,11 @@ func (sm *StackManager) getRetryStrategyFrom(err error, upCount int) (RetryType, } for _, knownError := range sm.knownErrors { - if strings.Contains(err.Error(), knownError.errorMessage) { + isMatch, err := regexp.MatchString(knownError.errorMessage, err.Error()) + if err != nil { + fmt.Printf("Error matching regex %s: %v\n", knownError.errorMessage, err) + } + if isMatch { return knownError.retryType, nil } } diff --git a/test/new-e2e/pkg/utils/infra/stack_manager_test.go b/test/new-e2e/pkg/utils/infra/stack_manager_test.go index c10aaba87ee4e..56de15ce98455 100644 --- a/test/new-e2e/pkg/utils/infra/stack_manager_test.go +++ b/test/new-e2e/pkg/utils/infra/stack_manager_test.go @@ -8,13 +8,15 @@ package infra import ( "context" + "errors" "fmt" - "github.com/DataDog/datadog-agent/test/new-e2e/pkg/utils/common" "io" "strings" "testing" "time" + "github.com/DataDog/datadog-agent/test/new-e2e/pkg/utils/common" + "github.com/DataDog/datadog-api-client-go/v2/api/datadogV1" "github.com/pulumi/pulumi/sdk/v3/go/auto" "github.com/pulumi/pulumi/sdk/v3/go/pulumi" @@ -225,6 +227,50 @@ func TestStackManager(t *testing.T) { assert.Contains(t, mockDatadogEventSender.events[1].Title, fmt.Sprintf("[E2E] Stack %s : error on Pulumi stack up", stackName)) assert.Contains(t, mockDatadogEventSender.events[2].Title, fmt.Sprintf("[E2E] Stack %s : success on Pulumi stack up", stackName)) }) + + t.Run("should-return-retry-strategy-on-retriable-errors", func(t *testing.T) { + t.Parallel() + + type testError struct { + name string + errMessage string + expectedRetryType RetryType + } + + testErrors := []testError{ + { + name: "timeout", + errMessage: "i/o timeout", + expectedRetryType: ReCreate, + }, + { + name: "connection-refused", + errMessage: "failed attempts: dial tcp :22: connect: connection refused", + expectedRetryType: ReCreate, + }, + { + name: "resource-not-exist", + errMessage: "Resource provider reported that the resource did not exist while updating", + expectedRetryType: ReCreate, + }, + { + name: "cloud-init-timeout", + errMessage: "Process exited with status 2: running \" sudo cloud-init status --wait\"", + expectedRetryType: ReCreate, + }, + { + name: "ecs-fakeintake-timeout", + errMessage: "waiting for ECS Service (arn:aws:ecs:us-east-1:669783387624:service/fakeintake-ecs/ci-633219896-4670-e2e-dockersuite-80f62edf7bcc6194-aws-fakeintake-dockervm-srv) create: timeout while waiting for state to become 'tfSTABLE' (last state: 'tfPENDING', timeout: 20m0s)", + expectedRetryType: ReCreate, + }, + } + + for _, te := range testErrors { + err := errors.New(te.errMessage) + retryType, _ := stackManager.getRetryStrategyFrom(err, 0) + assert.Equal(t, te.expectedRetryType, retryType, te.name) + } + }) } func filterRetryOnErrorLogs(logs []string) []string { diff --git a/test/new-e2e/system-probe/system-probe-test-env.go b/test/new-e2e/system-probe/system-probe-test-env.go index 516f47d76244a..2148e4d94427e 100644 --- a/test/new-e2e/system-probe/system-probe-test-env.go +++ b/test/new-e2e/system-probe/system-probe-test-env.go @@ -252,7 +252,7 @@ func NewTestEnv(name, x86InstanceType, armInstanceType string, opts *EnvOpts) (* infraEnv: opts.InfraEnv, } - stackManager.RetryStrategy = retryHandler.HandleError + stackManager.GetRetryStrategyFrom = retryHandler.HandleError pulumiStack, upResult, pulumiErr := stackManager.GetStackNoDeleteOnFailure( systemProbeTestEnv.context, systemProbeTestEnv.name,