Skip to content

Commit

Permalink
[e2e] recreate stack on fakeintake timeout (#29253)
Browse files Browse the repository at this point in the history
  • Loading branch information
pducolin authored Sep 12, 2024
1 parent 0c21812 commit a28fcd4
Show file tree
Hide file tree
Showing 4 changed files with 72 additions and 13 deletions.
16 changes: 12 additions & 4 deletions test/new-e2e/pkg/utils/infra/retriable_errors.go
Original file line number Diff line number Diff line change
Expand Up @@ -24,22 +24,30 @@ func getKnownErrors() []knownError {
// Add errors here that are known to be flaky and should be retried
return []knownError{
{
errorMessage: "i/o timeout",
errorMessage: `i\/o timeout`,
retryType: ReCreate,
},
{
// https://datadoghq.atlassian.net/browse/ADXT-1
errorMessage: "failed attempts: dial tcp :22: connect: connection refused",
errorMessage: `failed attempts: dial tcp :22: connect: connection refused`,
retryType: ReCreate,
},
{
// https://datadoghq.atlassian.net/browse/ADXT-295
errorMessage: "Resource provider reported that the resource did not exist while updating",
errorMessage: `Resource provider reported that the resource did not exist while updating`,
retryType: ReCreate,
},
{
// https://datadoghq.atlassian.net/browse/ADXT-558
errorMessage: "Process exited with status 2: running \" sudo cloud-init status --wait\"",
errorMessage: `Process exited with status 2: running " sudo cloud-init status --wait"`,
retryType: ReCreate,
},
{
errorMessage: `waiting for ECS Service .+fakeintake-ecs.+ create: timeout while waiting for state to become 'tfSTABLE'`,
retryType: ReCreate,
},
{
errorMessage: `error while waiting for fakeintake`,
retryType: ReCreate,
},
}
Expand Down
19 changes: 12 additions & 7 deletions test/new-e2e/pkg/utils/infra/stack_manager.go
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ import (
"fmt"
"io"
"os"
"regexp"
"runtime"
"strings"
"sync"
Expand Down Expand Up @@ -56,16 +57,16 @@ var (
initStackManager sync.Once
)

// RetryStrategy is a function that given the current error and the number of retries, returns the type of retry to perform and a list of options to modify the configuration
type RetryStrategy func(error, int) (RetryType, []GetStackOption)
// RetryStrategyFromFn is a function that, given the current error and the number of retries, returns the type of retry to perform and a list of options to modify the configuration
type RetryStrategyFromFn func(error, int) (RetryType, []GetStackOption)

// StackManager handles Pulumi stacks and decides how to retry when a stack up fails
type StackManager struct {
stacks *safeStackMap
knownErrors []knownError

// RetryStrategy defines how to handle retries. By default points to StackManager.getRetryStrategyFrom but can be overridden
RetryStrategy RetryStrategy
// GetRetryStrategyFrom defines how to handle retries. By default points to StackManager.getRetryStrategyFrom but can be overridden
GetRetryStrategyFrom RetryStrategyFromFn
}

type safeStackMap struct {
Expand Down Expand Up @@ -120,7 +121,7 @@ func newStackManager() (*StackManager, error) {
stacks: newSafeStackMap(),
knownErrors: getKnownErrors(),
}
sm.RetryStrategy = sm.getRetryStrategyFrom
sm.GetRetryStrategyFrom = sm.getRetryStrategyFrom

return sm, nil
}
Expand Down Expand Up @@ -523,7 +524,7 @@ func (sm *StackManager) getStack(ctx context.Context, name string, deployFunc pu
}
}

retryStrategy, changedOpts := sm.RetryStrategy(upError, upCount)
retryStrategy, changedOpts := sm.GetRetryStrategyFrom(upError, upCount)
sendEventToDatadog(params.DatadogEventSender, fmt.Sprintf("[E2E] Stack %s : error on Pulumi stack up", name), upError.Error(), []string{"operation:up", "result:fail", fmt.Sprintf("retry:%s", retryStrategy), fmt.Sprintf("stack:%s", stack.Name()), fmt.Sprintf("retries:%d", upCount)})

switch retryStrategy {
Expand Down Expand Up @@ -619,7 +620,11 @@ func (sm *StackManager) getRetryStrategyFrom(err error, upCount int) (RetryType,
}

for _, knownError := range sm.knownErrors {
if strings.Contains(err.Error(), knownError.errorMessage) {
isMatch, err := regexp.MatchString(knownError.errorMessage, err.Error())
if err != nil {
fmt.Printf("Error matching regex %s: %v\n", knownError.errorMessage, err)
}
if isMatch {
return knownError.retryType, nil
}
}
Expand Down
48 changes: 47 additions & 1 deletion test/new-e2e/pkg/utils/infra/stack_manager_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -8,13 +8,15 @@ package infra

import (
"context"
"errors"
"fmt"
"github.com/DataDog/datadog-agent/test/new-e2e/pkg/utils/common"
"io"
"strings"
"testing"
"time"

"github.com/DataDog/datadog-agent/test/new-e2e/pkg/utils/common"

"github.com/DataDog/datadog-api-client-go/v2/api/datadogV1"
"github.com/pulumi/pulumi/sdk/v3/go/auto"
"github.com/pulumi/pulumi/sdk/v3/go/pulumi"
Expand Down Expand Up @@ -225,6 +227,50 @@ func TestStackManager(t *testing.T) {
assert.Contains(t, mockDatadogEventSender.events[1].Title, fmt.Sprintf("[E2E] Stack %s : error on Pulumi stack up", stackName))
assert.Contains(t, mockDatadogEventSender.events[2].Title, fmt.Sprintf("[E2E] Stack %s : success on Pulumi stack up", stackName))
})

t.Run("should-return-retry-strategy-on-retriable-errors", func(t *testing.T) {
t.Parallel()

type testError struct {
name string
errMessage string
expectedRetryType RetryType
}

testErrors := []testError{
{
name: "timeout",
errMessage: "i/o timeout",
expectedRetryType: ReCreate,
},
{
name: "connection-refused",
errMessage: "failed attempts: dial tcp :22: connect: connection refused",
expectedRetryType: ReCreate,
},
{
name: "resource-not-exist",
errMessage: "Resource provider reported that the resource did not exist while updating",
expectedRetryType: ReCreate,
},
{
name: "cloud-init-timeout",
errMessage: "Process exited with status 2: running \" sudo cloud-init status --wait\"",
expectedRetryType: ReCreate,
},
{
name: "ecs-fakeintake-timeout",
errMessage: "waiting for ECS Service (arn:aws:ecs:us-east-1:669783387624:service/fakeintake-ecs/ci-633219896-4670-e2e-dockersuite-80f62edf7bcc6194-aws-fakeintake-dockervm-srv) create: timeout while waiting for state to become 'tfSTABLE' (last state: 'tfPENDING', timeout: 20m0s)",
expectedRetryType: ReCreate,
},
}

for _, te := range testErrors {
err := errors.New(te.errMessage)
retryType, _ := stackManager.getRetryStrategyFrom(err, 0)
assert.Equal(t, te.expectedRetryType, retryType, te.name)
}
})
}

func filterRetryOnErrorLogs(logs []string) []string {
Expand Down
2 changes: 1 addition & 1 deletion test/new-e2e/system-probe/system-probe-test-env.go
Original file line number Diff line number Diff line change
Expand Up @@ -252,7 +252,7 @@ func NewTestEnv(name, x86InstanceType, armInstanceType string, opts *EnvOpts) (*
infraEnv: opts.InfraEnv,
}

stackManager.RetryStrategy = retryHandler.HandleError
stackManager.GetRetryStrategyFrom = retryHandler.HandleError
pulumiStack, upResult, pulumiErr := stackManager.GetStackNoDeleteOnFailure(
systemProbeTestEnv.context,
systemProbeTestEnv.name,
Expand Down

0 comments on commit a28fcd4

Please sign in to comment.