From bd3623b6cbe5ea57e7944265117153865b040fe0 Mon Sep 17 00:00:00 2001 From: Julien Duchesne Date: Thu, 27 Apr 2023 06:26:18 -0400 Subject: [PATCH] AWS: Allow using a list of subnets (#127) * AWS: Allow using a list of subnets When using more specialized instance types, it may happen that they aren't available in the given AZ By giving multiple subnets, it allows the autoscaler to retry in a different AZ when that happens I saw a TODO about using a small struct to track attempts. By adding that, I could extend the current sizeAlt behavior The autoscaler will now try the regular size in all subnets, then the alternate size in all subnets This is what the output looks like when combined with a size alt (2 invalid subnets + 1 invalid machine type): ``` {"attempt":1,"image":"ami-00149760ce42c967b","level":"debug","msg":"instance create","name":"drone-linux-aws-amd64-ZQumcEFd","region":"us-east-2","size":"blabla","subnet":"a","time":"2023-01-30T16:50:54Z"} {"attempt":1,"error":"InvalidParameterValue: Invalid value 'blabla' for InstanceType.\n\tstatus code: 400, request id: 24a9131c-b46a-4afe-87e0-adea1b509232","image":"ami-00149760ce42c967b","level":"error","msg":"instance create failed","name":"drone-linux-aws-amd64-ZQumcEFd","region":"us-east-2","size":"blabla","subnet":"a","time":"2023-01-30T16:50:54Z"} {"attempt":2,"image":"ami-00149760ce42c967b","level":"debug","msg":"instance create","name":"drone-linux-aws-amd64-ZQumcEFd","region":"us-east-2","size":"blabla","subnet":"b","time":"2023-01-30T16:50:54Z"} {"attempt":2,"error":"InvalidParameterValue: Invalid value 'blabla' for InstanceType.\n\tstatus code: 400, request id: 34d7a00a-dd9f-4689-93fe-3fc5238404df","image":"ami-00149760ce42c967b","level":"error","msg":"instance create failed","name":"drone-linux-aws-amd64-ZQumcEFd","region":"us-east-2","size":"blabla","subnet":"b","time":"2023-01-30T16:50:55Z"} {"attempt":3,"image":"ami-00149760ce42c967b","level":"debug","msg":"instance create","name":"drone-linux-aws-amd64-ZQumcEFd","region":"us-east-2","size":"blabla","subnet":"","time":"2023-01-30T16:50:55Z"} {"attempt":3,"error":"InvalidParameterValue: Invalid value 'blabla' for InstanceType.\n\tstatus code: 400, request id: 2a3d3e92-6fc3-4343-adff-b62f9d31b390","image":"ami-00149760ce42c967b","level":"error","msg":"instance create failed","name":"drone-linux-aws-amd64-ZQumcEFd","region":"us-east-2","size":"blabla","subnet":"","time":"2023-01-30T16:50:55Z"} {"attempt":4,"image":"ami-00149760ce42c967b","level":"debug","msg":"instance create","name":"drone-linux-aws-amd64-ZQumcEFd","region":"us-east-2","size":"m6i.large","subnet":"a","time":"2023-01-30T16:50:55Z"} {"attempt":4,"error":"InvalidSubnetID.NotFound: The subnet ID 'a' does not exist\n\tstatus code: 400, request id: 1e4e7e07-56c6-489e-b01a-a0ba1fcca1c5","image":"ami-00149760ce42c967b","level":"error","msg":"instance create failed","name":"drone-linux-aws-amd64-ZQumcEFd","region":"us-east-2","size":"m6i.large","subnet":"a","time":"2023-01-30T16:50:56Z"} {"attempt":5,"image":"ami-00149760ce42c967b","level":"debug","msg":"instance create","name":"drone-linux-aws-amd64-ZQumcEFd","region":"us-east-2","size":"m6i.large","subnet":"b","time":"2023-01-30T16:50:56Z"} {"attempt":5,"error":"InvalidSubnetID.NotFound: The subnet ID 'b' does not exist\n\tstatus code: 400, request id: 7974f101-a5cd-4ed6-831d-7731dda2f172","image":"ami-00149760ce42c967b","level":"error","msg":"instance create failed","name":"drone-linux-aws-amd64-ZQumcEFd","region":"us-east-2","size":"m6i.large","subnet":"b","time":"2023-01-30T16:50:56Z"} {"attempt":6,"image":"ami-00149760ce42c967b","level":"debug","msg":"instance create","name":"drone-linux-aws-amd64-ZQumcEFd","region":"us-east-2","size":"m6i.large","subnet":"","time":"2023-01-30T16:50:56Z"} {"attempt":6,"image":"ami-00149760ce42c967b","level":"info","msg":"instance create success","name":"drone-linux-aws-amd64-ZQumcEFd","region":"us-east-2","size":"m6i.large","subnet":"","time":"2023-01-30T16:50:57Z"} {"attempt":6,"image":"ami-00149760ce42c967b","level":"debug","msg":"check instance network","name":"drone-linux-aws-amd64-ZQumcEFd","region":"us-east-2","size":"m6i.large","subnet":"","time":"2023-01-30T16:50:57Z"} ``` * Introduce new config for alternate subnets This maintains API retrocompatibility --- cmd/drone-autoscaler/main.go | 2 +- config/config.go | 1 + config/load_test.go | 5 +++ drivers/amazon/create.go | 71 ++++++++++++++++++++--------------- drivers/amazon/option.go | 6 +-- drivers/amazon/option_test.go | 4 +- drivers/amazon/provider.go | 2 +- drivers/amazon/setup.go | 2 +- 8 files changed, 55 insertions(+), 38 deletions(-) diff --git a/cmd/drone-autoscaler/main.go b/cmd/drone-autoscaler/main.go index 48769499..6df03135 100644 --- a/cmd/drone-autoscaler/main.go +++ b/cmd/drone-autoscaler/main.go @@ -312,7 +312,7 @@ func setupProvider(c config.Config) (autoscaler.Provider, error) { amazon.WithSecurityGroup(c.Amazon.SecurityGroup...), amazon.WithSize(c.Amazon.Instance), amazon.WithSizeAlt(c.Amazon.InstanceAlt), - amazon.WithSubnet(c.Amazon.SubnetID), + amazon.WithSubnets(append([]string{c.Amazon.SubnetID}, c.Amazon.SubnetIDsAlt...)), amazon.WithTags(c.Amazon.Tags), amazon.WithUserData(c.Amazon.UserData), amazon.WithUserDataFile(c.Amazon.UserDataFile), diff --git a/config/config.go b/config/config.go index 3b36185d..427e4dae 100644 --- a/config/config.go +++ b/config/config.go @@ -132,6 +132,7 @@ type ( Retries int SSHKey string SubnetID string `split_words:"true"` + SubnetIDsAlt []string `envconfig:"DRONE_AMAZON_SUBNET_IDS_ALT"` // In the same manner as InstanceAlt, allows fallback to other subnets if provisioning in the main one fails SecurityGroup []string `split_words:"true"` Tags map[string]string UserData string `envconfig:"DRONE_AMAZON_USERDATA"` diff --git a/config/load_test.go b/config/load_test.go index 49b44d06..e9aeb5ca 100644 --- a/config/load_test.go +++ b/config/load_test.go @@ -125,6 +125,7 @@ func TestLoad(t *testing.T) { "DRONE_AMAZON_REGION": "us-east-2", "DRONE_AMAZON_SSHKEY": "id_rsa", "DRONE_AMAZON_SUBNET_ID": "subnet-0b32177f", + "DRONE_AMAZON_SUBNET_IDS_ALT": "subnet-abcd,subnet-efgh", "DRONE_AMAZON_SECURITY_GROUP": "sg-770eabe1", "DRONE_AMAZON_TAGS": "os:linux,arch:amd64", "DRONE_AMAZON_USERDATA": "#cloud-init", @@ -264,6 +265,10 @@ var jsonConfig = []byte(`{ "Region": "us-east-2", "SSHKey": "id_rsa", "SubnetID": "subnet-0b32177f", + "SubnetIDsAlt": [ + "subnet-abcd", + "subnet-efgh" + ], "SecurityGroup": [ "sg-770eabe1" ], diff --git a/drivers/amazon/create.go b/drivers/amazon/create.go index 7d999ac3..a6be16e3 100644 --- a/drivers/amazon/create.go +++ b/drivers/amazon/create.go @@ -8,6 +8,7 @@ import ( "bytes" "context" "encoding/base64" + "fmt" "time" "github.com/drone/autoscaler" @@ -17,16 +18,41 @@ import ( "github.com/aws/aws-sdk-go/service/ec2" ) +type attemptOverrides struct { + attempt int + size string + subnet string +} + func (p *provider) Create(ctx context.Context, opts autoscaler.InstanceCreateOpts) (*autoscaler.Instance, error) { - instance, err := p.create(ctx, opts, false) - // if the instance was successfully provisioned, - // return the instance. - if err == nil { - return instance, err + attemptOverrides := attemptOverrides{ + attempt: 1, + size: p.size, } - // if the instance was provisioned with errors, - // return the instance and the error + tryCreateInAllSubnets := func() (*autoscaler.Instance, error) { + var ( + instance *autoscaler.Instance + err error + ) + for _, subnet := range p.subnets { + attemptOverrides.subnet = subnet + + instance, err = p.create(ctx, opts, attemptOverrides) + // if the instance was provisioned (with or without errors), return the instance. + if instance != nil { + return instance, err + } + + attemptOverrides.attempt++ + } + + return nil, fmt.Errorf("failed to create instance in all subnets: %w", err) + } + + instance, err := tryCreateInAllSubnets() + + // if the instance was provisioned (with or without errors), return the instance. if instance != nil { return instance, err } @@ -34,14 +60,15 @@ func (p *provider) Create(ctx context.Context, opts autoscaler.InstanceCreateOpt // if the instance was not provisioned, and fallback // parameters were provided, retry using the fallback if p.sizeAlt != "" { - instance, err = p.create(ctx, opts, true) + attemptOverrides.size = p.sizeAlt + instance, err = tryCreateInAllSubnets() } // if there is no fallback logic do not retry return instance, err } -func (p *provider) create(ctx context.Context, opts autoscaler.InstanceCreateOpts, retry bool) (*autoscaler.Instance, error) { +func (p *provider) create(ctx context.Context, opts autoscaler.InstanceCreateOpts, overrides attemptOverrides) (*autoscaler.Instance, error) { p.init.Do(func() { p.setup(ctx) }) @@ -76,7 +103,7 @@ func (p *provider) create(ctx context.Context, opts autoscaler.InstanceCreateOpt in := &ec2.RunInstancesInput{ KeyName: aws.String(p.key), ImageId: aws.String(p.image), - InstanceType: aws.String(p.size), + InstanceType: aws.String(overrides.size), MinCount: aws.Int64(1), MaxCount: aws.Int64(1), InstanceMarketOptions: marketOptions, @@ -86,7 +113,7 @@ func (p *provider) create(ctx context.Context, opts autoscaler.InstanceCreateOpt { AssociatePublicIpAddress: aws.Bool(!p.privateIP), DeviceIndex: aws.Int64(0), - SubnetId: aws.String(p.subnet), + SubnetId: aws.String(overrides.subnet), Groups: aws.StringSlice(p.groups), }, }, @@ -125,28 +152,12 @@ func (p *provider) create(ctx context.Context, opts autoscaler.InstanceCreateOpt } logger := logger.FromContext(ctx). - WithField("attempt", 1). + WithField("attempt", overrides.attempt). + WithField("size", overrides.size). + WithField("subnet", overrides.subnet). WithField("region", p.region). WithField("image", p.image). - WithField("size", p.size). WithField("name", opts.Name) - - // TODO(bradyrdzewski) instead of passing a re-try flag - // and then setting parameters, we should instead accept - // an struct that specifies the size, image and any other - // alternate values that one may want to try - - // if this is our second attempt to create the instance, - // re-create using the alternate instance size. - if retry { - in.InstanceType = aws.String(p.sizeAlt) - - // update the logger to reflect this is a retry. - logger = logger. - WithField("size", p.sizeAlt). - WithField("attempt", 2) - } - logger.Debug("instance create") results, err := client.RunInstances(in) diff --git a/drivers/amazon/option.go b/drivers/amazon/option.go index 15aba88a..6684dfe5 100644 --- a/drivers/amazon/option.go +++ b/drivers/amazon/option.go @@ -78,10 +78,10 @@ func WithSSHKey(key string) Option { } } -// WithSubnet returns an option to set the subnet id. -func WithSubnet(id string) Option { +// WithSubnets returns an option to set the subnet ids. +func WithSubnets(ids []string) Option { return func(p *provider) { - p.subnet = id + p.subnets = ids } } diff --git a/drivers/amazon/option_test.go b/drivers/amazon/option_test.go index a3c4d6a8..d1540238 100644 --- a/drivers/amazon/option_test.go +++ b/drivers/amazon/option_test.go @@ -16,7 +16,7 @@ func TestOptions(t *testing.T) { WithSecurityGroup("sg-770eabe1"), WithSize("t3.2xlarge"), WithSSHKey("id_rsa"), - WithSubnet("subnet-0b32177f"), + WithSubnets([]string{"subnet-0b32177f"}), WithTags(map[string]string{"foo": "bar", "baz": "qux"}), WithVolumeSize(64), WithVolumeType("io1"), @@ -40,7 +40,7 @@ func TestOptions(t *testing.T) { if got, want := p.groups[0], "sg-770eabe1"; got != want { t.Errorf("Want security groups %q, got %q", want, got) } - if got, want := p.subnet, "subnet-0b32177f"; got != want { + if got, want := p.subnets, []string{"subnet-0b32177f"}; len(got) != 1 || got[0] != want[0] { t.Errorf("Want subnet %q, got %q", want, got) } if got, want := p.retries, 10; got != want { diff --git a/drivers/amazon/provider.go b/drivers/amazon/provider.go index 8b787244..62234a8b 100644 --- a/drivers/amazon/provider.go +++ b/drivers/amazon/provider.go @@ -32,7 +32,7 @@ type provider struct { userdata *template.Template size string sizeAlt string - subnet string + subnets []string groups []string tags map[string]string iamProfileArn string diff --git a/drivers/amazon/setup.go b/drivers/amazon/setup.go index c93b0649..4651bd29 100644 --- a/drivers/amazon/setup.go +++ b/drivers/amazon/setup.go @@ -21,7 +21,7 @@ func (p *provider) setup(ctx context.Context) error { return p.setupKeypair(ctx) }) } - if p.subnet == "" { + if len(p.subnets) == 0 { // TODO: find or create subnet } if len(p.groups) == 0 {