diff --git a/.buildkite/steps/tests.sh b/.buildkite/steps/tests.sh
index 4953d9c0..084a29fd 100755
--- a/.buildkite/steps/tests.sh
+++ b/.buildkite/steps/tests.sh
@@ -13,6 +13,5 @@ export IMAGE
 
 gotestsum --junitfile "junit-${BUILDKITE_JOB_ID}.xml" -- \
   -count=1 \
-  -failfast \
   -ldflags="-X ${package}.branch=${branch}" \
   ./...
diff --git a/internal/controller/scheduler/scheduler.go b/internal/controller/scheduler/scheduler.go
index a0c34ecf..7177fc3b 100644
--- a/internal/controller/scheduler/scheduler.go
+++ b/internal/controller/scheduler/scheduler.go
@@ -67,7 +67,7 @@ func (w *worker) Create(ctx context.Context, job *api.CommandJob) error {
 	logger := w.logger.With(zap.String("uuid", job.Uuid))
 	logger.Info("creating job")
 	jobWrapper := NewJobWrapper(w.logger, job, w.cfg).ParsePlugins()
-	kjob, err := jobWrapper.Build()
+	kjob, err := jobWrapper.Build(false)
 	if err != nil {
 		kjob, err = jobWrapper.BuildFailureJob(err)
 		if err != nil {
@@ -146,7 +146,7 @@ func (w *jobWrapper) ParsePlugins() *jobWrapper {
 	return w
 }
 
-func (w *jobWrapper) Build() (*batchv1.Job, error) {
+func (w *jobWrapper) Build(skipCheckout bool) (*batchv1.Job, error) {
 	// if previous steps have failed, error immediately
 	if w.err != nil {
 		return nil, w.err
@@ -231,7 +231,11 @@ func (w *jobWrapper) Build() (*batchv1.Job, error) {
 	volumeMounts := []corev1.VolumeMount{{Name: "workspace", MountPath: "/workspace"}}
 	volumeMounts = append(volumeMounts, w.k8sPlugin.ExtraVolumeMounts...)
 
-	const systemContainers = 1
+	systemContainerCount := 0
+	if !skipCheckout {
+		systemContainerCount = 1
+	}
+
 	ttl := int32(w.cfg.JobTTL.Seconds())
 	kjob.Spec.TTLSecondsAfterFinished = &ttl
 
@@ -261,7 +265,7 @@ func (w *jobWrapper) Build() (*batchv1.Job, error) {
 			},
 			corev1.EnvVar{
 				Name:  "BUILDKITE_CONTAINER_ID",
-				Value: strconv.Itoa(i + systemContainers),
+				Value: strconv.Itoa(i + systemContainerCount),
 			},
 			corev1.EnvVar{
 				Name: "BUILDKITE_PLUGINS_PATH",
@@ -294,7 +298,7 @@ func (w *jobWrapper) Build() (*batchv1.Job, error) {
 		podSpec.Containers[i] = c
 	}
 
-	containerCount := len(podSpec.Containers) + systemContainers
+	containerCount := len(podSpec.Containers) + systemContainerCount
 
 	for i, c := range w.k8sPlugin.Sidecars {
 		if c.Name == "" {
@@ -366,10 +370,12 @@ func (w *jobWrapper) Build() (*batchv1.Job, error) {
 		},
 	}
 	agentContainer.Env = append(agentContainer.Env, env...)
+	podSpec.Containers = append(podSpec.Containers, agentContainer)
 
-	checkoutContainer := w.createCheckoutContainer(kjob, env, volumeMounts)
+	if !skipCheckout {
+		podSpec.Containers = append(podSpec.Containers, w.createCheckoutContainer(kjob, env, volumeMounts))
+	}
 
-	podSpec.Containers = append(podSpec.Containers, agentContainer, checkoutContainer)
 	podSpec.InitContainers = append(podSpec.InitContainers, corev1.Container{
 		Name:  "copy-agent",
 		Image: w.cfg.Image,
@@ -495,14 +501,24 @@ func (w *jobWrapper) BuildFailureJob(err error) (*batchv1.Job, error) {
 		PodSpec: &corev1.PodSpec{
 			Containers: []corev1.Container{
 				{
-					Image:   w.cfg.Image,
+					// The configured agent image may be private. If there is an error in
+					// specifying the secrets for that image, we should still be able to run
+					// the failure job, so we bypass the potentially private image and use a
+					// public one. We could use a thinner public image like `alpine:latest`,
+					// but it's generally unwise to depend on an image that isn't published
+					// by us.
+					//
+					// TODO: pin the version of the agent image and use that here. Currently,
+					// DefaultAgentImage has a `latest` tag, which is not ideal: a given version
+					// of agent-stack-k8s may use different versions of the agent image over time.
+					Image:   config.DefaultAgentImage,
 					Command: []string{fmt.Sprintf("echo %q && exit 1", err.Error())},
 				},
 			},
 		},
 	}
 	w.otherPlugins = nil
-	return w.Build()
+	return w.Build(true)
 }
 
 func (w *jobWrapper) labelWithAgentTags() {
diff --git a/internal/controller/scheduler/scheduler_test.go b/internal/controller/scheduler/scheduler_test.go
index b2ff6402..91096e8e 100644
--- a/internal/controller/scheduler/scheduler_test.go
+++ b/internal/controller/scheduler/scheduler_test.go
@@ -63,7 +63,7 @@ func TestJobPluginConversion(t *testing.T) {
 		input,
 		scheduler.Config{AgentToken: "token-secret"},
 	)
-	result, err := wrapper.ParsePlugins().Build()
+	result, err := wrapper.ParsePlugins().Build(false)
 	require.NoError(t, err)
 
 	assert.Len(t, result.Spec.Template.Spec.Containers, 3)
@@ -122,7 +122,7 @@ func TestTagEnv(t *testing.T) {
 		AgentQueryRules: []string{"queue=kubernetes"},
 	}
 	wrapper := scheduler.NewJobWrapper(logger, input, scheduler.Config{AgentToken: "token-secret"})
-	result, err := wrapper.ParsePlugins().Build()
+	result, err := wrapper.ParsePlugins().Build(false)
 	require.NoError(t, err)
 
 	container := findContainer(t, result.Spec.Template.Spec.Containers, "agent")
@@ -152,7 +152,7 @@ func TestJobWithNoKubernetesPlugin(t *testing.T) {
 		AgentQueryRules: []string{},
 	}
 	wrapper := scheduler.NewJobWrapper(zaptest.NewLogger(t), input, scheduler.Config{})
-	result, err := wrapper.ParsePlugins().Build()
+	result, err := wrapper.ParsePlugins().Build(false)
 	require.NoError(t, err)
 
 	require.Len(t, result.Spec.Template.Spec.Containers, 3)
@@ -179,7 +179,7 @@ func TestFailureJobs(t *testing.T) {
 		AgentQueryRules: []string{"queue=kubernetes"},
 	}
 	wrapper := scheduler.NewJobWrapper(zaptest.NewLogger(t), input, scheduler.Config{})
-	_, err = wrapper.ParsePlugins().Build()
+	_, err = wrapper.ParsePlugins().Build(false)
 	require.Error(t, err)
 
 	result, err := wrapper.BuildFailureJob(err)