Skip to content

Commit

Permalink
Ketch can't deploy new apps after failure
Browse files Browse the repository at this point in the history
When an invalid field is set in the App CR like so (it can be any invalid field):

```
      volumes:
      - name: aaa/bbb
        persistentVolumeClaim:
          claimName: aaa/bbb
```

The following kubernetes event will get emitted:

```
failed to get deploy events: create Pod bulletinboard-web-1-0 in StatefulSet bulletinboard-web-1 failed error: Pod "bulletinboard-web-1-0" is invalid: spec.volumes[0].name: Invalid value: "aaa/bbb": a lowercase RFC 1123 label must consist of lower case alphanumeric characters or '-', and must start and end with an alphanumeric character (e.g. 'my-name',  or '123-abc', regex used for validation is '[a-z0-9]([-a-z0-9]*[a-z0-9])?')
```

Due to this, the following for loop will never break as the ObservedGeneration will never increase from 0 due to the invalid field, only until 10 minutes has elapsed with the ctx:
ketch/app_controller.go at c6aa420 · theketchio/ketch

Additional logic within wl, err = cli.Get(ctx) to check the event associated with the workload and return the error on a specific condition (e.Type == "Warning" && e.Reason == "FailedCreate") was added
  • Loading branch information
aleksej-paschenko authored Jun 15, 2022
1 parent 4d7b379 commit b5c69cf
Showing 1 changed file with 37 additions and 0 deletions.
37 changes: 37 additions & 0 deletions internal/controllers/app_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -243,6 +243,12 @@ type condition struct {
Reason string
}

type eventCondition struct {
Type string
Reason string
Message string
}

// workload contains the needed information for watchDeployEvents logic
// deployments and statefulsets are both supported so it became necessary
// to abstract their common properties into a separate type
Expand All @@ -254,6 +260,7 @@ type workload struct {
Generation int
ObservedGeneration int
Conditions []condition
Events []eventCondition
}

type workloadClient struct {
Expand Down Expand Up @@ -284,6 +291,15 @@ func (cli workloadClient) Get(ctx context.Context) (*workload, error) {
for _, c := range o.Status.Conditions {
w.Conditions = append(w.Conditions, condition{Type: string(c.Type), Reason: c.Reason})
}
e, err := cli.k8sClient.CoreV1().Events(cli.workloadNamespace).List(ctx, metav1.ListOptions{FieldSelector: "involvedObject.name=" + o.Name, TypeMeta: metav1.TypeMeta{Kind: "Pod"}})
if err != nil {
return nil, err
}
for _, e := range e.Items {
if e.FirstTimestamp == o.ObjectMeta.CreationTimestamp {
w.Events = append(w.Events, eventCondition{Type: e.Type, Reason: e.Reason, Message: e.Message})
}
}
return &w, nil
case ketchv1.StatefulSetAppType:
o, err := cli.k8sClient.AppsV1().StatefulSets(cli.workloadNamespace).Get(ctx, cli.workloadName, metav1.GetOptions{})
Expand All @@ -303,6 +319,15 @@ func (cli workloadClient) Get(ctx context.Context) (*workload, error) {
for _, c := range o.Status.Conditions {
w.Conditions = append(w.Conditions, condition{Type: string(c.Type), Reason: c.Reason})
}
e, err := cli.k8sClient.CoreV1().Events(cli.workloadNamespace).List(ctx, metav1.ListOptions{FieldSelector: "involvedObject.name=" + o.Name, TypeMeta: metav1.TypeMeta{Kind: "StatefulSet"}})
if err != nil {
return nil, err
}
for _, e := range e.Items {
if e.FirstTimestamp == o.ObjectMeta.CreationTimestamp {
w.Events = append(w.Events, eventCondition{Type: e.Type, Reason: e.Reason, Message: e.Message})
}
}
return &w, nil
}
return nil, fmt.Errorf("unknown workload type")
Expand Down Expand Up @@ -495,6 +520,9 @@ func (r *AppReconciler) watchDeployEvents(ctx context.Context, app *ketchv1.App,
recorder.Eventf(app, v1.EventTypeWarning, ketchv1.AppReconcileError, "error getting deployments: %s", err.Error())
return err
}
if err := checkWorkloadEvent(wl); err != nil {
return err
}
select {
case <-time.After(100 * time.Millisecond):
case <-timeout:
Expand Down Expand Up @@ -681,6 +709,15 @@ func isDeploymentEvent(msg watch.Event, name string) bool {
return ok && strings.HasPrefix(evt.Name, name)
}

func checkWorkloadEvent(wl *workload) error {
for _, e := range wl.Events {
if e.Type == "Warning" && e.Reason == "FailedCreate" {
return errors.New(e.Message)
}
}
return nil
}

// createDeployTimeoutError gets pods that are not status == ready aggregates and returns the pod phase errors
func createDeployTimeoutError(ctx context.Context, cli kubernetes.Interface, app *ketchv1.App, timeout time.Duration, namespace, group, label string) error {
var deploymentVersion int
Expand Down

0 comments on commit b5c69cf

Please sign in to comment.