Skip to content

Commit

Permalink
Improve JobFailedEvent for DeadlineExceeded pods (#574)
Browse files Browse the repository at this point in the history
* Improve JobFailedEvent for DeadlineExceeded pods

Pods can set `activeDeadlineSeconds` to mark how long they should run, if they exceed this active duration then kubernetes will kill them and mark them as failed.

Include a DeadlineExceeded cause in our JobFailedEvent Cause, so it is easy to identify when this happens programmatically

* Add generated dotnet client code

* Simplify comments

* Simplify comments
  • Loading branch information
JamesMurkin authored May 20, 2021
1 parent 9545f0f commit 8016092
Show file tree
Hide file tree
Showing 7 changed files with 168 additions and 117 deletions.
3 changes: 3 additions & 0 deletions client/DotNet/Armada.Client/ClientGenerated.cs
Original file line number Diff line number Diff line change
Expand Up @@ -578,6 +578,9 @@ public enum ApiCause
[System.Runtime.Serialization.EnumMember(Value = @"OOM")]
OOM = 2,

[System.Runtime.Serialization.EnumMember(Value = @"DeadlineExceeded")]
DeadlineExceeded = 3,

}

[System.CodeDom.Compiler.GeneratedCode("NJsonSchema", "10.0.27.0 (Newtonsoft.Json v12.0.0.0)")]
Expand Down
18 changes: 13 additions & 5 deletions internal/executor/util/pod_status.go
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ var invalidImageNameStatesSet = util.StringListToSet([]string{"InvalidImageName"

const oomKilledReason = "OOMKilled"
const evictedReason = "Evicted"
const deadlineExceeded = "DeadlineExceeded"

func ExtractPodStuckReason(pod *v1.Pod) string {
containerStatuses := pod.Status.ContainerStatuses
Expand Down Expand Up @@ -63,6 +64,10 @@ func ExtractPodFailedCause(pod *v1.Pod) api.Cause {
if pod.Status.Reason == evictedReason {
return api.Cause_Evicted
}
if pod.Status.Reason == deadlineExceeded {
return api.Cause_DeadlineExceeded
}

containerStatuses := pod.Status.ContainerStatuses
containerStatuses = append(containerStatuses, pod.Status.InitContainerStatuses...)

Expand Down Expand Up @@ -96,18 +101,21 @@ func ExtractFailedPodContainerStatuses(pod *v1.Pod) []*api.ContainerStatus {
returnStatuses := make([]*api.ContainerStatus, 0, len(containerStatuses))

for _, containerStatus := range containerStatuses {
if containerStatus.State.Terminated == nil {
//This function is meant to be finding exit stauses of containers
//Skip non-finished containers
continue
}
status := &api.ContainerStatus{
Name: containerStatus.Name,
Cause: api.Cause_Error,
}
if isOom(containerStatus) {
status.Cause = api.Cause_OOM
}
if containerStatus.State.Terminated != nil {
status.ExitCode = containerStatus.State.Terminated.ExitCode
status.Message = containerStatus.State.Terminated.Message
status.Reason = containerStatus.State.Terminated.Reason
}
status.ExitCode = containerStatus.State.Terminated.ExitCode
status.Message = containerStatus.State.Terminated.Message
status.Reason = containerStatus.State.Terminated.Reason
returnStatuses = append(returnStatuses, status)
}

Expand Down
33 changes: 33 additions & 0 deletions internal/executor/util/pod_status_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -3,21 +3,25 @@ package util
import (
"strings"
"testing"
"time"

"github.com/stretchr/testify/assert"
v1 "k8s.io/api/core/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"

"github.com/G-Research/armada/pkg/api"
)

var evictedPod *v1.Pod
var oomPod *v1.Pod
var customErrorPod *v1.Pod
var deadlineExceededPod *v1.Pod

func init() {
evictedPod = createEvictedPod()
oomPod = createFailedPod(createOomContainerStatus())
customErrorPod = createFailedPod(createCustomErrorContainerStatus())
deadlineExceededPod = createDeadlineExceededPod()
}

func TestContainersAreRetryable_ReturnTrue_WhenNoContainerInImagePullBackoff(t *testing.T) {
Expand Down Expand Up @@ -126,6 +130,9 @@ func TestExtractPodFailedReason(t *testing.T) {
failedReason := ExtractPodFailedReason(evictedPod)
assert.Equal(t, failedReason, evictedPod.Status.Message)

failedReason = ExtractPodFailedReason(deadlineExceededPod)
assert.Equal(t, failedReason, deadlineExceededPod.Status.Message)

failedReason = ExtractPodFailedReason(oomPod)
assert.True(t, strings.Contains(failedReason, oomPod.Status.ContainerStatuses[0].State.Terminated.Reason))

Expand All @@ -137,6 +144,9 @@ func TestExtractPodFailedCause(t *testing.T) {
failedCause := ExtractPodFailedCause(evictedPod)
assert.Equal(t, failedCause, api.Cause_Evicted)

failedCause = ExtractPodFailedCause(deadlineExceededPod)
assert.Equal(t, failedCause, api.Cause_DeadlineExceeded)

failedCause = ExtractPodFailedCause(oomPod)
assert.Equal(t, failedCause, api.Cause_OOM)

Expand All @@ -148,6 +158,9 @@ func TestExtractFailedPodContainerStatuses(t *testing.T) {
containerStatuses := ExtractFailedPodContainerStatuses(evictedPod)
assert.Equal(t, len(containerStatuses), 0)

containerStatuses = ExtractFailedPodContainerStatuses(deadlineExceededPod)
assert.Equal(t, len(containerStatuses), 0)

containerStatuses = ExtractFailedPodContainerStatuses(oomPod)
assert.Equal(t, len(containerStatuses), 1)
assert.Equal(t, containerStatuses[0].Reason, oomPod.Status.ContainerStatuses[0].State.Terminated.Reason)
Expand Down Expand Up @@ -202,3 +215,23 @@ func createEvictedPod() *v1.Pod {
},
}
}

func createDeadlineExceededPod() *v1.Pod {
//For DeadlineExceeded, Kubernetes leaves the container statuses as Running even though it kills them on the host
containerStatus := v1.ContainerStatus{
Name: "app",
State: v1.ContainerState{
Running: &v1.ContainerStateRunning{
StartedAt: metav1.NewTime(time.Now()),
},
},
}
return &v1.Pod{
Status: v1.PodStatus{
Phase: v1.PodFailed,
Reason: "DeadlineExceeded",
Message: "Pod was active on the node longer than the specified deadline",
ContainerStatuses: []v1.ContainerStatus{containerStatus},
},
}
}
3 changes: 2 additions & 1 deletion pkg/api/api.swagger.go
Original file line number Diff line number Diff line change
Expand Up @@ -252,7 +252,8 @@ func SwaggerJsonTemplate() string {
" \"enum\": [\n" +
" \"Error\",\n" +
" \"Evicted\",\n" +
" \"OOM\"\n" +
" \"OOM\",\n" +
" \"DeadlineExceeded\"\n" +
" ]\n" +
" },\n" +
" \"apiContainerStatus\": {\n" +
Expand Down
3 changes: 2 additions & 1 deletion pkg/api/api.swagger.json
Original file line number Diff line number Diff line change
Expand Up @@ -241,7 +241,8 @@
"enum": [
"Error",
"Evicted",
"OOM"
"OOM",
"DeadlineExceeded"
]
},
"apiContainerStatus": {
Expand Down
Loading

0 comments on commit 8016092

Please sign in to comment.