Skip to content

Commit

Permalink
Include exit codes in job failed event. (#282)
Browse files Browse the repository at this point in the history
  • Loading branch information
jankaspar authored Dec 16, 2019
1 parent a7a6f9e commit 1365f1c
Show file tree
Hide file tree
Showing 9 changed files with 253 additions and 69 deletions.
3 changes: 3 additions & 0 deletions client/DotNet/Armada.Client/ClientGenerated.cs
Original file line number Diff line number Diff line change
Expand Up @@ -594,6 +594,9 @@ public partial class ApiJobFailedEvent
[Newtonsoft.Json.JsonProperty("Created", Required = Newtonsoft.Json.Required.Default, NullValueHandling = Newtonsoft.Json.NullValueHandling.Ignore)]
public System.DateTimeOffset? Created { get; set; }

[Newtonsoft.Json.JsonProperty("ExitCodes", Required = Newtonsoft.Json.Required.Default, NullValueHandling = Newtonsoft.Json.NullValueHandling.Ignore)]
public System.Collections.Generic.IDictionary<string, int> ExitCodes { get; set; }

[Newtonsoft.Json.JsonProperty("JobId", Required = Newtonsoft.Json.Required.Default, NullValueHandling = Newtonsoft.Json.NullValueHandling.Ignore)]
public string JobId { get; set; }

Expand Down
7 changes: 7 additions & 0 deletions internal/armada/api/api.swagger.go
Original file line number Diff line number Diff line change
Expand Up @@ -316,6 +316,13 @@ func SwaggerJsonTemplate() string {
" \"type\": \"string\",\n" +
" \"format\": \"date-time\"\n" +
" },\n" +
" \"ExitCodes\": {\n" +
" \"type\": \"object\",\n" +
" \"additionalProperties\": {\n" +
" \"type\": \"integer\",\n" +
" \"format\": \"int32\"\n" +
" }\n" +
" },\n" +
" \"JobId\": {\n" +
" \"type\": \"string\"\n" +
" },\n" +
Expand Down
7 changes: 7 additions & 0 deletions internal/armada/api/api.swagger.json
Original file line number Diff line number Diff line change
Expand Up @@ -305,6 +305,13 @@
"type": "string",
"format": "date-time"
},
"ExitCodes": {
"type": "object",
"additionalProperties": {
"type": "integer",
"format": "int32"
}
},
"JobId": {
"type": "string"
},
Expand Down
280 changes: 215 additions & 65 deletions internal/armada/api/event.pb.go

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions internal/armada/api/event.proto
Original file line number Diff line number Diff line change
Expand Up @@ -79,6 +79,7 @@ message JobFailedEvent {
google.protobuf.Timestamp Created = 4 [(gogoproto.stdtime) = true, (gogoproto.nullable) = false];
string ClusterId = 5;
string Reason = 6;
map<string, int32> ExitCodes = 7;
}

message JobSucceededEvent {
Expand Down
5 changes: 3 additions & 2 deletions internal/executor/reporter/event.go
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ func CreateEventForCurrentState(pod *v1.Pod, clusterId string) (api.Event, error
ClusterId: clusterId,
}, nil
case v1.PodFailed:
return CreateJobFailedEvent(pod, util.ExtractPodFailedReason(pod), clusterId), nil
return CreateJobFailedEvent(pod, util.ExtractPodFailedReason(pod), util.ExtractPodExitCodes(pod), clusterId), nil
case v1.PodSucceeded:
return &api.JobSucceededEvent{
JobId: pod.Labels[domain.JobId],
Expand Down Expand Up @@ -69,13 +69,14 @@ func CreateJobLeaseReturnedEvent(pod *v1.Pod, reason string, clusterId string) a
}
}

func CreateJobFailedEvent(pod *v1.Pod, reason string, clusterId string) api.Event {
// CreateJobFailedEvent builds the JobFailedEvent describing a failed job for the given pod.
//
// The job id and queue are read from the pod's labels and the job set id from its
// annotations. reason, exitCodes and clusterId are copied onto the event as given, and
// Created is set to the time of the call.
func CreateJobFailedEvent(pod *v1.Pod, reason string, exitCodes map[string]int32, clusterId string) api.Event {
	failed := new(api.JobFailedEvent)

	// Identity of the job, taken from the pod metadata.
	failed.JobId = pod.Labels[domain.JobId]
	failed.JobSetId = pod.Annotations[domain.JobSetId]
	failed.Queue = pod.Labels[domain.Queue]

	// Failure details supplied by the caller.
	failed.Created = time.Now()
	failed.ClusterId = clusterId
	failed.Reason = reason
	failed.ExitCodes = exitCodes

	return failed
}
2 changes: 1 addition & 1 deletion internal/executor/service/cluster_allocation.go
Original file line number Diff line number Diff line change
Expand Up @@ -108,7 +108,7 @@ func (allocationService *ClusterAllocationService) failJobs(failedSubmissions []
toBeReportedDone := make([]*v1.Pod, 0, 10)

for _, details := range failedSubmissions {
failEvent := reporter.CreateJobFailedEvent(details.pod, details.error.Status().Message, allocationService.clusterContext.GetClusterId())
failEvent := reporter.CreateJobFailedEvent(details.pod, details.error.Status().Message, map[string]int32{}, allocationService.clusterContext.GetClusterId())
err := allocationService.eventReporter.Report(failEvent)

if err == nil {
Expand Down
2 changes: 1 addition & 1 deletion internal/executor/service/stuck_pod_detector.go
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@ func (podProgressMonitor *StuckPodDetector) onStuckPodDetected(pod *v1.Pod) (res
if util.IsRetryable(pod) {
event = reporter.CreateJobUnableToScheduleEvent(pod, util.ExtractPodStuckReason(pod), podProgressMonitor.clusterId)
} else {
event = reporter.CreateJobFailedEvent(pod, util.ExtractPodStuckReason(pod), podProgressMonitor.clusterId)
event = reporter.CreateJobFailedEvent(pod, util.ExtractPodStuckReason(pod), map[string]int32{}, podProgressMonitor.clusterId)
}

err := podProgressMonitor.eventReporter.Report(event)
Expand Down
15 changes: 15 additions & 0 deletions internal/executor/util/pod_status.go
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,21 @@ func ExtractPodFailedReason(pod *v1.Pod) string {
return failedMessage
}

// ExtractPodExitCodes returns the exit code of every terminated container in pod,
// keyed by container name.
//
// Both the regular and the init container statuses are inspected. Containers whose
// state is not Terminated are omitted, so the result is an empty (non-nil) map when
// no container has terminated. The pod itself is never modified.
func ExtractPodExitCodes(pod *v1.Pod) map[string]int32 {
	regular := pod.Status.ContainerStatuses
	initial := pod.Status.InitContainerStatuses

	exitCodes := make(map[string]int32, len(regular)+len(initial))

	// Walk the two lists separately rather than appending one onto the other:
	// append may write into spare capacity of the pod's own ContainerStatuses
	// backing array, which mutates shared memory behind the caller's back.
	// Regular containers are visited first, then init containers, matching the
	// order of the previous combined slice.
	for _, statuses := range [][]v1.ContainerStatus{regular, initial} {
		for i := range statuses {
			if terminated := statuses[i].State.Terminated; terminated != nil {
				exitCodes[statuses[i].Name] = terminated.ExitCode
			}
		}
	}

	return exitCodes
}

func IsRetryable(pod *v1.Pod) bool {
containerStatuses := pod.Status.ContainerStatuses
containerStatuses = append(containerStatuses, pod.Status.InitContainerStatuses...)
Expand Down

0 comments on commit 1365f1c

Please sign in to comment.