Skip to content

Commit

Permalink
net: lib: aws_fota: Improve error handling for FOTA jobs
Browse files Browse the repository at this point in the history
Improves error handling by providing more fine grained
errors on job parsing. Jobs are now marked as failed
if document is invalid but the job id could be parsed.
A new error state is introduced to enable
retries for marking jobs as failed.
Additionally, the job update accepted/rejected topics
are now longer subscribed to, as this is not necessary
to receive the respective messages and thus adds
unnecessary overhead.

Signed-off-by: Fabian Nawratil <fabian.nawratil@nordicsemi.no>
  • Loading branch information
fnawratil committed Aug 17, 2023
1 parent a8b6514 commit e94d736
Show file tree
Hide file tree
Showing 5 changed files with 165 additions and 143 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -590,6 +590,7 @@ Libraries for networking

* The :kconfig:option:`CONFIG_AWS_FOTA_HOSTNAME_MAX_LEN` Kconfig option has been replaced by the :kconfig:option:`CONFIG_DOWNLOAD_CLIENT_MAX_HOSTNAME_SIZE` Kconfig option.
* The :kconfig:option:`CONFIG_AWS_FOTA_FILE_PATH_MAX_LEN` Kconfig option has been replaced by the :kconfig:option:`CONFIG_DOWNLOAD_CLIENT_MAX_FILENAME_SIZE` Kconfig option.
* AWS FOTA jobs are now marked as failed if the job document for the update is invalid.

* :ref:`lib_azure_fota` library:

Expand Down
14 changes: 11 additions & 3 deletions subsys/net/lib/aws_fota/include/aws_fota_json.h
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,16 @@ extern "C" {
*/
#define EXECUTION_OBJ_DECODED_BIT 2

/** @brief AWS FOTA JSON parse result. */
enum aws_fota_json_result {
AWS_FOTA_JSON_RES_SKIPPED = 1, /*!< Not a FOTA job document, skipped */
AWS_FOTA_JSON_RES_SUCCESS = 0, /*!< Job document parsed successfully */
AWS_FOTA_JSON_RES_INVALID_PARAMS = -1, /*!< Input parameters invalid */
AWS_FOTA_JSON_RES_INVALID_JOB = -2, /*!< Job document invalid, could not get job id */
AWS_FOTA_JSON_RES_INVALID_DOCUMENT = -3,/*!< FOTA update data invalid */
AWS_FOTA_JSON_RES_URL_TOO_LONG = -4, /*!< Parts of URL too large for buffer */
};

/**
* @brief Parse a given AWS IoT DescribeJobExecution response JSON object.
* More information on this object can be found at https://docs.aws.amazon.com/iot/latest/developerguide/jobs-api.html#mqtt-describejobexecution
Expand All @@ -54,9 +64,7 @@ extern "C" {
* @param[in] file_path_buf_size Size of the output buffer for the "file" field
* @param[out] version_number Version number from the Job Execution data type.
*
* @return 0 if the Job Execution object is empty, 1 if Job Execution object was
* correctly decoded, otherwise a negative error code is returned
* identicating reason of failure.
* @return aws_fota_json_result specifying the result of the parse operation.
**/
int aws_fota_parse_DescribeJobExecution_rsp(const char *job_document,
uint32_t payload_len,
Expand Down
194 changes: 102 additions & 92 deletions subsys/net/lib/aws_fota/src/aws_fota.c
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ static enum internal_state {
STATE_INIT,
STATE_DOWNLOADING,
STATE_DOWNLOAD_COMPLETE,
STATE_ERROR,
} internal_state = STATE_UNINIT;

/* Enum used when parsing AWS jobs topics messages are received on. */
Expand Down Expand Up @@ -78,6 +79,8 @@ static char *state2str(enum internal_state state)
return "STATE_DOWNLOADING";
case STATE_DOWNLOAD_COMPLETE:
return "STATE_DOWNLOAD_COMPLETE";
case STATE_ERROR:
return "STATE_ERROR";
default:
return "Unknown";
}
Expand All @@ -95,13 +98,18 @@ static void internal_state_set(enum internal_state new_state)
internal_state = new_state;
}

static void set_current_job_id(uint8_t *job_id)
{
strncpy(job_id_handling, job_id, sizeof(job_id_handling));
job_id_handling[sizeof(job_id_handling) - 1] = '\0';
}

static void reset_library(void)
{
internal_state_set(STATE_INIT);
execution_status = AWS_JOBS_QUEUED;
download_progress = 0;
strncpy(job_id_handling, AWS_JOB_ID_DEFAULT, sizeof(job_id_handling));
job_id_handling[sizeof(job_id_handling) - 1] = '\0';
set_current_job_id(AWS_JOB_ID_DEFAULT);
LOG_DBG("Library reset");
}

Expand Down Expand Up @@ -142,8 +150,7 @@ static enum jobs_topic topic_type_get(const char *incoming_topic, size_t topic_l
*
* @return 0 If successful otherwise a negative error code is returned.
*/
static int get_published_payload(struct mqtt_client *client, uint8_t *write_buf,
size_t length)
static int get_published_payload(struct mqtt_client *client, uint8_t *write_buf, size_t length)
{
uint8_t *buf = write_buf;
uint8_t *end = buf + length;
Expand Down Expand Up @@ -208,6 +215,28 @@ static int update_job_execution(struct mqtt_client *const client,
return 0;
}

/**
* @brief Update the job document of the current job to AWS_JOBS_FAILED and move to ERROR state.
*
* @param client Connected MQTT client instance
*
* @return 0 If successful otherwise a negative error code is returned.
*/
static int set_current_job_failed(struct mqtt_client *const client)
{
struct aws_fota_event aws_fota_evt = {
.id = AWS_FOTA_EVT_ERROR
};

callback(&aws_fota_evt);
internal_state_set(STATE_ERROR);

return update_job_execution(client,
job_id_handling,
sizeof(job_id_handling),
AWS_JOBS_FAILED, "");
}

/**
* @brief Parsing an AWS IoT Job Execution response received on $next/get MQTT
* topic or notify-next. If it is a valid response the program state is
Expand All @@ -220,8 +249,7 @@ static int update_job_execution(struct mqtt_client *const client,
*
* @return 0 If successful otherwise a negative error code is returned.
*/
static int get_job_execution(struct mqtt_client *const client,
uint32_t payload_len)
static int parse_job_execution(struct mqtt_client *const client, uint32_t payload_len)
{
int err;
int execution_version_number_prev = execution_version_number;
Expand All @@ -248,40 +276,52 @@ static int get_job_execution(struct mqtt_client *const client,
file_path, sizeof(file_path),
&execution_version_number);

if (err < 0) {
LOG_ERR("Error when parsing the json: %d", err);
goto cleanup;
} else if (err == 0) {
if (err == AWS_FOTA_JSON_RES_SKIPPED) {
LOG_DBG("Got only one field");
LOG_DBG("No queued jobs for this device");
return 0;
} else if ((err < 0) &&
(err != AWS_FOTA_JSON_RES_INVALID_DOCUMENT) &&
(err != AWS_FOTA_JSON_RES_URL_TOO_LONG)) {
LOG_ERR("Error when parsing the json: %d", err);
err = -ENODATA;
goto cleanup;
}

/* Check if the incoming job is already being handled. */
if (strncmp(job_id_incoming, job_id_handling, sizeof(job_id_incoming)) == 0) {
LOG_WRN("Job already being handled, ignore message");
err = 0;
goto cleanup;
} else {
strncpy(job_id_handling, job_id_incoming, sizeof(job_id_handling));
job_id_handling[sizeof(job_id_handling) - 1] = '\0';
}

set_current_job_id(job_id_incoming);

/* Check if the update data is valid */
if (err == AWS_FOTA_JSON_RES_INVALID_DOCUMENT) {
LOG_ERR("Invalid FOTA update document: %d", err);
return set_current_job_failed(client);
} else if (err == AWS_FOTA_JSON_RES_URL_TOO_LONG) {
LOG_ERR("URL elements too long for buffer: %d", err);
return set_current_job_failed(client);
}

/* Valid update */
LOG_DBG("Job ID: %s", (char *)job_id_handling);
LOG_DBG("hostname: %s", (char *)hostname);
LOG_DBG("file_path %s", (char *)file_path);
LOG_DBG("execution_version_number: %d ", execution_version_number);

/* Subscribe to update topic to receive feedback on whether an
* update is accepted or not.
*/
err = aws_jobs_subscribe_topic_update(client, job_id_handling, update_topic);
err = update_job_execution(client,
job_id_handling,
sizeof(job_id_handling),
AWS_JOBS_IN_PROGRESS,
"");
if (err) {
LOG_ERR("Error when subscribing job_id_update: %d", err);
goto cleanup;
LOG_ERR("update_job_execution failed, error: %d", err);
return err;
}

LOG_DBG("Subscribed to FOTA update topic %s", (char *)update_topic);
return 0;

cleanup:
Expand All @@ -298,8 +338,7 @@ static int get_job_execution(struct mqtt_client *const client,
*
* @return 0 If successful otherwise a negative error code is returned.
*/
static int job_update_accepted(struct mqtt_client *const client,
uint32_t payload_len)
static int job_update_accepted(struct mqtt_client *const client, uint32_t payload_len)
{
int err;
int sec_tag = -1;
Expand Down Expand Up @@ -334,12 +373,7 @@ static int job_update_accepted(struct mqtt_client *const client,
err = fota_download_start(hostname, file_path, sec_tag, 0, 0);
if (err) {
LOG_ERR("Error (%d) when trying to start firmware download", err);
aws_fota_evt.id = AWS_FOTA_EVT_ERROR;
callback(&aws_fota_evt);
return update_job_execution(client,
job_id_handling,
sizeof(job_id_handling),
AWS_JOBS_FAILED, "");
return set_current_job_failed(client);
}

internal_state_set(STATE_DOWNLOADING);
Expand All @@ -361,6 +395,13 @@ static int job_update_accepted(struct mqtt_client *const client,
reset_library();
}
break;
case AWS_JOBS_FAILED:
LOG_DBG("Job document was updated with status FAILED");
reset_library();

/* Ignore return value since error does not need to be handled. */
(void)aws_jobs_get_job_execution(client_internal, "$next", get_topic);
break;
default:
LOG_ERR("Invalid execution status");
return -EINVAL;
Expand All @@ -378,8 +419,7 @@ static int job_update_accepted(struct mqtt_client *const client,
*
* @return A negative error code is returned.
*/
static int job_update_rejected(struct mqtt_client *const client,
uint32_t payload_len)
static int job_update_rejected(struct mqtt_client *const client, uint32_t payload_len)
{
struct aws_fota_event aws_fota_evt = { .id = AWS_FOTA_EVT_ERROR };
LOG_ERR("Job document update was rejected");
Expand Down Expand Up @@ -449,10 +489,11 @@ static int on_publish_evt(struct mqtt_client *const client,
}

LOG_DBG("Checking for an available job");
return get_job_execution(client, payload_len);
return parse_job_execution(client, payload_len);
case TOPIC_UPDATE_ACCEPTED:
if (internal_state != STATE_INIT &&
internal_state != STATE_DOWNLOAD_COMPLETE) {
internal_state != STATE_DOWNLOAD_COMPLETE &&
internal_state != STATE_ERROR) {
goto read_payload;
}

Expand All @@ -464,7 +505,8 @@ static int on_publish_evt(struct mqtt_client *const client,
return job_update_accepted(client, payload_len);
case TOPIC_UPDATE_REJECTED:
if (internal_state != STATE_INIT &&
internal_state != STATE_DOWNLOAD_COMPLETE) {
internal_state != STATE_DOWNLOAD_COMPLETE &&
internal_state != STATE_ERROR) {
goto read_payload;
}

Expand Down Expand Up @@ -500,6 +542,7 @@ static int on_publish_evt(struct mqtt_client *const client,
static int on_connack_evt(struct mqtt_client *const client)
{
int err;
enum execution_status status;

switch (internal_state) {
case STATE_INIT:
Expand All @@ -514,24 +557,30 @@ static int on_connack_evt(struct mqtt_client *const client)
LOG_ERR("Unable to subscribe to jobs/$next/get");
return err;
}

status = AWS_JOBS_IN_PROGRESS;
break;
case STATE_DOWNLOADING:
/* Fall through */
case STATE_DOWNLOAD_COMPLETE:
if (strncmp(job_id_handling, AWS_JOB_ID_DEFAULT, sizeof(job_id_handling)) == 0) {
return -ECANCELED;
}

err = aws_jobs_subscribe_topic_update(client, job_id_handling, update_topic);
if (err) {
LOG_ERR("Error when subscribing job_id_update: %d", err);
return err;
}

LOG_DBG("Subscribed to FOTA update topic %s", (char *)update_topic);
status = AWS_JOBS_SUCCEEDED;
break;
default:
case STATE_ERROR:
status = AWS_JOBS_FAILED;
break;
default:
return 0;
}

/* If we have just reconnected, we might already be handling a job.
* Check if this is the case and update the job status accordingly.
*/
if (strncmp(job_id_handling, AWS_JOB_ID_DEFAULT, sizeof(job_id_handling)) == 0) {
return 0;
}

err = update_job_execution(client, job_id_handling, sizeof(job_id_handling), status, "");
if (err) {
LOG_ERR("update_job_execution failed, error: %d", err);
return err;
}

return 0;
Expand Down Expand Up @@ -560,34 +609,7 @@ static int on_suback_evt(struct mqtt_client *const client, uint16_t message_id)
break;
case SUBSCRIBE_JOB_ID_UPDATE:
LOG_DBG("Subscribed to job ID update accepted/rejected topics");

enum execution_status status;

switch (internal_state) {
case STATE_INIT:
status = AWS_JOBS_IN_PROGRESS;
break;
case STATE_DOWNLOAD_COMPLETE:
status = AWS_JOBS_SUCCEEDED;
break;
case STATE_DOWNLOADING:
return 0;
default:
LOG_WRN("Invalid state");
return -ECANCELED;
}

err = update_job_execution(client,
job_id_handling,
sizeof(job_id_handling),
status,
"");
if (err) {
LOG_ERR("update_job_execution failed, error: %d", err);
return err;
}

break;
default:
/* Message ID not related to AWS FOTA. */
break;
Expand All @@ -596,8 +618,7 @@ static int on_suback_evt(struct mqtt_client *const client, uint16_t message_id)
return 0;
}

int aws_fota_mqtt_evt_handler(struct mqtt_client *const client,
const struct mqtt_evt *evt)
int aws_fota_mqtt_evt_handler(struct mqtt_client *const client, const struct mqtt_evt *evt)
{
int err;

Expand Down Expand Up @@ -739,21 +760,11 @@ static void http_fota_handler(const struct fota_download_evt *evt)

case FOTA_DOWNLOAD_EVT_ERROR:
LOG_ERR("FOTA_DOWNLOAD_EVT_ERROR");
(void)update_job_execution(client_internal,
job_id_handling,
sizeof(job_id_handling),
AWS_JOBS_FAILED,
"");

aws_fota_evt.id = AWS_FOTA_EVT_ERROR;

callback(&aws_fota_evt);
reset_library();

/* If the FOTA download fails it might be due to the image being deleted.
* Try to get the next job if any exist.
*/
(void)aws_jobs_get_job_execution(client_internal, "$next", get_topic);
err = set_current_job_failed(client_internal);
if (err < 0) {
reset_library();
}
break;

case FOTA_DOWNLOAD_EVT_PROGRESS:
Expand All @@ -772,8 +783,7 @@ static void http_fota_handler(const struct fota_download_evt *evt)
}
}

int aws_fota_init(struct mqtt_client *const client,
aws_fota_callback_t evt_handler)
int aws_fota_init(struct mqtt_client *const client, aws_fota_callback_t evt_handler)
{
int err;

Expand Down
Loading

0 comments on commit e94d736

Please sign in to comment.