From ad00ae7e6e566315ed5f52a34a1665770738d3f7 Mon Sep 17 00:00:00 2001
From: Tiger Kaovilai
Date: Tue, 30 Jul 2024 19:27:45 -0400
Subject: [PATCH 1/5] Add retry patching configuration design.

Signed-off-by: Tiger Kaovilai
---
 design/retry-patching-configuration_design.md | 104 ++++++++++++++++++
 1 file changed, 104 insertions(+)
 create mode 100644 design/retry-patching-configuration_design.md

diff --git a/design/retry-patching-configuration_design.md b/design/retry-patching-configuration_design.md
new file mode 100644
index 0000000000..c9cb0134c8
--- /dev/null
+++ b/design/retry-patching-configuration_design.md
@@ -0,0 +1,104 @@
+# Backup Restore Status Patch Retrying Configuration
+
+## Abstract
+When a restore completes, we want to ensure that the restore custom resource progress to the correct status.
+If a patch call fails to update status to completion, it should be retried up to a certain time limit.
+
+This design proposes a way to configure this retry time limit.
+
+## Background
+Original Issue: https://github.com/vmware-tanzu/velero/issues/7207
+
+Velero was performing a restore when the API server was rolling out to a new version.
+It had trouble connecting to the API server, but the restore was eventually successful.
+However, since the API server was still in the middle of rolling out, Velero failed to update the restore CR status and gave up.
+
+After the connection was restored, Velero didn't attempt the update again, causing the restore CR to be stuck at "In progress" indefinitely.
+This can lead to incorrect decisions for other components that rely on the backup/restore CR status to determine completion.
+
+## Goals
+- Make the retry patching timeout configurable by reusing the existing [`--resource-timeout` server flag](https://github.com/vmware-tanzu/velero/blob/d9ca14747925630664c9e4f85a682b5fc356806d/pkg/cmd/server/server.go#L245)
+
+## Non Goals
+- Create a new timeout flag
+- Refactor backup/restore workflow
+
+
+## High-Level Design
+We will add retries with timeout to existing patch calls that move a backup/restore from InProgress to a final status such as
+- Completed
+- PartiallyFailed
+- Failed
+- FailedValidation
+
+
+## Detailed Design
+Relevant reconcilers will have `resourceTimeout time.Duration` added to their structs and to the parameters of the New[Backup|Restore]XReconciler functions.
+
+In pkg/cmd/server/server.go, `func (s *server) runControllers(..) error` will also be updated to pass the added duration parameter to the New[Backup|Restore]XReconciler calls, using the value of the existing `--resource-timeout` server flag.
+
+Current calls to kube.PatchResource that patch status will be replaced with kube.PatchResourceWithRetriesOnErrors, added to package `kube` below.
+
+Calls where there is a ...client.Patch() will be wrapped with client.RetriesPhasePatchFuncOnErrors(), added to package `client` below.
+
+pkg/util/kube/client.go
+```go
+// PatchResourceWithRetries patches the original resource with the updated resource, retrying when the provided retriable function returns true.
+func PatchResourceWithRetries(maxDuration time.Duration, original, updated client.Object, kbClient client.Client, retriable func(error) bool) error {
+    return veleroPkgClient.RetriesPhasePatchFunc(maxDuration, func() error { return PatchResource(original, updated, kbClient) }, retriable)
+}
+
+// PatchResourceWithRetriesOnErrors patches the original resource with the updated resource, retrying when the operation returns an error.
+func PatchResourceWithRetriesOnErrors(maxDuration time.Duration, original, updated client.Object, kbClient client.Client) error {
+    return PatchResourceWithRetries(maxDuration, original, updated, kbClient, func(err error) bool {
+        // retry using DefaultBackoff to resolve connection refused error that may occur when the server is under heavy load
+        // TODO: consider using a more specific error type to retry, for now, we retry on all errors
+        // specific errors:
+        // - connection refused: https://pkg.go.dev/syscall#:~:text=Errno(0x67)-,ECONNREFUSED,-%3D%20Errno(0x6f
+        return err != nil
+    })
+}
+```
+
+pkg/client/retry.go
+```go
+// CapBackoff provides a backoff with a set backoff cap
+func CapBackoff(cap time.Duration) wait.Backoff {
+    if cap < 0 {
+        cap = 0
+    }
+    return wait.Backoff{
+        Steps:    math.MaxInt,
+        Duration: 10 * time.Millisecond,
+        Cap:      cap,
+        Factor:   retry.DefaultBackoff.Factor,
+        Jitter:   retry.DefaultBackoff.Jitter,
+    }
+}
+
+// RetriesPhasePatchFunc accepts a patch function param, retrying when the provided retriable function returns true.
+func RetriesPhasePatchFunc(maxDuration time.Duration, fn func() error, retriable func(error) bool) error {
+    return retry.OnError(CapBackoff(maxDuration), func(err error) bool { return retriable(err) }, fn)
+}
+
+// RetriesPhasePatchFuncOnErrors accepts a patch function param, retrying when the error is not nil.
+func RetriesPhasePatchFuncOnErrors(maxDuration time.Duration, fn func() error) error {
+    return RetriesPhasePatchFunc(maxDuration, fn, func(err error) bool { return err != nil })
+}
+```
+
+## Alternatives Considered
+ - Requeuing InProgress backups that are not known by the current velero instance to still be in progress, and marking them as failed (attempted in [#7863](https://github.com/vmware-tanzu/velero/pull/7863))
+   - This was deemed to make the backup/restore flow harder to enhance with future reconciler updates, such as adding cancellation or parallel backups.
+
+## Security Considerations
+None
+
+## Compatibility
+Retries should only trigger for a restore or backup that is already in progress and is not being patched successfully by the current instance. Prior InProgress backups/restores will not be re-processed and will remain stuck InProgress until the next velero server (re)start.
+
+## Implementation
+There is a past implementation in [#7845](https://github.com/vmware-tanzu/velero/pull/7845/) on which the implementation of this design will be based.
+
+## Open Issues
+A discussion of issues relating to this proposal for which the author does not know the solution. This section may be omitted if there are none.
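
To illustrate how `kube.PatchResourceWithRetriesOnErrors` above is expected to be used at a call site, here is a minimal sketch of patching a backup to a final phase. The package, the `patchBackupFinalStatus` helper, and its parameters are illustrative assumptions rather than code from this series; only the `PatchResourceWithRetriesOnErrors` signature and the `--resource-timeout`-derived duration come from the design.

```go
package controller

import (
    "time"

    kbclient "sigs.k8s.io/controller-runtime/pkg/client"

    velerov1api "github.com/vmware-tanzu/velero/pkg/apis/velero/v1"
    "github.com/vmware-tanzu/velero/pkg/util/kube"
)

// patchBackupFinalStatus is a hypothetical helper: it moves a backup to the
// given phase and retries the status patch with capped backoff for up to
// resourceTimeout, instead of giving up after a single failed attempt.
func patchBackupFinalStatus(kbClient kbclient.Client, backup *velerov1api.Backup, phase velerov1api.BackupPhase, resourceTimeout time.Duration) error {
    original := backup.DeepCopy()
    backup.Status.Phase = phase

    // PatchResourceWithRetriesOnErrors retries on any error until the backoff
    // cap (resourceTimeout, taken from --resource-timeout) is reached.
    return kube.PatchResourceWithRetriesOnErrors(resourceTimeout, original, backup, kbClient)
}
```

The same pattern applies to restores; at the call sites, the only design-visible change is swapping a one-shot `kube.PatchResource` call for the retrying variant.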
From 8f1424f04ea1e277dc0363da6f26b7a1fbd3eba7 Mon Sep 17 00:00:00 2001
From: Tiger Kaovilai
Date: Thu, 1 Aug 2024 22:17:15 -0400
Subject: [PATCH 2/5] sseago feedback: finalizing

Signed-off-by: Tiger Kaovilai
---
 design/retry-patching-configuration_design.md | 13 +++++++++----
 1 file changed, 9 insertions(+), 4 deletions(-)

diff --git a/design/retry-patching-configuration_design.md b/design/retry-patching-configuration_design.md
index c9cb0134c8..8262916d19 100644
--- a/design/retry-patching-configuration_design.md
+++ b/design/retry-patching-configuration_design.md
@@ -25,12 +25,17 @@ This can lead to incorrect decisions for other components that rely on the backu
 
 
 ## High-Level Design
-We will add retries with timeout to existing patch calls that move a backup/restore from InProgress to a final status such as
+We will add retries with timeout to existing patch calls that move a backup/restore from InProgress to a different status phase such as
+- FailedValidation (final)
+- Failed (final)
+- WaitingForPluginOperations
+- WaitingForPluginOperationsPartiallyFailed
+- Finalizing
+- FinalizingPartiallyFailed
+
+and from the above non-final phases to
 - Completed
 - PartiallyFailed
-- Failed
-- FailedValidation
-
 
 ## Detailed Design
 Relevant reconcilers will have `resourceTimeout time.Duration` added to their structs and to the parameters of the New[Backup|Restore]XReconciler functions.

From d112cc26dafb961406fb50023ad8a6ca38bb256b Mon Sep 17 00:00:00 2001
From: Tiger Kaovilai
Date: Thu, 1 Aug 2024 22:18:39 -0400
Subject: [PATCH 3/5] abstract backup/restore

Signed-off-by: Tiger Kaovilai
---
 design/retry-patching-configuration_design.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/design/retry-patching-configuration_design.md b/design/retry-patching-configuration_design.md
index 8262916d19..3adacf93a2 100644
--- a/design/retry-patching-configuration_design.md
+++ b/design/retry-patching-configuration_design.md
@@ -1,7 +1,7 @@
 # Backup Restore Status Patch Retrying Configuration
 
 ## Abstract
-When a restore completes, we want to ensure that the restore custom resource progress to the correct status.
+When a backup/restore completes, we want to ensure that the restore custom resource progress to the correct status.
 If a patch call fails to update status to completion, it should be retried up to a certain time limit.
 
 This design proposes a way to configure this retry time limit.

From cacb5f0eae0f2bd64d4f5b4de801d34aa7f10e4f Mon Sep 17 00:00:00 2001
From: Tiger Kaovilai
Date: Fri, 9 Aug 2024 17:02:37 -0400
Subject: [PATCH 4/5] Apply suggestions from code review

Signed-off-by: Tiger Kaovilai
Signed-off-by: Tiger Kaovilai
---
 design/retry-patching-configuration_design.md | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/design/retry-patching-configuration_design.md b/design/retry-patching-configuration_design.md
index 3adacf93a2..b8ddbb44f1 100644
--- a/design/retry-patching-configuration_design.md
+++ b/design/retry-patching-configuration_design.md
@@ -1,7 +1,7 @@
 # Backup Restore Status Patch Retrying Configuration
 
 ## Abstract
-When a backup/restore completes, we want to ensure that the restore custom resource progress to the correct status.
+When a backup/restore completes, we want to ensure that the custom resource progresses to the correct status.
 If a patch call fails to update status to completion, it should be retried up to a certain time limit.
 
 This design proposes a way to configure this retry time limit.
@@ -105,5 +105,3 @@ Retries should only trigger for a restore or backup that is already in progress
 ## Implementation
 There is a past implementation in [#7845](https://github.com/vmware-tanzu/velero/pull/7845/) on which the implementation of this design will be based.
 
-## Open Issues
-A discussion of issues relating to this proposal for which the author does not know the solution. This section may be omitted if there are none.

From eebc4af484cb4889241bea39ee385c2914a13d82 Mon Sep 17 00:00:00 2001
From: Tiger Kaovilai
Date: Tue, 27 Aug 2024 20:19:36 -0400
Subject: [PATCH 5/5] Make retry func name more generic

Signed-off-by: Tiger Kaovilai
---
 design/retry-patching-configuration_design.md | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/design/retry-patching-configuration_design.md b/design/retry-patching-configuration_design.md
index b8ddbb44f1..efce3c634b 100644
--- a/design/retry-patching-configuration_design.md
+++ b/design/retry-patching-configuration_design.md
@@ -50,7 +50,7 @@ pkg/util/kube/client.go
 ```go
 // PatchResourceWithRetries patches the original resource with the updated resource, retrying when the provided retriable function returns true.
 func PatchResourceWithRetries(maxDuration time.Duration, original, updated client.Object, kbClient client.Client, retriable func(error) bool) error {
-    return veleroPkgClient.RetriesPhasePatchFunc(maxDuration, func() error { return PatchResource(original, updated, kbClient) }, retriable)
+    return veleroPkgClient.RetryOnRetriableMaxBackOff(maxDuration, func() error { return PatchResource(original, updated, kbClient) }, retriable)
 }
 
 // PatchResourceWithRetriesOnErrors patches the original resource with the updated resource, retrying when the operation returns an error.
@@ -81,14 +81,14 @@ func CapBackoff(cap time.Duration) wait.Backoff {
     }
 }
 
-// RetriesPhasePatchFunc accepts a patch function param, retrying when the provided retriable function returns true.
-func RetriesPhasePatchFunc(maxDuration time.Duration, fn func() error, retriable func(error) bool) error {
+// RetryOnRetriableMaxBackOff accepts a patch function param, retrying when the provided retriable function returns true.
+func RetryOnRetriableMaxBackOff(maxDuration time.Duration, fn func() error, retriable func(error) bool) error {
     return retry.OnError(CapBackoff(maxDuration), func(err error) bool { return retriable(err) }, fn)
 }
 
-// RetriesPhasePatchFuncOnErrors accepts a patch function param, retrying when the error is not nil.
-func RetriesPhasePatchFuncOnErrors(maxDuration time.Duration, fn func() error) error {
-    return RetriesPhasePatchFunc(maxDuration, fn, func(err error) bool { return err != nil })
+// RetryOnErrorMaxBackOff accepts a patch function param, retrying when the error is not nil.
+func RetryOnErrorMaxBackOff(maxDuration time.Duration, fn func() error) error {
+    return RetryOnRetriableMaxBackOff(maxDuration, fn, func(err error) bool { return err != nil })
 }
 ```
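
Following the rename in PATCH 5/5, a rough end-to-end sketch of the intended wiring is shown below: the server's existing `--resource-timeout` value is passed into a reconciler constructor, and a direct `...client.Patch()` status update is wrapped with `RetryOnErrorMaxBackOff`. The `exampleRestoreReconciler` type, its fields, and the constructor signature are assumptions for illustration only and are not the actual Velero reconciler code.

```go
package controller

import (
    "context"
    "time"

    "github.com/sirupsen/logrus"
    kbclient "sigs.k8s.io/controller-runtime/pkg/client"

    velerov1api "github.com/vmware-tanzu/velero/pkg/apis/velero/v1"
    veleroclient "github.com/vmware-tanzu/velero/pkg/client"
)

// exampleRestoreReconciler stands in for the reconcilers that gain a
// resourceTimeout field under this design.
type exampleRestoreReconciler struct {
    kbclient.Client
    logger          logrus.FieldLogger
    resourceTimeout time.Duration // value of the existing --resource-timeout server flag
}

// NewExampleRestoreReconciler mirrors the New[Backup|Restore]XReconciler
// constructors that receive the extra duration parameter from server.go.
func NewExampleRestoreReconciler(logger logrus.FieldLogger, c kbclient.Client, resourceTimeout time.Duration) *exampleRestoreReconciler {
    return &exampleRestoreReconciler{Client: c, logger: logger, resourceTimeout: resourceTimeout}
}

// patchPhase wraps a direct ...client.Patch() status update with the retry
// helper added to pkg/client, as the Detailed Design describes.
func (r *exampleRestoreReconciler) patchPhase(ctx context.Context, original, updated *velerov1api.Restore) error {
    return veleroclient.RetryOnErrorMaxBackOff(r.resourceTimeout, func() error {
        return r.Status().Patch(ctx, updated, kbclient.MergeFrom(original))
    })
}
```

Because `CapBackoff` copies `Factor` and `Jitter` from `retry.DefaultBackoff` and caps the delay at the configured duration, the wrapped patch keeps retrying transient errors (such as connection refused during an API server rollout) with exponential backoff until the patch succeeds or the backoff reaches the cap.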