From 6ba258fb3b3c1a20cca68a237467cb392cb8dddc Mon Sep 17 00:00:00 2001 From: janiskemper <63146658+janiskemper@users.noreply.github.com> Date: Wed, 19 Jun 2024 16:44:43 +0200 Subject: [PATCH] :bug: Improve handling of bm server reboot timeouts (#1327) Bare metal servers currently can time out while rebooting. However, the timeouts don't make too much sense right now. - The timeout is too big, so that usually MachineHealthChecks will trigger before the timeout is reached. We reduce this here. - The server was just rebooted again if a timeout is reached. However, if the timeout is actually reached, we don't want to continue, as there is probably something wrong with the server. Now we set a permanent error. Additionally, there are some code improvements: - The check whether a reboot has been triggered requires an API call to Hetzner API. This check is now only done when necessary, so that we safe the unneccessary API calls. - We set a permanent error if a reboot is marked as failed, so that we stop reconciling. - The ProvisionSucceeded condition of a host is saved in case of a permanent error to give the user some feedback that the permanent error happened and why. --- api/v1beta1/conditions_const.go | 2 + pkg/services/baremetal/host/host.go | 62 +++++++++++++++--------- pkg/services/baremetal/host/host_test.go | 34 ++++++++++--- 3 files changed, 67 insertions(+), 31 deletions(-) diff --git a/api/v1beta1/conditions_const.go b/api/v1beta1/conditions_const.go index b19642a93..6bc272f1d 100644 --- a/api/v1beta1/conditions_const.go +++ b/api/v1beta1/conditions_const.go @@ -181,6 +181,8 @@ const ( LinuxOnOtherDiskFoundReason = "LinuxOnOtherDiskFound" // SSHToRescueSystemFailedReason indicates that the rescue system can't be reached via ssh. SSHToRescueSystemFailedReason = "SSHToRescueSystemFailed" + // RebootTimedOutReason indicates that the reboot timed out. + RebootTimedOutReason = "RebootTimedOut" ) const ( diff --git a/pkg/services/baremetal/host/host.go b/pkg/services/baremetal/host/host.go index 1c1f7fab0..b00728160 100644 --- a/pkg/services/baremetal/host/host.go +++ b/pkg/services/baremetal/host/host.go @@ -52,7 +52,7 @@ const ( rebootWaitTime time.Duration = 15 * time.Second sshResetTimeout time.Duration = 5 * time.Minute softwareResetTimeout time.Duration = 5 * time.Minute - hardwareResetTimeout time.Duration = 60 * time.Minute + hardwareResetTimeout time.Duration = 5 * time.Minute rescue string = "rescue" rescuePort int = 22 gbToMebiBytes int = 1000 @@ -398,7 +398,7 @@ func (s *Service) handleIncompleteBoot(isRebootIntoRescue, isTimeout, isConnecti return false, s.handleErrorTypeSoftwareRebootFailed(isTimeout, isRebootIntoRescue) case infrav1.ErrorTypeHardwareRebootTriggered: - return false, s.handleErrorTypeHardwareRebootFailed(isTimeout, isRebootIntoRescue) + return s.handleErrorTypeHardwareRebootFailed(isTimeout, isRebootIntoRescue) } return false, fmt.Errorf("%w: %s", errUnexpectedErrorType, s.scope.HetznerBareMetalHost.Spec.Status.ErrorType) @@ -485,7 +485,8 @@ func (s *Service) handleErrorTypeSoftwareRebootFailed(isSSHTimeoutError, wantsRe return nil } -func (s *Service) handleErrorTypeHardwareRebootFailed(isSSHTimeoutError, wantsRescue bool) error { +// handleErrorTypeHardwareRebootFailed deals with hardware reboot failed cases and returns whether we should fail the process. +func (s *Service) handleErrorTypeHardwareRebootFailed(isSSHTimeoutError, wantsRescue bool) (bool, error) { rebootInto := "node" if wantsRescue { rebootInto = "rescue mode" @@ -494,11 +495,11 @@ func (s *Service) handleErrorTypeHardwareRebootFailed(isSSHTimeoutError, wantsRe // right hostname. This means that the server has not been rebooted and we need to escalate. // If we got a timeout error from ssh, it means that the server has not yet finished rebooting. // If the timeout for hardware reboots has been reached, then escalate. - if !isSSHTimeoutError || hasTimedOut(s.scope.HetznerBareMetalHost.Spec.Status.LastUpdated, hardwareResetTimeout) { + if !isSSHTimeoutError { if wantsRescue { // make sure hat we boot into rescue mode if that is necessary if err := s.ensureRescueMode(); err != nil { - return fmt.Errorf("failed to ensure rescue mode: %w", err) + return false, fmt.Errorf("failed to ensure rescue mode: %w", err) } } @@ -509,13 +510,33 @@ func (s *Service) handleErrorTypeHardwareRebootFailed(isSSHTimeoutError, wantsRe // we immediately set an error message in the host status to track the reboot we just performed if _, err := s.scope.RobotClient.RebootBMServer(s.scope.HetznerBareMetalHost.Spec.ServerID, infrav1.RebootTypeHardware); err != nil { s.handleRobotRateLimitExceeded(err, rebootServerStr) - return fmt.Errorf(errMsgFailedReboot, err) + return false, fmt.Errorf(errMsgFailedReboot, err) } msg := fmt.Sprintf("Reboot via ssh into %s failed. Now using rebootType %q.", rebootInto, infrav1.RebootTypeHardware) createRebootEvent(s.scope.HetznerBareMetalHost, infrav1.RebootTypeHardware, msg) } - return nil + + // if hardware reboots time out, we should fail + if hasTimedOut(s.scope.HetznerBareMetalHost.Spec.Status.LastUpdated, hardwareResetTimeout) { + msg := "reboot timed out - please check if server is working properly" + if wantsRescue { + msg = "The rescue system could not be reached. Please ensure that the machine tries to boot from network before booting from disk. This setting needs to be enabled permanently in the BIOS." + } + conditions.MarkFalse( + s.scope.HetznerBareMetalHost, + infrav1.ProvisionSucceededCondition, + infrav1.RebootTimedOutReason, + clusterv1.ConditionSeverityError, + msg, + ) + + record.Warn(s.scope.HetznerBareMetalHost, "HardwareRebootTimedOut", msg) + + return true, fmt.Errorf("hardware reboot timed out") + } + + return false, nil } func hasTimedOut(lastUpdated *metav1.Time, timeout time.Duration) bool { @@ -571,7 +592,7 @@ func (s *Service) actionRegistering() actionResult { failed, err := s.handleIncompleteBoot(true, isSSHTimeoutError, isSSHConnectionRefusedError) if failed { - return s.recordActionFailure(infrav1.ProvisioningError, err.Error()) + return s.recordActionFailure(infrav1.PermanentError, err.Error()) } if err != nil { return actionError{err: fmt.Errorf(errMsgFailedHandlingIncompleteBoot, err)} @@ -738,25 +759,21 @@ func (s *Service) analyzeSSHOutputRegistering(out sshclient.Output) (isSSHTimeou } func (s *Service) analyzeSSHErrorRegistering(sshErr error) (isSSHTimeoutError, isConnectionRefused bool, reterr error) { - // check if the reboot triggered - rebootTriggered, err := s.rebootTriggered() - if err != nil { - return false, false, fmt.Errorf("failed to check whether reboot triggered: %w", err) - } - switch { case os.IsTimeout(sshErr) || sshclient.IsTimeoutError(sshErr): isSSHTimeoutError = true case sshclient.IsAuthenticationFailedError(sshErr): + // check if the reboot triggered + rebootTriggered, err := s.rebootTriggered() + if err != nil { + return false, false, fmt.Errorf("failed to check whether reboot triggered: %w", err) + } + if !rebootTriggered { return false, false, nil } reterr = fmt.Errorf("wrong ssh key: %w", sshErr) case sshclient.IsConnectionRefusedError(sshErr): - if !rebootTriggered && s.scope.HetznerBareMetalHost.Spec.Status.ErrorType != infrav1.ErrorTypeHardwareRebootTriggered { - // Reboot did not trigger - return false, false, nil - } isConnectionRefused = true default: @@ -1299,7 +1316,7 @@ func (s *Service) actionProvisioning() actionResult { } failed, err := s.handleIncompleteBoot(false, isSSHTimeoutError, isSSHConnectionRefusedError) if failed { - return s.recordActionFailure(infrav1.ProvisioningError, err.Error()) + return s.recordActionFailure(infrav1.PermanentError, err.Error()) } if err != nil { return actionError{err: fmt.Errorf(errMsgFailedHandlingIncompleteBoot, err)} @@ -1488,7 +1505,7 @@ func (s *Service) actionEnsureProvisioned() (ar actionResult) { failed, err := s.handleIncompleteBoot(false, isTimeout, isSSHConnectionRefusedError) if failed { - return s.recordActionFailure(infrav1.ProvisioningError, err.Error()) + return s.recordActionFailure(infrav1.PermanentError, err.Error()) } if err != nil { return actionError{err: fmt.Errorf(errMsgFailedHandlingIncompleteBoot, err)} @@ -1699,7 +1716,7 @@ func (s *Service) actionProvisioned() actionResult { } failed, err := s.handleIncompleteBoot(false, isTimeout, isSSHConnectionRefusedError) if failed { - return s.recordActionFailure(infrav1.ProvisioningError, err.Error()) + return s.recordActionFailure(infrav1.PermanentError, err.Error()) } if err != nil { return actionError{err: fmt.Errorf(errMsgFailedHandlingIncompleteBoot, err)} @@ -1752,10 +1769,9 @@ func (s *Service) actionDeprovisioning() actionResult { // Permanent errors are those ones that do not get solved with de- or re-provisioning. if s.scope.HetznerBareMetalHost.Spec.Status.ErrorType != infrav1.PermanentError { s.scope.HetznerBareMetalHost.ClearError() + conditions.Delete(s.scope.HetznerBareMetalHost, infrav1.ProvisionSucceededCondition) } - conditions.Delete(s.scope.HetznerBareMetalHost, infrav1.ProvisionSucceededCondition) - return actionComplete{} } diff --git a/pkg/services/baremetal/host/host_test.go b/pkg/services/baremetal/host/host_test.go index 301e803a5..b64bc9f3e 100644 --- a/pkg/services/baremetal/host/host_test.go +++ b/pkg/services/baremetal/host/host_test.go @@ -447,12 +447,6 @@ var _ = Describe("handleIncompleteBoot", func() { Expect(robotMock.AssertNotCalled(GinkgoT(), "RebootBMServer", mock.Anything, mock.Anything)).To(BeTrue()) } }, - Entry("timed out hw reset", testCaseHandleIncompleteBootDifferentTimeouts{ - hostErrorType: infrav1.ErrorTypeHardwareRebootTriggered, - lastUpdated: time.Now().Add(-time.Hour), - expectedHostErrorType: infrav1.ErrorTypeHardwareRebootTriggered, - expectedRebootType: infrav1.RebootTypeHardware, - }), Entry("timed out sw reset", testCaseHandleIncompleteBootDifferentTimeouts{ hostErrorType: infrav1.ErrorTypeSoftwareRebootTriggered, lastUpdated: time.Now().Add(-5 * time.Minute), @@ -461,7 +455,7 @@ var _ = Describe("handleIncompleteBoot", func() { }), Entry("not timed out hw reset", testCaseHandleIncompleteBootDifferentTimeouts{ hostErrorType: infrav1.ErrorTypeHardwareRebootTriggered, - lastUpdated: time.Now().Add(-30 * time.Minute), + lastUpdated: time.Now().Add(-2 * time.Minute), expectedHostErrorType: infrav1.ErrorTypeHardwareRebootTriggered, expectedRebootType: infrav1.RebootType(""), }), @@ -496,6 +490,30 @@ var _ = Describe("handleIncompleteBoot", func() { Expect(host.Spec.Status.ErrorType).To(Equal(infrav1.ErrorTypeConnectionError)) Expect(robotMock.AssertNotCalled(GinkgoT(), "RebootBMServer", mock.Anything, mock.Anything)).To(BeTrue()) }) + + It("fails if hardware reboot times out", func() { + robotMock := robotmock.Client{} + robotMock.On("SetBootRescue", mock.Anything, sshFingerprint).Return(nil, nil) + robotMock.On("GetBootRescue", mock.Anything).Return(&models.Rescue{Active: true}, nil) + robotMock.On("RebootBMServer", mock.Anything, mock.Anything).Return(nil, nil) + + host := helpers.BareMetalHost("test-host", "default", + helpers.WithRebootTypes([]infrav1.RebootType{ + infrav1.RebootTypeSoftware, + infrav1.RebootTypeHardware, + infrav1.RebootTypePower, + }), + helpers.WithSSHSpec(), + helpers.WithSSHStatus(), + helpers.WithError(infrav1.ErrorTypeHardwareRebootTriggered, "", 1, metav1.Time{Time: time.Now().Add(-time.Hour)}), + ) + service := newTestService(host, &robotMock, nil, nil, nil) + + _, err := service.handleIncompleteBoot(true, true, false) + Expect(err).ToNot(Succeed()) + Expect(host.Spec.Status.ErrorType).To(Equal(infrav1.ErrorTypeHardwareRebootTriggered)) + Expect(robotMock.AssertNotCalled(GinkgoT(), "RebootBMServer", mock.Anything, mock.Anything)).To(BeTrue()) + }) }) Context("hostname rescue vs machinename", func() { @@ -753,7 +771,7 @@ var _ = Describe("analyzeSSHOutputInstallImage", func() { err: sshclient.ErrConnectionRefused, rescueActive: true, expectedIsTimeout: false, - expectedIsConnectionRefused: false, + expectedIsConnectionRefused: true, expectedErrMessage: "", }), Entry("connectionRefused error, rescue not active", testCaseAnalyzeSSHOutputInstallImageOutErr{