Skip to content

Commit

Permalink
🐛 Improve handling of bm server reboot timeouts (#1327)
Browse files Browse the repository at this point in the history
Bare metal servers currently can time out while rebooting. However, the
timeouts don't make too much sense right now.

- The timeout is too big, so that usually MachineHealthChecks will
  trigger before the timeout is reached. We reduce this here.
- The server was just rebooted again if a timeout is reached. However,
  if the timeout is actually reached, we don't want to continue, as
there is probably something wrong with the server. Now we set a
permanent error.

Additionally, there are some code improvements:
- The check whether a reboot has been triggered requires an API call to
  Hetzner API. This check is now only done when necessary, so that we
safe the unneccessary API calls.
- We set a permanent error if a reboot is marked as failed, so that we
  stop reconciling.
- The ProvisionSucceeded condition of a host is saved in case of a
  permanent error to give the user some feedback that the permanent
error happened and why.
  • Loading branch information
janiskemper authored Jun 19, 2024
1 parent 1daa785 commit 6ba258f
Show file tree
Hide file tree
Showing 3 changed files with 67 additions and 31 deletions.
2 changes: 2 additions & 0 deletions api/v1beta1/conditions_const.go
Original file line number Diff line number Diff line change
Expand Up @@ -181,6 +181,8 @@ const (
LinuxOnOtherDiskFoundReason = "LinuxOnOtherDiskFound"
// SSHToRescueSystemFailedReason indicates that the rescue system can't be reached via ssh.
SSHToRescueSystemFailedReason = "SSHToRescueSystemFailed"
// RebootTimedOutReason indicates that the reboot timed out.
RebootTimedOutReason = "RebootTimedOut"
)

const (
Expand Down
62 changes: 39 additions & 23 deletions pkg/services/baremetal/host/host.go
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,7 @@ const (
rebootWaitTime time.Duration = 15 * time.Second
sshResetTimeout time.Duration = 5 * time.Minute
softwareResetTimeout time.Duration = 5 * time.Minute
hardwareResetTimeout time.Duration = 60 * time.Minute
hardwareResetTimeout time.Duration = 5 * time.Minute
rescue string = "rescue"
rescuePort int = 22
gbToMebiBytes int = 1000
Expand Down Expand Up @@ -398,7 +398,7 @@ func (s *Service) handleIncompleteBoot(isRebootIntoRescue, isTimeout, isConnecti
return false, s.handleErrorTypeSoftwareRebootFailed(isTimeout, isRebootIntoRescue)

case infrav1.ErrorTypeHardwareRebootTriggered:
return false, s.handleErrorTypeHardwareRebootFailed(isTimeout, isRebootIntoRescue)
return s.handleErrorTypeHardwareRebootFailed(isTimeout, isRebootIntoRescue)
}

return false, fmt.Errorf("%w: %s", errUnexpectedErrorType, s.scope.HetznerBareMetalHost.Spec.Status.ErrorType)
Expand Down Expand Up @@ -485,7 +485,8 @@ func (s *Service) handleErrorTypeSoftwareRebootFailed(isSSHTimeoutError, wantsRe
return nil
}

func (s *Service) handleErrorTypeHardwareRebootFailed(isSSHTimeoutError, wantsRescue bool) error {
// handleErrorTypeHardwareRebootFailed deals with hardware reboot failed cases and returns whether we should fail the process.
func (s *Service) handleErrorTypeHardwareRebootFailed(isSSHTimeoutError, wantsRescue bool) (bool, error) {
rebootInto := "node"
if wantsRescue {
rebootInto = "rescue mode"
Expand All @@ -494,11 +495,11 @@ func (s *Service) handleErrorTypeHardwareRebootFailed(isSSHTimeoutError, wantsRe
// right hostname. This means that the server has not been rebooted and we need to escalate.
// If we got a timeout error from ssh, it means that the server has not yet finished rebooting.
// If the timeout for hardware reboots has been reached, then escalate.
if !isSSHTimeoutError || hasTimedOut(s.scope.HetznerBareMetalHost.Spec.Status.LastUpdated, hardwareResetTimeout) {
if !isSSHTimeoutError {
if wantsRescue {
// make sure hat we boot into rescue mode if that is necessary
if err := s.ensureRescueMode(); err != nil {
return fmt.Errorf("failed to ensure rescue mode: %w", err)
return false, fmt.Errorf("failed to ensure rescue mode: %w", err)
}
}

Expand All @@ -509,13 +510,33 @@ func (s *Service) handleErrorTypeHardwareRebootFailed(isSSHTimeoutError, wantsRe
// we immediately set an error message in the host status to track the reboot we just performed
if _, err := s.scope.RobotClient.RebootBMServer(s.scope.HetznerBareMetalHost.Spec.ServerID, infrav1.RebootTypeHardware); err != nil {
s.handleRobotRateLimitExceeded(err, rebootServerStr)
return fmt.Errorf(errMsgFailedReboot, err)
return false, fmt.Errorf(errMsgFailedReboot, err)
}
msg := fmt.Sprintf("Reboot via ssh into %s failed. Now using rebootType %q.",
rebootInto, infrav1.RebootTypeHardware)
createRebootEvent(s.scope.HetznerBareMetalHost, infrav1.RebootTypeHardware, msg)
}
return nil

// if hardware reboots time out, we should fail
if hasTimedOut(s.scope.HetznerBareMetalHost.Spec.Status.LastUpdated, hardwareResetTimeout) {
msg := "reboot timed out - please check if server is working properly"
if wantsRescue {
msg = "The rescue system could not be reached. Please ensure that the machine tries to boot from network before booting from disk. This setting needs to be enabled permanently in the BIOS."
}
conditions.MarkFalse(
s.scope.HetznerBareMetalHost,
infrav1.ProvisionSucceededCondition,
infrav1.RebootTimedOutReason,
clusterv1.ConditionSeverityError,
msg,
)

record.Warn(s.scope.HetznerBareMetalHost, "HardwareRebootTimedOut", msg)

return true, fmt.Errorf("hardware reboot timed out")
}

return false, nil
}

func hasTimedOut(lastUpdated *metav1.Time, timeout time.Duration) bool {
Expand Down Expand Up @@ -571,7 +592,7 @@ func (s *Service) actionRegistering() actionResult {

failed, err := s.handleIncompleteBoot(true, isSSHTimeoutError, isSSHConnectionRefusedError)
if failed {
return s.recordActionFailure(infrav1.ProvisioningError, err.Error())
return s.recordActionFailure(infrav1.PermanentError, err.Error())
}
if err != nil {
return actionError{err: fmt.Errorf(errMsgFailedHandlingIncompleteBoot, err)}
Expand Down Expand Up @@ -738,25 +759,21 @@ func (s *Service) analyzeSSHOutputRegistering(out sshclient.Output) (isSSHTimeou
}

func (s *Service) analyzeSSHErrorRegistering(sshErr error) (isSSHTimeoutError, isConnectionRefused bool, reterr error) {
// check if the reboot triggered
rebootTriggered, err := s.rebootTriggered()
if err != nil {
return false, false, fmt.Errorf("failed to check whether reboot triggered: %w", err)
}

switch {
case os.IsTimeout(sshErr) || sshclient.IsTimeoutError(sshErr):
isSSHTimeoutError = true
case sshclient.IsAuthenticationFailedError(sshErr):
// check if the reboot triggered
rebootTriggered, err := s.rebootTriggered()
if err != nil {
return false, false, fmt.Errorf("failed to check whether reboot triggered: %w", err)
}

if !rebootTriggered {
return false, false, nil
}
reterr = fmt.Errorf("wrong ssh key: %w", sshErr)
case sshclient.IsConnectionRefusedError(sshErr):
if !rebootTriggered && s.scope.HetznerBareMetalHost.Spec.Status.ErrorType != infrav1.ErrorTypeHardwareRebootTriggered {
// Reboot did not trigger
return false, false, nil
}
isConnectionRefused = true

default:
Expand Down Expand Up @@ -1299,7 +1316,7 @@ func (s *Service) actionProvisioning() actionResult {
}
failed, err := s.handleIncompleteBoot(false, isSSHTimeoutError, isSSHConnectionRefusedError)
if failed {
return s.recordActionFailure(infrav1.ProvisioningError, err.Error())
return s.recordActionFailure(infrav1.PermanentError, err.Error())
}
if err != nil {
return actionError{err: fmt.Errorf(errMsgFailedHandlingIncompleteBoot, err)}
Expand Down Expand Up @@ -1488,7 +1505,7 @@ func (s *Service) actionEnsureProvisioned() (ar actionResult) {

failed, err := s.handleIncompleteBoot(false, isTimeout, isSSHConnectionRefusedError)
if failed {
return s.recordActionFailure(infrav1.ProvisioningError, err.Error())
return s.recordActionFailure(infrav1.PermanentError, err.Error())
}
if err != nil {
return actionError{err: fmt.Errorf(errMsgFailedHandlingIncompleteBoot, err)}
Expand Down Expand Up @@ -1699,7 +1716,7 @@ func (s *Service) actionProvisioned() actionResult {
}
failed, err := s.handleIncompleteBoot(false, isTimeout, isSSHConnectionRefusedError)
if failed {
return s.recordActionFailure(infrav1.ProvisioningError, err.Error())
return s.recordActionFailure(infrav1.PermanentError, err.Error())
}
if err != nil {
return actionError{err: fmt.Errorf(errMsgFailedHandlingIncompleteBoot, err)}
Expand Down Expand Up @@ -1752,10 +1769,9 @@ func (s *Service) actionDeprovisioning() actionResult {
// Permanent errors are those ones that do not get solved with de- or re-provisioning.
if s.scope.HetznerBareMetalHost.Spec.Status.ErrorType != infrav1.PermanentError {
s.scope.HetznerBareMetalHost.ClearError()
conditions.Delete(s.scope.HetznerBareMetalHost, infrav1.ProvisionSucceededCondition)
}

conditions.Delete(s.scope.HetznerBareMetalHost, infrav1.ProvisionSucceededCondition)

return actionComplete{}
}

Expand Down
34 changes: 26 additions & 8 deletions pkg/services/baremetal/host/host_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -447,12 +447,6 @@ var _ = Describe("handleIncompleteBoot", func() {
Expect(robotMock.AssertNotCalled(GinkgoT(), "RebootBMServer", mock.Anything, mock.Anything)).To(BeTrue())
}
},
Entry("timed out hw reset", testCaseHandleIncompleteBootDifferentTimeouts{
hostErrorType: infrav1.ErrorTypeHardwareRebootTriggered,
lastUpdated: time.Now().Add(-time.Hour),
expectedHostErrorType: infrav1.ErrorTypeHardwareRebootTriggered,
expectedRebootType: infrav1.RebootTypeHardware,
}),
Entry("timed out sw reset", testCaseHandleIncompleteBootDifferentTimeouts{
hostErrorType: infrav1.ErrorTypeSoftwareRebootTriggered,
lastUpdated: time.Now().Add(-5 * time.Minute),
Expand All @@ -461,7 +455,7 @@ var _ = Describe("handleIncompleteBoot", func() {
}),
Entry("not timed out hw reset", testCaseHandleIncompleteBootDifferentTimeouts{
hostErrorType: infrav1.ErrorTypeHardwareRebootTriggered,
lastUpdated: time.Now().Add(-30 * time.Minute),
lastUpdated: time.Now().Add(-2 * time.Minute),
expectedHostErrorType: infrav1.ErrorTypeHardwareRebootTriggered,
expectedRebootType: infrav1.RebootType(""),
}),
Expand Down Expand Up @@ -496,6 +490,30 @@ var _ = Describe("handleIncompleteBoot", func() {
Expect(host.Spec.Status.ErrorType).To(Equal(infrav1.ErrorTypeConnectionError))
Expect(robotMock.AssertNotCalled(GinkgoT(), "RebootBMServer", mock.Anything, mock.Anything)).To(BeTrue())
})

It("fails if hardware reboot times out", func() {
robotMock := robotmock.Client{}
robotMock.On("SetBootRescue", mock.Anything, sshFingerprint).Return(nil, nil)
robotMock.On("GetBootRescue", mock.Anything).Return(&models.Rescue{Active: true}, nil)
robotMock.On("RebootBMServer", mock.Anything, mock.Anything).Return(nil, nil)

host := helpers.BareMetalHost("test-host", "default",
helpers.WithRebootTypes([]infrav1.RebootType{
infrav1.RebootTypeSoftware,
infrav1.RebootTypeHardware,
infrav1.RebootTypePower,
}),
helpers.WithSSHSpec(),
helpers.WithSSHStatus(),
helpers.WithError(infrav1.ErrorTypeHardwareRebootTriggered, "", 1, metav1.Time{Time: time.Now().Add(-time.Hour)}),
)
service := newTestService(host, &robotMock, nil, nil, nil)

_, err := service.handleIncompleteBoot(true, true, false)
Expect(err).ToNot(Succeed())
Expect(host.Spec.Status.ErrorType).To(Equal(infrav1.ErrorTypeHardwareRebootTriggered))
Expect(robotMock.AssertNotCalled(GinkgoT(), "RebootBMServer", mock.Anything, mock.Anything)).To(BeTrue())
})
})

Context("hostname rescue vs machinename", func() {
Expand Down Expand Up @@ -753,7 +771,7 @@ var _ = Describe("analyzeSSHOutputInstallImage", func() {
err: sshclient.ErrConnectionRefused,
rescueActive: true,
expectedIsTimeout: false,
expectedIsConnectionRefused: false,
expectedIsConnectionRefused: true,
expectedErrMessage: "",
}),
Entry("connectionRefused error, rescue not active", testCaseAnalyzeSSHOutputInstallImageOutErr{
Expand Down

0 comments on commit 6ba258f

Please sign in to comment.