From 5cdfa3e26c42badb6e31cd75b33fb77555808afd Mon Sep 17 00:00:00 2001 From: Jacob Hoffman-Andrews Date: Mon, 2 Dec 2024 11:00:17 -0800 Subject: [PATCH] ra: wait for validations on clean shutdown (#7854) This reduces the number of validations that get left indefinitely in "pending" state. Rename `DrainFinalize()` to `Drain()` to indicate that it now covers more cases than just finalize. --- cmd/boulder-ra/main.go | 2 +- ra/ra.go | 20 +++++++++++++++----- 2 files changed, 16 insertions(+), 6 deletions(-) diff --git a/cmd/boulder-ra/main.go b/cmd/boulder-ra/main.go index c5b994e737d..45d142e0bf4 100644 --- a/cmd/boulder-ra/main.go +++ b/cmd/boulder-ra/main.go @@ -295,7 +295,7 @@ func main() { apc, issuerCerts, ) - defer rai.DrainFinalize() + defer rai.Drain() policyErr := rai.LoadRateLimitPoliciesFile(c.RA.RateLimitPoliciesFilename) cmd.FailOnError(policyErr, "Couldn't load rate limit policies file") diff --git a/ra/ra.go b/ra/ra.go index 62da21cb1e4..4c38937df4e 100644 --- a/ra/ra.go +++ b/ra/ra.go @@ -103,7 +103,7 @@ type RegistrationAuthorityImpl struct { maxNames int orderLifetime time.Duration finalizeTimeout time.Duration - finalizeWG sync.WaitGroup + drainWG sync.WaitGroup issuersByNameID map[issuance.NameID]*issuance.Certificate purger akamaipb.AkamaiPurgerClient @@ -1010,7 +1010,7 @@ func (ra *RegistrationAuthorityImpl) FinalizeOrder(ctx context.Context, req *rap // // We track this goroutine's lifetime in a waitgroup global to this RA, so // that it can wait for all goroutines to drain during shutdown. - ra.finalizeWG.Add(1) + ra.drainWG.Add(1) go func() { _, err := ra.issueCertificateOuter(ctx, proto.Clone(order).(*corepb.Order), csr, logEvent) if err != nil { @@ -1018,7 +1018,7 @@ func (ra *RegistrationAuthorityImpl) FinalizeOrder(ctx context.Context, req *rap // no parent goroutine waiting for it to receive the error. ra.log.AuditErrf("Asynchronous finalization failed: %s", err.Error()) } - ra.finalizeWG.Done() + ra.drainWG.Done() }() return order, nil } else { @@ -1904,8 +1904,11 @@ func (ra *RegistrationAuthorityImpl) PerformValidation( } // Dispatch to the VA for service + ra.drainWG.Add(1) vaCtx := context.Background() go func(authz core.Authorization) { + defer ra.drainWG.Done() + // We will mutate challenges later in this goroutine to change status and // add error, but we also return a copy of authz immediately. To avoid a // data race, make a copy of the challenges slice here for mutation. @@ -2803,6 +2806,13 @@ func (ra *RegistrationAuthorityImpl) GetAuthorization(ctx context.Context, req * return authz, nil } -func (ra *RegistrationAuthorityImpl) DrainFinalize() { - ra.finalizeWG.Wait() +// Drain blocks until all detached goroutines are done. +// +// The RA runs detached goroutines for challenge validation and finalization, +// so that ACME responses can be returned to the user promptly while work continues. +// +// The main goroutine should call this before exiting to avoid canceling the work +// being done in detached goroutines. +func (ra *RegistrationAuthorityImpl) Drain() { + ra.drainWG.Wait() }