From b151dd40c8b52e11eb01be769b535428736e385e Mon Sep 17 00:00:00 2001 From: acud <12988138+acud@users.noreply.github.com> Date: Fri, 2 Aug 2024 21:17:03 +0000 Subject: [PATCH] test: improve bootstrapper and checkpoint test (#6193) ## Motivation Improve the checkpoint test by not quitting the bootstrapper immediately when it encounters a gRPC error, and by increasing the number of epochs allowed for recovery in the test. Co-authored-by: Matthias <5011972+fasmat@users.noreply.github.com> --- cmd/bootstrapper/server.go | 98 +++++++++++++++++++++----------- config/presets/fastnet.go | 4 +- systest/tests/checkpoint_test.go | 8 +-- 3 files changed, 71 insertions(+), 39 deletions(-) diff --git a/cmd/bootstrapper/server.go b/cmd/bootstrapper/server.go index 5f9986ffb2..eb6fa5204f 100644 --- a/cmd/bootstrapper/server.go +++ b/cmd/bootstrapper/server.go @@ -132,21 +132,72 @@ func (s *Server) Start(ctx context.Context, errCh chan error, params *NetworkPar } // start generating fallback data - s.eg.Go( - func() error { - s.genDataLoop(ctx, errCh, last, params.updateActiveSetTime, s.GenFallbackActiveSet) - return nil - }) - s.eg.Go( - func() error { - s.genDataLoop(ctx, errCh, last+1, params.updateBeaconTime, s.GenFallbackBeacon) - return nil - }) + s.eg.Go(func() error { + for epoch := last; ; epoch++ { + wait := time.Until(params.updateActiveSetTime(epoch)) + select { + case <-time.After(wait): + if err := s.genWithRetry(ctx, epoch, 10); err != nil { + errCh <- err + return nil + } + case <-ctx.Done(): + return nil + } + } + }) + s.eg.Go(func() error { + for epoch := last + 1; ; epoch++ { + wait := time.Until(params.updateBeaconTime(epoch)) + select { + case <-time.After(wait): + if err := s.GenFallbackBeacon(epoch); err != nil { + errCh <- err + return err + } + case <-ctx.Done(): + return nil + } + } + }) return nil }) } +func (s *Server) genWithRetry(ctx context.Context, epoch types.EpochID, maxRetries int) error { + err := s.GenFallbackActiveSet(ctx, epoch) + if err == nil { + return nil + } +
s.logger.With().Debug("generate fallback active set retry", log.Err(err)) + + retries := 0 + backoff := 10 * time.Second + timer := time.NewTimer(backoff) + + for { + select { + case <-timer.C: + if err := s.GenFallbackActiveSet(ctx, epoch); err != nil { + s.logger.With().Debug("generate fallback active set retry", log.Err(err)) + retries++ + if retries >= maxRetries { + return err + } + timer.Reset(backoff) + continue + } + return nil + case <-ctx.Done(): + if !timer.Stop() { + <-timer.C + } + return ctx.Err() + } + } +} + // in systests, we want to be sure the nodes use the fallback data unconditionally. // use a fixed known value for beacon to be sure that fallback is used during testing. func epochBeacon(epoch types.EpochID) types.Beacon { @@ -165,7 +216,7 @@ func (s *Server) GenBootstrap(ctx context.Context, epoch types.EpochID) error { return err } -func (s *Server) GenFallbackBeacon(_ context.Context, epoch types.EpochID) error { +func (s *Server) GenFallbackBeacon(epoch types.EpochID) error { suffix := bootstrap.SuffixBeacon _, err := s.gen.GenUpdate(epoch, epochBeacon(epoch), nil, suffix) return err @@ -193,31 +244,10 @@ func getPartialActiveSet(ctx context.Context, smEndpoint string, targetEpoch typ return actives[:cutoff], nil } -func (s *Server) genDataLoop( - ctx context.Context, - errCh chan error, - start types.EpochID, - timeFunc func(types.EpochID) time.Time, - genFunc func(context.Context, types.EpochID) error, -) { - for epoch := start; ; epoch++ { - wait := time.Until(timeFunc(epoch)) - select { - case <-time.After(wait): - if err := genFunc(ctx, epoch); err != nil { - errCh <- err - return - } - case <-ctx.Done(): - return - } - } -} - func (s *Server) Stop(ctx context.Context) { s.logger.With().Info("shutting down server") - _ = s.Shutdown(ctx) - _ = s.eg.Wait() + s.Shutdown(ctx) + s.eg.Wait() } func (s *Server) handle(w http.ResponseWriter, r *http.Request) { diff --git a/config/presets/fastnet.go b/config/presets/fastnet.go index 
e616057ac0..225a2482d5 100644 --- a/config/presets/fastnet.go +++ b/config/presets/fastnet.go @@ -51,7 +51,9 @@ func fastnet() config.Config { conf.LayerDuration = 15 * time.Second conf.Sync.Interval = 5 * time.Second conf.Sync.GossipDuration = 10 * time.Second - conf.Sync.AtxSync.EpochInfoInterval = 20 * time.Second + conf.Sync.AtxSync.EpochInfoInterval = 1 * time.Second + conf.Sync.AtxSync.EpochInfoPeers = 10 + conf.Sync.AtxSync.RequestsLimit = 100 conf.Sync.MalSync.IDRequestInterval = 20 * time.Second conf.LayersPerEpoch = 4 conf.RegossipAtxInterval = 30 * time.Second diff --git a/systest/tests/checkpoint_test.go b/systest/tests/checkpoint_test.go index bd74e489c5..6d4ccde138 100644 --- a/systest/tests/checkpoint_test.go +++ b/systest/tests/checkpoint_test.go @@ -51,8 +51,8 @@ func TestCheckpoint(t *testing.T) { // the start of the next poet round snapshotLayer := uint32(15) restoreLayer := uint32(18) - checkpointEpoch := uint32(4) - lastEpoch := uint32(8) + checkpointEpoch := uint32(8) + lastEpoch := uint32(14) // bootstrap the checkpoint epoch and the next epoch as the beacon protocol was interrupted in the last epoch cl, err := reuseCluster(tctx, restoreLayer) @@ -168,8 +168,8 @@ func TestCheckpoint(t *testing.T) { } } - tctx.Log.Infow("waiting for all miners to be smeshing", "last epoch", checkpointEpoch+2) - ensureSmeshing(t, tctx, cl, checkpointEpoch+2) + tctx.Log.Infow("waiting for all miners to be smeshing", "last epoch", checkpointEpoch) + ensureSmeshing(t, tctx, cl, checkpointEpoch) // increase the cluster size to the original test size tctx.Log.Info("cluster size changed to ", size)