From f3fbb66923b8dcb3ab9ecc1b589b37e841cafccd Mon Sep 17 00:00:00 2001 From: Dmitry Date: Sat, 2 Sep 2023 08:13:53 +0200 Subject: [PATCH 1/7] deflake hare failures --- systest/parameters/fastnet/smesher.json | 3 +++ 1 file changed, 3 insertions(+) diff --git a/systest/parameters/fastnet/smesher.json b/systest/parameters/fastnet/smesher.json index d893b51277..73c20742c6 100644 --- a/systest/parameters/fastnet/smesher.json +++ b/systest/parameters/fastnet/smesher.json @@ -14,5 +14,8 @@ "transaction", "activation", "admin", "smesher" ], "grpc-private-services": [] + }, + "logging": { + "hare": "debug" } } From 1148c05cc1c4246ce1d2b849d7e3fb1791425978 Mon Sep 17 00:00:00 2001 From: Dmitry Date: Sat, 2 Sep 2023 08:48:05 +0200 Subject: [PATCH 2/7] fix expected delay and fire time events faster --- config/presets/fastnet.go | 2 ++ hare3/hare.go | 5 ++++- node/node.go | 2 +- timesync/config/config.go | 8 ++++++-- 4 files changed, 13 insertions(+), 4 deletions(-) diff --git a/config/presets/fastnet.go b/config/presets/fastnet.go index c056256803..f6f694246e 100644 --- a/config/presets/fastnet.go +++ b/config/presets/fastnet.go @@ -91,5 +91,7 @@ func fastnet() config.Config { conf.POET.CycleGap = 30 * time.Second conf.POET.PhaseShift = 30 * time.Second + conf.TIME.TickInterval = 100 * time.Millisecond + return conf } diff --git a/hare3/hare.go b/hare3/hare.go index 6ded5f34e0..af34bccd48 100644 --- a/hare3/hare.go +++ b/hare3/hare.go @@ -69,7 +69,10 @@ func (cfg *Config) MarshalLogObject(encoder zapcore.ObjectEncoder) error { // roundStart returns expected time for iter/round relative to // layer start. func (cfg *Config) roundStart(round IterRound) time.Duration { - return cfg.PreroundDelay + time.Duration(round.Absolute())*cfg.RoundDuration + if round.Round == 0 { + return cfg.PreroundDelay + } + return cfg.PreroundDelay + time.Duration(round.Absolute()-1)*cfg.RoundDuration } func DefaultConfig() Config { diff --git a/node/node.go b/node/node.go index 18d890a1ee..3d18314c7c 100644 --- a/node/node.go +++ b/node/node.go @@ -1436,7 +1436,7 @@ func (app *App) startSynchronous(ctx context.Context) (err error) { } app.clock, err = timesync.NewClock( timesync.WithLayerDuration(app.Config.LayerDuration), - timesync.WithTickInterval(1*time.Second), + timesync.WithTickInterval(app.Config.TIME.TickInterval), timesync.WithGenesisTime(gTime), timesync.WithLogger(app.addLogger(ClockLogger, lg)), ) diff --git a/timesync/config/config.go b/timesync/config/config.go index 285e727555..9a64dd7997 100644 --- a/timesync/config/config.go +++ b/timesync/config/config.go @@ -1,6 +1,8 @@ package config import ( + "time" + "github.com/spacemeshos/go-spacemesh/timesync/peersync" ) @@ -11,14 +13,16 @@ var ( // TimeConfig specifies the timesync params for ntp. type TimeConfig struct { - Peersync peersync.Config `mapstructure:"peersync"` + TickInterval time.Duration `mapstructure:"tick-interval"` + Peersync peersync.Config `mapstructure:"peersync"` } // DefaultConfig defines the default tymesync configuration. func DefaultConfig() TimeConfig { // TimeConfigValues defines default values for all time and ntp related params. TimeConfigValues := TimeConfig{ - Peersync: peersync.DefaultConfig(), + TickInterval: time.Second, + Peersync: peersync.DefaultConfig(), } return TimeConfigValues From 7e1484f45e52a2095b02e36e51ca19d907395280 Mon Sep 17 00:00:00 2001 From: Dmitry Date: Sat, 2 Sep 2023 10:01:48 +0200 Subject: [PATCH 3/7] 500ms was too optimistic --- config/presets/fastnet.go | 2 +- hare3/hare.go | 15 ++++++--------- 2 files changed, 7 insertions(+), 10 deletions(-) diff --git a/config/presets/fastnet.go b/config/presets/fastnet.go index f6f694246e..c3587a6759 100644 --- a/config/presets/fastnet.go +++ b/config/presets/fastnet.go @@ -39,7 +39,7 @@ func fastnet() config.Config { conf.HARE3.Committee = 800 conf.HARE3.Leaders = 10 conf.HARE3.PreroundDelay = 3 * time.Second - conf.HARE3.RoundDuration = 500 * time.Millisecond + conf.HARE3.RoundDuration = 700 * time.Millisecond conf.HARE3.IterationsLimit = 2 conf.P2P.MinPeers = 10 diff --git a/hare3/hare.go b/hare3/hare.go index af34bccd48..cd2979c7dd 100644 --- a/hare3/hare.go +++ b/hare3/hare.go @@ -391,9 +391,9 @@ func (h *Hare) run(layer types.LayerID, beacon types.Beacon, proto *protocol) er zap.Uint8("iter", proto.Iter), zap.Stringer("round", proto.Round), ) current := proto.IterRound - start := time.Now() var vrf *types.HareEligibility if current.IsMessageRound() { + start := time.Now() vrf = h.oracle.active(h.signer.NodeID(), layer, current) activeLatency.Observe(time.Since(start).Seconds()) } @@ -428,6 +428,7 @@ func (h *Hare) onOutput(layer types.LayerID, ir IterRound, out output, vrf *type out.message.Layer = layer out.message.Eligibility = *vrf out.message.Sender = h.signer.NodeID() + out.message.Signature = h.signer.Sign(signing.HARE, out.message.ToMetadata().ToBytes()) } h.log.Debug("round output", zap.Uint32("lid", layer.Uint32()), @@ -436,14 +437,10 @@ func (h *Hare) onOutput(layer types.LayerID, ir IterRound, out output, vrf *type zap.Bool("active", vrf != nil), ) if out.message != nil { - h.eg.Go(func() error { - out.message.Signature = h.signer.Sign(signing.HARE, out.message.ToMetadata().ToBytes()) - if err := h.pubsub.Publish(h.ctx, h.config.ProtocolName, out.message.ToBytes()); err != nil { - h.log.Error("failed to publish", zap.Inline(out.message), zap.Error(err)) - } - h.tracer.OnMessageSent(out.message) - return nil - }) + if err := h.pubsub.Publish(h.ctx, h.config.ProtocolName, out.message.ToBytes()); err != nil { + h.log.Error("failed to publish", zap.Inline(out.message), zap.Error(err)) + } + h.tracer.OnMessageSent(out.message) } if out.coin != nil { select { From 35d8e0648748f86a259683253d7248c19b94c7c9 Mon Sep 17 00:00:00 2001 From: Dmitry Date: Sat, 2 Sep 2023 10:21:27 +0200 Subject: [PATCH 4/7] this log is outdated --- hare3/hare.go | 1 - 1 file changed, 1 deletion(-) diff --git a/hare3/hare.go b/hare3/hare.go index cd2979c7dd..0fa89c5f01 100644 --- a/hare3/hare.go +++ b/hare3/hare.go @@ -380,7 +380,6 @@ func (h *Hare) run(layer types.LayerID, beacon types.Beacon, proto *protocol) er if err := h.onOutput(layer, current, proto.Next(vrf != nil), vrf); err != nil { return err } - h.log.Debug("ready to accept messages", zap.Uint32("lid", layer.Uint32())) walltime = walltime.Add(h.config.RoundDuration) result := false for { From 5441f82221650d09365d354f2cafad108264f493 Mon Sep 17 00:00:00 2001 From: Dmitry Date: Sat, 2 Sep 2023 10:54:08 +0200 Subject: [PATCH 5/7] revert clock changes --- config/presets/fastnet.go | 2 -- node/node.go | 2 +- timesync/config/config.go | 8 ++------ 3 files changed, 3 insertions(+), 9 deletions(-) diff --git a/config/presets/fastnet.go b/config/presets/fastnet.go index c3587a6759..8c08e30832 100644 --- a/config/presets/fastnet.go +++ b/config/presets/fastnet.go @@ -91,7 +91,5 @@ func fastnet() config.Config { conf.POET.CycleGap = 30 * time.Second conf.POET.PhaseShift = 30 * time.Second - conf.TIME.TickInterval = 100 * time.Millisecond - return conf } diff --git a/node/node.go b/node/node.go index 3d18314c7c..18d890a1ee 100644 --- a/node/node.go +++ b/node/node.go @@ -1436,7 +1436,7 @@ func (app *App) startSynchronous(ctx context.Context) (err error) { } app.clock, err = timesync.NewClock( timesync.WithLayerDuration(app.Config.LayerDuration), - timesync.WithTickInterval(app.Config.TIME.TickInterval), + timesync.WithTickInterval(1*time.Second), timesync.WithGenesisTime(gTime), timesync.WithLogger(app.addLogger(ClockLogger, lg)), ) diff --git a/timesync/config/config.go b/timesync/config/config.go index 9a64dd7997..285e727555 100644 --- a/timesync/config/config.go +++ b/timesync/config/config.go @@ -1,8 +1,6 @@ package config import ( - "time" - "github.com/spacemeshos/go-spacemesh/timesync/peersync" ) @@ -13,16 +11,14 @@ var ( // TimeConfig specifies the timesync params for ntp. type TimeConfig struct { - TickInterval time.Duration `mapstructure:"tick-interval"` - Peersync peersync.Config `mapstructure:"peersync"` + Peersync peersync.Config `mapstructure:"peersync"` } // DefaultConfig defines the default tymesync configuration. func DefaultConfig() TimeConfig { // TimeConfigValues defines default values for all time and ntp related params. TimeConfigValues := TimeConfig{ - TickInterval: time.Second, - Peersync: peersync.DefaultConfig(), + Peersync: peersync.DefaultConfig(), } return TimeConfigValues From 09a27d1bb5daa04626c0ea64912402ef72f127bf Mon Sep 17 00:00:00 2001 From: Dmitry Date: Tue, 5 Sep 2023 09:54:32 +0200 Subject: [PATCH 6/7] simplify synchronnization in tests --- hare3/hare.go | 7 +++---- hare3/hare_test.go | 17 +++++------------ 2 files changed, 8 insertions(+), 16 deletions(-) diff --git a/hare3/hare.go b/hare3/hare.go index 0fa89c5f01..01f0cfd092 100644 --- a/hare3/hare.go +++ b/hare3/hare.go @@ -17,6 +17,7 @@ import ( "github.com/spacemeshos/go-spacemesh/common/types" "github.com/spacemeshos/go-spacemesh/datastore" "github.com/spacemeshos/go-spacemesh/layerpatrol" + "github.com/spacemeshos/go-spacemesh/log" "github.com/spacemeshos/go-spacemesh/metrics" "github.com/spacemeshos/go-spacemesh/p2p" "github.com/spacemeshos/go-spacemesh/p2p/pubsub" @@ -281,11 +282,9 @@ func (h *Hare) Handler(ctx context.Context, peer p2p.Peer, buf []byte) error { malicious: malicious, atxgrade: g, } + h.log.Debug("on message", zap.Inline(input)) gossip, equivocation := session.OnInput(input) - h.log.Debug("on message", - zap.Inline(input), - zap.Bool("gossip", gossip), - ) + h.log.Debug("after on message", log.ZShortStringer("hash", input.msgHash), zap.Bool("gossip", gossip)) submitLatency.Observe(time.Since(start).Seconds()) if equivocation != nil && !malicious { h.log.Debug("registered equivocation", diff --git a/hare3/hare_test.go b/hare3/hare_test.go index 19fda4d9ed..d8b4b83b5c 100644 --- a/hare3/hare_test.go +++ b/hare3/hare_test.go @@ -13,7 +13,6 @@ import ( "github.com/golang/mock/gomock" "github.com/stretchr/testify/require" "go.uber.org/zap/zapcore" - "golang.org/x/sync/errgroup" "github.com/spacemeshos/go-spacemesh/codec" "github.com/spacemeshos/go-spacemesh/common/types" @@ -65,13 +64,14 @@ type testNodeClock struct { started types.LayerID waiters []waiter - clock clock.Clock genesis time.Time layerDuration time.Duration } func (t *testNodeClock) CurrentLayer() types.LayerID { - return types.LayerID(t.clock.Now().Sub(t.genesis) / t.layerDuration) + t.mu.Lock() + defer t.mu.Unlock() + return t.started } func (t *testNodeClock) LayerToTime(lid types.LayerID) time.Time { @@ -208,7 +208,6 @@ func (n *node) withHare() *node { require.NoError(n.t, err) n.nclock = &testNodeClock{ - clock: n.clock, genesis: n.t.start, layerDuration: n.t.layerDuration, } @@ -396,17 +395,11 @@ func (cl *lockstepCluster) setup() { n.oracle.UpdateActiveSet(cl.t.genesis.GetEpoch()+1, active) n.mpublisher.EXPECT().Publish(gomock.Any(), gomock.Any(), gomock.Any()).Do(func(ctx context.Context, _ string, msg []byte) error { cl.timedReceive(cl.start) - var eg errgroup.Group for _, other := range cl.nodes { - other := other - eg.Go(func() error { - other.hare.Handler(ctx, "self", msg) - return nil - }) + other.hare.Handler(ctx, "self", msg) } - err := eg.Wait() cl.timedSend(cl.complete) - return err + return nil }).AnyTimes() } } From 4dce103f560b08ed6f72cfde0dc5da49f4b25a19 Mon Sep 17 00:00:00 2001 From: Dmitry Date: Thu, 7 Sep 2023 07:56:45 +0200 Subject: [PATCH 7/7] restore goroutine to sign and publish --- hare3/hare.go | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/hare3/hare.go b/hare3/hare.go index 01f0cfd092..643089aea3 100644 --- a/hare3/hare.go +++ b/hare3/hare.go @@ -426,7 +426,6 @@ func (h *Hare) onOutput(layer types.LayerID, ir IterRound, out output, vrf *type out.message.Layer = layer out.message.Eligibility = *vrf out.message.Sender = h.signer.NodeID() - out.message.Signature = h.signer.Sign(signing.HARE, out.message.ToMetadata().ToBytes()) } h.log.Debug("round output", zap.Uint32("lid", layer.Uint32()), @@ -435,10 +434,14 @@ func (h *Hare) onOutput(layer types.LayerID, ir IterRound, out output, vrf *type zap.Bool("active", vrf != nil), ) if out.message != nil { - if err := h.pubsub.Publish(h.ctx, h.config.ProtocolName, out.message.ToBytes()); err != nil { - h.log.Error("failed to publish", zap.Inline(out.message), zap.Error(err)) - } - h.tracer.OnMessageSent(out.message) + h.eg.Go(func() error { + out.message.Signature = h.signer.Sign(signing.HARE, out.message.ToMetadata().ToBytes()) + if err := h.pubsub.Publish(h.ctx, h.config.ProtocolName, out.message.ToBytes()); err != nil { + h.log.Error("failed to publish", zap.Inline(out.message), zap.Error(err)) + } + h.tracer.OnMessageSent(out.message) + return nil + }) } if out.coin != nil { select {