Skip to content

Commit

Permalink
find topmost applied layer and prune already applied results (#5118)
Browse files Browse the repository at this point in the history
In 1.2.0 I noticed a regression:
- if, after a restart, a layer that was already applied is considered undecided by tortoise
- and mesh fails to apply layers for **any** reason, such as a block missing locally

the node will loop with the following warning:

> 2023-10-02T15:28:14.002+0200	WARN	fd68b.sync	mesh failed to process layer from sync	{"node_id": "fd68b9397572556c2f329f3e5af2faf23aef85dbbbb7e38447fae2f4ef38899f", "module": "sync", "sessionId": "29422935-68d6-47d1-87a8-02293aa181f3", "layer_id": 23104, "errmsg": "requested layer 8063 is before evicted 13102", "name": "sync"}

This change is a hotfix for that problem. After tallying votes, we scan all layers before the tallied one and find the topmost layer that was applied before and is considered valid by tortoise.
  • Loading branch information
dshulyak committed Oct 2, 2023
1 parent 27e05e5 commit 68d096d
Show file tree
Hide file tree
Showing 5 changed files with 62 additions and 12 deletions.
10 changes: 10 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,16 @@

See [RELEASE](./RELEASE.md) for workflow instructions.

## UNRELEASED

### Improvements

* [#5118](https://github.com/spacemeshos/go-spacemesh/pull/5118) Reduce the number of tortoise results returned after recovery.

  This is a hotfix for a bug introduced in v1.2.0. In rare conditions a node may loop with the following warning:

> 2023-10-02T15:28:14.002+0200 WARN fd68b.sync mesh failed to process layer from sync {"node_id": "fd68b9397572556c2f329f3e5af2faf23aef85dbbbb7e38447fae2f4ef38899f", "module": "sync", "sessionId": "29422935-68d6-47d1-87a8-02293aa181f3", "layer_id": 23104, "errmsg": "requested layer 8063 is before evicted 13102", "name": "sync"}
## v1.2.0

### Upgrade information
Expand Down
4 changes: 3 additions & 1 deletion tortoise/algorithm.go
Original file line number Diff line number Diff line change
Expand Up @@ -535,8 +535,10 @@ func (t *Tortoise) Mode() Mode {
// resetPending compares stored opinion with computed opinion and sets
// pending layer to the layer above equal layer.
// this method is meant to be used only in recovery from disk codepath.
func (t *Tortoise) resetPending(lid types.LayerID, opinion types.Hash32) {
// resetPending checks whether the opinion tortoise holds for layer lid
// equals the supplied opinion. On a match the pending layer is moved to
// lid + 1 and true is returned; otherwise pending is left untouched and
// false is returned.
// It is meant to be used only in the recovery-from-disk codepath.
func (t *Tortoise) resetPending(lid types.LayerID, opinion types.Hash32) bool {
	matched := t.trtl.layer(lid).opinion == opinion
	if matched {
		// results up to and including lid do not need to be reported again
		t.trtl.pending = lid + 1
	}
	return matched
}
2 changes: 1 addition & 1 deletion tortoise/model/core.go
Original file line number Diff line number Diff line change
Expand Up @@ -129,7 +129,7 @@ func (c *core) OnMessage(m Messenger, event Message) {
m.Send(MessageBallot{Ballot: ballot})
case MessageLayerEnd:
if ev.LayerID.After(types.GetEffectiveGenesis()) {
tortoise.RecoverLayer(context.Background(), c.tortoise, c.cdb, c.beacons, ev.LayerID, ev.LayerID, ev.LayerID)
tortoise.RecoverLayer(context.Background(), c.tortoise, c.cdb, c.beacons, ev.LayerID, ev.LayerID, ev.LayerID, ev.LayerID)
m.Notify(EventVerified{ID: c.id, Verified: c.tortoise.LatestComplete(), Layer: ev.LayerID})
}

Expand Down
25 changes: 16 additions & 9 deletions tortoise/recover.go
Original file line number Diff line number Diff line change
Expand Up @@ -56,13 +56,14 @@ func Recover(ctx context.Context, db *datastore.CachedDB, latest types.LayerID,
}
}
}
for lid := types.GetEffectiveGenesis().Add(1); !lid.After(last); lid = lid.Add(1) {
start := types.GetEffectiveGenesis().Add(1)
for lid := start; !lid.After(last); lid = lid.Add(1) {
select {
case <-ctx.Done():
return nil, ctx.Err()
default:
}
if err := RecoverLayer(ctx, trtl, db, beacon, lid, last, min(last, latest)); err != nil {
if err := RecoverLayer(ctx, trtl, db, beacon, start, lid, last, min(last, latest)); err != nil {
return nil, fmt.Errorf("failed to load tortoise state at layer %d: %w", lid, err)
}
}
Expand All @@ -83,7 +84,7 @@ func recoverEpoch(epoch types.EpochID, trtl *Tortoise, db *datastore.CachedDB, b
return nil
}

func RecoverLayer(ctx context.Context, trtl *Tortoise, db *datastore.CachedDB, beacon system.BeaconGetter, lid, last, current types.LayerID) error {
func RecoverLayer(ctx context.Context, trtl *Tortoise, db *datastore.CachedDB, beacon system.BeaconGetter, start, lid, last, current types.LayerID) error {
if lid.FirstInEpoch() {
if err := recoverEpoch(lid.GetEpoch(), trtl, db, beacon); err != nil {
return err
Expand Down Expand Up @@ -135,13 +136,19 @@ func RecoverLayer(ctx context.Context, trtl *Tortoise, db *datastore.CachedDB, b
}
if lid <= current && (lid%types.LayerID(trtl.cfg.WindowSize) == 0 || lid == last) {
trtl.TallyVotes(ctx, lid)

opinion, err := layers.GetAggregatedHash(db, lid-1)
if err == nil {
trtl.resetPending(lid-1, opinion)
} else if !errors.Is(err, sql.ErrNotFound) {
return fmt.Errorf("check opinion %w", err)
// find topmost layer that was already applied and reset pending
// so that result for that layer is not returned
for prev := lid - 1; prev >= start; prev-- {
opinion, err := layers.GetAggregatedHash(db, prev)
if err == nil {
if trtl.resetPending(prev, opinion) {
return nil
}
} else if !errors.Is(err, sql.ErrNotFound) {
return fmt.Errorf("check opinion %w", err)
}
}

}
return nil
}
33 changes: 32 additions & 1 deletion tortoise/recover_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ func (a *recoveryAdapter) TallyVotes(ctx context.Context, current types.LayerID)
a.prev = genesis
}
for lid := a.prev; lid <= current; lid++ {
require.NoError(a, RecoverLayer(ctx, a.Tortoise, a.db, a.beacon, lid, current, current))
require.NoError(a, RecoverLayer(ctx, a.Tortoise, a.db, a.beacon, lid, lid, current, current))
a.prev = lid
}
}
Expand Down Expand Up @@ -100,3 +100,34 @@ func TestRecoverWithOpinion(t *testing.T) {
require.Len(t, updates, 1)
require.Equal(t, updates[0], last)
}

// TestResetPending tallies n simulated layers, stores the mesh hash for the
// first n/2 reported updates, and then verifies that a tortoise recovered
// from the same database only reports the remaining layers.
func TestResetPending(t *testing.T) {
	const (
		size = 10
		n    = 10
	)
	s := sim.New(sim.WithLayerSize(size))
	s.Setup()

	cfg := defaultTestConfig()
	cfg.LayerSize = size

	trt := tortoiseFromSimState(t, s.GetState(0), WithConfig(cfg), WithLogger(logtest.New(t)))
	var last types.LayerID
	for _, lid := range sim.GenLayers(s, sim.WithSequence(n)) {
		trt.TallyVotes(context.Background(), lid)
		last = lid
	}

	// the fresh tortoise reports every layer from effective genesis up to last
	before := trt.Updates()
	require.Len(t, before, n+1)
	require.Equal(t, types.GetEffectiveGenesis(), before[0].Layer)
	require.Equal(t, last, before[n].Layer)

	// persist the opinion of the first half of the updates as the mesh hash
	for i := 0; i < n/2; i++ {
		require.NoError(t, layers.SetMeshHash(s.GetState(0).DB, before[i].Layer, before[i].Opinion))
	}

	recovered, err := Recover(context.Background(), s.GetState(0).DB, last, s.GetState(0).Beacons, WithLogger(logtest.New(t)), WithConfig(cfg))
	require.NoError(t, err)

	// only layers from last-n/2 up to last are expected after recovery
	after := recovered.Updates()
	require.Len(t, after, n/2+1)
	require.Equal(t, last-n/2, after[0].Layer)
	require.Equal(t, last, after[n/2].Layer)
}

0 comments on commit 68d096d

Please sign in to comment.