From 7104e0a1e7d5409ad87d1e7cd8d43d67a757d308 Mon Sep 17 00:00:00 2001
From: gammazero <11790789+gammazero@users.noreply.github.com>
Date: Thu, 12 Dec 2024 16:28:20 -1000
Subject: [PATCH] improve mfs republisher

- Get updated values while retrying to publish
- Prefer Close to lifecycle context
- Do not require call to Start separate from New
---
 mfs/repub.go      | 157 ++++++++++++++++++++++++++--------------------
 mfs/repub_test.go |  54 +++++++++++-----
 mfs/root.go       |  17 +++--
 3 files changed, 136 insertions(+), 92 deletions(-)

diff --git a/mfs/repub.go b/mfs/repub.go
index 3525d919c..25228c47a 100644
--- a/mfs/repub.go
+++ b/mfs/repub.go
@@ -2,43 +2,47 @@ package mfs
 
 import (
 	"context"
+	"errors"
+	"sync"
 	"time"
 
 	cid "github.com/ipfs/go-cid"
 )
 
+// closeTimeout is how long to wait for current publishing to finish before
+// shutting down the republisher.
+const closeTimeout = 5 * time.Second
+
 // PubFunc is the user-defined function that determines exactly what
 // logic entails "publishing" a `Cid` value.
 type PubFunc func(context.Context, cid.Cid) error
 
 // Republisher manages when to publish a given entry.
 type Republisher struct {
-	TimeoutLong  time.Duration
-	TimeoutShort time.Duration
-	RetryTimeout time.Duration
-	pubfunc      PubFunc
-
+	pubfunc          PubFunc
 	update           chan cid.Cid
 	immediatePublish chan chan struct{}
 
-	ctx    context.Context
-	cancel func()
+	cancel    func()
+	closeOnce sync.Once
+	stopped   chan struct{}
 }
 
 // NewRepublisher creates a new Republisher object to republish the given root
 // using the given short and long time intervals.
-func NewRepublisher(ctx context.Context, pf PubFunc, tshort, tlong time.Duration) *Republisher {
-	ctx, cancel := context.WithCancel(ctx)
-	return &Republisher{
-		TimeoutShort:     tshort,
-		TimeoutLong:      tlong,
-		RetryTimeout:     tlong,
+func NewRepublisher(pf PubFunc, tshort, tlong time.Duration, lastPublished cid.Cid) *Republisher {
+	ctx, cancel := context.WithCancel(context.Background())
+	rp := &Republisher{
 		update:           make(chan cid.Cid, 1),
 		pubfunc:          pf,
 		immediatePublish: make(chan chan struct{}),
-		ctx:              ctx,
 		cancel:           cancel,
+		stopped:          make(chan struct{}),
 	}
+
+	go rp.run(ctx, tshort, tlong, lastPublished)
+
+	return rp
 }
 
 // WaitPub waits for the current value to be published (or returns early
@@ -58,10 +62,22 @@ func (rp *Republisher) WaitPub(ctx context.Context) error {
 	}
 }
 
+// Close tells the republisher to stop and waits for it to stop.
 func (rp *Republisher) Close() error {
-	// TODO(steb): Wait for `Run` to stop
-	err := rp.WaitPub(rp.ctx)
-	rp.cancel()
+	var err error
+	rp.closeOnce.Do(func() {
+		// Wait a short amount of time for any current publishing to finish.
+		ctx, cancel := context.WithTimeout(context.Background(), closeTimeout)
+		err = rp.WaitPub(ctx)
+		if errors.Is(err, context.DeadlineExceeded) {
+			err = errors.New("mfs/republisher: timed out waiting to publish during close")
+		}
+		cancel()
+		// Shutdown the publisher.
+		rp.cancel()
+	})
+	// Wait for pblisher to stop and then return.
+	<-rp.stopped
 	return err
 }
 
@@ -82,22 +98,23 @@ func (rp *Republisher) Update(c cid.Cid) {
 }
 
 // Run contains the core logic of the `Republisher`. It calls the user-defined
-// `pubfunc` function whenever the `Cid` value is updated to a *new* value. The
-// complexity comes from the fact that `pubfunc` may be slow so we need to batch
-// updates.
+// `pubfunc` function whenever the `Cid` value is updated to a *new* value.
+// Since calling the `pubfunc` may be slow, updates are batched
 //
 // Algorithm:
-//  1. When we receive the first update after publishing, we set a `longer` timer.
-//  2. When we receive any update, we reset the `quick` timer.
-//  3. If either the `quick` timeout or the `longer` timeout elapses,
-//     we call `publish` with the latest updated value.
+//  1. When receiving the first update after publishing, set a `longer` timer
+//  2. When receiving any update, reset the `quick` timer
+//  3. If either the `quick` timeout or the `longer` timeout elapses, call
+//     `publish` with the latest updated value.
 //
-// The `longer` timer ensures that we delay publishing by at most
-// `TimeoutLong`. The `quick` timer allows us to publish sooner if
-// it looks like there are no more updates coming down the pipe.
+// The `longer` timer ensures that publishing is delayed by at most that
+// duration. The `quick` timer allows publishing sooner if there are no more
+// updates available.
 //
-// Note: If a publish fails, we retry repeatedly every TimeoutRetry.
-func (rp *Republisher) Run(lastPublished cid.Cid) {
+// If a publish fails, retry repeatedly every `longer` timeout.
+func (rp *Republisher) run(ctx context.Context, timeoutShort, timeoutLong time.Duration, lastPublished cid.Cid) {
+	defer close(rp.stopped)
+
 	quick := time.NewTimer(0)
 	if !quick.Stop() {
 		<-quick.C
@@ -107,12 +124,13 @@ func (rp *Republisher) Run(lastPublished cid.Cid) {
 		<-longer.C
 	}
 
+	immediatePublish := rp.immediatePublish
 	var toPublish cid.Cid
-	for rp.ctx.Err() == nil {
-		var waiter chan struct{}
+	var waiter chan struct{}
 
+	for {
 		select {
-		case <-rp.ctx.Done():
+		case <-ctx.Done():
 			return
 		case newValue := <-rp.update:
 			// Skip already published values.
@@ -123,19 +141,20 @@ func (rp *Republisher) Run(lastPublished cid.Cid) {
 				break
 			}
 
-			// If we aren't already waiting to publish something,
-			// reset the long timeout.
+			// If mot already waiting to publish something, reset the long
+			// timeout.
 			if !toPublish.Defined() {
-				longer.Reset(rp.TimeoutLong)
+				longer.Reset(timeoutLong)
 			}
 
 			// Always reset the short timeout.
-			quick.Reset(rp.TimeoutShort)
+			quick.Reset(timeoutShort)
 
 			// Finally, set the new value to publish.
 			toPublish = newValue
+			// Wait for a newer value or the quick timer.
 			continue
-		case waiter = <-rp.immediatePublish:
+		case waiter = <-immediatePublish:
 			// Make sure to grab the *latest* value to publish.
 			select {
 			case toPublish = <-rp.update:
@@ -147,60 +166,62 @@ func (rp *Republisher) Run(lastPublished cid.Cid) {
 				toPublish = cid.Undef
 			}
 		case <-quick.C:
+			// Waited a short time for more updates and no more received.
 		case <-longer.C:
+			// Keep getting updates and now it is time to send what has been
+			// received so far.
 		}
 
 		// Cleanup, publish, and close waiters.
 
-		// 1. Stop any timers. Don't use the `if !t.Stop() { ... }`
-		//    idiom as these timers may not be running.
-
+		// 1. Stop any timers.
 		quick.Stop()
+		longer.Stop()
+
+		// Do not use the `if !t.Stop() { ... }` idiom as these timers may not
+		// be running.
+		//
+		// TODO: remove after go1.23 required.
 		select {
 		case <-quick.C:
 		default:
 		}
-
-		longer.Stop()
 		select {
 		case <-longer.C:
 		default:
 		}
 
-		// 2. If we have a value to publish, publish it now.
+		// 2. If there is a value to publish then publish it now.
 		if toPublish.Defined() {
-			var timer *time.Timer
-			for {
-				err := rp.pubfunc(rp.ctx, toPublish)
-				if err == nil {
-					break
-				}
-
-				if timer == nil {
-					timer = time.NewTimer(rp.RetryTimeout)
-					defer timer.Stop()
-				} else {
-					timer.Reset(rp.RetryTimeout)
-				}
-
-				// Keep retrying until we succeed or we abort.
-				// TODO(steb): We could try pulling new values
-				// off `update` but that's not critical (and
-				// complicates this code a bit). We'll pull off
-				// a new value on the next loop through.
-				select {
-				case <-timer.C:
-				case <-rp.ctx.Done():
-					return
-				}
+			err := rp.pubfunc(ctx, toPublish)
+			if err != nil {
+				// Republish failed, so retry after waiting for long timeout.
+				//
+				// Instead of entering a retry loop here, go back to waiting
+				// for more values and retrying to publish after the lomg
+				// timeout. Keep using the current waiter until it has been
+				// notified of a successful publish.
+				//
+				// Reset the long timer as it effectively becomes the retry
+				// timeout.
+				longer.Reset(timeoutLong)
+				// Stop reading waiters from immediatePublish while retrying,
+				// This causes the current waiter to be notified only after a
+				// successful call to pubfunc, and is what constitutes a retry.
+				immediatePublish = nil
+				continue
 			}
 			lastPublished = toPublish
 			toPublish = cid.Undef
+			// Resume reading waiters,
+			immediatePublish = rp.immediatePublish
 		}
 
-		// 3. Trigger anything waiting in `WaitPub`.
+		// 3. Notify anything waiting in `WaitPub` on successful call to
+		// pubfunc or if nothing to publish.
 		if waiter != nil {
 			close(waiter)
+			waiter = nil
 		}
 	}
 }
diff --git a/mfs/repub_test.go b/mfs/repub_test.go
index 6be5624ab..bb6385019 100644
--- a/mfs/repub_test.go
+++ b/mfs/repub_test.go
@@ -7,6 +7,7 @@ import (
 
 	cid "github.com/ipfs/go-cid"
 	ci "github.com/libp2p/go-libp2p-testing/ci"
+	"github.com/stretchr/testify/require"
 )
 
 func TestRepublisher(t *testing.T) {
@@ -14,12 +15,14 @@ func TestRepublisher(t *testing.T) {
 		t.Skip("dont run timing tests in CI")
 	}
 
-	ctx := context.TODO()
-
 	pub := make(chan struct{})
 
 	pf := func(ctx context.Context, c cid.Cid) error {
-		pub <- struct{}{}
+		select {
+		case pub <- struct{}{}:
+		case <-ctx.Done():
+			return ctx.Err()
+		}
 		return nil
 	}
 
@@ -29,8 +32,7 @@ func TestRepublisher(t *testing.T) {
 	tshort := time.Millisecond * 50
 	tlong := time.Second / 2
 
-	rp := NewRepublisher(ctx, pf, tshort, tlong)
-	go rp.Run(cid.Undef)
+	rp := NewRepublisher(pf, tshort, tlong, cid.Undef)
 
 	rp.Update(testCid1)
 
@@ -41,16 +43,17 @@ func TestRepublisher(t *testing.T) {
 	case <-pub:
 	}
 
-	cctx, cancel := context.WithCancel(context.Background())
-
+	stopUpdates := make(chan struct{})
 	go func() {
+		timer := time.NewTimer(time.Hour)
+		defer timer.Stop()
 		for {
 			rp.Update(testCid2)
-			time.Sleep(time.Millisecond * 10)
+			timer.Reset(time.Millisecond * 10)
 			select {
-			case <-cctx.Done():
+			case <-timer.C:
+			case <-stopUpdates:
 				return
-			default:
 			}
 		}
 	}()
@@ -66,10 +69,33 @@ func TestRepublisher(t *testing.T) {
 		t.Fatal("waited too long for pub!")
 	}
 
-	cancel()
+	close(stopUpdates)
 
-	err := rp.Close()
-	if err != nil {
-		t.Fatal(err)
+	ctx, cancel := context.WithTimeout(context.Background(), 10*time.Millisecond)
+	defer cancel()
+
+	// Check that republishing update does not call pubfunc again
+	rp.Update(testCid2)
+	err := rp.WaitPub(context.Background())
+	require.NoError(t, err)
+	select {
+	case <-pub:
+		t.Fatal("pub func called again with repeated update")
+	case <-time.After(tlong * 2):
 	}
+
+	// Check that waitpub times out when blocked pubfunc is called
+	rp.Update(testCid1)
+	err = rp.WaitPub(ctx)
+	require.ErrorIs(t, err, context.DeadlineExceeded)
+
+	// Unblock pubfunc.
+	<-pub
+
+	err = rp.Close()
+	require.NoError(t, err)
+
+	// Check that additional call to Close is OK after republisher stopped.
+	err = rp.Close()
+	require.NoError(t, err)
 }
diff --git a/mfs/root.go b/mfs/root.go
index 5a7cb7ed1..e584b6e06 100644
--- a/mfs/root.go
+++ b/mfs/root.go
@@ -64,6 +64,11 @@ const (
 	TDir
 )
 
+const (
+	repubQuick = 300 * time.Millisecond
+	repubLong  = 3 * time.Second
+)
+
 // FSNode abstracts the `Directory` and `File` structures, it represents
 // any child node in the MFS (i.e., all the nodes besides the `Root`). It
 // is the counterpart of the `parent` interface which represents any
@@ -100,12 +105,7 @@ type Root struct {
 func NewRoot(parent context.Context, ds ipld.DAGService, node *dag.ProtoNode, pf PubFunc) (*Root, error) {
 	var repub *Republisher
 	if pf != nil {
-		repub = NewRepublisher(parent, pf, time.Millisecond*300, time.Second*3)
-
-		// No need to take the lock here since we just created
-		// the `Republisher` and no one has access to it yet.
-
-		go repub.Run(node.Cid())
+		repub = NewRepublisher(pf, repubQuick, repubLong, node.Cid())
 	}
 
 	root := &Root{
@@ -177,10 +177,7 @@ func (kr *Root) FlushMemFree(ctx context.Context) error {
 	dir.lock.Lock()
 	defer dir.lock.Unlock()
 
-	for name := range dir.entriesCache {
-		delete(dir.entriesCache, name)
-	}
-	// TODO: Can't we just create new maps?
+	clear(dir.entriesCache)
 
 	return nil
 }