From 71e8fd24696dd5d2b35c45855f44e7a84dbd4284 Mon Sep 17 00:00:00 2001
From: Travis Bischel
Date: Sun, 21 Jul 2024 20:17:29 -0600
Subject: [PATCH] kgo: fix deadlock in Produce when using MaxBufferedBytes

Copying from the issue,

"""
1) Produce() record A (100 bytes)
2) Produce() record B (50 bytes), waiting for buffer to free
3) Produce() record C (50 bytes), waiting for buffer to free
4) Record A is produced, finishRecordPromise() gets called, detects it was over the limit so publish 1 message to waitBuffer
5) Record B is unlocked, finishRecordPromise() gets called, does not detect it was over the limit (only 50 bytes), so record C is never unblocked and will wait indefinitely on waitBuffer
"""

The fix requires taking a lock while producing; we reuse the existing lock
on the `producer` type. This can lead to a few more spurious wakeups in
other functions that use this same mutex, but that's fine.

The prior algorithm immediately counted everything to be produced against
the buffered records and bytes; the fix for #777 is not really possible
unless we avoid counting a record until it actually fits. Specifically, we
need a goroutine looping with a sync.Cond that checks: *IF* we add this
record, will we still be over the limits? This lets us always wake up every
blocked goroutine (rather than one at a time, which is the problem this
issue points out), and each goroutine can then re-check under the lock
whether its record still does not fit.

This also fixes an unreported bug: if a record WOULD have blocked but fails
early (no topic, or a transactional client that is not currently in a
transaction), the serial promise-finishing goroutine would deadlock.

Closes #777.
---
 pkg/kgo/produce_request_test.go |  64 +++++++++++
 pkg/kgo/producer.go             | 184 ++++++++++++++++++++------------
 pkg/kgo/sink.go                 |   2 +-
 3 files changed, 181 insertions(+), 69 deletions(-)

diff --git a/pkg/kgo/produce_request_test.go b/pkg/kgo/produce_request_test.go
index 8a652a2f..ea005bb9 100644
--- a/pkg/kgo/produce_request_test.go
+++ b/pkg/kgo/produce_request_test.go
@@ -2,13 +2,77 @@ package kgo
 
 import (
 	"bytes"
+	"context"
+	"errors"
 	"hash/crc32"
+	"math/rand"
+	"strings"
+	"sync"
+	"sync/atomic"
 	"testing"
 
 	"github.com/twmb/franz-go/pkg/kbin"
 	"github.com/twmb/franz-go/pkg/kmsg"
 )
 
+func TestClient_Produce(t *testing.T) {
+	var (
+		topic, cleanup = tmpTopicPartitions(t, 1)
+		numWorkers     = 50
+		recsToWrite    = int64(20_000)
+
+		workers      sync.WaitGroup
+		writeSuccess atomic.Int64
+		writeFailure atomic.Int64
+
+		randRec = func() *Record {
+			return &Record{
+				Key:   []byte("test"),
+				Value: []byte(strings.Repeat("x", rand.Intn(1000))),
+				Topic: topic,
+			}
+		}
+	)
+	defer cleanup()
+
+	client, err := NewClient(MaxBufferedBytes(5000))
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	// Start N workers that will concurrently write to the same partition.
+	var recsWritten atomic.Int64
+	var fatal atomic.Bool
+	for i := 0; i < numWorkers; i++ {
+		workers.Add(1)
+
+		go func() {
+			defer workers.Done()
+
+			for recsWritten.Add(1) <= recsToWrite {
+				res := client.ProduceSync(context.Background(), randRec())
+				if err := res.FirstErr(); err == nil {
+					writeSuccess.Add(1)
+				} else {
+					if !errors.Is(err, ErrMaxBuffered) {
+						t.Errorf("unexpected error: %v", err)
+						fatal.Store(true)
+					}
+
+					writeFailure.Add(1)
+				}
+			}
+		}()
+	}
+	workers.Wait()
+
+	t.Logf("writes succeeded: %d", writeSuccess.Load())
+	t.Logf("writes failed: %d", writeFailure.Load())
+	if fatal.Load() {
+		t.Fatal("failed")
+	}
+}
+
 // This file contains golden tests against kmsg AppendTo's to ensure our custom
 // encoding is correct.
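
Before the producer.go changes below, a minimal standalone sketch of the pattern the commit message describes may help: admission is decided under a single mutex, blocked producers wait on a sync.Cond and re-check whether their record now fits, and every release broadcasts to all waiters. The bufferLimiter type, its field names, and the byte counts are illustrative only; this sketch tracks bytes but not record counts, context cancellation, or the early-failure paths the real code handles.

package main

import (
	"fmt"
	"sync"
)

// bufferLimiter is a toy stand-in for the producer's buffered-bytes
// accounting: admission is decided under one mutex, waiters sleep on a
// sync.Cond, and every release broadcasts so all waiters re-check.
type bufferLimiter struct {
	mu       sync.Mutex
	c        *sync.Cond
	buffered int64 // bytes currently admitted
	maxBytes int64
}

func newBufferLimiter(maxBytes int64) *bufferLimiter {
	b := &bufferLimiter{maxBytes: maxBytes}
	b.c = sync.NewCond(&b.mu)
	return b
}

// acquire blocks until adding size keeps the total at or under maxBytes,
// then counts the bytes. Nothing is counted while we are still blocked.
func (b *bufferLimiter) acquire(size int64) {
	b.mu.Lock()
	defer b.mu.Unlock()
	// Re-check after every wakeup: a Broadcast only means "something
	// finished", not "you in particular now fit". The deadlock in #777
	// came from waking exactly one waiter, and only when the finishing
	// record itself had been over the limit.
	for b.buffered+size > b.maxBytes {
		b.c.Wait()
	}
	b.buffered += size
}

// release un-counts a finished record and wakes every waiter to re-check.
func (b *bufferLimiter) release(size int64) {
	b.mu.Lock()
	b.buffered -= size
	b.mu.Unlock()
	b.c.Broadcast()
}

func main() {
	lim := newBufferLimiter(150)
	var wg sync.WaitGroup
	for _, size := range []int64{100, 50, 50, 100} { // same shape as the issue: A, B, C, ...
		wg.Add(1)
		go func(size int64) {
			defer wg.Done()
			lim.acquire(size)
			// ... "produce" the record here ...
			lim.release(size)
		}(size)
	}
	wg.Wait()
	fmt.Println("all records produced, buffered bytes now:", lim.buffered)
}

The real produce path additionally has to honor client shutdown and per-call context cancellation while blocked, which is why the patch below moves the Cond wait into its own goroutine and selects on the wait channel, cl.ctx.Done(), and ctx.Done().
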
diff --git a/pkg/kgo/producer.go b/pkg/kgo/producer.go
index f09b4712..a399a9c1 100644
--- a/pkg/kgo/producer.go
+++ b/pkg/kgo/producer.go
@@ -14,9 +14,15 @@ import (
 )
 
 type producer struct {
-	bufferedRecords atomicI64
-	bufferedBytes   atomicI64
-	inflight        atomicI64 // high 16: # waiters, low 48: # inflight
+	inflight atomicI64 // high 16: # waiters, low 48: # inflight
+
+	// mu and c are used for flush and drain notifications; mu is used for
+	// a few other tight locks.
+	mu sync.Mutex
+	c  *sync.Cond
+
+	bufferedRecords int64
+	bufferedBytes   int64
 
 	cl *Client
 
@@ -45,19 +51,14 @@ type producer struct {
 	// We must have a producer field for flushing; we cannot just have a
 	// field on recBufs that is toggled on flush. If we did, then a new
 	// recBuf could be created and records sent to while we are flushing.
-	flushing atomicI32 // >0 if flushing, can Flush many times concurrently
-	blocked  atomicI32 // >0 if over max recs or bytes
+	flushing     atomicI32 // >0 if flushing, can Flush many times concurrently
+	blocked      atomicI32 // >0 if over max recs or bytes
+	blockedBytes int64
 
 	aborting atomicI32 // >0 if aborting, can abort many times concurrently
 
-	idMu       sync.Mutex
-	idVersion  int16
-	waitBuffer chan struct{}
-
-	// mu and c are used for flush and drain notifications; mu is used for
-	// a few other tight locks.
-	mu sync.Mutex
-	c  *sync.Cond
+	idMu      sync.Mutex
+	idVersion int16
 
 	batchPromises ringBatchPromise
 	promisesMu    sync.Mutex
@@ -86,14 +87,18 @@ type producer struct {
 // flushing records produced by your client (which can help determine network /
 // cluster health).
 func (cl *Client) BufferedProduceRecords() int64 {
-	return cl.producer.bufferedRecords.Load()
+	cl.producer.mu.Lock()
+	defer cl.producer.mu.Unlock()
+	return cl.producer.bufferedRecords + int64(cl.producer.blocked.Load())
 }
 
 // BufferedProduceBytes returns the number of bytes currently buffered for
 // producing within the client. This is the sum of all keys, values, and header
 // keys/values. See the related [BufferedProduceRecords] for more information.
 func (cl *Client) BufferedProduceBytes() int64 {
-	return cl.producer.bufferedBytes.Load()
+	cl.producer.mu.Lock()
+	defer cl.producer.mu.Unlock()
+	return cl.producer.bufferedBytes + cl.producer.blockedBytes
 }
 
 type unknownTopicProduces struct {
@@ -106,7 +111,6 @@ func (p *producer) init(cl *Client) {
 	p.cl = cl
 	p.topics = newTopicsPartitions()
 	p.unknownTopics = make(map[string]*unknownTopicProduces)
-	p.waitBuffer = make(chan struct{}, math.MaxInt32)
 	p.idVersion = -1
 	p.id.Store(&producerID{
 		id:    -1,
@@ -397,58 +401,88 @@ func (cl *Client) produce(
 		}
 	}
 
-	var (
-		userSize     = r.userSize()
-		bufRecs      = p.bufferedRecords.Add(1)
-		bufBytes     = p.bufferedBytes.Add(userSize)
-		overMaxRecs  = bufRecs > cl.cfg.maxBufferedRecords
-		overMaxBytes bool
-	)
-	if cl.cfg.maxBufferedBytes > 0 {
-		if userSize > cl.cfg.maxBufferedBytes {
-			p.promiseRecord(promisedRec{ctx, promise, r}, kerr.MessageTooLarge)
-			return
-		}
-		overMaxBytes = bufBytes > cl.cfg.maxBufferedBytes
-	}
-
+	// We can now fail the rec after the buffered hook.
 	if r.Topic == "" {
-		p.promiseRecord(promisedRec{ctx, promise, r}, errNoTopic)
+		p.promiseRecordBeforeBuf(promisedRec{ctx, promise, r}, errNoTopic)
 		return
 	}
 	if cl.cfg.txnID != nil && !p.producingTxn.Load() {
-		p.promiseRecord(promisedRec{ctx, promise, r}, errNotInTransaction)
+		p.promiseRecordBeforeBuf(promisedRec{ctx, promise, r}, errNotInTransaction)
 		return
 	}
 
+	userSize := r.userSize()
+	if cl.cfg.maxBufferedBytes > 0 && userSize > cl.cfg.maxBufferedBytes {
+		p.promiseRecordBeforeBuf(promisedRec{ctx, promise, r}, kerr.MessageTooLarge)
+		return
+	}
+
+	// We have to grab the produce lock to check if this record will exceed
+	// configured limits. We try to keep the logic tight since this is
+	// effectively a global lock.
+	p.mu.Lock()
+	var (
+		nextBufRecs  = p.bufferedRecords + 1
+		nextBufBytes = p.bufferedBytes + userSize
+		overMaxRecs  = nextBufRecs > cl.cfg.maxBufferedRecords
+		overMaxBytes = cl.cfg.maxBufferedBytes > 0 && nextBufBytes > cl.cfg.maxBufferedBytes
+	)
 	if overMaxRecs || overMaxBytes {
+		if !block || cl.cfg.manualFlushing {
+			p.mu.Unlock()
+			p.promiseRecordBeforeBuf(promisedRec{ctx, promise, r}, ErrMaxBuffered)
+			return
+		}
+
+		// Before we potentially unlinger, add that we are blocked to
+		// ensure we do NOT start a linger anymore. We THEN wakeup
+		// anything that is actively lingering. Note that blocked is
+		// also used when finishing promises to see if we need to be
+		// notified.
+		p.blocked.Add(1)
+		p.blockedBytes += userSize
+		p.mu.Unlock()
+
 		cl.cfg.logger.Log(LogLevelDebug, "blocking Produce because we are either over max buffered records or max buffered bytes",
 			"over_max_records", overMaxRecs,
 			"over_max_bytes", overMaxBytes,
+			"at_recs", p.bufferedRecords,
+			"at_bytes", p.bufferedBytes,
 		)
-		// Before we potentially unlinger, add that we are blocked.
-		// Lingering always checks blocked, so we will not start a
-		// linger while we are blocked. We THEN wakeup anything that
-		// is actively lingering.
-		cl.producer.blocked.Add(1)
+
 		cl.unlingerDueToMaxRecsBuffered()
-		// If the client ctx cancels or the produce ctx cancels, we
-		// need to un-count our buffering of this record. We also need
-		// to drain a slot from the waitBuffer chan, which could be
-		// sent to right when we are erroring.
+
+		wait := make(chan struct{})
+		var quit bool
+		go func() {
+			defer close(wait)
+			p.mu.Lock()
+			defer p.mu.Unlock()
+			for !quit && (overMaxRecs || overMaxBytes) {
+				p.c.Wait()
+				nextBufRecs = p.bufferedRecords + 1
+				nextBufBytes = p.bufferedBytes + userSize
+				overMaxRecs = nextBufRecs > cl.cfg.maxBufferedRecords
+				overMaxBytes = cl.cfg.maxBufferedBytes > 0 && nextBufBytes > cl.cfg.maxBufferedBytes
+			}
+		}()
+
 		drainBuffered := func(err error) {
-			p.promiseRecord(promisedRec{ctx, promise, r}, err)
-			<-p.waitBuffer
-			cl.producer.blocked.Add(-1)
-		}
-		if !block || cl.cfg.manualFlushing {
-			drainBuffered(ErrMaxBuffered)
-			return
+			p.mu.Lock()                // lock to set quit and dec blockedBytes
+			p.blocked.Add(-1)          // ensure no more unneeded broadcasts
+			p.blockedBytes -= userSize // remove accounting for this "buffered" record
+			quit = true                // kill the goroutine just above
+			p.mu.Unlock()              // unlock to continue
+			p.c.Broadcast()            // ensure the goroutine just above wakes up
+			p.promiseRecordBeforeBuf(promisedRec{ctx, promise, r}, err) // finally, finish the promise and call any unbuffered hook
 		}
+
 		select {
-		case <-p.waitBuffer:
-			cl.cfg.logger.Log(LogLevelDebug, "Produce block signaled, continuing to produce")
-			cl.producer.blocked.Add(-1)
+		case <-wait:
+			cl.cfg.logger.Log(LogLevelDebug, "Produce block awoken, we now have space to produce, continuing to partition and produce")
+			p.mu.Lock()
+			p.blocked.Add(-1)
+			p.blockedBytes -= userSize
 		case <-cl.ctx.Done():
 			drainBuffered(ErrClientClosed)
 			cl.cfg.logger.Log(LogLevelDebug, "client ctx canceled while blocked in Produce, returning")
@@ -459,6 +493,9 @@ func (cl *Client) produce(
 			return
 		}
 	}
+	p.bufferedRecords = nextBufRecs
+	p.bufferedBytes = nextBufBytes
+	p.mu.Unlock()
 
 	cl.partitionRecord(promisedRec{ctx, promise, r})
 }
@@ -468,6 +505,7 @@ type batchPromise struct {
 	pid       int64
 	epoch     int16
 	attrs     RecordAttrs
+	beforeBuf bool
 	partition int32
 	recs      []promisedRec
 	err       error
@@ -483,6 +521,10 @@ func (p *producer) promiseRecord(pr promisedRec, err error) {
 	p.promiseBatch(batchPromise{recs: []promisedRec{pr}, err: err})
 }
 
+func (p *producer) promiseRecordBeforeBuf(pr promisedRec, err error) {
+	p.promiseBatch(batchPromise{recs: []promisedRec{pr}, beforeBuf: true, err: err})
+}
+
 func (p *producer) finishPromises(b batchPromise) {
 	cl := p.cl
 	var more bool
@@ -495,7 +537,7 @@ start:
 		pr.ProducerID = b.pid
 		pr.ProducerEpoch = b.epoch
 		pr.Attrs = b.attrs
-		cl.finishRecordPromise(pr, b.err)
+		cl.finishRecordPromise(pr, b.err, b.beforeBuf)
 		b.recs[i] = promisedRec{}
 	}
 	p.promisesMu.Unlock()
@@ -509,7 +551,7 @@ start:
 	}
 }
 
-func (cl *Client) finishRecordPromise(pr promisedRec, err error) {
+func (cl *Client) finishRecordPromise(pr promisedRec, err error, beforeBuffering bool) {
 	p := &cl.producer
 
 	if p.hooks != nil && len(p.hooks.unbuffered) > 0 {
@@ -519,22 +561,28 @@ func (cl *Client) finishRecordPromise(pr promisedRec, err error) {
 	}
 
 	// Capture user size before potential modification by the promise.
+	//
+	// We call the promise before finishing the flush notification,
+	// allowing users of Flush to know all buf recs are done by the
+	// time we notify flush below.
 	userSize := pr.userSize()
-	nowBufBytes := p.bufferedBytes.Add(-userSize)
-	nowBufRecs := p.bufferedRecords.Add(-1)
-	wasOverMaxRecs := nowBufRecs >= cl.cfg.maxBufferedRecords
-	wasOverMaxBytes := cl.cfg.maxBufferedBytes > 0 && nowBufBytes+userSize > cl.cfg.maxBufferedBytes
-
-	// We call the promise before finishing the record; this allows users
-	// of Flush to know that all buffered records are completely done
-	// before Flush returns.
 	pr.promise(pr.Record, err)
 
-	if wasOverMaxRecs || wasOverMaxBytes {
-		p.waitBuffer <- struct{}{}
-	} else if nowBufRecs == 0 && p.flushing.Load() > 0 {
-		p.mu.Lock()
-		p.mu.Unlock() //nolint:gocritic,staticcheck // We use the lock as a barrier, unlocking immediately is safe.
+	// If this record was never buffered, its size was never accounted
+	// for: return early. Note that we still call the buffered/unbuffered
+	// hooks, we just never account for the actual sizing.
+	if beforeBuffering {
+		return
+	}
+
+	// Keep the lock as tight as possible: the broadcast can come after.
+	p.mu.Lock()
+	p.bufferedBytes -= userSize
+	p.bufferedRecords--
+	broadcast := p.blocked.Load() > 0 || p.bufferedRecords == 0 && p.flushing.Load() > 0
+	p.mu.Unlock()
+
+	if broadcast {
 		p.c.Broadcast()
 	}
 }
@@ -1021,7 +1069,7 @@ func (cl *Client) Flush(ctx context.Context) error {
 		defer p.mu.Unlock()
 		defer close(done)
 
-		for !quit && p.bufferedRecords.Load() > 0 {
+		for !quit && p.bufferedRecords+int64(p.blocked.Load()) > 0 {
 			p.c.Wait()
 		}
 	}()
diff --git a/pkg/kgo/sink.go b/pkg/kgo/sink.go
index 49bff3b1..f5283ddf 100644
--- a/pkg/kgo/sink.go
+++ b/pkg/kgo/sink.go
@@ -258,7 +258,7 @@ func (s *sink) produce(sem <-chan struct{}) bool {
 	// We could have been triggered from a metadata update even though the
 	// user is not producing at all. If we have no buffered records, let's
 	// avoid potentially creating a producer ID.
-	if s.cl.producer.bufferedRecords.Load() == 0 {
+	if s.cl.BufferedProduceRecords() == 0 {
 		return false
 	}
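
To put the change in user-facing terms, a hedged usage sketch follows; it is not part of the patch, and the broker address, topic name, and sizes are made up. With blocking enabled (the default), asynchronous Produce calls park once MaxBufferedBytes is hit and, with this fix, every parked call re-checks the limit each time buffered records finish; Flush now also waits for those parked calls, since blocked produces are counted alongside buffered records.

package main

import (
	"context"
	"fmt"
	"sync/atomic"

	"github.com/twmb/franz-go/pkg/kgo"
)

func main() {
	// Assumed setup: a broker on localhost:9092 and an existing "events" topic.
	cl, err := kgo.NewClient(
		kgo.SeedBrokers("localhost:9092"),
		kgo.MaxBufferedBytes(5000), // deliberately tiny so produces block
	)
	if err != nil {
		panic(err)
	}
	defer cl.Close()

	ctx := context.Background()
	var failed atomic.Int64

	// Asynchronous produces buffer records; once the 5000-byte cap is hit,
	// each Produce call blocks until earlier records finish. With this patch,
	// every blocked call re-checks the limit on each wakeup, so none of them
	// can be stranded the way record C was in the issue.
	for i := 0; i < 1_000; i++ {
		rec := &kgo.Record{Topic: "events", Value: make([]byte, 100)}
		cl.Produce(ctx, rec, func(_ *kgo.Record, err error) {
			if err != nil {
				failed.Add(1)
			}
		})
	}

	// Flush now also accounts for produces still blocked waiting for buffer
	// space, not only records already admitted to the buffer.
	if err := cl.Flush(ctx); err != nil {
		fmt.Println("flush:", err)
	}
	fmt.Println("failed produces:", failed.Load())
}
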