From 70a98bca5e57f9d24b262efe0ba753ff313055c2 Mon Sep 17 00:00:00 2001 From: Radu Berinde Date: Fri, 18 Oct 2024 14:14:45 -0700 Subject: [PATCH] colblk: cache IndexBlockDecoder in block metadata MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ``` RandSeekInSST/v4/single-level-10 1.21µs ± 1% 1.25µs ± 5% +3.40% (p=0.005 n=8+8) RandSeekInSST/v4/two-level-10 2.06µs ± 4% 2.06µs ± 1% ~ (p=0.959 n=8+8) RandSeekInSST/v5/single-level-10 1.11µs ± 1% 1.06µs ± 1% -4.03% (p=0.000 n=8+8) RandSeekInSST/v5/two-level-10 1.79µs ± 5% 1.59µs ± 1% -11.45% (p=0.000 n=7+8) ``` --- sstable/block/buffer_pool.go | 6 +++--- sstable/colblk/data_block.go | 29 ++++++++++++++++++++++------- sstable/colblk/index_block.go | 17 ++++++++--------- sstable/colblk/index_block_test.go | 11 ++++++++--- sstable/colblk/keyspan_test.go | 2 +- sstable/layout.go | 2 +- sstable/reader.go | 25 +++++++++++++++++-------- 7 files changed, 60 insertions(+), 32 deletions(-) diff --git a/sstable/block/buffer_pool.go b/sstable/block/buffer_pool.go index bd7d22177b..38809b41ed 100644 --- a/sstable/block/buffer_pool.go +++ b/sstable/block/buffer_pool.go @@ -38,13 +38,13 @@ func (b Value) getInternalBuf() []byte { return b.v.Buf() } -// Get returns the byte slice for the block data. -func (b Value) Get() []byte { +// BlockData returns the byte slice for the block data. +func (b Value) BlockData() []byte { return b.getInternalBuf()[MetadataSize:] } // GetMetadata returns the block metadata. -func (b Value) GetMetadata() *Metadata { +func (b Value) BlockMetadata() *Metadata { return (*Metadata)(b.getInternalBuf()) } diff --git a/sstable/colblk/data_block.go b/sstable/colblk/data_block.go index c25c610bea..5e8ef7c87a 100644 --- a/sstable/colblk/data_block.go +++ b/sstable/colblk/data_block.go @@ -737,13 +737,11 @@ func (rw *DataBlockRewriter) RewriteSuffixes( return start, end, rewritten, nil } -// DataBlockDecoderSize is the size of a DataBlockDecoder struct. If allocating -// memory for a data block, the caller may want to additionally allocate memory -// for the corresponding DataBlockDecoder. -const DataBlockDecoderSize = unsafe.Sizeof(DataBlockDecoder{}) - // Assert that a DataBlockDecoder can fit inside block.Metadata. -const _ uint = block.MetadataSize - uint(DataBlockDecoderSize) +const _ uint = block.MetadataSize - uint(unsafe.Sizeof(DataBlockDecoder{})) + +// Assert that an IndexBlockDecoder can fit inside block.Metadata. +const _ uint = block.MetadataSize - uint(unsafe.Sizeof(IndexBlockDecoder{})) // InitDataBlockMetadata initializes the metadata for a data block. func InitDataBlockMetadata(schema KeySchema, md *block.Metadata, data []byte) (err error) { @@ -763,6 +761,23 @@ func InitDataBlockMetadata(schema KeySchema, md *block.Metadata, data []byte) (e return nil } +// InitIndexBlockMetadata initializes the metadata for an index block. +func InitIndexBlockMetadata(md *block.Metadata, data []byte) (err error) { + if uintptr(unsafe.Pointer(md))%8 != 0 { + return errors.AssertionFailedf("metadata is not 8-byte aligned") + } + d := (*IndexBlockDecoder)(unsafe.Pointer(md)) + // Initialization can panic; convert panics to corruption errors (so higher + // layers can add file number and offset information). + defer func() { + if r := recover(); r != nil { + err = base.CorruptionErrorf("error initializing data block metadata: %v", r) + } + }() + d.Init(data) + return nil +} + // A DataBlockDecoder holds state for interpreting a columnar data block. It may // be shared among multiple DataBlockIters. type DataBlockDecoder struct { @@ -899,7 +914,7 @@ func (i *DataBlockIter) InitOnce( } // Init initializes the data block iterator, configuring it to read from the -// provided reader. +// provided decoder. func (i *DataBlockIter) Init(d *DataBlockDecoder, transforms block.IterTransforms) error { i.d = d // Leave i.h unchanged. diff --git a/sstable/colblk/index_block.go b/sstable/colblk/index_block.go index 2fc7d294e3..de46035d5a 100644 --- a/sstable/colblk/index_block.go +++ b/sstable/colblk/index_block.go @@ -6,6 +6,7 @@ package colblk import ( "slices" + "unsafe" "github.com/cockroachdb/errors" "github.com/cockroachdb/pebble/internal/base" @@ -192,7 +193,9 @@ type IndexIter struct { syntheticSuffix block.SyntheticSuffix noTransforms bool - h block.BufferHandle + h block.BufferHandle + // TODO(radu): remove allocDecoder and require any Init callers to provide the + // decoder. allocDecoder IndexBlockDecoder keyBuf []byte } @@ -200,7 +203,7 @@ type IndexIter struct { // Assert that IndexIter satisfies the block.IndexBlockIterator interface. var _ block.IndexBlockIterator = (*IndexIter)(nil) -// InitWithDecoder initializes an index iterator from the provided reader. +// InitWithDecoder initializes an index iterator from the provided decoder. func (i *IndexIter) InitWithDecoder( compare base.Compare, split base.Split, d *IndexBlockDecoder, transforms block.IterTransforms, ) { @@ -209,6 +212,7 @@ func (i *IndexIter) InitWithDecoder( split: split, d: d, n: int(d.bd.header.Rows), + row: -1, h: i.h, allocDecoder: i.allocDecoder, keyBuf: i.keyBuf, @@ -233,15 +237,10 @@ func (i *IndexIter) Init( func (i *IndexIter) InitHandle( cmp base.Compare, split base.Split, blk block.BufferHandle, transforms block.IterTransforms, ) error { - // TODO(jackson): If block.h != nil, use a *IndexBlockDecoder that's allocated - // when the block is loaded into the block cache. On cache hits, this will - // reduce the amount of setup necessary to use an iterator. (It's relatively - // common to open an iterator and perform just a few seeks, so avoiding the - // overhead can be material.) i.h.Release() i.h = blk - i.allocDecoder.Init(i.h.BlockData()) - i.InitWithDecoder(cmp, split, &i.allocDecoder, transforms) + d := (*IndexBlockDecoder)(unsafe.Pointer(blk.BlockMetadata())) + i.InitWithDecoder(cmp, split, d, transforms) return nil } diff --git a/sstable/colblk/index_block_test.go b/sstable/colblk/index_block_test.go index 6c1bad3a6b..9435f7e355 100644 --- a/sstable/colblk/index_block_test.go +++ b/sstable/colblk/index_block_test.go @@ -11,6 +11,7 @@ import ( "strings" "sync" "testing" + "unsafe" "github.com/cockroachdb/datadriven" "github.com/cockroachdb/pebble/internal/base" @@ -115,12 +116,16 @@ func TestIndexIterInitHandle(t *testing.T) { bh2 := block.Handle{Offset: 2008, Length: 1000} w.AddBlockHandle([]byte("a"), bh1, nil) w.AddBlockHandle([]byte("b"), bh2, nil) - b := w.Finish(w.Rows()) + blockData := w.Finish(w.Rows()) c := cache.New(10 << 10) defer c.Unref() - v := block.Alloc(len(b), nil) - copy(v.Get(), b) + + v := block.Alloc(block.MetadataSize+len(blockData), nil) + copy(v.BlockData(), blockData) + d := (*IndexBlockDecoder)(unsafe.Pointer(v.BlockMetadata())) + d.Init(blockData) + v.MakeHandle(c, cache.ID(1), base.DiskFileNum(1), 0).Release() getBlockAndIterate := func(it *IndexIter) { diff --git a/sstable/colblk/keyspan_test.go b/sstable/colblk/keyspan_test.go index 1cb0016bd9..b938d70553 100644 --- a/sstable/colblk/keyspan_test.go +++ b/sstable/colblk/keyspan_test.go @@ -85,7 +85,7 @@ func TestKeyspanBlockPooling(t *testing.T) { c := cache.New(10 << 10) defer c.Unref() v := block.Alloc(len(b), nil) - copy(v.Get(), b) + copy(v.BlockData(), b) v.MakeHandle(c, cache.ID(1), base.DiskFileNum(1), 0).Release() getBlockAndIterate := func() { diff --git a/sstable/layout.go b/sstable/layout.go index 98be8a88f4..72376a091a 100644 --- a/sstable/layout.go +++ b/sstable/layout.go @@ -321,7 +321,7 @@ var ( ) func formatColblkIndexBlock(tp treeprinter.Node, r *Reader, b NamedBlockHandle, data []byte) error { - iter := new(colblk.IndexIter) + var iter colblk.IndexIter if err := iter.Init(r.Compare, r.Split, data, NoTransforms); err != nil { return err } diff --git a/sstable/reader.go b/sstable/reader.go index 132ea82ba3..49317acf8a 100644 --- a/sstable/reader.go +++ b/sstable/reader.go @@ -393,7 +393,16 @@ func (r *Reader) readIndexBlock( ctx context.Context, env readBlockEnv, readHandle objstorage.ReadHandle, bh block.Handle, ) (block.BufferHandle, error) { ctx = objiotracing.WithBlockType(ctx, objiotracing.MetadataBlock) - return r.readBlockInternal(ctx, env, readHandle, bh, noInitBlockMetadataFn) + return r.readBlockInternal(ctx, env, readHandle, bh, r.initIndexBlockMetadata) +} + +// initIndexBlockMetadata initializes the Metadata for a data block. This will +// later be used (and reused) when reading from the block. +func (r *Reader) initIndexBlockMetadata(metadata *block.Metadata, data []byte) error { + if r.tableFormat.BlockColumnar() { + return colblk.InitIndexBlockMetadata(metadata, data) + } + return nil } func (r *Reader) readDataBlock( @@ -507,9 +516,9 @@ func (r *Reader) readBlockInternal( readStopwatch := makeStopwatch() var err error if readHandle != nil { - err = readHandle.ReadAt(ctx, compressed.Get(), int64(bh.Offset)) + err = readHandle.ReadAt(ctx, compressed.BlockData(), int64(bh.Offset)) } else { - err = r.readable.ReadAt(ctx, compressed.Get(), int64(bh.Offset)) + err = r.readable.ReadAt(ctx, compressed.BlockData(), int64(bh.Offset)) } readDuration := readStopwatch.stop() // Call IsTracingEnabled to avoid the allocations of boxing integers into an @@ -528,12 +537,12 @@ func (r *Reader) readBlockInternal( return block.BufferHandle{}, err } env.BlockRead(bh.Length, readDuration) - if err := checkChecksum(r.checksumType, compressed.Get(), bh, r.cacheOpts.FileNum); err != nil { + if err := checkChecksum(r.checksumType, compressed.BlockData(), bh, r.cacheOpts.FileNum); err != nil { compressed.Release() return block.BufferHandle{}, err } - typ := block.CompressionIndicator(compressed.Get()[bh.Length]) + typ := block.CompressionIndicator(compressed.BlockData()[bh.Length]) compressed.Truncate(int(bh.Length)) var decompressed block.Value @@ -541,21 +550,21 @@ func (r *Reader) readBlockInternal( decompressed = compressed } else { // Decode the length of the decompressed value. - decodedLen, prefixLen, err := block.DecompressedLen(typ, compressed.Get()) + decodedLen, prefixLen, err := block.DecompressedLen(typ, compressed.BlockData()) if err != nil { compressed.Release() return block.BufferHandle{}, err } decompressed = block.Alloc(decodedLen, env.BufferPool) - err = block.DecompressInto(typ, compressed.Get()[prefixLen:], decompressed.Get()) + err = block.DecompressInto(typ, compressed.BlockData()[prefixLen:], decompressed.BlockData()) compressed.Release() if err != nil { decompressed.Release() return block.BufferHandle{}, err } } - if err := initBlockMetadataFn(decompressed.GetMetadata(), decompressed.Get()); err != nil { + if err := initBlockMetadataFn(decompressed.BlockMetadata(), decompressed.BlockData()); err != nil { decompressed.Release() return block.BufferHandle{}, err }