Skip to content

Commit

Permalink
colblk: improve Uint encoding for very few rows
Browse files Browse the repository at this point in the history
We currently choose the best uint encoding assuming we have a decent
number of rows. For fewer than 8 rows, the best encoding might be
different: e.g. a non-delta encoding with 2 bytes per row can be
smaller than an 8-byte delta base plus 1 byte per row. In rare cases,
this can make blocks *decrease* in size when we add a row, which can be
confusing (especially in tests).

This change takes the number of rows into account to get the smallest
possible encoding.
  • Loading branch information
RaduBerinde committed Oct 15, 2024
1 parent fbbf740 commit 88babd7
Show file tree
Hide file tree
Showing 11 changed files with 707 additions and 563 deletions.
6 changes: 3 additions & 3 deletions sstable/colblk/prefix_bytes.go
Original file line number Diff line number Diff line change
Expand Up @@ -822,7 +822,7 @@ func (b *PrefixBytesBuilder) Put(key []byte, bytesSharedWithPrev int) {
currentBundlePrefixOffset: 1,
completedBundleLen: 0,
compressedDataLen: len(key),
offsetEncoding: DetermineUintEncoding(0, uint64(len(key))),
offsetEncoding: DetermineUintEncoding(0, uint64(len(key)), UintEncodingRowThreshold),
}
case b.nKeys&(b.bundleSize-1) == 0:
// We're starting a new bundle.
Expand Down Expand Up @@ -855,7 +855,7 @@ func (b *PrefixBytesBuilder) Put(key []byte, bytesSharedWithPrev int) {
currentBundleDistinctKeys: 1,
compressedDataLen: completedBundleSize + len(key) - (b.bundleCount(b.nKeys)-1)*blockPrefixLen,
}
curr.offsetEncoding = DetermineUintEncoding(0, uint64(curr.compressedDataLen))
curr.offsetEncoding = DetermineUintEncoding(0, uint64(curr.compressedDataLen), UintEncodingRowThreshold)
b.data = append(b.data, key...)
b.addOffset(0) // Placeholder for bundle prefix.
b.addOffset(uint32(len(b.data)))
Expand Down Expand Up @@ -897,7 +897,7 @@ func (b *PrefixBytesBuilder) Put(key []byte, bytesSharedWithPrev int) {
curr.compressedDataLen -= (b.bundleCount(b.nKeys) - 1) * curr.blockPrefixLen
// The compressedDataLen is the largest offset we'll need to encode in the
// offset table.
curr.offsetEncoding = DetermineUintEncoding(0, uint64(curr.compressedDataLen))
curr.offsetEncoding = DetermineUintEncoding(0, uint64(curr.compressedDataLen), UintEncodingRowThreshold)
b.data = append(b.data, key...)
b.addOffset(uint32(len(b.data)))
}
Expand Down
96 changes: 50 additions & 46 deletions sstable/colblk/testdata/data_block/rewrite_suffixes
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ write
poi@12#0,SET:big
yaya@5#0,SET:mini
----
size=110:
size=102:
0: prefixes: prefixbytes(16): 2 keys
1: suffixes: bytes: 2 rows set; 5 bytes in data
2: trailers: uint: 2 rows
Expand All @@ -38,13 +38,13 @@ data block header
│ ├── 021-022: b 00000010 # col 2: uint
│ ├── 022-026: x 44000000 # col 2: page start 68
│ ├── 026-027: b 00000001 # col 3: bool
│ ├── 027-031: x 4d000000 # col 3: page start 77
│ ├── 027-031: x 47000000 # col 3: page start 71
│ ├── 031-032: b 00000011 # col 4: bytes
│ ├── 032-036: x 60000000 # col 4: page start 96
│ ├── 032-036: x 58000000 # col 4: page start 88
│ ├── 036-037: b 00000001 # col 5: bool
│ ├── 037-041: x 6b000000 # col 5: page start 107
│ ├── 037-041: x 63000000 # col 5: page start 99
│ ├── 041-042: b 00000001 # col 6: bool
│ └── 042-046: x 6c000000 # col 6: page start 108
│ └── 042-046: x 64000000 # col 6: page start 100
├── data for column 0 (prefixbytes)
│ ├── 046-047: x 04 # bundle size: 16
│ ├── offsets table
Expand All @@ -68,27 +68,27 @@ data block header
│ ├── 063-066: x 403132 # data[0]: @12
│ └── 066-068: x 4035 # data[1]: @5
├── data for column 2 (uint)
│ ├── 068-069: x 80 # encoding: const
│ └── 069-077: x 0100000000000000 # 64-bit constant: 1
│ ├── 068-069: x 01 # encoding: 1b
│ ├── 069-070: x 01 # data[0] = 1
│ └── 070-071: x 01 # data[1] = 1
├── data for column 3 (bool)
│ ├── 077-078: x 00 # default bitmap encoding
│ ├── 078-080: x 0000 # padding to align to 64-bit boundary
│ ├── 080-088: b 0000001100000000000000000000000000000000000000000000000000000000 # bitmap word 0
│ └── 088-096: b 0000000100000000000000000000000000000000000000000000000000000000 # bitmap summary word 0-63
│ ├── 071-072: x 00 # default bitmap encoding
│ ├── 072-080: b 0000001100000000000000000000000000000000000000000000000000000000 # bitmap word 0
│ └── 080-088: b 0000000100000000000000000000000000000000000000000000000000000000 # bitmap summary word 0-63
├── data for column 4 (bytes)
│ ├── offsets table
│ │ ├── 096-097: x 01 # encoding: 1b
│ │ ├── 097-098: x 00 # data[0] = 0 [100 overall]
│ │ ├── 098-099: x 03 # data[1] = 3 [103 overall]
│ │ └── 099-100: x 07 # data[2] = 7 [107 overall]
│ │ ├── 088-089: x 01 # encoding: 1b
│ │ ├── 089-090: x 00 # data[0] = 0 [92 overall]
│ │ ├── 090-091: x 03 # data[1] = 3 [95 overall]
│ │ └── 091-092: x 07 # data[2] = 7 [99 overall]
│ └── data
│ ├── 100-103: x 626967 # data[0]: big
│ └── 103-107: x 6d696e69 # data[1]: mini
│ ├── 092-095: x 626967 # data[0]: big
│ └── 095-099: x 6d696e69 # data[1]: mini
├── data for column 5 (bool)
│ └── 107-108: x 01 # zero bitmap encoding
│ └── 099-100: x 01 # zero bitmap encoding
├── data for column 6 (bool)
│ └── 108-109: x 01 # zero bitmap encoding
└── 109-110: x 00 # block padding byte
│ └── 100-101: x 01 # zero bitmap encoding
└── 101-102: x 00 # block padding byte

rewrite from=@12 to=@22
----
Expand Down Expand Up @@ -135,7 +135,7 @@ data block header
│ ├── 021-022: b 00000010 # col 2: uint
│ ├── 022-026: x 4d000000 # col 2: page start 77
│ ├── 026-027: b 00000001 # col 3: bool
│ ├── 027-031: x 56000000 # col 3: page start 86
│ ├── 027-031: x 51000000 # col 3: page start 81
│ ├── 031-032: b 00000011 # col 4: bytes
│ ├── 032-036: x 68000000 # col 4: page start 104
│ ├── 036-037: b 00000001 # col 5: bool
Expand Down Expand Up @@ -169,11 +169,13 @@ data block header
│ ├── 073-075: x 4036 # data[1]: @6
│ └── 075-077: x 4036 # data[2]: @6
├── data for column 2 (uint)
│ ├── 077-078: x 80 # encoding: const
│ └── 078-086: x 0100000000000000 # 64-bit constant: 1
│ ├── 077-078: x 01 # encoding: 1b
│ ├── 078-079: x 01 # data[0] = 1
│ ├── 079-080: x 01 # data[1] = 1
│ └── 080-081: x 01 # data[2] = 1
├── data for column 3 (bool)
│ ├── 086-087: x 00 # default bitmap encoding
│ ├── 087-088: x 00 # padding to align to 64-bit boundary
│ ├── 081-082: x 00 # default bitmap encoding
│ ├── 082-088: x 000000000000 # padding to align to 64-bit boundary
│ ├── 088-096: b 0000011100000000000000000000000000000000000000000000000000000000 # bitmap word 0
│ └── 096-104: b 0000000100000000000000000000000000000000000000000000000000000000 # bitmap summary word 0-63
├── data for column 4 (bytes)
Expand Down Expand Up @@ -210,13 +212,13 @@ data block header
│ ├── 021-022: b 00000010 # col 2: uint
│ ├── 022-026: x 50000000 # col 2: page start 80
│ ├── 026-027: b 00000001 # col 3: bool
│ ├── 027-031: x 59000000 # col 3: page start 89
│ ├── 027-031: x 54000000 # col 3: page start 84
│ ├── 031-032: b 00000011 # col 4: bytes
│ ├── 032-036: x 70000000 # col 4: page start 112
│ ├── 032-036: x 68000000 # col 4: page start 104
│ ├── 036-037: b 00000001 # col 5: bool
│ ├── 037-041: x 7e000000 # col 5: page start 126
│ ├── 037-041: x 76000000 # col 5: page start 118
│ ├── 041-042: b 00000001 # col 6: bool
│ └── 042-046: x 7f000000 # col 6: page start 127
│ └── 042-046: x 77000000 # col 6: page start 119
├── data for column 0 (prefixbytes)
│ ├── 046-047: x 04 # bundle size: 16
│ ├── offsets table
Expand Down Expand Up @@ -244,26 +246,28 @@ data block header
│ ├── 074-077: x 403534 # data[1]: @54
│ └── 077-080: x 403534 # data[2]: @54
├── data for column 2 (uint)
│ ├── 080-081: x 80 # encoding: const
│ └── 081-089: x 0100000000000000 # 64-bit constant: 1
│ ├── 080-081: x 01 # encoding: 1b
│ ├── 081-082: x 01 # data[0] = 1
│ ├── 082-083: x 01 # data[1] = 1
│ └── 083-084: x 01 # data[2] = 1
├── data for column 3 (bool)
│ ├── 089-090: x 00 # default bitmap encoding
│ ├── 090-096: x 000000000000 # padding to align to 64-bit boundary
│ ├── 096-104: b 0000011100000000000000000000000000000000000000000000000000000000 # bitmap word 0
│ └── 104-112: b 0000000100000000000000000000000000000000000000000000000000000000 # bitmap summary word 0-63
│ ├── 084-085: x 00 # default bitmap encoding
│ ├── 085-088: x 000000 # padding to align to 64-bit boundary
│ ├── 088-096: b 0000011100000000000000000000000000000000000000000000000000000000 # bitmap word 0
│ └── 096-104: b 0000000100000000000000000000000000000000000000000000000000000000 # bitmap summary word 0-63
├── data for column 4 (bytes)
│ ├── offsets table
│ │ ├── 112-113: x 01 # encoding: 1b
│ │ ├── 113-114: x 00 # data[0] = 0 [117 overall]
│ │ ├── 114-115: x 03 # data[1] = 3 [120 overall]
│ │ ├── 115-116: x 06 # data[2] = 6 [123 overall]
│ │ └── 116-117: x 09 # data[3] = 9 [126 overall]
│ │ ├── 104-105: x 01 # encoding: 1b
│ │ ├── 105-106: x 00 # data[0] = 0 [109 overall]
│ │ ├── 106-107: x 03 # data[1] = 3 [112 overall]
│ │ ├── 107-108: x 06 # data[2] = 6 [115 overall]
│ │ └── 108-109: x 09 # data[3] = 9 [118 overall]
│ └── data
│ ├── 117-120: x 666f6f # data[0]: foo
│ ├── 120-123: x 626172 # data[1]: bar
│ └── 123-126: x 626178 # data[2]: bax
│ ├── 109-112: x 666f6f # data[0]: foo
│ ├── 112-115: x 626172 # data[1]: bar
│ └── 115-118: x 626178 # data[2]: bax
├── data for column 5 (bool)
│ └── 126-127: x 01 # zero bitmap encoding
│ └── 118-119: x 01 # zero bitmap encoding
├── data for column 6 (bool)
│ └── 127-128: x 01 # zero bitmap encoding
└── 128-129: x 00 # block padding byte
│ └── 119-120: x 01 # zero bitmap encoding
└── 120-121: x 00 # block padding byte
30 changes: 15 additions & 15 deletions sstable/colblk/testdata/keyspan_block
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ size=37:
add
a-b:{(#0,RANGEDEL)}
----
size=57:
size=50:
0: user keys: bytes: 2 rows set; 2 bytes in data
1: start indices: uint: 2 rows
2: trailers: uint: 1 rows
Expand Down Expand Up @@ -232,7 +232,7 @@ size=37:
add
b-d:{(#4,RANGEKEYSET,@3,coconut)}
----
size=70:
size=64:
0: user keys: bytes: 2 rows set; 2 bytes in data
1: start indices: uint: 2 rows
2: trailers: uint: 1 rows
Expand All @@ -256,9 +256,9 @@ keyspan-decoder
│ ├── 21-22: b 00000010 # col 2: uint
│ ├── 22-26: x 2d000000 # col 2: page start 45
│ ├── 26-27: b 00000011 # col 3: bytes
│ ├── 27-31: x 36000000 # col 3: page start 54
│ ├── 27-31: x 30000000 # col 3: page start 48
│ ├── 31-32: b 00000011 # col 4: bytes
│ └── 32-36: x 3b000000 # col 4: page start 59
│ └── 32-36: x 35000000 # col 4: page start 53
├── data for column 0 (bytes)
│ ├── offsets table
│ │ ├── 36-37: x 01 # encoding: 1b
Expand All @@ -273,23 +273,23 @@ keyspan-decoder
│ ├── 43-44: x 00 # data[0] = 0
│ └── 44-45: x 01 # data[1] = 1
├── data for column 2 (uint)
│ ├── 45-46: x 80 # encoding: const
│ └── 46-54: x 1504000000000000 # 64-bit constant: 1045
│ ├── 45-46: x 02 # encoding: 2b
│ └── 46-48: x 1504 # data[0] = 1045
├── data for column 3 (bytes)
│ ├── offsets table
│ │ ├── 54-55: x 01 # encoding: 1b
│ │ ├── 55-56: x 00 # data[0] = 0 [57 overall]
│ │ └── 56-57: x 02 # data[1] = 2 [59 overall]
│ │ ├── 48-49: x 01 # encoding: 1b
│ │ ├── 49-50: x 00 # data[0] = 0 [51 overall]
│ │ └── 50-51: x 02 # data[1] = 2 [53 overall]
│ └── data
│ └── 57-59: x 4033 # data[0]: @3
│ └── 51-53: x 4033 # data[0]: @3
├── data for column 4 (bytes)
│ ├── offsets table
│ │ ├── 59-60: x 01 # encoding: 1b
│ │ ├── 60-61: x 00 # data[0] = 0 [62 overall]
│ │ └── 61-62: x 07 # data[1] = 7 [69 overall]
│ │ ├── 53-54: x 01 # encoding: 1b
│ │ ├── 54-55: x 00 # data[0] = 0 [56 overall]
│ │ └── 55-56: x 07 # data[1] = 7 [63 overall]
│ └── data
│ └── 62-69: x 636f636f6e7574 # data[0]: coconut
└── 69-70: x 00 # block padding byte
│ └── 56-63: x 636f636f6e7574 # data[0]: coconut
└── 63-64: x 00 # block padding byte

iter
seek-ge a
Expand Down
11 changes: 8 additions & 3 deletions sstable/colblk/testdata/uints
Original file line number Diff line number Diff line change
Expand Up @@ -495,13 +495,18 @@ write

size rows=(6)
----
Size(6, 0) = 9
Size(6, 0) = 7

finish rows=6
----
uints
├── 0-1: x 80 # encoding: const
└── 1-9: x 0100000000000000 # 64-bit constant: 1
├── 0-1: x 01 # encoding: 1b
├── 1-2: x 01 # data[0] = 1
├── 2-3: x 01 # data[1] = 1
├── 3-4: x 01 # data[2] = 1
├── 4-5: x 01 # data[3] = 1
├── 5-6: x 01 # data[4] = 1
└── 6-7: x 01 # data[5] = 1

# Test 32-bit delta encoding.

Expand Down
Loading

0 comments on commit 88babd7

Please sign in to comment.