From 8b85d783f22c3cd5a46d8eb6727ca59d567a217e Mon Sep 17 00:00:00 2001 From: Klaus Post Date: Mon, 7 Aug 2023 09:28:04 +0200 Subject: [PATCH 01/11] WIP: S2++ If the first bytes of a block is `0x40, 0x00` (repeat, length 4), this indicates that all [Copy with 4-byte offset (11)](https://github.com/google/snappy/blob/main/format_description.txt#L106) are all 3 bytes instead for the remainder of the block. There can be no literals before this tag and no repeats before a match as specified above. This will only trigger on this exact tag. > These are like the copies with 2-byte offsets (see previous subsection), > except that the offset is stored as a 24-bit integer instead of a > 16-bit integer (and thus will occupy three bytes). When in this mode the maximum backreference offset is 16777215. This *cannot* be combined with dictionaries. --- s2/README.md | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/s2/README.md b/s2/README.md index 8284bb0810..e7a76d40ab 100644 --- a/s2/README.md +++ b/s2/README.md @@ -1022,6 +1022,7 @@ See [using indexes](https://github.com/klauspost/compress/tree/master/s2#using-i * Frame [Stream identifier](https://github.com/google/snappy/blob/master/framing_format.txt#L68) changed from `sNaPpY` to `S2sTwO`. * [Framed compressed blocks](https://github.com/google/snappy/blob/master/format_description.txt) can be up to 4MB (up from 64KB). * Compressed blocks can have an offset of `0`, which indicates to repeat the last seen offset. +* If the first bytes of a block is `0x80, 0x00, 0x00` (copy, 2 byte offset = 0), this indicates that all [Copy with 4-byte offset (11)](https://github.com/google/snappy/blob/main/format_description.txt#L106) are all 3 bytes instead for the remainder of the block. Repeat offsets must be encoded as a [2.2.1. Copy with 1-byte offset (01)](https://github.com/google/snappy/blob/master/format_description.txt#L89), where the offset is 0. 
@@ -1047,6 +1048,19 @@ The first copy of a block cannot be a repeat offset and the offset is reset on e Default streaming block size is 1MB. +## 3 Byte Offsets + +If the first bytes of a block is `0x80, 0x00, 0x00` (copy, 2 byte offset = 0), this indicates that all [Copy with 4-byte offset (11)](https://github.com/google/snappy/blob/main/format_description.txt#L106) are all 3 bytes instead for the remainder of the block. + +There can be no literals before this tag and no repeats before a match as specified above. +This will only trigger on this exact tag. + +> These are like the copies with 2-byte offsets (see previous subsection), +> except that the offset is stored as a 24-bit integer instead of a +> 16-bit integer (and thus will occupy three bytes). + +When in this mode the maximum backreference offset is 16777215. + # Dictionary Encoding Adding dictionaries allow providing a custom dictionary that will serve as lookup in the beginning of blocks. From 6729fa1539575a19f291ca49a7cde7665d60e36d Mon Sep 17 00:00:00 2001 From: Klaus Post Date: Mon, 7 Aug 2023 11:08:28 +0200 Subject: [PATCH 02/11] Add length estimation to noasm (revert this) --- s2/encode_best.go | 8 ++++---- s2/encode_go.go | 19 +++++++------------ s2/writer.go | 16 +++++++++++++--- 3 files changed, 24 insertions(+), 19 deletions(-) diff --git a/s2/encode_best.go b/s2/encode_best.go index 1d13e869a1..836096b835 100644 --- a/s2/encode_best.go +++ b/s2/encode_best.go @@ -718,14 +718,14 @@ func emitCopySize(offset, length int) int { length -= 64 if length >= 4 { // Emit remaining as repeats - return 5 + emitRepeatSize(offset, length) + return 4 + emitRepeatSize(offset, length) } - i = 5 + i = 4 } if length == 0 { return i } - return i + 5 + return i + 4 } // Offset no more than 2 bytes. 
@@ -752,7 +752,7 @@ func emitCopySize(offset, length int) int { // 4 <= length && length <= 1 << 24 func emitCopyNoRepeatSize(offset, length int) int { if offset >= 65536 { - return 5 + 5*(length/64) + return 4 + 4*(length/64) } // Offset no more than 2 bytes. diff --git a/s2/encode_go.go b/s2/encode_go.go index 0d39c7b0e0..e4acf93dac 100644 --- a/s2/encode_go.go +++ b/s2/encode_go.go @@ -92,12 +92,11 @@ func emitLiteral(dst, lit []byte) int { dst[0] = 62<<2 | tagLiteral i = 4 default: - dst[4] = uint8(n >> 24) dst[3] = uint8(n >> 16) dst[2] = uint8(n >> 8) dst[1] = uint8(n) dst[0] = 63<<2 | tagLiteral - i = 5 + i = 4 } return i + copy(dst[i:], lit) } @@ -163,7 +162,6 @@ func emitCopy(dst []byte, offset, length int) int { i := 0 if length > 64 { // Emit a length 64 copy, encoded as 5 bytes. - dst[4] = uint8(offset >> 24) dst[3] = uint8(offset >> 16) dst[2] = uint8(offset >> 8) dst[1] = uint8(offset) @@ -171,9 +169,9 @@ func emitCopy(dst []byte, offset, length int) int { length -= 64 if length >= 4 { // Emit remaining as repeats - return 5 + emitRepeat(dst[5:], offset, length) + return 4 + emitRepeat(dst[4:], offset, length) } - i = 5 + i = 4 } if length == 0 { return i @@ -183,8 +181,7 @@ func emitCopy(dst []byte, offset, length int) int { dst[i+1] = uint8(offset) dst[i+2] = uint8(offset >> 8) dst[i+3] = uint8(offset >> 16) - dst[i+4] = uint8(offset >> 24) - return i + 5 + return i + 4 } // Offset no more than 2 bytes. @@ -232,7 +229,6 @@ func emitCopyNoRepeat(dst []byte, offset, length int) int { i := 0 if length > 64 { // Emit a length 64 copy, encoded as 5 bytes. 
- dst[4] = uint8(offset >> 24) dst[3] = uint8(offset >> 16) dst[2] = uint8(offset >> 8) dst[1] = uint8(offset) @@ -240,9 +236,9 @@ func emitCopyNoRepeat(dst []byte, offset, length int) int { length -= 64 if length >= 4 { // Emit remaining as repeats - return 5 + emitCopyNoRepeat(dst[5:], offset, length) + return 4 + emitCopyNoRepeat(dst[4:], offset, length) } - i = 5 + i = 4 } if length == 0 { return i @@ -252,8 +248,7 @@ func emitCopyNoRepeat(dst []byte, offset, length int) int { dst[i+1] = uint8(offset) dst[i+2] = uint8(offset >> 8) dst[i+3] = uint8(offset >> 16) - dst[i+4] = uint8(offset >> 24) - return i + 5 + return i + 4 } // Offset no more than 2 bytes. diff --git a/s2/writer.go b/s2/writer.go index 089cd36d8c..89266797af 100644 --- a/s2/writer.go +++ b/s2/writer.go @@ -448,13 +448,23 @@ func (w *Writer) encodeBlock(obuf, uncompressed []byte) int { } return 0 } + adjust := func(n int) int { + if n <= 0 { + return 0 + } + n += 3 + if n >= len(uncompressed) { + return 0 + } + return n + } switch w.level { case levelFast: - return encodeBlock(obuf, uncompressed) + return adjust(encodeBlock(obuf, uncompressed)) case levelBetter: - return encodeBlockBetter(obuf, uncompressed) + return adjust(encodeBlockBetter(obuf, uncompressed)) case levelBest: - return encodeBlockBest(obuf, uncompressed, nil) + return adjust(encodeBlockBest(obuf, uncompressed, nil)) } return 0 } From e1915757d175c2aeb52fe9a2b1402e35a469e821 Mon Sep 17 00:00:00 2001 From: Klaus Post Date: Tue, 29 Aug 2023 14:07:24 +0200 Subject: [PATCH 03/11] Do not bail, except length 4 --- s2/encode_best.go | 2 +- s2/encode_better.go | 2 +- s2/encode_go.go | 4 ++-- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/s2/encode_best.go b/s2/encode_best.go index 836096b835..5827990e14 100644 --- a/s2/encode_best.go +++ b/s2/encode_best.go @@ -375,7 +375,7 @@ func encodeBlockBest(dst, src []byte, dict *Dict) (d int) { offset := s - best.offset s += best.length - if offset > 65535 && s-base <= 5 && 
!best.rep { + if offset > 65535 && s-base <= 4 && !best.rep { // Bail if the match is equal or worse to the encoding. s = best.s + 1 if s >= sLimit { diff --git a/s2/encode_better.go b/s2/encode_better.go index 544cb1e17b..6963fb5137 100644 --- a/s2/encode_better.go +++ b/s2/encode_better.go @@ -231,7 +231,7 @@ func encodeBlockBetterGo(dst, src []byte) (d int) { candidateL += 8 } - if offset > 65535 && s-base <= 5 && repeat != offset { + if offset > 65535 && s-base <= 4 && repeat != offset { // Bail if the match is equal or worse to the encoding. s = nextS + 1 if s >= sLimit { diff --git a/s2/encode_go.go b/s2/encode_go.go index e4acf93dac..83a5eb1b25 100644 --- a/s2/encode_go.go +++ b/s2/encode_go.go @@ -161,7 +161,7 @@ func emitCopy(dst []byte, offset, length int) int { if offset >= 65536 { i := 0 if length > 64 { - // Emit a length 64 copy, encoded as 5 bytes. + // Emit a length 64 copy, encoded as 4 bytes. dst[3] = uint8(offset >> 16) dst[2] = uint8(offset >> 8) dst[1] = uint8(offset) @@ -176,7 +176,7 @@ func emitCopy(dst []byte, offset, length int) int { if length == 0 { return i } - // Emit a copy, offset encoded as 4 bytes. + // Emit a copy, offset encoded as 3 bytes. dst[i+0] = uint8(length-1)<<2 | tagCopy4 dst[i+1] = uint8(offset) dst[i+2] = uint8(offset >> 8) From d1416ed88a1da3d723d0d630239f09cda8a5be51 Mon Sep 17 00:00:00 2001 From: Klaus Post Date: Tue, 29 Aug 2023 18:16:22 +0200 Subject: [PATCH 04/11] Better repeat codes. --- s2/README.md | 8 +++++++- s2/encode_best.go | 11 +++-------- s2/encode_go.go | 41 +++++++++++------------------------------ 3 files changed, 21 insertions(+), 39 deletions(-) diff --git a/s2/README.md b/s2/README.md index e7a76d40ab..0321bd4c50 100644 --- a/s2/README.md +++ b/s2/README.md @@ -1050,7 +1050,7 @@ Default streaming block size is 1MB. 
## 3 Byte Offsets -If the first bytes of a block is `0x80, 0x00, 0x00` (copy, 2 byte offset = 0), this indicates that all [Copy with 4-byte offset (11)](https://github.com/google/snappy/blob/main/format_description.txt#L106) are all 3 bytes instead for the remainder of the block. +If the first bytes of a block is `0x80, 0x00, 0x00` (copy, 2 byte offset = 0), this indicates that all [Copy with 4-byte offset (11)](https://github.com/google/snappy/blob/main/format_description.txt#L106) are all 3 bytes instead for the remainder of the block and literal value 63 is now a repeat code. There can be no literals before this tag and no repeats before a match as specified above. This will only trigger on this exact tag. @@ -1061,6 +1061,12 @@ This will only trigger on this exact tag. When in this mode the maximum backreference offset is 16777215. +Furthermore, encoding with literal code 63 no longer emits literals, but indicates a 1 byte repeat offset code. + +The next byte indicates the length of the repeat offset - minus one, so length 1 to 256 can be encoded as 2 bytes. + +Decode as such: `if tag == 63 { length = readByte() + 1 }`. + # Dictionary Encoding Adding dictionaries allow providing a custom dictionary that will serve as lookup in the beginning of blocks. diff --git a/s2/encode_best.go b/s2/encode_best.go index 5827990e14..63e5e781e9 100644 --- a/s2/encode_best.go +++ b/s2/encode_best.go @@ -770,21 +770,16 @@ func emitCopyNoRepeatSize(offset, length int) int { // emitRepeatSize returns the number of bytes required to encode a repeat. 
// Length must be at least 4 and < 1<<24 func emitRepeatSize(offset, length int) int { - // Repeat offset, make length cheaper - if length <= 4+4 || (length < 8+4 && offset < 2048) { + if length <= 256 { return 2 } - if length < (1<<8)+4+4 { - return 3 - } - if length < (1<<16)+(1<<8)+4 { + if length <= 65536 { return 4 } const maxRepeat = (1 << 24) - 1 - length -= (1 << 16) - 4 left := 0 if length > maxRepeat { - left = length - maxRepeat + 4 + left = length - maxRepeat } if left > 0 { return 5 + emitRepeatSize(offset, left) diff --git a/s2/encode_go.go b/s2/encode_go.go index 83a5eb1b25..76352db925 100644 --- a/s2/encode_go.go +++ b/s2/encode_go.go @@ -92,11 +92,7 @@ func emitLiteral(dst, lit []byte) int { dst[0] = 62<<2 | tagLiteral i = 4 default: - dst[3] = uint8(n >> 16) - dst[2] = uint8(n >> 8) - dst[1] = uint8(n) - dst[0] = 63<<2 | tagLiteral - i = 4 + panic("unreachable") } return i + copy(dst[i:], lit) } @@ -105,45 +101,30 @@ func emitLiteral(dst, lit []byte) int { // Length must be at least 4 and < 1<<24 func emitRepeat(dst []byte, offset, length int) int { // Repeat offset, make length cheaper - length -= 4 - if length <= 4 { - dst[0] = uint8(length)<<2 | tagCopy1 - dst[1] = 0 + if length <= 256 { + dst[1] = uint8(length - 1) + dst[0] = 63 | tagLiteral return 2 } - if length < 8 && offset < 2048 { - // Encode WITH offset - dst[1] = uint8(offset) - dst[0] = uint8(offset>>8)<<5 | uint8(length)<<2 | tagCopy1 - return 2 - } - if length < (1<<8)+4 { - length -= 4 - dst[2] = uint8(length) - dst[1] = 0 - dst[0] = 5<<2 | tagCopy1 - return 3 - } - if length < (1<<16)+(1<<8) { - length -= 1 << 8 + length-- + if length < 65536 { dst[3] = uint8(length >> 8) dst[2] = uint8(length >> 0) dst[1] = 0 - dst[0] = 6<<2 | tagCopy1 + dst[0] = 0<<2 | tagCopy1 return 4 } const maxRepeat = (1 << 24) - 1 - length -= 1 << 16 - left := 0 + var left int if length > maxRepeat { - left = length - maxRepeat + 4 - length = maxRepeat - 4 + left = length - maxRepeat + length = maxRepeat } 
dst[4] = uint8(length >> 16) dst[3] = uint8(length >> 8) dst[2] = uint8(length >> 0) dst[1] = 0 - dst[0] = 7<<2 | tagCopy1 + dst[0] = 1<<2 | tagCopy1 if left > 0 { return 5 + emitRepeat(dst[5:], offset, left) } From 4100709e77064548e7107a5d94ae7ee9f0a91394 Mon Sep 17 00:00:00 2001 From: Klaus Post Date: Sun, 10 Sep 2023 17:04:24 +0200 Subject: [PATCH 05/11] Tweak emit priority. --- s2/encode_best.go | 27 +++++++--------------- s2/encode_go.go | 58 +++++++++++++++++++++-------------------------- s2/s2.go | 1 + 3 files changed, 35 insertions(+), 51 deletions(-) diff --git a/s2/encode_best.go b/s2/encode_best.go index 63e5e781e9..dc1e64ae3d 100644 --- a/s2/encode_best.go +++ b/s2/encode_best.go @@ -713,29 +713,18 @@ emitRemainder: // 4 <= length && length <= 1 << 24 func emitCopySize(offset, length int) int { if offset >= 65536 { - i := 0 - if length > 64 { - length -= 64 - if length >= 4 { - // Emit remaining as repeats - return 4 + emitRepeatSize(offset, length) - } - i = 4 - } - if length == 0 { - return i - } - return i + 4 + // Emit remaining as repeats + return 4 + emitRepeatSize(offset, length-64) } // Offset no more than 2 bytes. if length > 64 { if offset < 2048 { // Emit 8 bytes, then rest as repeats... - return 2 + emitRepeatSize(offset, length-8) + return 2 + emitRepeatSize(offset, length-12) } // Emit remaining as repeats, at least 4 bytes remain. - return 3 + emitRepeatSize(offset, length-60) + return 3 + emitRepeatSize(offset, length-64) } if length >= 12 || offset >= 2048 { return 3 @@ -770,6 +759,9 @@ func emitCopyNoRepeatSize(offset, length int) int { // emitRepeatSize returns the number of bytes required to encode a repeat. 
// Length must be at least 4 and < 1<<24 func emitRepeatSize(offset, length int) int { + if length <= 0 { + return 0 + } if length <= 256 { return 2 } @@ -781,8 +773,5 @@ func emitRepeatSize(offset, length int) int { if length > maxRepeat { left = length - maxRepeat } - if left > 0 { - return 5 + emitRepeatSize(offset, left) - } - return 5 + return 5 + emitRepeatSize(offset, left) } diff --git a/s2/encode_go.go b/s2/encode_go.go index 76352db925..e267b1f79b 100644 --- a/s2/encode_go.go +++ b/s2/encode_go.go @@ -106,7 +106,7 @@ func emitRepeat(dst []byte, offset, length int) int { dst[0] = 63 | tagLiteral return 2 } - length-- + if length < 65536 { dst[3] = uint8(length >> 8) dst[2] = uint8(length >> 0) @@ -140,51 +140,45 @@ func emitRepeat(dst []byte, offset, length int) int { // 4 <= length && length <= 1 << 24 func emitCopy(dst []byte, offset, length int) int { if offset >= 65536 { - i := 0 - if length > 64 { - // Emit a length 64 copy, encoded as 4 bytes. - dst[3] = uint8(offset >> 16) - dst[2] = uint8(offset >> 8) + // Emit a length 64 copy, encoded as 4 bytes. + if length <= 64 { + // Emit a copy, offset encoded as 3 bytes. + dst[0] = uint8(length-1)<<2 | tagCopy4 dst[1] = uint8(offset) - dst[0] = 63<<2 | tagCopy4 - length -= 64 - if length >= 4 { - // Emit remaining as repeats - return 4 + emitRepeat(dst[4:], offset, length) - } - i = 4 - } - if length == 0 { - return i + dst[2] = uint8(offset >> 8) + dst[3] = uint8(offset >> 16) + return 4 } - // Emit a copy, offset encoded as 3 bytes. - dst[i+0] = uint8(length-1)<<2 | tagCopy4 - dst[i+1] = uint8(offset) - dst[i+2] = uint8(offset >> 8) - dst[i+3] = uint8(offset >> 16) - return i + 4 + + dst[3] = uint8(offset >> 16) + dst[2] = uint8(offset >> 8) + dst[1] = uint8(offset) + dst[0] = 63<<2 | tagCopy4 + length -= 64 + // Emit remaining as repeats + return 4 + emitRepeat(dst[4:], offset, length) } // Offset no more than 2 bytes. 
if length > 64 { - off := 3 if offset < 2048 { // emit 8 bytes as tagCopy1, rest as repeats. dst[1] = uint8(offset) - dst[0] = uint8(offset>>8)<<5 | uint8(8-4)<<2 | tagCopy1 - length -= 8 - off = 2 + dst[0] = uint8(offset>>8)<<5 | uint8(7)<<2 | tagCopy1 + length -= 12 + return 2 + emitRepeat(dst[2:], offset, length) } else { - // Emit a length 60 copy, encoded as 3 bytes. - // Emit remaining as repeat value (minimum 4 bytes). + // Emit a length 64 copy, encoded as 3 bytes. + // Emit remaining as repeat value. dst[2] = uint8(offset >> 8) dst[1] = uint8(offset) - dst[0] = 59<<2 | tagCopy2 - length -= 60 + dst[0] = 63<<2 | tagCopy2 + length -= 64 + // Emit remaining as repeats. At least 1 byte. + return 3 + emitRepeat(dst[3:], offset, length) } - // Emit remaining as repeats, at least 4 bytes remain. - return off + emitRepeat(dst[off:], offset, length) } + if length >= 12 || offset >= 2048 { // Emit the remaining copy, encoded as 3 bytes. dst[2] = uint8(offset >> 8) diff --git a/s2/s2.go b/s2/s2.go index dae3f731fa..0cbeef946f 100644 --- a/s2/s2.go +++ b/s2/s2.go @@ -77,6 +77,7 @@ const ( magicChunkSnappy = "\xff\x06\x00\x00" + magicBodySnappy magicBodySnappy = "sNaPpY" magicBody = "S2sTwO" + magicBodyPP = "S2s2++" // maxBlockSize is the maximum size of the input to encodeBlock. // From a2dba0e22d65f353d909ed00a302f2e2e0175216 Mon Sep 17 00:00:00 2001 From: Klaus Post Date: Mon, 18 Sep 2023 10:32:52 +0200 Subject: [PATCH 06/11] Use copy4 for repeats as well. 
--- s2/encode_best.go | 10 ++++-- s2/encode_go.go | 81 +++++++++++++++++++++++++---------------------- 2 files changed, 51 insertions(+), 40 deletions(-) diff --git a/s2/encode_best.go b/s2/encode_best.go index dc1e64ae3d..c6474c2108 100644 --- a/s2/encode_best.go +++ b/s2/encode_best.go @@ -714,7 +714,7 @@ emitRemainder: func emitCopySize(offset, length int) int { if offset >= 65536 { // Emit remaining as repeats - return 4 + emitRepeatSize(offset, length-64) + return 3 + emitRepeatSize(offset, length-3) } // Offset no more than 2 bytes. @@ -762,16 +762,20 @@ func emitRepeatSize(offset, length int) int { if length <= 0 { return 0 } + + if length <= 29 { + return 1 + } if length <= 256 { return 2 } if length <= 65536 { - return 4 + return 3 } const maxRepeat = (1 << 24) - 1 left := 0 if length > maxRepeat { left = length - maxRepeat } - return 5 + emitRepeatSize(offset, left) + return 4 + emitRepeatSize(offset, left) } diff --git a/s2/encode_go.go b/s2/encode_go.go index e267b1f79b..7d3e3980c9 100644 --- a/s2/encode_go.go +++ b/s2/encode_go.go @@ -92,43 +92,60 @@ func emitLiteral(dst, lit []byte) int { dst[0] = 62<<2 | tagLiteral i = 4 default: - panic("unreachable") + dst[4] = uint8(n >> 24) + dst[3] = uint8(n >> 16) + dst[2] = uint8(n >> 8) + dst[1] = uint8(n) + dst[0] = 63<<2 | tagLiteral + i = 5 } return i + copy(dst[i:], lit) } +// 0-28: Length 1 -> 29 +// 29: Length (Read 1) + 1 +// 30: Length (Read 2) + 1 +// 31: Length (Read 3) + 1 + +const tagRepeat = tagCopy4 | 4 + // emitRepeat writes a repeat chunk and returns the number of bytes written. // Length must be at least 4 and < 1<<24 func emitRepeat(dst []byte, offset, length int) int { // Repeat offset, make length cheaper - if length <= 256 { - dst[1] = uint8(length - 1) - dst[0] = 63 | tagLiteral + return encodeLength(dst, tagRepeat, length) +} + +// encodeLength encodes a length and returns the number of bytes written. 
+// length must be at least 1 and < 1<<24 +func encodeLength(dst []byte, tag uint8, length int) int { + // Repeat offset, make length cheaper + length-- + if length <= 28 { + dst[0] = uint8(length)<<3 | tag + return 1 + } + if length < 256 { + dst[1] = uint8(length >> 0) + dst[0] = 29<<3 | tag return 2 } if length < 65536 { - dst[3] = uint8(length >> 8) - dst[2] = uint8(length >> 0) - dst[1] = 0 - dst[0] = 0<<2 | tagCopy1 - return 4 + dst[2] = uint8(length >> 8) + dst[1] = uint8(length >> 0) + dst[0] = 30<<3 | tag + return 3 } const maxRepeat = (1 << 24) - 1 - var left int if length > maxRepeat { - left = length - maxRepeat - length = maxRepeat - } - dst[4] = uint8(length >> 16) - dst[3] = uint8(length >> 8) - dst[2] = uint8(length >> 0) - dst[1] = 0 - dst[0] = 1<<2 | tagCopy1 - if left > 0 { - return 5 + emitRepeat(dst[5:], offset, left) + panic("unreachable") } - return 5 + dst[3] = uint8(length >> 16) + dst[2] = uint8(length >> 8) + dst[1] = uint8(length >> 0) + dst[0] = 31<<3 | tagRepeat + return 4 } // emitCopy writes a copy chunk and returns the number of bytes written. @@ -140,23 +157,13 @@ func emitRepeat(dst []byte, offset, length int) int { // 4 <= length && length <= 1 << 24 func emitCopy(dst []byte, offset, length int) int { if offset >= 65536 { - // Emit a length 64 copy, encoded as 4 bytes. - if length <= 64 { - // Emit a copy, offset encoded as 3 bytes. - dst[0] = uint8(length-1)<<2 | tagCopy4 - dst[1] = uint8(offset) - dst[2] = uint8(offset >> 8) - dst[3] = uint8(offset >> 16) - return 4 - } - - dst[3] = uint8(offset >> 16) - dst[2] = uint8(offset >> 8) - dst[1] = uint8(offset) - dst[0] = 63<<2 | tagCopy4 - length -= 64 + // Emit a long offset code. + n := encodeLength(dst, tagCopy4, length-3) + dst[n+2] = uint8(offset >> 16) + dst[n+1] = uint8(offset >> 8) + dst[n+0] = uint8(offset) // Emit remaining as repeats - return 4 + emitRepeat(dst[4:], offset, length) + return 3 + n } // Offset no more than 2 bytes. 
From 51032dcb13cd7be86978185aa1109ae378e3c223 Mon Sep 17 00:00:00 2001 From: Klaus Post Date: Mon, 18 Sep 2023 12:00:47 +0200 Subject: [PATCH 07/11] Update docs --- s2/README.md | 25 +++++++++++++++++-------- 1 file changed, 17 insertions(+), 8 deletions(-) diff --git a/s2/README.md b/s2/README.md index 0321bd4c50..f15827a4ae 100644 --- a/s2/README.md +++ b/s2/README.md @@ -1048,24 +1048,33 @@ The first copy of a block cannot be a repeat offset and the offset is reset on e Default streaming block size is 1MB. -## 3 Byte Offsets +## S2++ Mode If the first bytes of a block is `0x80, 0x00, 0x00` (copy, 2 byte offset = 0), this indicates that all [Copy with 4-byte offset (11)](https://github.com/google/snappy/blob/main/format_description.txt#L106) are all 3 bytes instead for the remainder of the block and literal value 63 is now a repeat code. There can be no literals before this tag and no repeats before a match as specified above. This will only trigger on this exact tag. -> These are like the copies with 2-byte offsets (see previous subsection), -> except that the offset is stored as a 24-bit integer instead of a -> 16-bit integer (and thus will occupy three bytes). +## Tag 0x3 (TagCopy4) -When in this mode the maximum backreference offset is 16777215. +| Bits | Meaning | Description | +|------|---------|------------------------------------------------------------------------| +| 0-1 | Tag | Always 0x3 | +| 2 | Repeat | 0 if copy, 1 if repeat. | +| 3-7 | Length | Length of copy or repeat
Values are 0-31. See decoding table below | -Furthermore, encoding with literal code 63 no longer emits literals, but indicates a 1 byte repeat offset code. +| Value | Output | +|-------|---------------------| +| 0-28 | Base + Value | +| 29 | Base + Read 1 byte | +| 30 | Base + Read 2 bytes | +| 31 | Base + Read 3 bytes | -The next byte indicates the length of the repeat offset - minus one, so length 1 to 256 can be encoded as 2 bytes. +For copy operations the Base value is `4` For repeat, the base value is `1`. -Decode as such: `if tag == 63 { length = readByte() + 1 }`. +Copy offsets are encoded as `3` bytes following the length. The maximum backreference offset is therefore 16777215. + +The S2 repeat encoding specified on TagCopy2 is not valid in this mode. # Dictionary Encoding From 2ee8f3933a078dd27e6d0ef9fe24dbed571e41c5 Mon Sep 17 00:00:00 2001 From: Klaus Post Date: Fri, 17 Nov 2023 16:26:45 +0100 Subject: [PATCH 08/11] Modify TagCopy2 as well --- s2/README.md | 25 ++++++++++++++- s2/encode_best.go | 41 +++++++++++++++++------- s2/encode_go.go | 79 +++++++++++++++++++++++++++++++---------------- 3 files changed, 105 insertions(+), 40 deletions(-) diff --git a/s2/README.md b/s2/README.md index f15827a4ae..56f7ffa5f4 100644 --- a/s2/README.md +++ b/s2/README.md @@ -1050,11 +1050,34 @@ Default streaming block size is 1MB. ## S2++ Mode -If the first bytes of a block is `0x80, 0x00, 0x00` (copy, 2 byte offset = 0), this indicates that all [Copy with 4-byte offset (11)](https://github.com/google/snappy/blob/main/format_description.txt#L106) are all 3 bytes instead for the remainder of the block and literal value 63 is now a repeat code. +If the first bytes of a block is `0x80, 0x00, 0x00` (copy, 2 byte offset = 0), +this indicates that all [Copy with 2-byte offset (10)](https://github.com/google/snappy/blob/main/format_description.txt#L98) +and [Copy with 4-byte offset (11)](https://github.com/google/snappy/blob/main/format_description.txt#L106) tags change. 
There can be no literals before this tag and no repeats before a match as specified above. This will only trigger on this exact tag. ## Tag 0x2 (TagCopy2) + +The length field now has a base value of 4 and there are 3 special values for longer matches. + +| Bits | Meaning | Description | +|------|---------|------------------------------------------------------------------------| +| 0-1 | Tag | Always 0x2 | +| 2-7 | Length | Length of copy or repeat
Values are 0-63. See decoding table below | + +| Value | Output | +|-------|---------------------| +| 0-60 | Base + Value | +| 61 | Base + Read 1 byte | +| 62 | Base + Read 2 bytes | +| 63 | Base + Read 3 bytes | + +Base value is 4 for all copies. + +Offsets are encoded as 2 bytes following the length. +The maximum backreference offset is therefore 65535. + ## Tag 0x3 (TagCopy4) | Bits | Meaning | Description | diff --git a/s2/encode_best.go b/s2/encode_best.go index 41f8eb2e30..4ebd1b2723 100644 --- a/s2/encode_best.go +++ b/s2/encode_best.go @@ -721,19 +721,11 @@ func emitCopySize(offset, length int) int { } // Offset no more than 2 bytes. - if length > 64 { - if offset < 2048 { - // Emit 8 bytes, then rest as repeats... - return 2 + emitRepeatSize(offset, length-12) - } - // Emit remaining as repeats, at least 4 bytes remain. - return 3 + emitRepeatSize(offset, length-64) + if offset < 2048 { + // Emit 11 bytes, then rest as repeats... + return 2 + emitRepeatSize(offset, length-12) } - if length >= 12 || offset >= 2048 { - return 3 - } - // Emit the remaining copy, encoded as 2 bytes. - return 2 + return 2 + emitCopy2Size(length) } // emitCopyNoRepeatSize returns the size to encode the offset+length @@ -782,3 +774,28 @@ func emitRepeatSize(offset, length int) int { } return 4 + emitRepeatSize(offset, left) } + +// emitRepeatSize returns the number of bytes required to encode a repeat. 
+// Length must be at least 4 and < 1<<24 +func emitCopy2Size(length int) int { + length -= 4 + if length < 0 { + return 0 + } + + if length <= 60 { + return 1 + } + if length <= 256 { + return 2 + } + if length <= 65536 { + return 3 + } + const maxRepeat = (1 << 24) - 1 + left := 0 + if length > maxRepeat { + left = length - maxRepeat + } + return 4 + emitRepeatSize(0, left) +} diff --git a/s2/encode_go.go b/s2/encode_go.go index 8b0ea83293..6d107c3bfa 100644 --- a/s2/encode_go.go +++ b/s2/encode_go.go @@ -5,6 +5,7 @@ package s2 import ( "bytes" + "fmt" "math/bits" ) @@ -121,6 +122,9 @@ func emitRepeat(dst []byte, offset, length int) int { func encodeLength(dst []byte, tag uint8, length int) int { // Repeat offset, make length cheaper length-- + if length < 0 { + panic(fmt.Sprintf("invalid length %d", length)) + } if length <= 28 { dst[0] = uint8(length)<<3 | tag return 1 @@ -148,6 +152,41 @@ func encodeLength(dst []byte, tag uint8, length int) int { return 4 } +// encodeLength60 encodes a length and returns the number of bytes written. +// length must be at least 4 and < 1<<24 +func encodeLength60(dst []byte, tag uint8, length int) int { + // Repeat offset, make length cheaper + length -= 4 + if length < 0 { + panic(fmt.Sprintf("invalid length %d", length)) + } + if length <= 60 { + dst[0] = uint8(length)<<2 | tag + return 1 + } + if length < 256 { + dst[1] = uint8(length >> 0) + dst[0] = 61<<2 | tag + return 2 + } + + if length < 65536 { + dst[2] = uint8(length >> 8) + dst[1] = uint8(length >> 0) + dst[0] = 62<<2 | tag + return 3 + } + const maxRepeat = (1 << 24) - 1 + if length > maxRepeat { + panic("unreachable") + } + dst[3] = uint8(length >> 16) + dst[2] = uint8(length >> 8) + dst[1] = uint8(length >> 0) + dst[0] = 63<<2 | tagRepeat + return 4 +} + // emitCopy writes a copy chunk and returns the number of bytes written. // // It assumes that: @@ -167,36 +206,22 @@ func emitCopy(dst []byte, offset, length int) int { } // Offset no more than 2 bytes. 
- if length > 64 { - if offset < 2048 { - // emit 8 bytes as tagCopy1, rest as repeats. - dst[1] = uint8(offset) - dst[0] = uint8(offset>>8)<<5 | uint8(7)<<2 | tagCopy1 - length -= 12 - return 2 + emitRepeat(dst[2:], offset, length) - } else { - // Emit a length 64 copy, encoded as 3 bytes. - // Emit remaining as repeat value. - dst[2] = uint8(offset >> 8) - dst[1] = uint8(offset) - dst[0] = 63<<2 | tagCopy2 - length -= 64 - // Emit remaining as repeats. At least 1 byte. - return 3 + emitRepeat(dst[3:], offset, length) + if offset < 2048 { + // emit 12 bytes as tagCopy1, rest as repeats. + dst[1] = uint8(offset) + dst[0] = uint8(offset>>8)<<5 | uint8(7)<<2 | tagCopy1 + length -= 12 + if length > 0 { + emitRepeat(dst[2:], offset, length) } + return 2 } - if length >= 12 || offset >= 2048 { - // Emit the remaining copy, encoded as 3 bytes. - dst[2] = uint8(offset >> 8) - dst[1] = uint8(offset) - dst[0] = uint8(length-1)<<2 | tagCopy2 - return 3 - } - // Emit the remaining copy, encoded as 2 bytes. - dst[1] = uint8(offset) - dst[0] = uint8(offset>>8)<<5 | uint8(length-4)<<2 | tagCopy1 - return 2 + // Emit the remaining copy, encoded as 3+ bytes. + n := encodeLength60(dst, tagCopy2, length) + dst[n+1] = uint8(offset >> 8) + dst[n] = uint8(offset) + return n + 2 } // emitCopyNoRepeat writes a copy chunk and returns the number of bytes written. 
From fb2efcb7d1509c67acba432620bef0b050e3688d Mon Sep 17 00:00:00 2001 From: Klaus Post Date: Fri, 24 Nov 2023 12:44:02 +0100 Subject: [PATCH 09/11] With subs --- s2/encode_best.go | 35 ++++++++++++++++++++++------------- s2/encode_go.go | 15 +++++++-------- 2 files changed, 29 insertions(+), 21 deletions(-) diff --git a/s2/encode_best.go b/s2/encode_best.go index 4ebd1b2723..7e8fdab756 100644 --- a/s2/encode_best.go +++ b/s2/encode_best.go @@ -716,16 +716,17 @@ emitRemainder: // 4 <= length && length <= 1 << 24 func emitCopySize(offset, length int) int { if offset >= 65536 { - // Emit remaining as repeats + // 3 Byte offset + Variable length (base length 4). return 3 + emitRepeatSize(offset, length-3) } // Offset no more than 2 bytes. - if offset < 2048 { - // Emit 11 bytes, then rest as repeats... - return 2 + emitRepeatSize(offset, length-12) + if offset < 2048 && length < 12 { + // Emit up to 11 bytes with short offset. + return 2 } - return 2 + emitCopy2Size(length) + // 2 byte offset + Variable length (base length 4). + return emitCopy2Size(length) } // emitCopyNoRepeatSize returns the size to encode the offset+length @@ -752,7 +753,7 @@ func emitCopyNoRepeatSize(offset, length int) int { } // emitRepeatSize returns the number of bytes required to encode a repeat. -// Length must be at least 4 and < 1<<24 +// Length must be at least 1 and < 1<<24 func emitRepeatSize(offset, length int) int { if length <= 0 { return 0 @@ -761,6 +762,7 @@ func emitRepeatSize(offset, length int) int { if length <= 29 { return 1 } + length -= 29 if length <= 256 { return 2 } @@ -775,27 +777,34 @@ func emitRepeatSize(offset, length int) int { return 4 + emitRepeatSize(offset, left) } -// emitRepeatSize returns the number of bytes required to encode a repeat. -// Length must be at least 4 and < 1<<24 +// emitCopy2Size returns the number of bytes required to encode a copy2. 
+// Length must be less than 1<<24 func emitCopy2Size(length int) int { length -= 4 if length < 0 { - return 0 + // Should not happen, but we keep it so caller doesn't have to check. + return 2 } if length <= 60 { - return 1 + // Length inside tag. + return 1 + 2 } + length -= 60 if length <= 256 { - return 2 + // Length in 1 byte. + return 2 + 2 } if length <= 65536 { - return 3 + // Length in 2 bytes. + return 3 + 2 } + // Length in 3 bytes. + // Anything remaining must be repeats. const maxRepeat = (1 << 24) - 1 left := 0 if length > maxRepeat { left = length - maxRepeat } - return 4 + emitRepeatSize(0, left) + return 2 + 4 + emitRepeatSize(0, left) } diff --git a/s2/encode_go.go b/s2/encode_go.go index 6d107c3bfa..1c487b0f75 100644 --- a/s2/encode_go.go +++ b/s2/encode_go.go @@ -129,6 +129,7 @@ func encodeLength(dst []byte, tag uint8, length int) int { dst[0] = uint8(length)<<3 | tag return 1 } + length -= 28 if length < 256 { dst[1] = uint8(length >> 0) dst[0] = 29<<3 | tag @@ -164,6 +165,7 @@ func encodeLength60(dst []byte, tag uint8, length int) int { dst[0] = uint8(length)<<2 | tag return 1 } + length -= 60 if length < 256 { dst[1] = uint8(length >> 0) dst[0] = 61<<2 | tag @@ -196,28 +198,25 @@ func encodeLength60(dst []byte, tag uint8, length int) int { // 4 <= length && length <= 1 << 24 func emitCopy(dst []byte, offset, length int) int { if offset >= 65536 { - // Emit a long offset code. + // Encode tag+length as up to 4 bytes. n := encodeLength(dst, tagCopy4, length-3) + // Encode offset as 3 bytes. dst[n+2] = uint8(offset >> 16) dst[n+1] = uint8(offset >> 8) dst[n+0] = uint8(offset) - // Emit remaining as repeats return 3 + n } - // Offset no more than 2 bytes. - if offset < 2048 { + // Offset no more than 2 bytes and length less than 12. + if offset < 2048 && length < 12 { // emit 12 bytes as tagCopy1, rest as repeats. 
dst[1] = uint8(offset) dst[0] = uint8(offset>>8)<<5 | uint8(7)<<2 | tagCopy1 length -= 12 - if length > 0 { - emitRepeat(dst[2:], offset, length) - } return 2 } - // Emit the remaining copy, encoded as 3+ bytes. + // 2 byte offset, with variable length. n := encodeLength60(dst, tagCopy2, length) dst[n+1] = uint8(offset >> 8) dst[n] = uint8(offset) From 617aeece5d3cf32bc6a5f4ab6287bc5ac5b9f1a1 Mon Sep 17 00:00:00 2001 From: Klaus Post Date: Mon, 11 Dec 2023 11:30:59 +0100 Subject: [PATCH 10/11] Tweak encoding --- s2/encode_best.go | 11 ++++++++--- s2/encode_go.go | 21 +++++++++++++++------ s2/encode_test.go | 19 +++++++++++++++++++ 3 files changed, 42 insertions(+), 9 deletions(-) diff --git a/s2/encode_best.go b/s2/encode_best.go index 7e8fdab756..1e66d332c8 100644 --- a/s2/encode_best.go +++ b/s2/encode_best.go @@ -721,9 +721,14 @@ func emitCopySize(offset, length int) int { } // Offset no more than 2 bytes. - if offset < 2048 && length < 12 { - // Emit up to 11 bytes with short offset. - return 2 + if offset < 1024 { + if length < 11+8 { + // Emit up to 18 bytes with short offset. + return 2 + } + if length < 18+256 { + return 3 + } } // 2 byte offset + Variable length (base length 4). return emitCopy2Size(length) diff --git a/s2/encode_go.go b/s2/encode_go.go index 1c487b0f75..8fe687b702 100644 --- a/s2/encode_go.go +++ b/s2/encode_go.go @@ -197,6 +197,7 @@ func encodeLength60(dst []byte, tag uint8, length int) int { // 1 <= offset && offset <= math.MaxUint32 // 4 <= length && length <= 1 << 24 func emitCopy(dst []byte, offset, length int) int { + offset-- if offset >= 65536 { // Encode tag+length as up to 4 bytes. n := encodeLength(dst, tagCopy4, length-3) @@ -208,12 +209,20 @@ func emitCopy(dst []byte, offset, length int) int { } // Offset no more than 2 bytes and length less than 12. - if offset < 2048 && length < 12 { - // emit 12 bytes as tagCopy1, rest as repeats. 
- dst[1] = uint8(offset) - dst[0] = uint8(offset>>8)<<5 | uint8(7)<<2 | tagCopy1 - length -= 12 - return 2 + if offset < 1024 { + if length < 12+8 { + // FIXME: Incorrect encoding + dst[1] = uint8(offset) + dst[0] = uint8(offset>>8)<<5 | uint8(length-4)<<2 | tagCopy1 + return 2 + } + if length < 18+256 { + // FIXME: Incorrect encoding + dst[2] = uint8(length - 18) + dst[1] = uint8(offset) + dst[0] = uint8(offset>>8)<<5 | uint8(11+8)<<2 | tagCopy1 + return 3 + } } // 2 byte offset, with variable length. diff --git a/s2/encode_test.go b/s2/encode_test.go index 418a79e127..e8f79e94e9 100644 --- a/s2/encode_test.go +++ b/s2/encode_test.go @@ -6,6 +6,7 @@ package s2 import ( "bytes" + "encoding/binary" "fmt" "math" "testing" @@ -67,3 +68,21 @@ func TestEncodeHuge(t *testing.T) { } test(t, make([]byte, MaxBlockSize)) } + +func TestSizes(t *testing.T) { + var src [2]byte + src[0] = 123 + src[1] = 57 + s := 2 + + want := int(uint32(src[s-2])&0xe0<<3 | uint32(src[s-1])) + //got := bits.RotateLeft16(binary.LittleEndian.Uint16(src[:]), 16-5) & 2047 + got := binary.LittleEndian.Uint16(src[:]) + t.Logf("w:%012b G:%016b", want, got) + for i := 4; i < 100; i++ { + if i == 99 { + i = (1 << 24) - 1 + } + t.Logf("%d: short:%d medium: %d long: %d repeat: %d", i, emitCopySize(10, i), emitCopySize(4000, i), emitCopySize(70000, i), emitRepeatSize(0, i)) + } +} From c928a8d246b99596f55f9b3480dddf2dd182aa89 Mon Sep 17 00:00:00 2001 From: Klaus Post Date: Mon, 11 Dec 2023 14:21:02 +0100 Subject: [PATCH 11/11] Use a bit for extra copy length in TagCopy4. --- s2/encode_best.go | 6 +++++- s2/encode_go.go | 7 ++++++- 2 files changed, 11 insertions(+), 2 deletions(-) diff --git a/s2/encode_best.go b/s2/encode_best.go index 1e66d332c8..22e741e88f 100644 --- a/s2/encode_best.go +++ b/s2/encode_best.go @@ -717,7 +717,11 @@ emitRemainder: func emitCopySize(offset, length int) int { if offset >= 65536 { // 3 Byte offset + Variable length (base length 4). 
- return 3 + emitRepeatSize(offset, length-3) + length -= 3 + if length > 28 { + length -= 28 + } + return 3 + emitRepeatSize(offset, length) } // Offset no more than 2 bytes. diff --git a/s2/encode_go.go b/s2/encode_go.go index 8fe687b702..967611d371 100644 --- a/s2/encode_go.go +++ b/s2/encode_go.go @@ -199,8 +199,13 @@ func encodeLength60(dst []byte, tag uint8, length int) int { func emitCopy(dst []byte, offset, length int) int { offset-- if offset >= 65536 { + length = length - 3 // Encode tag+length as up to 4 bytes. - n := encodeLength(dst, tagCopy4, length-3) + if length > 28 { + offset |= 1 << 23 + length -= 28 + } + n := encodeLength(dst, tagCopy4, length) // Encode offset as 3 bytes. dst[n+2] = uint8(offset >> 16) dst[n+1] = uint8(offset >> 8)