Skip to content

Commit

Permalink
Merge branch 'KillingSpark:master' into access-inner-reader
Browse files Browse the repository at this point in the history
  • Loading branch information
ifd3f authored May 29, 2024
2 parents 057f710 + 0b96073 commit 3a8c5cd
Show file tree
Hide file tree
Showing 24 changed files with 509 additions and 83 deletions.
17 changes: 17 additions & 0 deletions src/blocks/block.rs
Original file line number Diff line number Diff line change
@@ -1,8 +1,16 @@
//! Block header definitions.

/// The kind of a block.
///
/// There are 4 different kinds of blocks, and the type of block influences the meaning of `Block_Size`.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum BlockType {
/// An uncompressed block.
Raw,
/// A single byte, repeated `Block_Size` times (Run Length Encoding).
RLE,
/// A Zstandard compressed block. `Block_Size` is the length of the compressed data.
Compressed,
/// This is not a valid block type, and this value should not be used.
/// If this value is present, the data should be considered corrupted.
Reserved,
}

Expand All @@ -17,9 +25,18 @@ impl core::fmt::Display for BlockType {
}
}

/// A representation of a single block header. As well as containing a frame header,
/// each Zstandard frame contains one or more blocks.
pub struct BlockHeader {
/// Whether this block is the last block in the frame.
/// If it is, it may be followed by an optional `Content_Checksum`.
pub last_block: bool,
/// The kind of this block (see [BlockType]), which influences the meaning of `Block_Size`.
pub block_type: BlockType,
/// The size of the decompressed data. If the block type
/// is [BlockType::Reserved] or [BlockType::Compressed],
/// this value is set to zero and should not be referenced.
pub decompressed_size: u32,
/// The size of the block. If the block is [BlockType::RLE],
/// this value will be 1.
pub content_size: u32,
}
90 changes: 61 additions & 29 deletions src/blocks/literals_section.rs
Original file line number Diff line number Diff line change
@@ -1,16 +1,44 @@
//! Utilities and representations for the first half of a block, the literals section.
//! It contains data that is then copied from by the sequences section.
use super::super::decoding::bit_reader::{BitReader, GetBitsError};

/// A compressed block consists of two sections, a literals section, and a sequences section.
/// This is the first of those two sections. A literal is just any arbitrary data, and it is
/// copied by the sequences section.
pub struct LiteralsSection {
/// - If this block is of type [LiteralsSectionType::Raw], then the data is `regenerated_size`
/// bytes long, and it contains the raw literals data to be used during the second section,
/// the sequences section.
/// - If this block is of type [LiteralsSectionType::RLE],
/// then the literal consists of a single byte repeated `regenerated_size` times.
/// - For types [LiteralsSectionType::Compressed] or [LiteralsSectionType::Treeless],
/// this is the size of the decompressed data.
pub regenerated_size: u32,
/// - For types [LiteralsSectionType::Raw] and [LiteralsSectionType::RLE], this value is not present.
/// - For types [LiteralsSectionType::Compressed] and [LiteralsSectionType::Treeless], this value will
/// be set to the size of the compressed data.
pub compressed_size: Option<u32>,
/// This value will be either 1 stream or 4 streams if the literal is of type
/// [LiteralsSectionType::Compressed] or [LiteralsSectionType::Treeless], and it
/// is not used for RLE or uncompressed literals.
pub num_streams: Option<u8>,
/// The type of the literals section, i.e. how its contents are encoded.
pub ls_type: LiteralsSectionType,
}

/// The way in which a literals section is encoded.
pub enum LiteralsSectionType {
/// Literals are stored uncompressed.
Raw,
/// Literals consist of a single byte value repeated [LiteralsSection::regenerated_size] times.
RLE,
/// This is a standard Huffman-compressed block, starting with a Huffman tree description.
/// In this mode, there are at least *2* different literals represented in the Huffman tree
/// description.
Compressed,
/// This is a Huffman-compressed block,
/// using the Huffman tree from the previous [LiteralsSectionType::Compressed] block
/// in the sequence. If this mode is triggered without any previous Huffman tables in the
/// frame (or dictionary), it should be treated as data corruption.
Treeless,
}

Expand Down Expand Up @@ -77,6 +105,7 @@ impl Default for LiteralsSection {
}

impl LiteralsSection {
/// Create a new [LiteralsSection].
pub fn new() -> LiteralsSection {
LiteralsSection {
regenerated_size: 0,
Expand All @@ -86,25 +115,26 @@ impl LiteralsSection {
}
}

/// Given the first byte of a header, determine the size of the whole header, from 1 to 5 bytes.
pub fn header_bytes_needed(&self, first_byte: u8) -> Result<u8, LiteralsSectionParseError> {
let ls_type = Self::section_type(first_byte)?;
let ls_type: LiteralsSectionType = Self::section_type(first_byte)?;
let size_format = (first_byte >> 2) & 0x3;
match ls_type {
LiteralsSectionType::RLE | LiteralsSectionType::Raw => {
match size_format {
0 | 2 => {
//size_format actually only uses one bit
//regenerated_size uses 5 bits
// size_format actually only uses one bit
// regenerated_size uses 5 bits
Ok(1)
}
1 => {
//size_format uses 2 bit
//regenerated_size uses 12 bits
// size_format uses 2 bit
// regenerated_size uses 12 bits
Ok(2)
}
3 => {
//size_format uses 2 bit
//regenerated_size uses 20 bits
// size_format uses 2 bit
// regenerated_size uses 20 bits
Ok(3)
}
_ => panic!(
Expand All @@ -115,16 +145,16 @@ impl LiteralsSection {
LiteralsSectionType::Compressed | LiteralsSectionType::Treeless => {
match size_format {
0 | 1 => {
//Only differ in num_streams
//both regenerated and compressed sizes use 10 bit
// Only differ in num_streams
// both regenerated and compressed sizes use 10 bit
Ok(3)
}
2 => {
//both regenerated and compressed sizes use 14 bit
// both regenerated and compressed sizes use 14 bit
Ok(4)
}
3 => {
//both regenerated and compressed sizes use 18 bit
// both regenerated and compressed sizes use 18 bit
Ok(5)
}

Expand All @@ -136,10 +166,11 @@ impl LiteralsSection {
}
}

/// Parse the header into `self`, and return the number of bytes read.
pub fn parse_from_header(&mut self, raw: &[u8]) -> Result<u8, LiteralsSectionParseError> {
let mut br = BitReader::new(raw);
let t = br.get_bits(2)? as u8;
self.ls_type = Self::section_type(t)?;
let mut br: BitReader<'_> = BitReader::new(raw);
let block_type = br.get_bits(2)? as u8;
self.ls_type = Self::section_type(block_type)?;
let size_format = br.get_bits(2)? as u8;

let byte_needed = self.header_bytes_needed(raw[0])?;
Expand All @@ -155,20 +186,20 @@ impl LiteralsSection {
self.compressed_size = None;
match size_format {
0 | 2 => {
//size_format actually only uses one bit
//regenerated_size uses 5 bits
// size_format actually only uses one bit
// regenerated_size uses 5 bits
self.regenerated_size = u32::from(raw[0]) >> 3;
Ok(1)
}
1 => {
//size_format uses 2 bit
//regenerated_size uses 12 bits
// size_format uses 2 bit
// regenerated_size uses 12 bits
self.regenerated_size = (u32::from(raw[0]) >> 4) + (u32::from(raw[1]) << 4);
Ok(2)
}
3 => {
//size_format uses 2 bit
//regenerated_size uses 20 bits
// size_format uses 2 bit
// regenerated_size uses 20 bits
self.regenerated_size = (u32::from(raw[0]) >> 4)
+ (u32::from(raw[1]) << 4)
+ (u32::from(raw[2]) << 12);
Expand All @@ -194,10 +225,10 @@ impl LiteralsSection {

match size_format {
0 | 1 => {
//Differ in num_streams see above
//both regenerated and compressed sizes use 10 bit
// Differ in num_streams see above
// both regenerated and compressed sizes use 10 bit

//4 from the first, six from the second byte
// 4 from the first, six from the second byte
self.regenerated_size =
(u32::from(raw[0]) >> 4) + ((u32::from(raw[1]) & 0x3f) << 4);

Expand All @@ -207,27 +238,27 @@ impl LiteralsSection {
Ok(3)
}
2 => {
//both regenerated and compressed sizes use 14 bit
// both regenerated and compressed sizes use 14 bit

//4 from first, full second, 2 from the third byte
// 4 from first, full second, 2 from the third byte
self.regenerated_size = (u32::from(raw[0]) >> 4)
+ (u32::from(raw[1]) << 4)
+ ((u32::from(raw[2]) & 0x3) << 12);

//6 from the third, full last byte
// 6 from the third, full last byte
self.compressed_size =
Some((u32::from(raw[2]) >> 2) + (u32::from(raw[3]) << 6));
Ok(4)
}
3 => {
//both regenerated and compressed sizes use 18 bit
// both regenerated and compressed sizes use 18 bit

//4 from first, full second, six from third byte
// 4 from first, full second, six from third byte
self.regenerated_size = (u32::from(raw[0]) >> 4)
+ (u32::from(raw[1]) << 4)
+ ((u32::from(raw[2]) & 0x3F) << 12);

//2 from third, full fourth, full fifth byte
// 2 from third, full fourth, full fifth byte
self.compressed_size = Some(
(u32::from(raw[2]) >> 6)
+ (u32::from(raw[3]) << 2)
Expand All @@ -244,6 +275,7 @@ impl LiteralsSection {
}
}

/// Given the first two bits of a header, determine the type of a header.
fn section_type(raw: u8) -> Result<LiteralsSectionType, LiteralsSectionParseError> {
let t = raw & 0x3;
match t {
Expand Down
7 changes: 7 additions & 0 deletions src/blocks/mod.rs
Original file line number Diff line number Diff line change
@@ -1,3 +1,10 @@
//! In a Zstandard frame, there's a frame header, followed by one or more *blocks*.
//!
//! A block contains data, and a header describing how that data is encoded, as well
//! as other misc metadata.
//!
//! <https://github.com/facebook/zstd/blob/dev/doc/zstd_compression_format.md#blocks>

pub mod block;
pub mod literals_section;
pub mod sequence_section;
37 changes: 36 additions & 1 deletion src/blocks/sequence_section.rs
Original file line number Diff line number Diff line change
@@ -1,12 +1,32 @@
//! Utilities and representations for the second half of a block, the sequence section.
//! This section copies literals from the literals section into the decompressed output.

/// The header of the sequences section of a block.
pub struct SequencesHeader {
/// The sequence count; initialized to 0 by `new`.
pub num_sequences: u32,
/// The compression modes of the symbol types; initialized to `None` by `new`.
/// NOTE(review): presumably stays `None` when no modes byte was parsed — confirm
/// against `parse_from_header`.
pub modes: Option<CompressionModes>,
}

/// A sequence represents potentially redundant data, and it can be broken up into 2 steps:
/// - A copy step, where data is copied from the literals section to the decompressed output
/// - A *match* copy step that copies data from within the previously decompressed output.
///
/// <https://github.com/facebook/zstd/blob/dev/doc/zstd_compression_format.md#sequence-execution>
#[derive(Clone, Copy)]
pub struct Sequence {
/// Literal length, or the number of bytes to be copied from the literals section
/// in the copy step.
pub ll: u32,
/// The length of the match to make during the match copy step.
pub ml: u32,
/// How far back to go in the decompressed data to read from during the match copy step.
/// If this value is greater than 3, then the offset is `of - 3`. If `of` is from 1-3,
/// then it has special handling:
///
/// The first 3 values define 3 different repeated offsets, with 1 referring to the most
/// recent, 2 the second recent, and so on. When the current sequence has a literal length of 0,
/// then the repeated offsets are shifted by 1. So an offset value of 1 refers to 2, 2 refers to 3,
/// and 3 refers to the most recent offset minus one. If that value is equal to zero, the data
/// is considered corrupted.
pub of: u32,
}

Expand All @@ -16,16 +36,27 @@ impl core::fmt::Display for Sequence {
}
}

/// This byte defines the compression mode of each symbol type.
///
/// The accessors on this type read two bits per field: bits 7-6 for literal lengths,
/// bits 5-4 for offsets, and bits 3-2 for match lengths.
#[derive(Copy, Clone)]
pub struct CompressionModes(u8);
/// The compression mode used for symbol compression.
pub enum ModeType {
/// A predefined FSE distribution table is used, and no distribution table
/// will be present.
Predefined,
/// The table consists of a single byte, which contains the symbol's value.
RLE,
/// Standard FSE compression, a distribution table will be present. This
/// mode should not be used when only one symbol is present.
FSECompressed,
/// The table used in the previous compressed block with at least one sequence
/// will be used again. If this is the first block, the table in the dictionary will
/// be used.
Repeat,
}

impl CompressionModes {
/// Deserialize a two bit mode value into a [ModeType]
pub fn decode_mode(m: u8) -> ModeType {
match m {
0 => ModeType::Predefined,
Expand All @@ -35,15 +66,17 @@ impl CompressionModes {
_ => panic!("This can never happen"),
}
}

/// Decode the compression mode of the literal lengths field, which is stored
/// in the two most significant bits (bits 7-6) of the modes byte.
pub fn ll_mode(self) -> ModeType {
    let mode_bits = self.0 >> 6;
    Self::decode_mode(mode_bits)
}

/// Decode the compression mode of the offset value field, which is stored
/// in bits 5-4 of the modes byte.
pub fn of_mode(self) -> ModeType {
    let shifted = self.0 >> 4;
    Self::decode_mode(shifted & 0x3)
}

/// Decode the compression mode of the match lengths field, which is stored
/// in bits 3-2 of the modes byte.
pub fn ml_mode(self) -> ModeType {
    let shifted = self.0 >> 2;
    Self::decode_mode(shifted & 0x3)
}
Expand Down Expand Up @@ -79,13 +112,15 @@ impl core::fmt::Display for SequencesHeaderParseError {
}

impl SequencesHeader {
/// Create an empty [SequencesHeader]: zero sequences and no compression modes.
/// The fields are expected to be filled in later by `parse_from_header`.
pub fn new() -> SequencesHeader {
    let header = SequencesHeader {
        modes: None,
        num_sequences: 0,
    };
    header
}

/// Attempt to deserialize the provided buffer into `self`, returning the number of bytes read.
pub fn parse_from_header(&mut self, source: &[u8]) -> Result<u8, SequencesHeaderParseError> {
let mut bytes_read = 0;
if source.is_empty() {
Expand Down
1 change: 1 addition & 0 deletions src/decoding/bit_reader.rs
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
/// Interact with a provided source at a bit level.
pub struct BitReader<'s> {
idx: usize, //index counts bits already read
source: &'s [u8],
Expand Down
12 changes: 10 additions & 2 deletions src/decoding/bit_reader_reverse.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,15 +2,21 @@ pub use super::bit_reader::GetBitsError;
use byteorder::ByteOrder;
use byteorder::LittleEndian;

/// Zstandard encodes some types of data in a way that the data must be read
/// back to front to decode it properly. `BitReaderReversed` provides a
/// convenient interface to do that.
pub struct BitReaderReversed<'s> {
/// Bit position within `source`.
/// NOTE(review): `bits_remaining` is computed as `idx + bits_in_container`, so this
/// looks like the number of source bits not yet moved into the container rather
/// than a count of bits already read — confirm.
idx: isize,
/// The bytes this reader takes its bits from.
source: &'s [u8],

/// The reader doesn't read directly from the source,
/// it reads bits from here, and the container is
/// "refilled" as it's emptied.
bit_container: u64,
/// Bit count associated with `bit_container`; `bits_remaining` counts these
/// bits as still unread.
bits_in_container: u8,
}

impl<'s> BitReaderReversed<'s> {
/// Number of bits the reader can still deliver: the bits buffered in the
/// container plus `idx`.
pub fn bits_remaining(&self) -> isize {
    let buffered = isize::from(self.bits_in_container);
    self.idx + buffered
}
Expand Down Expand Up @@ -102,6 +108,8 @@ impl<'s> BitReaderReversed<'s> {
(self.idx - 1) / 8
}

/// Read `n` bits from the source. Returns an error if the caller
/// requests more bits than remain for reading.
#[inline(always)]
pub fn get_bits(&mut self, n: u8) -> Result<u64, GetBitsError> {
if n == 0 {
Expand Down Expand Up @@ -162,7 +170,7 @@ impl<'s> BitReaderReversed<'s> {
return Ok((0, 0, 0));
}
if sum > 56 {
// try and get the values separatly
// try and get the values separately
return Ok((self.get_bits(n1)?, self.get_bits(n2)?, self.get_bits(n3)?));
}
let sum = sum as u8;
Expand Down
Loading

0 comments on commit 3a8c5cd

Please sign in to comment.