Merge branch 'KillingSpark:master' into access-inner-reader

KillingSpark · May 29, 2024 · 3a8c5cd · 3a8c5cd
2 parents 057f710 + 0b96073
commit 3a8c5cd
Show file tree

Hide file tree

Showing 24 changed files with 509 additions and 83 deletions.
diff --git a/src/blocks/block.rs b/src/blocks/block.rs
@@ -1,8 +1,16 @@
+//! Block header definitions.
+
+/// There are 4 different kinds of blocks, and the type of block influences the meaning of `Block_Size`.
 #[derive(Debug, Clone, Copy, PartialEq, Eq)]
 pub enum BlockType {
+    /// An uncompressed block.
     Raw,
+    /// A single byte, repeated `Block_Size` times (Run Length Encoding).
     RLE,
+    /// A Zstandard compressed block. `Block_Size` is the length of the compressed data.
     Compressed,
+    /// This is not a valid block, and this value should not be used.
+    /// If this value is present, it should be considered corrupted data.
     Reserved,
 }
 
@@ -17,9 +25,18 @@ impl core::fmt::Display for BlockType {
     }
 }
 
+/// A representation of a single block header. As well as containing a frame header,
+/// each Zstandard frame contains one or more blocks.
 pub struct BlockHeader {
+    /// Whether this block is the last block in the frame.
+    /// It may be followed by an optional `Content_Checksum` if it is.
     pub last_block: bool,
     pub block_type: BlockType,
+    /// The size of the decompressed data. If the block type
+    /// is [BlockType::Reserved] or [BlockType::Compressed],
+    /// this value is set to zero and should not be referenced.
     pub decompressed_size: u32,
+    /// The size of the block. If the block is [BlockType::RLE],
+    /// this value will be 1.
     pub content_size: u32,
 }
diff --git a/src/blocks/literals_section.rs b/src/blocks/literals_section.rs
@@ -1,16 +1,44 @@
+//! Utilities and representations for the first half of a block, the literals section.
+//! It contains data that is then copied from by the sequences section.
 use super::super::decoding::bit_reader::{BitReader, GetBitsError};
 
+/// A compressed block consists of two sections, a literals section, and a sequences section.
+/// This is the first of those two sections. A literal is just any arbitrary data, and it is copied by the sequences section.
 pub struct LiteralsSection {
+    /// - If this block is of type [LiteralsSectionType::Raw], then the data is `regenerated_size`
+    /// bytes long, and it contains the raw literals data to be used during the second section,
+    /// the sequences section.
+    /// - If this block is of type [LiteralsSectionType::RLE],
+    /// then the literal consists of a single byte repeated `regenerated_size` times.
+    /// - For types [LiteralsSectionType::Compressed] or [LiteralsSectionType::Treeless],
+    /// this is the size of the decompressed data.
     pub regenerated_size: u32,
+    /// - For types [LiteralsSectionType::Raw] and [LiteralsSectionType::RLE], this value is not present.
+    /// - For types [LiteralsSectionType::Compressed] and [LiteralsSectionType::Treeless], this value will
+    /// be set to the size of the compressed data.
     pub compressed_size: Option<u32>,
+    /// This value will be either 1 stream or 4 streams if the literal is of type
+    /// [LiteralsSectionType::Compressed] or [LiteralsSectionType::Treeless], and it
+    /// is not used for RLE or uncompressed literals.
     pub num_streams: Option<u8>,
+    /// The type of the literal section.
     pub ls_type: LiteralsSectionType,
 }
 
+/// The way in which a literals section is encoded.
 pub enum LiteralsSectionType {
+    /// Literals are stored uncompressed.
     Raw,
+    /// Literals consist of a single byte value repeated [LiteralsSection::regenerated_size] times.
     RLE,
+    /// This is a standard Huffman-compressed block, starting with a Huffman tree description.
+    /// In this mode, there are at least *2* different literals represented in the Huffman tree
+    /// description.
     Compressed,
+    /// This is a Huffman-compressed block,
+    /// using the Huffman tree from the previous [LiteralsSectionType::Compressed] block
+    /// in the sequence. If this mode is triggered without any previous Huffman-tables in the
+    /// frame (or dictionary), it should be treated as data corruption.
     Treeless,
 }
 
@@ -77,6 +105,7 @@ impl Default for LiteralsSection {
 }
 
 impl LiteralsSection {
+    /// Create a new [LiteralsSection].
     pub fn new() -> LiteralsSection {
         LiteralsSection {
             regenerated_size: 0,
@@ -86,25 +115,26 @@ impl LiteralsSection {
         }
     }
 
+    /// Given the first byte of a header, determine the size of the whole header, from 1 to 5 bytes.
     pub fn header_bytes_needed(&self, first_byte: u8) -> Result<u8, LiteralsSectionParseError> {
-        let ls_type = Self::section_type(first_byte)?;
+        let ls_type: LiteralsSectionType = Self::section_type(first_byte)?;
         let size_format = (first_byte >> 2) & 0x3;
         match ls_type {
             LiteralsSectionType::RLE | LiteralsSectionType::Raw => {
                 match size_format {
                     0 | 2 => {
-                        //size_format actually only uses one bit
-                        //regenerated_size uses 5 bits
+                        // size_format actually only uses one bit
+                        // regenerated_size uses 5 bits
                         Ok(1)
                     }
                     1 => {
-                        //size_format uses 2 bit
-                        //regenerated_size uses 12 bits
+                        // size_format uses 2 bit
+                        // regenerated_size uses 12 bits
                         Ok(2)
                     }
                     3 => {
-                        //size_format uses 2 bit
-                        //regenerated_size uses 20 bits
+                        // size_format uses 2 bit
+                        // regenerated_size uses 20 bits
                         Ok(3)
                     }
                     _ => panic!(
@@ -115,16 +145,16 @@ impl LiteralsSection {
             LiteralsSectionType::Compressed | LiteralsSectionType::Treeless => {
                 match size_format {
                     0 | 1 => {
-                        //Only differ in num_streams
-                        //both regenerated and compressed sizes use 10 bit
+                        // Only differ in num_streams
+                        // both regenerated and compressed sizes use 10 bit
                         Ok(3)
                     }
                     2 => {
-                        //both regenerated and compressed sizes use 14 bit
+                        // both regenerated and compressed sizes use 14 bit
                         Ok(4)
                     }
                     3 => {
-                        //both regenerated and compressed sizes use 18 bit
+                        // both regenerated and compressed sizes use 18 bit
                         Ok(5)
                     }
 
@@ -136,10 +166,11 @@ impl LiteralsSection {
         }
     }
 
+    /// Parse the header into `self`, and return the number of bytes read.
     pub fn parse_from_header(&mut self, raw: &[u8]) -> Result<u8, LiteralsSectionParseError> {
-        let mut br = BitReader::new(raw);
-        let t = br.get_bits(2)? as u8;
-        self.ls_type = Self::section_type(t)?;
+        let mut br: BitReader<'_> = BitReader::new(raw);
+        let block_type = br.get_bits(2)? as u8;
+        self.ls_type = Self::section_type(block_type)?;
         let size_format = br.get_bits(2)? as u8;
 
         let byte_needed = self.header_bytes_needed(raw[0])?;
@@ -155,20 +186,20 @@ impl LiteralsSection {
                 self.compressed_size = None;
                 match size_format {
                     0 | 2 => {
-                        //size_format actually only uses one bit
-                        //regenerated_size uses 5 bits
+                        // size_format actually only uses one bit
+                        // regenerated_size uses 5 bits
                         self.regenerated_size = u32::from(raw[0]) >> 3;
                         Ok(1)
                     }
                     1 => {
-                        //size_format uses 2 bit
-                        //regenerated_size uses 12 bits
+                        // size_format uses 2 bit
+                        // regenerated_size uses 12 bits
                         self.regenerated_size = (u32::from(raw[0]) >> 4) + (u32::from(raw[1]) << 4);
                         Ok(2)
                     }
                     3 => {
-                        //size_format uses 2 bit
-                        //regenerated_size uses 20 bits
+                        // size_format uses 2 bit
+                        // regenerated_size uses 20 bits
                         self.regenerated_size = (u32::from(raw[0]) >> 4)
                             + (u32::from(raw[1]) << 4)
                             + (u32::from(raw[2]) << 12);
@@ -194,10 +225,10 @@ impl LiteralsSection {
 
                 match size_format {
                     0 | 1 => {
-                        //Differ in num_streams see above
-                        //both regenerated and compressed sizes use 10 bit
+                        // Differ in num_streams see above
+                        // both regenerated and compressed sizes use 10 bit
 
-                        //4 from the first, six from the second byte
+                        // 4 from the first, six from the second byte
                         self.regenerated_size =
                             (u32::from(raw[0]) >> 4) + ((u32::from(raw[1]) & 0x3f) << 4);
 
@@ -207,27 +238,27 @@ impl LiteralsSection {
                         Ok(3)
                     }
                     2 => {
-                        //both regenerated and compressed sizes use 14 bit
+                        // both regenerated and compressed sizes use 14 bit
 
-                        //4 from first, full second, 2 from the third byte
+                        // 4 from first, full second, 2 from the third byte
                         self.regenerated_size = (u32::from(raw[0]) >> 4)
                             + (u32::from(raw[1]) << 4)
                             + ((u32::from(raw[2]) & 0x3) << 12);
 
-                        //6 from the third, full last byte
+                        // 6 from the third, full last byte
                         self.compressed_size =
                             Some((u32::from(raw[2]) >> 2) + (u32::from(raw[3]) << 6));
                         Ok(4)
                     }
                     3 => {
-                        //both regenerated and compressed sizes use 18 bit
+                        // both regenerated and compressed sizes use 18 bit
 
-                        //4 from first, full second, six from third byte
+                        // 4 from first, full second, six from third byte
                         self.regenerated_size = (u32::from(raw[0]) >> 4)
                             + (u32::from(raw[1]) << 4)
                             + ((u32::from(raw[2]) & 0x3F) << 12);
 
-                        //2 from third, full fourth, full fifth byte
+                        // 2 from third, full fourth, full fifth byte
                         self.compressed_size = Some(
                             (u32::from(raw[2]) >> 6)
                                 + (u32::from(raw[3]) << 2)
@@ -244,6 +275,7 @@ impl LiteralsSection {
         }
     }
 
+    /// Given the first two bits of a header, determine the type of a header.
     fn section_type(raw: u8) -> Result<LiteralsSectionType, LiteralsSectionParseError> {
         let t = raw & 0x3;
         match t {

diff --git a/src/blocks/mod.rs b/src/blocks/mod.rs
@@ -1,3 +1,10 @@
+//! In a Zstandard frame, there's a frame header, followed by one or more *blocks*.
+//!
+//! A block contains data, and a header describing how that data is encoded, as well
+//! as other misc metadata.
+//!
+//! <https://github.com/facebook/zstd/blob/dev/doc/zstd_compression_format.md#blocks>
+
 pub mod block;
 pub mod literals_section;
 pub mod sequence_section;
diff --git a/src/blocks/sequence_section.rs b/src/blocks/sequence_section.rs
@@ -1,12 +1,32 @@
+//! Utilities and representations for the second half of a block, the sequence section.
+//! This section copies literals from the literals section into the decompressed output.
+
 pub struct SequencesHeader {
     pub num_sequences: u32,
     pub modes: Option<CompressionModes>,
 }
 
+/// A sequence represents potentially redundant data, and it can be broken up into 2 steps:
+/// - A copy step, where data is copied from the literals section to the decompressed output
+/// - A *match* copy step that copies data from within the previously decompressed output.
+///
+/// <https://github.com/facebook/zstd/blob/dev/doc/zstd_compression_format.md#sequence-execution>
 #[derive(Clone, Copy)]
 pub struct Sequence {
+    /// Literal length, or the number of bytes to be copied from the literals section
+    /// in the copy step.
     pub ll: u32,
+    /// The length of the match to make during the match copy step.
     pub ml: u32,
+    /// How far back to go in the decompressed data to read from the match copy step.
+    /// If this value is greater than 3, then the offset is `of - 3`. If `of` is from 1-3,
+    /// then it has special handling:
+    ///
+    /// The first 3 values define 3 different repeated offsets, with 1 referring to the most
+    /// recent, 2 the second most recent, and so on. When the current sequence has a literal length of 0,
+    /// then the repeated offsets are shifted by 1. So an offset value of 1 refers to 2, 2 refers to 3,
+    /// and 3 refers to the most recent offset minus one. If that value is equal to zero, the data
+    /// is considered corrupted.
     pub of: u32,
 }
 
@@ -16,16 +36,27 @@ impl core::fmt::Display for Sequence {
     }
 }
 
+/// This byte defines the compression mode of each symbol type.
 #[derive(Copy, Clone)]
 pub struct CompressionModes(u8);
+/// The compression mode used for symbol compression.
 pub enum ModeType {
+    /// A predefined FSE distribution table is used, and no distribution table
+    /// will be present.
     Predefined,
+    /// The table consists of a single byte, which contains the symbol's value.
     RLE,
+    /// Standard FSE compression, a distribution table will be present. This
+    /// mode should not be used when only one symbol is present.
     FSECompressed,
+    /// The table used in the previous compressed block with at least one sequence
+    /// will be used again. If this is the first block, the table in the dictionary will
+    /// be used.
     Repeat,
 }
 
 impl CompressionModes {
+    /// Deserialize a two-bit mode value into a [ModeType].
     pub fn decode_mode(m: u8) -> ModeType {
         match m {
             0 => ModeType::Predefined,
@@ -35,15 +66,17 @@ impl CompressionModes {
             _ => panic!("This can never happen"),
         }
     }
-
+    /// Read the compression mode of the literal lengths field.
     pub fn ll_mode(self) -> ModeType {
         Self::decode_mode(self.0 >> 6)
     }
 
+    /// Read the compression mode of the offset value field.
     pub fn of_mode(self) -> ModeType {
         Self::decode_mode((self.0 >> 4) & 0x3)
     }
 
+    /// Read the compression mode of the match lengths field.
     pub fn ml_mode(self) -> ModeType {
         Self::decode_mode((self.0 >> 2) & 0x3)
     }
@@ -79,13 +112,15 @@ impl core::fmt::Display for SequencesHeaderParseError {
 }
 
 impl SequencesHeader {
+    /// Create a new [SequencesHeader].
     pub fn new() -> SequencesHeader {
         SequencesHeader {
             num_sequences: 0,
             modes: None,
         }
     }
 
+    /// Attempt to deserialize the provided buffer into `self`, returning the number of bytes read.
     pub fn parse_from_header(&mut self, source: &[u8]) -> Result<u8, SequencesHeaderParseError> {
         let mut bytes_read = 0;
         if source.is_empty() {

diff --git a/src/decoding/bit_reader.rs b/src/decoding/bit_reader.rs
@@ -1,3 +1,4 @@
+/// Interact with a provided source at a bit level.
 pub struct BitReader<'s> {
     idx: usize, //index counts bits already read
     source: &'s [u8],

diff --git a/src/decoding/bit_reader_reverse.rs b/src/decoding/bit_reader_reverse.rs
@@ -2,15 +2,21 @@ pub use super::bit_reader::GetBitsError;
 use byteorder::ByteOrder;
 use byteorder::LittleEndian;
 
+/// Zstandard encodes some types of data in a way that the data must be read
+/// back to front to decode it properly. `BitReaderReversed` provides a
+/// convenient interface to do that.
 pub struct BitReaderReversed<'s> {
     idx: isize, //index counts bits already read
     source: &'s [u8],
-
+    /// The reader doesn't read directly from the source,
+    /// it reads bits from here, and the container is
+    /// "refilled" as it's emptied.
     bit_container: u64,
     bits_in_container: u8,
 }
 
 impl<'s> BitReaderReversed<'s> {
+    /// How many bits are left to read by the reader.
     pub fn bits_remaining(&self) -> isize {
         self.idx + self.bits_in_container as isize
     }
@@ -102,6 +108,8 @@ impl<'s> BitReaderReversed<'s> {
         (self.idx - 1) / 8
     }
 
+    /// Read `n` bits from the source. Returns an error if more bits are
+    /// requested than remain to be read.
     #[inline(always)]
     pub fn get_bits(&mut self, n: u8) -> Result<u64, GetBitsError> {
         if n == 0 {
@@ -162,7 +170,7 @@ impl<'s> BitReaderReversed<'s> {
             return Ok((0, 0, 0));
         }
         if sum > 56 {
-            // try and get the values separatly
+            // try and get the values separately
             return Ok((self.get_bits(n1)?, self.get_bits(n2)?, self.get_bits(n3)?));
         }
         let sum = sum as u8;