Merge branch 'master' into bitreader_no_error

KillingSpark · May 30, 2024 · ccbe90d · ccbe90d
2 parents e70edb5 + 53e7b1a
commit ccbe90d
Show file tree

Hide file tree

Showing 26 changed files with 1,562 additions and 328 deletions.
diff --git a/Cargo.toml b/Cargo.toml
@@ -15,7 +15,6 @@ categories = ["compression"]
 [dependencies]
 byteorder = { version = "1.5", default-features = false }
 twox-hash = { version = "1.6", default-features = false, optional = true }
-derive_more = { version = "0.99", default-features = false, features = ["display", "from"] }
 
 [dev-dependencies]
 criterion = "0.5"
@@ -24,7 +23,7 @@ rand = { version = "0.8.5", features = ["small_rng"] }
 [features]
 default = ["hash", "std"]
 hash = ["dep:twox-hash"]
-std = ["derive_more/error"]
+std = []
 
 [[bench]]
 name = "reversedbitreader_bench"

diff --git a/src/blocks/block.rs b/src/blocks/block.rs
@@ -1,8 +1,16 @@
+//! Block header definitions.
+
+/// There are 4 different kinds of blocks, and the type of block influences the meaning of `Block_Size`.
 #[derive(Debug, Clone, Copy, PartialEq, Eq)]
 pub enum BlockType {
+    /// An uncompressed block.
     Raw,
+    /// A single byte, repeated `Block_Size` times (Run Length Encoding).
     RLE,
+    /// A Zstandard compressed block. `Block_Size` is the length of the compressed data.
     Compressed,
+    /// This is not a valid block, and this value should not be used.
+    /// If this value is present, it should be considered corrupted data.
     Reserved,
 }
 
@@ -17,9 +25,18 @@ impl core::fmt::Display for BlockType {
     }
 }
 
+/// A representation of a single block header. As well as containing a frame header,
+/// each Zstandard frame contains one or more blocks.
 pub struct BlockHeader {
+    /// Whether this block is the last block in the frame.
+    /// It may be followed by an optional `Content_Checksum` if it is.
     pub last_block: bool,
     pub block_type: BlockType,
+    /// The size of the decompressed data. If the block type
+    /// is [BlockType::Reserved] or [BlockType::Compressed],
+    /// this value is set to zero and should not be referenced.
     pub decompressed_size: u32,
+    /// The size of the block. If the block is [BlockType::RLE],
+    /// this value will be 1.
     pub content_size: u32,
 }
diff --git a/src/blocks/literals_section.rs b/src/blocks/literals_section.rs
@@ -1,34 +1,92 @@
+//! Utilities and representations for the first half of a block, the literals section.
+//! It contains data that is then copied from by the sequences section.
 use super::super::decoding::bit_reader::{BitReader, GetBitsError};
 
+/// A compressed block consists of two sections, a literals section, and a sequences section.
+/// This is the first of those two sections. A literal is just any arbitrary data, and it is copied by the sequences section.
 pub struct LiteralsSection {
+    /// - If this block is of type [LiteralsSectionType::Raw], then the data is `regenerated_size`
+    ///     bytes long, and it contains the raw literals data to be used during the second section,
+    ///     the sequences section.
+    /// - If this block is of type [LiteralsSectionType::RLE],
+    ///     then the literal consists of a single byte repeated `regenerated_size` times.
+    /// - For types [LiteralsSectionType::Compressed] or [LiteralsSectionType::Treeless],
+    ///     then this is the size of the decompressed data.
     pub regenerated_size: u32,
+    /// - For types [LiteralsSectionType::Raw] and [LiteralsSectionType::RLE], this value is not present.
+    /// - For types [LiteralsSectionType::Compressed] and [LiteralsSectionType::Treeless], this value will
+    ///     be set to the size of the compressed data.
     pub compressed_size: Option<u32>,
+    /// This value will be either 1 stream or 4 streams if the literal is of type
+    /// [LiteralsSectionType::Compressed] or [LiteralsSectionType::Treeless], and it
+    /// is not used for RLE or uncompressed literals.
     pub num_streams: Option<u8>,
+    /// The type of the literal section.
     pub ls_type: LiteralsSectionType,
 }
 
+/// The way in which a literals section is encoded.
 pub enum LiteralsSectionType {
+    /// Literals are stored uncompressed.
     Raw,
+    /// Literals consist of a single byte value repeated [LiteralsSection::regenerated_size] times.
     RLE,
+    /// This is a standard Huffman-compressed block, starting with a Huffman tree description.
+    /// In this mode, there are at least *2* different literals represented in the Huffman tree
+    /// description.
     Compressed,
+    /// This is a Huffman-compressed block,
+    /// using the Huffman tree from the previous [LiteralsSectionType::Compressed] block
+    /// in the sequence. If this mode is triggered without any previous Huffman-tables in the
+    /// frame (or dictionary), it should be treated as data corruption.
     Treeless,
 }
 
-#[derive(Debug, derive_more::Display, derive_more::From)]
-#[cfg_attr(feature = "std", derive(derive_more::Error))]
+#[derive(Debug)]
 #[non_exhaustive]
 pub enum LiteralsSectionParseError {
-    #[display(fmt = "Illegal literalssectiontype. Is: {got}, must be in: 0, 1, 2, 3")]
     IllegalLiteralSectionType { got: u8 },
-    #[display(fmt = "{_0:?}")]
-    #[from]
     GetBitsError(GetBitsError),
-    #[display(
-        fmt = "Not enough byte to parse the literals section header. Have: {have}, Need: {need}"
-    )]
     NotEnoughBytes { have: usize, need: u8 },
 }
 
+#[cfg(feature = "std")]
+impl std::error::Error for LiteralsSectionParseError {
+    fn source(&self) -> Option<&(dyn std::error::Error + 'static)> {
+        match self {
+            LiteralsSectionParseError::GetBitsError(source) => Some(source),
+            _ => None,
+        }
+    }
+}
+impl core::fmt::Display for LiteralsSectionParseError {
+    fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
+        match self {
+            LiteralsSectionParseError::IllegalLiteralSectionType { got } => {
+                write!(
+                    f,
+                    "Illegal literalssectiontype. Is: {}, must be in: 0, 1, 2, 3",
+                    got
+                )
+            }
+            LiteralsSectionParseError::GetBitsError(e) => write!(f, "{:?}", e),
+            LiteralsSectionParseError::NotEnoughBytes { have, need } => {
+                write!(
+                    f,
+                    "Not enough byte to parse the literals section header. Have: {}, Need: {}",
+                    have, need,
+                )
+            }
+        }
+    }
+}
+
+impl From<GetBitsError> for LiteralsSectionParseError {
+    fn from(val: GetBitsError) -> Self {
+        Self::GetBitsError(val)
+    }
+}
+
 impl core::fmt::Display for LiteralsSectionType {
     fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> Result<(), core::fmt::Error> {
         match self {
@@ -47,6 +105,7 @@ impl Default for LiteralsSection {
 }
 
 impl LiteralsSection {
+    /// Create a new [LiteralsSection].
     pub fn new() -> LiteralsSection {
         LiteralsSection {
             regenerated_size: 0,
@@ -56,25 +115,26 @@ impl LiteralsSection {
         }
     }
 
+    /// Given the first byte of a header, determine the size of the whole header, from 1 to 5 bytes.
     pub fn header_bytes_needed(&self, first_byte: u8) -> Result<u8, LiteralsSectionParseError> {
-        let ls_type = Self::section_type(first_byte)?;
+        let ls_type: LiteralsSectionType = Self::section_type(first_byte)?;
         let size_format = (first_byte >> 2) & 0x3;
         match ls_type {
             LiteralsSectionType::RLE | LiteralsSectionType::Raw => {
                 match size_format {
                     0 | 2 => {
-                        //size_format actually only uses one bit
-                        //regenerated_size uses 5 bits
+                        // size_format actually only uses one bit
+                        // regenerated_size uses 5 bits
                         Ok(1)
                     }
                     1 => {
-                        //size_format uses 2 bit
-                        //regenerated_size uses 12 bits
+                        // size_format uses 2 bit
+                        // regenerated_size uses 12 bits
                         Ok(2)
                     }
                     3 => {
-                        //size_format uses 2 bit
-                        //regenerated_size uses 20 bits
+                        // size_format uses 2 bit
+                        // regenerated_size uses 20 bits
                         Ok(3)
                     }
                     _ => panic!(
@@ -85,16 +145,16 @@ impl LiteralsSection {
             LiteralsSectionType::Compressed | LiteralsSectionType::Treeless => {
                 match size_format {
                     0 | 1 => {
-                        //Only differ in num_streams
-                        //both regenerated and compressed sizes use 10 bit
+                        // Only differ in num_streams
+                        // both regenerated and compressed sizes use 10 bit
                         Ok(3)
                     }
                     2 => {
-                        //both regenerated and compressed sizes use 14 bit
+                        // both regenerated and compressed sizes use 14 bit
                         Ok(4)
                     }
                     3 => {
-                        //both regenerated and compressed sizes use 18 bit
+                        // both regenerated and compressed sizes use 18 bit
                         Ok(5)
                     }
 
@@ -106,10 +166,11 @@ impl LiteralsSection {
         }
     }
 
+    /// Parse the header into `self` and return the number of bytes read.
     pub fn parse_from_header(&mut self, raw: &[u8]) -> Result<u8, LiteralsSectionParseError> {
-        let mut br = BitReader::new(raw);
-        let t = br.get_bits(2)? as u8;
-        self.ls_type = Self::section_type(t)?;
+        let mut br: BitReader<'_> = BitReader::new(raw);
+        let block_type = br.get_bits(2)? as u8;
+        self.ls_type = Self::section_type(block_type)?;
         let size_format = br.get_bits(2)? as u8;
 
         let byte_needed = self.header_bytes_needed(raw[0])?;
@@ -125,20 +186,20 @@ impl LiteralsSection {
                 self.compressed_size = None;
                 match size_format {
                     0 | 2 => {
-                        //size_format actually only uses one bit
-                        //regenerated_size uses 5 bits
+                        // size_format actually only uses one bit
+                        // regenerated_size uses 5 bits
                         self.regenerated_size = u32::from(raw[0]) >> 3;
                         Ok(1)
                     }
                     1 => {
-                        //size_format uses 2 bit
-                        //regenerated_size uses 12 bits
+                        // size_format uses 2 bit
+                        // regenerated_size uses 12 bits
                         self.regenerated_size = (u32::from(raw[0]) >> 4) + (u32::from(raw[1]) << 4);
                         Ok(2)
                     }
                     3 => {
-                        //size_format uses 2 bit
-                        //regenerated_size uses 20 bits
+                        // size_format uses 2 bit
+                        // regenerated_size uses 20 bits
                         self.regenerated_size = (u32::from(raw[0]) >> 4)
                             + (u32::from(raw[1]) << 4)
                             + (u32::from(raw[2]) << 12);
@@ -164,10 +225,10 @@ impl LiteralsSection {
 
                 match size_format {
                     0 | 1 => {
-                        //Differ in num_streams see above
-                        //both regenerated and compressed sizes use 10 bit
+                        // Differ in num_streams see above
+                        // both regenerated and compressed sizes use 10 bit
 
-                        //4 from the first, six from the second byte
+                        // 4 from the first, six from the second byte
                         self.regenerated_size =
                             (u32::from(raw[0]) >> 4) + ((u32::from(raw[1]) & 0x3f) << 4);
 
@@ -177,27 +238,27 @@ impl LiteralsSection {
                         Ok(3)
                     }
                     2 => {
-                        //both regenerated and compressed sizes use 14 bit
+                        // both regenerated and compressed sizes use 14 bit
 
-                        //4 from first, full second, 2 from the third byte
+                        // 4 from first, full second, 2 from the third byte
                         self.regenerated_size = (u32::from(raw[0]) >> 4)
                             + (u32::from(raw[1]) << 4)
                             + ((u32::from(raw[2]) & 0x3) << 12);
 
-                        //6 from the third, full last byte
+                        // 6 from the third, full last byte
                         self.compressed_size =
                             Some((u32::from(raw[2]) >> 2) + (u32::from(raw[3]) << 6));
                         Ok(4)
                     }
                     3 => {
-                        //both regenerated and compressed sizes use 18 bit
+                        // both regenerated and compressed sizes use 18 bit
 
-                        //4 from first, full second, six from third byte
+                        // 4 from first, full second, six from third byte
                         self.regenerated_size = (u32::from(raw[0]) >> 4)
                             + (u32::from(raw[1]) << 4)
                             + ((u32::from(raw[2]) & 0x3F) << 12);
 
-                        //2 from third, full fourth, full fifth byte
+                        // 2 from third, full fourth, full fifth byte
                         self.compressed_size = Some(
                             (u32::from(raw[2]) >> 6)
                                 + (u32::from(raw[3]) << 2)
@@ -214,6 +275,7 @@ impl LiteralsSection {
         }
     }
 
+    /// Determine the literals section type from the lowest two bits of `raw`; higher bits are ignored.
     fn section_type(raw: u8) -> Result<LiteralsSectionType, LiteralsSectionParseError> {
         let t = raw & 0x3;
         match t {

diff --git a/src/blocks/mod.rs b/src/blocks/mod.rs
@@ -1,3 +1,10 @@
+//! In a Zstandard frame, there's a frame header, followed by one or more *blocks*.
+//!
+//! A block contains data, and a header describing how that data is encoded, as well
+//! as other misc metadata.
+//!
+//! <https://github.com/facebook/zstd/blob/dev/doc/zstd_compression_format.md#blocks>
+
 pub mod block;
 pub mod literals_section;
 pub mod sequence_section;