Skip to content

Commit

Permalink
Merge branch 'master' into bitreader_no_error
Browse files Browse the repository at this point in the history
  • Loading branch information
KillingSpark committed May 30, 2024
2 parents e70edb5 + 53e7b1a commit ccbe90d
Show file tree
Hide file tree
Showing 26 changed files with 1,562 additions and 328 deletions.
3 changes: 1 addition & 2 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,6 @@ categories = ["compression"]
[dependencies]
byteorder = { version = "1.5", default-features = false }
twox-hash = { version = "1.6", default-features = false, optional = true }
derive_more = { version = "0.99", default-features = false, features = ["display", "from"] }

[dev-dependencies]
criterion = "0.5"
Expand All @@ -24,7 +23,7 @@ rand = { version = "0.8.5", features = ["small_rng"] }
[features]
default = ["hash", "std"]
hash = ["dep:twox-hash"]
std = ["derive_more/error"]
std = []

[[bench]]
name = "reversedbitreader_bench"
Expand Down
17 changes: 17 additions & 0 deletions src/blocks/block.rs
Original file line number Diff line number Diff line change
@@ -1,8 +1,16 @@
//! Block header definitions.

/// The kind of a block, as stored in its [BlockHeader].
///
/// There are 4 different kinds of blocks, and the type of block influences the meaning of `Block_Size`.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum BlockType {
/// An uncompressed block.
Raw,
/// A single byte, repeated `Block_Size` times (Run Length Encoding).
RLE,
/// A Zstandard compressed block. `Block_Size` is the length of the compressed data.
Compressed,
/// This is not a valid block, and this value should not be used.
/// If this value is present, it should be considered corrupted data.
Reserved,
}

Expand All @@ -17,9 +25,18 @@ impl core::fmt::Display for BlockType {
}
}

/// A representation of a single block header. As well as containing a frame header,
/// each Zstandard frame contains one or more blocks.
pub struct BlockHeader {
/// Whether this block is the last block in the frame.
/// It may be followed by an optional `Content_Checksum` if it is.
pub last_block: bool,
/// The kind of this block; see [BlockType] for the possible values.
pub block_type: BlockType,
/// The size of the decompressed data. If the block type
/// is [BlockType::Reserved] or [BlockType::Compressed],
/// this value is set to zero and should not be referenced.
pub decompressed_size: u32,
/// The size of the block. If the block is [BlockType::RLE],
/// this value will be 1.
pub content_size: u32,
}
136 changes: 99 additions & 37 deletions src/blocks/literals_section.rs
Original file line number Diff line number Diff line change
@@ -1,34 +1,92 @@
//! Utilities and representations for the first half of a block, the literals section.
//! It contains the literal data that the sequences section later copies from.
use super::super::decoding::bit_reader::{BitReader, GetBitsError};

/// A compressed block consists of two sections, a literals section, and a sequences section.
/// This is the first of those two sections. A literal is just any arbitrary data, and it is
/// copied by the sequences section.
pub struct LiteralsSection {
/// - If this block is of type [LiteralsSectionType::Raw], then the data is `regenerated_size`
/// bytes long, and it contains the raw literals data to be used during the second section,
/// the sequences section.
/// - If this block is of type [LiteralsSectionType::RLE],
/// then the literal consists of a single byte repeated `regenerated_size` times.
/// - For types [LiteralsSectionType::Compressed] or [LiteralsSectionType::Treeless],
/// then this is the size of the decompressed data.
pub regenerated_size: u32,
/// - For types [LiteralsSectionType::Raw] and [LiteralsSectionType::RLE], this value is not present.
/// - For types [LiteralsSectionType::Compressed] and [LiteralsSectionType::Treeless], this value will
/// be set to the size of the compressed data.
pub compressed_size: Option<u32>,
/// This value will be either 1 stream or 4 streams if the literal is of type
/// [LiteralsSectionType::Compressed] or [LiteralsSectionType::Treeless], and it
/// is not used for RLE or uncompressed literals.
pub num_streams: Option<u8>,
/// The type of the literal section.
pub ls_type: LiteralsSectionType,
}

/// The way in which a literals section is encoded.
pub enum LiteralsSectionType {
/// Literals are stored uncompressed.
Raw,
/// Literals consist of a single byte value repeated [LiteralsSection::regenerated_size] times.
RLE,
/// This is a standard Huffman-compressed block, starting with a Huffman tree description.
/// In this mode, there are at least *2* different literals represented in the Huffman tree
/// description.
Compressed,
/// This is a Huffman-compressed block,
/// using the Huffman tree from the previous [LiteralsSectionType::Compressed] block
/// in the sequence. If this mode is triggered without any previous Huffman-tables in the
/// frame (or dictionary), it should be treated as data corruption.
Treeless,
}

#[derive(Debug, derive_more::Display, derive_more::From)]
#[cfg_attr(feature = "std", derive(derive_more::Error))]
#[derive(Debug)]
#[non_exhaustive]
pub enum LiteralsSectionParseError {
#[display(fmt = "Illegal literalssectiontype. Is: {got}, must be in: 0, 1, 2, 3")]
IllegalLiteralSectionType { got: u8 },
#[display(fmt = "{_0:?}")]
#[from]
GetBitsError(GetBitsError),
#[display(
fmt = "Not enough byte to parse the literals section header. Have: {have}, Need: {need}"
)]
NotEnoughBytes { have: usize, need: u8 },
}

#[cfg(feature = "std")]
impl std::error::Error for LiteralsSectionParseError {
    /// Returns the wrapped [GetBitsError] when this error carries one,
    /// and `None` for every other variant.
    fn source(&self) -> Option<&(dyn std::error::Error + 'static)> {
        if let Self::GetBitsError(inner) = self {
            Some(inner)
        } else {
            None
        }
    }
}
impl core::fmt::Display for LiteralsSectionParseError {
fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
match self {
LiteralsSectionParseError::IllegalLiteralSectionType { got } => {
write!(
f,
"Illegal literalssectiontype. Is: {}, must be in: 0, 1, 2, 3",
got
)
}
LiteralsSectionParseError::GetBitsError(e) => write!(f, "{:?}", e),
LiteralsSectionParseError::NotEnoughBytes { have, need } => {
write!(
f,
"Not enough byte to parse the literals section header. Have: {}, Need: {}",
have, need,
)
}
}
}
}

impl From<GetBitsError> for LiteralsSectionParseError {
fn from(val: GetBitsError) -> Self {
Self::GetBitsError(val)
}
}

impl core::fmt::Display for LiteralsSectionType {
fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> Result<(), core::fmt::Error> {
match self {
Expand All @@ -47,6 +105,7 @@ impl Default for LiteralsSection {
}

impl LiteralsSection {
/// Create a new [LiteralsSection].
pub fn new() -> LiteralsSection {
LiteralsSection {
regenerated_size: 0,
Expand All @@ -56,25 +115,26 @@ impl LiteralsSection {
}
}

/// Given the first byte of a header, determine the size of the whole header, from 1 to 5 bytes.
pub fn header_bytes_needed(&self, first_byte: u8) -> Result<u8, LiteralsSectionParseError> {
let ls_type = Self::section_type(first_byte)?;
let ls_type: LiteralsSectionType = Self::section_type(first_byte)?;
let size_format = (first_byte >> 2) & 0x3;
match ls_type {
LiteralsSectionType::RLE | LiteralsSectionType::Raw => {
match size_format {
0 | 2 => {
//size_format actually only uses one bit
//regenerated_size uses 5 bits
// size_format actually only uses one bit
// regenerated_size uses 5 bits
Ok(1)
}
1 => {
//size_format uses 2 bit
//regenerated_size uses 12 bits
// size_format uses 2 bit
// regenerated_size uses 12 bits
Ok(2)
}
3 => {
//size_format uses 2 bit
//regenerated_size uses 20 bits
// size_format uses 2 bit
// regenerated_size uses 20 bits
Ok(3)
}
_ => panic!(
Expand All @@ -85,16 +145,16 @@ impl LiteralsSection {
LiteralsSectionType::Compressed | LiteralsSectionType::Treeless => {
match size_format {
0 | 1 => {
//Only differ in num_streams
//both regenerated and compressed sizes use 10 bit
// Only differ in num_streams
// both regenerated and compressed sizes use 10 bit
Ok(3)
}
2 => {
//both regenerated and compressed sizes use 14 bit
// both regenerated and compressed sizes use 14 bit
Ok(4)
}
3 => {
//both regenerated and compressed sizes use 18 bit
// both regenerated and compressed sizes use 18 bit
Ok(5)
}

Expand All @@ -106,10 +166,11 @@ impl LiteralsSection {
}
}

/// Parse the header into `self`, and returns the number of bytes read.
pub fn parse_from_header(&mut self, raw: &[u8]) -> Result<u8, LiteralsSectionParseError> {
let mut br = BitReader::new(raw);
let t = br.get_bits(2)? as u8;
self.ls_type = Self::section_type(t)?;
let mut br: BitReader<'_> = BitReader::new(raw);
let block_type = br.get_bits(2)? as u8;
self.ls_type = Self::section_type(block_type)?;
let size_format = br.get_bits(2)? as u8;

let byte_needed = self.header_bytes_needed(raw[0])?;
Expand All @@ -125,20 +186,20 @@ impl LiteralsSection {
self.compressed_size = None;
match size_format {
0 | 2 => {
//size_format actually only uses one bit
//regenerated_size uses 5 bits
// size_format actually only uses one bit
// regenerated_size uses 5 bits
self.regenerated_size = u32::from(raw[0]) >> 3;
Ok(1)
}
1 => {
//size_format uses 2 bit
//regenerated_size uses 12 bits
// size_format uses 2 bit
// regenerated_size uses 12 bits
self.regenerated_size = (u32::from(raw[0]) >> 4) + (u32::from(raw[1]) << 4);
Ok(2)
}
3 => {
//size_format uses 2 bit
//regenerated_size uses 20 bits
// size_format uses 2 bit
// regenerated_size uses 20 bits
self.regenerated_size = (u32::from(raw[0]) >> 4)
+ (u32::from(raw[1]) << 4)
+ (u32::from(raw[2]) << 12);
Expand All @@ -164,10 +225,10 @@ impl LiteralsSection {

match size_format {
0 | 1 => {
//Differ in num_streams see above
//both regenerated and compressed sizes use 10 bit
// Differ in num_streams see above
// both regenerated and compressed sizes use 10 bit

//4 from the first, six from the second byte
// 4 from the first, six from the second byte
self.regenerated_size =
(u32::from(raw[0]) >> 4) + ((u32::from(raw[1]) & 0x3f) << 4);

Expand All @@ -177,27 +238,27 @@ impl LiteralsSection {
Ok(3)
}
2 => {
//both regenerated and compressed sizes use 14 bit
// both regenerated and compressed sizes use 14 bit

//4 from first, full second, 2 from the third byte
// 4 from first, full second, 2 from the third byte
self.regenerated_size = (u32::from(raw[0]) >> 4)
+ (u32::from(raw[1]) << 4)
+ ((u32::from(raw[2]) & 0x3) << 12);

//6 from the third, full last byte
// 6 from the third, full last byte
self.compressed_size =
Some((u32::from(raw[2]) >> 2) + (u32::from(raw[3]) << 6));
Ok(4)
}
3 => {
//both regenerated and compressed sizes use 18 bit
// both regenerated and compressed sizes use 18 bit

//4 from first, full second, six from third byte
// 4 from first, full second, six from third byte
self.regenerated_size = (u32::from(raw[0]) >> 4)
+ (u32::from(raw[1]) << 4)
+ ((u32::from(raw[2]) & 0x3F) << 12);

//2 from third, full fourth, full fifth byte
// 2 from third, full fourth, full fifth byte
self.compressed_size = Some(
(u32::from(raw[2]) >> 6)
+ (u32::from(raw[3]) << 2)
Expand All @@ -214,6 +275,7 @@ impl LiteralsSection {
}
}

/// Given the first two bits of a header, determine the type of a header.
fn section_type(raw: u8) -> Result<LiteralsSectionType, LiteralsSectionParseError> {
let t = raw & 0x3;
match t {
Expand Down
7 changes: 7 additions & 0 deletions src/blocks/mod.rs
Original file line number Diff line number Diff line change
@@ -1,3 +1,10 @@
//! In a Zstandard frame, there's a frame header, followed by one or more *blocks*.
//!
//! A block contains data and a header describing how that data is encoded, as well
//! as other miscellaneous metadata.
//!
//! <https://github.com/facebook/zstd/blob/dev/doc/zstd_compression_format.md#blocks>

pub mod block;
pub mod literals_section;
pub mod sequence_section;
Loading

0 comments on commit ccbe90d

Please sign in to comment.