diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml new file mode 100644 index 0000000..7d5a729 --- /dev/null +++ b/.github/workflows/ci.yml @@ -0,0 +1,56 @@ +name: CI + +on: + push: + branches: [ "develop" ] + pull_request: { } + workflow_dispatch: { } + +permissions: + actions: read + contents: read + +jobs: + build: + name: 'build' + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + + - name: Rust Version + id: rust-version + shell: bash + run: echo "version=$(cat rust-toolchain.toml | grep channel | awk -F'\"' '{print $2}')" >> $GITHUB_OUTPUT + + - name: Rust Toolchain + id: rust-toolchain + uses: dtolnay/rust-toolchain@master + if: steps.rustup-cache.outputs.cache-hit != 'true' + with: + toolchain: "${{ steps.rust-version.outputs.version }}" + components: clippy, rustfmt + + - name: Rust Dependency Cache + uses: Swatinem/rust-cache@v2 + with: + save-if: ${{ github.ref == 'refs/heads/develop' }} + shared-key: "shared" # To allow reuse across jobs + + - name: Rust Compile Cache + uses: mozilla-actions/sccache-action@v0.0.5 + - name: Rust Compile Cache Config + shell: bash + # echo "CARGO_LOG=cargo::core::compiler::fingerprint=info" >> $GITHUB_ENV + run: | + echo "SCCACHE_GHA_ENABLED=true" >> $GITHUB_ENV + echo "RUSTC_WRAPPER=sccache" >> $GITHUB_ENV + echo "CARGO_INCREMENTAL=0" >> $GITHUB_ENV + + - name: Rust Build + run: cargo build --all-features --all-targets + - name: Rust Lint - Format + run: cargo fmt --all --check + - name: Rust Lint - Clippy + run: cargo clippy --all-features --all-targets + - name: Rust Test + run: cargo test --workspace --all-features \ No newline at end of file diff --git a/.github/workflows/release-plz.yml b/.github/workflows/release-plz.yml new file mode 100644 index 0000000..4e6f481 --- /dev/null +++ b/.github/workflows/release-plz.yml @@ -0,0 +1,28 @@ +name: Release-plz + +permissions: + pull-requests: write + contents: write + +# TODO(aduffy): uncomment when we're ready to publish +on: {} + # push: + # 
branches: + # - develop + +jobs: + release-plz: + name: Release-plz + runs-on: ubuntu-latest + steps: + - name: Checkout repository + uses: actions/checkout@v4 + with: + fetch-depth: 0 + - name: Install Rust toolchain + uses: dtolnay/rust-toolchain@stable + - name: Run release-plz + uses: MarcoIeni/release-plz-action@v0.5 + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + CARGO_REGISTRY_TOKEN: ${{ secrets.CARGO_REGISTRY_TOKEN }} diff --git a/.gitignore b/.gitignore index a5ff07f..8b196e9 100644 --- a/.gitignore +++ b/.gitignore @@ -1,4 +1,5 @@ /target +.idea/ # Added by cargo diff --git a/Cargo.lock b/Cargo.lock index 414f6c6..48d9198 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2,6 +2,630 @@ # It is not intended for manual editing. version = 3 +[[package]] +name = "aho-corasick" +version = "1.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8e60d3430d3a69478ad0993f19238d2df97c507009a52b3c10addcd7f6bcb916" +dependencies = [ + "memchr", +] + +[[package]] +name = "anes" +version = "0.1.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4b46cbb362ab8752921c97e041f5e366ee6297bd428a31275b9fcf1e380f7299" + +[[package]] +name = "anstyle" +version = "1.0.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1bec1de6f59aedf83baf9ff929c98f2ad654b97c9510f4e70cf6f661d49fd5b1" + +[[package]] +name = "autocfg" +version = "1.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0c4b4d0bd25bd0b74681c0ad21497610ce1b7c91b1022cd21c80c6fbdd9476b0" + +[[package]] +name = "bumpalo" +version = "3.16.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "79296716171880943b8470b5f8d03aa55eb2e645a4874bdbb28adb49162e012c" + +[[package]] +name = "cast" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "37b2a672a2cb129a2e41c10b1224bb368f9f37a2b16b612598138befd7b37eb5" + +[[package]] +name = "cc" 
+version = "1.1.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e9e8aabfac534be767c909e0690571677d49f41bd8465ae876fe043d52ba5292" + +[[package]] +name = "cfg-if" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd" + +[[package]] +name = "ciborium" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "42e69ffd6f0917f5c029256a24d0161db17cea3997d185db0d35926308770f0e" +dependencies = [ + "ciborium-io", + "ciborium-ll", + "serde", +] + +[[package]] +name = "ciborium-io" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "05afea1e0a06c9be33d539b876f1ce3692f4afea2cb41f740e7743225ed1c757" + +[[package]] +name = "ciborium-ll" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "57663b653d948a338bfb3eeba9bb2fd5fcfaecb9e199e87e1eda4d9e8b240fd9" +dependencies = [ + "ciborium-io", + "half", +] + +[[package]] +name = "clap" +version = "4.5.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "11d8838454fda655dafd3accb2b6e2bea645b9e4078abe84a22ceb947235c5cc" +dependencies = [ + "clap_builder", +] + +[[package]] +name = "clap_builder" +version = "4.5.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "216aec2b177652e3846684cbfe25c9964d18ec45234f0f5da5157b207ed1aab6" +dependencies = [ + "anstyle", + "clap_lex", +] + +[[package]] +name = "clap_lex" +version = "0.7.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1462739cb27611015575c0c11df5df7601141071f07518d56fcc1be504cbec97" + +[[package]] +name = "criterion" +version = "0.5.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f2b12d017a929603d80db1831cd3a24082f8137ce19c69e6447f54f5fc8d692f" +dependencies = [ + "anes", + "cast", + 
"ciborium", + "clap", + "criterion-plot", + "is-terminal", + "itertools", + "num-traits", + "once_cell", + "oorandom", + "plotters", + "rayon", + "regex", + "serde", + "serde_derive", + "serde_json", + "tinytemplate", + "walkdir", +] + +[[package]] +name = "criterion-plot" +version = "0.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6b50826342786a51a89e2da3a28f1c32b06e387201bc2d19791f622c673706b1" +dependencies = [ + "cast", + "itertools", +] + +[[package]] +name = "crossbeam-deque" +version = "0.8.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "613f8cc01fe9cf1a3eb3d7f488fd2fa8388403e97039e2f73692932e291a770d" +dependencies = [ + "crossbeam-epoch", + "crossbeam-utils", +] + +[[package]] +name = "crossbeam-epoch" +version = "0.9.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5b82ac4a3c2ca9c3460964f020e1402edd5753411d7737aa39c3714ad1b5420e" +dependencies = [ + "crossbeam-utils", +] + +[[package]] +name = "crossbeam-utils" +version = "0.8.20" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "22ec99545bb0ed0ea7bb9b8e1e9122ea386ff8a48c0922e43f36d45ab09e0e80" + +[[package]] +name = "crunchy" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7a81dae078cea95a014a339291cec439d2f232ebe854a9d672b796c6afafa9b7" + +[[package]] +name = "either" +version = "1.13.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "60b1af1c220855b6ceac025d3f6ecdd2b7c4894bfe9cd9bda4fbb4bc7c0d4cf0" + [[package]] name = "fsst-rs" -version = "0.1.0" +version = "0.0.1" +dependencies = [ + "criterion", + "lz4", +] + +[[package]] +name = "half" +version = "2.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6dd08c532ae367adf81c312a4580bc67f1d0fe8bc9c460520283f4c0ff277888" +dependencies = [ + "cfg-if", + "crunchy", +] + +[[package]] +name = "hermit-abi" +version 
= "0.3.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d231dfb89cfffdbc30e7fc41579ed6066ad03abda9e567ccafae602b97ec5024" + +[[package]] +name = "is-terminal" +version = "0.4.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f23ff5ef2b80d608d61efee834934d862cd92461afc0560dedf493e4c033738b" +dependencies = [ + "hermit-abi", + "libc", + "windows-sys 0.52.0", +] + +[[package]] +name = "itertools" +version = "0.10.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b0fd2260e829bddf4cb6ea802289de2f86d6a7a690192fbe91b3f46e0f2c8473" +dependencies = [ + "either", +] + +[[package]] +name = "itoa" +version = "1.0.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "49f1f14873335454500d59611f1cf4a4b0f786f9ac11f4312a78e4cf2566695b" + +[[package]] +name = "js-sys" +version = "0.3.69" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "29c15563dc2726973df627357ce0c9ddddbea194836909d655df6a75d2cf296d" +dependencies = [ + "wasm-bindgen", +] + +[[package]] +name = "libc" +version = "0.2.155" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "97b3888a4aecf77e811145cadf6eef5901f4782c53886191b2f693f24761847c" + +[[package]] +name = "log" +version = "0.4.22" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a7a70ba024b9dc04c27ea2f0c0548feb474ec5c54bba33a7f72f873a39d07b24" + +[[package]] +name = "lz4" +version = "1.26.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "958b4caa893816eea05507c20cfe47574a43d9a697138a7872990bba8a0ece68" +dependencies = [ + "libc", + "lz4-sys", +] + +[[package]] +name = "lz4-sys" +version = "1.10.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "109de74d5d2353660401699a4174a4ff23fcc649caf553df71933c7fb45ad868" +dependencies = [ + "cc", + "libc", +] + +[[package]] +name = "memchr" +version = 
"2.7.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "78ca9ab1a0babb1e7d5695e3530886289c18cf2f87ec19a575a0abdce112e3a3" + +[[package]] +name = "num-traits" +version = "0.2.19" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "071dfc062690e90b734c0b2273ce72ad0ffa95f0c74596bc250dcfd960262841" +dependencies = [ + "autocfg", +] + +[[package]] +name = "once_cell" +version = "1.19.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3fdb12b2476b595f9358c5161aa467c2438859caa136dec86c26fdd2efe17b92" + +[[package]] +name = "oorandom" +version = "11.1.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b410bbe7e14ab526a0e86877eb47c6996a2bd7746f027ba551028c925390e4e9" + +[[package]] +name = "plotters" +version = "0.3.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a15b6eccb8484002195a3e44fe65a4ce8e93a625797a063735536fd59cb01cf3" +dependencies = [ + "num-traits", + "plotters-backend", + "plotters-svg", + "wasm-bindgen", + "web-sys", +] + +[[package]] +name = "plotters-backend" +version = "0.3.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "414cec62c6634ae900ea1c56128dfe87cf63e7caece0852ec76aba307cebadb7" + +[[package]] +name = "plotters-svg" +version = "0.3.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "81b30686a7d9c3e010b84284bdd26a29f2138574f52f5eb6f794fc0ad924e705" +dependencies = [ + "plotters-backend", +] + +[[package]] +name = "proc-macro2" +version = "1.0.86" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5e719e8df665df0d1c8fbfd238015744736151d4445ec0836b8e628aae103b77" +dependencies = [ + "unicode-ident", +] + +[[package]] +name = "quote" +version = "1.0.36" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0fa76aaf39101c457836aec0ce2316dbdc3ab723cdda1c6bd4e6ad4208acaca7" +dependencies = [ 
+ "proc-macro2", +] + +[[package]] +name = "rayon" +version = "1.10.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b418a60154510ca1a002a752ca9714984e21e4241e804d32555251faf8b78ffa" +dependencies = [ + "either", + "rayon-core", +] + +[[package]] +name = "rayon-core" +version = "1.12.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1465873a3dfdaa8ae7cb14b4383657caab0b3e8a0aa9ae8e04b044854c8dfce2" +dependencies = [ + "crossbeam-deque", + "crossbeam-utils", +] + +[[package]] +name = "regex" +version = "1.10.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4219d74c6b67a3654a9fbebc4b419e22126d13d2f3c4a07ee0cb61ff79a79619" +dependencies = [ + "aho-corasick", + "memchr", + "regex-automata", + "regex-syntax", +] + +[[package]] +name = "regex-automata" +version = "0.4.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "38caf58cc5ef2fed281f89292ef23f6365465ed9a41b7a7754eb4e26496c92df" +dependencies = [ + "aho-corasick", + "memchr", + "regex-syntax", +] + +[[package]] +name = "regex-syntax" +version = "0.8.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7a66a03ae7c801facd77a29370b4faec201768915ac14a721ba36f20bc9c209b" + +[[package]] +name = "ryu" +version = "1.0.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f3cb5ba0dc43242ce17de99c180e96db90b235b8a9fdc9543c96d2209116bd9f" + +[[package]] +name = "same-file" +version = "1.0.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "93fc1dc3aaa9bfed95e02e6eadabb4baf7e3078b0bd1b4d7b6b0b68378900502" +dependencies = [ + "winapi-util", +] + +[[package]] +name = "serde" +version = "1.0.206" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5b3e4cd94123dd520a128bcd11e34d9e9e423e7e3e50425cb1b4b1e3549d0284" +dependencies = [ + "serde_derive", +] + +[[package]] +name = "serde_derive" 
+version = "1.0.206" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fabfb6138d2383ea8208cf98ccf69cdfb1aff4088460681d84189aa259762f97" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "serde_json" +version = "1.0.124" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "66ad62847a56b3dba58cc891acd13884b9c61138d330c0d7b6181713d4fce38d" +dependencies = [ + "itoa", + "memchr", + "ryu", + "serde", +] + +[[package]] +name = "syn" +version = "2.0.74" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1fceb41e3d546d0bd83421d3409b1460cc7444cd389341a4c880fe7a042cb3d7" +dependencies = [ + "proc-macro2", + "quote", + "unicode-ident", +] + +[[package]] +name = "tinytemplate" +version = "1.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "be4d6b5f19ff7664e8c98d03e2139cb510db9b0a60b55f8e8709b689d939b6bc" +dependencies = [ + "serde", + "serde_json", +] + +[[package]] +name = "unicode-ident" +version = "1.0.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3354b9ac3fae1ff6755cb6db53683adb661634f67557942dea4facebec0fee4b" + +[[package]] +name = "walkdir" +version = "2.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "29790946404f91d9c5d06f9874efddea1dc06c5efe94541a7d6863108e3a5e4b" +dependencies = [ + "same-file", + "winapi-util", +] + +[[package]] +name = "wasm-bindgen" +version = "0.2.92" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4be2531df63900aeb2bca0daaaddec08491ee64ceecbee5076636a3b026795a8" +dependencies = [ + "cfg-if", + "wasm-bindgen-macro", +] + +[[package]] +name = "wasm-bindgen-backend" +version = "0.2.92" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "614d787b966d3989fa7bb98a654e369c762374fd3213d212cfc0251257e747da" +dependencies = [ + "bumpalo", + "log", + "once_cell", + "proc-macro2", + 
"quote", + "syn", + "wasm-bindgen-shared", +] + +[[package]] +name = "wasm-bindgen-macro" +version = "0.2.92" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a1f8823de937b71b9460c0c34e25f3da88250760bec0ebac694b49997550d726" +dependencies = [ + "quote", + "wasm-bindgen-macro-support", +] + +[[package]] +name = "wasm-bindgen-macro-support" +version = "0.2.92" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e94f17b526d0a461a191c78ea52bbce64071ed5c04c9ffe424dcb38f74171bb7" +dependencies = [ + "proc-macro2", + "quote", + "syn", + "wasm-bindgen-backend", + "wasm-bindgen-shared", +] + +[[package]] +name = "wasm-bindgen-shared" +version = "0.2.92" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "af190c94f2773fdb3729c55b007a722abb5384da03bc0986df4c289bf5567e96" + +[[package]] +name = "web-sys" +version = "0.3.69" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "77afa9a11836342370f4817622a2f0f418b134426d91a82dfb48f532d2ec13ef" +dependencies = [ + "js-sys", + "wasm-bindgen", +] + +[[package]] +name = "winapi-util" +version = "0.1.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cf221c93e13a30d793f7645a0e7762c55d169dbb0a49671918a2319d289b10bb" +dependencies = [ + "windows-sys 0.59.0", +] + +[[package]] +name = "windows-sys" +version = "0.52.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "282be5f36a8ce781fad8c8ae18fa3f9beff57ec1b52cb3de0789201425d9a33d" +dependencies = [ + "windows-targets", +] + +[[package]] +name = "windows-sys" +version = "0.59.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1e38bc4d79ed67fd075bcc251a1c39b32a1776bbe92e5bef1f0bf1f8c531853b" +dependencies = [ + "windows-targets", +] + +[[package]] +name = "windows-targets" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"9b724f72796e036ab90c1021d4780d4d3d648aca59e491e6b98e725b84e99973" +dependencies = [ + "windows_aarch64_gnullvm", + "windows_aarch64_msvc", + "windows_i686_gnu", + "windows_i686_gnullvm", + "windows_i686_msvc", + "windows_x86_64_gnu", + "windows_x86_64_gnullvm", + "windows_x86_64_msvc", +] + +[[package]] +name = "windows_aarch64_gnullvm" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "32a4622180e7a0ec044bb555404c800bc9fd9ec262ec147edd5989ccd0c02cd3" + +[[package]] +name = "windows_aarch64_msvc" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "09ec2a7bb152e2252b53fa7803150007879548bc709c039df7627cabbd05d469" + +[[package]] +name = "windows_i686_gnu" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8e9b5ad5ab802e97eb8e295ac6720e509ee4c243f69d781394014ebfe8bbfa0b" + +[[package]] +name = "windows_i686_gnullvm" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0eee52d38c090b3caa76c563b86c3a4bd71ef1a819287c19d586d7334ae8ed66" + +[[package]] +name = "windows_i686_msvc" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "240948bc05c5e7c6dabba28bf89d89ffce3e303022809e73deaefe4f6ec56c66" + +[[package]] +name = "windows_x86_64_gnu" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "147a5c80aabfbf0c7d901cb5895d1de30ef2907eb21fbbab29ca94c5b08b1a78" + +[[package]] +name = "windows_x86_64_gnullvm" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "24d5b23dc417412679681396f2b49f3de8c1473deb516bd34410872eff51ed0d" + +[[package]] +name = "windows_x86_64_msvc" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "589f6da84c646204747d1270a2a5661ea66ed1cced2631d546fdfb155959f9ec" diff --git a/Cargo.toml 
b/Cargo.toml index c90cd98..030301c 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,28 @@ [package] name = "fsst-rs" -version = "0.1.0" +version = "0.0.1" edition = "2021" -[dependencies] +[lints.rust] +warnings = "deny" +missing_docs = "deny" + +[lints.clippy] +all = { level = "deny", priority = -1 } +if_then_some_else_none = { level = "deny" } +mem_forget = { level = "deny" } +or_fun_call = "deny" +panic_in_result_fn = { level = "deny" } +same_name_method = { level = "deny" } +tests_outside_test_module = { level = "deny" } +unwrap_in_result = { level = "deny" } +use_debug = { level = "deny" } + +[dev-dependencies] +criterion = "0.5" +lz4 = "1" + +[[bench]] +name = "compress" +harness = false +bench = true diff --git a/README.md b/README.md index f0911f4..d6957db 100644 --- a/README.md +++ b/README.md @@ -1 +1,16 @@ -# fsst \ No newline at end of file +# fsst-rs + +A pure-Rust, zero-dependency implementation of the [FSST string compression algorithm][whitepaper]. + +FSST is a string compression algorithm meant for use in database systems. It was designed by +[Peter Boncz, Thomas Neumann, and Viktor Leis][whitepaper]. It provides 1-3GB/sec compression +and decompression of strings at compression rates competitive with or better than LZ4. + +This implementation is somewhat inspired by the [MIT-licensed implementation] from the paper authors, written in C++, +but it is mostly written from a careful reading of the paper. + +**NOTE: This current implementation is still in-progress and is not production ready, please use at your own risk.** + + +[whitepaper]: https://www.vldb.org/pvldb/vol13/p2649-boncz.pdf +[MIT-licensed implementation]: https://github.com/cwida/fsst diff --git a/benches/compress.rs b/benches/compress.rs new file mode 100644 index 0000000..829b7e6 --- /dev/null +++ b/benches/compress.rs @@ -0,0 +1,109 @@ +//! Compression benchmark. +//! +//! Contains benchmarks for FSST compression, decompression, and symbol table training. +//! +//! 
Also contains LZ4 baseline. +#![allow(missing_docs)] +use std::io::{Cursor, Read, Write}; + +use criterion::{black_box, criterion_group, criterion_main, Criterion}; +use lz4::liblz4::BlockChecksum; +use lz4::{BlockSize, ContentChecksum}; + +use fsst_rs::{train, Code}; + +const CORPUS: &str = include_str!("dracula.txt"); +const TEST: &str = "I found my smattering of German very useful here"; + +fn bench_fsst(c: &mut Criterion) { + let mut group = c.benchmark_group("fsst"); + group.bench_function("train", |b| { + let corpus = CORPUS.as_bytes(); + b.iter(|| black_box(train(black_box(corpus)))); + }); + + let table = train(CORPUS); + let plaintext = TEST.as_bytes(); + + let compressed = table.compress(plaintext); + let escape_count = compressed + .iter() + .filter(|b| **b == Code::ESCAPE_CODE) + .count(); + let ratio = (plaintext.len() as f64) / (compressed.len() as f64); + println!( + "Escapes = {escape_count}/{}, compression_ratio = {ratio}", + compressed.len() + ); + + assert_eq!(table.decompress(&compressed), TEST.as_bytes()); + + group.bench_function("compress-single", |b| { + b.iter(|| black_box(table.compress(black_box(plaintext)))); + }); + + group.bench_function("decompress-single", |b| { + b.iter(|| black_box(table.decompress(black_box(&compressed)))); + }); +} + +fn bench_lz4(c: &mut Criterion) { + let mut group = c.benchmark_group("lz4"); + + // { + // let compressed = Vec::with_capacity(10_000); + // let mut encoder = lz4::EncoderBuilder::new() + // .block_size(BlockSize::Max64KB) + // .build(compressed) + // .unwrap(); + // + // encoder.write_all(TEST.as_bytes()).unwrap(); + // let (compressed, result) = encoder.finish(); + // result.unwrap(); + // + // let ratio = (TEST.as_bytes().len() as f64) / (compressed.len() as f64); + // println!("LZ4 compress_ratio = {ratio}"); + // + // // ensure decodes cleanly + // let cursor = Cursor::new(compressed); + // let mut decoder = lz4::Decoder::new(cursor).unwrap(); + // let mut output = String::new(); + // + // 
decoder.read_to_string(&mut output).unwrap(); + // assert_eq!(output.as_str(), TEST); + // } + + group.bench_function("compress-single", |b| { + let mut compressed = Vec::with_capacity(100_000_000); + let mut encoder = lz4::EncoderBuilder::new() + .block_size(BlockSize::Max64KB) + .checksum(ContentChecksum::NoChecksum) + .block_checksum(BlockChecksum::NoBlockChecksum) + .build(&mut compressed) + .unwrap(); + + b.iter(|| encoder.write_all(TEST.as_bytes()).unwrap()); + }); + + group.bench_function("decompress-single", |b| { + let compressed = Vec::new(); + let mut encoder = lz4::EncoderBuilder::new() + .block_size(BlockSize::Max64KB) + .checksum(ContentChecksum::NoChecksum) + .block_checksum(BlockChecksum::NoBlockChecksum) + .build(compressed) + .unwrap(); + encoder.write_all(TEST.as_bytes()).unwrap(); + let (compressed, result) = encoder.finish(); + result.unwrap(); + + let cursor = Cursor::new(compressed); + let mut decoder = lz4::Decoder::new(cursor).unwrap(); + let mut output = Vec::new(); + + b.iter(|| decoder.read_to_end(&mut output).unwrap()); + }); +} + +criterion_group!(compress_bench, bench_fsst, bench_lz4); +criterion_main!(compress_bench); diff --git a/benches/dracula.txt b/benches/dracula.txt new file mode 100644 index 0000000..88adb22 --- /dev/null +++ b/benches/dracula.txt @@ -0,0 +1 @@ +How these papers have been placed in sequence will be made manifest in the reading of them. All needless matters have been eliminated, so that a history almost at variance with the possibilities of later-day belief may stand forth as simple fact. There is throughout no statement of past things wherein memory may err, for all the records chosen are exactly contemporary, given from the standpoints and within the range of knowledge of those who made them. We left in pretty good time, and came after nightfall to Klausenburgh. Here I stopped for the night at the Hotel Royale. 
I had for dinner, or rather supper, a chicken done up some way with red pepper, which was very good but thirsty. (Mem., get recipe for Mina.) I asked the waiter, and he said it was called “paprika hendl,” and that, as it was a national dish, I should be able to get it anywhere along the Carpathians. I found my smattering of German very useful here; indeed, I don’t know how I should be able to get on without it. diff --git a/rust-toolchain.toml b/rust-toolchain.toml new file mode 100644 index 0000000..544af13 --- /dev/null +++ b/rust-toolchain.toml @@ -0,0 +1,5 @@ +[toolchain] +channel = "nightly-2024-06-19" +components = ["rust-src", "rustfmt", "clippy"] +profile = "minimal" + diff --git a/src/builder.rs b/src/builder.rs new file mode 100644 index 0000000..1dca853 --- /dev/null +++ b/src/builder.rs @@ -0,0 +1,218 @@ +//! Functions and types used for building a [`SymbolTable`] from a corpus of text. +//! +//! This module implements the logic from Algorithm 3 of the [FSST Paper]. +//! +//! [FSST Paper]: https://www.vldb.org/pvldb/vol13/p2649-boncz.pdf + +use std::cmp::Ordering; +use std::collections::BinaryHeap; + +use crate::{Code, Symbol, SymbolTable}; + +#[derive(Debug, Clone)] +struct Counter { + /// Frequency count for each code. + counts1: Vec<usize>, + + /// Frequency count for each code-pair. + counts2: Vec<Vec<usize>>, +} + +impl Counter { + fn new() -> Self { + Self { + counts1: vec![0; 511], + counts2: vec![vec![0; 511]; 511], + } + } + + #[inline] + fn record_count1(&mut self, code1: Code) { + self.counts1[code1.0 as usize] += 1; + } + + #[inline] + fn record_count2(&mut self, code1: Code, code2: Code) { + self.counts2[code1.0 as usize][code2.0 as usize] += 1; + } + + #[inline] + fn count1(&self, code: Code) -> usize { + self.counts1[code.0 as usize] + } + + #[inline] + fn count2(&self, code1: Code, code2: Code) -> usize { + self.counts2[code1.0 as usize][code2.0 as usize] + } +} + +/// The number of generations used for training. This is taken from the [FSST paper]. 
+/// +/// [FSST paper]: https://www.vldb.org/pvldb/vol13/p2649-boncz.pdf +pub const MAX_GENERATIONS: usize = 5; + +/// Build and train a `SymbolTable` from a sample corpus of text. +/// +/// This function implements the generational algorithm described in the [FSST paper] Section +/// 4.3. Starting with an empty symbol table, it iteratively compresses the corpus, then attempts +/// to merge symbols when doing so would yield better compression than leaving them unmerged. The +/// resulting table will have at most 255 symbols (the 256th symbol is reserved for the escape +/// code). +/// +/// [FSST paper]: https://www.vldb.org/pvldb/vol13/p2649-boncz.pdf +pub fn train(corpus: impl AsRef<[u8]>) -> SymbolTable { + let mut table = SymbolTable::default(); + // TODO(aduffy): handle truncating/sampling if corpus > requires sample size. + let sample = corpus.as_ref(); + for _generation in 0..MAX_GENERATIONS { + let counter = table.compress_count(sample); + table = table.optimize(counter); + } + + table +} + +impl SymbolTable { + /// Compress the text using the current symbol table. Count the code occurrences + /// and code-pair occurrences to allow us to calculate apparent gain. + fn compress_count(&self, sample: &[u8]) -> Counter { + let mut counter = Counter::new(); + let len = sample.len(); + let mut prev_code = self.find_longest_symbol(sample); + counter.record_count1(prev_code); + let mut pos = self.symbols[prev_code.0 as usize].len(); + + while pos < len { + let code = self.find_longest_symbol(&sample[pos..len]); + counter.record_count1(code); + counter.record_count2(prev_code, code); + pos += self.symbols[code.0 as usize].len(); + prev_code = code; + } + + counter + } + + /// Using a set of counters and the existing set of symbols, build a new + /// set of symbols/codes that optimizes the gain over the distribution in `counter`. 
+ fn optimize(&self, counters: Counter) -> Self { + let mut res = SymbolTable::default(); + let mut pqueue = BinaryHeap::new(); + for code1 in 0..511 { + let code1 = Code::from_u16(code1); + let symbol1 = self.symbols[code1.0 as usize]; + let gain = counters.count1(code1) * symbol1.len(); + pqueue.push(Candidate { + symbol: symbol1, + gain, + }); + + for code2 in 0..511 { + let code2 = Code::from_u16(code2); + let symbol2 = &self.symbols[code2.0 as usize]; + // If either symbol is zero-length, or if merging would yield a symbol of + // length greater than 8, skip. + if symbol1.len() + symbol2.len() >= 8 || symbol1.is_empty() || symbol2.is_empty() { + continue; + } + let new_symbol = symbol1.concat(symbol2); + // assert the symbol is not empty + assert!( + !new_symbol.is_empty(), + "symbol made by merging {:?} and {:?} is empty", + symbol1, + symbol2, + ); + let gain = counters.count2(code1, code2); + pqueue.push(Candidate { + symbol: new_symbol, + gain, + }) + } + } + + // Pop the 255 best symbols; `BinaryHeap::iter` visits in arbitrary order, so pop from the max-heap. + for _ in 0..255 { + let Some(candidate) = pqueue.pop() else { break }; + res.insert(candidate.symbol); + } + + res + } +} + +/// A candidate for inclusion in a symbol table. +/// +/// This is really only useful for the `optimize` step of training. 
+struct Candidate { + gain: usize, + symbol: Symbol, +} + +impl Candidate { + fn comparable_form(&self) -> (usize, usize) { + (self.gain, self.symbol.len()) + } +} + +impl Eq for Candidate {} + +impl PartialEq for Candidate { + fn eq(&self, other: &Self) -> bool { + self.comparable_form().eq(&other.comparable_form()) + } +} + +impl PartialOrd for Candidate { + fn partial_cmp(&self, other: &Self) -> Option<Ordering> { + Some(self.cmp(other)) + } +} + +impl Ord for Candidate { + fn cmp(&self, other: &Self) -> Ordering { + let self_ord = (self.gain, self.symbol.len()); + let other_ord = (other.gain, other.symbol.len()); + + self_ord.cmp(&other_ord) + } +} + +#[cfg(test)] +mod test { + use crate::{train, Code}; + + #[test] + fn test_builder() { + // Train a SymbolTable on the toy string + let text = "hello world"; + let table = train(text.as_bytes()); + + // Use the table to compress a string, see the values + let compressed = table.compress(text.as_bytes()); + + // Ensure that the compressed string has no escape bytes + assert!(compressed.iter().all(|b| *b != Code::ESCAPE_CODE)); + + // Ensure that we can compress a string with no values seen at training time. 
+ let compressed = table.compress("xyz123".as_bytes()); + assert_eq!( + compressed, + vec![ + Code::ESCAPE_CODE, + b'x', + Code::ESCAPE_CODE, + b'y', + Code::ESCAPE_CODE, + b'z', + Code::ESCAPE_CODE, + b'1', + Code::ESCAPE_CODE, + b'2', + Code::ESCAPE_CODE, + b'3', + ] + ) + } +} diff --git a/src/fsst.rs b/src/fsst.rs deleted file mode 100644 index 960db9d..0000000 --- a/src/fsst.rs +++ /dev/null @@ -1,325 +0,0 @@ -use std::cmp::min; - -const FSST_CODE_MAX: u16 = 256; -const FSST_CODE_MASK: u16 = FSST_CODE_MAX - 1; -const FSST_LEN_BITS: u32 = 12; -const FSST_CODE_BITS: u32 = 9; -const FSST_CODE_BASE: u16 = 256; -const FSST_HASH_LOG2SIZE: usize = 10; -const FSST_HASH_PRIME: u64 = 2971215073; -const FSST_SHIFT: u32 = 15; -const FSST_ICL_FREE: u64 = (15 << 28) | ((FSST_CODE_MASK as u64) << 16); -const FSST_MAXHEADER: usize = 8 + 1 + 8 + 2048 + 1; -const FSST_ESC: u8 = 255; - -#[inline(always)] -fn fsst_unaligned_load(v: &[u8]) -> u64 { - let mut ret: u64 = 0; - unsafe { - std::ptr::copy_nonoverlapping(v.as_ptr(), &mut ret as *mut u64 as *mut u8, 8); - } - ret -} - -#[inline(always)] -fn fsst_hash(w: u64) -> u64 { - ((w * FSST_HASH_PRIME) ^ ((w * FSST_HASH_PRIME) >> FSST_SHIFT)) -} - -#[derive(Clone, Copy)] -struct Symbol { - val: [u8; 8], - icl: u64, -} - -impl Symbol { - const MAX_LENGTH: usize = 8; - - fn new() -> Self { - Symbol { val: [0; 8], icl: 0 } - } - - fn from_byte(c: u8, code: u16) -> Self { - let mut s = Symbol::new(); - s.val[0] = c; - s.set_code_len(code, 1); - s - } - - fn from_slice(input: &[u8]) -> Self { - let mut s = Symbol::new(); - let len = min(input.len(), Self::MAX_LENGTH); - s.val[..len].copy_from_slice(&input[..len]); - s.set_code_len(FSST_CODE_MASK, len as u32); - s - } - - fn set_code_len(&mut self, code: u16, len: u32) { - self.icl = (len << 28) as u64 | (code as u64) << 16 | ((8 - len) * 8) as u64; - } - - fn length(&self) -> u32 { - (self.icl >> 28) as u32 - } - - fn code(&self) -> u16 { - ((self.icl >> 16) & FSST_CODE_MASK as u64) as 
u16 - } - - fn ignored_bits(&self) -> u32 { - self.icl as u32 - } - - fn first(&self) -> u8 { - self.val[0] - } - - fn first2(&self) -> u16 { - u16::from_le_bytes([self.val[0], self.val[1]]) - } - - fn hash(&self) -> usize { - let v = u32::from_le_bytes([self.val[0], self.val[1], self.val[2], self.val[3]]); - fsst_hash(v as u64) as usize - } -} - -struct SymbolTable { - short_codes: [u16; 65536], - byte_codes: [u16; 256], - symbols: Vec, - hash_tab: Vec, - n_symbols: u16, - suffix_lim: u16, - terminator: u16, - zero_terminated: bool, - len_histo: [u16; FSST_CODE_BITS as usize], -} - -impl SymbolTable { - fn new() -> Self { - let mut st = SymbolTable { - short_codes: [0; 65536], - byte_codes: [0; 256], - symbols: vec![Symbol::new(); FSST_CODE_MAX as usize], - hash_tab: vec![Symbol::new(); 1 << FSST_HASH_LOG2SIZE], - n_symbols: 0, - suffix_lim: FSST_CODE_MAX, - terminator: 0, - zero_terminated: false, - len_histo: [0; FSST_CODE_BITS as usize], - }; - - for i in 0..256 { - st.symbols[i] = Symbol::from_byte(i as u8, i as u16 | (1 << FSST_LEN_BITS)); - } - - for i in 256..FSST_CODE_MAX as usize { - st.symbols[i] = Symbol::from_byte(0, FSST_CODE_MASK); - } - - for i in 0..256 { - st.byte_codes[i] = (1 << FSST_LEN_BITS) | i as u16; - } - - for i in 0..65536 { - st.short_codes[i] = (1 << FSST_LEN_BITS) | (i & 255) as u16; - } - - st - } - - fn clear(&mut self) { - self.len_histo = [0; FSST_CODE_BITS as usize]; - for i in FSST_CODE_BASE as usize..FSST_CODE_BASE as usize + self.n_symbols as usize { - let symbol = &self.symbols[i]; - if symbol.length() == 1 { - let val = symbol.first(); - self.byte_codes[val as usize] = (1 << FSST_LEN_BITS) | val as u16; - } else if symbol.length() == 2 { - let val = symbol.first2(); - self.short_codes[val as usize] = (1 << FSST_LEN_BITS) | (val & 255); - } else { - let idx = symbol.hash() & ((1 << FSST_HASH_LOG2SIZE) - 1); - self.hash_tab[idx] = Symbol::new(); - self.hash_tab[idx].icl = FSST_ICL_FREE; - } - } - self.n_symbols = 0; - } - - fn 
hash_insert(&mut self, s: Symbol) -> bool { - let idx = s.hash() & ((1 << FSST_HASH_LOG2SIZE) - 1); - let taken = self.hash_tab[idx].icl < FSST_ICL_FREE; - if taken { - return false; - } - self.hash_tab[idx] = s; - true - } - - fn add(&mut self, mut s: Symbol) -> bool { - assert!(FSST_CODE_BASE + self.n_symbols < FSST_CODE_MAX); - let len = s.length(); - s.set_code_len(FSST_CODE_BASE + self.n_symbols, len); - if len == 1 { - self.byte_codes[s.first() as usize] = FSST_CODE_BASE + self.n_symbols + (1 << FSST_LEN_BITS); - } else if len == 2 { - self.short_codes[s.first2() as usize] = FSST_CODE_BASE + self.n_symbols + (2 << FSST_LEN_BITS); - } else if !self.hash_insert(s) { - return false; - } - self.symbols[FSST_CODE_BASE as usize + self.n_symbols as usize] = s; - self.len_histo[len as usize - 1] += 1; - self.n_symbols += 1; - true - } - - fn find_longest_symbol(&self, s: Symbol) -> u16 { - let idx = s.hash() & ((1 << FSST_HASH_LOG2SIZE) - 1); - if self.hash_tab[idx].icl <= s.icl && self.hash_tab[idx].val == s.val { - return (self.hash_tab[idx].icl >> 16) & FSST_CODE_MASK as u64; - } - if s.length() >= 2 { - let code = self.short_codes[s.first2() as usize] & FSST_CODE_MASK; - if code >= FSST_CODE_BASE { - return code; - } - } - self.byte_codes[s.first() as usize] & FSST_CODE_MASK - } - - fn find_longest_symbol_slice(&self, cur: &[u8], end: &[u8]) -> u16 { - self.find_longest_symbol(Symbol::from_slice(&cur[..min(cur.len(), end.len())])) - } -} - -struct Counters { - count1: Vec, - count2: Vec>, -} - -impl Counters { - fn new() -> Self { - Counters { - count1: vec![0; FSST_CODE_MAX as usize], - count2: vec![vec![0; FSST_CODE_MAX as usize]; FSST_CODE_MAX as usize], - } - } - - fn count1_set(&mut self, pos1: usize, val: u16) { - self.count1[pos1] = val; - } - - fn count1_inc(&mut self, pos1: usize) { - self.count1[pos1] += 1; - } - - fn count2_inc(&mut self, pos1: usize, pos2: usize) { - self.count2[pos1][pos2] += 1; - } - - fn count1_get_next(&self, pos1: &mut usize) -> 
u32 { - self.count1[*pos1] as u32 - } - - fn count2_get_next(&self, pos1: usize, pos2: &mut usize) -> u32 { - self.count2[pos1][*pos2] as u32 - } - - fn backup1(&self, buf: &mut [u8]) { - unsafe { - std::ptr::copy_nonoverlapping( - self.count1.as_ptr() as *const u8, - buf.as_mut_ptr(), - FSST_CODE_MAX as usize * std::mem::size_of::(), - ); - } - } - - fn restore1(&mut self, buf: &[u8]) { - unsafe { - std::ptr::copy_nonoverlapping( - buf.as_ptr(), - self.count1.as_mut_ptr() as *mut u8, - FSST_CODE_MAX as usize * std::mem::size_of::(), - ); - } - } -} - -struct Encoder { - symbol_table: SymbolTable, - counters: Counters, -} - -impl Encoder { - fn new() -> Self { - Encoder { - symbol_table: SymbolTable::new(), - counters: Counters::new(), - } - } - - pub fn compress(&self, input: &[u8], output: &mut [u8]) -> (usize, usize) { - let mut in_pos = 0; - let mut out_pos = 0; - - while in_pos < input.len() && out_pos < output.len() { - let symbol = self.symbol_table.find_longest_symbol_slice(&input[in_pos..], &input[input.len()..]); - let code = symbol & FSST_CODE_MASK; - let len = (symbol >> FSST_LEN_BITS) as usize; - - if code < FSST_CODE_BASE { - // Escape byte - if out_pos + 2 > output.len() { - break; - } - output[out_pos] = FSST_ESC; - output[out_pos + 1] = input[in_pos]; - out_pos += 2; - in_pos += 1; - } else { - if out_pos + 1 > output.len() { - break; - } - output[out_pos] = code as u8; - out_pos += 1; - in_pos += len; - } - } - - (in_pos, out_pos) - } -} - -impl SymbolTable { - pub fn decompress(&self, input: &[u8], output: &mut [u8]) -> usize { - let mut in_pos = 0; - let mut out_pos = 0; - - while in_pos < input.len() && out_pos < output.len() { - let code = input[in_pos] as u16; - in_pos += 1; - - if code == FSST_ESC as u16 { - if in_pos >= input.len() { - break; - } - output[out_pos] = input[in_pos]; - in_pos += 1; - out_pos += 1; - } else { - let symbol = &self.symbols[code as usize]; - let len = symbol.length() as usize; - if out_pos + len > output.len() { - 
break; - } - output[out_pos..out_pos + len].copy_from_slice(&symbol.val[..len]); - out_pos += len; - } - } - - out_pos - } -} \ No newline at end of file diff --git a/src/lib.rs b/src/lib.rs index 44a4684..a545ac1 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -1,12 +1,280 @@ -mod fsst; +#![doc = include_str!("../README.md")] +use std::fmt::{Debug, Formatter}; -#[cfg(test)] -mod tests { - use super::*; +pub use builder::*; - #[test] - fn it_works() { - let result = add(2, 2); - assert_eq!(result, 4); +mod builder; +mod longest; + +/// A Symbol wraps a set of values of +#[derive(Copy, Clone)] +pub union Symbol { + bytes: [u8; 8], + num: u64, +} + +impl Debug for Symbol { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + write!(f, "{}", unsafe { self.num }) + } +} + +impl Symbol { + /// Zero value for `Symbol`. + pub const ZERO: Self = Self::zero(); + + /// Constructor for a `Symbol` from an 8-element byte slice. + pub fn from_slice(slice: &[u8; 8]) -> Self { + Self { bytes: *slice } + } + + /// Return a zero symbol + const fn zero() -> Self { + Self { num: 0 } + } + + /// Create a new single-byte symbol + pub fn from_u8(value: u8) -> Self { + Self { + bytes: [value, 0, 0, 0, 0, 0, 0, 0], + } + } +} + +impl Symbol { + /// Calculate the length of the symbol in bytes. + /// + /// Each symbol has the capacity to hold up to 8 bytes of data, but the symbols + /// can contain fewer bytes, padded with 0x00. + pub fn len(&self) -> usize { + let numeric = unsafe { self.num }; + // For little-endian platforms, this counts the number of *trailing* zeros + let null_bytes = (numeric.leading_zeros() >> 3) as usize; + + size_of::() - null_bytes + } + + /// Returns true if the symbol does not encode any bytes. + /// + /// Note that this should only be true for the zero code. 
+    pub fn is_empty(&self) -> bool {
+        self.len() == 0
+    }
+
+    /// Borrow the symbol as a byte slice holding only its first `len()` bytes.
+    pub fn as_slice(&self) -> &[u8] {
+        let len = self.len();
+        // SAFETY: constructors will not allow building a struct where len > 8.
+        unsafe { &self.bytes[0..len] }
+    }
+
+    /// Returns true if the symbol is a prefix of the provided text.
+    pub fn is_prefix(&self, text: &[u8]) -> bool {
+        text.starts_with(self.as_slice())
+    }
+
+    /// Return a new `Symbol` by logically concatenating ourselves with another `Symbol`.
+    ///
+    /// # Panics
+    /// Panics if the combined length of the two symbols exceeds 8 bytes.
+    pub fn concat(&self, other: &Self) -> Self {
+        let new_len = self.len() + other.len();
+        assert!(new_len <= 8, "cannot build symbol with length > 8");
+
+        let self_len = self.len();
+        // Start from a copy of `self` and overwrite the bytes after its own `len()` bytes.
+        let mut result = *self;
+        // SAFETY: `bytes` is an 8-byte array and `new_len <= 8` was asserted above, so the
+        // destination range is in bounds; it is `other.len()` long, matching the source slice.
+        unsafe { result.bytes[self_len..new_len].copy_from_slice(other.as_slice()) };
+
+        result
+    }
+}
+
+/// Codes used to map symbols to bytes.
+///
+/// Logically, codes can range from 0-255 inclusive. Physically, we represent them as a 9-bit
+/// value packed into a `u16`.
+///
+/// Physically in-memory, `Code(0)` through `Code(255)` corresponds to escape sequences of raw bytes
+/// 0 through 255. `Code(256)` through `Code(510)` represent the actual symbol codes 0-254, i.e.
+/// the in-memory value is the emitted code byte plus 256.
+#[derive(Copy, Clone, Debug, PartialEq, Eq, PartialOrd, Ord)]
+pub struct Code(u16);
+
+impl Code {
+    /// Exclusive upper bound for the in-memory `Code` representation.
+    pub const CODE_MAX: u16 = 512;
+
+    /// Maximum code value. Code 255 is reserved as the [escape code][`Self::ESCAPE_CODE`].
+    pub const MAX_CODE: u8 = 254;
+
+    /// Code used to indicate bytes that are not in the symbol table.
+    ///
+    /// When compressing a string that cannot fully be expressed with the symbol table, the compressed
+    /// output will contain an `ESCAPE` byte followed by a raw byte. At decompression time, the presence
+    /// of `ESCAPE` indicates that the next byte should be appended directly to the result instead of
+    /// being looked up in the symbol table.
+    pub const ESCAPE_CODE: u8 = 255;
+
+    /// Create a new code representing an escape byte.
+    pub fn new_escaped(byte: u8) -> Self {
+        Self(byte as u16)
+    }
+
+    /// Create a new code representing a symbol.
+    ///
+    /// # Panics
+    /// Panics if `code` is the reserved [`Self::ESCAPE_CODE`].
+    pub fn new_symbol(code: u8) -> Self {
+        assert_ne!(
+            code,
+            Code::ESCAPE_CODE,
+            "code {code} cannot be used for symbol, reserved for ESCAPE"
+        );
+
+        Self((code as u16) + 256)
+    }
+
+    /// Create a `Code` directly from a `u16` value.
+    ///
+    /// # Panics
+    /// Panic if the value is ≥ the defined `CODE_MAX`.
+    // NOTE(review): `CODE_MAX` is 512 while `SymbolTable::symbols` holds 511 entries, so
+    // `Code(511)` passes this check but cannot index the table - confirm which bound is meant.
+    pub fn from_u16(code: u16) -> Self {
+        assert!(code < Self::CODE_MAX, "code value higher than CODE_MAX");
+
+        Self(code)
+    }
+
+    /// Returns true if the code is for an escape byte.
+    #[inline]
+    pub fn is_escape(&self) -> bool {
+        self.0 <= 255
+    }
+}
+
+/// The static symbol table used for compression and decompression.
+///
+/// The `SymbolTable` is the central component of FSST. You can create a SymbolTable either by
+/// default, or by [training] it on an input corpus of text.
+///
+/// Example usage:
+///
+/// ```
+/// use fsst_rs::{Symbol, SymbolTable};
+/// let mut table = SymbolTable::default();
+/// table.insert(Symbol::from_slice(&[b'h', b'e', b'l', b'l', b'o', 0, 0, 0]));
+///
+/// let compressed = table.compress("hello".as_bytes());
+/// assert_eq!(compressed, vec![0u8]);
+/// ```
+///
+/// [training]: train
+#[derive(Clone, Debug)]
+pub struct SymbolTable {
+    /// Table mapping codes to symbols.
+    pub(crate) symbols: [Symbol; 511],
+
+    /// Indicates the number of entries in the symbol table that have been populated.
+    ///
+    /// This value is always at least 256, as the first 256 entries in the `table` are the escape
+    /// bytes.
+    pub(crate) n_symbols: usize,
+}
+
+impl Default for SymbolTable {
+    fn default() -> Self {
+        let mut table = Self {
+            symbols: [Symbol::ZERO; 511],
+            n_symbols: 0,
+        };
+
+        // Populate the escape byte entries.
+        for byte in 0..=255 {
+            table.symbols[byte as usize] = Symbol::from_u8(byte);
+        }
+        table.n_symbols = 256;
+
+        table
+    }
+}
+
+/// The core structure of the FSST codec, holding a mapping between `Symbol`s and `Code`s.
+///
+/// The symbol table is trained on a corpus of data in the form of a single byte array, building up
+/// a mapping of 1-byte "codes" to sequences of up to `N` plaintext bytes, or "symbols".
+impl SymbolTable {
+    /// Insert a new symbol at the end of the table.
+    ///
+    /// # Panics
+    /// Panics if the table is already full.
+    pub fn insert(&mut self, symbol: Symbol) {
+        assert!(
+            self.n_symbols < self.symbols.len(),
+            "cannot insert into full symbol table"
+        );
+        self.symbols[self.n_symbols] = symbol;
+        self.n_symbols += 1;
+    }
+
+    /// Compress `plaintext` with this table and return the encoded bytes.
+    ///
+    /// Each matched symbol is emitted as a single code byte; a byte with no matching symbol is
+    /// emitted as [`Code::ESCAPE_CODE`] followed by the raw byte.
+    pub fn compress(&self, plaintext: &[u8]) -> Vec<u8> {
+        // Worst case is every byte escaped, i.e. two output bytes per input byte.
+        let mut values = Vec::with_capacity(2 * plaintext.len());
+        let len = plaintext.len();
+        let mut pos = 0;
+        while pos < len {
+            let next_code = self.find_longest_symbol(&plaintext[pos..len]);
+            if next_code.is_escape() {
+                // Case 1 - escape: push an ESCAPE followed by the next byte.
+                values.push(Code::ESCAPE_CODE);
+                values.push(next_code.0 as u8);
+                pos += 1;
+            } else {
+                // Case 2 - code: push the code, increment position by symbol length.
+                // The `as u8` cast deliberately drops the +256 in-memory offset.
+                let symbol = self.symbols[next_code.0 as usize];
+                values.push(next_code.0 as u8);
+                pos += symbol.len();
+            }
+        }
+
+        values
+    }
+
+    /// Decompress the provided byte slice into a byte vector using the symbol table.
+ pub fn decompress(&self, compressed: &[u8]) -> Vec { + let mut decoded: Vec = Vec::with_capacity(size_of::() * compressed.len()); + let ptr = decoded.as_mut_ptr(); + + let mut in_pos = 0; + let mut out_pos = 0; + + while in_pos < compressed.len() && out_pos < (decoded.capacity() + size_of::()) { + let code = compressed[in_pos]; + if code == Code::ESCAPE_CODE { + // Advance by one, do raw write. + in_pos += 1; + // SAFETY: out_pos is always 8 bytes or more from the end of decoded buffer + unsafe { + let write_addr = ptr.byte_offset(out_pos as isize); + write_addr.write(compressed[in_pos]); + } + out_pos += 1; + in_pos += 1; + } else { + let symbol = self.symbols[256 + code as usize]; + // SAFETY: out_pos is always 8 bytes or more from the end of decoded buffer + unsafe { + let write_addr = ptr.byte_offset(out_pos as isize) as *mut u64; + // Perform 8 byte unaligned write. + write_addr.write_unaligned(symbol.num); + } + in_pos += 1; + out_pos += symbol.len(); + } + } + + assert!( + in_pos >= compressed.len(), + "decompression should exhaust input before output" + ); + + // SAFETY: we enforce in the loop condition that out_pos <= decoded.capacity() + unsafe { decoded.set_len(out_pos) }; + + decoded } } diff --git a/src/longest.rs b/src/longest.rs new file mode 100644 index 0000000..445a88a --- /dev/null +++ b/src/longest.rs @@ -0,0 +1,24 @@ +use crate::{Code, SymbolTable}; + +/// Find the longest substring. + +impl SymbolTable { + // NOTE(aduffy): if you don't disable inlining, this function won't show up in profiles. + #[inline(never)] + pub(crate) fn find_longest_symbol(&self, text: &[u8]) -> Code { + debug_assert!(!text.is_empty(), "text must not be empty"); + + // Find the code that best maps to the provided text table here. 
+ let mut best_code = Code::new_escaped(text[0]); + let mut best_overlap = 1; + for code in 0..511 { + let symbol = &self.symbols[code as usize]; + if symbol.is_prefix(text) && symbol.len() > best_overlap { + best_code = Code::from_u16(code); + best_overlap = symbol.len(); + } + } + + best_code + } +}