diff --git a/.github/workflows/arrow.yml b/.github/workflows/arrow.yml index d065c554bbf..daf38f2523f 100644 --- a/.github/workflows/arrow.yml +++ b/.github/workflows/arrow.yml @@ -61,39 +61,39 @@ jobs: submodules: true - name: Setup Rust toolchain uses: ./.github/actions/setup-builder - - name: Test arrow-buffer with all features + - name: Test arrow-buffer run: cargo test -p arrow-buffer --all-features - - name: Test arrow-data with all features + - name: Test arrow-data run: cargo test -p arrow-data --all-features - - name: Test arrow-schema with all features + - name: Test arrow-schema run: cargo test -p arrow-schema --all-features - - name: Test arrow-array with all features + - name: Test arrow-array run: cargo test -p arrow-array --all-features - - name: Test arrow-select with all features + - name: Test arrow-select run: cargo test -p arrow-select --all-features - - name: Test arrow-cast with all features + - name: Test arrow-cast run: cargo test -p arrow-cast --all-features - - name: Test arrow-ipc with all features + - name: Test arrow-ipc run: cargo test -p arrow-ipc --all-features - - name: Test arrow-csv with all features + - name: Test arrow-csv run: cargo test -p arrow-csv --all-features - - name: Test arrow-json with all features + - name: Test arrow-json run: cargo test -p arrow-json --all-features - - name: Test arrow-avro with all features + - name: Test arrow-avro run: cargo test -p arrow-avro --all-features - - name: Test arrow-string with all features + - name: Test arrow-string run: cargo test -p arrow-string --all-features - - name: Test arrow-ord with all features + - name: Test arrow-ord run: cargo test -p arrow-ord --all-features - - name: Test arrow-arith with all features + - name: Test arrow-arith run: cargo test -p arrow-arith --all-features - - name: Test arrow-row with all features + - name: Test arrow-row run: cargo test -p arrow-row --all-features - - name: Test arrow-integration-test with all features + - name: Test arrow-integration-test run: cargo test -p arrow-integration-test --all-features - name: Test arrow with default features run: cargo test -p arrow - - name: Test arrow with all features except pyarrow + - name: Test arrow except pyarrow run: cargo test -p arrow --features=force_validate,prettyprint,ipc_compression,ffi,chrono-tz - name: Run examples run: | @@ -163,37 +163,139 @@ jobs: uses: ./.github/actions/setup-builder - name: Setup Clippy run: rustup component add clippy - - name: Clippy arrow-buffer with all features - run: cargo clippy -p arrow-buffer --all-targets --all-features -- -D warnings - - name: Clippy arrow-data with all features - run: cargo clippy -p arrow-data --all-targets --all-features -- -D warnings - - name: Clippy arrow-schema with all features - run: cargo clippy -p arrow-schema --all-targets --all-features -- -D warnings - - name: Clippy arrow-array with all features - run: cargo clippy -p arrow-array --all-targets --all-features -- -D warnings - - name: Clippy arrow-select with all features - run: cargo clippy -p arrow-select --all-targets --all-features -- -D warnings - - name: Clippy arrow-cast with all features - run: cargo clippy -p arrow-cast --all-targets --all-features -- -D warnings - - name: Clippy arrow-ipc with all features - run: cargo clippy -p arrow-ipc --all-targets --all-features -- -D warnings - - name: Clippy arrow-csv with all features - run: cargo clippy -p arrow-csv --all-targets --all-features -- -D warnings - - name: Clippy arrow-json with all features - run: cargo clippy -p arrow-json --all-targets --all-features -- -D warnings - - name: Clippy arrow-avro with all features - run: cargo clippy -p arrow-avro --all-targets --all-features -- -D warnings - - name: Clippy arrow-string with all features - run: cargo clippy -p arrow-string --all-targets --all-features -- -D warnings - - name: Clippy arrow-ord with all features - run: cargo clippy -p arrow-ord --all-targets --all-features -- -D warnings - - name: Clippy arrow-arith with all features - run: cargo clippy -p arrow-arith --all-targets --all-features -- -D warnings - - name: Clippy arrow-row with all features - run: cargo clippy -p arrow-row --all-targets --all-features -- -D warnings - - name: Clippy arrow with all features - run: cargo clippy -p arrow --all-features --all-targets -- -D warnings - - name: Clippy arrow-integration-test with all features - run: cargo clippy -p arrow-integration-test --all-targets --all-features -- -D warnings - - name: Clippy arrow-integration-testing with all features - run: cargo clippy -p arrow-integration-testing --all-targets --all-features -- -D warnings + - name: Clippy arrow-buffer + run: | + mod=arrow-buffer + cargo clippy -p "$mod" --all-targets --all-features -- -D warnings + # Dependency checks excluding tests & benches. + cargo clippy -p "$mod" -- -D unused_crate_dependencies + cargo clippy -p "$mod" --all-features -- -D unused_crate_dependencies + cargo clippy -p "$mod" --no-default-features -- -D unused_crate_dependencies + - name: Clippy arrow-data + run: | + mod=arrow-data + cargo clippy -p "$mod" --all-targets --all-features -- -D warnings + # Dependency checks excluding tests & benches. + cargo clippy -p "$mod" -- -D unused_crate_dependencies + cargo clippy -p "$mod" --all-features -- -D unused_crate_dependencies + cargo clippy -p "$mod" --no-default-features -- -D unused_crate_dependencies + - name: Clippy arrow-schema + run: | + mod=arrow-schema + cargo clippy -p "$mod" --all-targets --all-features -- -D warnings + # Dependency checks excluding tests & benches. + cargo clippy -p "$mod" -- -D unused_crate_dependencies + cargo clippy -p "$mod" --all-features -- -D unused_crate_dependencies + cargo clippy -p "$mod" --no-default-features -- -D unused_crate_dependencies + - name: Clippy arrow-array + run: | + mod=arrow-array + cargo clippy -p "$mod" --all-targets --all-features -- -D warnings + # Dependency checks excluding tests & benches. + cargo clippy -p "$mod" -- -D unused_crate_dependencies + cargo clippy -p "$mod" --all-features -- -D unused_crate_dependencies + cargo clippy -p "$mod" --no-default-features -- -D unused_crate_dependencies + - name: Clippy arrow-select + run: | + mod=arrow-select + cargo clippy -p "$mod" --all-targets --all-features -- -D warnings + # Dependency checks excluding tests & benches. + cargo clippy -p "$mod" -- -D unused_crate_dependencies + cargo clippy -p "$mod" --all-features -- -D unused_crate_dependencies + cargo clippy -p "$mod" --no-default-features -- -D unused_crate_dependencies + - name: Clippy arrow-cast + run: | + mod=arrow-cast + cargo clippy -p "$mod" --all-targets --all-features -- -D warnings + # Dependency checks excluding tests & benches. + cargo clippy -p "$mod" -- -D unused_crate_dependencies + cargo clippy -p "$mod" --all-features -- -D unused_crate_dependencies + cargo clippy -p "$mod" --no-default-features -- -D unused_crate_dependencies + - name: Clippy arrow-ipc + run: | + mod=arrow-ipc + cargo clippy -p "$mod" --all-targets --all-features -- -D warnings + # Dependency checks excluding tests & benches. + cargo clippy -p "$mod" -- -D unused_crate_dependencies + cargo clippy -p "$mod" --all-features -- -D unused_crate_dependencies + cargo clippy -p "$mod" --no-default-features -- -D unused_crate_dependencies + - name: Clippy arrow-csv + run: | + mod=arrow-csv + cargo clippy -p "$mod" --all-targets --all-features -- -D warnings + # Dependency checks excluding tests & benches. + cargo clippy -p "$mod" -- -D unused_crate_dependencies + cargo clippy -p "$mod" --all-features -- -D unused_crate_dependencies + cargo clippy -p "$mod" --no-default-features -- -D unused_crate_dependencies + - name: Clippy arrow-json + run: | + mod=arrow-json + cargo clippy -p "$mod" --all-targets --all-features -- -D warnings + # Dependency checks excluding tests & benches. + cargo clippy -p "$mod" -- -D unused_crate_dependencies + cargo clippy -p "$mod" --all-features -- -D unused_crate_dependencies + cargo clippy -p "$mod" --no-default-features -- -D unused_crate_dependencies + - name: Clippy arrow-avro + run: | + mod=arrow-avro + cargo clippy -p "$mod" --all-targets --all-features -- -D warnings + # Dependency checks excluding tests & benches. + cargo clippy -p "$mod" -- -D unused_crate_dependencies + cargo clippy -p "$mod" --all-features -- -D unused_crate_dependencies + cargo clippy -p "$mod" --no-default-features -- -D unused_crate_dependencies + - name: Clippy arrow-string + run: | + mod=arrow-string + cargo clippy -p "$mod" --all-targets --all-features -- -D warnings + # Dependency checks excluding tests & benches. + cargo clippy -p "$mod" -- -D unused_crate_dependencies + cargo clippy -p "$mod" --all-features -- -D unused_crate_dependencies + cargo clippy -p "$mod" --no-default-features -- -D unused_crate_dependencies + - name: Clippy arrow-ord + run: | + mod=arrow-ord + cargo clippy -p "$mod" --all-targets --all-features -- -D warnings + # Dependency checks excluding tests & benches. + cargo clippy -p "$mod" -- -D unused_crate_dependencies + cargo clippy -p "$mod" --all-features -- -D unused_crate_dependencies + cargo clippy -p "$mod" --no-default-features -- -D unused_crate_dependencies + - name: Clippy arrow-arith + run: | + mod=arrow-arith + cargo clippy -p "$mod" --all-targets --all-features -- -D warnings + # Dependency checks excluding tests & benches. + cargo clippy -p "$mod" -- -D unused_crate_dependencies + cargo clippy -p "$mod" --all-features -- -D unused_crate_dependencies + cargo clippy -p "$mod" --no-default-features -- -D unused_crate_dependencies + - name: Clippy arrow-row + run: | + mod=arrow-row + cargo clippy -p "$mod" --all-targets --all-features -- -D warnings + # Dependency checks excluding tests & benches. + cargo clippy -p "$mod" -- -D unused_crate_dependencies + cargo clippy -p "$mod" --all-features -- -D unused_crate_dependencies + cargo clippy -p "$mod" --no-default-features -- -D unused_crate_dependencies + - name: Clippy arrow + run: | + mod=arrow + cargo clippy -p "$mod" --all-targets --all-features -- -D warnings + # Dependency checks excluding tests & benches. + cargo clippy -p "$mod" -- -D unused_crate_dependencies + cargo clippy -p "$mod" --all-features -- -D unused_crate_dependencies + cargo clippy -p "$mod" --no-default-features -- -D unused_crate_dependencies + - name: Clippy arrow-integration-test + run: | + mod=arrow-integration-test + cargo clippy -p "$mod" --all-targets --all-features -- -D warnings + # Dependency checks excluding tests & benches. + cargo clippy -p "$mod" -- -D unused_crate_dependencies + cargo clippy -p "$mod" --all-features -- -D unused_crate_dependencies + cargo clippy -p "$mod" --no-default-features -- -D unused_crate_dependencies + - name: Clippy arrow-integration-testing + run: | + mod=arrow-integration-testing + cargo clippy -p "$mod" --all-targets --all-features -- -D warnings + # Dependency checks excluding tests & benches. + cargo clippy -p "$mod" -- -D unused_crate_dependencies + cargo clippy -p "$mod" --all-features -- -D unused_crate_dependencies + cargo clippy -p "$mod" --no-default-features -- -D unused_crate_dependencies diff --git a/arrow-csv/Cargo.toml b/arrow-csv/Cargo.toml index 4729779c43b..8823924eb55 100644 --- a/arrow-csv/Cargo.toml +++ b/arrow-csv/Cargo.toml @@ -35,7 +35,6 @@ bench = false [dependencies] arrow-array = { workspace = true } -arrow-buffer = { workspace = true } arrow-cast = { workspace = true } arrow-schema = { workspace = true } chrono = { workspace = true } @@ -45,6 +44,7 @@ lazy_static = { version = "1.4", default-features = false } regex = { version = "1.7.0", default-features = false, features = ["std", "unicode", "perf"] } [dev-dependencies] +arrow-buffer = { workspace = true } tempfile = "3.3" futures = "0.3" tokio = { version = "1.27", default-features = false, features = ["io-util"] } diff --git a/arrow-flight/Cargo.toml b/arrow-flight/Cargo.toml index 012d3947f02..fbb295036a9 100644 --- a/arrow-flight/Cargo.toml +++ b/arrow-flight/Cargo.toml @@ -43,11 +43,11 @@ base64 = { version = "0.22", default-features = false, features = ["std"] } bytes = { version = "1", default-features = false } futures = { version = "0.3", default-features = false, features = ["alloc"] } once_cell = { version = "1", optional = true } -paste = { version = "1.0" } +paste = { version = "1.0" , optional = true } prost = { version = "0.13.1", default-features = false, features = ["prost-derive"] } # For Timestamp type prost-types = { version = "0.13.1", default-features = false } -tokio = { version = "1.0", default-features = false, features = ["macros", "rt", "rt-multi-thread"] } +tokio = { version = "1.0", default-features = false, features = ["macros", "rt", "rt-multi-thread"], optional = true } tonic = { version = "0.12.3", default-features = false, features = ["transport", "codegen", "prost"] } # CLI-related dependencies @@ -61,11 +61,10 @@ all-features = true [features] default = [] -flight-sql-experimental = ["dep:arrow-arith", "dep:arrow-data", "dep:arrow-ord", "dep:arrow-row", "dep:arrow-select", "dep:arrow-string", "dep:once_cell"] +flight-sql-experimental = ["dep:arrow-arith", "dep:arrow-data", "dep:arrow-ord", "dep:arrow-row", "dep:arrow-select", "dep:arrow-string", "dep:once_cell", "dep:paste"] tls = ["tonic/tls"] - # Enable CLI tools -cli = ["dep:anyhow", "arrow-array/chrono-tz", "arrow-cast/prettyprint", "dep:clap", "dep:tracing-log", "dep:tracing-subscriber", "tonic/tls-webpki-roots"] +cli = ["arrow-array/chrono-tz", "arrow-cast/prettyprint", "tonic/tls-webpki-roots", "dep:anyhow", "dep:clap", "dep:tracing-log", "dep:tracing-subscriber"] [dev-dependencies] arrow-cast = { workspace = true, features = ["prettyprint"] } @@ -75,6 +74,9 @@ http-body = "1.0.0" hyper-util = "0.1" pin-project-lite = "0.2" tempfile = "3.3" +tracing-log = { version = "0.2" } +tracing-subscriber = { version = "0.3.1", default-features = false, features = ["ansi", "env-filter", "fmt"] } +tokio = { version = "1.0", default-features = false, features = ["macros", "rt", "rt-multi-thread"] } tokio-stream = { version = "0.1", features = ["net"] } tower = { version = "0.5.0", features = ["util"] } uuid = { version = "1.10.0", features = ["v4"] } diff --git a/arrow-flight/gen/Cargo.toml b/arrow-flight/gen/Cargo.toml index ccd32b433cf..6358227a891 100644 --- a/arrow-flight/gen/Cargo.toml +++ b/arrow-flight/gen/Cargo.toml @@ -32,5 +32,5 @@ publish = false [dependencies] # Pin specific version of the tonic-build dependencies to avoid auto-generated # (and checked in) arrow.flight.protocol.rs from changing -prost-build = { version = "=0.13.3", default-features = false } +prost-build = { version = "=0.13.4", default-features = false } tonic-build = { version = "=0.12.3", default-features = false, features = ["transport", "prost"] } diff --git a/arrow-flight/src/lib.rs b/arrow-flight/src/lib.rs index 9f18416c06e..81fa99faeb9 100644 --- a/arrow-flight/src/lib.rs +++ b/arrow-flight/src/lib.rs @@ -38,6 +38,8 @@ //! [Flight SQL]: https://arrow.apache.org/docs/format/FlightSql.html #![allow(rustdoc::invalid_html_tags)] #![warn(missing_docs)] +// The unused_crate_dependencies lint does not work well for crates defining additional examples/bin targets +#![allow(unused_crate_dependencies)] use arrow_ipc::{convert, writer, writer::EncodedData, writer::IpcWriteOptions}; use arrow_schema::{ArrowError, Schema}; diff --git a/arrow-integration-testing/Cargo.toml b/arrow-integration-testing/Cargo.toml index 0ebbc05a8b8..8654b4b9273 100644 --- a/arrow-integration-testing/Cargo.toml +++ b/arrow-integration-testing/Cargo.toml @@ -36,18 +36,17 @@ logging = ["tracing-subscriber"] [dependencies] arrow = { path = "../arrow", default-features = false, features = ["test_utils", "ipc", "ipc_compression", "json", "ffi"] } arrow-flight = { path = "../arrow-flight", default-features = false } -arrow-buffer = { path = "../arrow-buffer", default-features = false } arrow-integration-test = { path = "../arrow-integration-test", default-features = false } -async-trait = { version = "0.1.41", default-features = false } clap = { version = "4", default-features = false, features = ["std", "derive", "help", "error-context", "usage"] } futures = { version = "0.3", default-features = false } prost = { version = "0.13", default-features = false } serde = { version = "1.0", default-features = false, features = ["rc", "derive"] } serde_json = { version = "1.0", default-features = false, features = ["std"] } -tokio = { version = "1.0", default-features = false } +tokio = { version = "1.0", default-features = false, features = [ "rt-multi-thread"] } tonic = { version = "0.12", default-features = false } tracing-subscriber = { version = "0.3.1", default-features = false, features = ["fmt"], optional = true } flate2 = { version = "1", default-features = false, features = ["rust_backend"] } [dev-dependencies] +arrow-buffer = { path = "../arrow-buffer", default-features = false } tempfile = { version = "3", default-features = false } diff --git a/arrow-integration-testing/src/bin/arrow-file-to-stream.rs b/arrow-integration-testing/src/bin/arrow-file-to-stream.rs index 3e027faef91..661f0a047db 100644 --- a/arrow-integration-testing/src/bin/arrow-file-to-stream.rs +++ b/arrow-integration-testing/src/bin/arrow-file-to-stream.rs @@ -15,6 +15,9 @@ // specific language governing permissions and limitations // under the License. +// The unused_crate_dependencies lint does not work well for crates defining additional examples/bin targets +#![allow(unused_crate_dependencies)] + use arrow::error::Result; use arrow::ipc::reader::FileReader; use arrow::ipc::writer::StreamWriter; diff --git a/arrow-integration-testing/src/bin/arrow-json-integration-test.rs b/arrow-integration-testing/src/bin/arrow-json-integration-test.rs index cc3dd2110e3..6a901cc63ba 100644 --- a/arrow-integration-testing/src/bin/arrow-json-integration-test.rs +++ b/arrow-integration-testing/src/bin/arrow-json-integration-test.rs @@ -15,6 +15,9 @@ // specific language governing permissions and limitations // under the License. +// The unused_crate_dependencies lint does not work well for crates defining additional examples/bin targets +#![allow(unused_crate_dependencies)] + use arrow::error::{ArrowError, Result}; use arrow::ipc::reader::FileReader; use arrow::ipc::writer::FileWriter; diff --git a/arrow-integration-testing/src/bin/arrow-stream-to-file.rs b/arrow-integration-testing/src/bin/arrow-stream-to-file.rs index 07ac5c7ddd4..8b4bb332781 100644 --- a/arrow-integration-testing/src/bin/arrow-stream-to-file.rs +++ b/arrow-integration-testing/src/bin/arrow-stream-to-file.rs @@ -15,6 +15,9 @@ // specific language governing permissions and limitations // under the License. +// The unused_crate_dependencies lint does not work well for crates defining additional examples/bin targets +#![allow(unused_crate_dependencies)] + use std::io; use arrow::error::Result; diff --git a/arrow-integration-testing/src/bin/flight-test-integration-client.rs b/arrow-integration-testing/src/bin/flight-test-integration-client.rs index b8bbb952837..0d16fe3b403 100644 --- a/arrow-integration-testing/src/bin/flight-test-integration-client.rs +++ b/arrow-integration-testing/src/bin/flight-test-integration-client.rs @@ -15,6 +15,9 @@ // specific language governing permissions and limitations // under the License. +// The unused_crate_dependencies lint does not work well for crates defining additional examples/bin targets +#![allow(unused_crate_dependencies)] + use arrow_integration_testing::flight_client_scenarios; use clap::Parser; type Error = Box; diff --git a/arrow-integration-testing/src/bin/flight-test-integration-server.rs b/arrow-integration-testing/src/bin/flight-test-integration-server.rs index 5310d07d4f8..94be7130979 100644 --- a/arrow-integration-testing/src/bin/flight-test-integration-server.rs +++ b/arrow-integration-testing/src/bin/flight-test-integration-server.rs @@ -15,6 +15,9 @@ // specific language governing permissions and limitations // under the License. +// The unused_crate_dependencies lint does not work well for crates defining additional examples/bin targets +#![allow(unused_crate_dependencies)] + use arrow_integration_testing::flight_server_scenarios; use clap::Parser; diff --git a/arrow-integration-testing/src/lib.rs b/arrow-integration-testing/src/lib.rs index c8ce01e9f13..e669690ef4f 100644 --- a/arrow-integration-testing/src/lib.rs +++ b/arrow-integration-testing/src/lib.rs @@ -17,6 +17,8 @@ //! Common code used in the integration test binaries +// The unused_crate_dependencies lint does not work well for crates defining additional examples/bin targets +#![allow(unused_crate_dependencies)] #![warn(missing_docs)] use serde_json::Value; diff --git a/arrow-ord/Cargo.toml b/arrow-ord/Cargo.toml index a2894c02029..8d74d2f97d7 100644 --- a/arrow-ord/Cargo.toml +++ b/arrow-ord/Cargo.toml @@ -39,7 +39,7 @@ arrow-buffer = { workspace = true } arrow-data = { workspace = true } arrow-schema = { workspace = true } arrow-select = { workspace = true } -half = { version = "2.1", default-features = false, features = ["num-traits"] } [dev-dependencies] +half = { version = "2.1", default-features = false, features = ["num-traits"] } rand = { version = "0.8", default-features = false, features = ["std", "std_rng"] } diff --git a/arrow-row/Cargo.toml b/arrow-row/Cargo.toml index 3754afb4dbc..90d99684d26 100644 --- a/arrow-row/Cargo.toml +++ b/arrow-row/Cargo.toml @@ -33,12 +33,6 @@ name = "arrow_row" path = "src/lib.rs" bench = false -[target.'cfg(target_arch = "wasm32")'.dependencies] -ahash = { version = "0.8", default-features = false, features = ["compile-time-rng"] } - -[target.'cfg(not(target_arch = "wasm32"))'.dependencies] -ahash = { version = "0.8", default-features = false, features = ["runtime-rng"] } - [dependencies] arrow-array = { workspace = true } arrow-buffer = { workspace = true } diff --git a/arrow/Cargo.toml b/arrow/Cargo.toml index 32b2384afe7..8860cd61c5b 100644 --- a/arrow/Cargo.toml +++ b/arrow/Cargo.toml @@ -56,8 +56,6 @@ arrow-string = { workspace = true } rand = { version = "0.8", default-features = false, features = ["std", "std_rng"], optional = true } pyo3 = { version = "0.23", default-features = false, optional = true } -chrono = { workspace = true, optional = true } - [package.metadata.docs.rs] features = ["prettyprint", "ipc_compression", "ffi", "pyarrow"] @@ -72,7 +70,7 @@ prettyprint = ["arrow-cast/prettyprint"] # not the core arrow code itself. Be aware that `rand` must be kept as # an optional dependency for supporting compile to wasm32-unknown-unknown # target without assuming an environment containing JavaScript. -test_utils = ["rand", "dep:chrono"] +test_utils = ["dep:rand"] pyarrow = ["pyo3", "ffi"] # force_validate runs full data validation for all arrays that are created # this is not enabled by default as it is too computationally expensive diff --git a/arrow/benches/cast_kernels.rs b/arrow/benches/cast_kernels.rs index ec7990d3d76..5c4fcff13de 100644 --- a/arrow/benches/cast_kernels.rs +++ b/arrow/benches/cast_kernels.rs @@ -250,6 +250,9 @@ fn add_benchmark(c: &mut Criterion) { c.bench_function("cast decimal128 to decimal128 512", |b| { b.iter(|| cast_array(&decimal128_array, DataType::Decimal128(30, 5))) }); + c.bench_function("cast decimal128 to decimal128 512 lower precision", |b| { + b.iter(|| cast_array(&decimal128_array, DataType::Decimal128(6, 5))) + }); c.bench_function("cast decimal128 to decimal256 512", |b| { b.iter(|| cast_array(&decimal128_array, DataType::Decimal256(50, 5))) }); diff --git a/object_store/Cargo.toml b/object_store/Cargo.toml index a0473362513..bcc8e0b9224 100644 --- a/object_store/Cargo.toml +++ b/object_store/Cargo.toml @@ -77,6 +77,8 @@ http-body-util = "0.1" rand = "0.8" tempfile = "3.1.0" regex = "1.11.1" +# The "gzip" feature for reqwest is enabled for an integration test. +reqwest = { version = "0.12", features = ["gzip"] } http = "1.1.0" [[test]] diff --git a/object_store/src/client/mod.rs b/object_store/src/client/mod.rs index 76d1c1f22f5..1b7ce5aa7a7 100644 --- a/object_store/src/client/mod.rs +++ b/object_store/src/client/mod.rs @@ -671,6 +671,10 @@ impl ClientOptions { builder = builder.danger_accept_invalid_certs(true) } + // Reqwest will remove the `Content-Length` header if it is configured to + // transparently decompress the body via the non-default `gzip` feature. + builder = builder.no_gzip(); + builder .https_only(!self.allow_http.get()?) .build() diff --git a/object_store/tests/http.rs b/object_store/tests/http.rs new file mode 100644 index 00000000000..a9b3145bb66 --- /dev/null +++ b/object_store/tests/http.rs @@ -0,0 +1,43 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Tests the HTTP store implementation + +#[cfg(feature = "http")] +use object_store::{http::HttpBuilder, path::Path, GetOptions, GetRange, ObjectStore}; + +/// Tests that even when reqwest has the `gzip` feature enabled, the HTTP store +/// does not error on a missing `Content-Length` header. +#[tokio::test] +#[cfg(feature = "http")] +async fn test_http_store_gzip() { + let http_store = HttpBuilder::new() + .with_url("https://raw.githubusercontent.com/apache/arrow-rs/refs/heads/main") + .build() + .unwrap(); + + let _ = http_store + .get_opts( + &Path::parse("LICENSE.txt").unwrap(), + GetOptions { + range: Some(GetRange::Bounded(0..100)), + ..Default::default() + }, + ) + .await + .unwrap(); +} diff --git a/parquet/src/column/writer/mod.rs b/parquet/src/column/writer/mod.rs index 8d0be5f9f8c..16de0ba7898 100644 --- a/parquet/src/column/writer/mod.rs +++ b/parquet/src/column/writer/mod.rs @@ -574,7 +574,11 @@ impl<'a, E: ColumnValueEncoder> GenericColumnWriter<'a, E> { /// anticipated encoded size. #[cfg(feature = "arrow")] pub(crate) fn get_estimated_total_bytes(&self) -> u64 { - self.column_metrics.total_bytes_written + self.data_pages + .iter() + .map(|page| page.data().len() as u64) + .sum::() + + self.column_metrics.total_bytes_written + self.encoder.estimated_data_page_size() as u64 + self.encoder.estimated_dict_page_size().unwrap_or_default() as u64 } @@ -3422,6 +3426,26 @@ mod tests { assert!(stats.max_bytes_opt().is_none()); } + #[test] + #[cfg(feature = "arrow")] + fn test_column_writer_get_estimated_total_bytes() { + let page_writer = get_test_page_writer(); + let props = Default::default(); + let mut writer = get_test_column_writer::(page_writer, 0, 0, props); + assert_eq!(writer.get_estimated_total_bytes(), 0); + + writer.write_batch(&[1, 2, 3, 4], None, None).unwrap(); + writer.add_data_page().unwrap(); + let size_with_one_page = writer.get_estimated_total_bytes(); + assert_eq!(size_with_one_page, 20); + + writer.write_batch(&[5, 6, 7, 8], None, None).unwrap(); + writer.add_data_page().unwrap(); + let size_with_two_pages = writer.get_estimated_total_bytes(); + // different pages have different compressed lengths + assert_eq!(size_with_two_pages, 20 + 21); + } + fn write_multiple_pages( column_descr: &Arc, pages: &[&[Option]],