Merge branch 'apache:main' into panic
jp0317 authored Dec 19, 2024
2 parents 78994df + 2887cc1 commit 55f3f64
Showing 14 changed files with 627 additions and 117 deletions.
27 changes: 18 additions & 9 deletions .github/workflows/rust.yml
@@ -101,18 +101,19 @@ jobs:
- name: Format arrow
run: cargo fmt --all -- --check
- name: Format parquet
# Many modules in parquet are skipped, so check parquet separately. If this check fails, run:
# cargo fmt -p parquet -- --config skip_children=true `find ./parquet -name "*.rs" \! -name format.rs`
# from the top level arrow-rs directory and check in the result.
# Many modules in parquet are skipped, so check parquet separately
# https://github.com/apache/arrow-rs/issues/6179
working-directory: parquet
run: cargo fmt -p parquet -- --check --config skip_children=true `find . -name "*.rs" \! -name format.rs`
run: |
# if this fails, run this from the parquet directory:
# cargo fmt -p parquet -- --config skip_children=true `find . -name "*.rs" \! -name format.rs`
cargo fmt -p parquet -- --check --config skip_children=true `find . -name "*.rs" \! -name format.rs`
- name: Format object_store
working-directory: object_store
run: cargo fmt --all -- --check

msrv:
name: Verify MSRV
name: Verify MSRV (Minimum Supported Rust Version)
runs-on: ubuntu-latest
container:
image: amd64/rust
@@ -126,13 +127,19 @@ jobs:
run: cargo update -p ahash --precise 0.8.7
- name: Check arrow
working-directory: arrow
run: cargo msrv --log-target stdout verify
run: |
# run `cd arrow; cargo msrv verify` to see problematic dependencies
cargo msrv verify --output-format=json
- name: Check parquet
working-directory: parquet
run: cargo msrv --log-target stdout verify
run: |
# run `cd parquet; cargo msrv verify` to see problematic dependencies
cargo msrv verify --output-format=json
- name: Check arrow-flight
working-directory: arrow-flight
run: cargo msrv --log-target stdout verify
run: |
# run `cd arrow-flight; cargo msrv verify` to see problematic dependencies
cargo msrv verify --output-format=json
- name: Downgrade object_store dependencies
working-directory: object_store
# Necessary because tokio 1.30.0 updates MSRV to 1.63
@@ -142,4 +149,6 @@ jobs:
cargo update -p url --precise 2.5.0
- name: Check object_store
working-directory: object_store
run: cargo msrv --log-target stdout verify
run: |
# run `cd object_store; cargo msrv verify` to see problematic dependencies
cargo msrv verify --output-format=json
170 changes: 170 additions & 0 deletions CHANGELOG-old.md

Large diffs are not rendered by default.

173 changes: 94 additions & 79 deletions CHANGELOG.md

Large diffs are not rendered by default.

32 changes: 16 additions & 16 deletions Cargo.toml
@@ -62,7 +62,7 @@ exclude = [
]

[workspace.package]
version = "53.3.0"
version = "54.0.0"
homepage = "https://github.com/apache/arrow-rs"
repository = "https://github.com/apache/arrow-rs"
authors = ["Apache Arrow <dev@arrow.apache.org>"]
@@ -77,20 +77,20 @@ edition = "2021"
rust-version = "1.62"

[workspace.dependencies]
arrow = { version = "53.3.0", path = "./arrow", default-features = false }
arrow-arith = { version = "53.3.0", path = "./arrow-arith" }
arrow-array = { version = "53.3.0", path = "./arrow-array" }
arrow-buffer = { version = "53.3.0", path = "./arrow-buffer" }
arrow-cast = { version = "53.3.0", path = "./arrow-cast" }
arrow-csv = { version = "53.3.0", path = "./arrow-csv" }
arrow-data = { version = "53.3.0", path = "./arrow-data" }
arrow-ipc = { version = "53.3.0", path = "./arrow-ipc" }
arrow-json = { version = "53.3.0", path = "./arrow-json" }
arrow-ord = { version = "53.3.0", path = "./arrow-ord" }
arrow-row = { version = "53.3.0", path = "./arrow-row" }
arrow-schema = { version = "53.3.0", path = "./arrow-schema" }
arrow-select = { version = "53.3.0", path = "./arrow-select" }
arrow-string = { version = "53.3.0", path = "./arrow-string" }
parquet = { version = "53.3.0", path = "./parquet", default-features = false }
arrow = { version = "54.0.0", path = "./arrow", default-features = false }
arrow-arith = { version = "54.0.0", path = "./arrow-arith" }
arrow-array = { version = "54.0.0", path = "./arrow-array" }
arrow-buffer = { version = "54.0.0", path = "./arrow-buffer" }
arrow-cast = { version = "54.0.0", path = "./arrow-cast" }
arrow-csv = { version = "54.0.0", path = "./arrow-csv" }
arrow-data = { version = "54.0.0", path = "./arrow-data" }
arrow-ipc = { version = "54.0.0", path = "./arrow-ipc" }
arrow-json = { version = "54.0.0", path = "./arrow-json" }
arrow-ord = { version = "54.0.0", path = "./arrow-ord" }
arrow-row = { version = "54.0.0", path = "./arrow-row" }
arrow-schema = { version = "54.0.0", path = "./arrow-schema" }
arrow-select = { version = "54.0.0", path = "./arrow-select" }
arrow-string = { version = "54.0.0", path = "./arrow-string" }
parquet = { version = "54.0.0", path = "./parquet", default-features = false }

chrono = { version = "0.4.34", default-features = false, features = ["clock"] }
42 changes: 35 additions & 7 deletions README.md
@@ -63,13 +63,14 @@ is described in the [contributing] guide.

Planned Release Schedule

| Approximate Date | Version | Notes |
| ---------------- | -------- | --------------------------------------- |
| Nov 2024 | `53.3.0` | Minor, NO breaking API changes |
| Dec 2024 | `54.0.0` | Major, potentially breaking API changes |
| Jan 2025 | `54.1.0` | Minor, NO breaking API changes |
| Feb 2025 | `54.2.0` | Minor, NO breaking API changes |
| Mar 2025 | `55.0.0` | Major, potentially breaking API changes |
| Approximate Date | Version | Notes |
| ---------------- | -------- | ------------------------------------------ |
| Nov 2024 | `53.3.0` | Minor, NO breaking API changes |
| Dec 2024 | `54.0.0` | Major, potentially breaking API changes |
| Jan 2025 | `53.4.0` | Minor, NO breaking API changes (`53` line) |
| Jan 2025 | `54.1.0` | Minor, NO breaking API changes |
| Feb 2025 | `54.2.0` | Minor, NO breaking API changes |
| Mar 2025 | `55.0.0` | Major, potentially breaking API changes |

[this ticket]: https://github.com/apache/arrow-rs/issues/5368
[semantic versioning]: https://semver.org/
@@ -82,6 +83,33 @@ versions approximately every 2 months.

[`object_store`]: https://crates.io/crates/object_store

### Deprecation Guidelines

Minor releases may deprecate, but not remove, APIs. Deprecating an API lets
downstream Rust programs continue to compile while emitting compiler warnings,
giving downstream crates time to migrate before the API is removed.

To deprecate an API:

- Mark the API as deprecated using `#[deprecated]` and specify the exact arrow-rs version in which it was deprecated
- Concisely describe the preferred API to help the user transition

The `since` version is the next version that will be released (please
consult the schedule above). To mark the API as deprecated, use the
`#[deprecated(since = "...", note = "...")]` attribute.

For example:

```rust
#[deprecated(since = "51.0.0", note = "Use `date_part` instead")]
```

In general, deprecated APIs will remain in the codebase for at least two major releases after
they were deprecated (typically 6 to 9 months later). For example, an API
deprecated in `51.3.0` can be removed in `54.0.0` (or later). Deprecated APIs
may be removed earlier or later than these guidelines at the discretion of the
maintainers.
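
As a concrete illustration, a full deprecation might look like the following
sketch. The functions here (`old_sum`, `checked_sum`) are hypothetical,
invented only to show the pattern; the `since` version would be the next
planned release from the schedule above.

```rust
// Hypothetical API being phased out; calls to it still compile but warn.
#[deprecated(since = "54.0.0", note = "Use `checked_sum` instead")]
pub fn old_sum(values: &[i64]) -> i64 {
    values.iter().sum()
}

/// Preferred replacement: returns `None` on overflow instead of panicking
/// (debug) or wrapping (release).
pub fn checked_sum(values: &[i64]) -> Option<i64> {
    values.iter().try_fold(0i64, |acc, &v| acc.checked_add(v))
}
```

Downstream code calling `old_sum` keeps compiling but emits a deprecation
warning pointing at `checked_sum`, which provides exactly the migration window
the guideline describes.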

## Related Projects

There are several related crates in different repositories
2 changes: 1 addition & 1 deletion arrow-flight/README.md
@@ -31,7 +31,7 @@ Add this to your Cargo.toml:

```toml
[dependencies]
arrow-flight = "53.3.0"
arrow-flight = "54.0.0"
```

Apache Arrow Flight is a gRPC based protocol for exchanging Arrow data between processes. See the blog post [Introducing Apache Arrow Flight: A Framework for Fast Data Transport](https://arrow.apache.org/blog/2019/10/13/introducing-arrow-flight/) for more information.
8 changes: 8 additions & 0 deletions arrow-schema/src/datatype.rs
@@ -196,6 +196,14 @@ pub enum DataType {
/// DataType::Timestamp(TimeUnit::Second, Some("literal".into()));
/// DataType::Timestamp(TimeUnit::Second, Some("string".to_string().into()));
/// ```
///
/// Timezone string parsing
/// -----------------------
/// When feature `chrono-tz` is not enabled, allowed timezone strings are fixed offsets of the form "+09:00", "-09" or "+0930".
///
/// When feature `chrono-tz` is enabled, additional strings supported by [chrono_tz](https://docs.rs/chrono-tz/latest/chrono_tz/)
/// are also allowed, which include [IANA database](https://en.wikipedia.org/wiki/List_of_tz_database_time_zones)
/// timezones.
Timestamp(TimeUnit, Option<Arc<str>>),
/// A signed 32-bit date representing the elapsed time since UNIX epoch (1970-01-01)
/// in days.
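
To make the newly documented timezone behavior concrete, here is a minimal
sketch (not part of the diff) constructing `DataType::Timestamp` with the two
kinds of timezone strings. It assumes only the `arrow-schema` crate; note that
the type stores any string, and IANA names are only actually parsed when the
`chrono-tz` feature is enabled.

```rust
use std::sync::Arc;
use arrow_schema::{DataType, TimeUnit};

fn main() {
    // Fixed-offset strings such as "+09:00", "-09", or "+0930" are
    // always supported.
    let fixed = DataType::Timestamp(TimeUnit::Second, Some(Arc::from("+09:00")));

    // IANA database names like "Asia/Tokyo" parse only with the
    // `chrono-tz` feature; constructing the type itself always works.
    let named = DataType::Timestamp(TimeUnit::Millisecond, Some("Asia/Tokyo".into()));

    println!("{fixed:?}");
    println!("{named:?}");
}
```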
25 changes: 25 additions & 0 deletions arrow-schema/src/field.rs
@@ -426,6 +426,19 @@ impl Field {
}

/// Returns whether this `Field`'s dictionary is ordered, if this is a dictionary type.
///
/// # Example
/// ```
/// # use arrow_schema::{DataType, Field};
/// // non-dictionary fields do not have a `dict_is_ordered` flag
/// let field = Field::new("c1", DataType::Int64, false);
/// assert_eq!(field.dict_is_ordered(), None);
/// // by default dictionary is not ordered
/// let field = Field::new("c1", DataType::Dictionary(Box::new(DataType::Int64), Box::new(DataType::Utf8)), false);
/// assert_eq!(field.dict_is_ordered(), Some(false));
/// let field = field.with_dict_is_ordered(true);
/// assert_eq!(field.dict_is_ordered(), Some(true));
/// ```
#[inline]
pub const fn dict_is_ordered(&self) -> Option<bool> {
match self.data_type {
@@ -434,6 +447,18 @@ impl Field {
}
}

/// Set the `dict_is_ordered` flag for this `Field`, if it is a dictionary.
///
/// Does nothing if this is not a dictionary type.
///
/// See [`Field::dict_is_ordered`] for more information.
pub fn with_dict_is_ordered(mut self, dict_is_ordered: bool) -> Self {
if matches!(self.data_type, DataType::Dictionary(_, _)) {
self.dict_is_ordered = dict_is_ordered;
};
self
}

/// Merge this field into self if it is compatible.
///
/// Struct fields are merged recursively.
2 changes: 1 addition & 1 deletion arrow/README.md
@@ -37,7 +37,7 @@ This crate is tested with the latest stable version of Rust. We do not currently

The `arrow` crate follows the [SemVer standard] defined by Cargo and works well
within the Rust crate ecosystem. See the [repository README] for more details on
the release schedule and version.
the release schedule, versioning, and deprecation policy.

[SemVer standard]: https://doc.rust-lang.org/cargo/reference/semver.html
[repository README]: https://github.com/apache/arrow-rs
4 changes: 2 additions & 2 deletions dev/release/update_change_log.sh
@@ -29,8 +29,8 @@

set -e

SINCE_TAG="53.2.0"
FUTURE_RELEASE="53.3.0"
SINCE_TAG="53.3.0"
FUTURE_RELEASE="54.0.0"

SOURCE_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
SOURCE_TOP_DIR="$(cd "${SOURCE_DIR}/../../" && pwd)"
2 changes: 1 addition & 1 deletion parquet/README.md
@@ -36,7 +36,7 @@ This crate is tested with the latest stable version of Rust. We do not currently

The `parquet` crate follows the [SemVer standard] defined by Cargo and works well
within the Rust crate ecosystem. See the [repository README] for more details on
the release schedule and version.
the release schedule, versioning, and deprecation policy.

[semver standard]: https://doc.rust-lang.org/cargo/reference/semver.html
[repository readme]: https://github.com/apache/arrow-rs
68 changes: 68 additions & 0 deletions parquet/src/arrow/arrow_reader/mod.rs
@@ -989,6 +989,21 @@ mod tests {
assert_eq!(original_schema.fields()[1], reader.schema().fields()[0]);
}

#[test]
fn test_arrow_reader_single_column_by_name() {
let file = get_test_file("parquet/generated_simple_numerics/blogs.parquet");

let builder = ParquetRecordBatchReaderBuilder::try_new(file).unwrap();
let original_schema = Arc::clone(builder.schema());

let mask = ProjectionMask::columns(builder.parquet_schema(), ["blog_id"]);
let reader = builder.with_projection(mask).build().unwrap();

// Verify that the schema was correctly parsed
assert_eq!(1, reader.schema().fields().len());
assert_eq!(original_schema.fields()[1], reader.schema().fields()[0]);
}

#[test]
fn test_null_column_reader_test() {
let mut file = tempfile::tempfile().unwrap();
@@ -2563,6 +2578,59 @@ mod tests {
}
}

#[test]
// same as test_read_structs but constructs projection mask via column names
fn test_read_structs_by_name() {
let testdata = arrow::util::test_util::parquet_test_data();
let path = format!("{testdata}/nested_structs.rust.parquet");
let file = File::open(&path).unwrap();
let record_batch_reader = ParquetRecordBatchReader::try_new(file, 60).unwrap();

for batch in record_batch_reader {
batch.unwrap();
}

let file = File::open(&path).unwrap();
let builder = ParquetRecordBatchReaderBuilder::try_new(file).unwrap();

let mask = ProjectionMask::columns(
builder.parquet_schema(),
["roll_num.count", "PC_CUR.mean", "PC_CUR.sum"],
);
let projected_reader = builder
.with_projection(mask)
.with_batch_size(60)
.build()
.unwrap();

let expected_schema = Schema::new(vec![
Field::new(
"roll_num",
ArrowDataType::Struct(Fields::from(vec![Field::new(
"count",
ArrowDataType::UInt64,
false,
)])),
false,
),
Field::new(
"PC_CUR",
ArrowDataType::Struct(Fields::from(vec![
Field::new("mean", ArrowDataType::Int64, false),
Field::new("sum", ArrowDataType::Int64, false),
])),
false,
),
]);

assert_eq!(&expected_schema, projected_reader.schema().as_ref());

for batch in projected_reader {
let batch = batch.unwrap();
assert_eq!(batch.schema().as_ref(), &expected_schema);
}
}

#[test]
fn test_read_maps() {
let testdata = arrow::util::test_util::parquet_test_data();
