Skip to content

Commit

Permalink
Clarify interval comparison behavior with documentation and tests (#5192
Browse files Browse the repository at this point in the history
)

* Clarify interval comparison behavior with documentation and tests

* refine language
  • Loading branch information
alamb authored Dec 8, 2023
1 parent 7e28913 commit c5a9953
Show file tree
Hide file tree
Showing 4 changed files with 255 additions and 4 deletions.
8 changes: 7 additions & 1 deletion arrow-array/src/array/primitive_array.rs
Original file line number Diff line number Diff line change
Expand Up @@ -352,12 +352,18 @@ pub type Time64MicrosecondArray = PrimitiveArray<Time64MicrosecondType>;
pub type Time64NanosecondArray = PrimitiveArray<Time64NanosecondType>;

/// A [`PrimitiveArray`] of “calendar” intervals in months
///
/// See [`IntervalYearMonthType`] for details on representation and caveats.
pub type IntervalYearMonthArray = PrimitiveArray<IntervalYearMonthType>;

/// A [`PrimitiveArray`] of “calendar” intervals in days and milliseconds
///
/// See [`IntervalDayTimeType`] for details on representation and caveats.
pub type IntervalDayTimeArray = PrimitiveArray<IntervalDayTimeType>;

/// A [`PrimitiveArray`] of “calendar” intervals in months, days, and nanoseconds
/// A [`PrimitiveArray`] of “calendar” intervals in months, days, and nanoseconds.
///
/// See [`IntervalMonthDayNanoType`] for details on representation and caveats.
pub type IntervalMonthDayNanoArray = PrimitiveArray<IntervalMonthDayNanoType>;

/// A [`PrimitiveArray`] of elapsed durations in seconds
Expand Down
71 changes: 68 additions & 3 deletions arrow-array/src/types.rs
Original file line number Diff line number Diff line change
Expand Up @@ -213,19 +213,84 @@ make_type!(
IntervalYearMonthType,
i32,
DataType::Interval(IntervalUnit::YearMonth),
"A “calendar” interval type in months."
"A “calendar” interval stored as the number of whole months."
);
make_type!(
IntervalDayTimeType,
i64,
DataType::Interval(IntervalUnit::DayTime),
"A “calendar” interval type in days and milliseconds."
r#"A “calendar” interval type in days and milliseconds.
## Representation
This type is stored as a single 64 bit integer, interpreted as two i32 fields:
1. the number of elapsed days
2. The number of milliseconds (no leap seconds),
```text
┌──────────────┬──────────────┐
│ Days │ Milliseconds │
│ (32 bits) │ (32 bits) │
└──────────────┴──────────────┘
0 31 63 bit offset
```
Please see the [Arrow Spec](https://github.com/apache/arrow/blob/081b4022fe6f659d8765efc82b3f4787c5039e3c/format/Schema.fbs#L406-L408) for more details
## Note on Comparing and Ordering for Calendar Types
Values of `IntervalDayTimeType` are compared using their binary representation,
which can lead to surprising results. Please see the description of ordering on
[`IntervalMonthDayNanoType`] for more details
"#
);
make_type!(
IntervalMonthDayNanoType,
i128,
DataType::Interval(IntervalUnit::MonthDayNano),
"A “calendar” interval type in months, days, and nanoseconds."
r#"A “calendar” interval type in months, days, and nanoseconds.
## Representation
This type is stored as a single 128 bit integer,
interpreted as three different signed integral fields:
1. The number of months (32 bits)
2. The number days (32 bits)
2. The number of nanoseconds (64 bits).
Nanoseconds does not allow for leap seconds.
Each field is independent (e.g. there is no constraint that the quantity of
nanoseconds represents less than a day's worth of time).
```text
┌──────────────────────────────┬─────────────┬──────────────┐
│ Nanos │ Days │ Months │
│ (64 bits) │ (32 bits) │ (32 bits) │
└──────────────────────────────┴─────────────┴──────────────┘
0 63 95 127 bit offset
```
Please see the [Arrow Spec](https://github.com/apache/arrow/blob/081b4022fe6f659d8765efc82b3f4787c5039e3c/format/Schema.fbs#L409-L415) for more details
## Note on Comparing and Ordering for Calendar Types
Values of `IntervalMonthDayNanoType` are compared using their binary representation,
which can lead to surprising results.
Spans of time measured in calendar units are not fixed in absolute size (e.g.
number of seconds) which makes defining comparisons and ordering non trivial.
For example `1 month` is 28 days for February but `1 month` is 31 days
in December.
This makes the seemingly simple operation of comparing two intervals
complicated in practice. For example is `1 month` more or less than `30 days`? The
answer depends on what month you are talking about.
This crate defines comparisons for calendar types using their binary
representation which is fast and efficient, but leads
to potentially surprising results.
For example a
`IntervalMonthDayNano` of `1 month` will compare as **greater** than a
`IntervalMonthDayNano` of `100 days` because the binary representation of `1 month`
is larger than the binary representation of 100 days.
"#
);
make_type!(
DurationSecondType,
Expand Down
113 changes: 113 additions & 0 deletions arrow-ord/src/comparison.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1407,6 +1407,48 @@ mod tests {
vec![6, 7, 8, 9, 10, 6, 7, 8, 9, 10],
vec![false, false, true, false, false, false, false, true, false, false]
);

cmp_vec!(
eq,
eq_dyn,
IntervalYearMonthArray,
vec![
IntervalYearMonthType::make_value(1, 2),
IntervalYearMonthType::make_value(2, 1),
// 1 year
IntervalYearMonthType::make_value(1, 0),
],
vec![
IntervalYearMonthType::make_value(1, 2),
IntervalYearMonthType::make_value(1, 2),
// NB 12 months is treated as equal to a year (as the underlying
// type stores number of months)
IntervalYearMonthType::make_value(0, 12),
],
vec![true, false, true]
);

cmp_vec!(
eq,
eq_dyn,
IntervalMonthDayNanoArray,
vec![
IntervalMonthDayNanoType::make_value(1, 2, 3),
IntervalMonthDayNanoType::make_value(3, 2, 1),
// 1 month
IntervalMonthDayNanoType::make_value(1, 0, 0),
IntervalMonthDayNanoType::make_value(1, 0, 0),
],
vec![
IntervalMonthDayNanoType::make_value(1, 2, 3),
IntervalMonthDayNanoType::make_value(1, 2, 3),
// 30 days is not treated as a month
IntervalMonthDayNanoType::make_value(0, 30, 0),
// 100 days
IntervalMonthDayNanoType::make_value(0, 100, 0),
],
vec![true, false, false, false]
);
}

#[test]
Expand Down Expand Up @@ -1660,6 +1702,77 @@ mod tests {
vec![6, 7, 8, 9, 10, 6, 7, 8, 9, 10],
vec![false, false, false, true, true, false, false, false, true, true]
);

cmp_vec!(
lt,
lt_dyn,
IntervalDayTimeArray,
vec![
IntervalDayTimeType::make_value(1, 0),
IntervalDayTimeType::make_value(0, 1000),
IntervalDayTimeType::make_value(1, 1000),
IntervalDayTimeType::make_value(1, 3000),
// 90M milliseconds
IntervalDayTimeType::make_value(0, 90_000_000),
],
vec![
IntervalDayTimeType::make_value(0, 1000),
IntervalDayTimeType::make_value(1, 0),
IntervalDayTimeType::make_value(10, 0),
IntervalDayTimeType::make_value(2, 1),
// NB even though 1 day is less than 90M milliseconds long,
// it compares as greater because the underlying type stores
// days and milliseconds as different fields
IntervalDayTimeType::make_value(0, 12),
],
vec![false, true, true, true ,false]
);

cmp_vec!(
lt,
lt_dyn,
IntervalYearMonthArray,
vec![
IntervalYearMonthType::make_value(1, 2),
IntervalYearMonthType::make_value(2, 1),
IntervalYearMonthType::make_value(1, 2),
// 1 year
IntervalYearMonthType::make_value(1, 0),
],
vec![
IntervalYearMonthType::make_value(1, 2),
IntervalYearMonthType::make_value(1, 2),
IntervalYearMonthType::make_value(2, 1),
// NB 12 months is treated as equal to a year (as the underlying
// type stores number of months)
IntervalYearMonthType::make_value(0, 12),
],
vec![false, false, true, false]
);

cmp_vec!(
lt,
lt_dyn,
IntervalMonthDayNanoArray,
vec![
IntervalMonthDayNanoType::make_value(1, 2, 3),
IntervalMonthDayNanoType::make_value(3, 2, 1),
// 1 month
IntervalMonthDayNanoType::make_value(1, 0, 0),
IntervalMonthDayNanoType::make_value(1, 2, 0),
IntervalMonthDayNanoType::make_value(1, 0, 0),
],
vec![
IntervalMonthDayNanoType::make_value(1, 2, 3),
IntervalMonthDayNanoType::make_value(1, 2, 3),
IntervalMonthDayNanoType::make_value(2, 0, 0),
// 30 days is not treated as a month
IntervalMonthDayNanoType::make_value(0, 30, 0),
// 100 days (note is treated as greater than 1 month as the underlying integer representation)
IntervalMonthDayNanoType::make_value(0, 100, 0),
],
vec![false, false, true, false, false]
);
}

#[test]
Expand Down
67 changes: 67 additions & 0 deletions arrow-ord/src/ord.rs
Original file line number Diff line number Diff line change
Expand Up @@ -216,6 +216,73 @@ pub mod tests {
assert_eq!(Ordering::Greater, cmp(1, 0));
}

#[test]
fn test_interval_day_time() {
let array = IntervalDayTimeArray::from(vec![
// 0 days, 1 second
IntervalDayTimeType::make_value(0, 1000),
// 1 day, 2 milliseconds
IntervalDayTimeType::make_value(1, 2),
// 90M milliseconds (which is more than is in 1 day)
IntervalDayTimeType::make_value(0, 90_000_000),
]);

let cmp = build_compare(&array, &array).unwrap();

assert_eq!(Ordering::Less, cmp(0, 1));
assert_eq!(Ordering::Greater, cmp(1, 0));

// somewhat confusingly, while 90M milliseconds is more than 1 day,
// it will compare less as the comparison is done on the underlying
// values not field by field
assert_eq!(Ordering::Greater, cmp(1, 2));
assert_eq!(Ordering::Less, cmp(2, 1));
}

#[test]
fn test_interval_year_month() {
let array = IntervalYearMonthArray::from(vec![
// 1 year, 0 months
IntervalYearMonthType::make_value(1, 0),
// 0 years, 13 months
IntervalYearMonthType::make_value(0, 13),
// 1 year, 1 month
IntervalYearMonthType::make_value(1, 1),
]);

let cmp = build_compare(&array, &array).unwrap();

assert_eq!(Ordering::Less, cmp(0, 1));
assert_eq!(Ordering::Greater, cmp(1, 0));

// the underlying representation is months, so both quantities are the same
assert_eq!(Ordering::Equal, cmp(1, 2));
assert_eq!(Ordering::Equal, cmp(2, 1));
}

#[test]
fn test_interval_month_day_nano() {
let array = IntervalMonthDayNanoArray::from(vec![
// 100 days
IntervalMonthDayNanoType::make_value(0, 100, 0),
// 1 month
IntervalMonthDayNanoType::make_value(1, 0, 0),
// 100 day, 1 nanoseconds
IntervalMonthDayNanoType::make_value(0, 100, 2),
]);

let cmp = build_compare(&array, &array).unwrap();

assert_eq!(Ordering::Less, cmp(0, 1));
assert_eq!(Ordering::Greater, cmp(1, 0));

// somewhat confusingly, while 100 days is more than 1 month in all cases
// it will compare less as the comparison is done on the underlying
// values not field by field
assert_eq!(Ordering::Greater, cmp(1, 2));
assert_eq!(Ordering::Less, cmp(2, 1));
}

#[test]
fn test_decimal() {
let array = vec![Some(5_i128), Some(2_i128), Some(3_i128)]
Expand Down

0 comments on commit c5a9953

Please sign in to comment.