-
Notifications
You must be signed in to change notification settings - Fork 1.3k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Add ColumnStatistics::Sum
#14074
base: main
Are you sure you want to change the base?
Add ColumnStatistics::Sum
#14074
Changes from 6 commits
ee13b64
d1d0996
04b7cea
2449d2b
fbf3188
42ac6aa
87c6a5c
8431b91
579e046
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -21,7 +21,7 @@ use std::fmt::{self, Debug, Display}; | |
|
||
use crate::{Result, ScalarValue}; | ||
|
||
use arrow_schema::{Schema, SchemaRef}; | ||
use arrow_schema::{DataType, Schema, SchemaRef}; | ||
|
||
/// Represents a value with a degree of certainty. `Precision` is used to | ||
/// propagate information the precision of statistical values. | ||
|
@@ -170,24 +170,63 @@ impl Precision<ScalarValue> { | |
pub fn add(&self, other: &Precision<ScalarValue>) -> Precision<ScalarValue> { | ||
match (self, other) { | ||
(Precision::Exact(a), Precision::Exact(b)) => { | ||
if let Ok(result) = a.add(b) { | ||
Precision::Exact(result) | ||
} else { | ||
Precision::Absent | ||
} | ||
a.add(b).map(Precision::Exact).unwrap_or(Precision::Absent) | ||
} | ||
(Precision::Inexact(a), Precision::Exact(b)) | ||
| (Precision::Exact(a), Precision::Inexact(b)) | ||
| (Precision::Inexact(a), Precision::Inexact(b)) => { | ||
if let Ok(result) = a.add(b) { | ||
Precision::Inexact(result) | ||
} else { | ||
Precision::Absent | ||
} | ||
| (Precision::Inexact(a), Precision::Inexact(b)) => a | ||
.add(b) | ||
.map(Precision::Inexact) | ||
.unwrap_or(Precision::Absent), | ||
(_, _) => Precision::Absent, | ||
} | ||
} | ||
|
||
/// Calculates the difference of two (possibly inexact) [`ScalarValue`] values, | ||
/// conservatively propagating exactness information. If one of the input | ||
/// values is [`Precision::Absent`], the result is `Absent` too. | ||
pub fn sub(&self, other: &Precision<ScalarValue>) -> Precision<ScalarValue> { | ||
match (self, other) { | ||
(Precision::Exact(a), Precision::Exact(b)) => { | ||
a.add(b).map(Precision::Exact).unwrap_or(Precision::Absent) | ||
} | ||
(Precision::Inexact(a), Precision::Exact(b)) | ||
| (Precision::Exact(a), Precision::Inexact(b)) | ||
| (Precision::Inexact(a), Precision::Inexact(b)) => a | ||
.add(b) | ||
.map(Precision::Inexact) | ||
.unwrap_or(Precision::Absent), | ||
(_, _) => Precision::Absent, | ||
} | ||
} | ||
|
||
/// Calculates the multiplication of two (possibly inexact) [`ScalarValue`] values, | ||
/// conservatively propagating exactness information. If one of the input | ||
/// values is [`Precision::Absent`], the result is `Absent` too. | ||
pub fn multiply(&self, other: &Precision<ScalarValue>) -> Precision<ScalarValue> { | ||
match (self, other) { | ||
(Precision::Exact(a), Precision::Exact(b)) => a | ||
.mul_checked(b) | ||
.map(Precision::Exact) | ||
.unwrap_or(Precision::Absent), | ||
(Precision::Inexact(a), Precision::Exact(b)) | ||
| (Precision::Exact(a), Precision::Inexact(b)) | ||
| (Precision::Inexact(a), Precision::Inexact(b)) => a | ||
.mul_checked(b) | ||
.map(Precision::Inexact) | ||
.unwrap_or(Precision::Absent), | ||
(_, _) => Precision::Absent, | ||
} | ||
} | ||
|
||
/// Casts the value to the given data type, propagating exactness information. | ||
pub fn cast_to(&self, data_type: &DataType) -> Result<Precision<ScalarValue>> { | ||
match self { | ||
Precision::Exact(value) => value.cast_to(data_type).map(Precision::Exact), | ||
Precision::Inexact(value) => value.cast_to(data_type).map(Precision::Inexact), | ||
Precision::Absent => Ok(Precision::Absent), | ||
} | ||
} | ||
gatesn marked this conversation as resolved.
Show resolved
Hide resolved
|
||
} | ||
|
||
impl<T: Debug + Clone + PartialEq + Eq + PartialOrd> Debug for Precision<T> { | ||
|
@@ -210,6 +249,18 @@ impl<T: Debug + Clone + PartialEq + Eq + PartialOrd> Display for Precision<T> { | |
} | ||
} | ||
|
||
impl From<Precision<usize>> for Precision<ScalarValue> { | ||
fn from(value: Precision<usize>) -> Self { | ||
match value { | ||
Precision::Exact(v) => Precision::Exact(ScalarValue::UInt64(Some(v as u64))), | ||
Precision::Inexact(v) => { | ||
Precision::Inexact(ScalarValue::UInt64(Some(v as u64))) | ||
} | ||
Precision::Absent => Precision::Absent, | ||
} | ||
} | ||
} | ||
|
||
/// Statistics for a relation | ||
/// Fields are optional and can be inexact because the sources | ||
/// sometimes provide approximate estimates for performance reasons | ||
|
@@ -401,6 +452,11 @@ impl Display for Statistics { | |
} else { | ||
s | ||
}; | ||
let s = if cs.sum_value != Precision::Absent { | ||
format!("{} Sum={}", s, cs.sum_value) | ||
} else { | ||
s | ||
}; | ||
let s = if cs.null_count != Precision::Absent { | ||
format!("{} Null={}", s, cs.null_count) | ||
} else { | ||
|
@@ -436,6 +492,8 @@ pub struct ColumnStatistics { | |
pub max_value: Precision<ScalarValue>, | ||
/// Minimum value of column | ||
pub min_value: Precision<ScalarValue>, | ||
/// Sum value of a column | ||
pub sum_value: Precision<ScalarValue>, | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. As I think we mentioned in #13736 my only real concern with this addition is that it will make However, I think the "right" fix for that is to move to using a different statistics representation (e.g. |
||
/// Number of distinct values | ||
pub distinct_count: Precision<usize>, | ||
} | ||
|
@@ -458,6 +516,7 @@ impl ColumnStatistics { | |
null_count: Precision::Absent, | ||
max_value: Precision::Absent, | ||
min_value: Precision::Absent, | ||
sum_value: Precision::Absent, | ||
distinct_count: Precision::Absent, | ||
} | ||
} | ||
|
@@ -469,6 +528,7 @@ impl ColumnStatistics { | |
self.null_count = self.null_count.to_inexact(); | ||
self.max_value = self.max_value.to_inexact(); | ||
self.min_value = self.min_value.to_inexact(); | ||
self.sum_value = self.sum_value.to_inexact(); | ||
self.distinct_count = self.distinct_count.to_inexact(); | ||
self | ||
} | ||
|
@@ -646,6 +706,7 @@ mod tests { | |
null_count: Precision::Exact(null_count), | ||
max_value: Precision::Exact(ScalarValue::Int64(Some(42))), | ||
min_value: Precision::Exact(ScalarValue::Int64(Some(64))), | ||
sum_value: Precision::Exact(ScalarValue::Int64(Some(4600))), | ||
distinct_count: Precision::Exact(100), | ||
} | ||
} | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
@alamb one question I have is whether this should return a Result, or we should assume that a failed cast implies overflow and therefore return
Precision::Absent
?The caller (currently in cross-join) unwraps to
Absent
, I just didn't know whether to internalize that here.Edit: I decided it was better to propagate the error and allow the caller to decide. It was more useful in a couple of places.