From 789e80ee5d0aeda3114eb26df8906eed8c0f1570 Mon Sep 17 00:00:00 2001 From: Matthew Gapp <61894094+matthewgapp@users.noreply.github.com> Date: Sun, 10 Dec 2023 14:47:45 -0800 Subject: [PATCH 1/4] initial stab at supporting decimal types by formatting then parsing --- arrow-json/Cargo.toml | 26 ++++++++++++++++++-------- arrow-json/src/writer.rs | 27 +++++++++++++++++++++++++-- 2 files changed, 43 insertions(+), 10 deletions(-) diff --git a/arrow-json/Cargo.toml b/arrow-json/Cargo.toml index dd232f197ead..1609c807c61a 100644 --- a/arrow-json/Cargo.toml +++ b/arrow-json/Cargo.toml @@ -34,11 +34,17 @@ path = "src/lib.rs" bench = false [dependencies] -arrow-array = { workspace = true } -arrow-buffer = { workspace = true } -arrow-cast = { workspace = true } -arrow-data = { workspace = true } -arrow-schema = { workspace = true } +arrow-array = { version = "49" } +# arrow-buffer = { workspace = true } +# arrow-cast = { workspace = true } +# arrow-data = { workspace = true } +# arrow-schema = { workspace = true } + +arrow-buffer = { version = "49" } +arrow-cast = { version = "49" } +arrow-data = { version = "49" } +arrow-schema = { version = "49" } + half = { version = "2.1", default-features = false } indexmap = { version = "2.0", default-features = false, features = ["std"] } num = { version = "0.4", default-features = false, features = ["std"] } @@ -49,15 +55,19 @@ lexical-core = { version = "0.8", default-features = false } [dev-dependencies] tempfile = "3.3" -flate2 = { version = "1", default-features = false, features = ["rust_backend"] } +flate2 = { version = "1", default-features = false, features = [ + "rust_backend", +] } serde = { version = "1.0", default-features = false, features = ["derive"] } futures = "0.3" tokio = { version = "1.27", default-features = false, features = ["io-util"] } bytes = "1.4" criterion = { version = "0.5", default-features = false } -rand = { version = "0.8", default-features = false, features = ["std", "std_rng"] } +rand = { version 
= "0.8", default-features = false, features = [ + "std", + "std_rng", +] } [[bench]] name = "serde" harness = false - diff --git a/arrow-json/src/writer.rs b/arrow-json/src/writer.rs index cabda5e2dca8..890760de1c06 100644 --- a/arrow-json/src/writer.rs +++ b/arrow-json/src/writer.rs @@ -469,11 +469,34 @@ fn set_column_for_json_rows( row.insert(col_name.to_string(), serde_json::Value::Object(obj)); } } + DataType::Decimal128(_precision, _scale) => { + let options = FormatOptions::default(); + let formatter = ArrayFormatter::try_new(array.as_ref(), &options)?; + let nulls = array.nulls(); + rows.iter_mut() + .enumerate() + .filter_map(|(idx, maybe_row)| maybe_row.as_mut().map(|row| (idx, row))) + .for_each(|(idx, row)| { + let maybe_value = nulls.map(|x| x.is_valid(idx)).unwrap_or(true).then(|| { + Value::Number( + serde_json::Number::from_f64( + formatter.value(idx).to_string().parse::().unwrap(), + ) + .unwrap(), + ) + }); + if let Some(j) = maybe_value { + row.insert(col_name.to_string(), j); + } else if explicit_nulls { + row.insert(col_name.to_string(), Value::Null); + } + }); + } _ => { return Err(ArrowError::JsonError(format!( - "data type {:?} not supported in nested map for json writer", + "data type {:?} not supported for json writer", array.data_type() - ))) + ))); } } Ok(()) From d08d373e3cda5218e4ee2b9e592d11b11df34044 Mon Sep 17 00:00:00 2001 From: Matthew Gapp <61894094+matthewgapp@users.noreply.github.com> Date: Sun, 10 Dec 2023 15:05:07 -0800 Subject: [PATCH 2/4] support 256 and clean up logic --- arrow-json/src/writer.rs | 78 ++++++++++++++++++++++++++++------------ 1 file changed, 56 insertions(+), 22 deletions(-) diff --git a/arrow-json/src/writer.rs b/arrow-json/src/writer.rs index 890760de1c06..a637c193d28a 100644 --- a/arrow-json/src/writer.rs +++ b/arrow-json/src/writer.rs @@ -469,28 +469,8 @@ fn set_column_for_json_rows( row.insert(col_name.to_string(), serde_json::Value::Object(obj)); } } - DataType::Decimal128(_precision, _scale) => { - 
let options = FormatOptions::default(); - let formatter = ArrayFormatter::try_new(array.as_ref(), &options)?; - let nulls = array.nulls(); - rows.iter_mut() - .enumerate() - .filter_map(|(idx, maybe_row)| maybe_row.as_mut().map(|row| (idx, row))) - .for_each(|(idx, row)| { - let maybe_value = nulls.map(|x| x.is_valid(idx)).unwrap_or(true).then(|| { - Value::Number( - serde_json::Number::from_f64( - formatter.value(idx).to_string().parse::().unwrap(), - ) - .unwrap(), - ) - }); - if let Some(j) = maybe_value { - row.insert(col_name.to_string(), j); - } else if explicit_nulls { - row.insert(col_name.to_string(), Value::Null); - } - }); + DataType::Decimal128(_precision, _scale) | DataType::Decimal256(_precision, _scale) => { + to_json_number_via_f64(rows, array, col_name, explicit_nulls)?; } _ => { return Err(ArrowError::JsonError(format!( @@ -502,6 +482,60 @@ fn set_column_for_json_rows( Ok(()) } +fn to_json_number_via_f64( + rows: &mut [Option>], + array: &ArrayRef, + col_name: &str, + explicit_nulls: bool, +) -> Result<(), ArrowError> { + let options = FormatOptions::default(); + let formatter = ArrayFormatter::try_new(array.as_ref(), &options)?; + let nulls = array.nulls(); + let rows = rows + .iter_mut() + .enumerate() + .filter_map(|(idx, maybe_row)| maybe_row.as_mut().map(|row| (idx, row))); + + for (idx, row) in rows { + let maybe_value = nulls + .map(|x| x.is_valid(idx)) + .unwrap_or(true) + .then(|| { + let num = formatter + .value(idx) + .to_string() + .parse::() + .map_err(|e| { + ArrowError::ParseError(format!( + "Cannot convert {} to f64: {}", + formatter.value(idx), + e + )) + }); + + num.and_then(|num| { + serde_json::Number::from_f64(num) + .ok_or_else(|| { + ArrowError::CastError(format!( + "Cannot convert {} to f64", + formatter.value(idx) + )) + }) + .map(Value::Number) + }) + }) + // pivot the Option to Result