From 8de9de75d8ef0969ed9b74152c3c8f9c140da1ce Mon Sep 17 00:00:00 2001 From: Conrad Ludgate Date: Fri, 29 Dec 2023 12:33:37 +0000 Subject: [PATCH 1/4] fix uuid derive --- parquet_derive/src/parquet_field.rs | 75 ++++++++++++++++++++--------- parquet_derive_test/Cargo.toml | 1 + parquet_derive_test/src/lib.rs | 5 ++ 3 files changed, 57 insertions(+), 24 deletions(-) diff --git a/parquet_derive/src/parquet_field.rs b/parquet_derive/src/parquet_field.rs index 3ab85a8972f5..82f381fbdf24 100644 --- a/parquet_derive/src/parquet_field.rs +++ b/parquet_derive/src/parquet_field.rs @@ -316,25 +316,23 @@ impl Field { let logical_type = self.ty.logical_type(); let repetition = self.ty.repetition(); let converted_type = self.ty.converted_type(); + let length = self.ty.length(); + + let mut builder = quote! { + ParquetType::primitive_type_builder(#field_name, #physical_type) + .with_logical_type(#logical_type) + .with_repetition(#repetition) + }; if let Some(converted_type) = converted_type { - quote! { - fields.push(ParquetType::primitive_type_builder(#field_name, #physical_type) - .with_logical_type(#logical_type) - .with_repetition(#repetition) - .with_converted_type(#converted_type) - .build().unwrap().into() - ) - } - } else { - quote! { - fields.push(ParquetType::primitive_type_builder(#field_name, #physical_type) - .with_logical_type(#logical_type) - .with_repetition(#repetition) - .build().unwrap().into() - ) - } + builder = quote! { #builder.with_converted_type(#converted_type) }; + } + + if let Some(length) = length { + builder = quote! { #builder.with_length(#length) }; } + + quote! { fields.push(#builder.build().unwrap().into()) } } fn option_into_vals(&self) -> proc_macro2::TokenStream { @@ -394,7 +392,7 @@ impl Field { quote! { rec.#field_name.signed_duration_since(::chrono::NaiveDate::from_ymd(1970, 1, 1)).num_days() as i32 } } Some(ThirdPartyType::Uuid) => { - quote! { (&rec.#field_name.to_string()[..]).into() } + quote! { rec.#field_name.as_bytes().to_vec().into() } } _ => { if self.is_a_byte_buf { @@ -430,7 +428,7 @@ impl Field { } } Some(ThirdPartyType::Uuid) => { - quote! { ::uuid::Uuid::parse_str(vals[i].data().convert()).unwrap() } + quote! { ::uuid::Uuid::from_bytes(vals[i].data().try_into().unwrap()) } } _ => match &self.ty { Type::TypePath(_) => match self.ty.last_part().as_str() { @@ -638,11 +636,40 @@ impl Type { } "f32" => BasicType::FLOAT, "f64" => BasicType::DOUBLE, - "String" | "str" | "Uuid" => BasicType::BYTE_ARRAY, + "String" | "str" => BasicType::BYTE_ARRAY, + "Uuid" => BasicType::FIXED_LEN_BYTE_ARRAY, f => unimplemented!("{} currently is not supported", f), } } + fn length(&self) -> Option { + let last_part = self.last_part(); + let leaf_type = self.leaf_type_recursive(); + + match leaf_type { + Type::Array(ref first_type) => { + if let Type::TypePath(_) = **first_type { + if last_part == "u8" { + return Some(1); + } + } + } + Type::Vec(ref first_type) | Type::Slice(ref first_type) => { + if let Type::TypePath(_) = **first_type { + if last_part == "u8" { + return None; + } + } + } + _ => (), + } + + match last_part.trim() { + "Uuid" => Some(16), + _ => None, + } + } + fn logical_type(&self) -> proc_macro2::TokenStream { let last_part = self.last_part(); let leaf_type = self.leaf_type_recursive(); @@ -1328,8 +1355,8 @@ mod test { let when = Field::from(&fields[0]); assert_eq!(when.writer_snippet().to_string(),(quote!{ { - let vals : Vec<_> = records.iter().map(|rec| (&rec.unique_id.to_string()[..]).into() ).collect(); - if let ColumnWriter::ByteArrayColumnWriter(ref mut typed) = column_writer.untyped() { + let vals : Vec<_> = records.iter().map(|rec| rec.unique_id.as_bytes().to_vec().into() ).collect(); + if let ColumnWriter::FixedLenByteArrayColumnWriter(ref mut typed) = column_writer.untyped() { typed.write_batch(&vals[..], None, None) ?; } else { panic!("Schema and struct disagree on type for {}" , stringify!{ unique_id }) @@ -1349,7 +1376,7 @@ mod test { } }).collect(); - if let ColumnWriter::ByteArrayColumnWriter(ref mut typed) = column_writer.untyped() { + if let ColumnWriter::FixedLenByteArrayColumnWriter(ref mut typed) = column_writer.untyped() { typed.write_batch(&vals[..], Some(&definition_levels[..]), None) ?; } else { panic!("Schema and struct disagree on type for {}" , stringify!{ maybe_unique_id }) @@ -1371,13 +1398,13 @@ mod test { assert_eq!(when.reader_snippet().to_string(),(quote!{ { let mut vals = Vec::new(); - if let ColumnReader::ByteArrayColumnReader(mut typed) = column_reader { + if let ColumnReader::FixedLenByteArrayColumnReader(mut typed) = column_reader { typed.read_records(num_records, None, None, &mut vals)?; } else { panic!("Schema and struct disagree on type for {}", stringify!{ unique_id }); } for (i, r) in &mut records[..num_records].iter_mut().enumerate() { - r.unique_id = ::uuid::Uuid::parse_str(vals[i].data().convert()).unwrap(); + r.unique_id = ::uuid::Uuid::from_bytes(vals[i].data().try_into().unwrap()); } } }).to_string()); diff --git a/parquet_derive_test/Cargo.toml b/parquet_derive_test/Cargo.toml index a5d2e76d4503..a3a4b58fea95 100644 --- a/parquet_derive_test/Cargo.toml +++ b/parquet_derive_test/Cargo.toml @@ -32,3 +32,4 @@ rust-version = { workspace = true } parquet = { workspace = true } parquet_derive = { path = "../parquet_derive", default-features = false } chrono = { workspace = true } +uuid = { version = "1", features = ["v4"] } diff --git a/parquet_derive_test/src/lib.rs b/parquet_derive_test/src/lib.rs index 25734813a8d8..3743c6b55c7c 100644 --- a/parquet_derive_test/src/lib.rs +++ b/parquet_derive_test/src/lib.rs @@ -42,6 +42,7 @@ struct ACompleteRecord<'a> { pub borrowed_maybe_a_string: &'a Option, pub borrowed_maybe_a_str: &'a Option<&'a str>, pub now: chrono::NaiveDateTime, + pub uuid: uuid::Uuid, pub byte_vec: Vec, pub maybe_byte_vec: Option>, pub borrowed_byte_vec: &'a [u8], @@ -61,6 +62,7 @@ struct APartiallyCompleteRecord { pub double: f64, pub now: chrono::NaiveDateTime, pub date: chrono::NaiveDate, + pub uuid: uuid::Uuid, pub byte_vec: Vec, } @@ -105,6 +107,7 @@ mod tests { OPTIONAL BINARY borrowed_maybe_a_string (STRING); OPTIONAL BINARY borrowed_maybe_a_str (STRING); REQUIRED INT64 now (TIMESTAMP_MILLIS); + REQUIRED FIXED_LEN_BYTE_ARRAY (16) uuid (UUID); REQUIRED BINARY byte_vec; OPTIONAL BINARY maybe_byte_vec; REQUIRED BINARY borrowed_byte_vec; @@ -144,6 +147,7 @@ mod tests { borrowed_maybe_a_string: &maybe_a_string, borrowed_maybe_a_str: &maybe_a_str, now: chrono::Utc::now().naive_local(), + uuid: uuid::Uuid::new_v4(), byte_vec: vec![0x65, 0x66, 0x67], maybe_byte_vec: Some(vec![0x88, 0x89, 0x90]), borrowed_byte_vec: &borrowed_byte_vec, @@ -179,6 +183,7 @@ mod tests { double: std::f64::NAN, now: chrono::Utc::now().naive_local(), date: chrono::naive::NaiveDate::from_ymd_opt(2015, 3, 14).unwrap(), + uuid: uuid::Uuid::new_v4(), byte_vec: vec![0x65, 0x66, 0x67], }]; From 6249554cd722ec41cb30b00c4f8d9cc8b15e57cd Mon Sep 17 00:00:00 2001 From: Conrad Ludgate Date: Thu, 16 May 2024 12:11:21 +0100 Subject: [PATCH 2/4] fix byte array length handling --- parquet_derive/src/parquet_field.rs | 44 ++++++++++++----------------- 1 file changed, 18 insertions(+), 26 deletions(-) diff --git a/parquet_derive/src/parquet_field.rs b/parquet_derive/src/parquet_field.rs index 82f381fbdf24..fa758201a1dc 100644 --- a/parquet_derive/src/parquet_field.rs +++ b/parquet_derive/src/parquet_field.rs @@ -136,7 +136,7 @@ impl Field { Type::Option(_) => unimplemented!("Unsupported nesting encountered"), Type::Reference(_, ref second_type) | Type::Vec(ref second_type) - | Type::Array(ref second_type) + | Type::Array(ref second_type, _) | Type::Slice(ref second_type) => match **second_type { Type::TypePath(_) => Some(self.optional_definition_levels()), _ => unimplemented!("Unsupported nesting encountered"), @@ -144,11 +144,11 @@ impl Field { }, Type::Reference(_, ref first_type) | Type::Vec(ref first_type) - | Type::Array(ref first_type) + | Type::Array(ref first_type, _) | Type::Slice(ref first_type) => match **first_type { Type::TypePath(_) => None, Type::Vec(ref second_type) - | Type::Array(ref second_type) + | Type::Array(ref second_type, _) | Type::Slice(ref second_type) => match **second_type { Type::TypePath(_) => None, Type::Reference(_, ref third_type) => match **third_type { @@ -161,7 +161,7 @@ impl Field { match **second_type { Type::TypePath(_) => Some(self.optional_definition_levels()), Type::Vec(ref third_type) - | Type::Array(ref third_type) + | Type::Array(ref third_type, _) | Type::Slice(ref third_type) => match **third_type { Type::TypePath(_) => Some(self.optional_definition_levels()), Type::Reference(_, ref fourth_type) => match **fourth_type { @@ -467,7 +467,7 @@ impl Field { #[allow(clippy::large_enum_variant)] #[derive(Debug, PartialEq)] enum Type { - Array(Box), + Array(Box, syn::Expr), Option(Box), Slice(Box), Vec(Box), @@ -540,7 +540,7 @@ impl Type { Type::TypePath(_) => parent_ty.unwrap_or(ty), Type::Option(ref first_type) | Type::Vec(ref first_type) - | Type::Array(ref first_type) + | Type::Array(ref first_type, _) | Type::Slice(ref first_type) | Type::Reference(_, ref first_type) => { Type::leaf_type_recursive_helper(first_type, Some(ty)) @@ -558,7 +558,7 @@ impl Type { Type::TypePath(ref type_) => type_, Type::Option(ref first_type) | Type::Vec(ref first_type) - | Type::Array(ref first_type) + | Type::Array(ref first_type, _) | Type::Slice(ref first_type) | Type::Reference(_, ref first_type) => match **first_type { Type::TypePath(ref type_) => type_, @@ -605,7 +605,7 @@ impl Type { let leaf_type = self.leaf_type_recursive(); match leaf_type { - Type::Array(ref first_type) => { + Type::Array(ref first_type, _length) => { if let Type::TypePath(_) = **first_type { if last_part == "u8" { return BasicType::FIXED_LEN_BYTE_ARRAY; @@ -642,30 +642,22 @@ impl Type { } } - fn length(&self) -> Option { + fn length(&self) -> Option { let last_part = self.last_part(); let leaf_type = self.leaf_type_recursive(); - match leaf_type { - Type::Array(ref first_type) => { - if let Type::TypePath(_) = **first_type { - if last_part == "u8" { - return Some(1); - } + // `[u8; N]` => Some(N) + if let Type::Array(ref first_type, length) = leaf_type { + if let Type::TypePath(_) = **first_type { + if last_part == "u8" { + return Some(length.clone()); } } - Type::Vec(ref first_type) | Type::Slice(ref first_type) => { - if let Type::TypePath(_) = **first_type { - if last_part == "u8" { - return None; - } - } - } - _ => (), } match last_part.trim() { - "Uuid" => Some(16), + // Uuid => [u8; 16] => Some(16) + "Uuid" => Some(syn::parse_quote!(16)), _ => None, } } @@ -675,7 +667,7 @@ impl Type { let leaf_type = self.leaf_type_recursive(); match leaf_type { - Type::Array(ref first_type) => { + Type::Array(ref first_type, _length) => { if let Type::TypePath(_) = **first_type { if last_part == "u8" { return quote! { None }; @@ -816,7 +808,7 @@ impl Type { fn from_type_array(f: &syn::Field, ta: &syn::TypeArray) -> Self { let inner_type = Type::from_type(f, ta.elem.as_ref()); - Type::Array(Box::new(inner_type)) + Type::Array(Box::new(inner_type), ta.len.clone()) } fn from_type_slice(f: &syn::Field, ts: &syn::TypeSlice) -> Self { From 00b82662a638d85faf297ac56b2ed8b35b2d1dae Mon Sep 17 00:00:00 2001 From: Conrad Ludgate Date: Thu, 16 May 2024 14:51:30 +0100 Subject: [PATCH 3/4] test lengths --- parquet_derive/src/parquet_field.rs | 41 ++++++++++++++++++++++++++++- 1 file changed, 40 insertions(+), 1 deletion(-) diff --git a/parquet_derive/src/parquet_field.rs b/parquet_derive/src/parquet_field.rs index fa758201a1dc..05950e69027f 100644 --- a/parquet_derive/src/parquet_field.rs +++ b/parquet_derive/src/parquet_field.rs @@ -1110,6 +1110,7 @@ mod test { a_fix_byte_buf: [u8; 10], a_complex_option: ::std::option::Option<&Vec>, a_complex_vec: &::std::vec::Vec<&Option>, + a_uuid: ::uuid::Uuid, } }; @@ -1129,7 +1130,45 @@ mod test { BasicType::BYTE_ARRAY, BasicType::FIXED_LEN_BYTE_ARRAY, BasicType::BYTE_ARRAY, - BasicType::INT32 + BasicType::INT32, + BasicType::FIXED_LEN_BYTE_ARRAY, + ] + ) + } + + #[test] + fn test_type_length() { + let snippet: proc_macro2::TokenStream = quote! { + struct LotsOfInnerTypes { + a_buf: ::std::vec::Vec, + a_number: i32, + a_verbose_option: ::std::option::Option, + a_silly_string: String, + a_fix_byte_buf: [u8; 10], + a_complex_option: ::std::option::Option<&Vec>, + a_complex_vec: &::std::vec::Vec<&Option>, + a_uuid: ::uuid::Uuid, + } + }; + + let fields = extract_fields(snippet); + let converted_fields: Vec<_> = fields.iter().map(Type::from).collect(); + let lengths: Vec<_> = converted_fields + .iter() + .map(|ty| ty.length()) + .collect(); + + assert_eq!( + lengths, + vec![ + None, + None, + None, + None, + Some(syn::parse_quote!(10)), + None, + None, + Some(syn::parse_quote!(16)), ] ) } From 68168810de297e5f19b1beee129489fb9c1188e7 Mon Sep 17 00:00:00 2001 From: Conrad Ludgate Date: Sat, 18 May 2024 10:28:59 +0100 Subject: [PATCH 4/4] fmt --- parquet_derive/src/parquet_field.rs | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/parquet_derive/src/parquet_field.rs b/parquet_derive/src/parquet_field.rs index 05950e69027f..9fff76c42d1d 100644 --- a/parquet_derive/src/parquet_field.rs +++ b/parquet_derive/src/parquet_field.rs @@ -1153,10 +1153,7 @@ mod test { let fields = extract_fields(snippet); let converted_fields: Vec<_> = fields.iter().map(Type::from).collect(); - let lengths: Vec<_> = converted_fields - .iter() - .map(|ty| ty.length()) - .collect(); + let lengths: Vec<_> = converted_fields.iter().map(|ty| ty.length()).collect(); assert_eq!( lengths,