From 8de9de75d8ef0969ed9b74152c3c8f9c140da1ce Mon Sep 17 00:00:00 2001 From: Conrad Ludgate Date: Fri, 29 Dec 2023 12:33:37 +0000 Subject: [PATCH] fix uuid derive --- parquet_derive/src/parquet_field.rs | 75 ++++++++++++++++++++--------- parquet_derive_test/Cargo.toml | 1 + parquet_derive_test/src/lib.rs | 5 ++ 3 files changed, 57 insertions(+), 24 deletions(-) diff --git a/parquet_derive/src/parquet_field.rs b/parquet_derive/src/parquet_field.rs index 3ab85a8972f5..82f381fbdf24 100644 --- a/parquet_derive/src/parquet_field.rs +++ b/parquet_derive/src/parquet_field.rs @@ -316,25 +316,23 @@ impl Field { let logical_type = self.ty.logical_type(); let repetition = self.ty.repetition(); let converted_type = self.ty.converted_type(); + let length = self.ty.length(); + + let mut builder = quote! { + ParquetType::primitive_type_builder(#field_name, #physical_type) + .with_logical_type(#logical_type) + .with_repetition(#repetition) + }; if let Some(converted_type) = converted_type { - quote! { - fields.push(ParquetType::primitive_type_builder(#field_name, #physical_type) - .with_logical_type(#logical_type) - .with_repetition(#repetition) - .with_converted_type(#converted_type) - .build().unwrap().into() - ) - } - } else { - quote! { - fields.push(ParquetType::primitive_type_builder(#field_name, #physical_type) - .with_logical_type(#logical_type) - .with_repetition(#repetition) - .build().unwrap().into() - ) - } + builder = quote! { #builder.with_converted_type(#converted_type) }; + } + + if let Some(length) = length { + builder = quote! { #builder.with_length(#length) }; } + + quote! { fields.push(#builder.build().unwrap().into()) } } fn option_into_vals(&self) -> proc_macro2::TokenStream { @@ -394,7 +392,7 @@ impl Field { quote! { rec.#field_name.signed_duration_since(::chrono::NaiveDate::from_ymd(1970, 1, 1)).num_days() as i32 } } Some(ThirdPartyType::Uuid) => { - quote! { (&rec.#field_name.to_string()[..]).into() } + quote! { rec.#field_name.as_bytes().to_vec().into() } } _ => { if self.is_a_byte_buf { @@ -430,7 +428,7 @@ impl Field { } } Some(ThirdPartyType::Uuid) => { - quote! { ::uuid::Uuid::parse_str(vals[i].data().convert()).unwrap() } + quote! { ::uuid::Uuid::from_bytes(vals[i].data().try_into().unwrap()) } } _ => match &self.ty { Type::TypePath(_) => match self.ty.last_part().as_str() { @@ -638,11 +636,40 @@ impl Type { } "f32" => BasicType::FLOAT, "f64" => BasicType::DOUBLE, - "String" | "str" | "Uuid" => BasicType::BYTE_ARRAY, + "String" | "str" => BasicType::BYTE_ARRAY, + "Uuid" => BasicType::FIXED_LEN_BYTE_ARRAY, f => unimplemented!("{} currently is not supported", f), } } + fn length(&self) -> Option { + let last_part = self.last_part(); + let leaf_type = self.leaf_type_recursive(); + + match leaf_type { + Type::Array(ref first_type) => { + if let Type::TypePath(_) = **first_type { + if last_part == "u8" { + return Some(1); + } + } + } + Type::Vec(ref first_type) | Type::Slice(ref first_type) => { + if let Type::TypePath(_) = **first_type { + if last_part == "u8" { + return None; + } + } + } + _ => (), + } + + match last_part.trim() { + "Uuid" => Some(16), + _ => None, + } + } + fn logical_type(&self) -> proc_macro2::TokenStream { let last_part = self.last_part(); let leaf_type = self.leaf_type_recursive(); @@ -1328,8 +1355,8 @@ mod test { let when = Field::from(&fields[0]); assert_eq!(when.writer_snippet().to_string(),(quote!{ { - let vals : Vec<_> = records.iter().map(|rec| (&rec.unique_id.to_string()[..]).into() ).collect(); - if let ColumnWriter::ByteArrayColumnWriter(ref mut typed) = column_writer.untyped() { + let vals : Vec<_> = records.iter().map(|rec| rec.unique_id.as_bytes().to_vec().into() ).collect(); + if let ColumnWriter::FixedLenByteArrayColumnWriter(ref mut typed) = column_writer.untyped() { typed.write_batch(&vals[..], None, None) ?; } else { panic!("Schema and struct disagree on type for {}" , stringify!{ unique_id }) @@ -1349,7 +1376,7 @@ mod test { } }).collect(); - if let ColumnWriter::ByteArrayColumnWriter(ref mut typed) = column_writer.untyped() { + if let ColumnWriter::FixedLenByteArrayColumnWriter(ref mut typed) = column_writer.untyped() { typed.write_batch(&vals[..], Some(&definition_levels[..]), None) ?; } else { panic!("Schema and struct disagree on type for {}" , stringify!{ maybe_unique_id }) @@ -1371,13 +1398,13 @@ mod test { assert_eq!(when.reader_snippet().to_string(),(quote!{ { let mut vals = Vec::new(); - if let ColumnReader::ByteArrayColumnReader(mut typed) = column_reader { + if let ColumnReader::FixedLenByteArrayColumnReader(mut typed) = column_reader { typed.read_records(num_records, None, None, &mut vals)?; } else { panic!("Schema and struct disagree on type for {}", stringify!{ unique_id }); } for (i, r) in &mut records[..num_records].iter_mut().enumerate() { - r.unique_id = ::uuid::Uuid::parse_str(vals[i].data().convert()).unwrap(); + r.unique_id = ::uuid::Uuid::from_bytes(vals[i].data().try_into().unwrap()); } } }).to_string()); diff --git a/parquet_derive_test/Cargo.toml b/parquet_derive_test/Cargo.toml index a5d2e76d4503..a3a4b58fea95 100644 --- a/parquet_derive_test/Cargo.toml +++ b/parquet_derive_test/Cargo.toml @@ -32,3 +32,4 @@ rust-version = { workspace = true } parquet = { workspace = true } parquet_derive = { path = "../parquet_derive", default-features = false } chrono = { workspace = true } +uuid = { version = "1", features = ["v4"] } diff --git a/parquet_derive_test/src/lib.rs b/parquet_derive_test/src/lib.rs index 25734813a8d8..3743c6b55c7c 100644 --- a/parquet_derive_test/src/lib.rs +++ b/parquet_derive_test/src/lib.rs @@ -42,6 +42,7 @@ struct ACompleteRecord<'a> { pub borrowed_maybe_a_string: &'a Option, pub borrowed_maybe_a_str: &'a Option<&'a str>, pub now: chrono::NaiveDateTime, + pub uuid: uuid::Uuid, pub byte_vec: Vec, pub maybe_byte_vec: Option>, pub borrowed_byte_vec: &'a [u8], @@ -61,6 +62,7 @@ struct APartiallyCompleteRecord { pub double: f64, pub now: chrono::NaiveDateTime, pub date: chrono::NaiveDate, + pub uuid: uuid::Uuid, pub byte_vec: Vec, } @@ -105,6 +107,7 @@ mod tests { OPTIONAL BINARY borrowed_maybe_a_string (STRING); OPTIONAL BINARY borrowed_maybe_a_str (STRING); REQUIRED INT64 now (TIMESTAMP_MILLIS); + REQUIRED FIXED_LEN_BYTE_ARRAY (16) uuid (UUID); REQUIRED BINARY byte_vec; OPTIONAL BINARY maybe_byte_vec; REQUIRED BINARY borrowed_byte_vec; @@ -144,6 +147,7 @@ mod tests { borrowed_maybe_a_string: &maybe_a_string, borrowed_maybe_a_str: &maybe_a_str, now: chrono::Utc::now().naive_local(), + uuid: uuid::Uuid::new_v4(), byte_vec: vec![0x65, 0x66, 0x67], maybe_byte_vec: Some(vec![0x88, 0x89, 0x90]), borrowed_byte_vec: &borrowed_byte_vec, @@ -179,6 +183,7 @@ mod tests { double: std::f64::NAN, now: chrono::Utc::now().naive_local(), date: chrono::naive::NaiveDate::from_ymd_opt(2015, 3, 14).unwrap(), + uuid: uuid::Uuid::new_v4(), byte_vec: vec![0x65, 0x66, 0x67], }];