diff --git a/arrow-array/Cargo.toml b/arrow-array/Cargo.toml index b00d2c88e1a7..a8dbeded9ce7 100644 --- a/arrow-array/Cargo.toml +++ b/arrow-array/Cargo.toml @@ -62,3 +62,7 @@ criterion = { version = "0.5", default-features = false } [[bench]] name = "occupancy" harness = false + +[[bench]] +name = "gc_view_types" +harness = false diff --git a/arrow-array/benches/gc_view_types.rs b/arrow-array/benches/gc_view_types.rs new file mode 100644 index 000000000000..4b74a8f60b06 --- /dev/null +++ b/arrow-array/benches/gc_view_types.rs @@ -0,0 +1,48 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use arrow_array::StringViewArray; +use criterion::*; + +fn gen_view_array(size: usize) -> StringViewArray { + StringViewArray::from_iter((0..size).map(|v| match v % 3 { + 0 => Some("small"), + 1 => Some("larger than 12 bytes array"), + 2 => None, + _ => unreachable!("unreachable"), + })) +} + +fn criterion_benchmark(c: &mut Criterion) { + let array = gen_view_array(100_000); + + c.bench_function("gc view types all", |b| { + b.iter(|| { + black_box(array.gc()); + }); + }); + + let sliced = array.slice(0, 100_000 / 2); + c.bench_function("gc view types slice half", |b| { + b.iter(|| { + black_box(sliced.gc()); + }); + }); +} + +criterion_group!(benches, criterion_benchmark); +criterion_main!(benches); diff --git a/arrow-array/src/array/byte_view_array.rs b/arrow-array/src/array/byte_view_array.rs index 79f2d47587a6..88033b83cbec 100644 --- a/arrow-array/src/array/byte_view_array.rs +++ b/arrow-array/src/array/byte_view_array.rs @@ -265,6 +265,56 @@ impl GenericByteViewArray { phantom: Default::default(), } } + + /// Returns a "compacted" version of this array + /// + /// The original array will *not* be modified + /// + /// # Garbage Collection + /// + /// Before GC: + /// ```text + /// ┌──────┐ + /// │......│ + /// │......│ + /// ┌────────────────────┐ ┌ ─ ─ ─ ▶ │Data1 │ Large buffer + /// │ View 1 │─ ─ ─ ─ │......│ with data that + /// ├────────────────────┤ │......│ is not referred + /// │ View 2 │─ ─ ─ ─ ─ ─ ─ ─▶ │Data2 │ to by View 1 or + /// └────────────────────┘ │......│ View 2 + /// │......│ + /// 2 views, refer to │......│ + /// small portions of a └──────┘ + /// large buffer + /// ``` + /// + /// After GC: + /// + /// ```text + /// ┌────────────────────┐ ┌─────┐ After gc, only + /// │ View 1 │─ ─ ─ ─ ─ ─ ─ ─▶ │Data1│ data that is + /// ├────────────────────┤ ┌ ─ ─ ─ ▶ │Data2│ pointed to by + /// │ View 2 │─ ─ ─ ─ └─────┘ the views is + /// └────────────────────┘ left + /// + /// + /// 2 views + /// ``` + /// This method will compact the data buffers by recreating the view array and only include the data + /// that is pointed to by the views. + /// + /// Note that it will copy the array regardless of whether the original array is compact. + /// Use with caution as this can be an expensive operation, only use it when you are sure that the view + /// array is significantly smaller than when it is originally created, e.g., after filtering or slicing. + pub fn gc(&self) -> Self { + let mut builder = GenericByteViewBuilder::::with_capacity(self.len()); + + for v in self.iter() { + builder.append_option(v); + } + + builder.finish() + } } impl Debug for GenericByteViewArray { @@ -645,4 +695,39 @@ mod tests { StringViewArray::new(views, buffers, None); } + + #[test] + fn test_gc() { + let test_data = [ + Some("longer than 12 bytes"), + Some("short"), + Some("t"), + Some("longer than 12 bytes"), + None, + Some("short"), + ]; + + let array = { + let mut builder = StringViewBuilder::new().with_block_size(8); // create multiple buffers + test_data.into_iter().for_each(|v| builder.append_option(v)); + builder.finish() + }; + assert!(array.buffers.len() > 1); + + fn check_gc(to_test: &StringViewArray) { + let gc = to_test.gc(); + assert_ne!(to_test.data_buffers().len(), gc.data_buffers().len()); + + to_test.iter().zip(gc.iter()).for_each(|(a, b)| { + assert_eq!(a, b); + }); + assert_eq!(to_test.len(), gc.len()); + } + + check_gc(&array); + check_gc(&array.slice(1, 3)); + check_gc(&array.slice(2, 1)); + check_gc(&array.slice(2, 2)); + check_gc(&array.slice(3, 1)); + } }