From 0dcbd6cdfab6ef883ef736829a64bafe2d2dd723 Mon Sep 17 00:00:00 2001
From: Shivam Malhotra
Date: Thu, 12 Oct 2023 17:09:00 -0500
Subject: [PATCH] Refactored parquet writing code (#4541)

Read vector/array columns without ungrouping
---
 .../io/deephaven/vector/VectorFactory.java | 57 ++
 .../base/PlainBinaryChunkedWriter.java | 8 +-
 .../base/PlainBooleanChunkedWriter.java | 6 +-
 .../base/PlainDoubleChunkedWriter.java | 4 +-
 .../parquet/base/PlainFloatChunkedWriter.java | 5 +-
 .../parquet/base/PlainIntChunkedWriter.java | 4 +-
 .../parquet/base/PlainLongChunkedWriter.java | 5 +-
 .../parquet/base/RleIntChunkedWriter.java | 5 +-
 .../DictionarySizeExceededException.java | 2 +-
 .../parquet/table/ParquetInstructions.java | 4 +-
 .../parquet/table/ParquetTableWriter.java | 563 ++++--------
 .../deephaven/parquet/table/ParquetTools.java | 4 +-
 .../VariablePageSizeColumnChunkPageStore.java | 2 +-
 .../transfer/ArrayAndVectorTransfer.java | 66 ++
 .../table/transfer/BooleanArrayTransfer.java | 42 ++
 .../table/transfer/BooleanTransfer.java | 23 +-
 .../table/transfer/BooleanVectorTransfer.java | 39 ++
 .../table/transfer/ByteArrayTransfer.java | 41 ++
 .../parquet/table/transfer/ByteTransfer.java | 11 +-
 .../table/transfer/ByteVectorTransfer.java | 38 ++
 .../table/transfer/CharArrayTransfer.java | 36 ++
 .../parquet/table/transfer/CharTransfer.java | 11 +-
 .../table/transfer/CharVectorTransfer.java | 33 +
 .../table/transfer/CodecArrayTransfer.java | 31 +
 .../parquet/table/transfer/CodecTransfer.java | 19 +-
 .../table/transfer/CodecVectorTransfer.java | 31 +
 ...ctEncodedStringArrayAndVectorTransfer.java | 82 +++
 .../DictEncodedStringArrayTransfer.java | 24 +
 .../transfer/DictEncodedStringTransfer.java | 52 ++
 .../DictEncodedStringVectorTransfer.java | 27 +
 .../table/transfer/DoubleArrayTransfer.java | 38 ++
 .../table/transfer/DoubleTransfer.java | 19 +-
 .../table/transfer/DoubleVectorTransfer.java | 37 ++
 .../table/transfer/EncodedTransfer.java | 143 -----
 .../table/transfer/FloatArrayTransfer.java | 38 ++
 .../parquet/table/transfer/FloatTransfer.java | 19 +-
 .../table/transfer/FloatVectorTransfer.java | 37 ++
 .../table/transfer/InstantArrayTransfer.java | 38 ++
 .../table/transfer/InstantVectorTransfer.java | 35 ++
 .../table/transfer/IntArrayTransfer.java | 33 +
 .../IntCastablePrimitiveTransfer.java | 41 +-
 .../parquet/table/transfer/IntTransfer.java | 19 +-
 .../table/transfer/IntVectorTransfer.java | 32 +
 .../table/transfer/LongArrayTransfer.java | 38 ++
 .../parquet/table/transfer/LongTransfer.java | 19 +-
 .../table/transfer/LongVectorTransfer.java | 37 ++
 .../ObjectArrayAndVectorTransfer.java | 126 ++++
 .../table/transfer/ObjectArrayTransfer.java | 29 +
 .../table/transfer/ObjectTransfer.java | 80 +++
 .../table/transfer/ObjectVectorTransfer.java | 34 ++
 .../PrimitiveArrayAndVectorTransfer.java | 104 ++++
 .../table/transfer/PrimitiveTransfer.java | 41 +-
 .../transfer/PrimitiveVectorTransfer.java | 29 +
 .../table/transfer/ShortArrayTransfer.java | 41 ++
 .../parquet/table/transfer/ShortTransfer.java | 11 +-
 .../table/transfer/ShortVectorTransfer.java | 38 ++
 .../table/transfer/StringArrayTransfer.java | 21 +
 .../table/transfer/StringDictionary.java | 95 +++
 .../table/transfer/StringTransfer.java | 14 +-
 .../table/transfer/StringVectorTransfer.java | 21 +
 .../table/transfer/TransferObject.java | 234 ++++--
 .../table/transfer/VariableWidthTransfer.java | 201 +++++++
 .../table/ParquetTableReadWriteTest.java | 190 ++--
 py/server/tests/test_parquet.py | 24 +-
 .../ReplicateParquetTransferObjects.java | 15 +-
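For context, the tables this refactor targets are ones whose columns hold Java arrays or Deephaven vectors, for example the vector columns produced by groupBy; writing such a table now goes through the new array/vector transfer classes instead of ungrouping first. A minimal sketch of that use case, assuming the standard TableTools/ParquetTools APIs and a hypothetical output path (illustrative only, not part of this patch):

import io.deephaven.engine.table.Table;
import io.deephaven.engine.util.TableTools;
import io.deephaven.parquet.table.ParquetTools;

public class WriteVectorColumnsExample {
    public static void main(String[] args) {
        // groupBy("Key") rolls the int column "Val" up into a vector column per key,
        // so writing the result exercises the array/vector write path changed here.
        final Table grouped = TableTools.emptyTable(1_000)
                .update("Key = ii % 10", "Val = (int) ii")
                .groupBy("Key");
        // Hypothetical destination path; writeTable(Table, String) is the existing API.
        ParquetTools.writeTable(grouped, "/tmp/grouped.parquet");
    }
}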
65 files changed, 2410 insertions(+), 836 deletions(-) create mode 100644 extensions/parquet/table/src/main/java/io/deephaven/parquet/table/transfer/ArrayAndVectorTransfer.java create mode 100644 extensions/parquet/table/src/main/java/io/deephaven/parquet/table/transfer/BooleanArrayTransfer.java create mode 100644 extensions/parquet/table/src/main/java/io/deephaven/parquet/table/transfer/BooleanVectorTransfer.java create mode 100644 extensions/parquet/table/src/main/java/io/deephaven/parquet/table/transfer/ByteArrayTransfer.java create mode 100644 extensions/parquet/table/src/main/java/io/deephaven/parquet/table/transfer/ByteVectorTransfer.java create mode 100644 extensions/parquet/table/src/main/java/io/deephaven/parquet/table/transfer/CharArrayTransfer.java create mode 100644 extensions/parquet/table/src/main/java/io/deephaven/parquet/table/transfer/CharVectorTransfer.java create mode 100644 extensions/parquet/table/src/main/java/io/deephaven/parquet/table/transfer/CodecArrayTransfer.java create mode 100644 extensions/parquet/table/src/main/java/io/deephaven/parquet/table/transfer/CodecVectorTransfer.java create mode 100644 extensions/parquet/table/src/main/java/io/deephaven/parquet/table/transfer/DictEncodedStringArrayAndVectorTransfer.java create mode 100644 extensions/parquet/table/src/main/java/io/deephaven/parquet/table/transfer/DictEncodedStringArrayTransfer.java create mode 100644 extensions/parquet/table/src/main/java/io/deephaven/parquet/table/transfer/DictEncodedStringTransfer.java create mode 100644 extensions/parquet/table/src/main/java/io/deephaven/parquet/table/transfer/DictEncodedStringVectorTransfer.java create mode 100644 extensions/parquet/table/src/main/java/io/deephaven/parquet/table/transfer/DoubleArrayTransfer.java create mode 100644 extensions/parquet/table/src/main/java/io/deephaven/parquet/table/transfer/DoubleVectorTransfer.java delete mode 100644 extensions/parquet/table/src/main/java/io/deephaven/parquet/table/transfer/EncodedTransfer.java create mode 100644 extensions/parquet/table/src/main/java/io/deephaven/parquet/table/transfer/FloatArrayTransfer.java create mode 100644 extensions/parquet/table/src/main/java/io/deephaven/parquet/table/transfer/FloatVectorTransfer.java create mode 100644 extensions/parquet/table/src/main/java/io/deephaven/parquet/table/transfer/InstantArrayTransfer.java create mode 100644 extensions/parquet/table/src/main/java/io/deephaven/parquet/table/transfer/InstantVectorTransfer.java create mode 100644 extensions/parquet/table/src/main/java/io/deephaven/parquet/table/transfer/IntArrayTransfer.java create mode 100644 extensions/parquet/table/src/main/java/io/deephaven/parquet/table/transfer/IntVectorTransfer.java create mode 100644 extensions/parquet/table/src/main/java/io/deephaven/parquet/table/transfer/LongArrayTransfer.java create mode 100644 extensions/parquet/table/src/main/java/io/deephaven/parquet/table/transfer/LongVectorTransfer.java create mode 100644 extensions/parquet/table/src/main/java/io/deephaven/parquet/table/transfer/ObjectArrayAndVectorTransfer.java create mode 100644 extensions/parquet/table/src/main/java/io/deephaven/parquet/table/transfer/ObjectArrayTransfer.java create mode 100644 extensions/parquet/table/src/main/java/io/deephaven/parquet/table/transfer/ObjectTransfer.java create mode 100644 extensions/parquet/table/src/main/java/io/deephaven/parquet/table/transfer/ObjectVectorTransfer.java create mode 100644 
extensions/parquet/table/src/main/java/io/deephaven/parquet/table/transfer/PrimitiveArrayAndVectorTransfer.java create mode 100644 extensions/parquet/table/src/main/java/io/deephaven/parquet/table/transfer/PrimitiveVectorTransfer.java create mode 100644 extensions/parquet/table/src/main/java/io/deephaven/parquet/table/transfer/ShortArrayTransfer.java create mode 100644 extensions/parquet/table/src/main/java/io/deephaven/parquet/table/transfer/ShortVectorTransfer.java create mode 100644 extensions/parquet/table/src/main/java/io/deephaven/parquet/table/transfer/StringArrayTransfer.java create mode 100644 extensions/parquet/table/src/main/java/io/deephaven/parquet/table/transfer/StringDictionary.java create mode 100644 extensions/parquet/table/src/main/java/io/deephaven/parquet/table/transfer/StringVectorTransfer.java create mode 100644 extensions/parquet/table/src/main/java/io/deephaven/parquet/table/transfer/VariableWidthTransfer.java diff --git a/engine/vector/src/main/java/io/deephaven/vector/VectorFactory.java b/engine/vector/src/main/java/io/deephaven/vector/VectorFactory.java index 4c8ee75b6bb..172fc487c96 100644 --- a/engine/vector/src/main/java/io/deephaven/vector/VectorFactory.java +++ b/engine/vector/src/main/java/io/deephaven/vector/VectorFactory.java @@ -14,6 +14,12 @@ public enum VectorFactory { // @formatter:off Boolean() { + @Override + @NotNull + public Class> vectorType() { + throw new UnsupportedOperationException("Vector is not implemented for primitive booleans"); + } + @Override @NotNull public final Vector vectorWrap(@NotNull final Object array) { @@ -28,6 +34,12 @@ public Vector vectorWrap(@NotNull final Object array, int offset, int capacit }, Char() { + @Override + @NotNull + public Class> vectorType() { + return CharVector.class; + } + @Override @NotNull public final CharVector vectorWrap(@NotNull final Object array) { @@ -42,6 +54,12 @@ public CharVector vectorWrap(@NotNull final Object array, int offset, int capaci }, Byte() { + @Override + @NotNull + public Class> vectorType() { + return ByteVector.class; + } + @Override @NotNull public final ByteVector vectorWrap(@NotNull final Object array) { @@ -56,6 +74,12 @@ public ByteVector vectorWrap(@NotNull final Object array, int offset, int capaci }, Short() { + @Override + @NotNull + public Class> vectorType() { + return ShortVector.class; + } + @Override @NotNull public final ShortVector vectorWrap(@NotNull final Object array) { @@ -70,6 +94,12 @@ public ShortVector vectorWrap(@NotNull final Object array, int offset, int capac }, Int() { + @Override + @NotNull + public Class> vectorType() { + return IntVector.class; + } + @Override @NotNull public final IntVector vectorWrap(@NotNull final Object array) { @@ -84,6 +114,12 @@ public IntVector vectorWrap(@NotNull final Object array, int offset, int capacit }, Long() { + @Override + @NotNull + public Class> vectorType() { + return LongVector.class; + } + @Override @NotNull public final LongVector vectorWrap(@NotNull final Object array) { @@ -98,6 +134,12 @@ public LongVector vectorWrap(@NotNull final Object array, int offset, int capaci }, Float() { + @Override + @NotNull + public Class> vectorType() { + return FloatVector.class; + } + @Override @NotNull public final FloatVector vectorWrap(@NotNull final Object array) { @@ -112,6 +154,12 @@ public FloatVector vectorWrap(@NotNull final Object array, int offset, int capac }, Double() { + @Override + @NotNull + public Class> vectorType() { + return DoubleVector.class; + } + @Override @NotNull public final DoubleVector 
vectorWrap(@NotNull final Object array) { @@ -126,6 +174,13 @@ public DoubleVector vectorWrap(@NotNull final Object array, int offset, int capa }, Object() { + @Override + @NotNull + public Class> vectorType() { + //noinspection unchecked + return (Class>) (Object) ObjectVector.class; + } + @Override @NotNull public final ObjectVector vectorWrap(@NotNull final Object array) { @@ -149,6 +204,8 @@ public static VectorFactory forElementType(@NotNull final Class clazz) { return BY_ELEMENT_TYPE.get(clazz); } + public abstract @NotNull Class> vectorType(); + @NotNull public abstract Vector vectorWrap(@NotNull Object array); diff --git a/extensions/parquet/base/src/main/java/io/deephaven/parquet/base/PlainBinaryChunkedWriter.java b/extensions/parquet/base/src/main/java/io/deephaven/parquet/base/PlainBinaryChunkedWriter.java index a11bab7ba5d..d44900e8c40 100644 --- a/extensions/parquet/base/src/main/java/io/deephaven/parquet/base/PlainBinaryChunkedWriter.java +++ b/extensions/parquet/base/src/main/java/io/deephaven/parquet/base/PlainBinaryChunkedWriter.java @@ -27,13 +27,15 @@ public class PlainBinaryChunkedWriter extends AbstractBulkValuesWriter private final ByteBufferAllocator allocator; - ByteBuffer innerBuffer; + private ByteBuffer innerBuffer; + private IntBuffer nullOffsets; - public PlainBinaryChunkedWriter(final int pageSize, @NotNull final ByteBufferAllocator allocator) { + PlainBinaryChunkedWriter(final int pageSize, @NotNull final ByteBufferAllocator allocator) { innerBuffer = allocator.allocate(pageSize); innerBuffer.order(ByteOrder.LITTLE_ENDIAN); this.allocator = allocator; innerBuffer.mark(); + nullOffsets = IntBuffer.allocate(4); } @Override @@ -121,7 +123,7 @@ public WriteResult writeBulkFilterNulls(@NotNull final Binary[] bulkValues, public @NotNull WriteResult writeBulkVectorFilterNulls(@NotNull Binary[] bulkValues, final int nonNullLeafCount, @NotNull final Statistics statistics) { - IntBuffer nullOffsets = IntBuffer.allocate(4); + nullOffsets.clear(); for (int i = 0; i < nonNullLeafCount; i++) { if (bulkValues[i] != null) { final Binary v = bulkValues[i]; diff --git a/extensions/parquet/base/src/main/java/io/deephaven/parquet/base/PlainBooleanChunkedWriter.java b/extensions/parquet/base/src/main/java/io/deephaven/parquet/base/PlainBooleanChunkedWriter.java index 4bdbd217063..514a51bf39b 100644 --- a/extensions/parquet/base/src/main/java/io/deephaven/parquet/base/PlainBooleanChunkedWriter.java +++ b/extensions/parquet/base/src/main/java/io/deephaven/parquet/base/PlainBooleanChunkedWriter.java @@ -21,9 +21,11 @@ */ public class PlainBooleanChunkedWriter extends AbstractBulkValuesWriter { private final BooleanPlainValuesWriter writer; + private IntBuffer nullOffsets; - public PlainBooleanChunkedWriter() { + PlainBooleanChunkedWriter() { writer = new BooleanPlainValuesWriter(); + nullOffsets = IntBuffer.allocate(4); } @Override @@ -107,7 +109,7 @@ public WriteResult writeBulkFilterNulls(@NotNull ByteBuffer bulkValues, public @NotNull WriteResult writeBulkVectorFilterNulls(@NotNull ByteBuffer bulkValues, final int rowCount, @NotNull final Statistics statistics) { - IntBuffer nullOffsets = IntBuffer.allocate(4); + nullOffsets.clear(); int i = 0; while (bulkValues.hasRemaining()) { final byte next = bulkValues.get(); diff --git a/extensions/parquet/base/src/main/java/io/deephaven/parquet/base/PlainDoubleChunkedWriter.java b/extensions/parquet/base/src/main/java/io/deephaven/parquet/base/PlainDoubleChunkedWriter.java index 583db477019..9d716d94819 100644 --- 
a/extensions/parquet/base/src/main/java/io/deephaven/parquet/base/PlainDoubleChunkedWriter.java +++ b/extensions/parquet/base/src/main/java/io/deephaven/parquet/base/PlainDoubleChunkedWriter.java @@ -32,11 +32,13 @@ public class PlainDoubleChunkedWriter extends AbstractBulkValuesWriter statistics) { ensureCapacityFor(bulkValues); int i = 0; - IntBuffer nullOffsets = IntBuffer.allocate(4); + nullOffsets.clear(); while (bulkValues.hasRemaining()) { final double v = bulkValues.get(); if (v != QueryConstants.NULL_DOUBLE) { diff --git a/extensions/parquet/base/src/main/java/io/deephaven/parquet/base/PlainFloatChunkedWriter.java b/extensions/parquet/base/src/main/java/io/deephaven/parquet/base/PlainFloatChunkedWriter.java index 6bb4f2d1198..08537613d53 100644 --- a/extensions/parquet/base/src/main/java/io/deephaven/parquet/base/PlainFloatChunkedWriter.java +++ b/extensions/parquet/base/src/main/java/io/deephaven/parquet/base/PlainFloatChunkedWriter.java @@ -32,11 +32,12 @@ public class PlainFloatChunkedWriter extends AbstractBulkValuesWriter statistics) { ensureCapacityFor(bulkValues); int i = 0; - IntBuffer nullOffsets = IntBuffer.allocate(4); + nullOffsets.clear(); while (bulkValues.hasRemaining()) { final float v = bulkValues.get(); if (v != QueryConstants.NULL_FLOAT) { diff --git a/extensions/parquet/base/src/main/java/io/deephaven/parquet/base/PlainIntChunkedWriter.java b/extensions/parquet/base/src/main/java/io/deephaven/parquet/base/PlainIntChunkedWriter.java index 6ec8f96690e..734fefbea4d 100644 --- a/extensions/parquet/base/src/main/java/io/deephaven/parquet/base/PlainIntChunkedWriter.java +++ b/extensions/parquet/base/src/main/java/io/deephaven/parquet/base/PlainIntChunkedWriter.java @@ -33,11 +33,13 @@ public class PlainIntChunkedWriter extends AbstractBulkValuesWriter { private IntBuffer targetBuffer; private ByteBuffer innerBuffer; + private IntBuffer nullOffsets; PlainIntChunkedWriter(final int targetPageSize, @NotNull final ByteBufferAllocator allocator, final int nullValue) { this.allocator = allocator; this.nullValue = nullValue; realloc(targetPageSize); + nullOffsets = IntBuffer.allocate(4); } PlainIntChunkedWriter(final int targetPageSize, @NotNull final ByteBufferAllocator allocator) { @@ -133,7 +135,7 @@ public WriteResult writeBulkVectorFilterNulls(@NotNull final IntBuffer bulkValue @NotNull final Statistics statistics) { ensureCapacityFor(bulkValues); int i = 0; - IntBuffer nullOffsets = IntBuffer.allocate(4); + nullOffsets.clear(); while (bulkValues.hasRemaining()) { final int v = bulkValues.get(); if (v != nullValue) { diff --git a/extensions/parquet/base/src/main/java/io/deephaven/parquet/base/PlainLongChunkedWriter.java b/extensions/parquet/base/src/main/java/io/deephaven/parquet/base/PlainLongChunkedWriter.java index 9eead6b352d..2ea9b6af637 100644 --- a/extensions/parquet/base/src/main/java/io/deephaven/parquet/base/PlainLongChunkedWriter.java +++ b/extensions/parquet/base/src/main/java/io/deephaven/parquet/base/PlainLongChunkedWriter.java @@ -32,11 +32,12 @@ public class PlainLongChunkedWriter extends AbstractBulkValuesWriter private LongBuffer targetBuffer; private ByteBuffer innerBuffer; - + private IntBuffer nullOffsets; PlainLongChunkedWriter(final int targetPageSize, @NotNull final ByteBufferAllocator allocator) { this.allocator = allocator; realloc(targetPageSize); + nullOffsets = IntBuffer.allocate(4); } @Override @@ -128,7 +129,7 @@ public WriteResult writeBulkVectorFilterNulls(@NotNull final LongBuffer bulkValu @NotNull final Statistics statistics) { 
ensureCapacityFor(bulkValues); int i = 0; - IntBuffer nullOffsets = IntBuffer.allocate(4); + nullOffsets.clear(); while (bulkValues.hasRemaining()) { final long v = bulkValues.get(); if (v != QueryConstants.NULL_LONG) { diff --git a/extensions/parquet/base/src/main/java/io/deephaven/parquet/base/RleIntChunkedWriter.java b/extensions/parquet/base/src/main/java/io/deephaven/parquet/base/RleIntChunkedWriter.java index a9ef7901718..18a9ab82ba5 100644 --- a/extensions/parquet/base/src/main/java/io/deephaven/parquet/base/RleIntChunkedWriter.java +++ b/extensions/parquet/base/src/main/java/io/deephaven/parquet/base/RleIntChunkedWriter.java @@ -29,13 +29,14 @@ public class RleIntChunkedWriter extends AbstractBulkValuesWriter { private final RunLengthBitPackingHybridEncoder encoder; private final byte bitWidth; + private IntBuffer nullOffsets; RleIntChunkedWriter(int pageSize, ByteBufferAllocator allocator, byte bitWidth) { encoder = new RunLengthBitPackingHybridEncoder(bitWidth, pageSize, pageSize, allocator); this.bitWidth = bitWidth; + nullOffsets = IntBuffer.allocate(4); } - @Override public final void writeInteger(int v) { try { @@ -132,7 +133,7 @@ public WriteResult writeBulkFilterNulls(@NotNull IntBuffer bulkValues, public @NotNull WriteResult writeBulkVectorFilterNulls(@NotNull IntBuffer bulkValues, final int rowCount, @NotNull final Statistics statistics) { - IntBuffer nullOffsets = IntBuffer.allocate(4); + nullOffsets.clear(); int i = 0; while (bulkValues.hasRemaining()) { int v = bulkValues.get(); diff --git a/extensions/parquet/table/src/main/java/io/deephaven/parquet/table/DictionarySizeExceededException.java b/extensions/parquet/table/src/main/java/io/deephaven/parquet/table/DictionarySizeExceededException.java index e5c0fb37563..f4aab5025c3 100644 --- a/extensions/parquet/table/src/main/java/io/deephaven/parquet/table/DictionarySizeExceededException.java +++ b/extensions/parquet/table/src/main/java/io/deephaven/parquet/table/DictionarySizeExceededException.java @@ -3,7 +3,7 @@ import io.deephaven.UncheckedDeephavenException; import org.jetbrains.annotations.NotNull; -final class DictionarySizeExceededException extends UncheckedDeephavenException { +public final class DictionarySizeExceededException extends UncheckedDeephavenException { public DictionarySizeExceededException(@NotNull final String message) { super(message); } diff --git a/extensions/parquet/table/src/main/java/io/deephaven/parquet/table/ParquetInstructions.java b/extensions/parquet/table/src/main/java/io/deephaven/parquet/table/ParquetInstructions.java index c455b5acaa0..5d84ea2fdc9 100644 --- a/extensions/parquet/table/src/main/java/io/deephaven/parquet/table/ParquetInstructions.java +++ b/extensions/parquet/table/src/main/java/io/deephaven/parquet/table/ParquetInstructions.java @@ -83,10 +83,10 @@ public static int getDefaltMaximumDictionarySize() { return defaultMaximumDictionarySize; } - private static final int MIN_TARGET_PAGE_SIZE = + public static final int MIN_TARGET_PAGE_SIZE = Configuration.getInstance().getIntegerWithDefault("Parquet.minTargetPageSize", 2 << 10); private static final int DEFAULT_TARGET_PAGE_SIZE = - Configuration.getInstance().getIntegerWithDefault("Parquet.defaultTargetPageSize", 8 << 10); + Configuration.getInstance().getIntegerWithDefault("Parquet.defaultTargetPageSize", 1 << 20); private static volatile int defaultTargetPageSize = DEFAULT_TARGET_PAGE_SIZE; private static final boolean DEFAULT_IS_REFRESHING = false; diff --git 
a/extensions/parquet/table/src/main/java/io/deephaven/parquet/table/ParquetTableWriter.java b/extensions/parquet/table/src/main/java/io/deephaven/parquet/table/ParquetTableWriter.java index c0a29302f5f..61ebaa14b91 100644 --- a/extensions/parquet/table/src/main/java/io/deephaven/parquet/table/ParquetTableWriter.java +++ b/extensions/parquet/table/src/main/java/io/deephaven/parquet/table/ParquetTableWriter.java @@ -3,29 +3,20 @@ */ package io.deephaven.parquet.table; -import gnu.trove.impl.Constants; -import gnu.trove.list.array.TIntArrayList; -import gnu.trove.map.hash.TObjectIntHashMap; import io.deephaven.UncheckedDeephavenException; import io.deephaven.api.ColumnName; import io.deephaven.api.RawString; import io.deephaven.api.Selectable; import io.deephaven.api.agg.Aggregation; -import io.deephaven.base.verify.Assert; -import io.deephaven.chunk.*; -import io.deephaven.chunk.attributes.Values; import io.deephaven.engine.liveness.LivenessScopeStack; -import io.deephaven.engine.rowset.RowSequence; import io.deephaven.engine.rowset.RowSet; import io.deephaven.engine.rowset.TrackingRowSet; import io.deephaven.engine.table.*; -import io.deephaven.engine.table.impl.CodecLookup; import io.deephaven.engine.table.impl.QueryTable; import io.deephaven.engine.table.impl.select.FormulaColumn; import io.deephaven.engine.table.impl.select.NullSelectColumn; import io.deephaven.engine.table.impl.select.SelectColumn; import io.deephaven.engine.table.impl.select.SourceColumn; -import io.deephaven.engine.table.impl.sources.ReinterpretUtils; import io.deephaven.parquet.base.ColumnWriter; import io.deephaven.parquet.base.ParquetFileWriter; import io.deephaven.parquet.base.RowGroupWriter; @@ -39,41 +30,29 @@ import io.deephaven.util.QueryConstants; import io.deephaven.util.SafeCloseable; import io.deephaven.util.annotations.VisibleForTesting; -import io.deephaven.util.type.TypeUtils; import io.deephaven.vector.Vector; import org.apache.commons.lang3.tuple.Pair; import org.apache.parquet.bytes.HeapByteBufferAllocator; -import org.apache.parquet.column.statistics.IntStatistics; import org.apache.parquet.column.statistics.Statistics; -import org.apache.parquet.io.api.Binary; +import org.apache.parquet.schema.PrimitiveType; +import org.apache.parquet.schema.Types; import org.jetbrains.annotations.NotNull; import java.io.File; import java.io.IOException; -import java.lang.reflect.Field; import java.nio.*; import java.nio.file.Path; import java.nio.file.Paths; -import java.time.Instant; import java.util.*; -import java.util.function.IntSupplier; - -import static io.deephaven.util.QueryConstants.NULL_INT; /** * API for writing DH tables in parquet format */ public class ParquetTableWriter { - private static final int INITIAL_DICTIONARY_SIZE = 1 << 8; - public static final String METADATA_KEY = "deephaven"; - - private static final int LOCAL_CHUNK_SIZE = 1024; - public static final String BEGIN_POS = "dh_begin_pos"; public static final String END_POS = "dh_end_pos"; public static final String GROUPING_KEY = "dh_key"; - public static final String PARQUET_FILE_EXTENSION = ".parquet"; /** @@ -187,10 +166,9 @@ public static void write( // An example is the necessary precision and scale for a BigDecimal column writen as decimal logical type. 
final Map> computedCache = new HashMap<>(); final ParquetFileWriter parquetFileWriter = getParquetFileWriter(computedCache, definition, tableRowSet, - columnSourceMap, path, writeInstructions, tableMeta, - tableInfoBuilder); - - write(t, definition, writeInstructions, parquetFileWriter, computedCache); + columnSourceMap, path, writeInstructions, tableMeta, tableInfoBuilder); + // Given the transformation, do not use the original table's "definition" for writing + write(t, writeInstructions, parquetFileWriter, computedCache); } } @@ -200,14 +178,12 @@ public static void write( * tables created are properly cleaned up. * * @param table The table to write - * @param definition The table definition * @param writeInstructions Write instructions for customizations while writing * @param parquetFileWriter the writer * @throws IOException For file writing related errors */ private static void write( @NotNull final Table table, - @NotNull final TableDefinition definition, @NotNull final ParquetInstructions writeInstructions, @NotNull final ParquetFileWriter parquetFileWriter, @NotNull final Map> computedCache) throws IOException { @@ -217,13 +193,13 @@ private static void write( if (nRows > 0) { final RowGroupWriter rowGroupWriter = parquetFileWriter.addRowGroup(nRows); for (final Map.Entry> nameToSource : columnSourceMap.entrySet()) { - final String name = nameToSource.getKey(); + final String columnName = nameToSource.getKey(); final ColumnSource columnSource = nameToSource.getValue(); try { - writeColumnSource(computedCache, tableRowSet, rowGroupWriter, name, columnSource, - definition.getColumn(name), writeInstructions); + writeColumnSource(tableRowSet, writeInstructions, rowGroupWriter, computedCache, columnName, + columnSource); } catch (IllegalAccessException e) { - throw new RuntimeException("Failed to write column " + name, e); + throw new RuntimeException("Failed to write column " + columnName, e); } } } @@ -237,7 +213,8 @@ private static void write( * * @param table the input table * @param definition the table definition being written - * @return a transformed view of the input table. + * @return a transformed view of the input table. The table definition for the transformed view can be different + * from the definition of the input table. */ @NotNull private static Table pretransformTable(@NotNull final Table table, @NotNull final TableDefinition definition) { @@ -341,458 +318,142 @@ private static ParquetFileWriter getParquetFileWriter( writeInstructions.getCompressionCodecName(), extraMetaData); } - private interface ColumnWriteHelper { - - boolean isVectorFormat(); - - IntSupplier valuePageSizeSupplier(); - } - - /** - * ColumnWriteHelper for columns of "flat" data with no nesting or vector encoding. - */ - private static class FlatColumnWriterHelper implements ColumnWriteHelper { - - /** - * The maximum page size for values. - */ - private final int maxValuePageSize; - - private FlatColumnWriterHelper(final int maxValuePageSize) { - this.maxValuePageSize = maxValuePageSize; - } - - public boolean isVectorFormat() { - return false; - } - - public IntSupplier valuePageSizeSupplier() { - return () -> maxValuePageSize; - } - } - - /** - * This is a helper struct storing useful data required to write column source in the parquet file, particularly - * helpful for writing array/vector data. - */ - private static class VectorColumnWriterHelper implements ColumnWriteHelper { - - /** - * The source for per-row array/vector lengths. 
- */ - private final ColumnSource lengthSource; - - /** - * The RowSet for (ungrouped) values. - */ - private final RowSet valueRowSet; - - /** - * The size of each value page. Parallel to {@link #lengthPageSizes}. - */ - private final TIntArrayList valuePageSizes; - - /** - * The size of each length page. Parallel to {@link #valuePageSizes}. - */ - private final TIntArrayList lengthPageSizes; - - private VectorColumnWriterHelper( - @NotNull final ColumnSource lengthSource, - @NotNull final RowSet valueRowSet) { - this.lengthSource = lengthSource; - this.valueRowSet = valueRowSet; - valuePageSizes = new TIntArrayList(); - lengthPageSizes = new TIntArrayList(); - } - - public boolean isVectorFormat() { - return true; - } - - public IntSupplier lengthPageSizeSupplier() { - return lengthPageSizes.iterator()::next; - } - - public IntSupplier valuePageSizeSupplier() { - return valuePageSizes.iterator()::next; - } - } - @VisibleForTesting static void writeColumnSource( - @NotNull final Map> computedCache, - @NotNull final TrackingRowSet tableRowSet, + @NotNull final RowSet tableRowSet, + @NotNull final ParquetInstructions writeInstructions, @NotNull final RowGroupWriter rowGroupWriter, - @NotNull final String name, - @NotNull final ColumnSource columnSourceIn, - @NotNull final ColumnDefinition columnDefinition, - @NotNull final ParquetInstructions writeInstructions) throws IllegalAccessException, IOException { - ColumnSource valueSource = columnSourceIn; - - final ColumnWriteHelper helper; - int maxValuesPerPage = 0; - int maxRowsPerPage = 0; - int pageCount; - if (columnSourceIn.getComponentType() != null - && !CodecLookup.explicitCodecPresent(writeInstructions.getCodecName(columnDefinition.getName())) - && !CodecLookup.codecRequired(columnDefinition)) { - final VectorColumnWriterHelper vectorHelper; - final int targetValuesPerPage = getTargetRowsPerPage( - valueSource.getComponentType(), - writeInstructions.getTargetPageSize()); - final HashMap> columns = new HashMap<>(); - columns.put("array", valueSource); - { - final Table lengthsTable = new QueryTable(tableRowSet, columns); - final ColumnSource lengthSource = lengthsTable - .view("len= ((Object)array) == null ? null : (int)array." - + (Vector.class.isAssignableFrom(valueSource.getType()) ? "size()" : "length")) - .getColumnSource("len"); - final Table ungroupedArrays = lengthsTable.ungroup("array"); - vectorHelper = new VectorColumnWriterHelper(lengthSource, ungroupedArrays.getRowSet()); - helper = vectorHelper; - valueSource = ungroupedArrays.getColumnSource("array"); - } - - // This is the count of items contained in all arrays from the original table as we process. - int valuesInPage = 0; - - // This is the count of rows in the original table as we process them - int rowsInPage = 0; - try (final ChunkSource.GetContext context = vectorHelper.lengthSource.makeGetContext(LOCAL_CHUNK_SIZE); - final RowSequence.Iterator it = tableRowSet.getRowSequenceIterator()) { - while (it.hasMore()) { - final RowSequence rs = it.getNextRowSequenceWithLength(LOCAL_CHUNK_SIZE); - final IntChunk lengthChunk = - vectorHelper.lengthSource.getChunk(context, rs).asIntChunk(); - for (int chunkPos = 0; chunkPos < lengthChunk.size(); chunkPos++) { - final int curLength = lengthChunk.get(chunkPos); - if (curLength != NULL_INT) { - // If this array puts us past the target number of items within a page then we'll record the - // current values into the page lists above and restart our counts. 
- if ((valuesInPage + curLength > targetValuesPerPage || rowsInPage + 1 > targetValuesPerPage) - && (valuesInPage > 0 || rowsInPage > 0)) { - // Record the current item count and original row count into the parallel page arrays. - vectorHelper.valuePageSizes.add(valuesInPage); - vectorHelper.lengthPageSizes.add(rowsInPage); - maxValuesPerPage = Math.max(valuesInPage, maxValuesPerPage); - maxRowsPerPage = Math.max(rowsInPage, maxRowsPerPage); - - // Reset the counts to compute these values for the next page. - rowsInPage = 0; - valuesInPage = 0; - } - valuesInPage += curLength; - } - rowsInPage++; - } - } - } - - // If there are any leftover, accumulate the last page. - if (rowsInPage > 0) { - maxValuesPerPage = Math.max(valuesInPage, maxValuesPerPage); - maxRowsPerPage = Math.max(rowsInPage, maxRowsPerPage); - vectorHelper.valuePageSizes.add(valuesInPage); - vectorHelper.lengthPageSizes.add(rowsInPage); - } - pageCount = vectorHelper.valuePageSizes.size(); - } else { - final long tableSize = tableRowSet.size(); - final int targetPageSize = getTargetRowsPerPage( - valueSource.getType(), writeInstructions.getTargetPageSize()); - maxValuesPerPage = maxRowsPerPage = (int) Math.min(tableSize, targetPageSize); - helper = new FlatColumnWriterHelper(maxValuesPerPage); - pageCount = Math.toIntExact((tableSize + targetPageSize - 1) / targetPageSize); - } - - Class columnType = valueSource.getType(); - if (columnType == Instant.class) { - // noinspection unchecked - valueSource = (ColumnSource) ReinterpretUtils.instantToLongSource( - (ColumnSource) valueSource); - columnType = valueSource.getType(); - } else if (columnType == Boolean.class) { - // noinspection unchecked - valueSource = (ColumnSource) ReinterpretUtils.booleanToByteSource( - (ColumnSource) valueSource); - } - + @NotNull final Map> computedCache, + @NotNull final String columnName, + @NotNull ColumnSource columnSource) throws IllegalAccessException, IOException { try (final ColumnWriter columnWriter = rowGroupWriter.addColumn( - writeInstructions.getParquetColumnNameFromColumnNameOrDefault(name))) { + writeInstructions.getParquetColumnNameFromColumnNameOrDefault(columnName))) { boolean usedDictionary = false; - if (valueSource.getType() == String.class) { - usedDictionary = tryEncodeDictionary(writeInstructions, - tableRowSet, - columnDefinition, - columnWriter, - valueSource, - helper, - maxValuesPerPage, - maxRowsPerPage, - pageCount); + if (String.class.equals(columnSource.getType()) || String.class.equals(columnSource.getComponentType())) { + usedDictionary = + tryEncodeDictionary(tableRowSet, writeInstructions, columnWriter, columnName, columnSource); } - if (!usedDictionary) { - encodePlain(writeInstructions, - tableRowSet, - columnDefinition, - columnType, - columnWriter, - valueSource, - helper, - computedCache, - maxValuesPerPage, - maxRowsPerPage, - pageCount); + encodePlain(tableRowSet, writeInstructions, columnWriter, computedCache, columnName, columnSource); } } } - private static void encodePlain(@NotNull final ParquetInstructions writeInstructions, - @NotNull final RowSet tableRowSet, - @NotNull final ColumnDefinition columnDefinition, - @NotNull final Class columnType, - @NotNull final ColumnWriter columnWriter, - @NotNull final ColumnSource valueSource, - @NotNull final ColumnWriteHelper writingHelper, - @NotNull final Map> computedCache, - final int maxValuesPerPage, - final int maxRowsPerPage, - final int pageCount) throws IOException { - try (final TransferObject transferObject = TransferObject.create(computedCache, 
- tableRowSet, - valueSource, - columnDefinition, - maxValuesPerPage, - columnType, - writeInstructions)) { - final VectorColumnWriterHelper vectorHelper = writingHelper.isVectorFormat() - ? (VectorColumnWriterHelper) writingHelper - : null; - final Statistics statistics = columnWriter.getStats(); - // @formatter:off - try (final RowSequence.Iterator lengthRowSetIterator = vectorHelper != null - ? tableRowSet.getRowSequenceIterator() - : null; - final ChunkSource.GetContext lengthSourceContext = vectorHelper != null - ? vectorHelper.lengthSource.makeGetContext(maxRowsPerPage) - : null; - final RowSequence.Iterator valueRowSetIterator = vectorHelper != null - ? vectorHelper.valueRowSet.getRowSequenceIterator() - : tableRowSet.getRowSequenceIterator()) { - // @formatter:on - - final IntBuffer repeatCount = vectorHelper != null - ? IntBuffer.allocate(maxRowsPerPage) - : null; - final IntSupplier lengthPageSizeGetter = vectorHelper != null - ? vectorHelper.lengthPageSizeSupplier() - : null; - final IntSupplier valuePageSizeGetter = writingHelper.valuePageSizeSupplier(); - for (int step = 0; step < pageCount; ++step) { - final RowSequence rs = - valueRowSetIterator.getNextRowSequenceWithLength(valuePageSizeGetter.getAsInt()); - transferObject.fetchData(rs); - if (vectorHelper != null) { - final IntChunk lenChunk = vectorHelper.lengthSource.getChunk( - lengthSourceContext, - lengthRowSetIterator.getNextRowSequenceWithLength(lengthPageSizeGetter.getAsInt())) - .asIntChunk(); - lenChunk.copyToTypedBuffer(0, repeatCount, 0, lenChunk.size()); - repeatCount.limit(lenChunk.size()); - // TODO(deephaven-core:DH-4495): Add support for paginating vector data - // We do not paginate vector data, because our parquet reading code expects all elements from a - // single array or a vector to be on the same page (refer classes ToVectorPage and ToArrayPage - // for more details). - int numValuesBuffered = transferObject.transferAllToBuffer(); - columnWriter.addVectorPage(transferObject.getBuffer(), repeatCount, numValuesBuffered, - statistics); - repeatCount.clear(); - } else { - // Split a single page into multiple if we are not able to fit all the entries in one page - do { - int numValuesBuffered = transferObject.transferOnePageToBuffer(); - columnWriter.addPage(transferObject.getBuffer(), numValuesBuffered, statistics); - } while (transferObject.hasMoreDataToBuffer()); - } - } - } - } + /** + * Makes a copy of the given buffer + */ + private static IntBuffer makeCopy(IntBuffer orig) { + IntBuffer copy = IntBuffer.allocate(orig.capacity()); + copy.put(orig).flip(); + return copy; } private static boolean tryEncodeDictionary( - @NotNull final ParquetInstructions writeInstructions, @NotNull final RowSet tableRowSet, - @NotNull final ColumnDefinition columnDefinition, + @NotNull final ParquetInstructions writeInstructions, @NotNull final ColumnWriter columnWriter, - @NotNull final ColumnSource valueSource, - @NotNull final ColumnWriteHelper writingHelper, - final int maxValuesPerPage, - final int maxRowsPerPage, - final int pageCount) throws IOException { - // Note: We only support strings as dictionary pages. Knowing that, we can make some assumptions about chunk - // types and avoid a bunch of lambda and virtual method invocations. If we decide to support more, than - // these assumptions will need to be revisited. 
- Assert.eq(valueSource.getType(), "valueSource.getType()", String.class, "ColumnSource supports dictionary"); - - final boolean useDictionaryHint = writeInstructions.useDictionary(columnDefinition.getName()); + @NotNull final String columnName, + @NotNull final ColumnSource columnSource) throws IOException { + final boolean useDictionaryHint = writeInstructions.useDictionary(columnName); final int maxKeys = useDictionaryHint ? Integer.MAX_VALUE : writeInstructions.getMaximumDictionaryKeys(); final int maxDictSize = useDictionaryHint ? Integer.MAX_VALUE : writeInstructions.getMaximumDictionarySize(); - final VectorColumnWriterHelper vectorHelper = writingHelper.isVectorFormat() - ? (VectorColumnWriterHelper) writingHelper - : null; + // We encode dictionary positions as integers, therefore for a null string, we use NULL_INT as the position + final int NULL_POS = QueryConstants.NULL_INT; final Statistics statistics = columnWriter.getStats(); - try { - final List pageBuffers = new ArrayList<>(); - final BitSet pageBufferHasNull = new BitSet(); - Binary[] encodedKeys = new Binary[Math.min(INITIAL_DICTIONARY_SIZE, maxKeys)]; - - final TObjectIntHashMap keyToPos = - new TObjectIntHashMap<>(Constants.DEFAULT_CAPACITY, - Constants.DEFAULT_LOAD_FACTOR, - QueryConstants.NULL_INT); - int keyCount = 0; - int dictSize = 0; - boolean hasNulls = false; - final IntSupplier valuePageSizeGetter = writingHelper.valuePageSizeSupplier(); - try (final ChunkSource.GetContext context = valueSource.makeGetContext(maxValuesPerPage); - final RowSequence.Iterator it = vectorHelper != null - ? vectorHelper.valueRowSet.getRowSequenceIterator() - : tableRowSet.getRowSequenceIterator()) { - for (int curPage = 0; curPage < pageCount; curPage++) { - boolean pageHasNulls = false; - final RowSequence rs = it.getNextRowSequenceWithLength(valuePageSizeGetter.getAsInt()); - final ObjectChunk chunk = - valueSource.getChunk(context, rs).asObjectChunk(); - final IntBuffer posInDictionary = IntBuffer.allocate(rs.intSize()); - for (int vi = 0; vi < chunk.size(); ++vi) { - final String key = chunk.get(vi); - int dictionaryPos = keyToPos.get(key); - if (dictionaryPos == keyToPos.getNoEntryValue()) { - // Track the min/max statistics while the dictionary is being built. 
- if (key == null) { - hasNulls = pageHasNulls = true; - } else { - if (keyCount == encodedKeys.length) { - // Copy into an array of double the size with upper limit at maxKeys - if (keyCount == maxKeys) { - throw new DictionarySizeExceededException(String.format( - "Dictionary maximum keys exceeded for %s", columnDefinition.getName())); - } - encodedKeys = Arrays.copyOf(encodedKeys, (int) Math.min(keyCount * 2L, maxKeys)); - } - final Binary encodedKey = Binary.fromString(key); - dictSize += encodedKey.length(); - if (dictSize > maxDictSize) { - throw new DictionarySizeExceededException( - String.format("Dictionary maximum size exceeded for %s", - columnDefinition.getName())); - } - encodedKeys[keyCount] = encodedKey; - statistics.updateStats(encodedKey); - dictionaryPos = keyCount; - keyCount++; - } - keyToPos.put(key, dictionaryPos); - } - posInDictionary.put(dictionaryPos); + final List pageBuffers = new ArrayList<>(); + final List lengthsBuffers = new ArrayList<>(); + final BitSet pageBufferHasNull = new BitSet(); + final boolean isArrayOrVector = (columnSource.getComponentType() != null); + final StringDictionary dictionary = new StringDictionary(maxKeys, maxDictSize, statistics, NULL_POS); + int curPage = 0; + try (final TransferObject transferObject = TransferObject.createDictEncodedStringTransfer( + tableRowSet, columnSource, writeInstructions.getTargetPageSize(), dictionary)) { + boolean done; + do { + // Paginate the data and prepare the dictionary. Then add the dictionary page followed by all data pages + transferObject.transferOnePageToBuffer(); + done = !transferObject.hasMoreDataToBuffer(); + if (done) { + // If done, we store a reference to transfer object's page buffer, else we make copies of all the + // page buffers and write them later + pageBuffers.add(transferObject.getBuffer()); + if (isArrayOrVector) { + lengthsBuffers.add(transferObject.getRepeatCount()); } - pageBuffers.add(posInDictionary); - pageBufferHasNull.set(curPage, pageHasNulls); - } - } - - if (keyCount == 0 && hasNulls) { - // Reset the stats because we will re-encode these in PLAIN encoding. - columnWriter.resetStats(); - return false; - } - - List arraySizeBuffers = null; - if (vectorHelper != null) { - arraySizeBuffers = new ArrayList<>(); - final IntSupplier lengthPageSizeGetter = vectorHelper.lengthPageSizeSupplier(); - try (final ChunkSource.GetContext context = - vectorHelper.lengthSource.makeGetContext(maxRowsPerPage); - final RowSequence.Iterator it = tableRowSet.getRowSequenceIterator()) { - while (it.hasMore()) { - final RowSequence rs = it.getNextRowSequenceWithLength(lengthPageSizeGetter.getAsInt()); - final IntChunk chunk = - vectorHelper.lengthSource.getChunk(context, rs).asIntChunk(); - final IntBuffer newBuffer = IntBuffer.allocate(chunk.size()); - chunk.copyToTypedBuffer(0, newBuffer, 0, chunk.size()); - newBuffer.limit(chunk.size()); - arraySizeBuffers.add(newBuffer); + } else { + pageBuffers.add(makeCopy(transferObject.getBuffer())); + if (isArrayOrVector) { + lengthsBuffers.add(makeCopy(transferObject.getRepeatCount())); } } - } - - columnWriter.addDictionaryPage(encodedKeys, keyCount); - final Iterator arraySizeIt = arraySizeBuffers == null ? null : arraySizeBuffers.iterator(); - // We've already determined min/max statistics while building the dictionary. Now use an integer statistics - // object to track the number of nulls that will be written. 
- Statistics tmpStats = new IntStatistics(); - for (int i = 0; i < pageBuffers.size(); ++i) { - final IntBuffer pageBuffer = pageBuffers.get(i); - final boolean pageHasNulls = pageBufferHasNull.get(i); - pageBuffer.flip(); - if (vectorHelper != null) { - columnWriter.addVectorPage(pageBuffer, arraySizeIt.next(), pageBuffer.remaining(), tmpStats); - } else if (pageHasNulls) { - columnWriter.addPage(pageBuffer, pageBuffer.remaining(), tmpStats); - } else { - columnWriter.addPageNoNulls(pageBuffer, pageBuffer.remaining(), tmpStats); + if (transferObject.pageHasNull()) { + pageBufferHasNull.set(curPage); } - } - // Add the count of nulls to the overall stats. - statistics.incrementNumNulls(tmpStats.getNumNulls()); - return true; + curPage++; + } while (!done); } catch (final DictionarySizeExceededException ignored) { // Reset the stats because we will re-encode these in PLAIN encoding. columnWriter.resetStats(); - // We discard all the dictionary data accumulated so far and fall back to PLAIN encoding. We could have - // added a dictionary page first with data collected so far and then encoded the remaining data using PLAIN - // encoding (TODO deephaven-core#946). + // TODO(deephaven-core#946): We discard all dictionary data accumulated so far and fall back to PLAIN + // encoding. We could have added a dictionary page first with data collected so far and then encoded the + // remaining data using PLAIN encoding return false; } - } - - /** - * Get the number of rows that fit within the current targetPageSize for the specified type. - * - * @param columnType the column type - * @return the number of rows that fit within the target page size. - */ - private static int getTargetRowsPerPage(@NotNull final Class columnType, - final int targetPageSize) - throws IllegalAccessException { - if (columnType == Boolean.class) { - return targetPageSize * 8; - } - if (columnType == short.class || columnType == char.class || columnType == byte.class) { - return targetPageSize / Integer.BYTES; + if (dictionary.getKeyCount() == 0 && !pageBufferHasNull.isEmpty()) { + // Reset the stats because we will re-encode these in PLAIN encoding. + columnWriter.resetStats(); + return false; } - - if (columnType == String.class) { - // We don't know the length of strings until we read the actual data. Therefore, we take a relaxed estimate - // here and final calculation is done when writing the data. - return targetPageSize; + columnWriter.addDictionaryPage(dictionary.getEncodedKeys(), dictionary.getKeyCount()); + // We've already determined min/max statistics for the strings while building the dictionary. The buffer now + // stores only the offsets in the dictionary, and we don't need statistics for offsets. Therefore, we create a + // temporary integer stats object just to track the number of nulls and pass it to lower layers. 
+ // We use the following fake type object to create proper statistics object + final PrimitiveType fakeObject = Types.optional(PrimitiveType.PrimitiveTypeName.INT32).named("fake"); + final Statistics tmpStats = Statistics.createStats(fakeObject); + final int numPages = pageBuffers.size(); + for (int i = 0; i < numPages; ++i) { + final IntBuffer pageBuffer = pageBuffers.get(i); + if (isArrayOrVector) { + columnWriter.addVectorPage(pageBuffer, lengthsBuffers.get(i), pageBuffer.remaining(), tmpStats); + } else { + final boolean pageHasNulls = pageBufferHasNull.get(i); + if (pageHasNulls) { + columnWriter.addPage(pageBuffer, pageBuffer.remaining(), tmpStats); + } else { + columnWriter.addPageNoNulls(pageBuffer, pageBuffer.remaining(), tmpStats); + } + } } + // Add the count of nulls to the overall stats. + statistics.incrementNumNulls(tmpStats.getNumNulls()); + return true; + } - try { - final Field bytesCountField = TypeUtils.getBoxedType(columnType).getField("BYTES"); - return targetPageSize / ((Integer) bytesCountField.get(null)); - } catch (NoSuchFieldException e) { - // We assume the baseline and go from there - return targetPageSize / 8; + private static void encodePlain( + @NotNull final RowSet tableRowSet, + @NotNull final ParquetInstructions writeInstructions, + @NotNull final ColumnWriter columnWriter, + @NotNull final Map> computedCache, + @NotNull final String columnName, + @NotNull final ColumnSource columnSource) throws IOException { + try (final TransferObject transferObject = TransferObject.create( + tableRowSet, writeInstructions, computedCache, columnName, columnSource)) { + final Statistics statistics = columnWriter.getStats(); + boolean writeVectorPages = (transferObject instanceof ArrayAndVectorTransfer); + do { + int numValuesBuffered = transferObject.transferOnePageToBuffer(); + if (writeVectorPages) { + columnWriter.addVectorPage(transferObject.getBuffer(), transferObject.getRepeatCount(), + numValuesBuffered, statistics); + } else { + columnWriter.addPage(transferObject.getBuffer(), numValuesBuffered, statistics); + } + } while (transferObject.hasMoreDataToBuffer()); } } - - private static Table groupingAsTable(Table tableToSave, String columnName) { final QueryTable coalesced = (QueryTable) tableToSave.coalesce(); final Table tableToGroup = (coalesced.isRefreshing() ? 
(QueryTable) coalesced.silent() : coalesced) diff --git a/extensions/parquet/table/src/main/java/io/deephaven/parquet/table/ParquetTools.java b/extensions/parquet/table/src/main/java/io/deephaven/parquet/table/ParquetTools.java index e230034eec4..201eddfd04f 100644 --- a/extensions/parquet/table/src/main/java/io/deephaven/parquet/table/ParquetTools.java +++ b/extensions/parquet/table/src/main/java/io/deephaven/parquet/table/ParquetTools.java @@ -714,7 +714,7 @@ public static Table readPartitionedTableWithMetadata( return readPartitionedTable(layout, layout.getInstructions(), layout.getTableDefinition()); } - private static final SimpleTypeMap> DB_ARRAY_TYPE_MAP = SimpleTypeMap.create( + private static final SimpleTypeMap> VECTOR_TYPE_MAP = SimpleTypeMap.create( null, CharVector.class, ByteVector.class, ShortVector.class, IntVector.class, LongVector.class, FloatVector.class, DoubleVector.class, ObjectVector.class); @@ -748,7 +748,7 @@ private static ParquetSchemaReader.ColumnDefinitionConsumer makeSchemaReaderCons if (parquetColDef.dhSpecialType == ColumnTypeInfo.SpecialType.StringSet) { colDef = ColumnDefinition.fromGenericType(parquetColDef.name, StringSet.class, null); } else if (parquetColDef.dhSpecialType == ColumnTypeInfo.SpecialType.Vector) { - final Class vectorType = DB_ARRAY_TYPE_MAP.get(baseType); + final Class vectorType = VECTOR_TYPE_MAP.get(baseType); if (vectorType != null) { colDef = ColumnDefinition.fromGenericType(parquetColDef.name, vectorType, baseType); } else { diff --git a/extensions/parquet/table/src/main/java/io/deephaven/parquet/table/pagestore/VariablePageSizeColumnChunkPageStore.java b/extensions/parquet/table/src/main/java/io/deephaven/parquet/table/pagestore/VariablePageSizeColumnChunkPageStore.java index ef3cc2f81a0..2356393ea6d 100644 --- a/extensions/parquet/table/src/main/java/io/deephaven/parquet/table/pagestore/VariablePageSizeColumnChunkPageStore.java +++ b/extensions/parquet/table/src/main/java/io/deephaven/parquet/table/pagestore/VariablePageSizeColumnChunkPageStore.java @@ -52,7 +52,7 @@ private void extendOnePage(final int prevNumPages) { synchronized (this) { int localNumPages = numPages; - // Make sure that no one has has already extended to this page yet. + // Make sure that no one has already extended to this page yet. 
if (localNumPages == prevNumPages) { Assert.assertion(columnPageReaderIterator.hasNext(), "columnPageReaderIterator.hasNext()", diff --git a/extensions/parquet/table/src/main/java/io/deephaven/parquet/table/transfer/ArrayAndVectorTransfer.java b/extensions/parquet/table/src/main/java/io/deephaven/parquet/table/transfer/ArrayAndVectorTransfer.java new file mode 100644 index 00000000000..a934568a441 --- /dev/null +++ b/extensions/parquet/table/src/main/java/io/deephaven/parquet/table/transfer/ArrayAndVectorTransfer.java @@ -0,0 +1,66 @@ +/** + * Copyright (c) 2016-2023 Deephaven Data Labs and Patent Pending + */ +package io.deephaven.parquet.table.transfer; + +import io.deephaven.engine.rowset.RowSequence; +import io.deephaven.engine.table.ColumnSource; +import io.deephaven.util.QueryConstants; +import org.jetbrains.annotations.NotNull; + +import java.nio.IntBuffer; +import java.util.function.Supplier; + +/** + * Base class for all array and vector transfer objects + */ +public abstract class ArrayAndVectorTransfer + extends VariableWidthTransfer { + final IntBuffer repeatCounts; // Stores the lengths of arrays/vectors + + ArrayAndVectorTransfer(@NotNull final ColumnSource columnSource, @NotNull final RowSequence tableRowSet, + final int maxValuesPerPage, final int targetPageSize, @NotNull final BUFFER_TYPE buffer) { + super(columnSource, tableRowSet, maxValuesPerPage, targetPageSize, buffer); + this.repeatCounts = IntBuffer.allocate(maxValuesPerPage); + } + + @Override + public final IntBuffer getRepeatCount() { + return repeatCounts; + } + + @Override + final boolean addNullToBuffer() { + if (!repeatCounts.hasRemaining()) { + return false; + } + repeatCounts.put(QueryConstants.NULL_INT); + return true; + } + + @Override + final boolean isBufferEmpty() { + return repeatCounts.position() == 0; + } + + /** + * Helper class for creating a supplier of array data + * + * @param The type of the array + */ + static final class ArrayDataSupplier implements Supplier { + private A[] data; + private int pos = 0; + + void fill(final @NotNull A @NotNull [] data) { + this.data = data; + this.pos = 0; + } + + @Override + public A get() { + return data[pos++]; + } + } +} + diff --git a/extensions/parquet/table/src/main/java/io/deephaven/parquet/table/transfer/BooleanArrayTransfer.java b/extensions/parquet/table/src/main/java/io/deephaven/parquet/table/transfer/BooleanArrayTransfer.java new file mode 100644 index 00000000000..f6aa1b66bab --- /dev/null +++ b/extensions/parquet/table/src/main/java/io/deephaven/parquet/table/transfer/BooleanArrayTransfer.java @@ -0,0 +1,42 @@ +/** + * Copyright (c) 2016-2023 Deephaven Data Labs and Patent Pending + */ +package io.deephaven.parquet.table.transfer; + +import io.deephaven.engine.rowset.RowSequence; +import io.deephaven.engine.table.ColumnSource; +import io.deephaven.util.BooleanUtils; +import org.jetbrains.annotations.NotNull; + +import java.nio.ByteBuffer; + +final class BooleanArrayTransfer extends PrimitiveArrayAndVectorTransfer { + // We encode booleans as bytes here and bit pack them with 8 booleans per byte at the time of writing. + // Therefore, we need to allocate (targetPageSize * 8) bytes for the buffer. 
+ private static final int BYTES_NEEDED_PER_ENCODED_BOOLEAN_VALUE = 1; + private static final int NUM_BIT_PACKED_BOOLEANS_PER_BYTE = 8; + BooleanArrayTransfer(@NotNull final ColumnSource columnSource, @NotNull final RowSequence tableRowSet, + final int targetPageSize) { + super(columnSource, tableRowSet, targetPageSize * NUM_BIT_PACKED_BOOLEANS_PER_BYTE, + targetPageSize * NUM_BIT_PACKED_BOOLEANS_PER_BYTE, + ByteBuffer.allocate(targetPageSize * NUM_BIT_PACKED_BOOLEANS_PER_BYTE), + BYTES_NEEDED_PER_ENCODED_BOOLEAN_VALUE); + } + + @Override + int getSize(final Boolean @NotNull [] data) { + return data.length; + } + + @Override + void resizeBuffer(final int length) { + buffer = ByteBuffer.allocate(length); + } + + @Override + void copyToBuffer(final @NotNull EncodedData data) { + for (Boolean b : data.encodedValues) { + buffer.put(BooleanUtils.booleanAsByte(b)); + } + } +} diff --git a/extensions/parquet/table/src/main/java/io/deephaven/parquet/table/transfer/BooleanTransfer.java b/extensions/parquet/table/src/main/java/io/deephaven/parquet/table/transfer/BooleanTransfer.java index dca79c77ab6..b3725028421 100644 --- a/extensions/parquet/table/src/main/java/io/deephaven/parquet/table/transfer/BooleanTransfer.java +++ b/extensions/parquet/table/src/main/java/io/deephaven/parquet/table/transfer/BooleanTransfer.java @@ -5,27 +5,36 @@ import io.deephaven.chunk.WritableByteChunk; import io.deephaven.chunk.attributes.Values; +import io.deephaven.engine.rowset.RowSequence; +import io.deephaven.engine.rowset.RowSet; import io.deephaven.engine.table.ColumnSource; import org.jetbrains.annotations.NotNull; import java.nio.ByteBuffer; -class BooleanTransfer extends PrimitiveTransfer, ByteBuffer> { - - public static BooleanTransfer create(@NotNull final ColumnSource columnSource, int targetSize) { - final byte[] backingArray = new byte[targetSize]; +final class BooleanTransfer extends PrimitiveTransfer, ByteBuffer> { + // We encode booleans as bytes here and bit pack them with 8 booleans per byte at the time of writing. + // Therefore, max values per page are (targetPageSize * 8). 
+ static BooleanTransfer create(@NotNull final ColumnSource columnSource, @NotNull final RowSet tableRowSet, + int targetPageSize) { + final int NUM_BIT_PACKED_BOOLEANS_PER_BYTE = 8; + final int maxValuesPerPage = Math.toIntExact(Math.min(tableRowSet.size(), + (long) targetPageSize * NUM_BIT_PACKED_BOOLEANS_PER_BYTE)); + final byte[] backingArray = new byte[maxValuesPerPage]; return new BooleanTransfer( columnSource, + tableRowSet, WritableByteChunk.writableChunkWrap(backingArray), ByteBuffer.wrap(backingArray), - targetSize); + maxValuesPerPage); } private BooleanTransfer( @NotNull final ColumnSource columnSource, + @NotNull final RowSequence tableRowSet, @NotNull final WritableByteChunk chunk, @NotNull final ByteBuffer buffer, - int targetSize) { - super(columnSource, chunk, buffer, targetSize); + int maxValuesPerPage) { + super(columnSource, tableRowSet, chunk, buffer, maxValuesPerPage); } } diff --git a/extensions/parquet/table/src/main/java/io/deephaven/parquet/table/transfer/BooleanVectorTransfer.java b/extensions/parquet/table/src/main/java/io/deephaven/parquet/table/transfer/BooleanVectorTransfer.java new file mode 100644 index 00000000000..8522b6da7d7 --- /dev/null +++ b/extensions/parquet/table/src/main/java/io/deephaven/parquet/table/transfer/BooleanVectorTransfer.java @@ -0,0 +1,39 @@ +/** + * Copyright (c) 2016-2023 Deephaven Data Labs and Patent Pending + */ +package io.deephaven.parquet.table.transfer; + +import io.deephaven.engine.primitive.iterator.CloseableIterator; +import io.deephaven.engine.rowset.RowSequence; +import io.deephaven.engine.table.ColumnSource; +import io.deephaven.util.BooleanUtils; +import io.deephaven.vector.ObjectVector; +import org.jetbrains.annotations.NotNull; + +import java.nio.ByteBuffer; + +final class BooleanVectorTransfer extends PrimitiveVectorTransfer, ByteBuffer> { + // We encode booleans as bytes here and bit pack them with 8 booleans per byte at the time of writing. + // Therefore, we need to allocate (targetPageSize * 8) bytes for the buffer. 
+ private static final int BYTES_NEEDED_PER_ENCODED_BOOLEAN_VALUE = 1; + private static final int NUM_BIT_PACKED_BOOLEANS_PER_BYTE = 8; + BooleanVectorTransfer(@NotNull final ColumnSource columnSource, @NotNull final RowSequence tableRowSet, + final int targetPageSize) { + super(columnSource, tableRowSet, targetPageSize * NUM_BIT_PACKED_BOOLEANS_PER_BYTE, + targetPageSize * NUM_BIT_PACKED_BOOLEANS_PER_BYTE, + ByteBuffer.allocate(targetPageSize * NUM_BIT_PACKED_BOOLEANS_PER_BYTE), + BYTES_NEEDED_PER_ENCODED_BOOLEAN_VALUE); + } + + @Override + void resizeBuffer(final int length) { + buffer = ByteBuffer.allocate(length); + } + + @Override + void copyToBuffer(@NotNull final EncodedData> data) { + try (final CloseableIterator dataIterator = data.encodedValues.iterator()) { + dataIterator.forEachRemaining((Boolean b) -> buffer.put(BooleanUtils.booleanAsByte(b))); + } + } +} diff --git a/extensions/parquet/table/src/main/java/io/deephaven/parquet/table/transfer/ByteArrayTransfer.java b/extensions/parquet/table/src/main/java/io/deephaven/parquet/table/transfer/ByteArrayTransfer.java new file mode 100644 index 00000000000..962c8485c71 --- /dev/null +++ b/extensions/parquet/table/src/main/java/io/deephaven/parquet/table/transfer/ByteArrayTransfer.java @@ -0,0 +1,41 @@ +/** + * Copyright (c) 2016-2023 Deephaven Data Labs and Patent Pending + */ +/* + * --------------------------------------------------------------------------------------------------------------------- + * AUTO-GENERATED CLASS - DO NOT EDIT MANUALLY - for any changes edit CharArrayTransfer and regenerate + * --------------------------------------------------------------------------------------------------------------------- + */ +package io.deephaven.parquet.table.transfer; + +import io.deephaven.engine.rowset.RowSequence; +import io.deephaven.engine.table.ColumnSource; +import org.jetbrains.annotations.NotNull; + +import java.nio.IntBuffer; + +final class ByteArrayTransfer extends PrimitiveArrayAndVectorTransfer { + ByteArrayTransfer(@NotNull final ColumnSource columnSource, @NotNull final RowSequence tableRowSet, + final int targetPageSize) { + // We encode primitive bytes as primitive ints + super(columnSource, tableRowSet, targetPageSize / Integer.BYTES, targetPageSize, + IntBuffer.allocate(targetPageSize / Integer.BYTES), Integer.BYTES); + } + + @Override + int getSize(final byte @NotNull [] data) { + return data.length; + } + + @Override + void resizeBuffer(final int length) { + buffer = IntBuffer.allocate(length); + } + + @Override + void copyToBuffer(@NotNull final EncodedData data) { + for (byte value : data.encodedValues) { + buffer.put(value); + } + } +} diff --git a/extensions/parquet/table/src/main/java/io/deephaven/parquet/table/transfer/ByteTransfer.java b/extensions/parquet/table/src/main/java/io/deephaven/parquet/table/transfer/ByteTransfer.java index b54db47668d..32a2e7cf627 100644 --- a/extensions/parquet/table/src/main/java/io/deephaven/parquet/table/transfer/ByteTransfer.java +++ b/extensions/parquet/table/src/main/java/io/deephaven/parquet/table/transfer/ByteTransfer.java @@ -10,18 +10,19 @@ import io.deephaven.chunk.ByteChunk; import io.deephaven.chunk.attributes.Values; +import io.deephaven.engine.rowset.RowSet; import io.deephaven.engine.table.ColumnSource; import org.jetbrains.annotations.NotNull; -class ByteTransfer extends IntCastablePrimitiveTransfer> { - - public ByteTransfer(@NotNull final ColumnSource columnSource, final int targetSize) { - super(columnSource, targetSize); +final class ByteTransfer 
extends IntCastablePrimitiveTransfer> { + ByteTransfer(@NotNull final ColumnSource columnSource, @NotNull final RowSet tableRowSet, final int targetSize) { + super(columnSource, tableRowSet, targetSize); } @Override public void copyAllFromChunkToBuffer() { - for (int chunkIdx = 0; chunkIdx < chunk.size(); ++chunkIdx) { + final int chunkSize = chunk.size(); + for (int chunkIdx = 0; chunkIdx < chunkSize; ++chunkIdx) { buffer.put(chunk.get(chunkIdx)); } } diff --git a/extensions/parquet/table/src/main/java/io/deephaven/parquet/table/transfer/ByteVectorTransfer.java b/extensions/parquet/table/src/main/java/io/deephaven/parquet/table/transfer/ByteVectorTransfer.java new file mode 100644 index 00000000000..e94954c370b --- /dev/null +++ b/extensions/parquet/table/src/main/java/io/deephaven/parquet/table/transfer/ByteVectorTransfer.java @@ -0,0 +1,38 @@ +/** + * Copyright (c) 2016-2023 Deephaven Data Labs and Patent Pending + */ +/* + * --------------------------------------------------------------------------------------------------------------------- + * AUTO-GENERATED CLASS - DO NOT EDIT MANUALLY - for any changes edit CharVectorTransfer and regenerate + * --------------------------------------------------------------------------------------------------------------------- + */ +package io.deephaven.parquet.table.transfer; + +import io.deephaven.engine.primitive.iterator.CloseablePrimitiveIteratorOfByte; +import io.deephaven.engine.rowset.RowSequence; +import io.deephaven.engine.table.ColumnSource; +import io.deephaven.vector.ByteVector; +import org.jetbrains.annotations.NotNull; + +import java.nio.IntBuffer; + +final class ByteVectorTransfer extends PrimitiveVectorTransfer { + ByteVectorTransfer(@NotNull final ColumnSource columnSource, @NotNull final RowSequence tableRowSet, + final int targetPageSize) { + // We encode primitive bytes as primitive ints + super(columnSource, tableRowSet, targetPageSize / Integer.BYTES, targetPageSize, + IntBuffer.allocate(targetPageSize / Integer.BYTES), Integer.BYTES); + } + + @Override + void resizeBuffer(final int length) { + buffer = IntBuffer.allocate(length); + } + + @Override + void copyToBuffer(@NotNull final EncodedData data) { + try (final CloseablePrimitiveIteratorOfByte dataIterator = data.encodedValues.iterator()) { + dataIterator.forEachRemaining((byte value) -> buffer.put(value)); + } + } +} \ No newline at end of file diff --git a/extensions/parquet/table/src/main/java/io/deephaven/parquet/table/transfer/CharArrayTransfer.java b/extensions/parquet/table/src/main/java/io/deephaven/parquet/table/transfer/CharArrayTransfer.java new file mode 100644 index 00000000000..673888eb276 --- /dev/null +++ b/extensions/parquet/table/src/main/java/io/deephaven/parquet/table/transfer/CharArrayTransfer.java @@ -0,0 +1,36 @@ +/** + * Copyright (c) 2016-2023 Deephaven Data Labs and Patent Pending + */ +package io.deephaven.parquet.table.transfer; + +import io.deephaven.engine.rowset.RowSequence; +import io.deephaven.engine.table.ColumnSource; +import org.jetbrains.annotations.NotNull; + +import java.nio.IntBuffer; + +final class CharArrayTransfer extends PrimitiveArrayAndVectorTransfer { + CharArrayTransfer(@NotNull final ColumnSource columnSource, @NotNull final RowSequence tableRowSet, + final int targetPageSize) { + // We encode primitive chars as primitive ints + super(columnSource, tableRowSet, targetPageSize / Integer.BYTES, targetPageSize, + IntBuffer.allocate(targetPageSize / Integer.BYTES), Integer.BYTES); + } + + @Override + int getSize(final char 
@NotNull [] data) { + return data.length; + } + + @Override + void resizeBuffer(final int length) { + buffer = IntBuffer.allocate(length); + } + + @Override + void copyToBuffer(@NotNull final EncodedData data) { + for (char value : data.encodedValues) { + buffer.put(value); + } + } +} diff --git a/extensions/parquet/table/src/main/java/io/deephaven/parquet/table/transfer/CharTransfer.java b/extensions/parquet/table/src/main/java/io/deephaven/parquet/table/transfer/CharTransfer.java index 7024b8ac858..d7241babb67 100644 --- a/extensions/parquet/table/src/main/java/io/deephaven/parquet/table/transfer/CharTransfer.java +++ b/extensions/parquet/table/src/main/java/io/deephaven/parquet/table/transfer/CharTransfer.java @@ -5,18 +5,19 @@ import io.deephaven.chunk.CharChunk; import io.deephaven.chunk.attributes.Values; +import io.deephaven.engine.rowset.RowSet; import io.deephaven.engine.table.ColumnSource; import org.jetbrains.annotations.NotNull; -class CharTransfer extends IntCastablePrimitiveTransfer> { - - public CharTransfer(@NotNull final ColumnSource columnSource, final int targetSize) { - super(columnSource, targetSize); +final class CharTransfer extends IntCastablePrimitiveTransfer> { + CharTransfer(@NotNull final ColumnSource columnSource, @NotNull final RowSet tableRowSet, final int targetSize) { + super(columnSource, tableRowSet, targetSize); } @Override public void copyAllFromChunkToBuffer() { - for (int chunkIdx = 0; chunkIdx < chunk.size(); ++chunkIdx) { + final int chunkSize = chunk.size(); + for (int chunkIdx = 0; chunkIdx < chunkSize; ++chunkIdx) { buffer.put(chunk.get(chunkIdx)); } } diff --git a/extensions/parquet/table/src/main/java/io/deephaven/parquet/table/transfer/CharVectorTransfer.java b/extensions/parquet/table/src/main/java/io/deephaven/parquet/table/transfer/CharVectorTransfer.java new file mode 100644 index 00000000000..805044ac101 --- /dev/null +++ b/extensions/parquet/table/src/main/java/io/deephaven/parquet/table/transfer/CharVectorTransfer.java @@ -0,0 +1,33 @@ +/** + * Copyright (c) 2016-2023 Deephaven Data Labs and Patent Pending + */ +package io.deephaven.parquet.table.transfer; + +import io.deephaven.engine.primitive.iterator.CloseablePrimitiveIteratorOfChar; +import io.deephaven.engine.rowset.RowSequence; +import io.deephaven.engine.table.ColumnSource; +import io.deephaven.vector.CharVector; +import org.jetbrains.annotations.NotNull; + +import java.nio.IntBuffer; + +final class CharVectorTransfer extends PrimitiveVectorTransfer { + CharVectorTransfer(@NotNull final ColumnSource columnSource, @NotNull final RowSequence tableRowSet, + final int targetPageSize) { + // We encode primitive chars as primitive ints + super(columnSource, tableRowSet, targetPageSize / Integer.BYTES, targetPageSize, + IntBuffer.allocate(targetPageSize / Integer.BYTES), Integer.BYTES); + } + + @Override + void resizeBuffer(final int length) { + buffer = IntBuffer.allocate(length); + } + + @Override + void copyToBuffer(@NotNull final EncodedData data) { + try (final CloseablePrimitiveIteratorOfChar dataIterator = data.encodedValues.iterator()) { + dataIterator.forEachRemaining((char value) -> buffer.put(value)); + } + } +} \ No newline at end of file diff --git a/extensions/parquet/table/src/main/java/io/deephaven/parquet/table/transfer/CodecArrayTransfer.java b/extensions/parquet/table/src/main/java/io/deephaven/parquet/table/transfer/CodecArrayTransfer.java new file mode 100644 index 00000000000..4db6742b7ac --- /dev/null +++ 
b/extensions/parquet/table/src/main/java/io/deephaven/parquet/table/transfer/CodecArrayTransfer.java @@ -0,0 +1,31 @@ +/** + * Copyright (c) 2016-2023 Deephaven Data Labs and Patent Pending + */ +package io.deephaven.parquet.table.transfer; + +import io.deephaven.engine.rowset.RowSequence; +import io.deephaven.engine.table.ColumnSource; +import io.deephaven.util.codec.ObjectCodec; +import org.apache.parquet.io.api.Binary; +import org.jetbrains.annotations.NotNull; + +/** + * Used to encode arrays of objects using a codec provided on construction. The difference between using this class and + * {@link CodecTransfer} is that this class encodes each element of the array individually whereas {@link CodecTransfer} + * will encode the entire array as a single value. + */ +final class CodecArrayTransfer extends ObjectArrayTransfer { + private final ObjectCodec codec; + + CodecArrayTransfer(final @NotNull ColumnSource columnSource, + @NotNull final ObjectCodec codec, + final @NotNull RowSequence tableRowSet, final int targetPageSize) { + super(columnSource, tableRowSet, targetPageSize); + this.codec = codec; + } + + @Override + Binary encodeToBinary(VALUE_TYPE value) { + return Binary.fromConstantByteArray(codec.encode(value)); + } +} diff --git a/extensions/parquet/table/src/main/java/io/deephaven/parquet/table/transfer/CodecTransfer.java b/extensions/parquet/table/src/main/java/io/deephaven/parquet/table/transfer/CodecTransfer.java index 4f0e120dad0..7b693eebddd 100644 --- a/extensions/parquet/table/src/main/java/io/deephaven/parquet/table/transfer/CodecTransfer.java +++ b/extensions/parquet/table/src/main/java/io/deephaven/parquet/table/transfer/CodecTransfer.java @@ -3,25 +3,24 @@ */ package io.deephaven.parquet.table.transfer; +import io.deephaven.engine.rowset.RowSequence; import io.deephaven.engine.table.ColumnSource; import io.deephaven.util.codec.ObjectCodec; import org.apache.parquet.io.api.Binary; import org.jetbrains.annotations.NotNull; -class CodecTransfer extends EncodedTransfer { - private final ObjectCodec codec; +final class CodecTransfer extends ObjectTransfer { + private final ObjectCodec codec; - public CodecTransfer( - @NotNull final ColumnSource columnSource, - @NotNull final ObjectCodec codec, - final int maxValuesPerPage, - final int targetPageSize) { - super(columnSource, maxValuesPerPage, targetPageSize); + CodecTransfer(@NotNull final ColumnSource columnSource, @NotNull final ObjectCodec codec, + @NotNull final RowSequence tableRowSet, final int targetPageSize) { + super(columnSource, tableRowSet, targetPageSize); this.codec = codec; } @Override - Binary encodeToBinary(T value) { - return Binary.fromConstantByteArray(codec.encode(value)); + void encodeDataForBuffering(@NotNull COLUMN_TYPE data, @NotNull final EncodedData encodedData) { + Binary encodedValue = Binary.fromConstantByteArray(codec.encode(data)); + encodedData.fillSingle(encodedValue, encodedValue.length()); } } diff --git a/extensions/parquet/table/src/main/java/io/deephaven/parquet/table/transfer/CodecVectorTransfer.java b/extensions/parquet/table/src/main/java/io/deephaven/parquet/table/transfer/CodecVectorTransfer.java new file mode 100644 index 00000000000..30ac3f8e3c2 --- /dev/null +++ b/extensions/parquet/table/src/main/java/io/deephaven/parquet/table/transfer/CodecVectorTransfer.java @@ -0,0 +1,31 @@ +/** + * Copyright (c) 2016-2023 Deephaven Data Labs and Patent Pending + */ +package io.deephaven.parquet.table.transfer; + +import io.deephaven.engine.rowset.RowSequence; +import 
io.deephaven.engine.table.ColumnSource; +import io.deephaven.util.codec.ObjectCodec; +import org.apache.parquet.io.api.Binary; +import org.jetbrains.annotations.NotNull; + +/** + * Used to encode vectors of objects using a codec provided on construction. The difference between using this class and + * {@link CodecTransfer} is that this class encodes each element of the vector individually whereas + * {@link CodecTransfer} will encode the entire vector as a single value. + */ +final class CodecVectorTransfer extends ObjectVectorTransfer { + private final ObjectCodec codec; + + CodecVectorTransfer(final @NotNull ColumnSource columnSource, + @NotNull final ObjectCodec codec, + final @NotNull RowSequence tableRowSet, final int targetPageSize) { + super(columnSource, tableRowSet, targetPageSize); + this.codec = codec; + } + + @Override + Binary encodeToBinary(VALUE_TYPE value) { + return Binary.fromConstantByteArray(codec.encode(value)); + } +} diff --git a/extensions/parquet/table/src/main/java/io/deephaven/parquet/table/transfer/DictEncodedStringArrayAndVectorTransfer.java b/extensions/parquet/table/src/main/java/io/deephaven/parquet/table/transfer/DictEncodedStringArrayAndVectorTransfer.java new file mode 100644 index 00000000000..d73dcdc4608 --- /dev/null +++ b/extensions/parquet/table/src/main/java/io/deephaven/parquet/table/transfer/DictEncodedStringArrayAndVectorTransfer.java @@ -0,0 +1,82 @@ +/** + * Copyright (c) 2016-2023 Deephaven Data Labs and Patent Pending + */ +package io.deephaven.parquet.table.transfer; + +import io.deephaven.engine.rowset.RowSequence; +import io.deephaven.engine.table.ColumnSource; +import org.jetbrains.annotations.NotNull; + +import java.nio.IntBuffer; +import java.util.function.Supplier; + +/** + * Base class for reading dictionary-encoded string arrays and vectors. This class updates the {@link StringDictionary} + * with all the strings it encounters and generates an integer array of dictionary position values. This class extends + * {@link PrimitiveArrayAndVectorTransfer} to manage the dictionary positions similar to an integer array column. + */ +abstract public class DictEncodedStringArrayAndVectorTransfer + extends PrimitiveArrayAndVectorTransfer { + private final StringDictionary dictionary; + + private boolean pageHasNull; + private int[] dictEncodedValues; + private int numDictEncodedValues; + + DictEncodedStringArrayAndVectorTransfer(@NotNull ColumnSource columnSource, @NotNull RowSequence tableRowSet, + int targetPageSize, @NotNull StringDictionary dictionary) { + super(columnSource, tableRowSet, targetPageSize / Integer.BYTES, targetPageSize, + IntBuffer.allocate(targetPageSize / Integer.BYTES), Integer.BYTES); + this.dictionary = dictionary; + + this.pageHasNull = false; + this.dictEncodedValues = new int[targetPageSize]; + this.numDictEncodedValues = 0; + } + + @Override + public final int transferOnePageToBuffer() { + // Reset state before transferring each page + pageHasNull = false; + return super.transferOnePageToBuffer(); + } + + /** + * Helper method which takes a string supplier (from the array/vector transfer child classes) and number of strings, + * fetches that many strings from the supplier, adds them to the dictionary and populates an IntBuffer with + * dictionary position values. 
+ */ + final void encodeDataForBufferingHelper(@NotNull Supplier strSupplier, final int numStrings, + @NotNull final EncodedData encodedData) { + numDictEncodedValues = 0; + if (numStrings > dictEncodedValues.length) { + dictEncodedValues = new int[numStrings]; + } + int numBytesEncoded = 0; + for (int i = 0; i < numStrings; i++) { + String value = strSupplier.get(); + if (value == null) { + pageHasNull = true; + } else { + numBytesEncoded += Integer.BYTES; + } + int posInDictionary = dictionary.add(value); + dictEncodedValues[numDictEncodedValues++] = posInDictionary; + } + encodedData.fillRepeated(dictEncodedValues, numBytesEncoded, numDictEncodedValues); + } + + @Override + final void resizeBuffer(final int length) { + buffer = IntBuffer.allocate(length); + } + + @Override + final void copyToBuffer(@NotNull final EncodedData data) { + buffer.put(data.encodedValues, 0, data.numValues); + } + + public final boolean pageHasNull() { + return pageHasNull; + } +} diff --git a/extensions/parquet/table/src/main/java/io/deephaven/parquet/table/transfer/DictEncodedStringArrayTransfer.java b/extensions/parquet/table/src/main/java/io/deephaven/parquet/table/transfer/DictEncodedStringArrayTransfer.java new file mode 100644 index 00000000000..761cb27fa14 --- /dev/null +++ b/extensions/parquet/table/src/main/java/io/deephaven/parquet/table/transfer/DictEncodedStringArrayTransfer.java @@ -0,0 +1,24 @@ +/** + * Copyright (c) 2016-2023 Deephaven Data Labs and Patent Pending + */ +package io.deephaven.parquet.table.transfer; + +import io.deephaven.engine.rowset.RowSequence; +import io.deephaven.engine.table.ColumnSource; +import org.jetbrains.annotations.NotNull; + +final class DictEncodedStringArrayTransfer extends DictEncodedStringArrayAndVectorTransfer { + private final ArrayDataSupplier supplier; + + DictEncodedStringArrayTransfer(@NotNull ColumnSource columnSource, @NotNull RowSequence tableRowSet, + int targetPageSize, StringDictionary dictionary) { + super(columnSource, tableRowSet, targetPageSize, dictionary); + supplier = new ArrayDataSupplier<>(); + } + + @Override + void encodeDataForBuffering(@NotNull final String @NotNull [] data, @NotNull final EncodedData encodedData) { + supplier.fill(data); + encodeDataForBufferingHelper(supplier, data.length, encodedData); + } +} diff --git a/extensions/parquet/table/src/main/java/io/deephaven/parquet/table/transfer/DictEncodedStringTransfer.java b/extensions/parquet/table/src/main/java/io/deephaven/parquet/table/transfer/DictEncodedStringTransfer.java new file mode 100644 index 00000000000..fee1b6e0193 --- /dev/null +++ b/extensions/parquet/table/src/main/java/io/deephaven/parquet/table/transfer/DictEncodedStringTransfer.java @@ -0,0 +1,52 @@ +/** + * Copyright (c) 2016-2023 Deephaven Data Labs and Patent Pending + */ +package io.deephaven.parquet.table.transfer; + +import io.deephaven.chunk.ObjectChunk; +import io.deephaven.chunk.attributes.Values; +import io.deephaven.engine.rowset.RowSequence; +import io.deephaven.engine.table.ColumnSource; +import org.jetbrains.annotations.NotNull; + +/** + * Transfer object for dictionary encoded string columns. This class updates the {@link StringDictionary} with all the + * strings it encounters and generates an IntBuffer of dictionary position values. The class extends from + * {@link IntCastablePrimitiveTransfer} to manage the dictionary positions similar to an Int column. 
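// Editorial sketch, not part of this patch: how a page of values plays out under this dictionary encoding,
// assuming a dictionary that hands out positions in insertion order and reserves some slot for null.
//   Column values for the page:   "AAPL", null, "MSFT", "AAPL"
//   Dictionary after the page:    "AAPL" -> 0, "MSFT" -> 1 (plus whatever position the dictionary assigns null)
//   IntBuffer written for page:   0, <null's position>, 1, 0, and pageHasNull() reports true
// The page payload stays integer-sized no matter how long the strings are; each distinct string is stored
// once, in the shared dictionary.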
+ */ + +final class DictEncodedStringTransfer extends IntCastablePrimitiveTransfer> { + private final StringDictionary dictionary; + private boolean pageHasNull; + + DictEncodedStringTransfer(@NotNull ColumnSource columnSource, @NotNull RowSequence tableRowSet, + int targetPageSize, StringDictionary dictionary) { + super(columnSource, tableRowSet, targetPageSize); + this.dictionary = dictionary; + this.pageHasNull = false; + } + + @Override + public int transferOnePageToBuffer() { + // Reset state before transferring each page + pageHasNull = false; + return super.transferOnePageToBuffer(); + } + + @Override + public void copyAllFromChunkToBuffer() { + int chunkSize = chunk.size(); + for (int i = 0; i < chunkSize; i++) { + String value = chunk.get(i); + if (value == null) { + pageHasNull = true; + } + int posInDictionary = dictionary.add(value); + buffer.put(posInDictionary); + } + } + + public boolean pageHasNull() { + return pageHasNull; + } +} diff --git a/extensions/parquet/table/src/main/java/io/deephaven/parquet/table/transfer/DictEncodedStringVectorTransfer.java b/extensions/parquet/table/src/main/java/io/deephaven/parquet/table/transfer/DictEncodedStringVectorTransfer.java new file mode 100644 index 00000000000..074dc61b301 --- /dev/null +++ b/extensions/parquet/table/src/main/java/io/deephaven/parquet/table/transfer/DictEncodedStringVectorTransfer.java @@ -0,0 +1,27 @@ +/** + * Copyright (c) 2016-2023 Deephaven Data Labs and Patent Pending + */ +package io.deephaven.parquet.table.transfer; + +import io.deephaven.engine.primitive.iterator.CloseableIterator; +import io.deephaven.engine.rowset.RowSequence; +import io.deephaven.engine.table.ColumnSource; +import io.deephaven.vector.ObjectVector; +import org.jetbrains.annotations.NotNull; + +import java.util.function.Supplier; + +final class DictEncodedStringVectorTransfer extends DictEncodedStringArrayAndVectorTransfer> { + DictEncodedStringVectorTransfer(@NotNull ColumnSource columnSource, @NotNull RowSequence tableRowSet, + int targetPageSize, StringDictionary dictionary) { + super(columnSource, tableRowSet, targetPageSize, dictionary); + } + + @Override + void encodeDataForBuffering(@NotNull ObjectVector data, @NotNull final EncodedData encodedData) { + try (CloseableIterator iter = data.iterator()) { + Supplier supplier = iter::next; + encodeDataForBufferingHelper(supplier, data.intSize(), encodedData); + } + } +} diff --git a/extensions/parquet/table/src/main/java/io/deephaven/parquet/table/transfer/DoubleArrayTransfer.java b/extensions/parquet/table/src/main/java/io/deephaven/parquet/table/transfer/DoubleArrayTransfer.java new file mode 100644 index 00000000000..904c8d4cd01 --- /dev/null +++ b/extensions/parquet/table/src/main/java/io/deephaven/parquet/table/transfer/DoubleArrayTransfer.java @@ -0,0 +1,38 @@ +/** + * Copyright (c) 2016-2023 Deephaven Data Labs and Patent Pending + */ +/* + * --------------------------------------------------------------------------------------------------------------------- + * AUTO-GENERATED CLASS - DO NOT EDIT MANUALLY - for any changes edit IntArrayTransfer and regenerate + * --------------------------------------------------------------------------------------------------------------------- + */ +package io.deephaven.parquet.table.transfer; + +import io.deephaven.engine.rowset.RowSequence; +import io.deephaven.engine.table.ColumnSource; +import org.jetbrains.annotations.NotNull; + +import java.nio.DoubleBuffer; + +final class DoubleArrayTransfer extends PrimitiveArrayAndVectorTransfer { + 
DoubleArrayTransfer(@NotNull final ColumnSource columnSource, @NotNull final RowSequence tableRowSet, + final int targetPageSize) { + super(columnSource, tableRowSet, targetPageSize / Double.BYTES, targetPageSize, + DoubleBuffer.allocate(targetPageSize / Double.BYTES), Double.BYTES); + } + + @Override + int getSize(final double @NotNull [] data) { + return data.length; + } + + @Override + void resizeBuffer(final int length) { + buffer = DoubleBuffer.allocate(length); + } + + @Override + void copyToBuffer(@NotNull final EncodedData data) { + buffer.put(data.encodedValues); + } +} diff --git a/extensions/parquet/table/src/main/java/io/deephaven/parquet/table/transfer/DoubleTransfer.java b/extensions/parquet/table/src/main/java/io/deephaven/parquet/table/transfer/DoubleTransfer.java index 52a8c7d0103..9165d562423 100644 --- a/extensions/parquet/table/src/main/java/io/deephaven/parquet/table/transfer/DoubleTransfer.java +++ b/extensions/parquet/table/src/main/java/io/deephaven/parquet/table/transfer/DoubleTransfer.java @@ -10,27 +10,32 @@ import io.deephaven.chunk.WritableDoubleChunk; import io.deephaven.chunk.attributes.Values; +import io.deephaven.engine.rowset.RowSequence; +import io.deephaven.engine.rowset.RowSet; import io.deephaven.engine.table.ColumnSource; import org.jetbrains.annotations.NotNull; import java.nio.DoubleBuffer; -class DoubleTransfer extends PrimitiveTransfer, DoubleBuffer> { - - public static DoubleTransfer create(@NotNull final ColumnSource columnSource, final int targetSize) { - final double[] backingArray = new double[targetSize]; +final class DoubleTransfer extends PrimitiveTransfer, DoubleBuffer> { + static DoubleTransfer create(@NotNull final ColumnSource columnSource, @NotNull final RowSet tableRowSet, + final int targetPageSize) { + final int maxValuesPerPage = Math.toIntExact(Math.min(tableRowSet.size(), targetPageSize / Double.BYTES)); + final double[] backingArray = new double[maxValuesPerPage]; return new DoubleTransfer( columnSource, + tableRowSet, WritableDoubleChunk.writableChunkWrap(backingArray), DoubleBuffer.wrap(backingArray), - targetSize); + maxValuesPerPage); } private DoubleTransfer( @NotNull final ColumnSource columnSource, + @NotNull final RowSequence tableRowSet, @NotNull final WritableDoubleChunk chunk, @NotNull final DoubleBuffer buffer, - final int targetSize) { - super(columnSource, chunk, buffer, targetSize); + final int maxValuesPerPage) { + super(columnSource, tableRowSet, chunk, buffer, maxValuesPerPage); } } diff --git a/extensions/parquet/table/src/main/java/io/deephaven/parquet/table/transfer/DoubleVectorTransfer.java b/extensions/parquet/table/src/main/java/io/deephaven/parquet/table/transfer/DoubleVectorTransfer.java new file mode 100644 index 00000000000..178ecb4a1ce --- /dev/null +++ b/extensions/parquet/table/src/main/java/io/deephaven/parquet/table/transfer/DoubleVectorTransfer.java @@ -0,0 +1,37 @@ +/** + * Copyright (c) 2016-2023 Deephaven Data Labs and Patent Pending + */ +/* + * --------------------------------------------------------------------------------------------------------------------- + * AUTO-GENERATED CLASS - DO NOT EDIT MANUALLY - for any changes edit IntVectorTransfer and regenerate + * --------------------------------------------------------------------------------------------------------------------- + */ +package io.deephaven.parquet.table.transfer; + +import io.deephaven.engine.primitive.iterator.CloseablePrimitiveIteratorOfDouble; +import io.deephaven.engine.rowset.RowSequence; +import 
io.deephaven.engine.table.ColumnSource; +import io.deephaven.vector.DoubleVector; +import org.jetbrains.annotations.NotNull; + +import java.nio.DoubleBuffer; + +final class DoubleVectorTransfer extends PrimitiveVectorTransfer { + DoubleVectorTransfer(@NotNull final ColumnSource columnSource, @NotNull final RowSequence tableRowSet, + final int targetPageSize) { + super(columnSource, tableRowSet, targetPageSize / Double.BYTES, targetPageSize, + DoubleBuffer.allocate(targetPageSize / Double.BYTES), Double.BYTES); + } + + @Override + void resizeBuffer(final int length) { + buffer = DoubleBuffer.allocate(length); + } + + @Override + void copyToBuffer(@NotNull final EncodedData data) { + try (final CloseablePrimitiveIteratorOfDouble dataIterator = data.encodedValues.iterator()) { + dataIterator.forEachRemaining((double value) -> buffer.put(value)); + } + } +} \ No newline at end of file diff --git a/extensions/parquet/table/src/main/java/io/deephaven/parquet/table/transfer/EncodedTransfer.java b/extensions/parquet/table/src/main/java/io/deephaven/parquet/table/transfer/EncodedTransfer.java deleted file mode 100644 index a9f2ede7a68..00000000000 --- a/extensions/parquet/table/src/main/java/io/deephaven/parquet/table/transfer/EncodedTransfer.java +++ /dev/null @@ -1,143 +0,0 @@ -/** - * Copyright (c) 2016-2023 Deephaven Data Labs and Patent Pending - */ -package io.deephaven.parquet.table.transfer; - -import io.deephaven.base.verify.Assert; -import io.deephaven.chunk.ObjectChunk; -import io.deephaven.chunk.attributes.Values; -import io.deephaven.engine.rowset.RowSequence; -import io.deephaven.engine.table.ChunkSource; -import io.deephaven.engine.table.ColumnSource; -import org.apache.parquet.io.api.Binary; -import org.jetbrains.annotations.NotNull; -import org.jetbrains.annotations.Nullable; - -import java.util.Arrays; - -/** - * Used as a base class of transfer objects for types like strings or big integers that need specialized encoding, and - * thus we need to enforce page size limits while writing. - */ -abstract class EncodedTransfer implements TransferObject { - private final ColumnSource columnSource; - private final ChunkSource.GetContext context; - private final Binary[] buffer; - - private ObjectChunk chunk; - - /** - * Number of objects buffered - */ - private int bufferedDataCount; - - /** - * The target size of data to be stored in a single page. This is not a strictly enforced "maximum" page size. - */ - private final int targetPageSize; - - /** - * Index of next object from the chunk to be buffered - */ - private int currentChunkIdx; - - /** - * Encoded value which takes us beyond the page size limit. We cache it to avoid re-encoding. - */ - @Nullable - private Binary cachedEncodedValue; - - public EncodedTransfer( - @NotNull final ColumnSource columnSource, - final int maxValuesPerPage, - final int targetPageSize) { - this.columnSource = columnSource; - this.buffer = new Binary[maxValuesPerPage]; - context = this.columnSource.makeGetContext(maxValuesPerPage); - this.targetPageSize = targetPageSize; - bufferedDataCount = 0; - cachedEncodedValue = null; - } - - @Override - final public void fetchData(@NotNull final RowSequence rs) { - // noinspection unchecked - chunk = (ObjectChunk) columnSource.getChunk(context, rs); - currentChunkIdx = 0; - bufferedDataCount = 0; - } - - @Override - final public int transferAllToBuffer() { - // Assuming this method is called after fetchData() and that the buffer is empty. 
- Assert.neqNull(chunk, "chunk"); - Assert.eqZero(currentChunkIdx, "currentChunkIdx"); - Assert.eqZero(bufferedDataCount, "bufferedDataCount"); - int chunkSize = chunk.size(); - while (currentChunkIdx < chunkSize) { - final T value = chunk.get(currentChunkIdx++); - buffer[bufferedDataCount++] = value == null ? null : encodeToBinary(value); - } - chunk = null; - return bufferedDataCount; - } - - @Override - final public int transferOnePageToBuffer() { - if (!hasMoreDataToBuffer()) { - return 0; - } - if (bufferedDataCount != 0) { - // Clear any old buffered data - Arrays.fill(buffer, 0, bufferedDataCount, null); - bufferedDataCount = 0; - } - int bufferedDataSize = 0; - int chunkSize = chunk.size(); - while (currentChunkIdx < chunkSize) { - final T value = chunk.get(currentChunkIdx); - if (value == null) { - currentChunkIdx++; - buffer[bufferedDataCount++] = null; - continue; - } - Binary binaryEncodedValue; - if (cachedEncodedValue == null) { - binaryEncodedValue = encodeToBinary(value); - } else { - binaryEncodedValue = cachedEncodedValue; - cachedEncodedValue = null; - } - - // Always buffer the first element, even if it exceeds the target page size. - if (bufferedDataSize != 0 && bufferedDataSize + binaryEncodedValue.length() > targetPageSize) { - cachedEncodedValue = binaryEncodedValue; - break; - } - currentChunkIdx++; - buffer[bufferedDataCount++] = binaryEncodedValue; - bufferedDataSize += binaryEncodedValue.length(); - } - if (currentChunkIdx == chunk.size()) { - chunk = null; - } - return bufferedDataCount; - } - - abstract Binary encodeToBinary(T value); - - @Override - final public boolean hasMoreDataToBuffer() { - return ((chunk != null) && (currentChunkIdx < chunk.size())); - } - - @Override - final public Binary[] getBuffer() { - return buffer; - } - - @Override - final public void close() { - context.close(); - } -} diff --git a/extensions/parquet/table/src/main/java/io/deephaven/parquet/table/transfer/FloatArrayTransfer.java b/extensions/parquet/table/src/main/java/io/deephaven/parquet/table/transfer/FloatArrayTransfer.java new file mode 100644 index 00000000000..ffc23ff6921 --- /dev/null +++ b/extensions/parquet/table/src/main/java/io/deephaven/parquet/table/transfer/FloatArrayTransfer.java @@ -0,0 +1,38 @@ +/** + * Copyright (c) 2016-2023 Deephaven Data Labs and Patent Pending + */ +/* + * --------------------------------------------------------------------------------------------------------------------- + * AUTO-GENERATED CLASS - DO NOT EDIT MANUALLY - for any changes edit IntArrayTransfer and regenerate + * --------------------------------------------------------------------------------------------------------------------- + */ +package io.deephaven.parquet.table.transfer; + +import io.deephaven.engine.rowset.RowSequence; +import io.deephaven.engine.table.ColumnSource; +import org.jetbrains.annotations.NotNull; + +import java.nio.FloatBuffer; + +final class FloatArrayTransfer extends PrimitiveArrayAndVectorTransfer { + FloatArrayTransfer(@NotNull final ColumnSource columnSource, @NotNull final RowSequence tableRowSet, + final int targetPageSize) { + super(columnSource, tableRowSet, targetPageSize / Float.BYTES, targetPageSize, + FloatBuffer.allocate(targetPageSize / Float.BYTES), Float.BYTES); + } + + @Override + int getSize(final float @NotNull [] data) { + return data.length; + } + + @Override + void resizeBuffer(final int length) { + buffer = FloatBuffer.allocate(length); + } + + @Override + void copyToBuffer(@NotNull final EncodedData data) { + 
buffer.put(data.encodedValues); + } +} diff --git a/extensions/parquet/table/src/main/java/io/deephaven/parquet/table/transfer/FloatTransfer.java b/extensions/parquet/table/src/main/java/io/deephaven/parquet/table/transfer/FloatTransfer.java index 3bfb9dfbcd7..d0492ae88ef 100644 --- a/extensions/parquet/table/src/main/java/io/deephaven/parquet/table/transfer/FloatTransfer.java +++ b/extensions/parquet/table/src/main/java/io/deephaven/parquet/table/transfer/FloatTransfer.java @@ -10,27 +10,32 @@ import io.deephaven.chunk.WritableFloatChunk; import io.deephaven.chunk.attributes.Values; +import io.deephaven.engine.rowset.RowSequence; +import io.deephaven.engine.rowset.RowSet; import io.deephaven.engine.table.ColumnSource; import org.jetbrains.annotations.NotNull; import java.nio.FloatBuffer; -class FloatTransfer extends PrimitiveTransfer, FloatBuffer> { - - public static FloatTransfer create(@NotNull final ColumnSource columnSource, final int targetSize) { - final float[] backingArray = new float[targetSize]; +final class FloatTransfer extends PrimitiveTransfer, FloatBuffer> { + static FloatTransfer create(@NotNull final ColumnSource columnSource, @NotNull final RowSet tableRowSet, + final int targetPageSize) { + final int maxValuesPerPage = Math.toIntExact(Math.min(tableRowSet.size(), targetPageSize / Float.BYTES)); + final float[] backingArray = new float[maxValuesPerPage]; return new FloatTransfer( columnSource, + tableRowSet, WritableFloatChunk.writableChunkWrap(backingArray), FloatBuffer.wrap(backingArray), - targetSize); + maxValuesPerPage); } private FloatTransfer( @NotNull final ColumnSource columnSource, + @NotNull final RowSequence tableRowSet, @NotNull final WritableFloatChunk chunk, @NotNull final FloatBuffer buffer, - final int targetSize) { - super(columnSource, chunk, buffer, targetSize); + final int maxValuesPerPage) { + super(columnSource, tableRowSet, chunk, buffer, maxValuesPerPage); } } diff --git a/extensions/parquet/table/src/main/java/io/deephaven/parquet/table/transfer/FloatVectorTransfer.java b/extensions/parquet/table/src/main/java/io/deephaven/parquet/table/transfer/FloatVectorTransfer.java new file mode 100644 index 00000000000..372e1614130 --- /dev/null +++ b/extensions/parquet/table/src/main/java/io/deephaven/parquet/table/transfer/FloatVectorTransfer.java @@ -0,0 +1,37 @@ +/** + * Copyright (c) 2016-2023 Deephaven Data Labs and Patent Pending + */ +/* + * --------------------------------------------------------------------------------------------------------------------- + * AUTO-GENERATED CLASS - DO NOT EDIT MANUALLY - for any changes edit IntVectorTransfer and regenerate + * --------------------------------------------------------------------------------------------------------------------- + */ +package io.deephaven.parquet.table.transfer; + +import io.deephaven.engine.primitive.iterator.CloseablePrimitiveIteratorOfFloat; +import io.deephaven.engine.rowset.RowSequence; +import io.deephaven.engine.table.ColumnSource; +import io.deephaven.vector.FloatVector; +import org.jetbrains.annotations.NotNull; + +import java.nio.FloatBuffer; + +final class FloatVectorTransfer extends PrimitiveVectorTransfer { + FloatVectorTransfer(@NotNull final ColumnSource columnSource, @NotNull final RowSequence tableRowSet, + final int targetPageSize) { + super(columnSource, tableRowSet, targetPageSize / Float.BYTES, targetPageSize, + FloatBuffer.allocate(targetPageSize / Float.BYTES), Float.BYTES); + } + + @Override + void resizeBuffer(final int length) { + buffer = 
FloatBuffer.allocate(length); + } + + @Override + void copyToBuffer(@NotNull final EncodedData data) { + try (final CloseablePrimitiveIteratorOfFloat dataIterator = data.encodedValues.iterator()) { + dataIterator.forEachRemaining((float value) -> buffer.put(value)); + } + } +} \ No newline at end of file diff --git a/extensions/parquet/table/src/main/java/io/deephaven/parquet/table/transfer/InstantArrayTransfer.java b/extensions/parquet/table/src/main/java/io/deephaven/parquet/table/transfer/InstantArrayTransfer.java new file mode 100644 index 00000000000..514bfd1f691 --- /dev/null +++ b/extensions/parquet/table/src/main/java/io/deephaven/parquet/table/transfer/InstantArrayTransfer.java @@ -0,0 +1,38 @@ +/** + * Copyright (c) 2016-2023 Deephaven Data Labs and Patent Pending + */ +package io.deephaven.parquet.table.transfer; + +import io.deephaven.engine.rowset.RowSequence; +import io.deephaven.engine.table.ColumnSource; +import io.deephaven.time.DateTimeUtils; +import org.jetbrains.annotations.NotNull; + +import java.nio.LongBuffer; +import java.time.Instant; + +final class InstantArrayTransfer extends PrimitiveArrayAndVectorTransfer { + // We encode Instants as primitive longs + InstantArrayTransfer(@NotNull final ColumnSource columnSource, @NotNull final RowSequence tableRowSet, + final int targetPageSize) { + super(columnSource, tableRowSet, targetPageSize / Long.BYTES, targetPageSize, + LongBuffer.allocate(targetPageSize / Long.BYTES), Long.BYTES); + } + + @Override + int getSize(final Instant @NotNull [] data) { + return data.length; + } + + @Override + void resizeBuffer(final int length) { + buffer = LongBuffer.allocate(length); + } + + @Override + void copyToBuffer(final @NotNull EncodedData data) { + for (Instant t : data.encodedValues) { + buffer.put(DateTimeUtils.epochNanos(t)); + } + } +} diff --git a/extensions/parquet/table/src/main/java/io/deephaven/parquet/table/transfer/InstantVectorTransfer.java b/extensions/parquet/table/src/main/java/io/deephaven/parquet/table/transfer/InstantVectorTransfer.java new file mode 100644 index 00000000000..f6d8ba5f600 --- /dev/null +++ b/extensions/parquet/table/src/main/java/io/deephaven/parquet/table/transfer/InstantVectorTransfer.java @@ -0,0 +1,35 @@ +/** + * Copyright (c) 2016-2023 Deephaven Data Labs and Patent Pending + */ +package io.deephaven.parquet.table.transfer; + +import io.deephaven.engine.primitive.iterator.CloseableIterator; +import io.deephaven.engine.rowset.RowSequence; +import io.deephaven.engine.table.ColumnSource; +import io.deephaven.time.DateTimeUtils; +import io.deephaven.vector.ObjectVector; +import org.jetbrains.annotations.NotNull; + +import java.nio.LongBuffer; +import java.time.Instant; + +final class InstantVectorTransfer extends PrimitiveVectorTransfer, LongBuffer> { + // We encode Instants as primitive longs + InstantVectorTransfer(@NotNull final ColumnSource columnSource, @NotNull final RowSequence tableRowSet, + final int targetPageSize) { + super(columnSource, tableRowSet, targetPageSize / Long.BYTES, targetPageSize, + LongBuffer.allocate(targetPageSize / Long.BYTES), Long.BYTES); + } + + @Override + void resizeBuffer(final int length) { + buffer = LongBuffer.allocate(length); + } + + @Override + void copyToBuffer(@NotNull final EncodedData> data) { + try (final CloseableIterator dataIterator = data.encodedValues.iterator()) { + dataIterator.forEachRemaining((Instant t) -> buffer.put(DateTimeUtils.epochNanos(t))); + } + } +} diff --git 
a/extensions/parquet/table/src/main/java/io/deephaven/parquet/table/transfer/IntArrayTransfer.java b/extensions/parquet/table/src/main/java/io/deephaven/parquet/table/transfer/IntArrayTransfer.java new file mode 100644 index 00000000000..dca832e1f6e --- /dev/null +++ b/extensions/parquet/table/src/main/java/io/deephaven/parquet/table/transfer/IntArrayTransfer.java @@ -0,0 +1,33 @@ +/** + * Copyright (c) 2016-2023 Deephaven Data Labs and Patent Pending + */ +package io.deephaven.parquet.table.transfer; + +import io.deephaven.engine.rowset.RowSequence; +import io.deephaven.engine.table.ColumnSource; +import org.jetbrains.annotations.NotNull; + +import java.nio.IntBuffer; + +final class IntArrayTransfer extends PrimitiveArrayAndVectorTransfer { + IntArrayTransfer(@NotNull final ColumnSource columnSource, @NotNull final RowSequence tableRowSet, + final int targetPageSize) { + super(columnSource, tableRowSet, targetPageSize / Integer.BYTES, targetPageSize, + IntBuffer.allocate(targetPageSize / Integer.BYTES), Integer.BYTES); + } + + @Override + int getSize(final int @NotNull [] data) { + return data.length; + } + + @Override + void resizeBuffer(final int length) { + buffer = IntBuffer.allocate(length); + } + + @Override + void copyToBuffer(@NotNull final EncodedData data) { + buffer.put(data.encodedValues); + } +} diff --git a/extensions/parquet/table/src/main/java/io/deephaven/parquet/table/transfer/IntCastablePrimitiveTransfer.java b/extensions/parquet/table/src/main/java/io/deephaven/parquet/table/transfer/IntCastablePrimitiveTransfer.java index 2950a7508b7..2306cda06e2 100644 --- a/extensions/parquet/table/src/main/java/io/deephaven/parquet/table/transfer/IntCastablePrimitiveTransfer.java +++ b/extensions/parquet/table/src/main/java/io/deephaven/parquet/table/transfer/IntCastablePrimitiveTransfer.java @@ -3,6 +3,7 @@ */ package io.deephaven.parquet.table.transfer; +import io.deephaven.base.verify.Assert; import io.deephaven.chunk.ChunkBase; import io.deephaven.chunk.attributes.Values; import io.deephaven.engine.rowset.RowSequence; @@ -20,33 +21,30 @@ abstract class IntCastablePrimitiveTransfer> impleme protected T chunk; protected final IntBuffer buffer; private final ColumnSource columnSource; + private final RowSequence.Iterator tableRowSetIt; private final ChunkSource.GetContext context; + private final int maxValuesPerPage; - IntCastablePrimitiveTransfer(ColumnSource columnSource, int targetSize) { + IntCastablePrimitiveTransfer(@NotNull final ColumnSource columnSource, @NotNull final RowSequence tableRowSet, + final int targetPageSize) { this.columnSource = columnSource; - this.buffer = IntBuffer.allocate(targetSize); - context = columnSource.makeGetContext(targetSize); + this.tableRowSetIt = tableRowSet.getRowSequenceIterator(); + this.maxValuesPerPage = Math.toIntExact(Math.min(tableRowSet.size(), targetPageSize / Integer.BYTES)); + Assert.gtZero(maxValuesPerPage, "maxValuesPerPage"); + this.buffer = IntBuffer.allocate(maxValuesPerPage); + context = columnSource.makeGetContext(maxValuesPerPage); } @Override - final public void fetchData(@NotNull final RowSequence rs) { - // noinspection unchecked - chunk = (T) columnSource.getChunk(context, rs); - } - - @Override - final public int transferAllToBuffer() { - return transferOnePageToBuffer(); - } - - @Override - final public int transferOnePageToBuffer() { + public int transferOnePageToBuffer() { if (!hasMoreDataToBuffer()) { return 0; } buffer.clear(); - // Assuming that all the fetched data will fit in one page. 
This is because page count is accurately - // calculated for non variable-width types. Check ParquetTableWriter.getTargetRowsPerPage for more details. + // Fetch one page worth of data from the column source + final RowSequence rs = tableRowSetIt.getNextRowSequenceWithLength((long) maxValuesPerPage); + // noinspection unchecked + chunk = (T) columnSource.getChunk(context, rs); copyAllFromChunkToBuffer(); buffer.flip(); int ret = chunk.size(); @@ -61,17 +59,18 @@ final public int transferOnePageToBuffer() { abstract void copyAllFromChunkToBuffer(); @Override - final public boolean hasMoreDataToBuffer() { - return (chunk != null); + public final boolean hasMoreDataToBuffer() { + return tableRowSetIt.hasMore(); } @Override - final public IntBuffer getBuffer() { + public final IntBuffer getBuffer() { return buffer; } @Override - final public void close() { + public final void close() { context.close(); + tableRowSetIt.close(); } } diff --git a/extensions/parquet/table/src/main/java/io/deephaven/parquet/table/transfer/IntTransfer.java b/extensions/parquet/table/src/main/java/io/deephaven/parquet/table/transfer/IntTransfer.java index cee2b4473f7..db43cd19bb0 100644 --- a/extensions/parquet/table/src/main/java/io/deephaven/parquet/table/transfer/IntTransfer.java +++ b/extensions/parquet/table/src/main/java/io/deephaven/parquet/table/transfer/IntTransfer.java @@ -5,27 +5,32 @@ import io.deephaven.chunk.WritableIntChunk; import io.deephaven.chunk.attributes.Values; +import io.deephaven.engine.rowset.RowSequence; +import io.deephaven.engine.rowset.RowSet; import io.deephaven.engine.table.ColumnSource; import org.jetbrains.annotations.NotNull; import java.nio.IntBuffer; -class IntTransfer extends PrimitiveTransfer, IntBuffer> { - - public static IntTransfer create(@NotNull final ColumnSource columnSource, final int targetSize) { - final int[] backingArray = new int[targetSize]; +final class IntTransfer extends PrimitiveTransfer, IntBuffer> { + static IntTransfer create(@NotNull final ColumnSource columnSource, @NotNull final RowSet tableRowSet, + final int targetPageSize) { + final int maxValuesPerPage = Math.toIntExact(Math.min(tableRowSet.size(), targetPageSize / Integer.BYTES)); + final int[] backingArray = new int[maxValuesPerPage]; return new IntTransfer( columnSource, + tableRowSet, WritableIntChunk.writableChunkWrap(backingArray), IntBuffer.wrap(backingArray), - targetSize); + maxValuesPerPage); } private IntTransfer( @NotNull final ColumnSource columnSource, + @NotNull final RowSequence tableRowSet, @NotNull final WritableIntChunk chunk, @NotNull final IntBuffer buffer, - final int targetSize) { - super(columnSource, chunk, buffer, targetSize); + final int maxValuesPerPage) { + super(columnSource, tableRowSet, chunk, buffer, maxValuesPerPage); } } diff --git a/extensions/parquet/table/src/main/java/io/deephaven/parquet/table/transfer/IntVectorTransfer.java b/extensions/parquet/table/src/main/java/io/deephaven/parquet/table/transfer/IntVectorTransfer.java new file mode 100644 index 00000000000..5cfb718f662 --- /dev/null +++ b/extensions/parquet/table/src/main/java/io/deephaven/parquet/table/transfer/IntVectorTransfer.java @@ -0,0 +1,32 @@ +/** + * Copyright (c) 2016-2023 Deephaven Data Labs and Patent Pending + */ +package io.deephaven.parquet.table.transfer; + +import io.deephaven.engine.primitive.iterator.CloseablePrimitiveIteratorOfInt; +import io.deephaven.engine.rowset.RowSequence; +import io.deephaven.engine.table.ColumnSource; +import io.deephaven.vector.IntVector; +import 
org.jetbrains.annotations.NotNull; + +import java.nio.IntBuffer; + +final class IntVectorTransfer extends PrimitiveVectorTransfer { + IntVectorTransfer(@NotNull final ColumnSource columnSource, @NotNull final RowSequence tableRowSet, + final int targetPageSize) { + super(columnSource, tableRowSet, targetPageSize / Integer.BYTES, targetPageSize, + IntBuffer.allocate(targetPageSize / Integer.BYTES), Integer.BYTES); + } + + @Override + void resizeBuffer(final int length) { + buffer = IntBuffer.allocate(length); + } + + @Override + void copyToBuffer(@NotNull final EncodedData data) { + try (final CloseablePrimitiveIteratorOfInt dataIterator = data.encodedValues.iterator()) { + dataIterator.forEachRemaining((int value) -> buffer.put(value)); + } + } +} \ No newline at end of file diff --git a/extensions/parquet/table/src/main/java/io/deephaven/parquet/table/transfer/LongArrayTransfer.java b/extensions/parquet/table/src/main/java/io/deephaven/parquet/table/transfer/LongArrayTransfer.java new file mode 100644 index 00000000000..b9d15480a09 --- /dev/null +++ b/extensions/parquet/table/src/main/java/io/deephaven/parquet/table/transfer/LongArrayTransfer.java @@ -0,0 +1,38 @@ +/** + * Copyright (c) 2016-2023 Deephaven Data Labs and Patent Pending + */ +/* + * --------------------------------------------------------------------------------------------------------------------- + * AUTO-GENERATED CLASS - DO NOT EDIT MANUALLY - for any changes edit IntArrayTransfer and regenerate + * --------------------------------------------------------------------------------------------------------------------- + */ +package io.deephaven.parquet.table.transfer; + +import io.deephaven.engine.rowset.RowSequence; +import io.deephaven.engine.table.ColumnSource; +import org.jetbrains.annotations.NotNull; + +import java.nio.LongBuffer; + +final class LongArrayTransfer extends PrimitiveArrayAndVectorTransfer { + LongArrayTransfer(@NotNull final ColumnSource columnSource, @NotNull final RowSequence tableRowSet, + final int targetPageSize) { + super(columnSource, tableRowSet, targetPageSize / Long.BYTES, targetPageSize, + LongBuffer.allocate(targetPageSize / Long.BYTES), Long.BYTES); + } + + @Override + int getSize(final long @NotNull [] data) { + return data.length; + } + + @Override + void resizeBuffer(final int length) { + buffer = LongBuffer.allocate(length); + } + + @Override + void copyToBuffer(@NotNull final EncodedData data) { + buffer.put(data.encodedValues); + } +} diff --git a/extensions/parquet/table/src/main/java/io/deephaven/parquet/table/transfer/LongTransfer.java b/extensions/parquet/table/src/main/java/io/deephaven/parquet/table/transfer/LongTransfer.java index 3e71f50cb18..de31bc10a88 100644 --- a/extensions/parquet/table/src/main/java/io/deephaven/parquet/table/transfer/LongTransfer.java +++ b/extensions/parquet/table/src/main/java/io/deephaven/parquet/table/transfer/LongTransfer.java @@ -10,27 +10,32 @@ import io.deephaven.chunk.WritableLongChunk; import io.deephaven.chunk.attributes.Values; +import io.deephaven.engine.rowset.RowSequence; +import io.deephaven.engine.rowset.RowSet; import io.deephaven.engine.table.ColumnSource; import org.jetbrains.annotations.NotNull; import java.nio.LongBuffer; -class LongTransfer extends PrimitiveTransfer, LongBuffer> { - - public static LongTransfer create(@NotNull final ColumnSource columnSource, final int targetSize) { - final long[] backingArray = new long[targetSize]; +final class LongTransfer extends PrimitiveTransfer, LongBuffer> { + static LongTransfer 
create(@NotNull final ColumnSource columnSource, @NotNull final RowSet tableRowSet, + final int targetPageSize) { + final int maxValuesPerPage = Math.toIntExact(Math.min(tableRowSet.size(), targetPageSize / Long.BYTES)); + final long[] backingArray = new long[maxValuesPerPage]; return new LongTransfer( columnSource, + tableRowSet, WritableLongChunk.writableChunkWrap(backingArray), LongBuffer.wrap(backingArray), - targetSize); + maxValuesPerPage); } private LongTransfer( @NotNull final ColumnSource columnSource, + @NotNull final RowSequence tableRowSet, @NotNull final WritableLongChunk chunk, @NotNull final LongBuffer buffer, - final int targetSize) { - super(columnSource, chunk, buffer, targetSize); + final int maxValuesPerPage) { + super(columnSource, tableRowSet, chunk, buffer, maxValuesPerPage); } } diff --git a/extensions/parquet/table/src/main/java/io/deephaven/parquet/table/transfer/LongVectorTransfer.java b/extensions/parquet/table/src/main/java/io/deephaven/parquet/table/transfer/LongVectorTransfer.java new file mode 100644 index 00000000000..04cfc4ee136 --- /dev/null +++ b/extensions/parquet/table/src/main/java/io/deephaven/parquet/table/transfer/LongVectorTransfer.java @@ -0,0 +1,37 @@ +/** + * Copyright (c) 2016-2023 Deephaven Data Labs and Patent Pending + */ +/* + * --------------------------------------------------------------------------------------------------------------------- + * AUTO-GENERATED CLASS - DO NOT EDIT MANUALLY - for any changes edit IntVectorTransfer and regenerate + * --------------------------------------------------------------------------------------------------------------------- + */ +package io.deephaven.parquet.table.transfer; + +import io.deephaven.engine.primitive.iterator.CloseablePrimitiveIteratorOfLong; +import io.deephaven.engine.rowset.RowSequence; +import io.deephaven.engine.table.ColumnSource; +import io.deephaven.vector.LongVector; +import org.jetbrains.annotations.NotNull; + +import java.nio.LongBuffer; + +final class LongVectorTransfer extends PrimitiveVectorTransfer { + LongVectorTransfer(@NotNull final ColumnSource columnSource, @NotNull final RowSequence tableRowSet, + final int targetPageSize) { + super(columnSource, tableRowSet, targetPageSize / Long.BYTES, targetPageSize, + LongBuffer.allocate(targetPageSize / Long.BYTES), Long.BYTES); + } + + @Override + void resizeBuffer(final int length) { + buffer = LongBuffer.allocate(length); + } + + @Override + void copyToBuffer(@NotNull final EncodedData data) { + try (final CloseablePrimitiveIteratorOfLong dataIterator = data.encodedValues.iterator()) { + dataIterator.forEachRemaining((long value) -> buffer.put(value)); + } + } +} \ No newline at end of file diff --git a/extensions/parquet/table/src/main/java/io/deephaven/parquet/table/transfer/ObjectArrayAndVectorTransfer.java b/extensions/parquet/table/src/main/java/io/deephaven/parquet/table/transfer/ObjectArrayAndVectorTransfer.java new file mode 100644 index 00000000000..e87ae1fa91a --- /dev/null +++ b/extensions/parquet/table/src/main/java/io/deephaven/parquet/table/transfer/ObjectArrayAndVectorTransfer.java @@ -0,0 +1,126 @@ +/** + * Copyright (c) 2016-2023 Deephaven Data Labs and Patent Pending + */ +package io.deephaven.parquet.table.transfer; + + import io.deephaven.base.verify.Assert; + import io.deephaven.engine.rowset.RowSequence; + import io.deephaven.engine.table.ColumnSource; + import org.apache.parquet.io.api.Binary; + import org.jetbrains.annotations.NotNull; + + import java.util.Arrays; + import 
java.util.function.Supplier; + +/** + * Used as a base class of arrays/vectors of transfer objects for types like strings or big integers that need + * specialized encoding. + * @param The type of the data in the column, could be an array/vector + * @param The type of the values in the array/vector + */ +abstract class ObjectArrayAndVectorTransfer + extends ArrayAndVectorTransfer { + /** + * Number of values added to the buffer + */ + private int bufferedDataCount; + /** + * Total number of bytes buffered + */ + private int numBytesBuffered; + + /** + * Used as a temporary buffer for storing references to binary encoded values for a single row before it is copied + * to the main buffer. + */ + private Binary[] encodedDataBuf; + private int encodedDataBufLen; + + ObjectArrayAndVectorTransfer(@NotNull final ColumnSource columnSource, @NotNull final RowSequence tableRowSet, + final int targetPageSize) { + super(columnSource, tableRowSet, targetPageSize, targetPageSize, new Binary[targetPageSize]); + bufferedDataCount = 0; + numBytesBuffered = 0; + + encodedDataBuf = new Binary[targetPageSize]; + encodedDataBufLen = 0; + } + + @Override + public final int transferOnePageToBuffer() { + // Clear any old buffered data + if (bufferedDataCount != 0) { + Arrays.fill(buffer, 0, bufferedDataCount, null); + bufferedDataCount = 0; + numBytesBuffered = 0; + repeatCounts.clear(); + } + // Fill the buffer with data from the table + transferOnePageToBufferHelper(); + repeatCounts.flip(); + return bufferedDataCount; + } + + @Override + final int getNumBytesBuffered() { + return numBytesBuffered; + } + + final void encodeDataForBufferingHelper(@NotNull final Supplier objectSupplier, final int numObjects, + @NotNull final EncodedData encodedData) { + // Allocate a new buffer if needed, or clear the existing one + if (numObjects > encodedDataBuf.length) { + encodedDataBuf = new Binary[numObjects]; + } else { + Arrays.fill(encodedDataBuf, 0, encodedDataBufLen, null); + encodedDataBufLen = 0; + } + int numBytesEncoded = 0; + for (int i = 0; i < numObjects; i++) { + VALUE_TYPE value = objectSupplier.get(); + if (value == null) { + encodedDataBuf[i] = null; + } else { + encodedDataBuf[i] = encodeToBinary(value); + numBytesEncoded += encodedDataBuf[i].length(); + } + } + encodedDataBufLen = numObjects; + encodedData.fillRepeated(encodedDataBuf, numBytesEncoded, numObjects); + } + + /** + * Encode a single value to binary + */ + abstract Binary encodeToBinary(VALUE_TYPE value); + + final boolean addEncodedDataToBuffer(@NotNull final EncodedData data, final boolean force) { + if (force && (repeatCounts.position() != 0 || bufferedDataCount != 0)) { + // This should never happen, because "force" is only set by the caller when adding the very first + // array/vector + //noinspection ThrowableNotThrown + Assert.statementNeverExecuted(); + return false; + } + if (!repeatCounts.hasRemaining()) { + return false; + } + final int numEncodedValues = data.numValues; + if (bufferedDataCount + numEncodedValues > maxValuesPerPage) { + if (force) { + // Resize the buffer, if needed. 
Assuming the buffer is empty, verified earlier + if (buffer.length < numEncodedValues) { + buffer = new Binary[numEncodedValues]; + } + } else { + return false; + } + } + for (int i = 0; i < numEncodedValues; i++) { + buffer[bufferedDataCount++] = data.encodedValues[i]; + } + numBytesBuffered += data.numBytes; + repeatCounts.put(numEncodedValues); + return true; + } +} diff --git a/extensions/parquet/table/src/main/java/io/deephaven/parquet/table/transfer/ObjectArrayTransfer.java b/extensions/parquet/table/src/main/java/io/deephaven/parquet/table/transfer/ObjectArrayTransfer.java new file mode 100644 index 00000000000..dda15f99134 --- /dev/null +++ b/extensions/parquet/table/src/main/java/io/deephaven/parquet/table/transfer/ObjectArrayTransfer.java @@ -0,0 +1,29 @@ +/** + * Copyright (c) 2016-2023 Deephaven Data Labs and Patent Pending + */ +package io.deephaven.parquet.table.transfer; + +import io.deephaven.engine.rowset.RowSequence; +import io.deephaven.engine.table.ColumnSource; +import org.apache.parquet.io.api.Binary; +import org.jetbrains.annotations.NotNull; + +/** + * Used as a base class of transfer objects for arrays of types like strings or big integers that need specialized + * encoding. + */ +abstract class ObjectArrayTransfer extends ObjectArrayAndVectorTransfer { + private final ArrayDataSupplier supplier; + + ObjectArrayTransfer(final @NotNull ColumnSource columnSource, final @NotNull RowSequence tableRowSet, + final int targetPageSize) { + super(columnSource, tableRowSet, targetPageSize); + supplier = new ArrayDataSupplier<>(); + } + + @Override + final void encodeDataForBuffering(final VALUE_TYPE @NotNull [] data, @NotNull final EncodedData encodedData) { + supplier.fill(data); + encodeDataForBufferingHelper(supplier, data.length, encodedData); + } +} diff --git a/extensions/parquet/table/src/main/java/io/deephaven/parquet/table/transfer/ObjectTransfer.java b/extensions/parquet/table/src/main/java/io/deephaven/parquet/table/transfer/ObjectTransfer.java new file mode 100644 index 00000000000..149ddfaebfe --- /dev/null +++ b/extensions/parquet/table/src/main/java/io/deephaven/parquet/table/transfer/ObjectTransfer.java @@ -0,0 +1,80 @@ +/** + * Copyright (c) 2016-2023 Deephaven Data Labs and Patent Pending + */ +package io.deephaven.parquet.table.transfer; + +import io.deephaven.base.verify.Assert; +import io.deephaven.engine.rowset.RowSequence; +import io.deephaven.engine.table.ColumnSource; +import org.apache.parquet.io.api.Binary; +import org.jetbrains.annotations.NotNull; + +import java.util.Arrays; + +/** + * Used as a base class of transfer objects for types like strings or big integers that need specialized encoding. 
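+ * Concrete subclasses only need to implement {@link #encodeDataForBuffering}; for a String column, for example,
+ * the encoding step is roughly as follows (an illustrative sketch, mirroring {@code StringTransfer}):
+ * <pre>{@code
+ * void encodeDataForBuffering(String data, EncodedData<Binary> encodedData) {
+ *     final Binary encoded = Binary.fromString(data);
+ *     encodedData.fillSingle(encoded, encoded.length());
+ * }
+ * }</pre>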
+ */ +abstract class ObjectTransfer extends VariableWidthTransfer { + /** + * Number of values (null or non-null) added to the buffer + */ + private int bufferedDataCount; + /** + * Total number of bytes buffered + */ + private int numBytesBuffered; + + ObjectTransfer(@NotNull final ColumnSource columnSource, @NotNull final RowSequence tableRowSet, + final int targetPageSize) { + super(columnSource, tableRowSet, targetPageSize, targetPageSize, new Binary[targetPageSize]); + bufferedDataCount = 0; + numBytesBuffered = 0; + } + + @Override + public final int transferOnePageToBuffer() { + // Clear any old buffered data + if (bufferedDataCount != 0) { + Arrays.fill(buffer, 0, bufferedDataCount, null); + bufferedDataCount = 0; + numBytesBuffered = 0; + } + // Fill the buffer with data from the table + transferOnePageToBufferHelper(); + return bufferedDataCount; + } + + @Override + final int getNumBytesBuffered() { + return numBytesBuffered; + } + + @Override + final boolean isBufferEmpty() { + return bufferedDataCount == 0; + } + + @Override + final boolean addNullToBuffer() { + if (bufferedDataCount == maxValuesPerPage) { + return false; + } + buffer[bufferedDataCount++] = null; + return true; + } + + final boolean addEncodedDataToBuffer(@NotNull final EncodedData data, final boolean force) { + if (force && bufferedDataCount != 0) { + // This should never happen, because "force" is only set by the caller when adding the very first object + //noinspection ThrowableNotThrown + Assert.statementNeverExecuted(); + return false; + } + if (bufferedDataCount == maxValuesPerPage) { + return false; + } + buffer[bufferedDataCount++] = data.encodedValues; + numBytesBuffered += data.numBytes; + return true; + } +} diff --git a/extensions/parquet/table/src/main/java/io/deephaven/parquet/table/transfer/ObjectVectorTransfer.java b/extensions/parquet/table/src/main/java/io/deephaven/parquet/table/transfer/ObjectVectorTransfer.java new file mode 100644 index 00000000000..9a7046a2b09 --- /dev/null +++ b/extensions/parquet/table/src/main/java/io/deephaven/parquet/table/transfer/ObjectVectorTransfer.java @@ -0,0 +1,34 @@ +/** + * Copyright (c) 2016-2023 Deephaven Data Labs and Patent Pending + */ +package io.deephaven.parquet.table.transfer; + +import io.deephaven.engine.primitive.iterator.CloseableIterator; +import io.deephaven.engine.rowset.RowSequence; +import io.deephaven.engine.table.ColumnSource; +import io.deephaven.vector.ObjectVector; +import org.apache.parquet.io.api.Binary; +import org.jetbrains.annotations.NotNull; + +import java.util.function.Supplier; + +/** + * Used as a base class of transfer objects for vectors of types like strings or big integers that need specialized + * encoding. 
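+ * Each row's vector is iterated element by element and encoded through {@code encodeToBinary}, so subclasses such
+ * as {@code StringVectorTransfer} only supply the single-value encoding (for example {@code Binary.fromString}).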
+ */ +abstract class ObjectVectorTransfer + extends ObjectArrayAndVectorTransfer, VALUE_TYPE> { + ObjectVectorTransfer(final @NotNull ColumnSource columnSource, final @NotNull RowSequence tableRowSet, + final int targetPageSize) { + super(columnSource, tableRowSet, targetPageSize); + } + + @Override + final void encodeDataForBuffering(final @NotNull ObjectVector data, + @NotNull final EncodedData encodedData) { + try (CloseableIterator iter = data.iterator()) { + Supplier supplier = iter::next; + encodeDataForBufferingHelper(supplier, data.intSize(), encodedData); + } + } +} diff --git a/extensions/parquet/table/src/main/java/io/deephaven/parquet/table/transfer/PrimitiveArrayAndVectorTransfer.java b/extensions/parquet/table/src/main/java/io/deephaven/parquet/table/transfer/PrimitiveArrayAndVectorTransfer.java new file mode 100644 index 00000000000..b53855a3288 --- /dev/null +++ b/extensions/parquet/table/src/main/java/io/deephaven/parquet/table/transfer/PrimitiveArrayAndVectorTransfer.java @@ -0,0 +1,104 @@ +/** + * Copyright (c) 2016-2023 Deephaven Data Labs and Patent Pending + */ +package io.deephaven.parquet.table.transfer; + +import io.deephaven.base.verify.Assert; +import io.deephaven.engine.rowset.RowSequence; +import io.deephaven.engine.table.ColumnSource; +import org.jetbrains.annotations.NotNull; + +import java.nio.Buffer; + +/** + * Used as a base class of transfer objects for arrays/vectors of primitive types. + */ +abstract class PrimitiveArrayAndVectorTransfer + extends ArrayAndVectorTransfer { + + private final int numBytesPerValue; + + PrimitiveArrayAndVectorTransfer(@NotNull final ColumnSource columnSource, @NotNull final RowSequence tableRowSet, + final int maxValuesPerPage, final int targetPageSize, + @NotNull final BUFFER_TYPE buffer, final int numBytesPerValue) { + super(columnSource, tableRowSet, maxValuesPerPage, targetPageSize, buffer); + this.numBytesPerValue = numBytesPerValue; + } + + @Override + public int transferOnePageToBuffer() { + // Clear any old buffered data + buffer.clear(); + repeatCounts.clear(); + // Fill the buffer with data from the table + transferOnePageToBufferHelper(); + // Prepare buffer for reading + buffer.flip(); + repeatCounts.flip(); + return buffer.limit(); + } + + @Override + void encodeDataForBuffering(@NotNull final COLUMN_TYPE data, + @NotNull final EncodedData encodedData) { + // No encoding needed here because we can calculate how many bytes will be needed per encoded value. + // So we store the reference to data as is and do any required encoding later while copying to buffer. + // This is done to avoid creating a temporary copy of encoded values here. + int numValues = getSize(data); + // noinspection unchecked + encodedData.fillRepeated((ENCODED_COLUMN_TYPE) data, numValues * numBytesPerValue, numValues); + } + + /** + * Get the size of primitive array/vector, called from inside {@link #encodeDataForBuffering}. Not needed for + * classes which override the method and do their own encoding, like dictionary encoded strings. 
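+ * For example, {@code PrimitiveVectorTransfer} returns {@code data.intSize()}, while the array-based subclasses
+ * return {@code data.length}.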
+ * + * @param data the array/vector + */ + int getSize(@NotNull final COLUMN_TYPE data) { + throw new UnsupportedOperationException("getSize() not implemented for " + getClass().getSimpleName()); + } + + @Override + final int getNumBytesBuffered() { + return buffer.position() * numBytesPerValue; + } + + final boolean addEncodedDataToBuffer(@NotNull final EncodedData data, boolean force) { + if (force && (repeatCounts.position() != 0 || buffer.position() != 0)) { + // This should never happen, because "force" is only set by the caller when adding the very first + // array/vector + // noinspection ThrowableNotThrown + Assert.statementNeverExecuted(); + return false; + } + if (!repeatCounts.hasRemaining()) { + return false; + } + if (buffer.position() + data.numValues > maxValuesPerPage) { + if (force) { + // Assuming buffer is empty here, verified earlier + if (buffer.limit() < data.numValues) { + resizeBuffer(data.numValues); + } + } else { + return false; + } + } + copyToBuffer(data); + repeatCounts.put(data.numValues); + return true; + } + + /** + * Copy the encoded values to the buffer. This function should be called after checking that there is enough space + * in the buffer. + */ + abstract void copyToBuffer(@NotNull final EncodedData data); + + /** + * Resize the underlying page buffer, needed in case of overflow when transferring the first array/vector. + */ + abstract void resizeBuffer(final int length); +} + diff --git a/extensions/parquet/table/src/main/java/io/deephaven/parquet/table/transfer/PrimitiveTransfer.java b/extensions/parquet/table/src/main/java/io/deephaven/parquet/table/transfer/PrimitiveTransfer.java index 7b7e94c4a2f..4ebe7b63d97 100644 --- a/extensions/parquet/table/src/main/java/io/deephaven/parquet/table/transfer/PrimitiveTransfer.java +++ b/extensions/parquet/table/src/main/java/io/deephaven/parquet/table/transfer/PrimitiveTransfer.java @@ -3,6 +3,7 @@ */ package io.deephaven.parquet.table.transfer; +import io.deephaven.base.verify.Assert; import io.deephaven.chunk.WritableChunk; import io.deephaven.chunk.attributes.Values; import io.deephaven.engine.rowset.RowSequence; @@ -16,60 +17,56 @@ * PrimitiveTransfer is a generic class that can be used to transfer primitive data types directly from a ColumnSource * to a Buffer using {@link ColumnSource#fillChunk(ChunkSource.FillContext, WritableChunk, RowSequence)}. 
*/ -class PrimitiveTransfer, B extends Buffer> - implements TransferObject { +abstract class PrimitiveTransfer, B extends Buffer> implements TransferObject { private final C chunk; private final B buffer; private final ColumnSource columnSource; + private final RowSequence.Iterator tableRowSetIt; private final ChunkSource.FillContext context; - private boolean hasMoreDataToBuffer; + private final int maxValuesPerPage; PrimitiveTransfer( @NotNull final ColumnSource columnSource, + @NotNull final RowSequence tableRowSet, @NotNull final C chunk, @NotNull final B buffer, - final int targetSize) { + final int maxValuesPerPage) { this.columnSource = columnSource; + this.tableRowSetIt = tableRowSet.getRowSequenceIterator(); this.chunk = chunk; this.buffer = buffer; - context = columnSource.makeFillContext(targetSize); + Assert.gtZero(maxValuesPerPage, "maxValuesPerPage"); + this.maxValuesPerPage = maxValuesPerPage; + this.context = columnSource.makeFillContext(maxValuesPerPage); } @Override - public void fetchData(@NotNull final RowSequence rs) { - columnSource.fillChunk(context, chunk, rs); - hasMoreDataToBuffer = true; - } - - @Override - public int transferAllToBuffer() { - return transferOnePageToBuffer(); - } - - @Override - public int transferOnePageToBuffer() { + public final int transferOnePageToBuffer() { if (!hasMoreDataToBuffer()) { return 0; } + // Fetch one page worth of data from the column source + final RowSequence rs = tableRowSetIt.getNextRowSequenceWithLength(maxValuesPerPage); + columnSource.fillChunk(context, chunk, rs); // Assuming that buffer and chunk are backed by the same array. buffer.position(0); buffer.limit(chunk.size()); - hasMoreDataToBuffer = false; return chunk.size(); } @Override - public boolean hasMoreDataToBuffer() { - return hasMoreDataToBuffer; + public final boolean hasMoreDataToBuffer() { + return tableRowSetIt.hasMore(); } @Override - public B getBuffer() { + public final B getBuffer() { return buffer; } @Override - public void close() { + public final void close() { context.close(); + tableRowSetIt.close(); } } diff --git a/extensions/parquet/table/src/main/java/io/deephaven/parquet/table/transfer/PrimitiveVectorTransfer.java b/extensions/parquet/table/src/main/java/io/deephaven/parquet/table/transfer/PrimitiveVectorTransfer.java new file mode 100644 index 00000000000..a75ad837530 --- /dev/null +++ b/extensions/parquet/table/src/main/java/io/deephaven/parquet/table/transfer/PrimitiveVectorTransfer.java @@ -0,0 +1,29 @@ +/** + * Copyright (c) 2016-2023 Deephaven Data Labs and Patent Pending + */ +package io.deephaven.parquet.table.transfer; + +import io.deephaven.engine.rowset.RowSequence; +import io.deephaven.engine.table.ColumnSource; +import io.deephaven.vector.Vector; +import org.jetbrains.annotations.NotNull; + +import java.nio.Buffer; + +/** + * Used as a base class of transfer objects for vectors of primitive types. 
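+ * Concrete subclasses provide the fixed-width buffer and implement {@code copyToBuffer}/{@code resizeBuffer}; for
+ * a long column the copy step looks roughly like this (an illustrative sketch, mirroring {@code LongVectorTransfer}):
+ * <pre>{@code
+ * void copyToBuffer(EncodedData<LongVector> data) {
+ *     try (CloseablePrimitiveIteratorOfLong it = data.encodedValues.iterator()) {
+ *         it.forEachRemaining((long value) -> buffer.put(value));
+ *     }
+ * }
+ * }</pre>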
+ */ +abstract class PrimitiveVectorTransfer, BUFFER_TYPE extends Buffer> + extends PrimitiveArrayAndVectorTransfer { + + PrimitiveVectorTransfer(@NotNull final ColumnSource columnSource, @NotNull final RowSequence tableRowSet, + final int maxValuesPerPage, final int targetPageSize, @NotNull final BUFFER_TYPE buffer, + final int numBytesPerValue) { + super(columnSource, tableRowSet, maxValuesPerPage, targetPageSize, buffer, numBytesPerValue); + } + + @Override + final int getSize(@NotNull final COLUMN_TYPE data) { + return data.intSize(); + } +} diff --git a/extensions/parquet/table/src/main/java/io/deephaven/parquet/table/transfer/ShortArrayTransfer.java b/extensions/parquet/table/src/main/java/io/deephaven/parquet/table/transfer/ShortArrayTransfer.java new file mode 100644 index 00000000000..f1ca2c61d7e --- /dev/null +++ b/extensions/parquet/table/src/main/java/io/deephaven/parquet/table/transfer/ShortArrayTransfer.java @@ -0,0 +1,41 @@ +/** + * Copyright (c) 2016-2023 Deephaven Data Labs and Patent Pending + */ +/* + * --------------------------------------------------------------------------------------------------------------------- + * AUTO-GENERATED CLASS - DO NOT EDIT MANUALLY - for any changes edit CharArrayTransfer and regenerate + * --------------------------------------------------------------------------------------------------------------------- + */ +package io.deephaven.parquet.table.transfer; + +import io.deephaven.engine.rowset.RowSequence; +import io.deephaven.engine.table.ColumnSource; +import org.jetbrains.annotations.NotNull; + +import java.nio.IntBuffer; + +final class ShortArrayTransfer extends PrimitiveArrayAndVectorTransfer { + ShortArrayTransfer(@NotNull final ColumnSource columnSource, @NotNull final RowSequence tableRowSet, + final int targetPageSize) { + // We encode primitive shorts as primitive ints + super(columnSource, tableRowSet, targetPageSize / Integer.BYTES, targetPageSize, + IntBuffer.allocate(targetPageSize / Integer.BYTES), Integer.BYTES); + } + + @Override + int getSize(final short @NotNull [] data) { + return data.length; + } + + @Override + void resizeBuffer(final int length) { + buffer = IntBuffer.allocate(length); + } + + @Override + void copyToBuffer(@NotNull final EncodedData data) { + for (short value : data.encodedValues) { + buffer.put(value); + } + } +} diff --git a/extensions/parquet/table/src/main/java/io/deephaven/parquet/table/transfer/ShortTransfer.java b/extensions/parquet/table/src/main/java/io/deephaven/parquet/table/transfer/ShortTransfer.java index da4242b3e86..0ff0c6d3842 100644 --- a/extensions/parquet/table/src/main/java/io/deephaven/parquet/table/transfer/ShortTransfer.java +++ b/extensions/parquet/table/src/main/java/io/deephaven/parquet/table/transfer/ShortTransfer.java @@ -10,18 +10,19 @@ import io.deephaven.chunk.ShortChunk; import io.deephaven.chunk.attributes.Values; +import io.deephaven.engine.rowset.RowSet; import io.deephaven.engine.table.ColumnSource; import org.jetbrains.annotations.NotNull; -class ShortTransfer extends IntCastablePrimitiveTransfer> { - - public ShortTransfer(@NotNull final ColumnSource columnSource, final int targetSize) { - super(columnSource, targetSize); +final class ShortTransfer extends IntCastablePrimitiveTransfer> { + ShortTransfer(@NotNull final ColumnSource columnSource, @NotNull final RowSet tableRowSet, final int targetSize) { + super(columnSource, tableRowSet, targetSize); } @Override public void copyAllFromChunkToBuffer() { - for (int chunkIdx = 0; chunkIdx < chunk.size(); 
++chunkIdx) { + final int chunkSize = chunk.size(); + for (int chunkIdx = 0; chunkIdx < chunkSize; ++chunkIdx) { buffer.put(chunk.get(chunkIdx)); } } diff --git a/extensions/parquet/table/src/main/java/io/deephaven/parquet/table/transfer/ShortVectorTransfer.java b/extensions/parquet/table/src/main/java/io/deephaven/parquet/table/transfer/ShortVectorTransfer.java new file mode 100644 index 00000000000..5c56d59bc9f --- /dev/null +++ b/extensions/parquet/table/src/main/java/io/deephaven/parquet/table/transfer/ShortVectorTransfer.java @@ -0,0 +1,38 @@ +/** + * Copyright (c) 2016-2023 Deephaven Data Labs and Patent Pending + */ +/* + * --------------------------------------------------------------------------------------------------------------------- + * AUTO-GENERATED CLASS - DO NOT EDIT MANUALLY - for any changes edit CharVectorTransfer and regenerate + * --------------------------------------------------------------------------------------------------------------------- + */ +package io.deephaven.parquet.table.transfer; + +import io.deephaven.engine.primitive.iterator.CloseablePrimitiveIteratorOfShort; +import io.deephaven.engine.rowset.RowSequence; +import io.deephaven.engine.table.ColumnSource; +import io.deephaven.vector.ShortVector; +import org.jetbrains.annotations.NotNull; + +import java.nio.IntBuffer; + +final class ShortVectorTransfer extends PrimitiveVectorTransfer { + ShortVectorTransfer(@NotNull final ColumnSource columnSource, @NotNull final RowSequence tableRowSet, + final int targetPageSize) { + // We encode primitive shorts as primitive ints + super(columnSource, tableRowSet, targetPageSize / Integer.BYTES, targetPageSize, + IntBuffer.allocate(targetPageSize / Integer.BYTES), Integer.BYTES); + } + + @Override + void resizeBuffer(final int length) { + buffer = IntBuffer.allocate(length); + } + + @Override + void copyToBuffer(@NotNull final EncodedData data) { + try (final CloseablePrimitiveIteratorOfShort dataIterator = data.encodedValues.iterator()) { + dataIterator.forEachRemaining((short value) -> buffer.put(value)); + } + } +} \ No newline at end of file diff --git a/extensions/parquet/table/src/main/java/io/deephaven/parquet/table/transfer/StringArrayTransfer.java b/extensions/parquet/table/src/main/java/io/deephaven/parquet/table/transfer/StringArrayTransfer.java new file mode 100644 index 00000000000..402736e8dc4 --- /dev/null +++ b/extensions/parquet/table/src/main/java/io/deephaven/parquet/table/transfer/StringArrayTransfer.java @@ -0,0 +1,21 @@ +/** + * Copyright (c) 2016-2023 Deephaven Data Labs and Patent Pending + */ +package io.deephaven.parquet.table.transfer; + +import io.deephaven.engine.rowset.RowSequence; +import io.deephaven.engine.table.ColumnSource; +import org.apache.parquet.io.api.Binary; +import org.jetbrains.annotations.NotNull; + +final class StringArrayTransfer extends ObjectArrayTransfer { + StringArrayTransfer(final @NotNull ColumnSource columnSource, final @NotNull RowSequence tableRowSet, + final int targetPageSize) { + super(columnSource, tableRowSet, targetPageSize); + } + + @Override + Binary encodeToBinary(String value) { + return Binary.fromString(value); + } +} diff --git a/extensions/parquet/table/src/main/java/io/deephaven/parquet/table/transfer/StringDictionary.java b/extensions/parquet/table/src/main/java/io/deephaven/parquet/table/transfer/StringDictionary.java new file mode 100644 index 00000000000..1b8524611f3 --- /dev/null +++ b/extensions/parquet/table/src/main/java/io/deephaven/parquet/table/transfer/StringDictionary.java @@ -0,0 
+1,95 @@ +/** + * Copyright (c) 2016-2023 Deephaven Data Labs and Patent Pending + */ + +package io.deephaven.parquet.table.transfer; + +import gnu.trove.impl.Constants; +import gnu.trove.map.hash.TObjectIntHashMap; +import io.deephaven.parquet.table.DictionarySizeExceededException; +import org.apache.parquet.column.statistics.Statistics; +import org.apache.parquet.io.api.Binary; + +import java.util.Arrays; + +/** + * Stores a dictionary of strings and returns their position in the dictionary, useful for encoding string columns. + */ +final public class StringDictionary { + + private static final int INITIAL_DICTIONARY_SIZE = 1 << 8; + + private final int maxKeys; + private final int maxDictSize; + private final Statistics statistics; + /** + * {@code null} is not added to the dictionary. This class will return the following position value on encountering + * a {@code null}. + */ + private final int nullPos; + + private final TObjectIntHashMap keyToPos; + + private Binary[] encodedKeys; + private int keyCount; + private int dictSize; + + public StringDictionary(final int maxKeys, final int maxDictSize, final Statistics statistics, + final int nullPos) { + this.maxKeys = maxKeys; + this.maxDictSize = maxDictSize; + this.statistics = statistics; + this.nullPos = nullPos; + + // Kept as a negative value since 0 is a valid position in the dictionary. + final int NO_ENTRY_VALUE = -1; + this.keyToPos = + new TObjectIntHashMap<>(Constants.DEFAULT_CAPACITY, Constants.DEFAULT_LOAD_FACTOR, NO_ENTRY_VALUE); + + this.encodedKeys = new Binary[Math.min(INITIAL_DICTIONARY_SIZE, maxKeys)]; + this.dictSize = this.keyCount = 0; + } + + public int getKeyCount() { + return keyCount; + } + + public Binary[] getEncodedKeys() { + return encodedKeys; + } + + /** + * Add a string key to the dictionary if it's not already present. + * + * @param key The key to add and/or find the position for + * @return {@code key}'s position in the dictionary, or special null key position (passed in constructor) if + * {@code key == null} + */ + public int add(final String key) { + if (key == null) { + return nullPos; + } + int posInDictionary = keyToPos.get(key); + if (posInDictionary == keyToPos.getNoEntryValue()) { + if (keyCount == encodedKeys.length) { + // Copy into an array of double the size with upper limit at maxKeys + if (keyCount == maxKeys) { + throw new DictionarySizeExceededException("Dictionary maximum keys exceeded"); + } + encodedKeys = Arrays.copyOf(encodedKeys, (int) Math.min(keyCount * 2L, maxKeys)); + } + final Binary encodedKey = Binary.fromString(key); + dictSize += encodedKey.length(); + if (dictSize > maxDictSize) { + throw new DictionarySizeExceededException("Dictionary maximum size exceeded"); + } + encodedKeys[keyCount] = encodedKey; + // Track the min/max statistics while the dictionary is being built. 
+ statistics.updateStats(encodedKey); + posInDictionary = keyCount; + keyCount++; + keyToPos.put(key, posInDictionary); + } + return posInDictionary; + } +} diff --git a/extensions/parquet/table/src/main/java/io/deephaven/parquet/table/transfer/StringTransfer.java b/extensions/parquet/table/src/main/java/io/deephaven/parquet/table/transfer/StringTransfer.java index 7d282530d38..3fea482542b 100644 --- a/extensions/parquet/table/src/main/java/io/deephaven/parquet/table/transfer/StringTransfer.java +++ b/extensions/parquet/table/src/main/java/io/deephaven/parquet/table/transfer/StringTransfer.java @@ -3,20 +3,20 @@ */ package io.deephaven.parquet.table.transfer; +import io.deephaven.engine.rowset.RowSequence; import io.deephaven.engine.table.ColumnSource; import org.apache.parquet.io.api.Binary; import org.jetbrains.annotations.NotNull; -class StringTransfer extends EncodedTransfer { - public StringTransfer( - @NotNull final ColumnSource columnSource, - final int maxValuesPerPage, +final class StringTransfer extends ObjectTransfer { + StringTransfer(@NotNull final ColumnSource columnSource, @NotNull final RowSequence tableRowSet, final int targetPageSize) { - super(columnSource, maxValuesPerPage, targetPageSize); + super(columnSource, tableRowSet, targetPageSize); } @Override - Binary encodeToBinary(String value) { - return Binary.fromString(value); + void encodeDataForBuffering(@NotNull String data, @NotNull final EncodedData encodedData) { + Binary encodedValue = Binary.fromString(data); + encodedData.fillSingle(encodedValue, encodedValue.length()); } } diff --git a/extensions/parquet/table/src/main/java/io/deephaven/parquet/table/transfer/StringVectorTransfer.java b/extensions/parquet/table/src/main/java/io/deephaven/parquet/table/transfer/StringVectorTransfer.java new file mode 100644 index 00000000000..bb3ed000830 --- /dev/null +++ b/extensions/parquet/table/src/main/java/io/deephaven/parquet/table/transfer/StringVectorTransfer.java @@ -0,0 +1,21 @@ +/** + * Copyright (c) 2016-2023 Deephaven Data Labs and Patent Pending + */ +package io.deephaven.parquet.table.transfer; + +import io.deephaven.engine.rowset.RowSequence; +import io.deephaven.engine.table.ColumnSource; +import org.apache.parquet.io.api.Binary; +import org.jetbrains.annotations.NotNull; + +final class StringVectorTransfer extends ObjectVectorTransfer { + StringVectorTransfer(final @NotNull ColumnSource columnSource, final @NotNull RowSequence tableRowSet, + final int targetPageSize) { + super(columnSource, tableRowSet, targetPageSize); + } + + @Override + Binary encodeToBinary(String value) { + return Binary.fromString(value); + } +} diff --git a/extensions/parquet/table/src/main/java/io/deephaven/parquet/table/transfer/TransferObject.java b/extensions/parquet/table/src/main/java/io/deephaven/parquet/table/transfer/TransferObject.java index a737e14e455..79326799065 100644 --- a/extensions/parquet/table/src/main/java/io/deephaven/parquet/table/transfer/TransferObject.java +++ b/extensions/parquet/table/src/main/java/io/deephaven/parquet/table/transfer/TransferObject.java @@ -3,100 +3,196 @@ */ package io.deephaven.parquet.table.transfer; -import io.deephaven.engine.rowset.RowSequence; import io.deephaven.engine.rowset.RowSet; -import io.deephaven.engine.table.ColumnDefinition; import io.deephaven.engine.table.ColumnSource; import io.deephaven.engine.table.impl.CodecLookup; +import io.deephaven.engine.table.impl.sources.ReinterpretUtils; import io.deephaven.engine.util.BigDecimalUtils; import io.deephaven.parquet.table.*; 
import io.deephaven.util.SafeCloseable; import io.deephaven.util.codec.ObjectCodec; +import io.deephaven.vector.Vector; import org.jetbrains.annotations.NotNull; +import org.jetbrains.annotations.Nullable; import java.math.BigDecimal; import java.math.BigInteger; +import java.nio.IntBuffer; +import java.time.Instant; import java.util.Map; /** * Classes that implement this interface are responsible for converting data from individual DH columns into buffers * to be written out to the Parquet file. * - * @param + * @param The type of the buffer to be written out to the Parquet file */ public interface TransferObject extends SafeCloseable { static TransferObject create( - @NotNull final Map> computedCache, @NotNull final RowSet tableRowSet, - @NotNull final ColumnSource columnSource, - @NotNull final ColumnDefinition columnDefinition, - final int maxValuesPerPage, - @NotNull final Class columnType, - @NotNull final ParquetInstructions instructions) { - if (int.class.equals(columnType)) { - return IntTransfer.create(columnSource, maxValuesPerPage); - } else if (long.class.equals(columnType)) { - return LongTransfer.create(columnSource, maxValuesPerPage); - } else if (double.class.equals(columnType)) { - return DoubleTransfer.create(columnSource, maxValuesPerPage); - } else if (float.class.equals(columnType)) { - return FloatTransfer.create(columnSource, maxValuesPerPage); - } else if (Boolean.class.equals(columnType)) { - return BooleanTransfer.create(columnSource, maxValuesPerPage); - } else if (short.class.equals(columnType)) { - return new ShortTransfer(columnSource, maxValuesPerPage); - } else if (char.class.equals(columnType)) { - return new CharTransfer(columnSource, maxValuesPerPage); - } else if (byte.class.equals(columnType)) { - return new ByteTransfer(columnSource, maxValuesPerPage); - } else if (String.class.equals(columnType)) { - return new StringTransfer(columnSource, maxValuesPerPage, instructions.getTargetPageSize()); + @NotNull final ParquetInstructions instructions, + @NotNull final Map> computedCache, + @NotNull final String columnName, + @NotNull final ColumnSource columnSource) { + Class columnType = columnSource.getType(); + if (columnType == int.class) { + return IntTransfer.create(columnSource, tableRowSet, instructions.getTargetPageSize()); } - - // If there's an explicit codec, we should disregard the defaults for these CodecLookup#lookup() will properly - // select the codec assigned by the instructions so we only need to check and redirect once. 
- if (!CodecLookup.explicitCodecPresent(instructions.getCodecName(columnDefinition.getName()))) { - if (BigDecimal.class.equals(columnType)) { + if (columnType == long.class) { + return LongTransfer.create(columnSource, tableRowSet, instructions.getTargetPageSize()); + } + if (columnType == Instant.class) { + // noinspection unchecked + final ColumnSource longColumnSource = + (ColumnSource) ReinterpretUtils.instantToLongSource((ColumnSource) columnSource); + return LongTransfer.create(longColumnSource, tableRowSet, instructions.getTargetPageSize()); + } + if (columnType == double.class) { + return DoubleTransfer.create(columnSource, tableRowSet, instructions.getTargetPageSize()); + } + if (columnType == float.class) { + return FloatTransfer.create(columnSource, tableRowSet, instructions.getTargetPageSize()); + } + if (columnType == Boolean.class) { + // noinspection unchecked + final ColumnSource byteColumnSource = + (ColumnSource) ReinterpretUtils.booleanToByteSource((ColumnSource) columnSource); + return BooleanTransfer.create(byteColumnSource, tableRowSet, instructions.getTargetPageSize()); + } + if (columnType == short.class) { + return new ShortTransfer(columnSource, tableRowSet, instructions.getTargetPageSize()); + } + if (columnType == char.class) { + return new CharTransfer(columnSource, tableRowSet, instructions.getTargetPageSize()); + } + if (columnType == byte.class) { + return new ByteTransfer(columnSource, tableRowSet, instructions.getTargetPageSize()); + } + if (columnType == String.class) { + return new StringTransfer(columnSource, tableRowSet, instructions.getTargetPageSize()); + } + if (CodecLookup.explicitCodecPresent(instructions.getCodecName(columnName))) { + final ObjectCodec codec = CodecLookup.lookup( + columnType, instructions.getCodecName(columnName), instructions.getCodecArgs(columnName)); + return new CodecTransfer<>(columnSource, codec, tableRowSet, instructions.getTargetPageSize()); + } + if (columnType == BigDecimal.class) { // noinspection unchecked final ColumnSource bigDecimalColumnSource = (ColumnSource) columnSource; final BigDecimalUtils.PrecisionAndScale precisionAndScale = TypeInfos.getPrecisionAndScale( - computedCache, columnDefinition.getName(), tableRowSet, () -> bigDecimalColumnSource); + computedCache, columnName, tableRowSet, () -> bigDecimalColumnSource); final ObjectCodec codec = new BigDecimalParquetBytesCodec( precisionAndScale.precision, precisionAndScale.scale, -1); - return new CodecTransfer<>(bigDecimalColumnSource, codec, maxValuesPerPage, - instructions.getTargetPageSize()); - } else if (BigInteger.class.equals(columnType)) { - return new CodecTransfer<>(columnSource, new BigIntegerParquetBytesCodec(-1), - maxValuesPerPage, instructions.getTargetPageSize()); + return new CodecTransfer<>(bigDecimalColumnSource, codec, tableRowSet, instructions.getTargetPageSize()); + } + if (columnType == BigInteger.class) { + return new CodecTransfer<>(columnSource, new BigIntegerParquetBytesCodec(-1), tableRowSet, + instructions.getTargetPageSize()); + } + + @Nullable final Class componentType = columnSource.getComponentType(); + if (columnType.isArray()) { + if (componentType == int.class) { + return new IntArrayTransfer(columnSource, tableRowSet, instructions.getTargetPageSize()); + } + if (componentType == long.class) { + return new LongArrayTransfer(columnSource, tableRowSet, instructions.getTargetPageSize()); + } + if (componentType == double.class) { + return new DoubleArrayTransfer(columnSource, tableRowSet, instructions.getTargetPageSize()); + 
} + if (componentType == float.class) { + return new FloatArrayTransfer(columnSource, tableRowSet, instructions.getTargetPageSize()); + } + if (componentType == Boolean.class) { + return new BooleanArrayTransfer(columnSource, tableRowSet, instructions.getTargetPageSize()); + } + if (componentType == short.class) { + return new ShortArrayTransfer(columnSource, tableRowSet, instructions.getTargetPageSize()); + } + if (componentType == char.class) { + return new CharArrayTransfer(columnSource, tableRowSet, instructions.getTargetPageSize()); } + if (componentType == byte.class) { + return new ByteArrayTransfer(columnSource, tableRowSet, instructions.getTargetPageSize()); + } + if (componentType == String.class) { + return new StringArrayTransfer(columnSource, tableRowSet, instructions.getTargetPageSize()); + } + if (componentType == BigInteger.class) { + return new CodecArrayTransfer<>(columnSource, new BigIntegerParquetBytesCodec(-1), + tableRowSet, instructions.getTargetPageSize()); + } + if (componentType == Instant.class) { + return new InstantArrayTransfer(columnSource, tableRowSet, instructions.getTargetPageSize()); + } + // TODO(deephaven-core#4612): Handle arrays of BigDecimal and if explicit codec provided + } + if (Vector.class.isAssignableFrom(columnType)) { + if (componentType == int.class) { + return new IntVectorTransfer(columnSource, tableRowSet, instructions.getTargetPageSize()); + } + if (componentType ==long.class) { + return new LongVectorTransfer(columnSource, tableRowSet, instructions.getTargetPageSize()); + } + if (componentType == double.class) { + return new DoubleVectorTransfer(columnSource, tableRowSet, instructions.getTargetPageSize()); + } + if (componentType == float.class) { + return new FloatVectorTransfer(columnSource, tableRowSet, instructions.getTargetPageSize()); + } + if (componentType == Boolean.class) { + return new BooleanVectorTransfer(columnSource, tableRowSet, instructions.getTargetPageSize()); + } + if (componentType == short.class) { + return new ShortVectorTransfer(columnSource, tableRowSet, instructions.getTargetPageSize()); + } + if (componentType == char.class) { + return new CharVectorTransfer(columnSource, tableRowSet, instructions.getTargetPageSize()); + } + if (componentType == byte.class) { + return new ByteVectorTransfer(columnSource, tableRowSet, instructions.getTargetPageSize()); + } + if (componentType == String.class) { + return new StringVectorTransfer(columnSource, tableRowSet, instructions.getTargetPageSize()); + } + if (componentType == BigInteger.class) { + return new CodecVectorTransfer<>(columnSource, new BigIntegerParquetBytesCodec(-1), + tableRowSet, instructions.getTargetPageSize()); + } + if (componentType == Instant.class) { + return new InstantVectorTransfer(columnSource, tableRowSet, instructions.getTargetPageSize()); + } + // TODO(deephaven-core#4612): Handle vectors of BigDecimal and if explicit codec provided } - final ObjectCodec codec = CodecLookup.lookup(columnDefinition, instructions); - return new CodecTransfer<>(columnSource, codec, maxValuesPerPage, instructions.getTargetPageSize()); + // Go with the default + final ObjectCodec codec = CodecLookup.getDefaultCodec(columnType); + return new CodecTransfer<>(columnSource, codec, tableRowSet, instructions.getTargetPageSize()); } - /** - * Fetch all data corresponding to the provided row sequence. 
- */ - void fetchData(@NotNull RowSequence rs); - - /** - * Transfer all the fetched data into an internal buffer, which can then be accessed using - * {@link TransferObject#getBuffer()}. This method should only be called after - * {@link TransferObject#fetchData(RowSequence)}}. This method should be used when writing unpaginated data, and - * should not be interleaved with calls to {@link TransferObject#transferOnePageToBuffer()}. Note that this - * method can lead to out-of-memory error for variable-width types (e.g. strings) if the fetched data is too big - * to fit in the available heap. - * - * @return The number of fetched data entries copied into the buffer. - */ - int transferAllToBuffer(); + static @NotNull TransferObject createDictEncodedStringTransfer( + @NotNull final RowSet tableRowSet, @NotNull final ColumnSource columnSource, + final int targetPageSize, @NotNull final StringDictionary dictionary) { + @Nullable final Class dataType = columnSource.getType(); + @Nullable final Class componentType = columnSource.getComponentType(); + if (dataType == String.class) { + return new DictEncodedStringTransfer(columnSource, tableRowSet, targetPageSize, dictionary); + } + if (dataType.isArray() && componentType == String.class) { + return new DictEncodedStringArrayTransfer(columnSource, tableRowSet, targetPageSize, dictionary); + } + if (Vector.class.isAssignableFrom(dataType) && componentType == String.class) { + return new DictEncodedStringVectorTransfer(columnSource, tableRowSet, targetPageSize, dictionary); + } + // Dictionary encoding not supported for other types + throw new UnsupportedOperationException("Dictionary encoding not supported for type " + dataType.getName()); + } /** * Transfer one page size worth of fetched data into an internal buffer, which can then be accessed using - * {@link TransferObject#getBuffer()}. The target page size is passed in the constructor. The method should only - * be called after {@link TransferObject#fetchData(RowSequence)}}. This method should be used when writing - * paginated data, and should not be interleaved with calls to {@link TransferObject#transferAllToBuffer()}. + * {@link TransferObject#getBuffer()}. The target page size is passed in the constructor. + * For dictionary encoded string transfers, this method also updates the dictionary with the strings encountered. * * @return The number of fetched data entries copied into the buffer. This can be different from the total * number of entries fetched in case of variable-width types (e.g. strings) when used with additional @@ -105,7 +201,7 @@ static TransferObject create( int transferOnePageToBuffer(); /** - * Check if there is any fetched data which can be copied into buffer + * Check if there is any more data which can be copied into buffer */ boolean hasMoreDataToBuffer(); @@ -115,4 +211,22 @@ static TransferObject create( * @return the buffer */ B getBuffer(); + + /** + * Returns whether we encountered any null value while transferring page data to buffer. This method is only used + * for dictionary encoded string transfer objects. This method should be called after + * {@link #transferOnePageToBuffer()} and the state resets everytime we call {@link #transferOnePageToBuffer()}. + */ + default boolean pageHasNull() { + throw new UnsupportedOperationException("Only supported for dictionary encoded string transfer objects"); + } + + /** + * Get the lengths of array/vector elements added to the buffer. 
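+ * Each entry is the number of values that the corresponding row (array or vector) contributed to the page, as
+ * recorded via {@code repeatCounts} in the array/vector transfer implementations.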
+ * + * @return the buffer with counts + */ + default IntBuffer getRepeatCount() { + throw new UnsupportedOperationException("Only supported for array and vector transfer objects"); + } } diff --git a/extensions/parquet/table/src/main/java/io/deephaven/parquet/table/transfer/VariableWidthTransfer.java b/extensions/parquet/table/src/main/java/io/deephaven/parquet/table/transfer/VariableWidthTransfer.java new file mode 100644 index 00000000000..d73a76ee240 --- /dev/null +++ b/extensions/parquet/table/src/main/java/io/deephaven/parquet/table/transfer/VariableWidthTransfer.java @@ -0,0 +1,201 @@ +/** + * Copyright (c) 2016-2023 Deephaven Data Labs and Patent Pending + */ +package io.deephaven.parquet.table.transfer; + +import io.deephaven.base.verify.Assert; +import io.deephaven.chunk.ObjectChunk; +import io.deephaven.chunk.attributes.Values; +import io.deephaven.engine.rowset.RowSequence; +import io.deephaven.engine.table.ChunkSource; +import io.deephaven.engine.table.ColumnSource; +import org.jetbrains.annotations.NotNull; +import org.jetbrains.annotations.Nullable; + +/** + * Base type for all transfer objects where we don't know the size of the data before actually reading the data. This + * includes strings, codec encoded objects, arrays and vectors. This class provides methods to iterate over the column, + * fetch the data, encode it and add it to buffer while enforcing page size constraints and handling overflow. + * + * @param The type of the data in the column + * @param The type of the encoded data to be added to the buffer + * @param The type of the buffer to be written out to the Parquet file + */ +abstract class VariableWidthTransfer + implements TransferObject { + private ObjectChunk chunk; + private final ColumnSource columnSource; + private final RowSequence.Iterator tableRowSetIt; + private final ChunkSource.GetContext context; + private final int targetPageSize; + private int currentChunkIdx; + /** + * The reusable field used to store the output from {@link #encodeDataForBuffering}. + */ + private final EncodedData encodedData; + /** + * Whether {@link #encodedData} stores the encoded value corresponding to {@link #currentChunkIdx}. This is useful + * to cache the value which took us beyond the page size limit. We cache it to avoid re-encoding. + */ + @Nullable + private boolean cached; + + /** + * The buffer to be written out to the Parquet file. This buffer is reused across pages and is resized if needed. 
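+ * Resizing is only needed when a single row holds more values than {@code maxValuesPerPage} and must be
+ * force-added as the first (and only) entry of a page.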
+ */ + BUFFER_TYPE buffer; + final int maxValuesPerPage; + + VariableWidthTransfer(@NotNull final ColumnSource columnSource, @NotNull final RowSequence tableRowSet, + final int maxValuesPerPage, final int targetPageSize, @NotNull final BUFFER_TYPE buffer) { + this.columnSource = columnSource; + this.tableRowSetIt = tableRowSet.getRowSequenceIterator(); + this.targetPageSize = targetPageSize; + Assert.gtZero(maxValuesPerPage, "targetPageSize"); + this.maxValuesPerPage = maxValuesPerPage; + Assert.gtZero(maxValuesPerPage, "maxValuesPerPage"); + this.context = columnSource.makeGetContext(Math.toIntExact(Math.min(maxValuesPerPage, tableRowSet.size()))); + this.currentChunkIdx = 0; + this.buffer = buffer; + this.encodedData = new EncodedData(); + this.cached = false; + } + + @Override + public final BUFFER_TYPE getBuffer() { + return buffer; + } + + @Override + final public boolean hasMoreDataToBuffer() { + // Unread data present either in the table or in the chunk + return tableRowSetIt.hasMore() || chunk != null; + } + + /** + * We pull each row from the column and encode it before adding it to the buffer. This class is used to store the + * encoded data, the number of values encoded (which can be more than 1 in case of array/vector columns) and the + * number of bytes encoded. + */ + static final class EncodedData { + E encodedValues; + int numValues; + int numBytes; + + /** + * Construct an empty object to be filled later using {@link #fillSingle} or {@link #fillRepeated} methods + */ + EncodedData() {} + + /** + * Used for non vector/array types where we have a single value in each row + */ + void fillSingle(@NotNull final E encodedValues, final int numBytes) { + fillInternal(encodedValues, numBytes, 1); + } + + /** + * Used for vector/array types where we can have a more than one value in each row + */ + void fillRepeated(@NotNull final E data, final int numBytes, final int numValues) { + fillInternal(data, numBytes, numValues); + } + + private void fillInternal(@NotNull final E encodedValues, final int numBytes, final int numValues) { + this.encodedValues = encodedValues; + this.numBytes = numBytes; + this.numValues = numValues; + } + } + + /** + * Helper method which transfers one page size worth of data from column source to buffer. The method assumes we + * have more data to buffer, so should be called if {@link #hasMoreDataToBuffer()} returns true. 
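+ * Callers typically drive it from {@code transferOnePageToBuffer()}; the overall per-column write loop is
+ * roughly the following (an illustrative sketch of the intended usage):
+ * <pre>{@code
+ * while (transferObject.hasMoreDataToBuffer()) {
+ *     final int numValues = transferObject.transferOnePageToBuffer();
+ *     // write numValues entries from transferObject.getBuffer() as one Parquet page
+ * }
+ * }</pre>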
+ */ + final void transferOnePageToBufferHelper() { + OUTER: do { + if (chunk == null) { + // Fetch a chunk of data from the table + final RowSequence rs = tableRowSetIt.getNextRowSequenceWithLength(maxValuesPerPage); + // noinspection unchecked + chunk = (ObjectChunk) columnSource.getChunk(context, rs); + currentChunkIdx = 0; + } + final int chunkSize = chunk.size(); + while (currentChunkIdx < chunkSize) { + final COLUMN_TYPE data = chunk.get(currentChunkIdx); + if (data == null) { + if (!addNullToBuffer()) { + // Reattempt adding null to the buffer in the next iteration + break OUTER; + } + currentChunkIdx++; + continue; + } + if (!cached) { + encodeDataForBuffering(data, encodedData); + } + if (isBufferEmpty()) { + // Always copy the first entry + addEncodedDataToBuffer(encodedData, true); + } else if (getNumBytesBuffered() + encodedData.numBytes > targetPageSize || + !addEncodedDataToBuffer(encodedData, false)) { + // Reattempt adding the encoded value to the buffer in the next iteration + cached = true; + break OUTER; + } + cached = false; + currentChunkIdx++; + } + if (currentChunkIdx == chunk.size()) { + chunk = null; + } + } while (tableRowSetIt.hasMore()); + } + + /** + * This method is called when we encounter a null row. + * + * @return Whether we succeeded in adding the null row to the buffer. A false value indicates overflow of underlying + * buffer and that we should stop reading more data from the column and return the buffer as-is. + */ + abstract boolean addNullToBuffer(); + + /** + * This method is called when we fetch a non-null row entry from the column and need to encode it before adding it + * to the buffer. This method assumes the data is non-null. The encoded data is stored in the parameter + * {@code encodedData} + * + * @param data The fetched value to be encoded, can be an array/vector or a single value + * @param encodedData The object to be filled with the encoded data + */ + abstract void encodeDataForBuffering(@NotNull final COLUMN_TYPE data, + @NotNull final EncodedData encodedData); + + /** + * This method is called for adding the encoded data to the buffer. + * + * @param data The encoded data to be added to the buffer + * @param force Whether we should force adding the data to the buffer even if it overflows buffer size and requires + * resizing + * + * @return Whether we succeeded in adding the data to the buffer. A false value indicates overflow of underlying + * buffer and that we should stop reading more data from the column and return the buffer as-is. + */ + abstract boolean addEncodedDataToBuffer(@NotNull final EncodedData data, final boolean force); + + /** + * The total number of encoded bytes corresponding to non-null values. Useful for adding page size constraints. + */ + abstract int getNumBytesBuffered(); + + /** + * Whether the buffer is empty, i.e. it contains no null or non-null value. 
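+ * Used by {@code transferOnePageToBufferHelper()} to decide whether the next entry is the first one on the page
+ * and therefore must be force-added even if it exceeds the target page size.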
+ */ + abstract boolean isBufferEmpty(); + + final public void close() { + context.close(); + tableRowSetIt.close(); + } +} diff --git a/extensions/parquet/table/src/test/java/io/deephaven/parquet/table/ParquetTableReadWriteTest.java b/extensions/parquet/table/src/test/java/io/deephaven/parquet/table/ParquetTableReadWriteTest.java index bc23323bfbc..843ab40fd01 100644 --- a/extensions/parquet/table/src/test/java/io/deephaven/parquet/table/ParquetTableReadWriteTest.java +++ b/extensions/parquet/table/src/test/java/io/deephaven/parquet/table/ParquetTableReadWriteTest.java @@ -6,7 +6,6 @@ import io.deephaven.UncheckedDeephavenException; import io.deephaven.api.Selectable; import io.deephaven.base.FileUtils; -import io.deephaven.configuration.Configuration; import io.deephaven.datastructures.util.CollectionUtil; import io.deephaven.engine.context.ExecutionContext; import io.deephaven.engine.primitive.function.ByteConsumer; @@ -16,6 +15,8 @@ import io.deephaven.engine.primitive.iterator.CloseableIterator; import io.deephaven.engine.table.ColumnDefinition; import io.deephaven.engine.table.ColumnSource; +import io.deephaven.engine.table.impl.select.FunctionalColumn; +import io.deephaven.engine.table.impl.select.SelectColumn; import io.deephaven.engine.table.impl.sources.ReinterpretUtils; import io.deephaven.engine.table.impl.util.ColumnHolder; import io.deephaven.engine.table.impl.select.FormulaEvaluationException; @@ -24,8 +25,10 @@ import io.deephaven.engine.testutil.junit4.EngineCleanup; import io.deephaven.engine.util.BigDecimalUtils; import io.deephaven.engine.util.file.TrackedFileHandleFactory; +import io.deephaven.parquet.base.NullStatistics; import io.deephaven.parquet.base.InvalidParquetFileException; import io.deephaven.parquet.table.location.ParquetTableLocationKey; +import io.deephaven.parquet.table.transfer.StringDictionary; import io.deephaven.stringset.ArrayStringSet; import io.deephaven.engine.table.Table; import io.deephaven.engine.table.TableDefinition; @@ -58,9 +61,13 @@ import java.time.Instant; import java.util.ArrayList; import java.util.Arrays; +import java.util.HashMap; import java.util.List; import java.util.Collection; +import java.util.Map; +import java.util.Objects; import java.util.function.DoubleConsumer; +import java.util.function.Function; import java.util.function.IntConsumer; import java.util.function.LongConsumer; @@ -101,7 +108,6 @@ private static Table getTableFlat(int size, boolean includeSerializable, boolean ArrayList columns = new ArrayList<>(Arrays.asList("someStringColumn = i % 10 == 0?null:(`` + (i % 101))", "nonNullString = `` + (i % 60)", - "nullString = (String) null", "nonNullPolyString = `` + (i % 600)", "someIntColumn = i", "someLongColumn = ii", @@ -113,6 +119,7 @@ private static Table getTableFlat(int size, boolean includeSerializable, boolean "someCharColumn = (char)i", "someTime = DateTimeUtils.now() + i", "someKey = `` + (int)(i /100)", + "biColumn = java.math.BigInteger.valueOf(ii)", "nullKey = i < -1?`123`:null", "nullIntColumn = (int)null", "nullLongColumn = (long)null", @@ -123,10 +130,10 @@ private static Table getTableFlat(int size, boolean includeSerializable, boolean "nullByteColumn = (byte)null", "nullCharColumn = (char)null", "nullTime = (Instant)null", + "nullBiColumn = (java.math.BigInteger)null", "nullString = (String)null")); if (includeBigDecimal) { columns.add("bdColumn = java.math.BigDecimal.valueOf(ii).stripTrailingZeros()"); - columns.add("biColumn = java.math.BigInteger.valueOf(ii)"); } if (includeSerializable) { 
columns.add("someSerializable = new SomeSillyTest(i)"); @@ -359,11 +366,11 @@ public void test_lz4_compressed() { // LZ4_RAW. We should be able to read it anyway with no exceptions. String path = TestParquetTools.class.getResource("/sample_lz4_compressed.parquet").getFile(); fromDisk = ParquetTools.readTable(path).select(); - File randomDest = new File(rootFile, "random.parquet"); - ParquetTools.writeTable(fromDisk, randomDest, ParquetTools.LZ4_RAW); } catch (RuntimeException e) { TestCase.fail("Failed to read parquet file sample_lz4_compressed.parquet"); } + File randomDest = new File(rootFile, "random.parquet"); + ParquetTools.writeTable(fromDisk, randomDest, ParquetTools.LZ4_RAW); // Read the LZ4 compressed file again, to make sure we use a new adapter fromDisk = ParquetTools.readTable(dest).select(); @@ -417,36 +424,78 @@ public void testBigDecimalPrecisionScale() { } } - @Test - public void testNullVectorColumns() { - final Table nullTable = getTableFlat(10, true, false); + private static void writeReadTableTest(final Table table, final File dest) { + writeReadTableTest(table, dest, ParquetInstructions.EMPTY); + } - final File dest = new File(rootFile + File.separator + "nullTable.parquet"); - ParquetTools.writeTable(nullTable, dest); - Table fromDisk = ParquetTools.readTable(dest); - assertTableEquals(nullTable, fromDisk); + private static void writeReadTableTest(final Table table, final File dest, ParquetInstructions writeInstructions) { + ParquetTools.writeTable(table, dest, writeInstructions); + final Table fromDisk = ParquetTools.readTable(dest); + TstUtils.assertTableEquals(table, fromDisk); + } + @Test + public void testVectorColumns() { + final Table table = getTableFlat(20000, true, false); // Take a groupBy to create vector columns containing null values - final Table nullVectorTable = nullTable.groupBy(); - ParquetTools.writeTable(nullVectorTable, dest); - fromDisk = ParquetTools.readTable(dest); - assertTableEquals(nullVectorTable, fromDisk); + Table vectorTable = table.groupBy().select(); + + final File dest = new File(rootFile + File.separator + "testVectorColumns.parquet"); + writeReadTableTest(vectorTable, dest); + + // Take a join with empty table to repeat the same row multiple times + vectorTable = vectorTable.join(TableTools.emptyTable(100)).select(); + writeReadTableTest(vectorTable, dest); + + // Convert the table from vector to array column + final Table arrayTable = vectorTable.updateView(vectorTable.getColumnSourceMap().keySet().stream() + .map(name -> name + " = " + name + ".toArray()") + .toArray(String[]::new)); + writeReadTableTest(arrayTable, dest); + + // Enforce a smaller page size to overflow the page + final ParquetInstructions writeInstructions = new ParquetInstructions.Builder() + .setTargetPageSize(ParquetInstructions.MIN_TARGET_PAGE_SIZE) + .build(); + writeReadTableTest(arrayTable, dest, writeInstructions); + writeReadTableTest(vectorTable, dest, writeInstructions); + } + + private static Table arrayToVectorTable(final Table table) { + final TableDefinition tableDefinition = table.getDefinition(); + final Collection arrayToVectorFormulas = new ArrayList<>(); + for (final ColumnDefinition columnDefinition : tableDefinition.getColumns()) { + final String columnName = columnDefinition.getName(); + final Class sourceDataType = (Class) columnDefinition.getDataType(); + if (!sourceDataType.isArray()) { + continue; + } + final Class componentType = Objects.requireNonNull(columnDefinition.getComponentType()); + final VectorFactory vectorFactory = 
VectorFactory.forElementType(componentType); + final Class> destinationDataType = vectorFactory.vectorType(); + final Function> vectorWrapFunction = vectorFactory::vectorWrap; + // noinspection unchecked,rawtypes + arrayToVectorFormulas.add(new FunctionalColumn( + columnName, sourceDataType, columnName, destinationDataType, componentType, vectorWrapFunction)); + } + return arrayToVectorFormulas.isEmpty() ? table : table.updateView(arrayToVectorFormulas); } @Test public void testArrayColumns() { ArrayList columns = new ArrayList<>(Arrays.asList( - "someStringArrayColumn = new String[] {i % 10 == 0?null:(`` + (i % 101))}", - "someIntArrayColumn = new int[] {i}", - "someLongArrayColumn = new long[] {ii}", - "someDoubleArrayColumn = new double[] {i*1.1}", - "someFloatArrayColumn = new float[] {(float)(i*1.1)}", - "someBoolArrayColumn = new Boolean[] {i % 3 == 0?true:i%3 == 1?false:null}", - "someShorArrayColumn = new short[] {(short)i}", - "someByteArrayColumn = new byte[] {(byte)i}", - "someCharArrayColumn = new char[] {(char)i}", - "someTimeArrayColumn = new Instant[] {(Instant)DateTimeUtils.now() + i}", + "someStringArrayColumn = new String[] {i % 10 == 0 ? null : (`` + (i % 101))}", + "someIntArrayColumn = new int[] {i % 10 == 0 ? null : i}", + "someLongArrayColumn = new long[] {i % 10 == 0 ? null : i}", + "someDoubleArrayColumn = new double[] {i % 10 == 0 ? null : i*1.1}", + "someFloatArrayColumn = new float[] {i % 10 == 0 ? null : (float)(i*1.1)}", + "someBoolArrayColumn = new Boolean[] {i % 3 == 0 ? true :i % 3 == 1 ? false : null}", + "someShorArrayColumn = new short[] {i % 10 == 0 ? null : (short)i}", + "someByteArrayColumn = new byte[] {i % 10 == 0 ? null : (byte)i}", + "someCharArrayColumn = new char[] {i % 10 == 0 ? null : (char)i}", + "someTimeArrayColumn = new Instant[] {i % 10 == 0 ? null : (Instant)DateTimeUtils.now() + i}", + "someBiColumn = new java.math.BigInteger[] {i % 10 == 0 ? 
null : java.math.BigInteger.valueOf(i)}", "nullStringArrayColumn = new String[] {(String)null}", "nullIntArrayColumn = new int[] {(int)null}", "nullLongArrayColumn = new long[] {(long)null}", @@ -456,20 +505,82 @@ public void testArrayColumns() { "nullShorArrayColumn = new short[] {(short)null}", "nullByteArrayColumn = new byte[] {(byte)null}", "nullCharArrayColumn = new char[] {(char)null}", - "nullTimeArrayColumn = new Instant[] {(Instant)null}")); + "nullTimeArrayColumn = new Instant[] {(Instant)null}", + "nullBiColumn = new java.math.BigInteger[] {(java.math.BigInteger)null}")); - final Table arrayTable = TableTools.emptyTable(20).select( - Selectable.from(columns)); - final File dest = new File(rootFile + File.separator + "arrayTable.parquet"); - ParquetTools.writeTable(arrayTable, dest); - Table fromDisk = ParquetTools.readTable(dest); - assertTableEquals(arrayTable, fromDisk); + Table arrayTable = TableTools.emptyTable(10000).select(Selectable.from(columns)); + final File dest = new File(rootFile + File.separator + "testArrayColumns.parquet"); + writeReadTableTest(arrayTable, dest); + + // Convert array table to vector + Table vectorTable = arrayToVectorTable(arrayTable); + writeReadTableTest(vectorTable, dest); + + // Enforce a smaller dictionary size to overflow the dictionary and test plain encoding + final ParquetInstructions writeInstructions = new ParquetInstructions.Builder() + .setMaximumDictionarySize(20) + .build(); + arrayTable = arrayTable.select("someStringArrayColumn", "nullStringArrayColumn"); + writeReadTableTest(arrayTable, dest, writeInstructions); + + // Make sure the column didn't use dictionary encoding + ParquetMetadata metadata = new ParquetTableLocationKey(dest, 0, null).getMetadata(); + String firstColumnMetadata = metadata.getBlocks().get(0).getColumns().get(0).toString(); + assertTrue(firstColumnMetadata.contains("someStringArrayColumn") + && !firstColumnMetadata.contains("RLE_DICTIONARY")); + + vectorTable = vectorTable.select("someStringArrayColumn", "nullStringArrayColumn"); + writeReadTableTest(vectorTable, dest, writeInstructions); + + // Make sure the column didn't use dictionary encoding + metadata = new ParquetTableLocationKey(dest, 0, null).getMetadata(); + firstColumnMetadata = metadata.getBlocks().get(0).getColumns().get(0).toString(); + assertTrue(firstColumnMetadata.contains("someStringArrayColumn") + && !firstColumnMetadata.contains("RLE_DICTIONARY")); + } + + @Test + public void stringDictionaryTest() { + final int nullPos = -5; + final int maxKeys = 10; + final int maxDictSize = 100; + final Statistics stats = NullStatistics.INSTANCE; + StringDictionary dict = new StringDictionary(maxKeys, maxDictSize, NullStatistics.INSTANCE, nullPos); + assertEquals(0, dict.getKeyCount()); + assertEquals(nullPos, dict.add(null)); + + final String[] keys = {"A", "B", "C", "D", "E", "F", "G", "H", "I", "J"}; + final Map keyToPos = new HashMap<>(); + for (int ii = 0; ii <= 6 * keys.length; ii += 3) { + final String key = keys[ii % keys.length]; + final int dictPos = dict.add(key); + if (keyToPos.containsKey(key)) { + assertEquals(keyToPos.get(key).intValue(), dictPos); + } else { + keyToPos.put(key, dictPos); + assertEquals(dictPos, dict.getKeyCount() - 1); + } + } + assertEquals(keys.length, dict.getKeyCount()); + assertEquals(keys.length, keyToPos.size()); + final Binary[] encodedKeys = dict.getEncodedKeys(); + for (int i = 0; i < keys.length; i++) { + final String decodedKey = encodedKeys[i].toStringUsingUTF8(); + final int expectedPos = 
@@ -613,7 +724,6 @@ public void writeMultiTableExceptionTest() {
         assertTrue(parentDir.list().length == 0);
     }
 
-
     /**
      * These are tests for writing to a table with grouping columns to a parquet file and making sure there are no
      * unnecessary files left in the directory after we finish writing.
@@ -883,7 +993,7 @@ public void dictionaryEncodingTest() {
     @Test
     public void overflowingStringsTest() {
         // Test the behavior of writing parquet files if entries exceed the page size limit
-        final int pageSize = 2 << 10;
+        final int pageSize = ParquetInstructions.MIN_TARGET_PAGE_SIZE;
         final char[] data = new char[pageSize / 4];
         String someString = new String(data);
         Collection columns = new ArrayList<>(Arrays.asList(
@@ -906,11 +1016,11 @@ public void overflowingStringsTest() {
         // We will have 10 pages each containing 1 row.
         assertEquals(columnMetadata.getEncodingStats().getNumDataPagesEncodedAs(Encoding.PLAIN), 10);
 
-        // Table with null rows
+        // Table with rows of null alternating with strings exceeding the page size
         columns = new ArrayList<>(Arrays.asList("someStringColumn = ii % 2 == 0 ? null : `" + someString + "` + ii"));
         columnMetadata = overflowingStringsTestHelper(columns, numRows, pageSize);
-        // We will have 5 pages containing 3, 2, 2, 2, 1 rows.
-        assertEquals(columnMetadata.getEncodingStats().getNumDataPagesEncodedAs(Encoding.PLAIN), 5);
+        // We will have 6 pages containing 1, 2, 2, 2, 2, 1 rows.
+        assertEquals(columnMetadata.getEncodingStats().getNumDataPagesEncodedAs(Encoding.PLAIN), 6);
     }
 
     private static ColumnChunkMetaData overflowingStringsTestHelper(final Collection columns,
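For reference, the page-count assertions above read encoding statistics straight out of the written file's metadata. A minimal sketch of that inspection pattern; the helper name and the hard-coded first block/column indices are illustrative assumptions, not part of this change:

    // Sketch of the metadata inspection the assertions above rely on.
    private static int countPlainEncodedPages(final File dest) {
        final ParquetMetadata metadata = new ParquetTableLocationKey(dest, 0, null).getMetadata();
        final ColumnChunkMetaData column = metadata.getBlocks().get(0).getColumns().get(0);
        return column.getEncodingStats().getNumDataPagesEncodedAs(Encoding.PLAIN);
    }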
@@ -932,7 +1042,7 @@ private static ColumnChunkMetaData overflowingStringsTestHelper(final Collection
 
     @Test
     public void overflowingCodecsTest() {
-        final int pageSize = 2 << 10;
+        final int pageSize = ParquetInstructions.MIN_TARGET_PAGE_SIZE;
         final ParquetInstructions writeInstructions = new ParquetInstructions.Builder()
                 .setTargetPageSize(pageSize) // Force a small page size to cause splitting across pages
                 .addColumnCodec("VariableWidthByteArrayColumn", SimpleByteArrayCodec.class.getName())
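The tests above drive the writer entirely through ParquetInstructions. A minimal sketch combining the options they exercise; the dictionary limit, table, and output file name are arbitrary example values, not taken from this change:

    // Sketch only: build write instructions with a tiny page size and dictionary limit,
    // as the tests above do to force page splits and plain (non-dictionary) encoding.
    private static void writeWithTinyPagesAndDictionary(final Table someTable, final File parentDir) {
        final ParquetInstructions writeInstructions = new ParquetInstructions.Builder()
                .setTargetPageSize(ParquetInstructions.MIN_TARGET_PAGE_SIZE)
                .setMaximumDictionarySize(20)
                .build();
        ParquetTools.writeTable(someTable, new File(parentDir, "example.parquet"), writeInstructions);
    }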
null : java.math.BigInteger.valueOf(i)}", "nullStringArrayColumn = new String[] {(String)null}", "nullIntArrayColumn = new int[] {(int)null}", "nullLongArrayColumn = new long[] {(long)null}", @@ -227,7 +228,8 @@ def get_table_with_array_data(self): "nullShorArrayColumn = new short[] {(short)null}", "nullByteArrayColumn = new byte[] {(byte)null}", "nullCharArrayColumn = new char[] {(char)null}", - "nullTimeArrayColumn = new Instant[] {(Instant)null}" + "nullTimeArrayColumn = new Instant[] {(Instant)null}", + "nullBiColumn = new java.math.BigInteger[] {(java.math.BigInteger)null}" ]) return dh_table diff --git a/replication/static/src/main/java/io/deephaven/replicators/ReplicateParquetTransferObjects.java b/replication/static/src/main/java/io/deephaven/replicators/ReplicateParquetTransferObjects.java index 158417e9dea..10d32210fcc 100644 --- a/replication/static/src/main/java/io/deephaven/replicators/ReplicateParquetTransferObjects.java +++ b/replication/static/src/main/java/io/deephaven/replicators/ReplicateParquetTransferObjects.java @@ -7,11 +7,22 @@ public class ReplicateParquetTransferObjects { private static final String PARQUET_TRANSFER_DIR = "extensions/parquet/table/src/main/java/io/deephaven/parquet/table/transfer/"; - private static final String PARQUET_INT_TRANSFER_PATH = PARQUET_TRANSFER_DIR + "IntTransfer.java"; private static final String PARQUET_CHAR_TRANSFER_PATH = PARQUET_TRANSFER_DIR + "CharTransfer.java"; + private static final String PARQUET_CHAR_ARRAY_TRANSFER_PATH = PARQUET_TRANSFER_DIR + "CharArrayTransfer.java"; + private static final String PARQUET_CHAR_VECTOR_TRANSFER_PATH = PARQUET_TRANSFER_DIR + "CharVectorTransfer.java"; + + private static final String PARQUET_INT_TRANSFER_PATH = PARQUET_TRANSFER_DIR + "IntTransfer.java"; + private static final String PARQUET_INT_ARRAY_TRANSFER_PATH = PARQUET_TRANSFER_DIR + "IntArrayTransfer.java"; + private static final String PARQUET_INT_VECTOR_TRANSFER_PATH = PARQUET_TRANSFER_DIR + "IntVectorTransfer.java"; public static void main(String[] args) throws IOException { charToShortAndByte(PARQUET_CHAR_TRANSFER_PATH); - intToLongAndFloatingPoints(PARQUET_INT_TRANSFER_PATH, "int targetSize"); + charToShortAndByte(PARQUET_CHAR_ARRAY_TRANSFER_PATH); + charToShortAndByte(PARQUET_CHAR_VECTOR_TRANSFER_PATH); + + intToLongAndFloatingPoints(PARQUET_INT_TRANSFER_PATH, "int targetPageSize", "int maxValuesPerPage", + "Math.toIntExact"); + intToLongAndFloatingPoints(PARQUET_INT_ARRAY_TRANSFER_PATH, "int targetPageSize", "int length", "int getSize"); + intToLongAndFloatingPoints(PARQUET_INT_VECTOR_TRANSFER_PATH, "int targetPageSize", "int length"); } }