Added support to write metadata files in parquet #5105

Merged Apr 2, 2024: 50 commits from branch sm-pq-metadata into main.

Commits
f8ae4ef
Copied methods from inside parquet hadoop to write metadata files
malhotrashivam Jan 30, 2024
0ef7f79
Calling methods from inside ParquetHadoop for writing metadata files
malhotrashivam Jan 31, 2024
44d031f
Cleaned up the code and added a lot of TODOs for review
malhotrashivam Feb 13, 2024
6394bac
Merge branch 'main' into sm-pq-metadata
malhotrashivam Feb 13, 2024
6c6b075
Some more changes
malhotrashivam Feb 13, 2024
9263aa0
WIP commit
malhotrashivam Feb 14, 2024
8c887c4
Added a custom metadata file writer
malhotrashivam Feb 16, 2024
05400c5
Merge branch 'main' into sm-pq-metadata
malhotrashivam Feb 16, 2024
b696649
Minor fix
malhotrashivam Feb 16, 2024
797b3dc
Fixed failing test
malhotrashivam Feb 17, 2024
d987a06
Moved some code around
malhotrashivam Feb 20, 2024
d4da175
Minor change
malhotrashivam Feb 20, 2024
f33d712
Review comments
malhotrashivam Feb 22, 2024
7d66445
Merge branch 'main' into sm-pq-metadata
malhotrashivam Feb 26, 2024
238f5f5
Read offset index from column chunk on demand
malhotrashivam Feb 26, 2024
7a3652d
Fixed failing test
malhotrashivam Feb 26, 2024
1f41eef
Added support for partitioned parquet writing
malhotrashivam Feb 27, 2024
c8aa764
Added some more tests
malhotrashivam Feb 27, 2024
ff99a36
Added some more tests
malhotrashivam Feb 27, 2024
3e48937
Added a new API for writing a partitioned table directly
malhotrashivam Feb 28, 2024
e584343
Improved the tests
malhotrashivam Feb 29, 2024
f659f3c
Review with Ryan part 1
malhotrashivam Mar 4, 2024
3651e5f
Added more tests
malhotrashivam Mar 4, 2024
51eefe1
Iterating using chunked iterators
malhotrashivam Mar 4, 2024
60ee1f5
Removed some unnecessary includes
malhotrashivam Mar 4, 2024
5c7353f
Added support for {index} and {partition} in file basename
malhotrashivam Mar 4, 2024
c174452
Review comments
malhotrashivam Mar 5, 2024
ab73df0
Minor touchups
malhotrashivam Mar 5, 2024
1cb8b81
Added fix and tests for big decimals
malhotrashivam Mar 6, 2024
20e8204
Updated a comment
malhotrashivam Mar 6, 2024
9892d14
Review with Ryan part 1
malhotrashivam Mar 7, 2024
1d98927
Review with Ryan part 2
malhotrashivam Mar 11, 2024
5018feb
Minor touchups
malhotrashivam Mar 11, 2024
a529fc2
Fixed failing tests
malhotrashivam Mar 15, 2024
48906dd
Merge branch 'main' into sm-pq-metadata
malhotrashivam Mar 15, 2024
819a1b9
Added python APIs and improved comments
malhotrashivam Mar 16, 2024
10b7e0d
Added more fixes for python
malhotrashivam Mar 19, 2024
9f7c55e
Review with Ryan and Chip
malhotrashivam Mar 21, 2024
b62abb7
Merge branch 'main' into sm-pq-metadata
malhotrashivam Mar 21, 2024
0d7c62e
Review with Chip part 2
malhotrashivam Mar 22, 2024
5a3de8e
Review with Chip and Jianfeng Part 3
malhotrashivam Mar 22, 2024
f68551f
Review with Chip and Jianfeng continued
malhotrashivam Mar 22, 2024
b11ebd1
Added new APIs for managing indexes
malhotrashivam Mar 25, 2024
3bdc92b
Trigger CI jobs
malhotrashivam Mar 25, 2024
3dd6097
Review with Ryan
malhotrashivam Mar 26, 2024
d8bc0a5
Added python support for writing indexes
malhotrashivam Mar 27, 2024
f700883
Reordered comments
malhotrashivam Mar 27, 2024
6246289
Added more details to python comments
malhotrashivam Mar 27, 2024
fa2b0c5
Moved from list to sequence
malhotrashivam Mar 27, 2024
7909ea0
Added fixes and tests for windows
malhotrashivam Apr 1, 2024
@@ -75,7 +75,7 @@ final class ColumnChunkReaderImpl implements ColumnChunkReader {
this.numRows = numRows;
this.version = version;
if (columnChunk.isSetFile_path() && FILE_URI_SCHEME.equals(rootURI.getScheme())) {
this.columnChunkURI = Path.of(rootURI).resolve(columnChunk.getFile_path()).toUri();
this.columnChunkURI = convertToURI(Path.of(rootURI).resolve(columnChunk.getFile_path()), false);
} else {
// TODO(deephaven-core#5066): Add support for reading metadata files from non-file URIs
this.columnChunkURI = rootURI;
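The hunk above resolves each column chunk's relative file_path against the root URI of the metadata file, and swaps Path.toUri() for Deephaven's convertToURI helper, presumably to keep URI formatting consistent across the codebase. A minimal, self-contained sketch of the resolution step, with a hypothetical directory layout:

```java
import java.net.URI;
import java.nio.file.Path;

public class ColumnChunkUriSketch {
    public static void main(String[] args) {
        // Hypothetical root directory that contained the _metadata file.
        final URI rootURI = Path.of("/data/mytable").toUri();
        // Relative path stored in a column chunk's file_path field.
        final String filePath = "PC=partition1/table.parquet";
        // Resolve the relative path against the root, then convert back to a URI.
        // The PR routes this last step through convertToURI(..., false); the plain
        // JDK call is used here so the sketch stays standalone.
        final URI columnChunkURI = Path.of(rootURI).resolve(filePath).toUri();
        System.out.println(columnChunkURI); // file:///data/mytable/PC=partition1/table.parquet
    }
}
```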
@@ -5,7 +5,6 @@

import org.apache.parquet.hadoop.metadata.ParquetMetadata;

import java.io.File;
import java.io.IOException;

/**
@@ -3,6 +3,7 @@
//
package io.deephaven.parquet.base;

import java.io.File;
import java.nio.charset.StandardCharsets;
import java.nio.file.Path;

@@ -33,9 +34,10 @@ public static boolean fileNameMatches(final Path path) {
}

/**
* @return the key value derived from the file name, used for storing each file's metadata in the metadata files.
* @return the key value derived from the file path, used for storing each file's metadata in the combined
* {@value #METADATA_FILE_NAME} and {@value #COMMON_METADATA_FILE_NAME} files.
*/
public static String getKeyForFile(final String fileName) {
return "deephaven_per_file_" + fileName;
public static String getPerFileMetadataKey(final String filePath) {
return "deephaven_per_file_" + filePath.replace(File.separatorChar, '_');
}
}
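Since the key now folds the whole relative path in, files with the same name under different partition directories no longer collide. A standalone sketch of the derivation, assuming a POSIX separator; the helper body mirrors the diff, while the demo class is hypothetical:

```java
import java.io.File;

public final class PerFileKeyDemo {
    // Mirrors getPerFileMetadataKey above: flatten path separators to
    // underscores so a nested relative path becomes a single flat key.
    static String getPerFileMetadataKey(final String filePath) {
        return "deephaven_per_file_" + filePath.replace(File.separatorChar, '_');
    }

    public static void main(String[] args) {
        // On POSIX this prints: deephaven_per_file_PC=partition1_table.parquet
        System.out.println(getPerFileMetadataKey("PC=partition1" + File.separator + "table.parquet"));
    }
}
```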
@@ -118,6 +118,11 @@ public static int getDefaultTargetPageSize() {

private static final boolean DEFAULT_GENERATE_METADATA_FILES = false;

static final String UUID_TOKEN = "{uuid}";
static final String PARTITIONS_TOKEN = "{partitions}";
static final String FILE_INDEX_TOKEN = "{i}";
private static final String DEFAULT_BASE_NAME_FOR_PARTITIONED_PARQUET_DATA = UUID_TOKEN;

public ParquetInstructions() {}

public final String getColumnNameFromParquetColumnNameOrDefault(final String parquetColumnName) {
@@ -173,6 +178,14 @@ public final String getColumnNameFromParquetColumnNameOrDefault(final String par
*/
public abstract boolean generateMetadataFiles();


/**
* @return the base name for partitioned parquet data. Check
* {@link Builder#setBaseNameForPartitionedParquetData(String) setBaseNameForPartitionedParquetData} for
* more details about different tokens that can be used in the base name.
*/
public abstract String baseNameForPartitionedParquetData();

@VisibleForTesting
public static boolean sameColumnNamesAndCodecMappings(final ParquetInstructions i1, final ParquetInstructions i2) {
if (i1 == EMPTY) {
@@ -252,6 +265,11 @@ public boolean isRefreshing() {
public boolean generateMetadataFiles() {
return DEFAULT_GENERATE_METADATA_FILES;
}

@Override
public String baseNameForPartitionedParquetData() {
return DEFAULT_BASE_NAME_FOR_PARTITIONED_PARQUET_DATA;
}
};

private static class ColumnInstructions {
@@ -321,6 +339,7 @@ private static final class ReadOnly extends ParquetInstructions {
private final boolean isRefreshing;
private final Object specialInstructions;
private final boolean generateMetadataFiles;
private final String baseNameForPartitionedParquetData;

private ReadOnly(
final KeyedObjectHashMap<String, ColumnInstructions> columnNameToInstructions,
@@ -332,7 +351,8 @@ private ReadOnly(
final int targetPageSize,
final boolean isRefreshing,
final Object specialInstructions,
final boolean generateMetadataFiles) {
final boolean generateMetadataFiles,
final String baseNameForPartitionedParquetData) {
this.columnNameToInstructions = columnNameToInstructions;
this.parquetColumnNameToInstructions = parquetColumnNameToColumnName;
this.compressionCodecName = compressionCodecName;
@@ -343,6 +363,7 @@ private ReadOnly(
this.isRefreshing = isRefreshing;
this.specialInstructions = specialInstructions;
this.generateMetadataFiles = generateMetadataFiles;
this.baseNameForPartitionedParquetData = baseNameForPartitionedParquetData;
}

private String getOrDefault(final String columnName, final String defaultValue,
@@ -441,6 +462,11 @@ public boolean generateMetadataFiles() {
return generateMetadataFiles;
}

@Override
public String baseNameForPartitionedParquetData() {
return baseNameForPartitionedParquetData;
}

KeyedObjectHashMap<String, ColumnInstructions> copyColumnNameToInstructions() {
// noinspection unchecked
return (columnNameToInstructions == null)
@@ -493,6 +519,7 @@ public static class Builder {
private boolean isRefreshing = DEFAULT_IS_REFRESHING;
private Object specialInstructions;
private boolean generateMetadataFiles = DEFAULT_GENERATE_METADATA_FILES;
private String baseNameForPartitionedParquetData = DEFAULT_BASE_NAME_FOR_PARTITIONED_PARQUET_DATA;

public Builder() {}

@@ -687,6 +714,30 @@ public Builder setGenerateMetadataFiles(final boolean generateMetadataFiles) {
return this;
}

/**
* Set the base name for partitioned parquet data. This is used to generate the file name for partitioned
* parquet files, and therefore, this parameter is only used when writing partitioned parquet data. Users can
* provide the following tokens to be replaced in the base name:
* <ul>
* <li>The token {@value #FILE_INDEX_TOKEN} will be replaced with an automatically incremented integer for files
* in a directory. For example, a base name of "table-{i}" will result in files named like
* "PC=partition1/table-0.parquet", "PC=partition1/table-1.parquet", etc., where PC is a partitioning
* column.</li>
* <li>The token {@value #UUID_TOKEN} will be replaced with a random UUID. For example, a base name of
* "table-{uuid}" will result in files named like
* "table-8e8ab6b2-62f2-40d1-8191-1c5b70c5f330.parquet.parquet".</li>
malhotrashivam marked this conversation as resolved.
Show resolved Hide resolved
* <li>The token {@value #PARTITIONS_TOKEN} will be replaced with an underscore-delimited, concatenated string
* of partition values. For example, a base name of "{partitions}-table" will result in files like
* "PC1=partition1/PC2=partitionA/PC1=partition1_PC2=partitionA-table.parquet", where "PC1" and "PC2" are
* partitioning columns.</li>
* </ul>
* The default value of this parameter is {@value #DEFAULT_BASE_NAME_FOR_PARTITIONED_PARQUET_DATA}.
*/
public Builder setBaseNameForPartitionedParquetData(final String baseNameForPartitionedParquetData) {
this.baseNameForPartitionedParquetData = baseNameForPartitionedParquetData;
return this;
}

public ParquetInstructions build() {
final KeyedObjectHashMap<String, ColumnInstructions> columnNameToInstructionsOut = columnNameToInstructions;
columnNameToInstructions = null;
@@ -695,7 +746,7 @@
parquetColumnNameToInstructions = null;
return new ReadOnly(columnNameToInstructionsOut, parquetColumnNameToColumnNameOut, compressionCodecName,
maximumDictionaryKeys, maximumDictionarySize, isLegacyParquet, targetPageSize, isRefreshing,
specialInstructions, generateMetadataFiles);
specialInstructions, generateMetadataFiles, baseNameForPartitionedParquetData);
}
}

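Taken together, the new builder knob and the existing metadata switch could be combined as below; a hypothetical usage sketch in which the two setters come from this diff and the values are made up for illustration:

```java
// Emit the combined metadata files and name partitioned files with both
// the partition values and a per-directory file index.
final ParquetInstructions instructions = new ParquetInstructions.Builder()
        .setGenerateMetadataFiles(true)
        .setBaseNameForPartitionedParquetData("{partitions}-table-{i}")
        .build();
// For partitioning columns PC1 and PC2, files would then land at paths like:
// PC1=p1/PC2=pA/PC1=p1_PC2=pA-table-0.parquet
```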
@@ -32,11 +32,12 @@
import static io.deephaven.base.FileUtils.convertToURI;
import static io.deephaven.parquet.base.ParquetUtils.MAGIC;
import static io.deephaven.parquet.base.ParquetUtils.METADATA_KEY;
import static io.deephaven.parquet.base.ParquetUtils.getKeyForFile;
import static io.deephaven.parquet.base.ParquetUtils.getPerFileMetadataKey;

/**
* Used to generate a combined {@value ParquetUtils#METADATA_FILE_NAME} and
* {@value ParquetUtils#COMMON_METADATA_FILE_NAME} file for provided Parquet files.
* {@value ParquetUtils#COMMON_METADATA_FILE_NAME} file for provided Parquet files. This class is stateful and therefore
* should not be used by multiple threads concurrently.
*/
final class ParquetMetadataFileWriterImpl implements ParquetMetadataFileWriter {

@@ -82,24 +83,6 @@ private static class ParquetFileMetadata {
}
this.metadataRootDirAbsPath = metadataRootDir.getAbsoluteFile().toPath();
final String metadataRootDirAbsPathString = metadataRootDirAbsPath.toString();
final File firstDestination = destinations[0];
for (int i = 0; i < destinations.length; i++) {
final File destination = destinations[i];
if (!destination.getAbsolutePath().startsWith(metadataRootDirAbsPathString)) {
throw new UncheckedDeephavenException("All destinations must be nested under the provided metadata root"
+ " directory, provided destination " + destination.getAbsolutePath() + " is not under " +
metadataRootDirAbsPathString);
}
// TODO How should I change the basename in the API for writeKeyValuePartitioned data for this check?
if (i > 0) {
// We use filename to generate the key for each file's metadata in the common metadata file, therefore
// all files must have unique names.
if (destination.getName().equals(firstDestination.getName())) {
throw new UncheckedDeephavenException("When generating common metadata for multiple parquet files, "
+ "all files must have unique names, but " + destination.getName() + " is repeated.");
}
}
}
for (final File destination : destinations) {
if (!destination.getAbsolutePath().startsWith(metadataRootDirAbsPathString)) {
throw new UncheckedDeephavenException("All destinations must be nested under the provided metadata root"
@@ -121,10 +104,7 @@ private static class ParquetFileMetadata {
}

/**
* Add parquet metadata for the provided parquet file the combined metadata file. We store deephaven-specific
* metadata for each file individually inside the key-value metadata of the combined metadata file, with keys being
* derived from the file names and values being the metadata for the file. Therefore, the provided parquet files
* must have unique names.
* Add parquet metadata for the provided parquet file to the combined metadata file.
*
* @param parquetFilePath The parquet file destination path
* @param metadata The parquet metadata
@@ -170,8 +150,9 @@ private void mergeMetadata() throws IOException {
for (final ParquetFileMetadata parquetFileMetadata : parquetFileMetadataList) {
final FileMetaData fileMetaData = parquetFileMetadata.metadata.getFileMetaData();
mergedSchema = mergeSchemaInto(fileMetaData.getSchema(), mergedSchema);
mergeKeyValueMetaData(parquetFileMetadata);
mergeBlocksInto(parquetFileMetadata, metadataRootDirAbsPath, mergedBlocks);
final String relativePath = getRelativePath(parquetFileMetadata.filePath, metadataRootDirAbsPath);
mergeKeyValueMetaData(parquetFileMetadata, relativePath);
mergeBlocksInto(parquetFileMetadata, relativePath, mergedBlocks);
mergedCreatedBy.add(fileMetaData.getCreatedBy());
}
if (mergedKeyValueMetaData.size() != parquetFileMetadataList.size()) {
@@ -212,7 +193,8 @@ private static MessageType mergeSchemaInto(final MessageType schema, final Messa
* well as accumulate the required fields to generate a common table info later once all files are processed.</li>
* </ul>
*/
private void mergeKeyValueMetaData(@NotNull final ParquetFileMetadata parquetFileMetadata) throws IOException {
private void mergeKeyValueMetaData(@NotNull final ParquetFileMetadata parquetFileMetadata,
@NotNull final String relativePath) throws IOException {
final Map<String, String> keyValueMetaData =
parquetFileMetadata.metadata.getFileMetaData().getKeyValueMetaData();
for (final Map.Entry<String, String> entry : keyValueMetaData.entrySet()) {
@@ -232,12 +214,11 @@
});
} else {
// Add a separate entry for each file
final String fileKey = getKeyForFile(new File(parquetFileMetadata.filePath).getName());
final String fileKey = getPerFileMetadataKey(relativePath);
// Assuming the keys are unique for each file because file names are unique, verified in the constructor
if (mergedKeyValueMetaData.containsKey(fileKey)) {
throw new UncheckedDeephavenException("Could not merge metadata for for file " +
parquetFileMetadata.filePath + " because has conflicting file name with another file. For "
+ " generating metadata files, file names should be unique");
throw new IllegalStateException("Could not merge metadata for file " +
parquetFileMetadata.filePath + " because it has conflicting file key: " + fileKey);
}
mergedKeyValueMetaData.put(fileKey, entry.getValue());

@@ -265,21 +246,24 @@ private void mergeKeyValueMetaData(@NotNull final ParquetFileMetadata parquetFil
}

private static void mergeBlocksInto(final ParquetFileMetadata parquetFileMetadata,
final Path metadataRootDirAbsPath, final Collection<BlockMetaData> mergedBlocks) {
final Path parquetFileAbsPath = new File(parquetFileMetadata.filePath).getAbsoluteFile().toPath();
String fileRelativePathString = metadataRootDirAbsPath.relativize(parquetFileAbsPath).toString();
// Remove leading slashes from the relative path
int pos = 0;
while (pos < fileRelativePathString.length() && fileRelativePathString.charAt(pos) == '/') {
pos++;
}
fileRelativePathString = fileRelativePathString.substring(pos);
final String fileRelativePathString, final Collection<BlockMetaData> mergedBlocks) {
for (final BlockMetaData block : parquetFileMetadata.metadata.getBlocks()) {
block.setPath(fileRelativePathString);
mergedBlocks.add(block);
}
}

private static String getRelativePath(final String parquetFilePath, final Path metadataRootDirAbsPath) {
final Path parquetFileAbsPath = new File(parquetFilePath).getAbsoluteFile().toPath();
final String relativePath = metadataRootDirAbsPath.relativize(parquetFileAbsPath).toString();
// Remove leading slashes from the relative path
int pos = 0;
while (pos < relativePath.length() && relativePath.charAt(pos) == '/') {
pos++;
}
return relativePath.substring(pos);
}
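A minimal sketch of the relativization above; on typical platforms Path.relativize does not produce a leading separator, so the stripping loop reads as a defensive guard. The paths below are hypothetical:

```java
import java.nio.file.Path;

public class RelativePathSketch {
    public static void main(String[] args) {
        // Metadata root directory and one partitioned parquet file under it.
        final Path metadataRootDirAbsPath = Path.of("/data/mytable").toAbsolutePath();
        final Path parquetFileAbsPath = Path.of("/data/mytable/PC=p1/table.parquet").toAbsolutePath();
        // Prints: PC=p1/table.parquet (no leading separator)
        System.out.println(metadataRootDirAbsPath.relativize(parquetFileAbsPath));
    }
}
```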

private void writeMetadataFile(final ParquetMetadata metadataFooter, final String outputPath) throws IOException {
final PositionedBufferedOutputStream metadataOutputStream =
new PositionedBufferedOutputStream(channelsProvider.getWriteChannel(outputPath, false),