From 488184b33eb3d662e89ecc10f2c3e5fab6f34bf5 Mon Sep 17 00:00:00 2001 From: zhangyiqun Date: Tue, 29 Aug 2023 20:33:30 -0700 Subject: [PATCH] ORC-1489: Assign a writer id to CUDF ### What changes were proposed in this pull request? This pr is aimed at assigning a writer id to the CUDF. ### Why are the changes needed? This helps to locate the writer of a specific orc file, and it also helps the reader to do some special reads for files created by different writers. ### How was this patch tested? Added UT Closes #1594 from guiyanakuang/ORC-1489. Lead-authored-by: zhangyiqun Co-authored-by: Yiqun Zhang Signed-off-by: Dongjoon Hyun (cherry picked from commit 5d163d2f3a64272a8f6e7e839df1e062df399f0b) Signed-off-by: Dongjoon Hyun --- c++/include/orc/Common.hh | 1 + c++/src/Common.cc | 2 ++ c++/src/Reader.cc | 2 +- java/core/src/java/org/apache/orc/OrcFile.java | 4 ++++ java/core/src/java/org/apache/orc/OrcUtils.java | 3 +++ java/core/src/test/org/apache/orc/TestVectorOrcFile.java | 6 ++++++ proto/orc_proto.proto | 4 ++++ site/specification/ORCv1.md | 1 + site/specification/ORCv2.md | 1 + 9 files changed, 23 insertions(+), 1 deletion(-) diff --git a/c++/include/orc/Common.hh b/c++/include/orc/Common.hh index 5b580b891a..601dad569c 100644 --- a/c++/include/orc/Common.hh +++ b/c++/include/orc/Common.hh @@ -72,6 +72,7 @@ namespace orc { PRESTO_WRITER = 2, SCRITCHLEY_GO = 3, TRINO_WRITER = 4, + CUDF_WRITER = 5, UNKNOWN_WRITER = INT32_MAX }; diff --git a/c++/src/Common.cc b/c++/src/Common.cc index 477bfd3b4c..31a4f820a7 100644 --- a/c++/src/Common.cc +++ b/c++/src/Common.cc @@ -82,6 +82,8 @@ namespace orc { return "Scritchley Go"; case TRINO_WRITER: return "Trino"; + case CUDF_WRITER: + return "CUDF"; default: { std::ostringstream buffer; buffer << "Unknown(" << id << ")"; diff --git a/c++/src/Reader.cc b/c++/src/Reader.cc index 82001b9f72..ab5dc0eb28 100644 --- a/c++/src/Reader.cc +++ b/c++/src/Reader.cc @@ -584,7 +584,7 @@ namespace orc { WriterId ReaderImpl::getWriterId() 
const { if (footer->has_writer()) { uint32_t id = footer->writer(); - if (id > WriterId::TRINO_WRITER) { + if (id > WriterId::CUDF_WRITER) { return WriterId::UNKNOWN_WRITER; } else { return static_cast<WriterId>(id); diff --git a/java/core/src/java/org/apache/orc/OrcFile.java b/java/core/src/java/org/apache/orc/OrcFile.java index c888b93a12..579750ecb7 100644 --- a/java/core/src/java/org/apache/orc/OrcFile.java +++ b/java/core/src/java/org/apache/orc/OrcFile.java @@ -132,6 +132,7 @@ public enum WriterImplementation { PRESTO(2), // Presto writer SCRITCHLEY_GO(3), // Go writer from https://github.com/scritchley/orc TRINO(4), // Trino writer + CUDF(5), // CUDF writer UNKNOWN(Integer.MAX_VALUE); private final int id; @@ -189,6 +190,9 @@ public enum WriterVersion { // Trino Writer TRINO_ORIGINAL(WriterImplementation.TRINO, 6), + // CUDF Writer + CUDF_ORIGINAL(WriterImplementation.CUDF, 6), + // Don't use any magic numbers here except for the below: FUTURE(WriterImplementation.UNKNOWN, Integer.MAX_VALUE); // a version from a future writer diff --git a/java/core/src/java/org/apache/orc/OrcUtils.java b/java/core/src/java/org/apache/orc/OrcUtils.java index 358407fc98..1fd6862d66 100644 --- a/java/core/src/java/org/apache/orc/OrcUtils.java +++ b/java/core/src/java/org/apache/orc/OrcUtils.java @@ -418,6 +418,9 @@ public static String getSoftwareVersion(int writer, case 4: base = "Trino"; break; + case 5: + base = "CUDF"; + break; default: base = String.format("Unknown(%d)", writer); break; diff --git a/java/core/src/test/org/apache/orc/TestVectorOrcFile.java b/java/core/src/test/org/apache/orc/TestVectorOrcFile.java index 7c8d8cf1e1..f1aa38a2a1 100644 --- a/java/core/src/test/org/apache/orc/TestVectorOrcFile.java +++ b/java/core/src/test/org/apache/orc/TestVectorOrcFile.java @@ -3595,6 +3595,8 @@ public void testWriterVersion(Version fileFormat) throws Exception { OrcFile.WriterImplementation.from(2)); assertEquals(OrcFile.WriterImplementation.TRINO, 
OrcFile.WriterImplementation.from(4)); + assertEquals(OrcFile.WriterImplementation.CUDF, + OrcFile.WriterImplementation.from(5)); assertEquals(OrcFile.WriterImplementation.UNKNOWN, OrcFile.WriterImplementation.from(99)); @@ -3613,6 +3615,8 @@ public void testWriterVersion(Version fileFormat) throws Exception { OrcFile.WriterVersion.from(OrcFile.WriterImplementation.PRESTO, 6)); assertEquals(OrcFile.WriterVersion.TRINO_ORIGINAL, OrcFile.WriterVersion.from(OrcFile.WriterImplementation.TRINO, 6)); + assertEquals(OrcFile.WriterVersion.CUDF_ORIGINAL, + OrcFile.WriterVersion.from(OrcFile.WriterImplementation.CUDF, 6)); assertEquals(OrcFile.WriterVersion.FUTURE, OrcFile.WriterVersion.from(OrcFile.WriterImplementation.UNKNOWN, 0)); @@ -3631,6 +3635,8 @@ public void testWriterVersion(Version fileFormat) throws Exception { OrcFile.WriterVersion.PRESTO_ORIGINAL)); assertTrue(OrcFile.WriterVersion.HIVE_12055.includes( OrcFile.WriterVersion.TRINO_ORIGINAL)); + assertTrue(OrcFile.WriterVersion.HIVE_12055.includes( + OrcFile.WriterVersion.CUDF_ORIGINAL)); } @ParameterizedTest diff --git a/proto/orc_proto.proto b/proto/orc_proto.proto index ff05657a54..45d7d2a054 100644 --- a/proto/orc_proto.proto +++ b/proto/orc_proto.proto @@ -367,6 +367,7 @@ message Footer { // 2 = Presto // 3 = Scritchley Go from https://github.com/scritchley/orc // 4 = Trino + // 5 = CUDF optional uint32 writer = 9; // information about the encryption in this file @@ -432,6 +433,9 @@ message PostScript { // Version of the Trino writer: // 6 = original // + // Version of the CUDF writer: + // 6 = original + // optional uint32 writerVersion = 6; // the number of bytes in the encrypted stripe statistics diff --git a/site/specification/ORCv1.md b/site/specification/ORCv1.md index 472e11e5bd..b789faa746 100644 --- a/site/specification/ORCv1.md +++ b/site/specification/ORCv1.md @@ -136,6 +136,7 @@ message Footer { // 2 = Presto // 3 = Scritchley Go from https://github.com/scritchley/orc // 4 = Trino + // 5 = CUDF 
optional uint32 writer = 9; // information about the encryption in this file optional Encryption encryption = 10; diff --git a/site/specification/ORCv2.md b/site/specification/ORCv2.md index 703cea01d8..b98dea9538 100644 --- a/site/specification/ORCv2.md +++ b/site/specification/ORCv2.md @@ -156,6 +156,7 @@ message Footer { // 2 = Presto // 3 = Scritchley Go from https://github.com/scritchley/orc // 4 = Trino + // 5 = CUDF optional uint32 writer = 9; // information about the encryption in this file optional Encryption encryption = 10;