Skip to content

Commit

Permalink
ORC-634: Fix the json output for double NaN and infinite
Browse files Browse the repository at this point in the history
### What changes were proposed in this pull request?
The `meta` command of the tools module now supports outputting NaN and infinite values of the double type.

### Why are the changes needed?
When a double column in an ORC file contains NaN or infinite values, dumping the data does not work properly, and outputting the metadata as JSON also fails.

```java
java.lang.IllegalArgumentException: Numeric values must be finite, but was NaN
	at com.google.gson.stream.JsonWriter.value(JsonWriter.java:505)
	at org.apache.orc.tools.PrintData.printValue(PrintData.java:140)
	at org.apache.orc.tools.PrintData.printRow(PrintData.java:192)
	at org.apache.orc.tools.PrintData.printJsonData(PrintData.java:215)
	at org.apache.orc.tools.PrintData.main(PrintData.java:288)
	at org.apache.orc.tools.FileDump.main(FileDump.java:129)
	at org.apache.orc.tools.FileDump.main(FileDump.java:144)
```

```java
Exception in thread "main" java.lang.IllegalStateException: Nesting problem.
	at com.google.gson.stream.JsonWriter.beforeName(JsonWriter.java:648)
	at com.google.gson.stream.JsonWriter.writeDeferredName(JsonWriter.java:408)
	at com.google.gson.stream.JsonWriter.value(JsonWriter.java:424)
	at org.apache.orc.tools.JsonFileDump.printJsonMetaData(JsonFileDump.java:229)
	at org.apache.orc.tools.FileDump.main(FileDump.java:135)
	at org.apache.orc.tools.Driver.main(Driver.java:124)
```

### How was this patch tested?
Added unit tests (`testDoubleNaNAndInfinite` in `TestFileDump` and `TestJsonFileDump`).

### Was this patch authored or co-authored using generative AI tooling?
No

Closes #1770 from cxzl25/ORC-634.

Authored-by: sychen <sychen@ctrip.com>
Signed-off-by: Dongjoon Hyun <dongjoon@apache.org>
(cherry picked from commit dc2449b)
Signed-off-by: Dongjoon Hyun <dongjoon@apache.org>
  • Loading branch information
cxzl25 authored and dongjoon-hyun committed Jan 31, 2024
1 parent 6d2c476 commit 9bc4f9d
Show file tree
Hide file tree
Showing 4 changed files with 77 additions and 0 deletions.
1 change: 1 addition & 0 deletions java/tools/src/java/org/apache/orc/tools/JsonFileDump.java
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,7 @@ public static void printJsonMetaData(List<String> files,
}
StringWriter stringWriter = new StringWriter();
JsonWriter writer = new JsonWriter(stringWriter);
writer.setLenient(true);
if (prettyPrint) {
writer.setIndent(" ");
}
Expand Down
1 change: 1 addition & 0 deletions java/tools/src/java/org/apache/orc/tools/PrintData.java
Original file line number Diff line number Diff line change
Expand Up @@ -211,6 +211,7 @@ static void printJsonData(PrintStream printStream,
}
for (int r=0; r < batch.size; ++r) {
JsonWriter writer = new JsonWriter(out);
writer.setLenient(true);
printRow(writer, batch, schema, r);
out.write("\n");
out.flush();
Expand Down
36 changes: 36 additions & 0 deletions java/tools/src/test/org/apache/orc/tools/TestFileDump.java
Original file line number Diff line number Diff line change
Expand Up @@ -791,6 +791,42 @@ public void testRecover() throws Exception {
}
}

/**
 * Writes three double rows (NaN, positive infinity, and 12.34) to an ORC file
 * and checks that dumping the data with {@code -d} prints one JSON object per
 * row, with NaN and infinity emitted as the bare tokens {@code NaN} and
 * {@code Infinity}.
 */
@Test
public void testDoubleNaNAndInfinite() throws Exception {
  TypeDescription schema = TypeDescription.fromString("struct<x:double>");
  Writer writer = OrcFile.createWriter(testFilePath,
      OrcFile.writerOptions(conf)
          .fileSystem(fs)
          .setSchema(schema));
  VectorizedRowBatch batch = schema.createRowBatch();
  DoubleColumnVector x = (DoubleColumnVector) batch.cols[0];
  // Three rows: one non-finite NaN, one infinite value, one ordinary value.
  x.vector[batch.size++] = Double.NaN;
  x.vector[batch.size++] = Double.POSITIVE_INFINITY;
  x.vector[batch.size++] = 12.34D;
  writer.addRowBatch(batch);
  writer.close();

  assertEquals(3, writer.getNumberOfRows());

  PrintStream origOut = System.out;
  ByteArrayOutputStream myOut = new ByteArrayOutputStream();
  PrintStream capture = new PrintStream(myOut, false, StandardCharsets.UTF_8);

  // Replace stdout and run the command; always restore stdout, even when the
  // dump throws, so a failure here cannot swallow the output of later tests.
  System.setOut(capture);
  try {
    FileDump.main(new String[]{testFilePath.toString(), "-d"});
    capture.flush();
  } finally {
    System.setOut(origOut);
  }

  String[] lines = myOut.toString(StandardCharsets.UTF_8).split("\n");
  assertEquals("{\"x\":NaN}", lines[0]);
  assertEquals("{\"x\":Infinity}", lines[1]);
  assertEquals("{\"x\":12.34}", lines[2]);
}

private static boolean contentEquals(String filePath, String otherFilePath) throws IOException {
try (InputStream is = new BufferedInputStream(new FileInputStream(filePath));
InputStream otherIs = new BufferedInputStream(new FileInputStream(otherFilePath))) {
Expand Down
39 changes: 39 additions & 0 deletions java/tools/src/test/org/apache/orc/tools/TestJsonFileDump.java
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector;
import org.apache.hadoop.hive.ql.exec.vector.DoubleColumnVector;
import org.apache.hadoop.hive.ql.exec.vector.LongColumnVector;
import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;
import org.apache.orc.CompressionKind;
Expand All @@ -32,12 +33,16 @@
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Test;

import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.FileOutputStream;
import java.io.PrintStream;
import java.net.URL;
import java.nio.charset.StandardCharsets;
import java.util.Random;

import static org.junit.jupiter.api.Assertions.assertEquals;

public class TestJsonFileDump {

public static String getFileFromClasspath(String name) {
Expand Down Expand Up @@ -127,4 +132,38 @@ public void testJsonDump() throws Exception {

TestFileDump.checkOutput(outputFilename, workDir + File.separator + outputFilename);
}

/**
 * Writes three double rows (NaN, positive infinity, and 12.34) to an ORC file
 * and checks that the JSON metadata dump ({@code -j}) succeeds and reports
 * the column's min, max and sum statistics as the bare token {@code NaN}.
 */
@Test
public void testDoubleNaNAndInfinite() throws Exception {
  TypeDescription schema = TypeDescription.fromString("struct<x:double>");
  Writer writer = OrcFile.createWriter(testFilePath,
      OrcFile.writerOptions(conf)
          .fileSystem(fs)
          .setSchema(schema));
  VectorizedRowBatch batch = schema.createRowBatch();
  DoubleColumnVector x = (DoubleColumnVector) batch.cols[0];
  // Three rows: one non-finite NaN, one infinite value, one ordinary value.
  x.vector[batch.size++] = Double.NaN;
  x.vector[batch.size++] = Double.POSITIVE_INFINITY;
  x.vector[batch.size++] = 12.34D;
  writer.addRowBatch(batch);
  writer.close();

  assertEquals(3, writer.getNumberOfRows());

  PrintStream origOut = System.out;
  ByteArrayOutputStream myOut = new ByteArrayOutputStream();
  PrintStream capture = new PrintStream(myOut, false, StandardCharsets.UTF_8);

  // Replace stdout and run the command; always restore stdout, even when the
  // dump throws, so a failure here cannot swallow the output of later tests.
  System.setOut(capture);
  try {
    FileDump.main(new String[]{testFilePath.toString(), "-j"});
    capture.flush();
  } finally {
    System.setOut(origOut);
  }

  String[] lines = myOut.toString(StandardCharsets.UTF_8).split("\n");
  // The whole JSON document is expected on the first output line.
  assertEquals("{\"fileName\":\"TestFileDump.testDump.orc\",\"fileVersion\":\"0.12\",\"writerVersion\":\"ORC_14\",\"softwareVersion\":\"ORC Java unknown\",\"numberOfRows\":3,\"compression\":\"ZSTD\",\"compressionBufferSize\":262144,\"schemaString\":\"struct<x:double>\",\"schema\":{\"columnId\":0,\"columnType\":\"STRUCT\",\"children\":{\"x\":{\"columnId\":1,\"columnType\":\"DOUBLE\"}}},\"calendar\":\"Julian/Gregorian\",\"stripeStatistics\":[{\"stripeNumber\":1,\"columnStatistics\":[{\"columnId\":0,\"count\":3,\"hasNull\":false},{\"columnId\":1,\"count\":3,\"hasNull\":false,\"bytesOnDisk\":27,\"min\":NaN,\"max\":NaN,\"sum\":NaN,\"type\":\"DOUBLE\"}]}],\"fileStatistics\":[{\"columnId\":0,\"count\":3,\"hasNull\":false},{\"columnId\":1,\"count\":3,\"hasNull\":false,\"bytesOnDisk\":27,\"min\":NaN,\"max\":NaN,\"sum\":NaN,\"type\":\"DOUBLE\"}],\"stripes\":[{\"stripeNumber\":1,\"stripeInformation\":{\"offset\":3,\"indexLength\":55,\"dataLength\":27,\"footerLength\":35,\"rowCount\":3},\"streams\":[{\"columnId\":0,\"section\":\"ROW_INDEX\",\"startOffset\":3,\"length\":11},{\"columnId\":1,\"section\":\"ROW_INDEX\",\"startOffset\":14,\"length\":44},{\"columnId\":1,\"section\":\"DATA\",\"startOffset\":58,\"length\":27}],\"encodings\":[{\"columnId\":0,\"kind\":\"DIRECT\"},{\"columnId\":1,\"kind\":\"DIRECT\"}]}],\"fileLength\":286,\"rawDataSize\":36,\"paddingLength\":0,\"paddingRatio\":0.0,\"status\":\"OK\"}", lines[0]);
}
}

0 comments on commit 9bc4f9d

Please sign in to comment.