Skip to content

Commit

Permalink
Merge branch 'main' into fix-typelength
Browse files Browse the repository at this point in the history
  • Loading branch information
wilwade authored Jan 19, 2024
2 parents a9e0eff + 8d34ac1 commit 53412c2
Show file tree
Hide file tree
Showing 64 changed files with 116 additions and 18 deletions.
3 changes: 2 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -5,5 +5,6 @@ npm-debug.log
.nyc_output
dist
!test/test-files/*.parquet
!test/reference-test/files/*.parquet
examples/server/package-lock.json
test/browser/*.js
test/browser/*.js
2 changes: 1 addition & 1 deletion lib/codec/index.ts
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
// Namespace re-exports of the codec modules that live next to this file.
export * as PLAIN from './plain'
export * as RLE from './rle'
export * as PLAIN_DICTIONARY from './plain_dictionary'

// RLE_DICTIONARY is backed by the same module as PLAIN_DICTIONARY:
// both names re-export './plain_dictionary'.
export * as RLE_DICTIONARY from './plain_dictionary'

5 changes: 3 additions & 2 deletions lib/reader.ts
Original file line number Diff line number Diff line change
Expand Up @@ -672,8 +672,9 @@ export class ParquetEnvelopeReader {
num_values: metadata.num_values
});

if (metadata.dictionary_page_offset) {
const offset = +metadata.dictionary_page_offset;
// If this exists and is greater than zero then we need to have an offset
if (metadata.dictionary_page_offset && +metadata.dictionary_page_offset > 0) {
const offset: number = +metadata.dictionary_page_offset;
const size = Math.min(+this.fileSize - offset, this.default_dictionary_size);

await this.read(offset, size, colChunk.file_path).then(async (buffer: Buffer) => {
Expand Down
2 changes: 2 additions & 0 deletions lib/shred.ts
Original file line number Diff line number Diff line change
Expand Up @@ -227,6 +227,8 @@ function materializeRecordField(record: Record<string, unknown>, branch: Array<P
const node = branch[0];

if (dLevel < node.dLevelMax) {
// This ensures that nulls are correctly processed
record[node.name] = value;
return;
}

Expand Down
13 changes: 9 additions & 4 deletions test/integration.js
Original file line number Diff line number Diff line change
Expand Up @@ -303,7 +303,8 @@ async function readTestFile() {
{ quantity: [10n], warehouse: "A" },
{ quantity: [20n], warehouse: "B" }
],
colour: [ 'green', 'red' ]
colour: [ 'green', 'red' ],
meta_json: null,
});

assert.deepEqual(await cursor.next(), {
Expand All @@ -317,11 +318,13 @@ async function readTestFile() {
stock: [
{ quantity: [50n, 33n], warehouse: "X" }
],
colour: [ 'orange' ]
colour: [ 'orange' ],
meta_json: null,
});

assert.deepEqual(await cursor.next(), {
name: 'kiwi',
quantity: null,
price: 4.2,
day: new Date('2017-11-26'),
date: new Date(TEST_VTIME + 8000 * i),
Expand All @@ -337,11 +340,13 @@ async function readTestFile() {

assert.deepEqual(await cursor.next(), {
name: 'banana',
quantity: null,
price: 3.2,
day: new Date('2017-11-26'),
date: new Date(TEST_VTIME + 6000 * i),
finger: Buffer.from("FNORD"),
inter: { months: 42, days: 23, milliseconds: 777 },
stock: null,
colour: [ 'yellow' ],
meta_json: { shape: 'curved' }
});
Expand All @@ -366,8 +371,8 @@ async function readTestFile() {
for (let i = 0; i < TEST_NUM_ROWS; ++i) {
assert.deepEqual(await cursor.next(), { name: 'apples', quantity: 10n });
assert.deepEqual(await cursor.next(), { name: 'oranges', quantity: 20n });
assert.deepEqual(await cursor.next(), { name: 'kiwi' });
assert.deepEqual(await cursor.next(), { name: 'banana' });
assert.deepEqual(await cursor.next(), { name: 'kiwi', quantity: null });
assert.deepEqual(await cursor.next(), { name: 'banana', quantity: null });
}

assert.equal(await cursor.next(), null);
Expand Down
12 changes: 12 additions & 0 deletions test/reference-test/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
# Reference Tests

This is a set of tests that use the reference files from https://github.com/apache/parquet-testing/.

## Updating the Reference Files

These steps assume that the parquetjs checkout and the parquet-testing clone sit side by side in the same parent folder.

1. `git clone git@github.com:apache/parquet-testing.git`
1. `cd ../parquetjs`
1. `cp ../parquet-testing/data/*.parquet ./test/reference-test/files/`

Binary file not shown.
Binary file added test/reference-test/files/alltypes_plain.parquet
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file added test/reference-test/files/binary.parquet
Binary file not shown.
Binary file added test/reference-test/files/byte_array_decimal.parquet
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file added test/reference-test/files/int32_decimal.parquet
Binary file not shown.
Binary file not shown.
Binary file added test/reference-test/files/int64_decimal.parquet
Binary file not shown.
Binary file not shown.
Binary file added test/reference-test/files/list_columns.parquet
Binary file not shown.
Binary file added test/reference-test/files/lz4_raw_compressed.parquet
Binary file not shown.
Binary file not shown.
Binary file added test/reference-test/files/nan_in_stats.parquet
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file added test/reference-test/files/null_list.parquet
Binary file not shown.
Binary file not shown.
Binary file added test/reference-test/files/nulls.snappy.parquet
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file added test/reference-test/files/single_nan.parquet
Binary file not shown.
50 changes: 50 additions & 0 deletions test/reference-test/read-all.test.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
import { expect } from "chai";
import path from "node:path";
import fs from "node:fs";

import parquet from '../../parquet';

// Set to a single file name to run only that file's test, or leave as null
// to run every supported file. Example:
// const onlyTest: string | null = 'single_nan.parquet';
// The explicit annotation keeps the type `string | null` instead of the
// inferred literal `null`, so the comparison against file names is checked.
const onlyTest: string | null = null;

// Reference files that are skipped by this suite, each with the reason it
// cannot be read yet (or needs a dedicated test). Marked readonly because the
// list is fixed data and must not be mutated by the tests.
const unsupported: readonly string[] = [
  'byte_stream_split.zstd.parquet', // ZSTD unsupported
  'hadoop_lz4_compressed.parquet', // LZ4 unsupported
  'hadoop_lz4_compressed_larger.parquet', // LZ4 unsupported
  'lz4_raw_compressed.parquet', // LZ4_RAW unsupported
  'lz4_raw_compressed_larger.parquet', // LZ4_RAW unsupported
  'nested_structs.rust.parquet', // ZSTD unsupported
  'non_hadoop_lz4_compressed.parquet', // LZ4 unsupported — NOTE(review): comment previously said ZSTD; confirm codec
  'rle_boolean_encoding.parquet', // BUG?: https://github.com/LibertyDSNP/parquetjs/issues/113
  'datapage_v2.snappy.parquet', // DELTA_BINARY_PACKED unsupported
  'delta_binary_packed.parquet', // DELTA_BINARY_PACKED unsupported
  'delta_byte_array.parquet', // DELTA_BYTE_ARRAY unsupported
  'delta_encoding_optional_column.parquet', // DELTA_BINARY_PACKED unsupported
  'delta_encoding_required_column.parquet', // DELTA_BINARY_PACKED unsupported
  'delta_length_byte_array.parquet', // ZSTD unsupported, DELTA_BINARY_PACKED unsupported
  'float16_nonzeros_and_nans.parquet', // missing option: typeLength (required for FIXED_LEN_BYTE_ARRAY)
  'float16_zeros_and_nans.parquet', // missing option: typeLength (required for FIXED_LEN_BYTE_ARRAY)
  'large_string_map.brotli.parquet', // BUG?
];

/**
 * Smoke test: registers one test per `.parquet` file in `./files` (minus the
 * `unsupported` list, and optionally restricted to `onlyTest`). Each test
 * opens the file, checks that the schema has at least one field, reads the
 * first record and verifies that its keys match the schema's top-level
 * fields in order.
 */
describe("Read Test for all files", function () {
  const filesDir = path.join(__dirname, 'files');

  const listOfFiles = fs.readdirSync(filesDir)
    .filter(x => x.endsWith(".parquet") && !unsupported.includes(x));

  for (const filename of listOfFiles) {
    if (onlyTest && onlyTest !== filename) continue;
    it(`Reading ${filename}`, async function () {
      const reader = await parquet.ParquetReader.openFile(path.join(filesDir, filename));
      try {
        const schema = reader.getSchema();
        expect(schema.fieldList).to.have.length.greaterThan(0);
        const cursor = reader.getCursor();
        const record = (await cursor.next()) as Record<string, unknown>;
        // Fail with a readable message instead of a TypeError from
        // Object.keys when the file yields no first record.
        expect(record, `first record of ${filename}`).to.exist;
        // Expect the same keys as top-level fields
        const expectedRecordKeys = schema.fieldList.filter(x => x.path.length === 1).map(x => x.name);
        expect(Object.keys(record)).to.deep.equal(expectedRecordKeys);
      } finally {
        // Always release the underlying file handle, even when an
        // assertion above throws; otherwise every test leaks a descriptor.
        await reader.close();
      }
    });
  }
});
6 changes: 3 additions & 3 deletions test/shred.js
Original file line number Diff line number Diff line change
Expand Up @@ -498,11 +498,11 @@ describe('ParquetShredder', function() {

assert.deepEqual(
records[2],
{ name: "kiwi", price: 99.0 });
{ name: "kiwi", price: 99.0, stock: null });

assert.deepEqual(
records[3],
{ name: "banana", stock: [{ warehouse: "C" }], price: 42.0 });
{ name: "banana", stock: [{ quantity: null, warehouse: "C" }], price: 42.0 });
});

it('should materialize a static nested record with blank optional value', function() {
Expand Down Expand Up @@ -549,7 +549,7 @@ describe('ParquetShredder', function() {

assert.deepEqual(
records[0],
{ fruit: { name: "apple" } });
{ fruit: { name: "apple", colour: null } });

});

Expand Down
41 changes: 34 additions & 7 deletions test/test-files.js
Original file line number Diff line number Diff line change
Expand Up @@ -109,7 +109,7 @@ describe('test-files', function() {

it('test-converted-type-null.parquet loads', async function() {
const data = await readData('test-converted-type-null.parquet');
assert.deepEqual(data,[{foo: 'bar'},{}]);
assert.deepEqual(data,[{foo: 'bar'},{foo: null}]);
});

it('test-enum-type.parquet loads', async function() {
Expand All @@ -119,12 +119,20 @@ describe('test-files', function() {

it('test-null-dictionary.parquet loads', async function() {
const data = await readData('test-null-dictionary.parquet');
assert.deepEqual(data,[].concat.apply([{}],[...Array(3)].map( () => ([{foo: 'bar'}, {foo: 'baz'}]))));
assert.deepEqual(
data,
[
{ foo: null },
{ foo: 'bar' }, { foo: 'baz' },
{ foo: 'bar' }, { foo: 'baz' },
{ foo: 'bar' }, { foo: 'baz' }
]
);
});

it('test-null.parquet loads', async function() {
const data = await readData('test-null.parquet');
assert.deepEqual(data,[{foo: 1, bar: 2},{foo: 1}]);
assert.deepEqual(data,[{foo: 1, bar: 2},{foo: 1, bar: null}]);
});

it('test.parquet loads', async function() {
Expand All @@ -146,7 +154,7 @@ describe('test-files', function() {
const scale = schema.fields["value"].scale;
assert.equal(scale, 2);
const divider = 10 ** scale;

for (let i = 0; i < data.length; i++) {
const valueToMatch = i + 1;
// Decimal values whose primitive types are fixed length byte array will
Expand All @@ -160,11 +168,11 @@ describe('test-files', function() {
assert.equal(numericalValue, valueToMatch);
}
});

it('byte_array_decimal.parquet loads', async function () {
const schema = await readSchema('byte_array_decimal.parquet');
const data = await readData('byte_array_decimal.parquet');

const scale = schema.fields["value"].scale;
assert.equal(scale, 2);
const divider = 10 ** scale;
Expand All @@ -173,7 +181,7 @@ describe('test-files', function() {
const valueToMatch = i + 1;
// Decimal values whose primitive types are byte array will
// be returned as raw buffer values.
// For the test data, the actual decimal values and the corresponding buffer lengths
// For the test data, the actual decimal values and the corresponding buffer lengths
// are small enough so we can treat the buffer as a positive integer and compare the values.
// In reality, the user will need to use a more novel approach to parse the
// buffer to an object that can handle large fractional numbers.
Expand All @@ -188,4 +196,23 @@ describe('test-files', function() {
assert.equal(decimalValue, valueToMatch);
}
});

describe("RLE", function () {
  // Tracked in https://github.com/LibertyDSNP/parquetjs/issues/113
  it.skip('rle_boolean_encoding.parquet loads', async function() {
    const rows = await readData('rle/rle_boolean_encoding.parquet');
    const expectedFirst = { datatype_boolean: true };
    const expectedSecond = { datatype_boolean: false };
    assert.deepEqual(rows[0], expectedFirst);
    assert.deepEqual(rows[1], expectedSecond);
  });

  // Only the first row of the snappy-compressed dictionary file is checked.
  it('rle-dict-snappy-checksum.parquet loads', async function() {
    const rows = await readData('rle/rle-dict-snappy-checksum.parquet');
    const expectedFirst = {
      binary_field: "c95e263a-f5d4-401f-8107-5ca7146a1f98",
      long_field: "0",
    };
    assert.deepEqual(rows[0], expectedFirst);
  });

  // Only the first row of the uncompressed dictionary file is checked.
  it('rle-dict-uncompressed-corrupt-checksum.parquet loads', async function() {
    const rows = await readData('rle/rle-dict-uncompressed-corrupt-checksum.parquet');
    const expectedFirst = {
      binary_field: "6325c32b-f417-41aa-9e02-9b8601542aff",
      long_field: "0",
    };
    assert.deepEqual(rows[0], expectedFirst);
  });
});
});
Binary file added test/test-files/rle/rle-dict-snappy-checksum.parquet
Binary file not shown.
Binary file not shown.
Binary file added test/test-files/rle/rle_boolean_encoding.parquet
Binary file not shown.

0 comments on commit 53412c2

Please sign in to comment.