Skip to content

Commit

Permalink
Closes #3339: Add multi-batch parquet read tests (#3350)
Browse files Browse the repository at this point in the history
* Closes #3339: Add multi-batch parquet read tests

This PR (closes #3339) adds testing for parquet reads of arrays and strings large enough to trigger more than one batch. We also add testing of a segarray of segstrings containing empty segs and empty strings

* add proto

---------

Co-authored-by: Tess Hayes <stress-tess@users.noreply.github.com>
  • Loading branch information
stress-tess and stress-tess committed Jun 20, 2024
1 parent ffe8484 commit cf6eeac
Show file tree
Hide file tree
Showing 2 changed files with 30 additions and 6 deletions.
18 changes: 15 additions & 3 deletions PROTO_tests/tests/io_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -527,11 +527,13 @@ def test_ipv4_columns(self, comp):
df["a"] = df["a"].export_uint()
assert ak.arange(10).to_list() == df["a"].to_list()

def test_empty_segs_segarray(self):
def test_multi_batch_reads(self):
# verify reproducer for #3074 is resolved
    # segarray w/ empty segs multi-batch pq reads

# bug seemed to consistently appear for val_sizes
# exceeding 700000, round up to ensure we'd hit it
# exceeding 700000 (likely due to this requiring more than one batch)
# we round up to ensure we'd hit it
val_size = 1000000

df_dict = dict()
Expand All @@ -544,7 +546,8 @@ def test_empty_segs_segarray(self):
rng.integers(0, 2**32, size=val_size, dtype="uint"),
rng.integers(0, 1, size=val_size, dtype="bool"),
rng.integers(-(2**32), 2**32, size=val_size, dtype="int"),
some_nans,
some_nans, # contains nans
ak.random_strings_uniform(0, 4, val_size, seed=seed), # contains empty strings
]

for vals in vals_list:
Expand All @@ -571,6 +574,15 @@ def test_empty_segs_segarray(self):
print("\nseed: ", seed)
assert_series_equal(pddf["rand"], to_pd, check_names=False, rtol=1e-05, atol=1e-08)

# test writing multi-batch non-segarrays
file_path = f"{tmp_dirname}/multi_batch_vals"
vals.to_parquet(file_path, dataset="my_vals")
read = ak.read_parquet(file_path + "*")["my_vals"]
if isinstance(vals, ak.pdarray) and vals.dtype == ak.float64:
assert np.allclose(read.to_list(), vals.to_list(), equal_nan=True)
else:
assert (read == vals).all()


class TestHDF5:
hdf_test_base_tmp = f"{os.getcwd()}/hdf_io_test"
Expand Down
18 changes: 15 additions & 3 deletions tests/parquet_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -584,11 +584,13 @@ def test_decimal_reads(self):
for i in range(1, 39):
self.assertTrue(np.allclose(ak_data["decCol" + str(i)].to_ndarray(), data[i - 1]))

def test_empty_segs_segarray(self):
def test_multi_batch_reads(self):
# verify reproducer for #3074 is resolved
        # segarray w/ empty segs multi-batch pq reads

# bug seemed to consistently appear for val_sizes
# exceeding 700000, round up to ensure we'd hit it
# exceeding 700000 (likely due to this requiring more than one batch)
# we round up to ensure we'd hit it
val_size = 1000000

df_dict = dict()
Expand All @@ -601,7 +603,8 @@ def test_empty_segs_segarray(self):
rng.integers(0, 2**32, size=val_size, dtype="uint"),
rng.integers(0, 1, size=val_size, dtype="bool"),
rng.integers(-(2**32), 2**32, size=val_size, dtype="int"),
some_nans,
some_nans, # contains nans
ak.random_strings_uniform(0, 4, val_size, seed=seed), # contains empty strings
]

for vals in vals_list:
Expand All @@ -628,6 +631,15 @@ def test_empty_segs_segarray(self):
print("\nseed: ", seed)
assert_series_equal(pddf["rand"], to_pd, check_names=False, rtol=1e-05, atol=1e-08)

# test writing multi-batch non-segarrays
file_path = f"{tmp_dirname}/multi_batch_vals"
vals.to_parquet(file_path, dataset="my_vals")
read = ak.read_parquet(file_path + "*")["my_vals"]
if isinstance(vals, ak.pdarray) and vals.dtype == ak.float64:
assert np.allclose(read.to_list(), vals.to_list(), equal_nan=True)
else:
assert (read == vals).all()

@pytest.mark.optional_parquet
def test_against_standard_files(self):
datadir = "resources/parquet-testing"
Expand Down

0 comments on commit cf6eeac

Please sign in to comment.