From 5c00e790aec84781a7ae9230ac03c5e87584e409 Mon Sep 17 00:00:00 2001
From: Lawrence Mitchell
Date: Fri, 22 Mar 2024 12:37:18 +0000
Subject: [PATCH] Add test of #12243

---
 .../tests/dataframe/test_io_serialization.py | 38 ++++++++++++++++++-
 python/cudf/cudf/tests/test_parquet.py       | 14 +++++--
 2 files changed, 48 insertions(+), 4 deletions(-)

diff --git a/python/cudf/cudf/tests/dataframe/test_io_serialization.py b/python/cudf/cudf/tests/dataframe/test_io_serialization.py
index 06777c8e6af..911a7f9e865 100644
--- a/python/cudf/cudf/tests/dataframe/test_io_serialization.py
+++ b/python/cudf/cudf/tests/dataframe/test_io_serialization.py
@@ -1 +1,37 @@
-# Copyright (c) 2023, NVIDIA CORPORATION.
+# Copyright (c) 2023-2024, NVIDIA CORPORATION.
+from io import BytesIO
+
+import pandas as pd
+import pyarrow.parquet as pq
+import pytest
+
+import cudf
+from cudf.testing._utils import assert_eq
+
+
+@pytest.mark.parametrize(
+    "index",
+    [range(1, 11), list(range(1, 11)), range(1, 11)[::2]],
+    ids=["RangeIndex", "IntIndex", "StridedRange"],
+)
+@pytest.mark.parametrize("write_index", [False, True, None])
+@pytest.mark.parametrize("empty", [False, True], ids=["nonempty", "empty"])
+def test_dataframe_parquet_roundtrip(index, write_index, empty):
+    if empty:
+        data = {}
+    else:
+        data = {"a": [i * 2 for i in index]}
+    df = cudf.DataFrame(data=data, index=index)
+    pf = pd.DataFrame(data=data, index=index)
+    gpu_buf = BytesIO()
+    cpu_buf = BytesIO()
+
+    df.to_parquet(gpu_buf, index=write_index)
+    pf.to_parquet(cpu_buf, index=write_index)
+    gpu_table = pq.read_table(gpu_buf)
+    cpu_table = pq.read_table(cpu_buf)
+    assert gpu_table.schema.pandas_metadata == cpu_table.schema.pandas_metadata
+
+    gpu_read = cudf.read_parquet(gpu_buf)
+    cpu_read = cudf.read_parquet(cpu_buf)
+    assert_eq(gpu_read, cpu_read)
diff --git a/python/cudf/cudf/tests/test_parquet.py b/python/cudf/cudf/tests/test_parquet.py
index 8b72fe84359..9ba71b28637 100644
--- a/python/cudf/cudf/tests/test_parquet.py
+++ b/python/cudf/cudf/tests/test_parquet.py
@@ -2442,9 +2442,17 @@ def test_parquet_index(pdf, index):
     run_parquet_index(pdf, index)
 
 
-@pytest.mark.parametrize("index", [None, True])
-@pytest.mark.xfail(
-    reason="https://github.com/rapidsai/cudf/issues/12243",
+@pytest.mark.parametrize(
+    "index",
+    [
+        pytest.param(
+            None,
+            marks=pytest.mark.xfail(
+                reason="https://github.com/apache/arrow/issues/40743"
+            ),
+        ),
+        True,
+    ],
 )
 def test_parquet_index_empty(index):
     pdf = pd.DataFrame(index=pd.RangeIndex(0, 10, 1))