diff --git a/python/cudf/cudf/_lib/utils.pyx b/python/cudf/cudf/_lib/utils.pyx index b6637e9df08..0afecb215e4 100644 --- a/python/cudf/cudf/_lib/utils.pyx +++ b/python/cudf/cudf/_lib/utils.pyx @@ -59,7 +59,7 @@ cpdef generate_pandas_metadata(table, index): types = [] index_levels = [] index_descriptors = [] - + columns_to_convert = list(table._columns) # Columns for name, col in table._data.items(): if cudf.get_option("mode.pandas_compatible"): @@ -90,6 +90,7 @@ cpdef generate_pandas_metadata(table, index): types.append(np_to_pa_dtype(col.dtype)) # Indexes + materialize_index = False if index is not False: for level, name in enumerate(table._index.names): if isinstance(table._index, cudf.core.multiindex.MultiIndex): @@ -107,22 +108,26 @@ cpdef generate_pandas_metadata(table, index): "step": table.index.step, } else: + materialize_index = True # When `index=True`, RangeIndex needs to be materialized. materialized_idx = cudf.Index(idx._values, name=idx.name) - descr = \ - _index_level_name( - index_name=materialized_idx.name, - level=level, - column_names=col_names - ) - index_levels.append(materialized_idx) - else: - descr = \ - _index_level_name( - index_name=idx.name, + descr = _index_level_name( + index_name=materialized_idx.name, level=level, column_names=col_names ) + index_levels.append(materialized_idx) + columns_to_convert.append(materialized_idx._values) + col_names.append(descr) + types.append(np_to_pa_dtype(materialized_idx.dtype)) + else: + descr = _index_level_name( + index_name=idx.name, + level=level, + column_names=col_names + ) + columns_to_convert.append(idx._values) + col_names.append(descr) if isinstance(idx.dtype, cudf.CategoricalDtype): raise ValueError( "'category' column dtypes are currently not " @@ -141,17 +146,16 @@ cpdef generate_pandas_metadata(table, index): types.append(np_to_pa_dtype(idx.dtype)) index_levels.append(idx) - col_names.append(name) index_descriptors.append(descr) + df_meta = table.head(0) + if materialize_index: + df_meta.index = df_meta.index._as_int_index() metadata = pa.pandas_compat.construct_metadata( - columns_to_convert=[ - col - for col in table._columns - ], + columns_to_convert=columns_to_convert, # It is OKAY to do `.head(0).to_pandas()` because # this method will extract `.columns` metadata only - df=table.head(0).to_pandas(), + df=df_meta.to_pandas(), column_names=col_names, index_levels=index_levels, index_descriptors=index_descriptors, diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index da0a969b70c..2a4f93c1716 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -5485,14 +5485,18 @@ def from_arrow(cls, table): return out @_cudf_nvtx_annotate - def to_arrow(self, preserve_index=True): + def to_arrow(self, preserve_index=None): """ Convert to a PyArrow Table. Parameters ---------- - preserve_index : bool, default True - whether index column and its meta data needs to be saved or not + preserve_index : bool, optional + whether index column and its meta data needs to be saved + or not. The default of None will store the index as a + column, except for a RangeIndex which is stored as + metadata only. Setting preserve_index to True will force + a RangeIndex to be materialized. Returns ------- @@ -5523,34 +5527,35 @@ def to_arrow(self, preserve_index=True): data = self.copy(deep=False) index_descr = [] - if preserve_index: - if isinstance(self.index, cudf.RangeIndex): + write_index = preserve_index is not False + keep_range_index = write_index and preserve_index is None + index = self.index + if write_index: + if isinstance(index, cudf.RangeIndex) and keep_range_index: descr = { "kind": "range", - "name": self.index.name, - "start": self.index._start, - "stop": self.index._stop, + "name": index.name, + "start": index._start, + "stop": index._stop, "step": 1, } else: - if isinstance(self.index, MultiIndex): + if isinstance(index, cudf.RangeIndex): + index = index._as_int_index() + index.name = "__index_level_0__" + if isinstance(index, MultiIndex): gen_names = tuple( - f"level_{i}" - for i, _ in enumerate(self.index._data.names) + f"level_{i}" for i, _ in enumerate(index._data.names) ) else: gen_names = ( - self.index.names - if self.index.name is not None - else ("index",) + index.names if index.name is not None else ("index",) ) - for gen_name, col_name in zip( - gen_names, self.index._data.names - ): + for gen_name, col_name in zip(gen_names, index._data.names): data._insert( data.shape[1], gen_name, - self.index._data[col_name], + index._data[col_name], ) descr = gen_names[0] index_descr.append(descr) @@ -5560,7 +5565,7 @@ def to_arrow(self, preserve_index=True): columns_to_convert=[self[col] for col in self._data.names], df=self, column_names=out.schema.names, - index_levels=[self.index], + index_levels=[index], index_descriptors=index_descr, preserve_index=preserve_index, types=out.schema.types, diff --git a/python/cudf/cudf/tests/dataframe/test_io_serialization.py b/python/cudf/cudf/tests/dataframe/test_io_serialization.py index 06777c8e6af..ad81609470c 100644 --- a/python/cudf/cudf/tests/dataframe/test_io_serialization.py +++ b/python/cudf/cudf/tests/dataframe/test_io_serialization.py @@ -1 +1,58 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. +# Copyright (c) 2023-2024, NVIDIA CORPORATION. +import contextlib +from io import BytesIO + +import pandas as pd +import pyarrow as pa +import pyarrow.parquet as pq +import pytest + +import cudf +from cudf.testing._utils import assert_eq + + +@pytest.mark.parametrize( + "index", + [range(1, 11), list(range(1, 11)), range(1, 11)[::2]], + ids=["RangeIndex", "IntIndex", "StridedRange"], +) +@pytest.mark.parametrize("write_index", [False, True, None]) +@pytest.mark.parametrize("empty", [False, True], ids=["nonempty", "empty"]) +def test_dataframe_parquet_roundtrip(index, write_index, empty): + if empty: + data = {} + else: + data = {"a": [i * 2 for i in index]} + df = cudf.DataFrame(data=data, index=index) + pf = pd.DataFrame(data=data, index=index) + gpu_buf = BytesIO() + cpu_buf = BytesIO() + + df.to_parquet(gpu_buf, index=write_index) + pf.to_parquet(cpu_buf, index=write_index) + gpu_table = pq.read_table(gpu_buf) + cpu_table = pq.read_table(cpu_buf) + metadata_equal = ( + gpu_table.schema.pandas_metadata == cpu_table.schema.pandas_metadata + ) + if empty and write_index is not False: + # https://github.com/rapidsai/cudf/issues/15372 + ctx = pytest.raises(AssertionError) + else: + ctx = contextlib.nullcontext() + with ctx: + assert metadata_equal + + gpu_read = cudf.read_parquet(gpu_buf) + cpu_read = cudf.read_parquet(cpu_buf) + with ctx: + assert_eq(gpu_read, cpu_read) + + +@pytest.mark.parametrize("preserve_index", [False, True, None]) +def test_dataframe_to_arrow_preserve_index(preserve_index): + df = cudf.DataFrame({"x": ["cat", "dog"] * 5}) + pf = df.to_pandas() + expect = pa.Table.from_pandas(pf, preserve_index=preserve_index).schema + got = df.to_arrow(preserve_index=preserve_index).schema + assert expect == got diff --git a/python/cudf/cudf/tests/test_parquet.py b/python/cudf/cudf/tests/test_parquet.py index 8b72fe84359..9ba71b28637 100644 --- a/python/cudf/cudf/tests/test_parquet.py +++ b/python/cudf/cudf/tests/test_parquet.py @@ -2442,9 +2442,17 @@ def test_parquet_index(pdf, index): run_parquet_index(pdf, index) -@pytest.mark.parametrize("index", [None, True]) -@pytest.mark.xfail( - reason="https://github.com/rapidsai/cudf/issues/12243", +@pytest.mark.parametrize( + "index", + [ + pytest.param( + None, + marks=pytest.mark.xfail( + reason="https://github.com/apache/arrow/issues/40743" + ), + ), + True, + ], ) def test_parquet_index_empty(index): pdf = pd.DataFrame(index=pd.RangeIndex(0, 10, 1))