From 9d7ee90e38103fef3dd1bd2f5eb0654b8bd3fdff Mon Sep 17 00:00:00 2001
From: Martin Durant
Date: Fri, 17 Nov 2023 12:46:42 -0500
Subject: [PATCH] persist dataframe attrs (#903)

* persist dataframe attrs

* later cython for pandas build
---
 .github/workflows/main.yaml     |  2 +-
 fastparquet/api.py              |  7 +++++++
 fastparquet/test/test_output.py | 10 ++++++++++
 fastparquet/writer.py           |  5 ++++-
 4 files changed, 22 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/main.yaml b/.github/workflows/main.yaml
index a80b925f..87eb224a 100644
--- a/.github/workflows/main.yaml
+++ b/.github/workflows/main.yaml
@@ -120,7 +120,7 @@ jobs:
       - name: pip-install
        shell: bash -l {0}
        run: |
-         pip install 'Cython<3'
+         pip install Cython
          pip install hypothesis
          pip install pytest-localserver pytest-xdist pytest-asyncio
          pip install -e . --no-deps   # Install fastparquet
diff --git a/fastparquet/api.py b/fastparquet/api.py
index 62a5c8e3..a5fe2858 100644
--- a/fastparquet/api.py
+++ b/fastparquet/api.py
@@ -380,6 +380,9 @@ def read_row_group_file(self, rg, columns, categories, index=None,
             size = rg.num_rows
             df, assign = self.pre_allocate(
                     size, columns, categories, index)
+            if "PANDAS_ATTRS" in self.key_value_metadata:
+                import json
+                df.attrs = json.loads(self.key_value_metadata["PANDAS_ATTRS"])
             ret = True
         f = infile or self.open(fn, mode='rb')
 
@@ -765,6 +768,10 @@ def to_pandas(self, columns=None, categories=None, filters=[],
         size = sum(rg.num_rows for rg in rgs)
         selected = [None] * len(rgs)  # just to fill zip, below
         df, views = self.pre_allocate(size, columns, categories, index, dtypes=dtypes)
+        if "PANDAS_ATTRS" in self.key_value_metadata:
+            import json
+            df.attrs = json.loads(self.key_value_metadata["PANDAS_ATTRS"])
+
         start = 0
         if self.file_scheme == 'simple':
             infile = self.open(self.fn, 'rb')
diff --git a/fastparquet/test/test_output.py b/fastparquet/test/test_output.py
index f5d87903..4cb8efcf 100644
--- a/fastparquet/test/test_output.py
+++ b/fastparquet/test/test_output.py
@@ -1206,3 +1206,13 @@ def test_nested_infer(tempdir):
     df.to_parquet(path=fn, engine="fastparquet")
     df2 = pd.read_parquet(fn, engine="fastparquet")
     assert df.to_dict() == df2.to_dict()
+
+
+def test_attrs_roundtrip(tempdir):
+    fn = os.path.join(tempdir, "out.parq")
+    attrs = {"oi": 5}
+    df = pd.DataFrame({"A": np.array([[1.1, 1.2], [], None], dtype=object)})
+    df.attrs = attrs
+    df.to_parquet(path=fn, engine="fastparquet")
+    df2 = pd.read_parquet(fn, engine="fastparquet")
+    assert df2.attrs == attrs
diff --git a/fastparquet/writer.py b/fastparquet/writer.py
index d439ad5a..ca743d81 100644
--- a/fastparquet/writer.py
+++ b/fastparquet/writer.py
@@ -1243,6 +1243,9 @@ def write(filename, data, row_group_offsets=None,
     --------
     >>> fastparquet.write('myfile.parquet', df)  # doctest: +SKIP
     """
+    custom_metadata = custom_metadata or {}
+    if getattr(data, "attrs", None):
+        custom_metadata["PANDAS_ATTRS"] = json.dumps(data.attrs)
     if file_scheme not in ('simple', 'hive', 'drill'):
         raise ValueError(
             'File scheme should be simple|hive|drill, not ' f'{file_scheme}.')
@@ -1305,7 +1308,7 @@ def write(filename, data, row_group_offsets=None,
                 object_encoding=object_encoding, times=times,
                 index_cols=index_cols, partition_cols=partition_on,
                 cols_dtype=cols_dtype)
-    if custom_metadata is not None:
+    if custom_metadata:
         kvm = fmd.key_value_metadata or []
         kvm.extend(
            [
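
Note: a minimal sketch of the round-trip this patch enables, assuming a fastparquet build that includes the change; the file name "example.parquet" and the sample attrs values are illustrative, not taken from the patch.

    import pandas as pd
    import fastparquet

    df = pd.DataFrame({"A": [1, 2, 3]})
    # attrs must be JSON-serialisable, since write() stores them via json.dumps
    df.attrs = {"source": "sensor-7", "version": 2}

    # write() serialises df.attrs into the footer under the "PANDAS_ATTRS" key
    fastparquet.write("example.parquet", df)

    # the raw JSON is visible in the file's key-value metadata
    pf = fastparquet.ParquetFile("example.parquet")
    print(pf.key_value_metadata["PANDAS_ATTRS"])  # {"source": "sensor-7", "version": 2}

    # to_pandas() restores attrs on the returned DataFrame
    df2 = pf.to_pandas()
    assert df2.attrs == df.attrs

Because the attrs travel through json.dumps/json.loads, only JSON-representable values round-trip exactly; for example, tuples in attrs would come back as lists.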