persist dataframe attrs (#903)
* persist dataframe attrs

* later cython for pandas build
martindurant authored Nov 17, 2023
1 parent 1f90f1a commit 9d7ee90
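
This commit persists pandas `DataFrame.attrs` across a fastparquet round-trip: on write, the attrs dict is JSON-encoded into the parquet key-value metadata under the key "PANDAS_ATTRS"; on read, that entry is decoded back onto the resulting frame. A minimal round-trip sketch mirroring the new test below (file name and attrs contents are illustrative):

```python
import pandas as pd

# Attach arbitrary JSON-serializable metadata to the frame.
df = pd.DataFrame({"A": [1, 2, 3]})
df.attrs = {"source": "sensor-7"}  # illustrative value

# Written via fastparquet, the attrs travel in the file's key-value metadata...
df.to_parquet("example.parq", engine="fastparquet")

# ...and are restored when the file is read back with the fastparquet engine.
df2 = pd.read_parquet("example.parq", engine="fastparquet")
assert df2.attrs == {"source": "sensor-7"}
```
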
Showing 4 changed files with 22 additions and 2 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/main.yaml
@@ -120,7 +120,7 @@ jobs:
       - name: pip-install
         shell: bash -l {0}
         run: |
-          pip install 'Cython<3'
+          pip install Cython
           pip install hypothesis
           pip install pytest-localserver pytest-xdist pytest-asyncio
           pip install -e . --no-deps  # Install fastparquet

7 changes: 7 additions & 0 deletions fastparquet/api.py
@@ -380,6 +380,9 @@ def read_row_group_file(self, rg, columns, categories, index=None,
         size = rg.num_rows
         df, assign = self.pre_allocate(
                 size, columns, categories, index)
+        if "PANDAS_ATTRS" in self.key_value_metadata:
+            import json
+            df.attrs = json.loads(self.key_value_metadata["PANDAS_ATTRS"])
         ret = True
         f = infile or self.open(fn, mode='rb')

@@ -765,6 +768,10 @@ def to_pandas(self, columns=None, categories=None, filters=[],
         size = sum(rg.num_rows for rg in rgs)
         selected = [None] * len(rgs)  # just to fill zip, below
         df, views = self.pre_allocate(size, columns, categories, index, dtypes=dtypes)
+        if "PANDAS_ATTRS" in self.key_value_metadata:
+            import json
+            df.attrs = json.loads(self.key_value_metadata["PANDAS_ATTRS"])
+
         start = 0
         if self.file_scheme == 'simple':
             infile = self.open(self.fn, 'rb')
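
On the read side, both `read_row_group_file` and `to_pandas` now check the file's key-value metadata for "PANDAS_ATTRS" and, when present, JSON-decode it onto the freshly allocated dataframe. The same entry can be inspected directly; a hedged sketch, assuming `ParquetFile.key_value_metadata` behaves like the dict the diff indexes into (file name illustrative):

```python
import json
import fastparquet

pf = fastparquet.ParquetFile("example.parq")
# key_value_metadata holds the file-level metadata entries, including PANDAS_ATTRS.
raw = pf.key_value_metadata.get("PANDAS_ATTRS")
attrs = json.loads(raw) if raw else {}
print(attrs)  # e.g. {"source": "sensor-7"}
```
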
10 changes: 10 additions & 0 deletions fastparquet/test/test_output.py
@@ -1206,3 +1206,13 @@ def test_nested_infer(tempdir):
     df.to_parquet(path=fn, engine="fastparquet")
     df2 = pd.read_parquet(fn, engine="fastparquet")
     assert df.to_dict() == df2.to_dict()
+
+
+def test_attrs_roundtrip(tempdir):
+    fn = os.path.join(tempdir, "out.parq")
+    attrs = {"oi": 5}
+    df = pd.DataFrame({"A": np.array([[1.1, 1.2], [], None], dtype=object)})
+    df.attrs = attrs
+    df.to_parquet(path=fn, engine="fastparquet")
+    df2 = pd.read_parquet(fn, engine="fastparquet")
+    assert df2.attrs == attrs

5 changes: 4 additions & 1 deletion fastparquet/writer.py
@@ -1243,6 +1243,9 @@ def write(filename, data, row_group_offsets=None,
     --------
     >>> fastparquet.write('myfile.parquet', df)  # doctest: +SKIP
     """
+    custom_metadata = custom_metadata or {}
+    if getattr(data, "attrs", None):
+        custom_metadata["PANDAS_ATTRS"] = json.dumps(data.attrs)
     if file_scheme not in ('simple', 'hive', 'drill'):
         raise ValueError('File scheme should be simple|hive|drill, not '
                          f'{file_scheme}.')
@@ -1305,7 +1308,7 @@ def write(filename, data, row_group_offsets=None,
                           object_encoding=object_encoding,
                           times=times, index_cols=index_cols,
                           partition_cols=partition_on, cols_dtype=cols_dtype)
-    if custom_metadata is not None:
+    if custom_metadata:
         kvm = fmd.key_value_metadata or []
         kvm.extend(
             [
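
On the write side, `write()` now always starts from a dict for `custom_metadata` and, if the input frame carries non-empty `attrs`, adds a JSON-encoded "PANDAS_ATTRS" entry; the later guard accordingly changes from `is not None` to a truthiness test so an empty dict writes nothing extra. User-supplied metadata is still passed through alongside the generated entry; a sketch under that assumption (the extra metadata key is illustrative):

```python
import fastparquet
import pandas as pd

df = pd.DataFrame({"A": [1.0, 2.0]})
df.attrs = {"units": "metres"}  # persisted automatically as PANDAS_ATTRS

# Explicit custom_metadata is written alongside the PANDAS_ATTRS entry derived from df.attrs.
fastparquet.write("example.parq", df, custom_metadata={"pipeline": "v2"})
```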
