From 37ea685b120fdbe0415e36969c2d5b756a1cdb91 Mon Sep 17 00:00:00 2001
From: Martin Durant
Date: Thu, 29 Aug 2024 16:16:05 -0400
Subject: [PATCH] str dtype compat

---
 fastparquet/test/test_api.py | 18 ++++++++++++++++++
 fastparquet/writer.py        |  6 +++---
 2 files changed, 21 insertions(+), 3 deletions(-)

diff --git a/fastparquet/test/test_api.py b/fastparquet/test/test_api.py
index b6123be7..a7f824e1 100644
--- a/fastparquet/test/test_api.py
+++ b/fastparquet/test/test_api.py
@@ -1564,3 +1564,21 @@ def test_writing_to_buffer_does_not_close():
     assert not buffer.closed
     parquet_file = ParquetFile(buffer)
     assert parquet_file.count() == 2
+
+
+@pytest.fixture()
+def pandas_string():
+    if pd.__version__.split(".") < ["3"]:
+        pytest.skip("'string' type coming in pandas 3.0.0")
+    original = pd.options.future.infer_string
+    pd.options.future.infer_string = True
+    yield
+    pd.options.future.infer_string = original
+
+
+def test_auto_string(tempdir, pandas_string):
+    fn = f"{tempdir}/test.parquet"
+    df = pd.DataFrame({"a": ["some", "strings"]})
+    df.to_parquet(fn, engine="fastparquet")
+
+
diff --git a/fastparquet/writer.py b/fastparquet/writer.py
index 86afad7c..a30d40db 100644
--- a/fastparquet/writer.py
+++ b/fastparquet/writer.py
@@ -214,7 +214,7 @@ def find_type(data, fixed_text=None, object_encoding=None, times='int64',
     elif dtype.kind == "m":
         type, converted_type, width = (parquet_thrift.Type.INT64,
                                        parquet_thrift.ConvertedType.TIME_MICROS, None)
-    elif "string" in str(dtype):
+    elif "str" in str(dtype):
         type, converted_type, width = (parquet_thrift.Type.BYTE_ARRAY,
                                        parquet_thrift.ConvertedType.UTF8,
                                        None)
@@ -283,7 +283,7 @@ def convert(data, se):
             raise ValueError('Error converting column "%s" to bytes using '
                              'encoding %s. Original error: '
                              '%s' % (data.name, ct, e))
-    elif str(dtype) == "string":
+    elif "str" in str(dtype):
         try:
             if converted_type == parquet_thrift.ConvertedType.UTF8:
                 # TODO: into bytes in one step
@@ -467,7 +467,7 @@ def _rows_per_page(data, selement, has_nulls=True, page_size=None):
         bytes_per_element = 4
     elif isinstance(data.dtype, BaseMaskedDtype) and data.dtype in pdoptional_to_numpy_typemap:
         bytes_per_element = np.dtype(pdoptional_to_numpy_typemap[data.dtype]).itemsize
-    elif data.dtype == "object" or str(data.dtype) == "string":
+    elif data.dtype == "object" or "str" in str(data.dtype):
         dd = data.iloc[:1000]
         d2 = dd[dd.notnull()]
         try: