From 37ea685b120fdbe0415e36969c2d5b756a1cdb91 Mon Sep 17 00:00:00 2001
From: Martin Durant <martin.durant@alumni.utoronto.ca>
Date: Thu, 29 Aug 2024 16:16:05 -0400
Subject: [PATCH] str dtype compat: recognise the pandas "str" dtype name in the writer

---
 fastparquet/test/test_api.py | 18 ++++++++++++++++++
 fastparquet/writer.py        |  6 +++---
 2 files changed, 21 insertions(+), 3 deletions(-)

diff --git a/fastparquet/test/test_api.py b/fastparquet/test/test_api.py
index b6123be7..a7f824e1 100644
--- a/fastparquet/test/test_api.py
+++ b/fastparquet/test/test_api.py
@@ -1564,3 +1564,21 @@ def test_writing_to_buffer_does_not_close():
     assert not buffer.closed
     parquet_file = ParquetFile(buffer)
     assert parquet_file.count() == 2
+
+
+@pytest.fixture()
+def pandas_string():
+    """Enable ``future.infer_string`` for one test; skip before pandas 3."""
+    # Compare the major version as an int: as strings, "10" sorts before "3".
+    if int(pd.__version__.split(".")[0]) < 3:
+        pytest.skip("'string' type coming in pandas 3.0.0")
+    with pd.option_context("future.infer_string", True):
+        yield
+
+
+def test_auto_string(tempdir, pandas_string):
+    """Writing a column with the inferred string dtype gives a valid file."""
+    fn = f"{tempdir}/test.parquet"
+    df = pd.DataFrame({"a": ["some", "strings"]})
+    df.to_parquet(fn, engine="fastparquet")
+    assert ParquetFile(fn).count() == 2
diff --git a/fastparquet/writer.py b/fastparquet/writer.py
index 86afad7c..a30d40db 100644
--- a/fastparquet/writer.py
+++ b/fastparquet/writer.py
@@ -214,7 +214,7 @@ def find_type(data, fixed_text=None, object_encoding=None, times='int64',
     elif dtype.kind == "m":
         type, converted_type, width = (parquet_thrift.Type.INT64,
                                        parquet_thrift.ConvertedType.TIME_MICROS, None)
-    elif "string" in str(dtype):
+    elif str(dtype) == "str" or "string" in str(dtype):  # '"str" in' would also hit "struct<...>"
         type, converted_type, width = (parquet_thrift.Type.BYTE_ARRAY,
                                        parquet_thrift.ConvertedType.UTF8,
                                        None)
@@ -283,7 +283,7 @@ def convert(data, se):
             raise ValueError('Error converting column "%s" to bytes using '
                              'encoding %s. Original error: '
                              '%s' % (data.name, ct, e))
-    elif str(dtype) == "string":
+    elif str(dtype) == "str" or "string" in str(dtype):  # '"str" in' would also hit "struct<...>"
         try:
             if converted_type == parquet_thrift.ConvertedType.UTF8:
                 # TODO: into bytes in one step
@@ -467,7 +467,7 @@ def _rows_per_page(data, selement, has_nulls=True, page_size=None):
         bytes_per_element = 4
     elif isinstance(data.dtype, BaseMaskedDtype) and data.dtype in pdoptional_to_numpy_typemap:
         bytes_per_element = np.dtype(pdoptional_to_numpy_typemap[data.dtype]).itemsize
-    elif data.dtype == "object" or str(data.dtype) == "string":
+    elif data.dtype == "object" or str(data.dtype) == "str" or "string" in str(data.dtype):  # not "struct<...>"
         dd = data.iloc[:1000]
         d2 = dd[dd.notnull()]
         try: