Skip to content

Commit

Permalink
str dtype compat
Browse files Browse the repository at this point in the history
  • Loading branch information
martindurant committed Aug 29, 2024
1 parent 987e443 commit 37ea685
Show file tree
Hide file tree
Showing 2 changed files with 21 additions and 3 deletions.
18 changes: 18 additions & 0 deletions fastparquet/test/test_api.py
Original file line number Diff line number Diff line change
Expand Up @@ -1564,3 +1564,21 @@ def test_writing_to_buffer_does_not_close():
assert not buffer.closed
parquet_file = ParquetFile(buffer)
assert parquet_file.count() == 2


@pytest.fixture()
def pandas_string():
    """Turn on pandas' ``future.infer_string`` option for a single test.

    The requesting test is skipped when the installed pandas major version
    is below 3.  Otherwise ``pd.options.future.infer_string`` is set to
    ``True`` while the test runs and reset to its previous value afterwards.
    """
    # Compare the major version as an integer: comparing the split string
    # parts lexicographically would rank e.g. "10" below "3".
    major_version = int(pd.__version__.split(".")[0])
    if major_version < 3:
        pytest.skip("'string' type coming in pandas 3.0.0")
    original = pd.options.future.infer_string
    pd.options.future.infer_string = True
    yield
    # Teardown: put the global option back so other tests are unaffected.
    pd.options.future.infer_string = original


def test_auto_string(tempdir, pandas_string):
    """Write a frame with an inferred string column via the fastparquet engine.

    Relies on the ``pandas_string`` fixture for the string-inference setting;
    the test passes as long as writing does not raise.
    """
    frame = pd.DataFrame({"a": ["some", "strings"]})
    target = f"{tempdir}/test.parquet"
    frame.to_parquet(target, engine="fastparquet")


6 changes: 3 additions & 3 deletions fastparquet/writer.py
Original file line number Diff line number Diff line change
Expand Up @@ -214,7 +214,7 @@ def find_type(data, fixed_text=None, object_encoding=None, times='int64',
elif dtype.kind == "m":
type, converted_type, width = (parquet_thrift.Type.INT64,
parquet_thrift.ConvertedType.TIME_MICROS, None)
elif "string" in str(dtype):
elif "str" in str(dtype):
type, converted_type, width = (parquet_thrift.Type.BYTE_ARRAY,
parquet_thrift.ConvertedType.UTF8,
None)
Expand Down Expand Up @@ -283,7 +283,7 @@ def convert(data, se):
raise ValueError('Error converting column "%s" to bytes using '
'encoding %s. Original error: '
'%s' % (data.name, ct, e))
elif str(dtype) == "string":
elif "str" in str(dtype):
try:
if converted_type == parquet_thrift.ConvertedType.UTF8:
# TODO: into bytes in one step
Expand Down Expand Up @@ -467,7 +467,7 @@ def _rows_per_page(data, selement, has_nulls=True, page_size=None):
bytes_per_element = 4
elif isinstance(data.dtype, BaseMaskedDtype) and data.dtype in pdoptional_to_numpy_typemap:
bytes_per_element = np.dtype(pdoptional_to_numpy_typemap[data.dtype]).itemsize
elif data.dtype == "object" or str(data.dtype) == "string":
elif data.dtype == "object" or "str" in str(data.dtype):
dd = data.iloc[:1000]
d2 = dd[dd.notnull()]
try:
Expand Down

0 comments on commit 37ea685

Please sign in to comment.