-
-
Notifications
You must be signed in to change notification settings - Fork 18.1k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
TST(string dtype): Resolve HDF5 xfails in test_put.py #60625
Changes from 2 commits
2be559a
4c0e07d
0851ea8
846b2b5
3e1503a
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -86,6 +86,7 @@ | |
PeriodArray, | ||
) | ||
from pandas.core.arrays.datetimes import tz_to_dtype | ||
from pandas.core.arrays.string_ import BaseStringArray | ||
import pandas.core.common as com | ||
from pandas.core.computation.pytables import ( | ||
PyTablesExpr, | ||
|
@@ -3185,6 +3186,8 @@ def write_array( | |
# both self._filters and EA | ||
|
||
value = extract_array(obj, extract_numpy=True) | ||
if isinstance(value, BaseStringArray): | ||
value = value.to_numpy() | ||
|
||
if key in self.group: | ||
self._handle.remove_node(self.group, key) | ||
|
@@ -3363,7 +3366,11 @@ def read( | |
|
||
columns = items[items.get_indexer(blk_items)] | ||
df = DataFrame(values.T, columns=columns, index=axes[1], copy=False) | ||
if using_string_dtype() and is_string_array(values, skipna=True): | ||
if ( | ||
using_string_dtype() | ||
and isinstance(values, np.ndarray) | ||
and is_string_array(values, skipna=True) | ||
): | ||
Comment on lines
-3366
to
+3377
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The same code pattern happens above in SeriesFixed There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Indeed, added a test |
||
df = df.astype(StringDtype(na_value=np.nan)) | ||
dfs.append(df) | ||
|
||
|
@@ -4737,9 +4744,10 @@ def read( | |
df = DataFrame._from_arrays([values], columns=cols_, index=index_) | ||
if not (using_string_dtype() and values.dtype.kind == "O"): | ||
assert (df.dtypes == values.dtype).all(), (df.dtypes, values.dtype) | ||
if using_string_dtype() and is_string_array( | ||
values, # type: ignore[arg-type] | ||
skipna=True, | ||
if ( | ||
using_string_dtype() | ||
and isinstance(values, np.ndarray) | ||
and is_string_array(values, skipna=True) | ||
): | ||
df = df.astype(StringDtype(na_value=np.nan)) | ||
frames.append(df) | ||
|
Original file line number | Diff line number | Diff line change | ||||||||||
---|---|---|---|---|---|---|---|---|---|---|---|---|
|
@@ -3,8 +3,6 @@ | |||||||||||
import numpy as np | ||||||||||||
import pytest | ||||||||||||
|
||||||||||||
from pandas._config import using_string_dtype | ||||||||||||
|
||||||||||||
from pandas._libs.tslibs import Timestamp | ||||||||||||
|
||||||||||||
import pandas as pd | ||||||||||||
|
@@ -26,7 +24,6 @@ | |||||||||||
|
||||||||||||
pytestmark = [ | ||||||||||||
pytest.mark.single_cpu, | ||||||||||||
pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False), | ||||||||||||
] | ||||||||||||
|
||||||||||||
|
||||||||||||
|
@@ -106,7 +103,7 @@ def test_put(setup_path): | |||||||||||
) | ||||||||||||
df = DataFrame( | ||||||||||||
np.random.default_rng(2).standard_normal((20, 4)), | ||||||||||||
columns=Index(list("ABCD"), dtype=object), | ||||||||||||
columns=Index(list("ABCD")), | ||||||||||||
index=date_range("2000-01-01", periods=20, freq="B"), | ||||||||||||
) | ||||||||||||
store["a"] = ts | ||||||||||||
|
@@ -133,7 +130,9 @@ def test_put(setup_path): | |||||||||||
|
||||||||||||
# overwrite table | ||||||||||||
store.put("c", df[:10], format="table", append=False) | ||||||||||||
tm.assert_frame_equal(df[:10], store["c"]) | ||||||||||||
expected = df[:10] | ||||||||||||
result = store["c"] | ||||||||||||
tm.assert_frame_equal(result, expected) | ||||||||||||
|
||||||||||||
|
||||||||||||
def test_put_string_index(setup_path): | ||||||||||||
|
@@ -166,12 +165,14 @@ def test_put_compression(setup_path): | |||||||||||
with ensure_clean_store(setup_path) as store: | ||||||||||||
df = DataFrame( | ||||||||||||
np.random.default_rng(2).standard_normal((10, 4)), | ||||||||||||
columns=Index(list("ABCD"), dtype=object), | ||||||||||||
columns=Index(list("ABCD")), | ||||||||||||
index=date_range("2000-01-01", periods=10, freq="B"), | ||||||||||||
) | ||||||||||||
|
||||||||||||
store.put("c", df, format="table", complib="zlib") | ||||||||||||
tm.assert_frame_equal(store["c"], df) | ||||||||||||
expected = df | ||||||||||||
result = store["c"] | ||||||||||||
tm.assert_frame_equal(result, expected) | ||||||||||||
|
||||||||||||
# can't compress if format='fixed' | ||||||||||||
msg = "Compression not supported on Fixed format stores" | ||||||||||||
|
@@ -183,7 +184,7 @@ def test_put_compression(setup_path): | |||||||||||
def test_put_compression_blosc(setup_path): | ||||||||||||
df = DataFrame( | ||||||||||||
np.random.default_rng(2).standard_normal((10, 4)), | ||||||||||||
columns=Index(list("ABCD"), dtype=object), | ||||||||||||
columns=Index(list("ABCD")), | ||||||||||||
index=date_range("2000-01-01", periods=10, freq="B"), | ||||||||||||
) | ||||||||||||
|
||||||||||||
|
@@ -194,13 +195,15 @@ def test_put_compression_blosc(setup_path): | |||||||||||
store.put("b", df, format="fixed", complib="blosc") | ||||||||||||
|
||||||||||||
store.put("c", df, format="table", complib="blosc") | ||||||||||||
tm.assert_frame_equal(store["c"], df) | ||||||||||||
expected = df | ||||||||||||
result = store["c"] | ||||||||||||
tm.assert_frame_equal(result, expected) | ||||||||||||
|
||||||||||||
|
||||||||||||
def test_put_mixed_type(setup_path, performance_warning): | ||||||||||||
df = DataFrame( | ||||||||||||
np.random.default_rng(2).standard_normal((10, 4)), | ||||||||||||
columns=Index(list("ABCD"), dtype=object), | ||||||||||||
columns=Index(list("ABCD")), | ||||||||||||
index=date_range("2000-01-01", periods=10, freq="B"), | ||||||||||||
) | ||||||||||||
df["obj1"] = "foo" | ||||||||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Suggested change
To explicitly test here with both object dtype and string dtype, and with strings with missing values (this is passing locally, so that should address Will's comment at https://github.com/pandas-dev/pandas/pull/60625/files#r1900476124 |
||||||||||||
|
@@ -223,8 +226,9 @@ def test_put_mixed_type(setup_path, performance_warning): | |||||||||||
with tm.assert_produces_warning(performance_warning): | ||||||||||||
store.put("df", df) | ||||||||||||
|
||||||||||||
expected = store.get("df") | ||||||||||||
tm.assert_frame_equal(expected, df) | ||||||||||||
expected = df | ||||||||||||
result = store.get("df") | ||||||||||||
tm.assert_frame_equal(result, expected) | ||||||||||||
|
||||||||||||
|
||||||||||||
@pytest.mark.parametrize("format", ["table", "fixed"]) | ||||||||||||
|
@@ -253,7 +257,7 @@ def test_store_index_types(setup_path, format, index): | |||||||||||
tm.assert_frame_equal(df, store["df"]) | ||||||||||||
|
||||||||||||
|
||||||||||||
def test_column_multiindex(setup_path): | ||||||||||||
def test_column_multiindex(setup_path, using_infer_string): | ||||||||||||
# GH 4710 | ||||||||||||
# recreate multi-indexes properly | ||||||||||||
|
||||||||||||
|
@@ -264,6 +268,11 @@ def test_column_multiindex(setup_path): | |||||||||||
expected = df.set_axis(df.index.to_numpy()) | ||||||||||||
|
||||||||||||
with ensure_clean_store(setup_path) as store: | ||||||||||||
if using_infer_string: | ||||||||||||
msg = "Saving a MultiIndex with an extension dtype is not supported." | ||||||||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Suggested change
(just as a reminder, because ideally we still solve this) |
||||||||||||
with pytest.raises(NotImplementedError, match=msg): | ||||||||||||
store.put("df", df) | ||||||||||||
return | ||||||||||||
store.put("df", df) | ||||||||||||
tm.assert_frame_equal( | ||||||||||||
store["df"], expected, check_index_type=True, check_column_type=True | ||||||||||||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Should this logic live in
extract_array
?There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The line from the docstring of
extract_array
states:and the
extract_numpy
argument:So I think no - I would expect that function to still return the ExtensionArray.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Do the tests currently cover the "string" data type going through pytables? This seems like it might mangle the NA markers?
Not super familiar with pytables so not saying this is right or wrong - just want to double check
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
There are certainly tests with dataframe roundtrip that contain string columns, not entirely sure if there are also tests where those columns have missing values, though
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
@jorisvandenbossche - I'm not seeing this particular line hit with string dtype. I added a test, including an NA value.
@WillAyd - the current behavior is to write out the underlying objects, and then infer upon loading. So if we start with string an
future.infer_string=False
, we get object. When that option is True, we get str.