From 2be559acbc0a50695716df61c75f41a9f8891449 Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Mon, 30 Dec 2024 08:54:00 -0500 Subject: [PATCH 1/5] TST(string dtype): Resolve HDF5 xfails in test_put.py --- pandas/io/pytables.py | 16 ++++++++--- pandas/tests/io/pytables/test_put.py | 43 +++++++++++++++++++--------- 2 files changed, 42 insertions(+), 17 deletions(-) diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index b75dc6c3a43b4..8e70f50ce9934 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -86,6 +86,7 @@ PeriodArray, ) from pandas.core.arrays.datetimes import tz_to_dtype +from pandas.core.arrays.string_ import BaseStringArray import pandas.core.common as com from pandas.core.computation.pytables import ( PyTablesExpr, @@ -3185,6 +3186,8 @@ def write_array( # both self._filters and EA value = extract_array(obj, extract_numpy=True) + if isinstance(value, BaseStringArray): + value = value.to_numpy() if key in self.group: self._handle.remove_node(self.group, key) @@ -3363,7 +3366,11 @@ def read( columns = items[items.get_indexer(blk_items)] df = DataFrame(values.T, columns=columns, index=axes[1], copy=False) - if using_string_dtype() and is_string_array(values, skipna=True): + if ( + using_string_dtype() + and isinstance(values, np.ndarray) + and is_string_array(values, skipna=True) + ): df = df.astype(StringDtype(na_value=np.nan)) dfs.append(df) @@ -4737,9 +4744,10 @@ def read( df = DataFrame._from_arrays([values], columns=cols_, index=index_) if not (using_string_dtype() and values.dtype.kind == "O"): assert (df.dtypes == values.dtype).all(), (df.dtypes, values.dtype) - if using_string_dtype() and is_string_array( - values, # type: ignore[arg-type] - skipna=True, + if ( + using_string_dtype() + and isinstance(values, np.ndarray) + and is_string_array(values, skipna=True) ): df = df.astype(StringDtype(na_value=np.nan)) frames.append(df) diff --git a/pandas/tests/io/pytables/test_put.py b/pandas/tests/io/pytables/test_put.py index a4257b54dd6db..4b7548192196e 100644 --- a/pandas/tests/io/pytables/test_put.py +++ b/pandas/tests/io/pytables/test_put.py @@ -3,8 +3,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - from pandas._libs.tslibs import Timestamp import pandas as pd @@ -26,7 +24,6 @@ pytestmark = [ pytest.mark.single_cpu, - pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False), ] @@ -99,7 +96,7 @@ def test_api_default_format(tmp_path, setup_path): assert store.get_storer("df4").is_table -def test_put(setup_path): +def test_put(setup_path, using_infer_string): with ensure_clean_store(setup_path) as store: ts = Series( np.arange(10, dtype=np.float64), index=date_range("2020-01-01", periods=10) @@ -133,7 +130,11 @@ def test_put(setup_path): # overwrite table store.put("c", df[:10], format="table", append=False) - tm.assert_frame_equal(df[:10], store["c"]) + expected = df[:10] + if using_infer_string: + expected.columns = expected.columns.astype("str") + result = store["c"] + tm.assert_frame_equal(result, expected) def test_put_string_index(setup_path): @@ -162,7 +163,7 @@ def test_put_string_index(setup_path): tm.assert_frame_equal(store["b"], df) -def test_put_compression(setup_path): +def test_put_compression(setup_path, using_infer_string): with ensure_clean_store(setup_path) as store: df = DataFrame( np.random.default_rng(2).standard_normal((10, 4)), @@ -171,7 +172,11 @@ def test_put_compression(setup_path): ) store.put("c", df, format="table", complib="zlib") - tm.assert_frame_equal(store["c"], df) + expected = df + if using_infer_string: + expected.columns = expected.columns.astype("str") + result = store["c"] + tm.assert_frame_equal(result, expected) # can't compress if format='fixed' msg = "Compression not supported on Fixed format stores" @@ -180,7 +185,7 @@ def test_put_compression(setup_path): @td.skip_if_windows -def test_put_compression_blosc(setup_path): +def test_put_compression_blosc(setup_path, using_infer_string): df = DataFrame( np.random.default_rng(2).standard_normal((10, 4)), columns=Index(list("ABCD"), dtype=object), @@ -194,10 +199,14 @@ def test_put_compression_blosc(setup_path): store.put("b", df, format="fixed", complib="blosc") store.put("c", df, format="table", complib="blosc") - tm.assert_frame_equal(store["c"], df) + expected = df + if using_infer_string: + expected.columns = expected.columns.astype("str") + result = store["c"] + tm.assert_frame_equal(result, expected) -def test_put_mixed_type(setup_path, performance_warning): +def test_put_mixed_type(setup_path, performance_warning, using_infer_string): df = DataFrame( np.random.default_rng(2).standard_normal((10, 4)), columns=Index(list("ABCD"), dtype=object), @@ -223,8 +232,11 @@ def test_put_mixed_type(setup_path, performance_warning): with tm.assert_produces_warning(performance_warning): store.put("df", df) - expected = store.get("df") - tm.assert_frame_equal(expected, df) + expected = df + if using_infer_string: + expected.columns = expected.columns.astype("str") + result = store.get("df") + tm.assert_frame_equal(result, expected) @pytest.mark.parametrize("format", ["table", "fixed"]) @@ -253,7 +265,7 @@ def test_store_index_types(setup_path, format, index): tm.assert_frame_equal(df, store["df"]) -def test_column_multiindex(setup_path): +def test_column_multiindex(setup_path, using_infer_string): # GH 4710 # recreate multi-indexes properly @@ -264,6 +276,11 @@ def test_column_multiindex(setup_path): expected = df.set_axis(df.index.to_numpy()) with ensure_clean_store(setup_path) as store: + if using_infer_string: + msg = "Saving a MultiIndex with an extension dtype is not supported." + with pytest.raises(NotImplementedError, match=msg): + store.put("df", df) + return store.put("df", df) tm.assert_frame_equal( store["df"], expected, check_index_type=True, check_column_type=True From 4c0e07d21631530ed75dba1475c2dc62739994c8 Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Tue, 31 Dec 2024 12:26:43 -0500 Subject: [PATCH 2/5] Change tests --- pandas/tests/io/pytables/test_put.py | 24 ++++++++---------------- 1 file changed, 8 insertions(+), 16 deletions(-) diff --git a/pandas/tests/io/pytables/test_put.py b/pandas/tests/io/pytables/test_put.py index 4b7548192196e..ab27f2bcbe32c 100644 --- a/pandas/tests/io/pytables/test_put.py +++ b/pandas/tests/io/pytables/test_put.py @@ -96,14 +96,14 @@ def test_api_default_format(tmp_path, setup_path): assert store.get_storer("df4").is_table -def test_put(setup_path, using_infer_string): +def test_put(setup_path): with ensure_clean_store(setup_path) as store: ts = Series( np.arange(10, dtype=np.float64), index=date_range("2020-01-01", periods=10) ) df = DataFrame( np.random.default_rng(2).standard_normal((20, 4)), - columns=Index(list("ABCD"), dtype=object), + columns=Index(list("ABCD")), index=date_range("2000-01-01", periods=20, freq="B"), ) store["a"] = ts @@ -131,8 +131,6 @@ def test_put(setup_path, using_infer_string): # overwrite table store.put("c", df[:10], format="table", append=False) expected = df[:10] - if using_infer_string: - expected.columns = expected.columns.astype("str") result = store["c"] tm.assert_frame_equal(result, expected) @@ -163,18 +161,16 @@ def test_put_string_index(setup_path): tm.assert_frame_equal(store["b"], df) -def test_put_compression(setup_path, using_infer_string): +def test_put_compression(setup_path): with ensure_clean_store(setup_path) as store: df = DataFrame( np.random.default_rng(2).standard_normal((10, 4)), - columns=Index(list("ABCD"), dtype=object), + columns=Index(list("ABCD")), index=date_range("2000-01-01", periods=10, freq="B"), ) store.put("c", df, format="table", complib="zlib") expected = df - if using_infer_string: - expected.columns = expected.columns.astype("str") result = store["c"] tm.assert_frame_equal(result, expected) @@ -185,10 +181,10 @@ def test_put_compression(setup_path, using_infer_string): @td.skip_if_windows -def test_put_compression_blosc(setup_path, using_infer_string): +def test_put_compression_blosc(setup_path): df = DataFrame( np.random.default_rng(2).standard_normal((10, 4)), - columns=Index(list("ABCD"), dtype=object), + columns=Index(list("ABCD")), index=date_range("2000-01-01", periods=10, freq="B"), ) @@ -200,16 +196,14 @@ def test_put_compression_blosc(setup_path, using_infer_string): store.put("c", df, format="table", complib="blosc") expected = df - if using_infer_string: - expected.columns = expected.columns.astype("str") result = store["c"] tm.assert_frame_equal(result, expected) -def test_put_mixed_type(setup_path, performance_warning, using_infer_string): +def test_put_mixed_type(setup_path, performance_warning): df = DataFrame( np.random.default_rng(2).standard_normal((10, 4)), - columns=Index(list("ABCD"), dtype=object), + columns=Index(list("ABCD")), index=date_range("2000-01-01", periods=10, freq="B"), ) df["obj1"] = "foo" @@ -233,8 +227,6 @@ def test_put_mixed_type(setup_path, performance_warning, using_infer_string): store.put("df", df) expected = df - if using_infer_string: - expected.columns = expected.columns.astype("str") result = store.get("df") tm.assert_frame_equal(result, expected) From 0851ea8b74cf44047e4f6e8e98f90fea3a764e21 Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Thu, 2 Jan 2025 16:22:29 -0500 Subject: [PATCH 3/5] Add test for string dtype --- pandas/tests/io/pytables/test_put.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/pandas/tests/io/pytables/test_put.py b/pandas/tests/io/pytables/test_put.py index ab27f2bcbe32c..320bd3a80e0ce 100644 --- a/pandas/tests/io/pytables/test_put.py +++ b/pandas/tests/io/pytables/test_put.py @@ -200,14 +200,15 @@ def test_put_compression_blosc(setup_path): tm.assert_frame_equal(result, expected) -def test_put_mixed_type(setup_path, performance_warning): +def test_put_mixed_type(setup_path, performance_warning, using_infer_string): df = DataFrame( np.random.default_rng(2).standard_normal((10, 4)), columns=Index(list("ABCD")), index=date_range("2000-01-01", periods=10, freq="B"), ) df["obj1"] = "foo" - df["obj2"] = "bar" + df["obj2"] = [pd.NA] + 9 * ["bar"] + df["obj2"] = df["obj2"].astype("string") df["bool1"] = df["A"] > 0 df["bool2"] = df["B"] > 0 df["bool3"] = True @@ -226,7 +227,11 @@ def test_put_mixed_type(setup_path, performance_warning): with tm.assert_produces_warning(performance_warning): store.put("df", df) - expected = df + expected = df.copy() + if using_infer_string: + expected["obj2"] = expected["obj2"].astype("str") + else: + expected["obj2"] = expected["obj2"].astype("object") result = store.get("df") tm.assert_frame_equal(result, expected) From 846b2b532dfe81855fed2148c77b0d57727306d7 Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Thu, 2 Jan 2025 16:46:08 -0500 Subject: [PATCH 4/5] Refinements --- pandas/io/pytables.py | 6 +++++- pandas/tests/io/pytables/test_put.py | 14 ++++++++++++-- 2 files changed, 17 insertions(+), 3 deletions(-) diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index 8e70f50ce9934..7e227d37c74d6 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -3297,7 +3297,11 @@ def read( index = self.read_index("index", start=start, stop=stop) values = self.read_array("values", start=start, stop=stop) result = Series(values, index=index, name=self.name, copy=False) - if using_string_dtype() and is_string_array(values, skipna=True): + if ( + using_string_dtype() + and isinstance(values, np.ndarray) + and is_string_array(values, skipna=True) + ): result = result.astype(StringDtype(na_value=np.nan)) return result diff --git a/pandas/tests/io/pytables/test_put.py b/pandas/tests/io/pytables/test_put.py index 320bd3a80e0ce..c280fde17d84a 100644 --- a/pandas/tests/io/pytables/test_put.py +++ b/pandas/tests/io/pytables/test_put.py @@ -200,6 +200,16 @@ def test_put_compression_blosc(setup_path): tm.assert_frame_equal(result, expected) +def test_put_datetime_ser(setup_path, performance_warning, using_infer_string): + # https://github.com/pandas-dev/pandas/pull/60625 + ser = Series(3 * [Timestamp("20010102").as_unit("ns")]) + with ensure_clean_store(setup_path) as store: + store.put("ser", ser) + expected = ser.copy() + result = store.get("ser") + tm.assert_frame_equal(result, expected) + + def test_put_mixed_type(setup_path, performance_warning, using_infer_string): df = DataFrame( np.random.default_rng(2).standard_normal((10, 4)), @@ -207,8 +217,7 @@ def test_put_mixed_type(setup_path, performance_warning, using_infer_string): index=date_range("2000-01-01", periods=10, freq="B"), ) df["obj1"] = "foo" - df["obj2"] = [pd.NA] + 9 * ["bar"] - df["obj2"] = df["obj2"].astype("string") + df["obj2"] = pd.array([pd.NA] + 9 * ["bar"], dtype="string") df["bool1"] = df["A"] > 0 df["bool2"] = df["B"] > 0 df["bool3"] = True @@ -274,6 +283,7 @@ def test_column_multiindex(setup_path, using_infer_string): with ensure_clean_store(setup_path) as store: if using_infer_string: + # TODO(infer_string) make this work for string dtype msg = "Saving a MultiIndex with an extension dtype is not supported." with pytest.raises(NotImplementedError, match=msg): store.put("df", df) From 3e1503aeeb741a3f9e08552714e0fae3e766abd9 Mon Sep 17 00:00:00 2001 From: Richard Shadrach <45562402+rhshadrach@users.noreply.github.com> Date: Fri, 3 Jan 2025 16:28:57 -0500 Subject: [PATCH 5/5] Update pandas/tests/io/pytables/test_put.py Co-authored-by: Joris Van den Bossche --- pandas/tests/io/pytables/test_put.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/io/pytables/test_put.py b/pandas/tests/io/pytables/test_put.py index c280fde17d84a..9bfb10699c8b5 100644 --- a/pandas/tests/io/pytables/test_put.py +++ b/pandas/tests/io/pytables/test_put.py @@ -207,7 +207,7 @@ def test_put_datetime_ser(setup_path, performance_warning, using_infer_string): store.put("ser", ser) expected = ser.copy() result = store.get("ser") - tm.assert_frame_equal(result, expected) + tm.assert_series_equal(result, expected) def test_put_mixed_type(setup_path, performance_warning, using_infer_string):