From 4b8e20fa17cf66e4ed8a667419f1e485c0ae653b Mon Sep 17 00:00:00 2001
From: Jianfeng Mao <4297243+jmao-denver@users.noreply.github.com>
Date: Tue, 10 Sep 2024 12:50:00 -0600
Subject: [PATCH] feat!: Add infer_objects argument to the to_table() function
 in the DH pandas module (#6024)

Fixes #6019

BREAKING CHANGE: by default, object-type columns in the given DataFrame are
now converted to more specific types before the DH table is created from it.
While this results in much more sensible column types in DH (e.g. String
instead of PyObject for an object-type column containing only strings), it
will break user code written to handle PyObject-type columns in the resulting
table.
---
 py/server/deephaven/pandas.py  | 23 +++++++++++++++++------
 py/server/tests/test_pandas.py | 25 +++++++++++++++++++++++--
 2 files changed, 40 insertions(+), 8 deletions(-)

diff --git a/py/server/deephaven/pandas.py b/py/server/deephaven/pandas.py
index 8a2be32c53a..c937bc47b94 100644
--- a/py/server/deephaven/pandas.py
+++ b/py/server/deephaven/pandas.py
@@ -201,12 +201,15 @@ def _map_na(array: [np.ndarray, pd.api.extensions.ExtensionArray]):
     return array


-def to_table(df: pd.DataFrame, cols: List[str] = None) -> Table:
+def to_table(df: pd.DataFrame, cols: List[str] = None, infer_objects: bool = True) -> Table:
     """Creates a new table from a pandas DataFrame.

     Args:
         df (DataFrame): the pandas DataFrame instance
         cols (List[str]): the dataframe column names, default is None which means including all columns in the DataFrame
+        infer_objects (bool): whether to infer the best possible types for columns of the generic 'object' type in the
+            DataFrame before creating the table, default is True. When True, the pandas convert_dtypes() method is
+            called to perform the conversion. Note that any conversion will make a copy of the data.

     Returns:
         a Deephaven table
@@ -222,11 +225,19 @@ def to_table(df: pd.DataFrame, cols: List[str] = None) -> Table:
         if diff_set:
             raise DHError(message=f"columns - {list(diff_set)} not found")

+    # if infer_objects is True, convert object dtypes to the best possible types supporting pd.NA
+    converted_df = df
+    if infer_objects:
+        converted_df = df[cols]
+        for col in cols:
+            if df.dtypes[col] == object:
+                converted_df[col] = df[col].convert_dtypes()
+
     # if any arrow backed column is present, create a pyarrow table first, then upload to DH, if error occurs, fall
     # back to the numpy-array based approach
-    if _is_dtype_backend_supported and any(isinstance(df[col].dtype, pd.ArrowDtype) for col in cols):
+    if _is_dtype_backend_supported and any(isinstance(converted_df[col].dtype, pd.ArrowDtype) for col in cols):
         try:
-            pa_table = pa.Table.from_pandas(df=df, columns=cols)
+            pa_table = pa.Table.from_pandas(df=converted_df, columns=cols)
             dh_table = arrow.to_table(pa_table)
             return dh_table
         except:
@@ -235,9 +246,9 @@ def to_table(df: pd.DataFrame, cols: List[str] = None) -> Table:
     try:
         input_cols = []
         for col in cols:
-            np_array = df.get(col).values
-            if isinstance(df.dtypes[col], pd.CategoricalDtype):
-                dtype = df.dtypes[col].categories.dtype
+            np_array = converted_df.get(col).values
+            if isinstance(converted_df.dtypes[col], pd.CategoricalDtype):
+                dtype = converted_df.dtypes[col].categories.dtype
             else:
                 dtype = np_array.dtype
             dh_dtype = dtypes.from_np_dtype(dtype)
diff --git a/py/server/tests/test_pandas.py b/py/server/tests/test_pandas.py
index d3f8312a3e8..04bbe942821 100644
--- a/py/server/tests/test_pandas.py
+++ b/py/server/tests/test_pandas.py
@@ -207,11 +207,11 @@ def test_to_table_nullable(self):
         self.assertIs(table.columns[6].data_type, dtypes.float32)
         self.assertIs(table.columns[7].data_type, dtypes.double)
         self.assertIs(table.columns[8].data_type, dtypes.string)
-        self.assertIs(table.columns[9].data_type, dtypes.PyObject)
+        self.assertIs(table.columns[9].data_type, dtypes.string)
         self.assertEqual(table.size, 3)

         table_string = table.to_string()
-        self.assertEqual(8, table_string.count("null"))
+        self.assertEqual(9, table_string.count("null"))
         self.assertEqual(2, table_string.count("NaN"))

     def test_arrow_backend(self):
@@ -343,6 +343,27 @@ def test_to_table_readonly(self):
         t = to_table(df)
         self.assert_table_equals(source, t)

+    def test_infer_objects(self):
+        df = pd.DataFrame({
+            "A": pd.Series([1, 2, 3], dtype=np.dtype("O")),
+            "B": pd.Series(["a", "b", "c"], dtype=np.dtype("O")),
+            "C": pd.Series([1.1, 2.2, 3.3], dtype=np.dtype("O")),
+            "D": pd.Series([True, False, True], dtype=np.dtype("O")),
+            "E": pd.Series([pd.Timestamp("2021-01-01"), pd.Timestamp("2021-01-02"), pd.Timestamp("2021-01-03")], dtype=np.dtype("O")),
+            "F": pd.Series([np.datetime64("2021-01-01"), np.datetime64("2021-01-02"), np.datetime64("2021-01-03")], dtype=np.dtype("O")),
+        })
+        self.assertTrue(all(df[col].dtype == object for col in list(df)))
+        t = to_table(df)
+        self.assertEqual(t.columns[0].data_type, dtypes.int64)
+        self.assertEqual(t.columns[1].data_type, dtypes.string)
+        self.assertEqual(t.columns[2].data_type, dtypes.double)
+        self.assertEqual(t.columns[3].data_type, dtypes.bool_)
+        self.assertEqual(t.columns[4].data_type, dtypes.Instant)
+        self.assertEqual(t.columns[5].data_type, dtypes.Instant)
+
+        t = to_table(df, infer_objects=False)
+        self.assertTrue(all([t.columns[i].data_type == dtypes.PyObject for i in range(6)]))
+

 if __name__ == '__main__':
     unittest.main()
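
A note on the breaking change described in the commit message: the snippet
below is a minimal usage sketch, not part of the patch, showing how the new
infer_objects default changes the resulting column types. It assumes a running
Deephaven server where deephaven.pandas is importable; the DataFrame contents
and variable names are illustrative only.

    import numpy as np
    import pandas as pd
    from deephaven import dtypes
    from deephaven.pandas import to_table

    # an object-dtype column that actually holds only strings (illustrative data)
    df = pd.DataFrame({"Sym": pd.Series(["AAPL", "GOOG", "MSFT"], dtype=np.dtype("O"))})

    # default (infer_objects=True): convert_dtypes() runs first, so the column becomes a DH String
    t_inferred = to_table(df)
    assert t_inferred.columns[0].data_type == dtypes.string

    # opting out (infer_objects=False) keeps the pre-change behavior: a PyObject column
    t_legacy = to_table(df, infer_objects=False)
    assert t_legacy.columns[0].data_type == dtypes.PyObject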