diff --git a/py/server/deephaven/pandas.py b/py/server/deephaven/pandas.py
index 8a2be32c53a..c937bc47b94 100644
--- a/py/server/deephaven/pandas.py
+++ b/py/server/deephaven/pandas.py
@@ -201,12 +201,16 @@ def _map_na(array: [np.ndarray, pd.api.extensions.ExtensionArray]):
     return array
 
 
-def to_table(df: pd.DataFrame, cols: List[str] = None) -> Table:
+def to_table(df: pd.DataFrame, cols: List[str] = None, infer_objects: bool = True) -> Table:
     """Creates a new table from a pandas DataFrame.
 
     Args:
         df (DataFrame): the pandas DataFrame instance
         cols (List[str]): the dataframe column names, default is None which means including all columns in the DataFrame
+        infer_objects (bool): whether to infer the best possible types for columns of the generic 'object' type in the
+            DataFrame before creating the table, default is True. When True, the pandas convert_dtypes() method is
+            called on each 'object' column. Note that the conversion works on a copy of the selected columns, which
+            is only made when at least one 'object' column is present.
 
     Returns:
         a Deephaven table
@@ -222,11 +226,22 @@ def to_table(df: pd.DataFrame, cols: List[str] = None) -> Table:
         if diff_set:
             raise DHError(message=f"columns - {list(diff_set)} not found")
 
+    # if infer_objects is True, convert object dtypes to the best possible types supporting pd.NA
+    converted_df = df
+    if infer_objects:
+        object_cols = [col for col in cols if df.dtypes[col] == object]
+        if object_cols:
+            # df[cols] can be flagged by pandas as a slice of df; take an explicit copy so that the column
+            # assignments below do not raise SettingWithCopyWarning and never write into the caller's DataFrame
+            converted_df = df[cols].copy()
+            for col in object_cols:
+                converted_df[col] = df[col].convert_dtypes()
+
     # if any arrow backed column is present, create a pyarrow table first, then upload to DH, if error occurs, fall
     # back to the numpy-array based approach
-    if _is_dtype_backend_supported and any(isinstance(df[col].dtype, pd.ArrowDtype) for col in cols):
+    if _is_dtype_backend_supported and any(isinstance(converted_df[col].dtype, pd.ArrowDtype) for col in cols):
         try:
-            pa_table = pa.Table.from_pandas(df=df, columns=cols)
+            pa_table = pa.Table.from_pandas(df=converted_df, columns=cols)
             dh_table = arrow.to_table(pa_table)
             return dh_table
         except:
@@ -235,9 +250,9 @@ def to_table(df: pd.DataFrame, cols: List[str] = None) -> Table:
     try:
         input_cols = []
         for col in cols:
-            np_array = df.get(col).values
-            if isinstance(df.dtypes[col], pd.CategoricalDtype):
-                dtype = df.dtypes[col].categories.dtype
+            np_array = converted_df.get(col).values
+            if isinstance(converted_df.dtypes[col], pd.CategoricalDtype):
+                dtype = converted_df.dtypes[col].categories.dtype
             else:
                 dtype = np_array.dtype
             dh_dtype = dtypes.from_np_dtype(dtype)
diff --git a/py/server/tests/test_pandas.py b/py/server/tests/test_pandas.py
index d3f8312a3e8..04bbe942821 100644
--- a/py/server/tests/test_pandas.py
+++ b/py/server/tests/test_pandas.py
@@ -207,11 +207,11 @@ def test_to_table_nullable(self):
         self.assertIs(table.columns[6].data_type, dtypes.float32)
         self.assertIs(table.columns[7].data_type, dtypes.double)
         self.assertIs(table.columns[8].data_type, dtypes.string)
-        self.assertIs(table.columns[9].data_type, dtypes.PyObject)
+        self.assertIs(table.columns[9].data_type, dtypes.string)
         self.assertEqual(table.size, 3)
 
         table_string = table.to_string()
-        self.assertEqual(8, table_string.count("null"))
+        self.assertEqual(9, table_string.count("null"))
         self.assertEqual(2, table_string.count("NaN"))
 
     def test_arrow_backend(self):
@@ -343,6 +343,36 @@ def test_to_table_readonly(self):
         t = to_table(df)
         self.assert_table_equals(source, t)
 
+    def test_infer_objects(self):
+        df = pd.DataFrame({
+            "A": pd.Series([1, 2, 3], dtype=np.dtype("O")),
+            "B": pd.Series(["a", "b", "c"], dtype=np.dtype("O")),
+            "C": pd.Series([1.1, 2.2, 3.3], dtype=np.dtype("O")),
+            "D": pd.Series([True, False, True], dtype=np.dtype("O")),
+            "E": pd.Series([pd.Timestamp("2021-01-01"), pd.Timestamp("2021-01-02"), pd.Timestamp("2021-01-03")], dtype=np.dtype("O")),
+            "F": pd.Series([np.datetime64("2021-01-01"), np.datetime64("2021-01-02"), np.datetime64("2021-01-03")], dtype=np.dtype("O")),
+        })
+        self.assertTrue(all(df[col].dtype == object for col in list(df)))
+        t = to_table(df)
+        self.assertEqual(t.columns[0].data_type, dtypes.int64)
+        self.assertEqual(t.columns[1].data_type, dtypes.string)
+        self.assertEqual(t.columns[2].data_type, dtypes.double)
+        self.assertEqual(t.columns[3].data_type, dtypes.bool_)
+        self.assertEqual(t.columns[4].data_type, dtypes.Instant)
+        self.assertEqual(t.columns[5].data_type, dtypes.Instant)
+        # the inference must work on a copy and leave the caller's DataFrame untouched
+        self.assertTrue(all(df[col].dtype == object for col in list(df)))
+
+        # a reordered subset of the columns is inferred the same way
+        t = to_table(df, cols=["B", "A"])
+        self.assertEqual(t.columns[0].data_type, dtypes.string)
+        self.assertEqual(t.columns[1].data_type, dtypes.int64)
+
+        t = to_table(df, infer_objects=False)
+        self.assertEqual(len(t.columns), 6)
+        for col in t.columns:
+            self.assertEqual(col.data_type, dtypes.PyObject)
+
 
 if __name__ == '__main__':
     unittest.main()