From 4b8e20fa17cf66e4ed8a667419f1e485c0ae653b Mon Sep 17 00:00:00 2001
From: Jianfeng Mao <4297243+jmao-denver@users.noreply.github.com>
Date: Tue, 10 Sep 2024 12:50:00 -0600
Subject: [PATCH] feat!: Add infer_objects argument to the to_table() function
 in the DH pandas module (#6024)

Fixes #6019

BREAKING CHANGE: by default, object-type columns in the given DataFrame are
now converted to more specific types before the DH table is created from it.
While this results in much more sensible column types in DH (e.g. String
instead of PyObject for an object-type column containing only strings), it
will break user code written to handle PyObject-type columns in the resulting
table.
---
 py/server/deephaven/pandas.py  | 23 +++++++++++++++++------
 py/server/tests/test_pandas.py | 25 +++++++++++++++++++++++--
 2 files changed, 40 insertions(+), 8 deletions(-)

diff --git a/py/server/deephaven/pandas.py b/py/server/deephaven/pandas.py
index 8a2be32c53a..c937bc47b94 100644
--- a/py/server/deephaven/pandas.py
+++ b/py/server/deephaven/pandas.py
@@ -201,12 +201,15 @@ def _map_na(array: [np.ndarray, pd.api.extensions.ExtensionArray]):
     return array


-def to_table(df: pd.DataFrame, cols: List[str] = None) -> Table:
+def to_table(df: pd.DataFrame, cols: List[str] = None, infer_objects: bool = True) -> Table:
     """Creates a new table from a pandas DataFrame.

     Args:
         df (DataFrame): the pandas DataFrame instance
         cols (List[str]): the dataframe column names, default is None which means including all columns in the DataFrame
+        infer_objects (bool): whether to infer the best possible types for columns of the generic 'object' type in the
+            DataFrame before creating the table, default is True. When True, the pandas convert_dtypes() method is
+            called to perform the conversion. Note that any conversion will make a copy of the data.

     Returns:
         a Deephaven table
@@ -222,11 +225,19 @@ def to_table(df: pd.DataFrame, cols: List[str] = None) -> Table:
         if diff_set:
             raise DHError(message=f"columns - {list(diff_set)} not found")

+    # if infer_objects is True, convert object dtypes to the best possible types supporting pd.NA
+    converted_df = df
+    if infer_objects:
+        converted_df = df[cols]
+        for col in cols:
+            if df.dtypes[col] == object:
+                converted_df[col] = df[col].convert_dtypes()
+
     # if any arrow backed column is present, create a pyarrow table first, then upload to DH, if error occurs, fall
     # back to the numpy-array based approach
-    if _is_dtype_backend_supported and any(isinstance(df[col].dtype, pd.ArrowDtype) for col in cols):
+    if _is_dtype_backend_supported and any(isinstance(converted_df[col].dtype, pd.ArrowDtype) for col in cols):
         try:
-            pa_table = pa.Table.from_pandas(df=df, columns=cols)
+            pa_table = pa.Table.from_pandas(df=converted_df, columns=cols)
             dh_table = arrow.to_table(pa_table)
             return dh_table
         except:
@@ -235,9 +246,9 @@ def to_table(df: pd.DataFrame, cols: List[str] = None) -> Table:
     try:
         input_cols = []
         for col in cols:
-            np_array = df.get(col).values
-            if isinstance(df.dtypes[col], pd.CategoricalDtype):
-                dtype = df.dtypes[col].categories.dtype
+            np_array = converted_df.get(col).values
+            if isinstance(converted_df.dtypes[col], pd.CategoricalDtype):
+                dtype = converted_df.dtypes[col].categories.dtype
             else:
                 dtype = np_array.dtype
             dh_dtype = dtypes.from_np_dtype(dtype)
diff --git a/py/server/tests/test_pandas.py b/py/server/tests/test_pandas.py
index d3f8312a3e8..04bbe942821 100644
--- a/py/server/tests/test_pandas.py
+++ b/py/server/tests/test_pandas.py
@@ -207,11 +207,11 @@ def test_to_table_nullable(self):
         self.assertIs(table.columns[6].data_type, dtypes.float32)
         self.assertIs(table.columns[7].data_type, dtypes.double)
         self.assertIs(table.columns[8].data_type, dtypes.string)
-        self.assertIs(table.columns[9].data_type, dtypes.PyObject)
+        self.assertIs(table.columns[9].data_type, dtypes.string)
         self.assertEqual(table.size, 3)

         table_string = table.to_string()
-        self.assertEqual(8, table_string.count("null"))
+        self.assertEqual(9, table_string.count("null"))
         self.assertEqual(2, table_string.count("NaN"))

     def test_arrow_backend(self):
@@ -343,6 +343,27 @@ def test_to_table_readonly(self):
         t = to_table(df)
         self.assert_table_equals(source, t)

+    def test_infer_objects(self):
+        df = pd.DataFrame({
+            "A": pd.Series([1, 2, 3], dtype=np.dtype("O")),
+            "B": pd.Series(["a", "b", "c"], dtype=np.dtype("O")),
+            "C": pd.Series([1.1, 2.2, 3.3], dtype=np.dtype("O")),
+            "D": pd.Series([True, False, True], dtype=np.dtype("O")),
+            "E": pd.Series([pd.Timestamp("2021-01-01"), pd.Timestamp("2021-01-02"), pd.Timestamp("2021-01-03")], dtype=np.dtype("O")),
+            "F": pd.Series([np.datetime64("2021-01-01"), np.datetime64("2021-01-02"), np.datetime64("2021-01-03")], dtype=np.dtype("O")),
+        })
+        self.assertTrue(all(df[col].dtype == object for col in list(df)))
+        t = to_table(df)
+        self.assertEqual(t.columns[0].data_type, dtypes.int64)
+        self.assertEqual(t.columns[1].data_type, dtypes.string)
+        self.assertEqual(t.columns[2].data_type, dtypes.double)
+        self.assertEqual(t.columns[3].data_type, dtypes.bool_)
+        self.assertEqual(t.columns[4].data_type, dtypes.Instant)
+        self.assertEqual(t.columns[5].data_type, dtypes.Instant)
+
+        t = to_table(df, infer_objects=False)
+        self.assertTrue(all([t.columns[i].data_type == dtypes.PyObject for i in range(6)]))
+

 if __name__ == '__main__':
     unittest.main()
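
A note on the breaking change described in the commit message: the snippet
below is a minimal usage sketch, not part of the patch, showing how the new
infer_objects default changes the resulting column types. It assumes a running
Deephaven server where deephaven.pandas is importable; the DataFrame contents
and variable names are illustrative only.

    import numpy as np
    import pandas as pd
    from deephaven import dtypes
    from deephaven.pandas import to_table

    # an object-dtype column that actually holds only strings (illustrative data)
    df = pd.DataFrame({"Sym": pd.Series(["AAPL", "GOOG", "MSFT"], dtype=np.dtype("O"))})

    # default (infer_objects=True): convert_dtypes() runs first, so the column becomes a DH String
    t_inferred = to_table(df)
    assert t_inferred.columns[0].data_type == dtypes.string

    # opting out (infer_objects=False) keeps the pre-change behavior: a PyObject column
    t_legacy = to_table(df, infer_objects=False)
    assert t_legacy.columns[0].data_type == dtypes.PyObject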