Skip to content

Commit

Permalink
feat!: Add infer_objects argument to the to_table() function in the D…
Browse files Browse the repository at this point in the history
…H pandas module (#6024)

Fixes #6019 


BREAKING CHANGE: the default behavior is changed to convert the
object-type columns in the given data frame before creating a DH
table from it. While this change results in much more sensible
column types in DH (e.g. String instead of PyObject for an object-type
column containing only strings), it will, however, break user code
written to handle PyObject-type columns in the resulting table.
  • Loading branch information
jmao-denver committed Sep 10, 2024
1 parent d4528ab commit 4b8e20f
Show file tree
Hide file tree
Showing 2 changed files with 40 additions and 8 deletions.
23 changes: 17 additions & 6 deletions py/server/deephaven/pandas.py
Original file line number Diff line number Diff line change
Expand Up @@ -201,12 +201,15 @@ def _map_na(array: [np.ndarray, pd.api.extensions.ExtensionArray]):
return array


def to_table(df: pd.DataFrame, cols: List[str] = None) -> Table:
def to_table(df: pd.DataFrame, cols: List[str] = None, infer_objects: bool = True) -> Table:
"""Creates a new table from a pandas DataFrame.
Args:
df (DataFrame): the pandas DataFrame instance
cols (List[str]): the dataframe column names, default is None which means including all columns in the DataFrame
infer_objects (bool): whether to infer the best possible types for columns of the generic 'object' type in the
DataFrame before creating the table, default is True. When True, pandas convert_dtypes() method is called to
perform the conversion. Note that any conversion will make a copy of the data.
Returns:
a Deephaven table
Expand All @@ -222,11 +225,19 @@ def to_table(df: pd.DataFrame, cols: List[str] = None) -> Table:
if diff_set:
raise DHError(message=f"columns - {list(diff_set)} not found")

# if infer_objects is True, convert object dtypes to the best possible types supporting pd.NA
converted_df = df
if infer_objects:
converted_df = df[cols]
for col in cols:
if df.dtypes[col] == object:
converted_df[col] = df[col].convert_dtypes()

# if any arrow backed column is present, create a pyarrow table first, then upload to DH, if error occurs, fall
# back to the numpy-array based approach
if _is_dtype_backend_supported and any(isinstance(df[col].dtype, pd.ArrowDtype) for col in cols):
if _is_dtype_backend_supported and any(isinstance(converted_df[col].dtype, pd.ArrowDtype) for col in cols):
try:
pa_table = pa.Table.from_pandas(df=df, columns=cols)
pa_table = pa.Table.from_pandas(df=converted_df, columns=cols)
dh_table = arrow.to_table(pa_table)
return dh_table
except:
Expand All @@ -235,9 +246,9 @@ def to_table(df: pd.DataFrame, cols: List[str] = None) -> Table:
try:
input_cols = []
for col in cols:
np_array = df.get(col).values
if isinstance(df.dtypes[col], pd.CategoricalDtype):
dtype = df.dtypes[col].categories.dtype
np_array = converted_df.get(col).values
if isinstance(converted_df.dtypes[col], pd.CategoricalDtype):
dtype = converted_df.dtypes[col].categories.dtype
else:
dtype = np_array.dtype
dh_dtype = dtypes.from_np_dtype(dtype)
Expand Down
25 changes: 23 additions & 2 deletions py/server/tests/test_pandas.py
Original file line number Diff line number Diff line change
Expand Up @@ -207,11 +207,11 @@ def test_to_table_nullable(self):
self.assertIs(table.columns[6].data_type, dtypes.float32)
self.assertIs(table.columns[7].data_type, dtypes.double)
self.assertIs(table.columns[8].data_type, dtypes.string)
self.assertIs(table.columns[9].data_type, dtypes.PyObject)
self.assertIs(table.columns[9].data_type, dtypes.string)

self.assertEqual(table.size, 3)
table_string = table.to_string()
self.assertEqual(8, table_string.count("null"))
self.assertEqual(9, table_string.count("null"))
self.assertEqual(2, table_string.count("NaN"))

def test_arrow_backend(self):
Expand Down Expand Up @@ -343,6 +343,27 @@ def test_to_table_readonly(self):
t = to_table(df)
self.assert_table_equals(source, t)

def test_infer_objects(self):
    """By default to_table() infers concrete DH column types for object-dtype
    columns; with infer_objects=False they all come through as PyObject."""
    # Build a frame where every column is deliberately the generic 'object' dtype.
    obj_dtype = np.dtype("O")
    frame = pd.DataFrame({
        "A": pd.Series([1, 2, 3], dtype=obj_dtype),
        "B": pd.Series(["a", "b", "c"], dtype=obj_dtype),
        "C": pd.Series([1.1, 2.2, 3.3], dtype=obj_dtype),
        "D": pd.Series([True, False, True], dtype=obj_dtype),
        "E": pd.Series([pd.Timestamp("2021-01-01"), pd.Timestamp("2021-01-02"), pd.Timestamp("2021-01-03")],
                       dtype=obj_dtype),
        "F": pd.Series([np.datetime64("2021-01-01"), np.datetime64("2021-01-02"), np.datetime64("2021-01-03")],
                       dtype=obj_dtype),
    })
    for name in frame:
        self.assertTrue(frame[name].dtype == object)

    # Default path: each object column is converted to its best-fitting DH type.
    t = to_table(frame)
    expected = [dtypes.int64, dtypes.string, dtypes.double, dtypes.bool_, dtypes.Instant, dtypes.Instant]
    for col, want in zip(t.columns, expected):
        self.assertEqual(col.data_type, want)

    # Inference disabled: every column stays a PyObject column.
    t = to_table(frame, infer_objects=False)
    for i in range(6):
        self.assertTrue(t.columns[i].data_type == dtypes.PyObject)


if __name__ == '__main__':
unittest.main()

0 comments on commit 4b8e20f

Please sign in to comment.