Skip to content

Commit

Permalink
feat!: Add infer_objects argument to the to_table() function in the D…
Browse files Browse the repository at this point in the history
…H pandas module (#6024)

Fixes #6019 


BREAKING CHANGE: the default behavior is changed to convert the
object-type columns in the given data frame before creating a DH
table from it. While this change results in much more sensible
column types in DH (e.g. String instead of PyObject for an object-type
column containing only strings), it will, however, break user code
written to handle PyObject-type columns in the resulting table.
  • Loading branch information
jmao-denver committed Sep 10, 2024
1 parent d4528ab commit 4b8e20f
Show file tree
Hide file tree
Showing 2 changed files with 40 additions and 8 deletions.
23 changes: 17 additions & 6 deletions py/server/deephaven/pandas.py
Original file line number Diff line number Diff line change
Expand Up @@ -201,12 +201,15 @@ def _map_na(array: [np.ndarray, pd.api.extensions.ExtensionArray]):
return array


def to_table(df: pd.DataFrame, cols: List[str] = None) -> Table:
def to_table(df: pd.DataFrame, cols: List[str] = None, infer_objects: bool = True) -> Table:
"""Creates a new table from a pandas DataFrame.
Args:
df (DataFrame): the pandas DataFrame instance
cols (List[str]): the dataframe column names, default is None which means including all columns in the DataFrame
infer_objects (bool): whether to infer the best possible types for columns of the generic 'object' type in the
DataFrame before creating the table, default is True. When True, pandas convert_dtypes() method is called to
perform the conversion. Note that any conversion will make a copy of the data.
Returns:
a Deephaven table
Expand All @@ -222,11 +225,19 @@ def to_table(df: pd.DataFrame, cols: List[str] = None) -> Table:
if diff_set:
raise DHError(message=f"columns - {list(diff_set)} not found")

# if infer_objects is True, convert object dtypes to the best possible types supporting pd.NA
converted_df = df
if infer_objects:
converted_df = df[cols]
for col in cols:
if df.dtypes[col] == object:
converted_df[col] = df[col].convert_dtypes()

# if any arrow backed column is present, create a pyarrow table first, then upload to DH, if error occurs, fall
# back to the numpy-array based approach
if _is_dtype_backend_supported and any(isinstance(df[col].dtype, pd.ArrowDtype) for col in cols):
if _is_dtype_backend_supported and any(isinstance(converted_df[col].dtype, pd.ArrowDtype) for col in cols):
try:
pa_table = pa.Table.from_pandas(df=df, columns=cols)
pa_table = pa.Table.from_pandas(df=converted_df, columns=cols)
dh_table = arrow.to_table(pa_table)
return dh_table
except:
Expand All @@ -235,9 +246,9 @@ def to_table(df: pd.DataFrame, cols: List[str] = None) -> Table:
try:
input_cols = []
for col in cols:
np_array = df.get(col).values
if isinstance(df.dtypes[col], pd.CategoricalDtype):
dtype = df.dtypes[col].categories.dtype
np_array = converted_df.get(col).values
if isinstance(converted_df.dtypes[col], pd.CategoricalDtype):
dtype = converted_df.dtypes[col].categories.dtype
else:
dtype = np_array.dtype
dh_dtype = dtypes.from_np_dtype(dtype)
Expand Down
25 changes: 23 additions & 2 deletions py/server/tests/test_pandas.py
Original file line number Diff line number Diff line change
Expand Up @@ -207,11 +207,11 @@ def test_to_table_nullable(self):
self.assertIs(table.columns[6].data_type, dtypes.float32)
self.assertIs(table.columns[7].data_type, dtypes.double)
self.assertIs(table.columns[8].data_type, dtypes.string)
self.assertIs(table.columns[9].data_type, dtypes.PyObject)
self.assertIs(table.columns[9].data_type, dtypes.string)

self.assertEqual(table.size, 3)
table_string = table.to_string()
self.assertEqual(8, table_string.count("null"))
self.assertEqual(9, table_string.count("null"))
self.assertEqual(2, table_string.count("NaN"))

def test_arrow_backend(self):
Expand Down Expand Up @@ -343,6 +343,27 @@ def test_to_table_readonly(self):
t = to_table(df)
self.assert_table_equals(source, t)

def test_infer_objects(self):
    """By default to_table() infers concrete DH column types for object-dtype
    columns; with infer_objects=False they all come through as PyObject."""
    # Build a frame where every column is deliberately the generic 'object' dtype.
    obj_dtype = np.dtype("O")
    frame = pd.DataFrame({
        "A": pd.Series([1, 2, 3], dtype=obj_dtype),
        "B": pd.Series(["a", "b", "c"], dtype=obj_dtype),
        "C": pd.Series([1.1, 2.2, 3.3], dtype=obj_dtype),
        "D": pd.Series([True, False, True], dtype=obj_dtype),
        "E": pd.Series([pd.Timestamp("2021-01-01"), pd.Timestamp("2021-01-02"), pd.Timestamp("2021-01-03")],
                       dtype=obj_dtype),
        "F": pd.Series([np.datetime64("2021-01-01"), np.datetime64("2021-01-02"), np.datetime64("2021-01-03")],
                       dtype=obj_dtype),
    })
    for name in frame:
        self.assertTrue(frame[name].dtype == object)

    # Default path: each object column is converted to its best-fitting DH type.
    t = to_table(frame)
    expected = [dtypes.int64, dtypes.string, dtypes.double, dtypes.bool_, dtypes.Instant, dtypes.Instant]
    for col, want in zip(t.columns, expected):
        self.assertEqual(col.data_type, want)

    # Inference disabled: every column stays a PyObject column.
    t = to_table(frame, infer_objects=False)
    for i in range(6):
        self.assertTrue(t.columns[i].data_type == dtypes.PyObject)


if __name__ == '__main__':
unittest.main()

0 comments on commit 4b8e20f

Please sign in to comment.