From a4e6eb0a333ec6ed87cccbe0344e2f398c44edb7 Mon Sep 17 00:00:00 2001
From: jianfengmao
Date: Tue, 1 Aug 2023 14:36:06 -0600
Subject: [PATCH 1/4] Respond to review comments

---
 py/server/deephaven/numpy.py    |  1 +
 py/server/deephaven/pandas.py   | 75 +++++++++++++++++++++++++--------
 py/server/tests/test_pandas.py  | 48 ++++++++++++++++-----
 py/server/tests/test_parquet.py | 12 +++---
 4 files changed, 102 insertions(+), 34 deletions(-)

diff --git a/py/server/deephaven/numpy.py b/py/server/deephaven/numpy.py
index 40dccedad89..412b6e8b5ac 100644
--- a/py/server/deephaven/numpy.py
+++ b/py/server/deephaven/numpy.py
@@ -17,6 +17,7 @@
 _JPrimitiveArrayConversionUtility = jpy.get_type("io.deephaven.integrations.common.PrimitiveArrayConversionUtility")
 _JDataAccessHelpers = jpy.get_type("io.deephaven.engine.table.impl.DataAccessHelpers")
 
+
 def _to_column_name(name: str) -> str:
     """ Transforms the given name string into a valid table column name. """
     tmp_name = re.sub(r"\W+", " ", str(name)).strip()
diff --git a/py/server/deephaven/pandas.py b/py/server/deephaven/pandas.py
index a75c6f4b93d..d4be9da032e 100644
--- a/py/server/deephaven/pandas.py
+++ b/py/server/deephaven/pandas.py
@@ -3,7 +3,7 @@
 #
 """ This module supports the conversion between Deephaven tables and pandas DataFrames. """
-from typing import List
+from typing import List, Dict, Tuple
 
 import jpy
 import numpy as np
@@ -11,23 +11,38 @@
 import pyarrow as pa
 
 from deephaven import DHError, new_table, dtypes, arrow
-from deephaven.arrow import SUPPORTED_ARROW_TYPES
 from deephaven.column import Column
-from deephaven.constants import NULL_BYTE, NULL_SHORT, NULL_CHAR, NULL_INT, NULL_LONG, NULL_FLOAT, NULL_DOUBLE
+from deephaven.constants import NULL_BYTE, NULL_SHORT, NULL_INT, NULL_LONG, NULL_FLOAT, NULL_DOUBLE, NULL_CHAR
+from deephaven.dtypes import DType
 from deephaven.numpy import column_to_numpy_array, _make_input_column
 from deephaven.table import Table
 
+_NULL_BOOLEAN_AS_BYTE = jpy.get_type("io.deephaven.util.BooleanUtils").NULL_BOOLEAN_AS_BYTE
 _JPrimitiveArrayConversionUtility = jpy.get_type("io.deephaven.integrations.common.PrimitiveArrayConversionUtility")
 _JDataAccessHelpers = jpy.get_type("io.deephaven.engine.table.impl.DataAccessHelpers")
 
 _is_dtype_backend_supported = pd.__version__ >= "2.0.0"
 
 
-def _column_to_series(table: Table, col_def: Column) -> pd.Series:
+_DTYPE_NULL_MAPPING: Dict[DType, Tuple] = {
+    dtypes.bool_: (_NULL_BOOLEAN_AS_BYTE, pd.BooleanDtype),
+    dtypes.byte: (NULL_BYTE, pd.Int8Dtype),
+    dtypes.short: (NULL_SHORT, pd.Int16Dtype),
+    dtypes.char: (NULL_CHAR, pd.UInt16Dtype),
+    dtypes.int32: (NULL_INT, pd.Int32Dtype),
+    dtypes.int64: (NULL_LONG, pd.Int64Dtype),
+    dtypes.float32: (NULL_FLOAT, pd.Float32Dtype),
+    dtypes.float64: (NULL_DOUBLE, pd.Float64Dtype),
+}
+
+
+def _column_to_series(table: Table, col_def: Column, conv_null: bool) -> pd.Series:
     """Produce a copy of the specified column as a pandas.Series object.
 
     Args:
         table (Table): the table
         col_def (Column): the column definition
+        conv_null (bool): whether to check for Deephaven nulls in the data and automatically replace them with
+            pd.NA.
 
     Returns:
         a pandas Series
 
@@ -37,9 +52,22 @@ def _column_to_series(table: Table, col_def: Column) -> pd.Series:
     """
     try:
         data_col = _JDataAccessHelpers.getColumn(table.j_table, col_def.name)
-        np_array = column_to_numpy_array(col_def, data_col.getDirect())
+        if conv_null and col_def.data_type == dtypes.bool_:
+            j_array = _JPrimitiveArrayConversionUtility.translateArrayBooleanToByte(data_col.getDirect())
+            np_array = np.frombuffer(j_array, dtype=np.byte)
+            s = pd.Series(data=np_array, dtype=pd.Int8Dtype(), copy=False)
+            s.mask(s == _NULL_BOOLEAN_AS_BYTE, inplace=True)
+            return s.astype(pd.BooleanDtype(), copy=False)
 
-        return pd.Series(data=np_array, copy=False)
+        np_array = column_to_numpy_array(col_def, data_col.getDirect())
+        if conv_null and (null_pair := _DTYPE_NULL_MAPPING.get(col_def.data_type)) is not None:
+            nv = null_pair[0]
+            pd_ex_dtype = null_pair[1]
+            s = pd.Series(data=np_array, dtype=pd_ex_dtype(), copy=False)
+            s.mask(s == nv, inplace=True)
+        else:
+            s = pd.Series(data=np_array, copy=False)
+        return s
     except DHError:
         raise
     except Exception as e:
@@ -85,7 +113,8 @@ def _column_to_series(table: Table, col_def: Column) -> pd.Series:
 }
 
 
-def to_pandas(table: Table, cols: List[str] = None, dtype_backend: str = None) -> pd.DataFrame:
+def to_pandas(table: Table, cols: List[str] = None, dtype_backend: str = None, conv_null: bool = False) -> \
+        pd.DataFrame:
     """Produces a pandas DataFrame from a table.
 
     Note that the **entire table** is going to be cloned into memory, so the total number of entries in the table
@@ -99,6 +128,8 @@ def to_pandas(table: Table, cols: List[str] = None, dtype_backend: str = None) -> pd.DataFrame:
             nullable dtypes are used for all dtypes that have a nullable implementation when “numpy_nullable” is
             set, pyarrow is used for all dtypes if “pyarrow” is set. default is None, meaning Numpy backed
             DataFrames with no nullable dtypes.
+        conv_null (bool): When dtype_backend is not set, whether to check for Deephaven nulls in the data and
+            automatically replace them with pd.NA. default is False.
 
     Returns:
         a pandas DataFrame
@@ -111,6 +142,9 @@ def to_pandas(table: Table, cols: List[str] = None, dtype_backend: str = None) -> pd.DataFrame:
         raise DHError(message=f"the dtype_backend ({dtype_backend}) option is only available for pandas 2.0.0 and "
                               f"above. {pd.__version__} is being used.")
 
+    if dtype_backend is not None and conv_null:
+        raise DHError(message=f"conv_null doesn't apply when dtype_backend is either numpy_nullable or pyarrow")
+
     type_mapper = _PYARROW_TO_PANDAS_TYPE_MAPPERS.get(dtype_backend)
     # if nullable dtypes (pandas or pyarrow) is requested
     if type_mapper:
@@ -134,7 +168,7 @@ def to_pandas(table: Table, cols: List[str] = None, dtype_backend: str = None) -> pd.DataFrame:
 
         data = {}
         for col in cols:
-            series = _column_to_series(table, col_def_dict[col])
+            series = _column_to_series(table, col_def_dict[col], conv_null)
             data[col] = series
 
         dtype_set = set([v.dtype for k, v in data.items()])
@@ -153,7 +187,7 @@ def to_pandas(table: Table, cols: List[str] = None, dtype_backend: str = None) -> pd.DataFrame:
 _EX_DTYPE_NULL_MAP = {
     # This reflects the fact that in the server we use NULL_BOOLEAN_AS_BYTE - the byte encoding of null boolean to
     # translate boxed Boolean to/from primitive bytes
-    pd.BooleanDtype: NULL_BYTE,
+    pd.BooleanDtype: _NULL_BOOLEAN_AS_BYTE,
     pd.Int8Dtype: NULL_BYTE,
     pd.Int16Dtype: NULL_SHORT,
     pd.UInt16Dtype: NULL_CHAR,
@@ -173,20 +207,27 @@ def to_pandas(table: Table, cols: List[str] = None, dtype_backend: str = None) -> pd.DataFrame:
 }
 
 
-def _map_na(np_array: np.ndarray):
+def _map_na(array: [np.ndarray, pd.api.extensions.ExtensionArray]):
     """Replaces the pd.NA values in the array if it is of pandas ExtensionDtype(nullable)."""
-    pd_dtype = np_array.dtype
+    pd_dtype = array.dtype
     if not isinstance(pd_dtype, pd.api.extensions.ExtensionDtype):
-        return np_array
+        return array
 
     dh_null = _EX_DTYPE_NULL_MAP.get(type(pd_dtype)) or _EX_DTYPE_NULL_MAP.get(pd_dtype)
-    if isinstance(pd_dtype, pd.StringDtype) or isinstance(pd_dtype, pd.BooleanDtype) or pd_dtype == pd.ArrowDtype(
-            pa.bool_()):
-        np_array = np.array(list(map(lambda v: dh_null if v is pd.NA else v, np_array)))
+    # To preserve NaNs in floating point arrays, Pandas doesn't distinguish NaN/Null as far as NA testing is
+    # concerted, thus its fillna() method will replace both NaN/Null in the data.
+    if isinstance(pd_dtype, (pd.Float32Dtype, pd.Float64Dtype)) and isinstance(getattr(array, "_data"), np.ndarray):
+        np_array = array._data
+        null_mask = np.logical_and(array._mask, np.logical_not(np.isnan(np_array)))
+        np_array[null_mask] = dh_null
+        return np_array
+
+    if isinstance(pd_dtype, (pd.StringDtype, pd.BooleanDtype)) or pd_dtype == pd.ArrowDtype(pa.bool_()):
+        array = np.array(list(map(lambda v: dh_null if v is pd.NA else v, array)))
     elif dh_null is not None:
-        np_array = np_array.fillna(dh_null)
+        array = array.fillna(dh_null)
 
-    return np_array
+    return array
 
 
 def to_table(df: pd.DataFrame, cols: List[str] = None) -> Table:
diff --git a/py/server/tests/test_pandas.py b/py/server/tests/test_pandas.py
index 6298147488a..3b4cffea459 100644
--- a/py/server/tests/test_pandas.py
+++ b/py/server/tests/test_pandas.py
@@ -10,14 +10,15 @@
 import pandas as pd
 import pyarrow as pa
 
-from deephaven import dtypes, new_table, DHError
+from deephaven import dtypes, new_table, DHError, empty_table
 from deephaven.column import byte_col, char_col, short_col, bool_col, int_col, long_col, float_col, double_col, \
     string_col, datetime_col, pyobj_col, jobj_col
-from deephaven.constants import NULL_LONG, NULL_SHORT, NULL_INT, NULL_BYTE, NULL_CHAR
+from deephaven.constants import NULL_LONG, NULL_SHORT, NULL_INT, NULL_BYTE, NULL_CHAR, NULL_FLOAT, NULL_DOUBLE, \
+    NULL_BOOLEAN
 from deephaven.jcompat import j_array_list
 from deephaven.pandas import to_pandas, to_table
 from deephaven.time import parse_instant, epoch_nanos_to_instant
-from tests.testbase import BaseTestCase
+from tests.testbase import BaseTestCase, table_equals
 
 
 @dataclass
@@ -212,7 +213,8 @@ def test_to_table_nullable(self):
 
         self.assertEqual(table.size, 3)
         table_string = table.to_string()
-        self.assertEqual(10, table_string.count("null"))
+        self.assertEqual(8, table_string.count("null"))
+        self.assertEqual(2, table_string.count("NaN"))
 
     def test_arrow_backend(self):
         with self.subTest("pyarrow-backend"):
@@ -296,19 +298,43 @@ def test_numpy_array(self):
 
     def test_pandas_category_type(self):
         df = pd.DataFrame({
-                'zipcode': {17384: 98125, 2680: 98107, 722: 98005, 18754: 98109, 14554: 98155},
-                'bathrooms': {17384: 1.5, 2680: 0.75, 722: 3.25, 18754: 1.0, 14554: 2.5},
-                'sqft_lot': {17384: 1650, 2680: 3700, 722: 51836, 18754: 2640, 14554: 9603},
-                'bedrooms': {17384: 2, 2680: 2, 722: 4, 18754: 2, 14554: 4},
-                'sqft_living': {17384: 1430, 2680: 1440, 722: 4670, 18754: 1130, 14554: 3180},
-                'floors': {17384: 3.0, 2680: 1.0, 722: 2.0, 18754: 1.0, 14554: 2.0}
-            })
+            'zipcode': {17384: 98125, 2680: 98107, 722: 98005, 18754: 98109, 14554: 98155},
+            'bathrooms': {17384: 1.5, 2680: 0.75, 722: 3.25, 18754: 1.0, 14554: 2.5},
+            'sqft_lot': {17384: 1650, 2680: 3700, 722: 51836, 18754: 2640, 14554: 9603},
+            'bedrooms': {17384: 2, 2680: 2, 722: 4, 18754: 2, 14554: 4},
+            'sqft_living': {17384: 1430, 2680: 1440, 722: 4670, 18754: 1130, 14554: 3180},
+            'floors': {17384: 3.0, 2680: 1.0, 722: 2.0, 18754: 1.0, 14554: 2.0}
+        })
         df['zipcode'] = df.zipcode.astype('category')
         df['bathrooms'] = df.bathrooms.astype('category')
         t = to_table(df)
         self.assertEqual(t.columns[0].data_type, dtypes.int64)
         self.assertEqual(t.columns[1].data_type, dtypes.double)
 
+    def test_conv_null(self):
+        input_cols = [
+            bool_col(name="Boolean", data=(True, NULL_BOOLEAN)),
+            byte_col(name="Byte", data=(1, NULL_BYTE)),
+            char_col(name="Char", data=(1, NULL_CHAR)),
+            short_col(name="Short", data=[1, NULL_SHORT]),
+            int_col(name="Int_", data=[1, NULL_INT]),
+            long_col(name="Long_", data=[1, NULL_LONG]),
+            float_col(name="Float_", data=[np.nan, NULL_FLOAT]),
+            double_col(name="Double_", data=[np.nan, NULL_DOUBLE]),
+            datetime_col(name="Datetime", data=[epoch_nanos_to_instant(1), None]),
+        ]
+        t = new_table(cols=input_cols)
+        df = to_pandas(t, conv_null=True)
+        dh_table = to_table(df)
+        self.assert_table_equals(t, dh_table)
+
+        dtype_backends = ["numpy_nullable", "pyarrow"]
+        for dbe in dtype_backends:
+            with self.subTest(dbe):
+                df = to_pandas(t, dtype_backend=dbe)
+                dh_table = to_table(df)
+                self.assert_table_equals(t, dh_table)
+
 
 if __name__ == '__main__':
     unittest.main()
diff --git a/py/server/tests/test_parquet.py b/py/server/tests/test_parquet.py
index de706e08d4d..df48feef33b 100644
--- a/py/server/tests/test_parquet.py
+++ b/py/server/tests/test_parquet.py
@@ -4,19 +4,19 @@
 
 import os
 import shutil
-import unittest
 import tempfile
+import unittest
+
 import pandas
 import pyarrow.parquet
-from deephaven.pandas import to_pandas, to_table
-
 from deephaven import empty_table, dtypes, new_table
 from deephaven.column import InputColumn
+from deephaven.pandas import to_pandas, to_table
 from deephaven.parquet import write, batch_write, read, delete, ColumnInstruction
-
 from tests.testbase import BaseTestCase
 
+
 class ParquetTestCase(BaseTestCase):
     """ Test cases for the deephaven.ParquetTools module (performed locally) """
@@ -282,7 +282,8 @@ def round_trip_with_compression(self, compression_codec_name, dh_table, vector_c
         self.assert_table_equals(dh_table, result_table)
 
         # Write the pandas dataframe back to parquet (via pyarrow) and read it back using deephaven.parquet to compare
-        dataframe.to_parquet('data_from_pandas.parquet', compression=None if compression_codec_name is 'UNCOMPRESSED' else compression_codec_name)
+        dataframe.to_parquet('data_from_pandas.parquet',
+                             compression=None if compression_codec_name is 'UNCOMPRESSED' else compression_codec_name)
         result_table = read('data_from_pandas.parquet')
         self.assert_table_equals(dh_table, result_table)
@@ -294,6 +295,5 @@ def round_trip_with_compression(self, compression_codec_name, dh_table, vector_c
 
 #             self.assert_table_equals(dh_table, result_table)
 
-
 if __name__ == '__main__':
     unittest.main()

From 97c7554e615958e7ca771bb4c4f3a7191d0bbcf5 Mon Sep 17 00:00:00 2001
From: jianfengmao
Date: Wed, 2 Aug 2023 09:46:14 -0600
Subject: [PATCH 2/4] Make conv_null default to True

---
 py/server/deephaven/pandas.py  | 15 +++++++--------
 py/server/tests/test_pandas.py | 10 +++++-----
 2 files changed, 12 insertions(+), 13 deletions(-)

diff --git a/py/server/deephaven/pandas.py b/py/server/deephaven/pandas.py
index d4be9da032e..0257e6baaa9 100644
--- a/py/server/deephaven/pandas.py
+++ b/py/server/deephaven/pandas.py
@@ -22,10 +22,9 @@
 _JDataAccessHelpers = jpy.get_type("io.deephaven.engine.table.impl.DataAccessHelpers")
 
 _is_dtype_backend_supported = pd.__version__ >= "2.0.0"
-
 _DTYPE_NULL_MAPPING: Dict[DType, Tuple] = {
     dtypes.bool_: (_NULL_BOOLEAN_AS_BYTE, pd.BooleanDtype),
-    dtypes.byte: (NULL_BYTE, pd.Int8Dtype), 
+    dtypes.byte: (NULL_BYTE, pd.Int8Dtype),
     dtypes.short: (NULL_SHORT, pd.Int16Dtype),
     dtypes.char: (NULL_CHAR, pd.UInt16Dtype),
     dtypes.int32: (NULL_INT, pd.Int32Dtype),
@@ -113,7 +112,7 @@ def _column_to_series(table: Table, col_def: Column, conv_null: bool) -> pd.Series:
 }
 
 
-def to_pandas(table: Table, cols: List[str] = None, dtype_backend: str = None, conv_null: bool = False) -> \
+def to_pandas(table: Table, cols: List[str] = None, dtype_backend: str = None, conv_null: bool = True) -> \
         pd.DataFrame:
     """Produces a pandas DataFrame from a table.
 
@@ -129,7 +128,7 @@ def to_pandas(table: Table, cols: List[str] = None, dtype_backend: str = None, conv_null: bool = False) -> \
             pyarrow is used for all dtypes if “pyarrow” is set. default is None, meaning Numpy backed DataFrames
             with no nullable dtypes.
         conv_null (bool): When dtype_backend is not set, whether to check for Deephaven nulls in the data and
-            automatically replace them with pd.NA. default is False.
+            automatically replace them with pd.NA. default is True.
 
     Returns:
         a pandas DataFrame
@@ -142,12 +141,12 @@ def to_pandas(table: Table, cols: List[str] = None, dtype_backend: str = None, conv_null: bool = False) -> \
         raise DHError(message=f"the dtype_backend ({dtype_backend}) option is only available for pandas 2.0.0 and "
                               f"above. {pd.__version__} is being used.")
 
-    if dtype_backend is not None and conv_null:
-        raise DHError(message=f"conv_null doesn't apply when dtype_backend is either numpy_nullable or pyarrow")
+    if dtype_backend is not None and not conv_null:
+        raise DHError(message="conv_null can't be turned off when dtype_backend is either numpy_nullable or "
+                              "pyarrow")
 
-    type_mapper = _PYARROW_TO_PANDAS_TYPE_MAPPERS.get(dtype_backend)
     # if nullable dtypes (pandas or pyarrow) is requested
-    if type_mapper:
+    if type_mapper := _PYARROW_TO_PANDAS_TYPE_MAPPERS.get(dtype_backend):
         pa_table = arrow.to_arrow(table=table, cols=cols)
         df = pa_table.to_pandas(types_mapper=type_mapper)
         del pa_table
diff --git a/py/server/tests/test_pandas.py b/py/server/tests/test_pandas.py
index 3b4cffea459..26f182d593a 100644
--- a/py/server/tests/test_pandas.py
+++ b/py/server/tests/test_pandas.py
@@ -55,8 +55,8 @@ def tearDown(self) -> None:
         self.test_table = None
         super().tearDown()
 
-    def test_to_pandas(self):
-        df = to_pandas(self.test_table)
+    def test_to_pandas_no_conv_null(self):
+        df = to_pandas(self.test_table, conv_null=False)
         self.assertEqual(len(df.columns), len(self.test_table.columns))
         self.assertEqual(df.size, 2 * len(self.test_table.columns))
         df_series = [df[col] for col in list(df.columns)]
@@ -72,7 +72,7 @@ def test_to_pandas_remaps(self):
         prepared_table = self.test_table.update(
             formulas=["Long = isNull(Long_) ? Double.NaN : Long_"])
 
-        df = to_pandas(prepared_table, cols=["Boolean", "Long"])
+        df = to_pandas(prepared_table, cols=["Boolean", "Long"], conv_null=False)
         self.assertEqual(df['Long'].dtype, np.float64)
         self.assertEqual(df['Boolean'].values.dtype, np.bool_)
 
@@ -116,7 +116,7 @@ def test_to_table(self):
             double_col(name="Double", data=[1.01, -1.01]),
         ]
         test_table = new_table(cols=input_cols)
-        df = to_pandas(test_table)
+        df = to_pandas(test_table, conv_null=False)
         table_from_df = to_table(df)
         self.assert_table_equals(table_from_df, test_table)
 
@@ -125,7 +125,7 @@ def test_to_table_boolean_with_none(self):
         table_with_null_bool = new_table(cols=input_cols)
         prepared_table = table_with_null_bool.update(
             formulas=["Boolean = isNull(Boolean) ? (byte)NULL_BYTE : (Boolean == true ? 1: 0)"])
1: 0)"]) - df = to_pandas(prepared_table) + df = to_pandas(prepared_table, conv_null=False) table_from_df = to_table(df) self.assert_table_equals(table_from_df, prepared_table) From 86ec6fb8036684987945deca88c80624d718bc69 Mon Sep 17 00:00:00 2001 From: jianfengmao Date: Wed, 2 Aug 2023 09:48:12 -0600 Subject: [PATCH 3/4] Fix a typo in docstring --- py/server/deephaven/pandas.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/py/server/deephaven/pandas.py b/py/server/deephaven/pandas.py index 0257e6baaa9..0ea0a7d3e93 100644 --- a/py/server/deephaven/pandas.py +++ b/py/server/deephaven/pandas.py @@ -214,7 +214,7 @@ def _map_na(array: [np.ndarray, pd.api.extensions.ExtensionArray]): dh_null = _EX_DTYPE_NULL_MAP.get(type(pd_dtype)) or _EX_DTYPE_NULL_MAP.get(pd_dtype) # To preserve NaNs in floating point arrays, Pandas doesn't distinguish NaN/Null as far as NA testing is - # concerted, thus its fillna() method will replace both NaN/Null in the data. + # concerned, thus its fillna() method will replace both NaN/Null in the data. if isinstance(pd_dtype, (pd.Float32Dtype, pd.Float64Dtype)) and isinstance(getattr(array, "_data"), np.ndarray): np_array = array._data null_mask = np.logical_and(array._mask, np.logical_not(np.isnan(np_array))) From 52fe48dfe5c06a7eb53df56ba67840cb397217ee Mon Sep 17 00:00:00 2001 From: jianfengmao Date: Wed, 2 Aug 2023 10:51:18 -0600 Subject: [PATCH 4/4] Fix a test failure --- py/server/tests/test_learn_gather.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/py/server/tests/test_learn_gather.py b/py/server/tests/test_learn_gather.py index 611a3ebcb50..64a316be3f7 100644 --- a/py/server/tests/test_learn_gather.py +++ b/py/server/tests/test_learn_gather.py @@ -141,7 +141,7 @@ def base_test(self, source, model, np_dtype): gatherer_colmajor = lambda rowset, colset: gather.table_to_numpy_2d(rowset, colset, gather.MemoryLayout.COLUMN_MAJOR, np_dtype) - array_from_table = to_pandas(source).values + array_from_table = to_pandas(source, conv_null=False).values gathered_rowmajor = gatherer_rowmajor(rows, cols) gathered_colmajor = gatherer_colmajor(rows, cols)