Add helper function dh_null_to_nan for explicit null conv of array elements by users (deephaven#5310)

* Add dh_nulls_to_nan for explicit conv by users
* Add input check for public API func
* Add tests
* Respond to review comments and fix a bug
* Make the default for type_promotion False
lbooker42 · Apr 15, 2024 · f1316fa · f1316fa
1 parent c2017ce
commit f1316fa
Show file tree

Hide file tree

Showing 4 changed files with 99 additions and 49 deletions.
diff --git a/engine/table/src/main/java/io/deephaven/engine/util/PyCallableWrapperJpyImpl.java b/engine/table/src/main/java/io/deephaven/engine/util/PyCallableWrapperJpyImpl.java
@@ -286,6 +286,7 @@ public void verifyArguments(Class<?>[] argTypes) {
         StringBuilder argTypesStr = new StringBuilder();
         for (int i = 0; i < argTypes.length; i++) {
             Class<?> argType = argTypes[i];
+            argType = argType == boolean.class ? Boolean.class : argType;
 
             // if there are more arguments than parameters, we'll need to consider the last parameter as a varargs
             // parameter. This is not ideal. We should look for a better way to handle this, i.e. a way to convey that

diff --git a/py/server/deephaven/jcompat.py b/py/server/deephaven/jcompat.py
@@ -5,15 +5,15 @@
 """ This module provides Java compatibility support including convenience functions to create some widely used Java
 data structures from corresponding Python ones in order to be able to call Java methods. """
 
-from typing import Any, Callable, Dict, Iterable, List, Sequence, Set, TypeVar, Union, Tuple, Literal, Optional
+from typing import Any, Callable, Dict, Iterable, List, Sequence, Set, TypeVar, Union, Optional
 
 import jpy
 import numpy as np
 import pandas as pd
 
 from deephaven import dtypes, DHError
 from deephaven._wrapper import unwrap, wrap_j_object, JObjectWrapper
-from deephaven.dtypes import DType, _PRIMITIVE_DTYPE_NULL_MAP, _J_ARRAY_NP_TYPE_MAP
+from deephaven.dtypes import DType, _PRIMITIVE_DTYPE_NULL_MAP
 
 _NULL_BOOLEAN_AS_BYTE = jpy.get_type("io.deephaven.util.BooleanUtils").NULL_BOOLEAN_AS_BYTE
 _JPrimitiveArrayConversionUtility = jpy.get_type("io.deephaven.integrations.common.PrimitiveArrayConversionUtility")
@@ -216,14 +216,8 @@ def _j_array_to_numpy_array(dtype: DType, j_array: jpy.JType, conv_null: bool, t
         dtype (DType): The dtype of the Java array
         j_array (jpy.JType): The Java array to convert
         conv_null (bool): If True, convert nulls to the null value for the dtype
-        type_promotion (bool): Ignored when conv_null is False.  When type_promotion is False, (1) input Java integer,
-            boolean, or character arrays containing Deephaven nulls yield an exception, (2) input Java float or double
-            arrays containing Deephaven nulls have null values converted to np.nan, and (3) input Java arrays without
-            Deephaven nulls are converted to the target type.  When type_promotion is True, (1) input Java integer,
-            boolean, or character arrays containing Deephaven nulls are converted to np.float64 arrays and Deephaven
-            null values are converted to np.nan, (2) input Java float or double arrays containing Deephaven nulls have
-            null values converted to np.nan, and (3) input Java arrays without Deephaven nulls are converted to the
-            target type.  Defaults to False.
+        type_promotion (bool): Ignored when conv_null is False. When conv_null is True, see the description for the same
+            named parameter in dh_null_to_nan().
 
     Returns:
         np.ndarray: The numpy array or None if the Java array is None
@@ -255,26 +249,49 @@ def _j_array_to_numpy_array(dtype: DType, j_array: jpy.JType, conv_null: bool, t
         np_array = np.array(j_array, np.object_)
 
     if conv_null:
-        if dh_null := _PRIMITIVE_DTYPE_NULL_MAP.get(dtype):
-            if dtype in (dtypes.float32, dtypes.float64):
-                np_array = np.copy(np_array)
-                np_array[np_array == dh_null] = np.nan
-            else:
-                if dtype is dtypes.bool_:  # needs to change its type to byte for dh null detection
-                    np_array = np.frombuffer(np_array, np.byte)
-
-                if any(np_array[np_array == dh_null]):
-                    if not type_promotion:
-                        raise DHError(f"Problem creating numpy array.  Java {dtype} array contains Deephaven null values, but numpy {np_array.dtype} array does not support null values")
-                    np_array = np_array.astype(np.float64)
-                    np_array[np_array == dh_null] = np.nan
-                else:
-                    if dtype is dtypes.bool_:  # needs to change its type back to bool
-                        np_array = np.frombuffer(np_array, np.bool_)
-                    return np_array
+        return dh_null_to_nan(np_array, type_promotion)
 
     return np_array
 
+def dh_null_to_nan(np_array: np.ndarray, type_promotion: bool = False) -> np.ndarray:
+    """Converts Deephaven primitive null values in the given numpy array to np.nan. No conversion is performed on
+    non-primitive types.
+
+    Note, the input numpy array is never modified. For float or double arrays the conversion is done on a copy of the
+    input; for other primitive types a new np.float64 array is returned. Any other array is returned as is.
+
+    Args:
+        np_array (np.ndarray): The numpy array to convert
+        type_promotion (bool): When False, integer, boolean, or character arrays will cause an exception to be raised.
+            When True, integer, boolean, or character arrays are converted to new np.float64 arrays and Deephaven null
+            values in them are converted to np.nan. Numpy arrays of float or double types are not affected by this flag
+            and Deephaven nulls will always be converted to np.nan in a copy of the input. Defaults to False.
+
+    Returns:
+        np.ndarray: The numpy array with Deephaven nulls converted to np.nan.
+
+    Raises:
+        DHError: if np_array is not a numpy array, or it is of a non-float primitive type and type_promotion is False
+    """
+    if not isinstance(np_array, np.ndarray):
+        raise DHError(message="The given np_array argument is not a numpy array.")
+
+    dtype = dtypes.from_np_dtype(np_array.dtype)
+    if dh_null := _PRIMITIVE_DTYPE_NULL_MAP.get(dtype):
+        if dtype in (dtypes.float32, dtypes.float64):
+            np_array = np.copy(np_array)  # copy first so the caller's array is left untouched
+            np_array[np_array == dh_null] = np.nan
+        else:
+            if not type_promotion:  # raised for every non-float primitive array, whether or not it contains nulls
+                raise DHError(message=f"failed to convert DH nulls to np.nan in the numpy array. The array is "
+                                      f"of {np_array.dtype.type} type  but type_promotion is False")
+            if dtype is dtypes.bool_:  # needs to change its type to byte for dh null detection
+                np_array = np.frombuffer(np_array, np.byte)
+
+            np_array = np_array.astype(np.float64)  # astype yields a new float64 array, which can hold np.nan
+            np_array[np_array == dh_null] = np.nan
+
+    return np_array
 
 def _j_array_to_series(dtype: DType, j_array: jpy.JType, conv_null: bool) -> pd.Series:
     """Produce a copy of the specified Java array as a pandas.Series object.

diff --git a/py/server/tests/test_udf_array_args.py b/py/server/tests/test_udf_array_args.py
@@ -8,6 +8,7 @@
 import numpy as np
 
 from deephaven import empty_table, DHError, dtypes
+from deephaven.jcompat import dh_null_to_nan
 from tests.testbase import BaseTestCase
 from .test_udf_scalar_args import _J_TYPE_NP_DTYPE_MAP, _J_TYPE_NULL_MAP, _J_TYPE_J_ARRAY_TYPE_MAP
 
@@ -100,21 +101,8 @@ def test_udf(x, y: np.ndarray[{_J_TYPE_NP_DTYPE_MAP[j_dtype]}]) -> bool:
                             """
                     exec(func_str, globals())
 
-                    # for floating point types, DH nulls are auto converted to np.nan
-                    # for integer types, DH nulls in the array raise exceptions
-                    if j_dtype in ("float", "double"):
-                        res = tbl.update("Z = test_udf(X, Y)")
-                        self.assertEqual(10, res.to_string().count("true"))
-                    else:
-                        res = tbl.update("Z = test_udf(X, Y)")
-                        self.assertEqual(10, res.to_string().count("true"))
-
-                        # TODO need to wait for https://github.com/deephaven/deephaven-core/issues/5213 to be resolved
-                        # with self.assertRaises(DHError) as cm:
-                        #     tbl.update("Z = test_udf(X, Y)")
-                        # self.assertRegex(str(cm.exception), "Java .* array contains Deephaven null values,
-                        # but numpy .* "
-                        #                                     "array does not support ")
+                    res = tbl.update("Z = test_udf(X, Y)")
+                    self.assertEqual(10, res.to_string().count("true"))
 
     def test_np_object_array(self):
         with self.subTest("PyObject"):
@@ -189,11 +177,6 @@ def test_udf(p1: np.ndarray[np.bool_], p2=None) -> bool:
             t = empty_table(10).update(["X = i % 3", "Y = i % 2 == 0? true : null"]).group_by("X")
             t1 = t.update(["X1 = test_udf(Y)"])
             self.assertEqual(t1.columns[2].data_type, dtypes.bool_)
-            # TODO need to wait for https://github.com/deephaven/deephaven-core/issues/5213 to be resolved
-            # with self.assertRaises(DHError) as cm:
-            #     t1 = t.update(["X1 = test_udf(Y)"])
-            # self.assertRegex(str(cm.exception), "Java .* array contains Deephaven null values, but numpy .* "
-            #                                     "array does not support ")
             t = empty_table(10).update(["X = i % 3", "Y = i % 2 == 0? true : false"]).group_by("X")
             t1 = t.update(["X1 = test_udf(Y)"])
             self.assertEqual(t1.columns[2].data_type, dtypes.bool_)
@@ -237,6 +220,55 @@ def test_udf(x, y: Union[{th}, np.ndarray[np.int64]]) -> bool:
                         ["Z = test_udf(X, Y.toArray())"])
                     self.assertEqual(t.columns[2].data_type, dtypes.bool_)
 
+    def test_dh_null_conversion(self):
+        """Checks dh_null_to_nan on grouped columns of each primitive type, with and without type promotion."""
+        x_formula = "X = i % 10"
+        for j_dtype, null_name in _J_TYPE_NULL_MAP.items():
+            y_formula = f"Y = i % 3 == 0? {null_name} : ({j_dtype})i"
+            with self.subTest(j_dtype):
+                tbl = empty_table(100).update([x_formula, y_formula]).group_by("X")
+
+                func_str = f"""
+def test_udf(x, y: np.ndarray[{_J_TYPE_NP_DTYPE_MAP[j_dtype]}]) -> bool:
+    z = dh_null_to_nan(y, type_promotion=True)
+    check_y = (isinstance(x, int) and isinstance(y, np.ndarray) and y.dtype.type == 
+{_J_TYPE_NP_DTYPE_MAP[j_dtype]} and np.nanmean(y) == np.mean( y))
+    check_z = np.any(np.isnan(z)) and (z.dtype.type == np.float64 if y.dtype.type not in {{np.float32, np.float64}} 
+    else z.dtype == y.dtype)
+    return check_y and check_z 
+                """
+                exec(func_str, globals())
+
+                res = tbl.update("Z = test_udf(X, Y)")
+                self.assertEqual(10, res.to_string().count("true"))
+
+                func_str = f"""
+def test_udf(x, y: np.ndarray[{_J_TYPE_NP_DTYPE_MAP[j_dtype]}]) -> bool:
+    z = dh_null_to_nan(y, type_promotion=False)
+    return True
+                """
+                exec(func_str, globals())
+                if j_dtype not in {"float", "double"}:
+                    with self.assertRaises(DHError) as cm:
+                        tbl.update("Z = test_udf(X, Y)")
+                    self.assertRegex(str(cm.exception), "failed to convert DH nulls to np.nan .* type_promotion is False")
+                else:
+                    res = tbl.update("Z = test_udf(X, Y)")
+                    self.assertEqual(10, res.to_string().count("true"))
+
+        with self.subTest("boolean"):
+            def test_udf(p1: np.ndarray[np.bool_], p2=None, tp: bool = True) -> bool:
+                z = dh_null_to_nan(p1, type_promotion=tp)
+                return z.dtype.type == np.float64 and np.any(np.isnan(z))
+
+            t = empty_table(100).update(["X = i % 10", "Y = i % 3 == 0? true : null"]).group_by("X")
+            res = t.update(["X1 = test_udf(Y)"]).view(["X1"])  # X1 only: 'true' cells rendered from Y must not count
+            self.assertEqual(10, res.to_string().count("true"))
+
+            with self.assertRaises(DHError) as cm:
+                t.update(["X1 = test_udf(Y, null, false)"])
+            self.assertRegex(str(cm.exception), "failed to convert DH nulls to np.nan .* type_promotion is False")
+
 
 if __name__ == "__main__":
     unittest.main()
diff --git a/py/server/tests/test_udf_scalar_args.py b/py/server/tests/test_udf_scalar_args.py
@@ -408,7 +408,7 @@ def test_udf(p1: int, p2: float, kw1: str) -> bool:
 
             with self.assertRaises(DHError) as cm:
                 t = empty_table(1).update("X = `1`").update("Y = test_udf(1, 1.0, X = `1`)")
-            self.assertRegex(str(cm.exception), "test_udf: Expected argument .* got boolean")
+            self.assertRegex(str(cm.exception), "test_udf: Expected argument .* got class java.lang.Boolean")
 
         with self.subTest("with keyword only params"):
             def test_udf(p1: int, p2: float, *, kw1: str) -> bool:
@@ -538,7 +538,7 @@ def f6(*args: np.int32, col2: np.ndarray[np.int32]) -> bool:
 
             with self.assertRaises(DHError) as cm:
                 t1 = t.update(["X1 = f6(X, Y=null)"])
-            self.assertRegex(str(cm.exception), "f6: Expected argument \(col2\) to be either .* got boolean")
+            self.assertRegex(str(cm.exception), "f6: Expected argument \(col2\) to be either .* got class java.lang.Boolean")
 
         with self.subTest("f7"):
             def f1(x: int) -> Optional[float]: