From 0ee7509797157175d76532b0d54c23b6647663a0 Mon Sep 17 00:00:00 2001
From: everpcpc
Date: Tue, 16 Jan 2024 17:19:42 +0800
Subject: [PATCH] fix: string and binary type

---
 .github/workflows/python.yml |   1 +
 python/.gitignore            |   1 +
 python/Pipfile               |  14 ++
 python/databend_udf/udf.py   | 206 +++++++++++++---------
 python/example/server.py     | 322 +++++++++++++++++++++++++++++++++++
 python/pyproject.toml        |   2 +-
 6 files changed, 463 insertions(+), 83 deletions(-)
 create mode 100644 python/Pipfile
 create mode 100644 python/example/server.py

diff --git a/.github/workflows/python.yml b/.github/workflows/python.yml
index 379d440..a37724c 100644
--- a/.github/workflows/python.yml
+++ b/.github/workflows/python.yml
@@ -34,6 +34,7 @@ jobs:
           python -m pip install build
           python -m build
       - uses: pypa/gh-action-pypi-publish@release/v1
+        if: github.event_name == 'push'
        with:
          packages-dir: python/dist/
          skip-existing: true
diff --git a/python/.gitignore b/python/.gitignore
index 8be2526..0ea8ccd 100644
--- a/python/.gitignore
+++ b/python/.gitignore
@@ -2,3 +2,4 @@ venv/
 *.egg-info/
 dist/
 __pycache__/
+Pipfile.lock
diff --git a/python/Pipfile b/python/Pipfile
new file mode 100644
index 0000000..031e0f2
--- /dev/null
+++ b/python/Pipfile
@@ -0,0 +1,14 @@
+[[source]]
+url = "https://pypi.org/simple"
+verify_ssl = true
+name = "pypi"
+
+[packages]
+databend-udf = {file = "."}
+
+[dev-packages]
+flake8 = "*"
+black = "*"
+
+[requires]
+python_version = "3.12"
diff --git a/python/databend_udf/udf.py b/python/databend_udf/udf.py
index 98f88e2..4c1baa3 100644
--- a/python/databend_udf/udf.py
+++ b/python/databend_udf/udf.py
@@ -13,8 +13,8 @@
 # limitations under the License.
 
 import json
+import logging
 import inspect
-import traceback
 from concurrent.futures import ThreadPoolExecutor
 from typing import Iterator, Callable, Optional, Union, List, Dict
 
@@ -24,11 +24,13 @@
 # comes from Databend
 MAX_DECIMAL128_PRECISION = 38
 MAX_DECIMAL256_PRECISION = 76
-EXTENSION_KEY = "Extension"
-ARROW_EXT_TYPE_VARIANT = "Variant"
+EXTENSION_KEY = b"Extension"
+ARROW_EXT_TYPE_VARIANT = b"Variant"
 
 TIMESTAMP_UINT = "us"
 
+logger = logging.getLogger(__name__)
+
 
 class UserDefinedFunction:
     """
@@ -92,8 +94,8 @@ def __init__(
     def eval_batch(self, batch: pa.RecordBatch) -> Iterator[pa.RecordBatch]:
         inputs = [[v.as_py() for v in array] for array in batch]
         inputs = [
-            _process_func(pa.list_(type), False)(array)
-            for array, type in zip(inputs, self._input_schema.types)
+            _input_process_func(_list_field(field))(array)
+            for array, field in zip(inputs, self._input_schema)
         ]
         if self._executor is not None:
             # concurrently evaluate the function for each row
@@ -122,7 +124,7 @@ def eval_batch(self, batch: pa.RecordBatch) -> Iterator[pa.RecordBatch]:
                 for row in range(batch.num_rows)
             ]
 
-        column = _process_func(pa.list_(self._result_schema.types[0]), True)(column)
+        column = _output_process_func(_list_field(self._result_schema.field(0)))(column)
         array = pa.array(column, type=self._result_schema.types[0])
 
         yield pa.RecordBatch.from_arrays([array], schema=self._result_schema)
@@ -231,7 +233,7 @@ def do_exchange(self, context, descriptor, reader, writer):
                 for output_batch in udf.eval_batch(batch.data):
                     writer.write_batch(output_batch)
         except Exception as e:
-            print(traceback.print_exc())
+            logger.exception(e)
             raise e
 
     def add_function(self, udf: UserDefinedFunction):
@@ -249,97 +251,112 @@ def add_function(self, udf: UserDefinedFunction):
             f"RETURNS {output_type} LANGUAGE python "
             f"HANDLER = '{name}' ADDRESS = 'http://{self._location}';"
         )
-        print(f"added function: {name}, corresponding SQL:\n{sql}\n")
+        logger.info(f"added function: {name}, SQL:\n{sql}\n")
 
     def serve(self):
         """Start the server."""
-        print(f"listening on {self._location}")
+        logger.info(f"listening on {self._location}")
         super(UDFServer, self).serve()
 
 
-def _null_func(*args):
-    return None
-
-
-def _process_func(type: pa.DataType, output: bool) -> Callable:
+def _input_process_func(field: pa.Field) -> Callable:
     """
-    Return a function to process input or output value.
-
-    For input type:
-    - String=pa.string(): bytes -> str
-    - Tuple=pa.struct(): dict -> tuple
-    - Json=pa.large_binary(): bytes -> Any
-    - Map=pa.map_(): list[tuple(k,v)] -> dict
+    Return a function to process input value.
 
-    For output type:
-    - Json=pa.large_binary(): Any -> str
-    - Map=pa.map_(): dict -> list[tuple(k,v)]
+    - Tuple=pa.struct(): dict -> tuple
+    - Json=pa.large_binary(): bytes -> Any
+    - Map=pa.map_(): list[tuple(k,v)] -> dict
     """
-    if pa.types.is_list(type):
-        func = _process_func(type.value_type, output)
+    if pa.types.is_list(field.type):
+        func = _input_process_func(field.type.value_field)
         return (
-            lambda array: [(func(v) if v is not None else None) for v in array]
+            lambda array: [func(v) if v is not None else None for v in array]
             if array is not None
             else None
         )
-    if pa.types.is_struct(type):
-        funcs = [_process_func(field.type, output) for field in type]
-        if output:
-            return (
-                lambda tup: tuple(
-                    (func(v) if v is not None else None) for v, func in zip(tup, funcs)
-                )
-                if tup is not None
-                else None
-            )
-        else:
-            # the input value of struct type is a dict
-            # we convert it into tuple here
-            return (
-                lambda map: tuple(
-                    (func(v) if v is not None else None)
-                    for v, func in zip(map.values(), funcs)
-                )
-                if map is not None
-                else None
+    if pa.types.is_struct(field.type):
+        funcs = [_input_process_func(f) for f in field.type]
+        # the input value of struct type is a dict
+        # we convert it into tuple here
+        return (
+            lambda map: tuple(
+                func(v) if v is not None else None
+                for v, func in zip(map.values(), funcs)
             )
-    if pa.types.is_map(type):
+            if map is not None
+            else None
+        )
+    if pa.types.is_map(field.type):
         funcs = [
-            _process_func(type.key_type, output),
-            _process_func(type.item_type, output),
+            _input_process_func(field.type.key_field),
+            _input_process_func(field.type.item_field),
         ]
-        if output:
-            # dict -> list[tuple[k,v]]
-            return (
-                lambda map: [
-                    tuple(func(v) for v, func in zip(item, funcs))
-                    for item in map.items()
-                ]
-                if map is not None
-                else None
+        # list[tuple[k,v]] -> dict
+        return (
+            lambda array: dict(
+                tuple(func(v) for v, func in zip(item, funcs)) for item in array
             )
-        else:
-            # list[tuple[k,v]] -> dict
-            return (
-                lambda array: dict(
-                    tuple(func(v) for v, func in zip(item, funcs)) for item in array
-                )
-                if array is not None
-                else None
+            if array is not None
+            else None
+        )
+    if pa.types.is_large_binary(field.type):
+        if _field_is_variant(field):
+            return lambda v: json.loads(v) if v is not None else None
+
+    return lambda v: v
+
+
+def _output_process_func(field: pa.Field) -> Callable:
+    """
+    Return a function to process output value.
+
+    - Json=pa.large_binary(): Any -> str
+    - Map=pa.map_(): dict -> list[tuple(k,v)]
+    """
+    if pa.types.is_list(field.type):
+        func = _output_process_func(field.type.value_field)
+        return (
+            lambda array: [func(v) if v is not None else None for v in array]
+            if array is not None
+            else None
+        )
+    if pa.types.is_struct(field.type):
+        funcs = [_output_process_func(f) for f in field.type]
+        return (
+            lambda tup: tuple(
+                func(v) if v is not None else None for v, func in zip(tup, funcs)
             )
+            if tup is not None
+            else None
+        )
+    if pa.types.is_map(field.type):
+        funcs = [
+            _output_process_func(field.type.key_field),
+            _output_process_func(field.type.item_field),
+        ]
+        # dict -> list[tuple[k,v]]
+        return (
+            lambda map: [
+                tuple(func(v) for v, func in zip(item, funcs)) for item in map.items()
+            ]
+            if map is not None
+            else None
+        )
+    if pa.types.is_large_binary(field.type):
+        if _field_is_variant(field):
+            return lambda v: json.dumps(_ensure_str(v)) if v is not None else None
 
-    if pa.types.is_string(type) and not output:
-        # string type is converted to LargeBinary in Databend,
-        # we cast it back to string here
-        return lambda v: v.decode("utf-8") if v is not None else None
-    if pa.types.is_large_binary(type):
-        if output:
-            return lambda v: json.dumps(v) if v is not None else None
-        else:
-            return lambda v: json.loads(v) if v is not None else None
     return lambda v: v
 
 
+def _null_func(*args):
+    return None
+
+
+def _list_field(field: pa.Field) -> pa.Field:
+    return pa.field("", pa.list_(field))
+
+
 def _to_list(x):
     if isinstance(x, list):
         return x
@@ -347,6 +364,25 @@ def _to_list(x):
         return [x]
 
 
+def _ensure_str(x):
+    if isinstance(x, bytes):
+        return x.decode("utf-8")
+    elif isinstance(x, list):
+        return [_ensure_str(v) for v in x]
+    elif isinstance(x, dict):
+        return {_ensure_str(k): _ensure_str(v) for k, v in x.items()}
+    else:
+        return x
+
+
+def _field_is_variant(field: pa.Field) -> bool:
+    if field.metadata is None:
+        return False
+    if field.metadata.get(EXTENSION_KEY) == ARROW_EXT_TYPE_VARIANT:
+        return True
+    return False
+
+
 def _to_arrow_field(t: Union[str, pa.DataType]) -> pa.Field:
     """
     Convert a string or pyarrow.DataType to pyarrow.Field.
@@ -401,7 +437,9 @@ def _type_str_to_arrow_field_inner(type_str: str) -> pa.Field:
     elif type_str in ("DATETIME", "TIMESTAMP"):
         return pa.field("", pa.timestamp(TIMESTAMP_UINT), False)
     elif type_str in ("STRING", "VARCHAR", "CHAR", "CHARACTER", "TEXT"):
-        return pa.field("", pa.string(), False)
+        return pa.field("", pa.large_utf8(), False)
+    elif type_str in ("BINARY"):
+        return pa.field("", pa.large_binary(), False)
     elif type_str in ("VARIANT", "JSON"):
         # In Databend, JSON type is identified by the "EXTENSION" key in the metadata.
         return pa.field(
@@ -460,20 +498,21 @@ def _arrow_field_to_string(field: pa.Field) -> str:
     """
     Convert a `pyarrow.Field` to a SQL data type string.
     """
-    type_str = _data_type_to_string(field.type)
+    type_str = _field_type_to_string(field)
     return f"{type_str} NOT NULL" if not field.nullable else type_str
 
 
 def _inner_field_to_string(field: pa.Field) -> str:
     # inner field default is NOT NULL in databend
-    type_str = _data_type_to_string(field.type)
+    type_str = _field_type_to_string(field)
     return f"{type_str} NULL" if field.nullable else type_str
 
 
-def _data_type_to_string(t: pa.DataType) -> str:
+def _field_type_to_string(field: pa.Field) -> str:
     """
     Convert a `pyarrow.DataType` to a SQL data type string.
     """
+    t = field.type
     if pa.types.is_boolean(t):
         return "BOOLEAN"
     elif pa.types.is_int8(t):
@@ -502,10 +541,13 @@ def _data_type_to_string(t: pa.DataType) -> str:
         return "DATE"
     elif pa.types.is_timestamp(t):
         return "TIMESTAMP"
-    elif pa.types.is_string(t):
+    elif pa.types.is_large_unicode(t):
         return "VARCHAR"
     elif pa.types.is_large_binary(t):
-        return "VARIANT"
+        if _field_is_variant(field):
+            return "VARIANT"
+        else:
+            return "BINARY"
     elif pa.types.is_list(t):
         return f"ARRAY({_inner_field_to_string(t.value_field)})"
     elif pa.types.is_map(t):
diff --git a/python/example/server.py b/python/example/server.py
new file mode 100644
index 0000000..42ad118
--- /dev/null
+++ b/python/example/server.py
@@ -0,0 +1,322 @@
+# Copyright 2023 RisingWave Labs
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import logging
+import datetime
+from decimal import Decimal
+import time
+from typing import List, Dict, Any, Tuple, Optional
+
+from databend_udf import udf, UDFServer
+
+logging.basicConfig(level=logging.INFO)
+
+
+@udf(input_types=["TINYINT", "SMALLINT", "INT", "BIGINT"], result_type="BIGINT")
+def add_signed(a, b, c, d):
+    return a + b + c + d
+
+
+@udf(input_types=["UINT8", "UINT16", "UINT32", "UINT64"], result_type="UINT64")
+def add_unsigned(a, b, c, d):
+    return a + b + c + d
+
+
+@udf(input_types=["FLOAT", "DOUBLE"], result_type="DOUBLE")
+def add_float(a, b):
+    return a + b
+
+
+@udf(input_types=["BOOLEAN", "BIGINT", "BIGINT"], result_type="BIGINT")
+def bool_select(condition, a, b):
+    return a if condition else b
+
+
+@udf(
+    name="gcd",
+    input_types=["INT", "INT"],
+    result_type="INT",
+    skip_null=True,
+)
+def gcd(x: int, y: int) -> int:
+    while y != 0:
+        (x, y) = (y, x % y)
+    return x
+
+
+@udf(input_types=["VARCHAR", "VARCHAR", "VARCHAR"], result_type="VARCHAR")
+def split_and_join(s: str, split_s: str, join_s: str) -> str:
+    return join_s.join(s.split(split_s))
+
+
+@udf(input_types=["BINARY"], result_type="BINARY")
+def binary_reverse(s: bytes) -> bytes:
+    return s[::-1]
+
+
+@udf(input_types="VARCHAR", result_type="DECIMAL(36, 18)")
+def hex_to_dec(hex: str) -> Decimal:
+    hex = hex.strip()
+
+    dec = Decimal(0)
+    while hex:
+        chunk = hex[:16]
+        chunk_value = int(hex[:16], 16)
+        dec = dec * (1 << (4 * len(chunk))) + chunk_value
+        chunk_len = len(chunk)
+        hex = hex[chunk_len:]
+    return dec
+
+
+@udf(input_types=["DECIMAL(36, 18)", "DECIMAL(36, 18)"], result_type="DECIMAL(72, 28)")
+def decimal_div(v1: Decimal, v2: Decimal) -> Decimal:
+    result = v1 / v2
+    return result.quantize(Decimal("0." + "0" * 28))
+
+
+@udf(input_types=["DATE", "INT"], result_type="DATE")
+def add_days_py(dt: datetime.date, days: int):
+    return dt + datetime.timedelta(days=days)
+
+
+@udf(input_types=["TIMESTAMP", "INT"], result_type="TIMESTAMP")
+def add_hours_py(dt: datetime.datetime, hours: int):
+    return dt + datetime.timedelta(hours=hours)
+
+
+@udf(input_types=["ARRAY(VARCHAR)", "INT"], result_type="VARCHAR")
+def array_access(array: List[str], idx: int) -> Optional[str]:
+    if idx == 0 or idx > len(array):
+        return None
+    return array[idx - 1]
+
+
+@udf(
+    input_types=["ARRAY(INT64 NULL)", "INT64"],
+    result_type="INT NOT NULL",
+    skip_null=False,
+)
+def array_index_of(array: List[int], item: int):
+    if array is None:
+        return 0
+
+    try:
+        return array.index(item) + 1
+    except ValueError:
+        return 0
+
+
+@udf(input_types=["MAP(VARCHAR,VARCHAR)", "VARCHAR"], result_type="VARCHAR")
+def map_access(map: Dict[str, str], key: str) -> str:
+    return map[key] if key in map else None
+
+
+@udf(input_types=["VARIANT", "VARCHAR"], result_type="VARIANT")
+def json_access(data: Any, key: str) -> Any:
+    return data[key]
+
+
+@udf(input_types=["ARRAY(VARIANT)"], result_type="VARIANT")
+def json_concat(list: List[Any]) -> Any:
+    return list
+
+
+@udf(
+    input_types=["TUPLE(ARRAY(VARIANT NULL), INT, VARCHAR)", "INT", "INT"],
+    result_type="TUPLE(VARIANT NULL, VARIANT NULL)",
+)
+def tuple_access(
+    tup: Tuple[List[Any], int, str], idx1: int, idx2: int
+) -> Tuple[Any, Any]:
+    v1 = None if idx1 == 0 or idx1 > len(tup) else tup[idx1 - 1]
+    v2 = None if idx2 == 0 or idx2 > len(tup) else tup[idx2 - 1]
+    return v1, v2
+
+
+ALL_SCALAR_TYPES = [
+    "BOOLEAN",
+    "TINYINT",
+    "SMALLINT",
+    "INT",
+    "BIGINT",
+    "UINT8",
+    "UINT16",
+    "UINT32",
+    "UINT64",
+    "FLOAT",
+    "DOUBLE",
+    "DATE",
+    "TIMESTAMP",
+    "VARCHAR",
+    "VARIANT",
+]
+
+
+@udf(
+    input_types=ALL_SCALAR_TYPES,
+    result_type=f"TUPLE({','.join(f'{t} NULL' for t in ALL_SCALAR_TYPES)})",
+)
+def return_all(
+    bool,
+    i8,
+    i16,
+    i32,
+    i64,
+    u8,
+    u16,
+    u32,
+    u64,
+    f32,
+    f64,
+    date,
+    timestamp,
+    varchar,
+    json,
+):
+    return (
+        bool,
+        i8,
+        i16,
+        i32,
+        i64,
+        u8,
+        u16,
+        u32,
+        u64,
+        f32,
+        f64,
+        date,
+        timestamp,
+        varchar,
+        json,
+    )
+
+
+@udf(
+    input_types=[f"ARRAY({t})" for t in ALL_SCALAR_TYPES],
+    result_type=f"TUPLE({','.join(f'ARRAY({t})' for t in ALL_SCALAR_TYPES)})",
+)
+def return_all_arrays(
+    bool,
+    i8,
+    i16,
+    i32,
+    i64,
+    u8,
+    u16,
+    u32,
+    u64,
+    f32,
+    f64,
+    date,
+    timestamp,
+    varchar,
+    json,
+):
+    return (
+        bool,
+        i8,
+        i16,
+        i32,
+        i64,
+        u8,
+        u16,
+        u32,
+        u64,
+        f32,
+        f64,
+        date,
+        timestamp,
+        varchar,
+        json,
+    )
+
+
+@udf(
+    input_types=[f"{t} NOT NULL" for t in ALL_SCALAR_TYPES],
+    result_type=f"TUPLE({','.join(f'{t}' for t in ALL_SCALAR_TYPES)})",
+)
+def return_all_non_nullable(
+    bool,
+    i8,
+    i16,
+    i32,
+    i64,
+    u8,
+    u16,
+    u32,
+    u64,
+    f32,
+    f64,
+    date,
+    timestamp,
+    varchar,
+    json,
+):
+    return (
+        bool,
+        i8,
+        i16,
+        i32,
+        i64,
+        u8,
+        u16,
+        u32,
+        u64,
+        f32,
+        f64,
+        date,
+        timestamp,
+        varchar,
+        json,
+    )
+
+
+@udf(input_types=["INT"], result_type="INT")
+def wait(x):
+    time.sleep(0.1)
+    return x
+
+
+@udf(input_types=["INT"], result_type="INT", io_threads=32)
+def wait_concurrent(x):
+    time.sleep(0.1)
+    return x
+
+
+if __name__ == "__main__":
+    udf_server = UDFServer("0.0.0.0:8815")
+    udf_server.add_function(add_signed)
+    udf_server.add_function(add_unsigned)
+    udf_server.add_function(add_float)
+    udf_server.add_function(binary_reverse)
+    udf_server.add_function(bool_select)
+    udf_server.add_function(gcd)
+    udf_server.add_function(split_and_join)
+    udf_server.add_function(decimal_div)
+    udf_server.add_function(hex_to_dec)
+    udf_server.add_function(add_days_py)
+    udf_server.add_function(add_hours_py)
+    udf_server.add_function(array_access)
+    udf_server.add_function(array_index_of)
+    udf_server.add_function(map_access)
+    udf_server.add_function(json_access)
+    udf_server.add_function(json_concat)
+    udf_server.add_function(tuple_access)
+    udf_server.add_function(return_all)
+    udf_server.add_function(return_all_arrays)
+    udf_server.add_function(return_all_non_nullable)
+    udf_server.add_function(wait)
+    udf_server.add_function(wait_concurrent)
+    udf_server.serve()
diff --git a/python/pyproject.toml b/python/pyproject.toml
index 4699c9c..bd430c8 100644
--- a/python/pyproject.toml
+++ b/python/pyproject.toml
@@ -7,7 +7,7 @@ classifiers = [
 description = "Databend UDF Server"
 license = { text = "Apache-2.0" }
 name = "databend-udf"
-version = "0.1.4"
+version = "0.2.0"
 readme = "README.md"
 requires-python = ">=3.7"
 dependencies = ["pyarrow"]
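
A quick way to sanity-check the type changes in _type_str_to_arrow_field_inner is the sketch below. It is illustrative only and not part of the patch: it imports a private helper and assumes the databend_udf package built from this branch is installed.

import pyarrow as pa
from databend_udf.udf import _type_str_to_arrow_field_inner

# STRING/VARCHAR now map to large_utf8 instead of string
assert _type_str_to_arrow_field_inner("VARCHAR").type == pa.large_utf8()

# BINARY maps to a plain large_binary field
assert _type_str_to_arrow_field_inner("BINARY").type == pa.large_binary()

# VARIANT/JSON is also large_binary, but tagged with the b"Extension" -> b"Variant" metadata
variant = _type_str_to_arrow_field_inner("VARIANT")
assert variant.type == pa.large_binary()
assert variant.metadata.get(b"Extension") == b"Variant"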