From 922b3a430c3f030db67f19b559e9fc231d7d5d33 Mon Sep 17 00:00:00 2001 From: James Lamb Date: Thu, 31 Oct 2024 14:47:27 -0500 Subject: [PATCH 1/2] build wheels without build isolation (#301) Contributes to https://github.com/rapidsai/build-planning/issues/108 Contributes to https://github.com/rapidsai/build-planning/issues/111 Proposes some small packaging/CI changes, matching similar changes being made across RAPIDS. * building `libucxx` wheels with `--no-build-isolation` (for better `sccache` hit rate) * printing `sccache` stats to CI logs * updating to the latest `rapids-dependency-file-generator` (v1.16.0) * always explicitly specifying `cpp` / `python` in calls to `rapids-upload-wheels-to-s3` * moving more one-wheel-specific logic into `build_wheel_{project}.sh` scripts, mimicking how `cudf` has structured its scripts Authors: - James Lamb (https://github.com/jameslamb) Approvers: - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/ucxx/pull/301 --- .pre-commit-config.yaml | 2 +- ci/build_cpp.sh | 4 ++++ ci/build_wheel.sh | 35 +++++++++--------------------- ci/build_wheel_distributed_ucxx.sh | 6 ++++- ci/build_wheel_libucxx.sh | 34 ++++++++++++++++++++++++++++- ci/build_wheel_ucxx.sh | 25 ++++++++++++++++++++- 6 files changed, 77 insertions(+), 29 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 9e99aa12..89546be1 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -49,7 +49,7 @@ repos: - --fix - --rapids-version=24.12 - repo: https://github.com/rapidsai/dependency-file-generator - rev: v1.13.11 + rev: v1.16.0 hooks: - id: rapids-dependency-file-generator args: ["--clean"] diff --git a/ci/build_cpp.sh b/ci/build_cpp.sh index 03d4449b..98410678 100755 --- a/ci/build_cpp.sh +++ b/ci/build_cpp.sh @@ -17,7 +17,11 @@ rapids-print-env rapids-logger "Begin C++ and Python builds" +sccache --zero-stats + rapids-conda-retry mambabuild \ conda/recipes/ucxx +sccache --show-adv-stats + rapids-upload-conda-to-s3 cpp diff --git a/ci/build_wheel.sh b/ci/build_wheel.sh index 2d690d02..b1ede832 100755 --- a/ci/build_wheel.sh +++ b/ci/build_wheel.sh @@ -9,33 +9,18 @@ package_dir=$2 source rapids-configure-sccache source rapids-date-string -RAPIDS_PY_CUDA_SUFFIX="$(rapids-wheel-ctk-name-gen ${RAPIDS_CUDA_VERSION})" - rapids-generate-version > ./VERSION -if [[ ${package_name} == "distributed-ucxx" ]]; then - python -m pip wheel "${package_dir}/" -w "${package_dir}/dist" -vvv --no-deps --disable-pip-version-check - - RAPIDS_PY_WHEEL_NAME="distributed_ucxx_${RAPIDS_PY_CUDA_SUFFIX}" rapids-upload-wheels-to-s3 python ${package_dir}/dist -elif [[ ${package_name} == "libucxx" ]]; then - SKBUILD_CMAKE_ARGS="-DUCXX_ENABLE_RMM=ON" \ - python -m pip wheel "${package_dir}"/ -w "${package_dir}"/dist -vvv --no-deps --disable-pip-version-check - - python -m auditwheel repair -w ${package_dir}/final_dist --exclude "libucp.so.0" ${package_dir}/dist/* - - RAPIDS_PY_WHEEL_NAME="libucxx_${RAPIDS_PY_CUDA_SUFFIX}" rapids-upload-wheels-to-s3 cpp ${package_dir}/final_dist -elif [[ ${package_name} == "ucxx" ]]; then - CPP_WHEELHOUSE=$(RAPIDS_PY_WHEEL_NAME="libucxx_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 cpp /tmp/libucxx_dist) - echo "libucxx-${RAPIDS_PY_CUDA_SUFFIX} @ file://$(echo ${CPP_WHEELHOUSE}/libucxx_*.whl)" > "${package_dir}/constraints.txt" +cd "${package_dir}" - PIP_CONSTRAINT="${package_dir}/constraints.txt" \ - SKBUILD_CMAKE_ARGS="-DFIND_UCXX_CPP=ON;-DCMAKE_INSTALL_LIBDIR=ucxx/lib64;-DCMAKE_INSTALL_INCLUDEDIR=ucxx/include" \ - 
python -m pip wheel "${package_dir}"/ -w "${package_dir}"/dist -vvv --no-deps --disable-pip-version-check +sccache --zero-stats - python -m auditwheel repair -w ${package_dir}/final_dist --exclude "libucp.so.0" --exclude "libucxx.so" ${package_dir}/dist/* +rapids-logger "Building '${package_name}' wheel" +python -m pip wheel \ + -w dist \ + -v \ + --no-deps \ + --disable-pip-version-check \ + . - RAPIDS_PY_WHEEL_NAME="ucxx_${RAPIDS_PY_CUDA_SUFFIX}" rapids-upload-wheels-to-s3 python ${package_dir}/final_dist -else - echo "Unknown package '${package_name}'" - exit 1 -fi +sccache --show-adv-stats diff --git a/ci/build_wheel_distributed_ucxx.sh b/ci/build_wheel_distributed_ucxx.sh index 77c2d988..f6ee95a7 100755 --- a/ci/build_wheel_distributed_ucxx.sh +++ b/ci/build_wheel_distributed_ucxx.sh @@ -5,4 +5,8 @@ set -euo pipefail package_dir="python/distributed-ucxx" -./ci/build_wheel.sh distributed-ucxx ${package_dir} +RAPIDS_PY_CUDA_SUFFIX="$(rapids-wheel-ctk-name-gen ${RAPIDS_CUDA_VERSION})" + +./ci/build_wheel.sh distributed-ucxx "${package_dir}" + +RAPIDS_PY_WHEEL_NAME="distributed_ucxx_${RAPIDS_PY_CUDA_SUFFIX}" rapids-upload-wheels-to-s3 python "${package_dir}/dist" diff --git a/ci/build_wheel_libucxx.sh b/ci/build_wheel_libucxx.sh index c5798f2f..e9262077 100755 --- a/ci/build_wheel_libucxx.sh +++ b/ci/build_wheel_libucxx.sh @@ -3,6 +3,38 @@ set -euo pipefail +package_name="libucxx" package_dir="python/libucxx" -./ci/build_wheel.sh libucxx ${package_dir} +RAPIDS_PY_CUDA_SUFFIX="$(rapids-wheel-ctk-name-gen ${RAPIDS_CUDA_VERSION})" + +rapids-logger "Generating build requirements" + +rapids-dependency-file-generator \ + --output requirements \ + --file-key "py_build_${package_name}" \ + --file-key "py_rapids_build_${package_name}" \ + --matrix "cuda=${RAPIDS_CUDA_VERSION%.*};arch=$(arch);py=${RAPIDS_PY_VERSION};cuda_suffixed=true" \ +| tee /tmp/requirements-build.txt + +rapids-logger "Installing build requirements" +python -m pip install \ + -v \ + --prefer-binary \ + -r /tmp/requirements-build.txt + +# build with '--no-build-isolation', for better sccache hit rate +# 0 really means "add --no-build-isolation" (ref: https://github.com/pypa/pip/issues/5735) +export PIP_NO_BUILD_ISOLATION=0 + +export SKBUILD_CMAKE_ARGS="-DUCXX_ENABLE_RMM=ON" + +./ci/build_wheel.sh "${package_name}" "${package_dir}" + +mkdir -p "${package_dir}/final_dist" +python -m auditwheel repair \ + --exclude "libucp.so.0" \ + -w "${package_dir}/final_dist" \ + ${package_dir}/dist/* + +RAPIDS_PY_WHEEL_NAME="${package_name}_${RAPIDS_PY_CUDA_SUFFIX}" rapids-upload-wheels-to-s3 cpp "${package_dir}/final_dist" diff --git a/ci/build_wheel_ucxx.sh b/ci/build_wheel_ucxx.sh index 04897ffe..78602085 100755 --- a/ci/build_wheel_ucxx.sh +++ b/ci/build_wheel_ucxx.sh @@ -3,6 +3,29 @@ set -euo pipefail +package_name="ucxx" package_dir="python/ucxx" -./ci/build_wheel.sh ucxx ${package_dir} +RAPIDS_PY_CUDA_SUFFIX="$(rapids-wheel-ctk-name-gen ${RAPIDS_CUDA_VERSION})" + +# Downloads libucxx wheel from this current build, +# then ensures 'ucxx' wheel builds always use the 'libucxx' just built in the same CI run. +# +# Using env variable PIP_CONSTRAINT is necessary to ensure the constraints +# are used when creating the isolated build environment. 
+RAPIDS_PY_WHEEL_NAME="libucxx_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 cpp /tmp/libucxx_dist +echo "libucxx-${RAPIDS_PY_CUDA_SUFFIX} @ file://$(echo /tmp/libucxx_dist/libucxx_*.whl)" > /tmp/constraints.txt +export PIP_CONSTRAINT="/tmp/constraints.txt" + +export SKBUILD_CMAKE_ARGS="-DFIND_UCXX_CPP=ON;-DCMAKE_INSTALL_LIBDIR=ucxx/lib64;-DCMAKE_INSTALL_INCLUDEDIR=ucxx/include" + +./ci/build_wheel.sh "${package_name}" "${package_dir}" + +mkdir -p "${package_dir}/final_dist" +python -m auditwheel repair \ + --exclude "libucp.so.0" \ + --exclude "libucxx.so" \ + -w "${package_dir}/final_dist" \ + ${package_dir}/dist/* + +RAPIDS_PY_WHEEL_NAME="${package_name}_${RAPIDS_PY_CUDA_SUFFIX}" rapids-upload-wheels-to-s3 python "${package_dir}/final_dist" From 706917415ec5cfe11c6d21093159ac32ddb051ff Mon Sep 17 00:00:00 2001 From: Peter Andreas Entschev Date: Mon, 4 Nov 2024 15:08:33 +0100 Subject: [PATCH 2/2] Extend Python benchmarks backends (#309) Extend Python benchmarks with `socket` and `asyncio.Stream{Reader,Writer}` APIs. This should help us better understand the overhead of UCXX compared to Python's internal implementations, and thus help us improve potential suboptimal code in our implementation. Authors: - Peter Andreas Entschev (https://github.com/pentschev) Approvers: - Lawrence Mitchell (https://github.com/wence-) URL: https://github.com/rapidsai/ucxx/pull/309 --- .../ucxx/ucxx/benchmarks/backends/asyncio.py | 100 +++++++++++++++++ .../ucxx/ucxx/benchmarks/backends/socket.py | 102 ++++++++++++++++++ python/ucxx/ucxx/benchmarks/send_recv.py | 24 +++-- 3 files changed, 220 insertions(+), 6 deletions(-) create mode 100644 python/ucxx/ucxx/benchmarks/backends/asyncio.py create mode 100644 python/ucxx/ucxx/benchmarks/backends/socket.py diff --git a/python/ucxx/ucxx/benchmarks/backends/asyncio.py b/python/ucxx/ucxx/benchmarks/backends/asyncio.py new file mode 100644 index 00000000..45ae5dc6 --- /dev/null +++ b/python/ucxx/ucxx/benchmarks/backends/asyncio.py @@ -0,0 +1,100 @@ +# SPDX-FileCopyrightText: Copyright (c) 2022-2023, NVIDIA CORPORATION & AFFILIATES. 
+# SPDX-License-Identifier: BSD-3-Clause + +import asyncio +import warnings +from time import monotonic + +from ucxx.benchmarks.backends.base import BaseClient, BaseServer + + +class AsyncioServer(BaseServer): + has_cuda_support = False + + def __init__(self, args, queue): + self.args = args + self.queue = queue + self._serve_task = None + + async def _start_listener(self, port): + for i in range(10000, 60000): + try: + return i, await asyncio.start_server(self.handle_stream, "0.0.0.0", i) + except OSError: + continue + raise Exception("Could not start server") + + async def handle_stream(self, reader, writer): + for i in range(self.args.n_iter + self.args.n_warmup_iter): + try: + recv_msg = await reader.read(self.args.n_bytes) + writer.write(recv_msg) + await writer.drain() + except ConnectionResetError: + break + + writer.close() + await writer.wait_closed() + + self._serve_task.cancel() + + async def serve_forever(self): + if self.args.port is not None: + port, server = self.args.port, await asyncio.start_server( + self.handle_stream, "0.0.0.0", self.args.port + ) + else: + port, server = await self._start_listener(None) + + self.queue.put(port) + async with server: + await server.serve_forever() + + async def run(self): + self._serve_task = asyncio.create_task(self.serve_forever()) + + try: + await self._serve_task + except asyncio.CancelledError: + pass + + +class AsyncioClient(BaseClient): + has_cuda_support = False + + def __init__(self, args, queue, server_address, port): + self.args = args + self.queue = queue + self.server_address = server_address + self.port = port + + async def run(self): + reader, writer = await asyncio.open_connection( + self.server_address, self.port, limit=1024**3 + ) + + if self.args.reuse_alloc: + warnings.warn( + "Reuse allocation not supported by 'asyncio' backend, it will be " + "ignored." + ) + + send_msg = ("x" * self.args.n_bytes).encode() + + times = [] + for i in range(self.args.n_iter + self.args.n_warmup_iter): + start = monotonic() + + try: + writer.write(send_msg) + await writer.drain() + await reader.read(self.args.n_bytes) + except ConnectionResetError: + break + + stop = monotonic() + if i >= self.args.n_warmup_iter: + times.append(stop - start) + self.queue.put(times) + writer.close() + await writer.wait_closed() diff --git a/python/ucxx/ucxx/benchmarks/backends/socket.py b/python/ucxx/ucxx/benchmarks/backends/socket.py new file mode 100644 index 00000000..b6cf75d0 --- /dev/null +++ b/python/ucxx/ucxx/benchmarks/backends/socket.py @@ -0,0 +1,102 @@ +# SPDX-FileCopyrightText: Copyright (c) 2022-2023, NVIDIA CORPORATION & AFFILIATES. 
+# SPDX-License-Identifier: BSD-3-Clause + +import socket +import threading +from time import monotonic + +import numpy as np + +from ucxx.benchmarks.backends.base import BaseClient, BaseServer + + +class SocketServer(BaseServer): + has_cuda_support = False + + def __init__(self, args, queue): + self.args = args + self.queue = queue + self.server = socket.socket(socket.AF_INET, socket.SOCK_STREAM) + + def _start_listener(self, server, port): + host = "" + if port is not None: + server.bind((host, port)) + else: + for i in range(10000, 60000): + try: + server.bind((host, i)) + except OSError: + continue + else: + port = i + break + + server.listen() + return port + + def handle_client(self, client_socket): + args = self.args + + if args.reuse_alloc: + recv_msg = np.zeros(args.n_bytes, dtype="u1") + assert recv_msg.nbytes == args.n_bytes + + for _ in range(args.n_iter + args.n_warmup_iter): + if not args.reuse_alloc: + recv_msg = np.zeros(args.n_bytes, dtype="u1") + + try: + client_socket.recv_into(recv_msg.data) + client_socket.sendall(recv_msg.data) + except socket.error as e: + print(e) + break + + client_socket.close() + return + + def run(self): + port = self._start_listener(self.server, self.args.port) + self.queue.put(port) + + client_socket, addr = self.server.accept() + threading.Thread(target=self.handle_client, args=(client_socket,)).start() + + self.server.close() + + +class SocketClient(BaseClient): + has_cuda_support = False + + def __init__(self, args, queue, server_address, port): + self.args = args + self.queue = queue + self.server_address = server_address + self.port = port + self.client = socket.socket(socket.AF_INET, socket.SOCK_STREAM) + + def run(self) -> bool: + self.client.connect((self.server_address, self.port)) + send_msg = np.arange(self.args.n_bytes, dtype="u1") + assert send_msg.nbytes == self.args.n_bytes + + if self.args.reuse_alloc: + recv_msg = np.zeros(self.args.n_bytes, dtype="u1") + assert recv_msg.nbytes == self.args.n_bytes + + times = [] + for i in range(self.args.n_iter + self.args.n_warmup_iter): + start = monotonic() + + if not self.args.reuse_alloc: + recv_msg = np.zeros(self.args.n_bytes, dtype="u1") + + self.client.sendall(send_msg.data) + self.client.recv_into(recv_msg.data) + + stop = monotonic() + if i >= self.args.n_warmup_iter: + times.append(stop - start) + + self.queue.put(times) diff --git a/python/ucxx/ucxx/benchmarks/send_recv.py b/python/ucxx/ucxx/benchmarks/send_recv.py index ba07e8f8..572cd008 100644 --- a/python/ucxx/ucxx/benchmarks/send_recv.py +++ b/python/ucxx/ucxx/benchmarks/send_recv.py @@ -10,6 +10,8 @@ import ucxx from ucxx._lib_async.utils import get_event_loop +from ucxx.benchmarks.backends.asyncio import AsyncioClient, AsyncioServer +from ucxx.benchmarks.backends.socket import SocketClient, SocketServer from ucxx.benchmarks.backends.ucxx_async import ( UCXPyAsyncClient, UCXPyAsyncServer, @@ -30,13 +32,22 @@ def _get_backend_implementation(backend): return {"client": UCXPyAsyncClient, "server": UCXPyAsyncServer} elif backend == "ucxx-core": return {"client": UCXPyCoreClient, "server": UCXPyCoreServer} + elif backend == "asyncio": + return {"client": AsyncioClient, "server": AsyncioServer} + elif backend == "socket": + return {"client": SocketClient, "server": SocketServer} elif backend == "tornado": - from ucxx.benchmarks.backends.tornado import ( - TornadoClient, - TornadoServer, - ) + try: + import tornado # noqa: F401 + except ImportError as e: + raise e + else: + from ucxx.benchmarks.backends.tornado import ( + 
TornadoClient, + TornadoServer, + ) - return {"client": TornadoClient, "server": TornadoServer} + return {"client": TornadoClient, "server": TornadoServer} raise ValueError(f"Unknown backend {backend}") @@ -95,6 +106,7 @@ def client(queue, port, server_address, args): print_key_value(key="Number of buffers", value=f"{args.n_buffers}") print_key_value(key="Object type", value=f"{args.object_type}") print_key_value(key="Reuse allocation", value=f"{args.reuse_alloc}") + print_key_value(key="Backend", value=f"{args.backend}") client.print_backend_specific_config() print_separator(separator="=") if args.object_type == "numpy": @@ -289,7 +301,7 @@ def parse_args(): default="ucxx-async", type=str, help="Backend Library (-l) to use, options are: 'ucxx-async' (default), " - "'ucxx-core' and 'tornado'.", + "'ucxx-core', 'asyncio', 'socket' and 'tornado'.", ) parser.add_argument( "--progress-mode",
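
For context on how the new benchmark backends plug in: each backend in this patch is just a client/server class pair exposing the constructor and run() signatures shown above, registered via one branch in _get_backend_implementation(). The sketch below is a hypothetical minimal backend, not part of the patch; it assumes BaseClient/BaseServer require nothing beyond what SocketClient/SocketServer implement, and the "null" backend name and class names are invented purely for illustration.

# Hypothetical "null" backend sketch (editor's illustration, not in the patch):
# mirrors the structure of SocketClient/SocketServer, reporting a port and a
# list of per-iteration times through the queue passed in by send_recv.py.
from ucxx.benchmarks.backends.base import BaseClient, BaseServer


class NullServer(BaseServer):
    has_cuda_support = False

    def __init__(self, args, queue):
        self.args = args
        self.queue = queue

    def run(self):
        # A real backend would start a listener here and report the bound
        # port; this placeholder just reports a fixed value.
        self.queue.put(12345)


class NullClient(BaseClient):
    has_cuda_support = False

    def __init__(self, args, queue, server_address, port):
        self.args = args
        self.queue = queue
        self.server_address = server_address
        self.port = port

    def run(self):
        # A real backend would time args.n_iter transfers of args.n_bytes;
        # this placeholder reports zero-duration iterations.
        self.queue.put([0.0] * self.args.n_iter)


# Registration would then add one more branch to _get_backend_implementation():
#     elif backend == "null":
#         return {"client": NullClient, "server": NullServer}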