From 28bac3b995ae6c543f23df624e6de12e206ed7f0 Mon Sep 17 00:00:00 2001 From: spark-rapids automation <70000568+nvauto@users.noreply.github.com> Date: Tue, 14 Nov 2023 18:13:19 +0000 Subject: [PATCH 001/127] Auto-merge use submodule in BASE ref Signed-off-by: spark-rapids automation <70000568+nvauto@users.noreply.github.com> --- thirdparty/cudf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/thirdparty/cudf b/thirdparty/cudf index 7f3fba164c..4313cfa9b3 160000 --- a/thirdparty/cudf +++ b/thirdparty/cudf @@ -1 +1 @@ -Subproject commit 7f3fba164c4dd28c701ea2941d0525fc782a639c +Subproject commit 4313cfa9b3fcff41f67b48ac8797dc015d441ecc From 4520be90ff72bf7b8077cc236dcfbe27fc8f16f2 Mon Sep 17 00:00:00 2001 From: spark-rapids automation <70000568+nvauto@users.noreply.github.com> Date: Wed, 15 Nov 2023 00:22:23 +0000 Subject: [PATCH 002/127] Auto-merge use submodule in BASE ref Signed-off-by: spark-rapids automation <70000568+nvauto@users.noreply.github.com> --- thirdparty/cudf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/thirdparty/cudf b/thirdparty/cudf index b446a6f187..4313cfa9b3 160000 --- a/thirdparty/cudf +++ b/thirdparty/cudf @@ -1 +1 @@ -Subproject commit b446a6f187241e765c925da1053ece2679313a06 +Subproject commit 4313cfa9b3fcff41f67b48ac8797dc015d441ecc From a14a15832d3a2397e152e6746f454f53b62344bb Mon Sep 17 00:00:00 2001 From: spark-rapids automation <70000568+nvauto@users.noreply.github.com> Date: Wed, 15 Nov 2023 03:13:28 +0000 Subject: [PATCH 003/127] Auto-merge use submodule in BASE ref Signed-off-by: spark-rapids automation <70000568+nvauto@users.noreply.github.com> --- thirdparty/cudf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/thirdparty/cudf b/thirdparty/cudf index 330d389b26..4313cfa9b3 160000 --- a/thirdparty/cudf +++ b/thirdparty/cudf @@ -1 +1 @@ -Subproject commit 330d389b26a05676d9f079503a3d96b571762337 +Subproject commit 4313cfa9b3fcff41f67b48ac8797dc015d441ecc From 000b1cdf655e9591d07171c18e211199480ead43 Mon Sep 17 00:00:00 2001 From: spark-rapids automation <70000568+nvauto@users.noreply.github.com> Date: Wed, 15 Nov 2023 05:11:43 +0000 Subject: [PATCH 004/127] Auto-merge use submodule in BASE ref Signed-off-by: spark-rapids automation <70000568+nvauto@users.noreply.github.com> --- thirdparty/cudf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/thirdparty/cudf b/thirdparty/cudf index 8a0a08f34f..4313cfa9b3 160000 --- a/thirdparty/cudf +++ b/thirdparty/cudf @@ -1 +1 @@ -Subproject commit 8a0a08f34ff804a7329ea640aa1e0a9b188d2162 +Subproject commit 4313cfa9b3fcff41f67b48ac8797dc015d441ecc From 80fea2665b4418b802dfb42d20c75eda064bd930 Mon Sep 17 00:00:00 2001 From: spark-rapids automation <70000568+nvauto@users.noreply.github.com> Date: Tue, 21 Nov 2023 04:46:17 +0000 Subject: [PATCH 005/127] Auto-merge use submodule in BASE ref Signed-off-by: spark-rapids automation <70000568+nvauto@users.noreply.github.com> --- thirdparty/cudf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/thirdparty/cudf b/thirdparty/cudf index 8a0a08f34f..4313cfa9b3 160000 --- a/thirdparty/cudf +++ b/thirdparty/cudf @@ -1 +1 @@ -Subproject commit 8a0a08f34ff804a7329ea640aa1e0a9b188d2162 +Subproject commit 4313cfa9b3fcff41f67b48ac8797dc015d441ecc From 6a7991ea70370cd66c55eccd8fd8daf53cb30fed Mon Sep 17 00:00:00 2001 From: Peixin Date: Tue, 21 Nov 2023 13:58:47 +0800 Subject: [PATCH 006/127] Initiate Version 24.02.0-SNAPSHOT (#1562) * Initiate version 24.02.0-SNAPSHOT Signed-off-by: Peixin Li * update cudf
24.02 to latest commit * update cudf submodule ref * update cudf commit --------- Signed-off-by: Peixin Li --- .gitmodules | 2 +- CONTRIBUTING.md | 2 +- pom.xml | 2 +- src/main/cpp/CMakeLists.txt | 2 +- thirdparty/cudf | 2 +- 5 files changed, 5 insertions(+), 5 deletions(-) diff --git a/.gitmodules b/.gitmodules index 5051589232..6b6f69d695 100644 --- a/.gitmodules +++ b/.gitmodules @@ -1,4 +1,4 @@ [submodule "thirdparty/cudf"] path = thirdparty/cudf url = https://github.com/rapidsai/cudf.git - branch = branch-23.12 + branch = branch-24.02 diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 9c230cf1c6..13edca987a 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -149,7 +149,7 @@ $ ./build/build-in-docker install ... ``` Now cd to ~/repos/NVIDIA/spark-rapids and build with one of the options from -[spark-rapids instructions](https://github.com/NVIDIA/spark-rapids/blob/branch-23.12/CONTRIBUTING.md#building-from-source). +[spark-rapids instructions](https://github.com/NVIDIA/spark-rapids/blob/branch-24.02/CONTRIBUTING.md#building-from-source). ```bash $ ./build/buildall diff --git a/pom.xml b/pom.xml index ba03282637..4f2d19e45c 100644 --- a/pom.xml +++ b/pom.xml @@ -21,7 +21,7 @@ com.nvidia spark-rapids-jni - 23.12.0-SNAPSHOT + 24.02.0-SNAPSHOT jar RAPIDS Accelerator JNI for Apache Spark diff --git a/src/main/cpp/CMakeLists.txt b/src/main/cpp/CMakeLists.txt index 706bcfa30f..4c90b8de82 100644 --- a/src/main/cpp/CMakeLists.txt +++ b/src/main/cpp/CMakeLists.txt @@ -32,7 +32,7 @@ rapids_cuda_init_architectures(SPARK_RAPIDS_JNI) project( SPARK_RAPIDS_JNI - VERSION 23.12.00 + VERSION 24.02.00 LANGUAGES C CXX CUDA ) diff --git a/thirdparty/cudf b/thirdparty/cudf index 4313cfa9b3..947081f5b1 160000 --- a/thirdparty/cudf +++ b/thirdparty/cudf @@ -1 +1 @@ -Subproject commit 4313cfa9b3fcff41f67b48ac8797dc015d441ecc +Subproject commit 947081f5b10ca972826942b84c5c2530050325d8 From a94a28fd76fc4fe882ba968e19dd8f2ff2287297 Mon Sep 17 00:00:00 2001 From: spark-rapids automation <70000568+nvauto@users.noreply.github.com> Date: Tue, 21 Nov 2023 06:23:10 +0000 Subject: [PATCH 007/127] Auto-merge use submodule in BASE ref Signed-off-by: spark-rapids automation <70000568+nvauto@users.noreply.github.com> --- thirdparty/cudf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/thirdparty/cudf b/thirdparty/cudf index 8a0a08f34f..947081f5b1 160000 --- a/thirdparty/cudf +++ b/thirdparty/cudf @@ -1 +1 @@ -Subproject commit 8a0a08f34ff804a7329ea640aa1e0a9b188d2162 +Subproject commit 947081f5b10ca972826942b84c5c2530050325d8 From 213f986a6e0ae50f0511e06aa34f3ca12bc3c869 Mon Sep 17 00:00:00 2001 From: spark-rapids automation <70000568+nvauto@users.noreply.github.com> Date: Tue, 21 Nov 2023 11:03:12 +0000 Subject: [PATCH 008/127] Auto-merge use submodule in BASE ref Signed-off-by: spark-rapids automation <70000568+nvauto@users.noreply.github.com> --- thirdparty/cudf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/thirdparty/cudf b/thirdparty/cudf index 823d3214a9..947081f5b1 160000 --- a/thirdparty/cudf +++ b/thirdparty/cudf @@ -1 +1 @@ -Subproject commit 823d3214a9489e3c496aa31041b5d29f650e94b3 +Subproject commit 947081f5b10ca972826942b84c5c2530050325d8 From f18a5c2359932c7660f6ab8be6d28f9d23d7dc54 Mon Sep 17 00:00:00 2001 From: spark-rapids automation <70000568+nvauto@users.noreply.github.com> Date: Tue, 21 Nov 2023 15:39:37 +0000 Subject: [PATCH 009/127] Auto-merge use submodule in BASE ref Signed-off-by: spark-rapids automation <70000568+nvauto@users.noreply.github.com> --- 
thirdparty/cudf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/thirdparty/cudf b/thirdparty/cudf index 823d3214a9..947081f5b1 160000 --- a/thirdparty/cudf +++ b/thirdparty/cudf @@ -1 +1 @@ -Subproject commit 823d3214a9489e3c496aa31041b5d29f650e94b3 +Subproject commit 947081f5b10ca972826942b84c5c2530050325d8 From 1df862f55745918dde04769b4c69c5e67331326c Mon Sep 17 00:00:00 2001 From: Jenkins Automation <70000568+nvauto@users.noreply.github.com> Date: Wed, 22 Nov 2023 05:30:30 +0800 Subject: [PATCH 010/127] Update submodule cudf to fcc89503c1f1e15ec287519959013adcf2bf8a52 (#1586) Signed-off-by: spark-rapids automation <70000568+nvauto@users.noreply.github.com> --- thirdparty/cudf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/thirdparty/cudf b/thirdparty/cudf index 947081f5b1..fcc89503c1 160000 --- a/thirdparty/cudf +++ b/thirdparty/cudf @@ -1 +1 @@ -Subproject commit 947081f5b10ca972826942b84c5c2530050325d8 +Subproject commit fcc89503c1f1e15ec287519959013adcf2bf8a52 From 771857b6adc0a7ac270f9a06307610383fd01fce Mon Sep 17 00:00:00 2001 From: spark-rapids automation <70000568+nvauto@users.noreply.github.com> Date: Tue, 21 Nov 2023 22:56:16 +0000 Subject: [PATCH 011/127] Auto-merge use submodule in BASE ref Signed-off-by: spark-rapids automation <70000568+nvauto@users.noreply.github.com> --- thirdparty/cudf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/thirdparty/cudf b/thirdparty/cudf index 823d3214a9..fcc89503c1 160000 --- a/thirdparty/cudf +++ b/thirdparty/cudf @@ -1 +1 @@ -Subproject commit 823d3214a9489e3c496aa31041b5d29f650e94b3 +Subproject commit fcc89503c1f1e15ec287519959013adcf2bf8a52 From b17a2bf403a6aac9c99e8de008bfbbf7523d5235 Mon Sep 17 00:00:00 2001 From: spark-rapids automation <70000568+nvauto@users.noreply.github.com> Date: Wed, 22 Nov 2023 01:04:14 +0000 Subject: [PATCH 012/127] Auto-merge use submodule in BASE ref Signed-off-by: spark-rapids automation <70000568+nvauto@users.noreply.github.com> --- thirdparty/cudf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/thirdparty/cudf b/thirdparty/cudf index 823d3214a9..fcc89503c1 160000 --- a/thirdparty/cudf +++ b/thirdparty/cudf @@ -1 +1 @@ -Subproject commit 823d3214a9489e3c496aa31041b5d29f650e94b3 +Subproject commit fcc89503c1f1e15ec287519959013adcf2bf8a52 From 6eab7dbec2b41c904db58315e2c03f2fa84a40fe Mon Sep 17 00:00:00 2001 From: spark-rapids automation <70000568+nvauto@users.noreply.github.com> Date: Wed, 22 Nov 2023 16:36:10 +0000 Subject: [PATCH 013/127] Auto-merge use submodule in BASE ref Signed-off-by: spark-rapids automation <70000568+nvauto@users.noreply.github.com> --- thirdparty/cudf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/thirdparty/cudf b/thirdparty/cudf index 823d3214a9..fcc89503c1 160000 --- a/thirdparty/cudf +++ b/thirdparty/cudf @@ -1 +1 @@ -Subproject commit 823d3214a9489e3c496aa31041b5d29f650e94b3 +Subproject commit fcc89503c1f1e15ec287519959013adcf2bf8a52 From 77192a0ee883e8f8331a361838d751401cd143db Mon Sep 17 00:00:00 2001 From: Jenkins Automation <70000568+nvauto@users.noreply.github.com> Date: Thu, 23 Nov 2023 05:24:59 +0800 Subject: [PATCH 014/127] Update submodule cudf to f02fde9de9354a829d6f4425e086c84d36c076ae (#1593) Signed-off-by: spark-rapids automation <70000568+nvauto@users.noreply.github.com> --- thirdparty/cudf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/thirdparty/cudf b/thirdparty/cudf index fcc89503c1..f02fde9de9 160000 --- a/thirdparty/cudf +++ b/thirdparty/cudf @@ 
-1 +1 @@ -Subproject commit fcc89503c1f1e15ec287519959013adcf2bf8a52 +Subproject commit f02fde9de9354a829d6f4425e086c84d36c076ae From cbb3be51f2a7a6b2a8d0a19e91d45241bf81a5b2 Mon Sep 17 00:00:00 2001 From: Jenkins Automation <70000568+nvauto@users.noreply.github.com> Date: Thu, 23 Nov 2023 11:27:19 +0800 Subject: [PATCH 015/127] Update submodule cudf to 168533a8ad4086bd020be4f7bf9264a08b6d2243 (#1594) Signed-off-by: spark-rapids automation <70000568+nvauto@users.noreply.github.com> --- thirdparty/cudf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/thirdparty/cudf b/thirdparty/cudf index f02fde9de9..168533a8ad 160000 --- a/thirdparty/cudf +++ b/thirdparty/cudf @@ -1 +1 @@ -Subproject commit f02fde9de9354a829d6f4425e086c84d36c076ae +Subproject commit 168533a8ad4086bd020be4f7bf9264a08b6d2243 From face74b240be186fc589536eaa5f8d0da50ded41 Mon Sep 17 00:00:00 2001 From: Jenkins Automation <70000568+nvauto@users.noreply.github.com> Date: Sat, 25 Nov 2023 06:01:51 +0800 Subject: [PATCH 016/127] Update submodule cudf to db6745b5909233f0090d617c2eadb58a39c1348c (#1595) Signed-off-by: spark-rapids automation <70000568+nvauto@users.noreply.github.com> --- thirdparty/cudf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/thirdparty/cudf b/thirdparty/cudf index 168533a8ad..db6745b590 160000 --- a/thirdparty/cudf +++ b/thirdparty/cudf @@ -1 +1 @@ -Subproject commit 168533a8ad4086bd020be4f7bf9264a08b6d2243 +Subproject commit db6745b5909233f0090d617c2eadb58a39c1348c From d12c76014752a693cbbc2a6231acf5a82cdb51c3 Mon Sep 17 00:00:00 2001 From: Jenkins Automation <70000568+nvauto@users.noreply.github.com> Date: Tue, 28 Nov 2023 06:02:30 +0800 Subject: [PATCH 017/127] Update submodule cudf to c8d481e24a8cf6054cb9400213df00a4b42a1566 (#1596) Signed-off-by: spark-rapids automation <70000568+nvauto@users.noreply.github.com> --- thirdparty/cudf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/thirdparty/cudf b/thirdparty/cudf index db6745b590..c8d481e24a 160000 --- a/thirdparty/cudf +++ b/thirdparty/cudf @@ -1 +1 @@ -Subproject commit db6745b5909233f0090d617c2eadb58a39c1348c +Subproject commit c8d481e24a8cf6054cb9400213df00a4b42a1566 From 6f601cf599ad04193a6e6dd7f82a5192bd282817 Mon Sep 17 00:00:00 2001 From: spark-rapids automation <70000568+nvauto@users.noreply.github.com> Date: Tue, 28 Nov 2023 02:31:58 +0000 Subject: [PATCH 018/127] Auto-merge use submodule in BASE ref Signed-off-by: spark-rapids automation <70000568+nvauto@users.noreply.github.com> --- thirdparty/cudf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/thirdparty/cudf b/thirdparty/cudf index 823d3214a9..c8d481e24a 160000 --- a/thirdparty/cudf +++ b/thirdparty/cudf @@ -1 +1 @@ -Subproject commit 823d3214a9489e3c496aa31041b5d29f650e94b3 +Subproject commit c8d481e24a8cf6054cb9400213df00a4b42a1566 From 63f22a124f290beb13e3fe35c07264ffc5198f98 Mon Sep 17 00:00:00 2001 From: Jenkins Automation <70000568+nvauto@users.noreply.github.com> Date: Tue, 28 Nov 2023 12:02:23 +0800 Subject: [PATCH 019/127] Update submodule cudf to 5e58e71836fd69ead04fbed5fdccb5e2e2c4d95c (#1599) Signed-off-by: spark-rapids automation <70000568+nvauto@users.noreply.github.com> --- thirdparty/cudf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/thirdparty/cudf b/thirdparty/cudf index c8d481e24a..5e58e71836 160000 --- a/thirdparty/cudf +++ b/thirdparty/cudf @@ -1 +1 @@ -Subproject commit c8d481e24a8cf6054cb9400213df00a4b42a1566 +Subproject commit 5e58e71836fd69ead04fbed5fdccb5e2e2c4d95c From 
db0cc73dc17c0c0a7ce1f43c28043c404f04c592 Mon Sep 17 00:00:00 2001 From: spark-rapids automation <70000568+nvauto@users.noreply.github.com> Date: Tue, 28 Nov 2023 07:32:04 +0000 Subject: [PATCH 020/127] Auto-merge use submodule in BASE ref Signed-off-by: spark-rapids automation <70000568+nvauto@users.noreply.github.com> --- thirdparty/cudf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/thirdparty/cudf b/thirdparty/cudf index 823d3214a9..5e58e71836 160000 --- a/thirdparty/cudf +++ b/thirdparty/cudf @@ -1 +1 @@ -Subproject commit 823d3214a9489e3c496aa31041b5d29f650e94b3 +Subproject commit 5e58e71836fd69ead04fbed5fdccb5e2e2c4d95c From 10c74c37da4994ae912c202b44ed2575f81bcf75 Mon Sep 17 00:00:00 2001 From: Jenkins Automation <70000568+nvauto@users.noreply.github.com> Date: Wed, 29 Nov 2023 06:05:21 +0800 Subject: [PATCH 021/127] Update submodule cudf to 94ca0b11d94b07f991c53a9413156f90a4f73597 (#1601) Signed-off-by: spark-rapids automation <70000568+nvauto@users.noreply.github.com> --- thirdparty/cudf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/thirdparty/cudf b/thirdparty/cudf index 5e58e71836..94ca0b11d9 160000 --- a/thirdparty/cudf +++ b/thirdparty/cudf @@ -1 +1 @@ -Subproject commit 5e58e71836fd69ead04fbed5fdccb5e2e2c4d95c +Subproject commit 94ca0b11d94b07f991c53a9413156f90a4f73597 From e7dff0b3fc2a8eb6bae494e0fb4c6f82f31eb0ec Mon Sep 17 00:00:00 2001 From: spark-rapids automation <70000568+nvauto@users.noreply.github.com> Date: Tue, 28 Nov 2023 23:02:10 +0000 Subject: [PATCH 022/127] Auto-merge use submodule in BASE ref Signed-off-by: spark-rapids automation <70000568+nvauto@users.noreply.github.com> --- thirdparty/cudf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/thirdparty/cudf b/thirdparty/cudf index 68cb1d944b..94ca0b11d9 160000 --- a/thirdparty/cudf +++ b/thirdparty/cudf @@ -1 +1 @@ -Subproject commit 68cb1d944b8b2f1c7e3564dc66eacc7f0b19ecee +Subproject commit 94ca0b11d94b07f991c53a9413156f90a4f73597 From 76f6030ac732b3593f4164a8eb1785a08f24b08a Mon Sep 17 00:00:00 2001 From: Jenkins Automation <70000568+nvauto@users.noreply.github.com> Date: Wed, 29 Nov 2023 11:23:30 +0800 Subject: [PATCH 023/127] Update submodule cudf to 8da62049aee750b391ff6d8ca4937428f94fd10c (#1604) Signed-off-by: spark-rapids automation <70000568+nvauto@users.noreply.github.com> --- thirdparty/cudf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/thirdparty/cudf b/thirdparty/cudf index 94ca0b11d9..8da62049ae 160000 --- a/thirdparty/cudf +++ b/thirdparty/cudf @@ -1 +1 @@ -Subproject commit 94ca0b11d94b07f991c53a9413156f90a4f73597 +Subproject commit 8da62049aee750b391ff6d8ca4937428f94fd10c From e5cf1afec33cec3258b188f578f685f40acb6066 Mon Sep 17 00:00:00 2001 From: "Robert (Bobby) Evans" Date: Wed, 29 Nov 2023 08:13:30 -0600 Subject: [PATCH 024/127] Update state retry state machine for CPU alloc support (#1543) Signed-off-by: Robert (Bobby) Evans --- src/main/cpp/src/SparkResourceAdaptorJni.cpp | 1591 +++++++++++------ .../nvidia/spark/rapids/jni/CpuRetryOOM.java | 32 + .../spark/rapids/jni/CpuSplitAndRetryOOM.java | 32 + .../jni/{RetryOOM.java => GpuRetryOOM.java} | 8 +- ...RetryOOM.java => GpuSplitAndRetryOOM.java} | 8 +- .../nvidia/spark/rapids/jni/OffHeapOOM.java | 32 + .../com/nvidia/spark/rapids/jni/RmmSpark.java | 307 +++- .../spark/rapids/jni/RmmSparkThreadState.java | 26 +- .../rapids/jni/SparkResourceAdaptor.java | 156 +- .../spark/rapids/jni/ThreadStateRegistry.java | 67 + .../jni/LimitingOffHeapAllocForTests.java | 90 + 
.../spark/rapids/jni/RmmSparkMonteCarlo.java | 78 +- .../nvidia/spark/rapids/jni/RmmSparkTest.java | 638 +++++-- 13 files changed, 2265 insertions(+), 800 deletions(-) create mode 100644 src/main/java/com/nvidia/spark/rapids/jni/CpuRetryOOM.java create mode 100644 src/main/java/com/nvidia/spark/rapids/jni/CpuSplitAndRetryOOM.java rename src/main/java/com/nvidia/spark/rapids/jni/{RetryOOM.java => GpuRetryOOM.java} (85%) rename src/main/java/com/nvidia/spark/rapids/jni/{SplitAndRetryOOM.java => GpuSplitAndRetryOOM.java} (85%) create mode 100644 src/main/java/com/nvidia/spark/rapids/jni/OffHeapOOM.java create mode 100644 src/main/java/com/nvidia/spark/rapids/jni/ThreadStateRegistry.java create mode 100644 src/test/java/com/nvidia/spark/rapids/jni/LimitingOffHeapAllocForTests.java diff --git a/src/main/cpp/src/SparkResourceAdaptorJni.cpp b/src/main/cpp/src/SparkResourceAdaptorJni.cpp index 16c950d121..d3821fcc18 100644 --- a/src/main/cpp/src/SparkResourceAdaptorJni.cpp +++ b/src/main/cpp/src/SparkResourceAdaptorJni.cpp @@ -19,6 +19,7 @@ #include #include #include +#include #include @@ -32,8 +33,44 @@ namespace { -constexpr char const* RETRY_OOM_CLASS = "com/nvidia/spark/rapids/jni/RetryOOM"; -constexpr char const* SPLIT_AND_RETRY_OOM_CLASS = "com/nvidia/spark/rapids/jni/SplitAndRetryOOM"; +constexpr char const* GPU_RETRY_OOM_CLASS = "com/nvidia/spark/rapids/jni/GpuRetryOOM"; +constexpr char const* GPU_SPLIT_AND_RETRY_OOM_CLASS = + "com/nvidia/spark/rapids/jni/GpuSplitAndRetryOOM"; +constexpr char const* CPU_RETRY_OOM_CLASS = "com/nvidia/spark/rapids/jni/CpuRetryOOM"; +constexpr char const* CPU_SPLIT_AND_RETRY_OOM_CLASS = + "com/nvidia/spark/rapids/jni/CpuSplitAndRetryOOM"; +constexpr char const* THREAD_REG_CLASS = "com/nvidia/spark/rapids/jni/ThreadStateRegistry"; +constexpr char const* IS_THREAD_BLOCKED = "isThreadBlocked"; +constexpr char const* IS_THREAD_BLOCKED_SIG = "(J)Z"; +constexpr char const* REMOVE_THREAD = "removeThread"; +constexpr char const* REMOVE_THREAD_SIG = "(J)V"; + +// This is a bit of a hack to cache the methods because CUDF is getting java to do an onload +// there. +std::mutex jni_mutex; +bool is_jni_loaded = false; +jclass ThreadStateRegistry_jclass; +jmethodID removeThread_method; +jmethodID isThreadBlocked_method; + +void cache_thread_reg_jni(JNIEnv* env) +{ + std::unique_lock lock(jni_mutex); + if (is_jni_loaded) { return; } + jclass cls = env->FindClass(THREAD_REG_CLASS); + if (cls == nullptr) { return; } + + removeThread_method = env->GetStaticMethodID(cls, REMOVE_THREAD, REMOVE_THREAD_SIG); + if (removeThread_method == nullptr) { return; } + + isThreadBlocked_method = env->GetStaticMethodID(cls, IS_THREAD_BLOCKED, IS_THREAD_BLOCKED_SIG); + if (isThreadBlocked_method == nullptr) { return; } + + // Convert local reference to global so it cannot be garbage collected. + ThreadStateRegistry_jclass = static_cast(env->NewGlobalRef(cls)); + if (ThreadStateRegistry_jclass == nullptr) { return; } + is_jni_loaded = true; +} // In the task states BUFN means Block Until Further Notice. // Meaning the thread should be blocked until another task finishes. @@ -42,28 +79,19 @@ constexpr char const* SPLIT_AND_RETRY_OOM_CLASS = "com/nvidia/spark/rapids/jni/S // again until we know that progress has been made. We might add an API // in the future to know when a retry section has passed, which would // probably be a preferable time to restart all BUFN threads. 
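// [Editor's aside, not part of the patch] An illustrative walk through the
// renamed state machine defined below, for a dedicated task thread that loses
// an allocation race; the exact transition triggers are inferred from the
// per-state comments in this diff:
//   THREAD_RUNNING    -> THREAD_ALLOC         allocation in flight
//   THREAD_ALLOC      -> THREAD_BLOCKED       no memory available; wait for a free
//   THREAD_BLOCKED    -> THREAD_BUFN_THROW    lowest-priority thread wakes and
//                                             throws a retry OOM to roll back
//   THREAD_BUFN_THROW -> THREAD_BUFN_WAIT -> THREAD_BUFN
//                                             rolled back; blocked until another
//                                             task makes progress
//   THREAD_BUFN       -> THREAD_SPLIT_THROW   everyone is BUFN: the lowest-priority
//                                             thread splits its input and retries
// after which the thread returns to THREAD_RUNNING for the next attempt.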
-enum thread_state { +enum class thread_state { UNKNOWN = -1, // unknown state, this is really here for logging and anything transitioning to // this state should actually be accomplished by deleting the thread from the state - TASK_RUNNING = 0, // task thread running normally - TASK_WAIT_ON_SHUFFLE = 1, // task thread waiting on shuffle - TASK_BUFN_WAIT_ON_SHUFFLE = 2, // task thread waiting on shuffle, but marked as BUFN - TASK_ALLOC = 3, // task thread in the middle of doing an allocation - TASK_ALLOC_FREE = 4, // task thread in the middle of doing an allocation and a free happened - TASK_BLOCKED = 5, // task thread that is temporarily blocked - TASK_BUFN_THROW = 6, // task thread that should throw an exception to roll back before blocking - TASK_BUFN_WAIT = 7, // task thread that threw an exception to roll back and now should - // block the next time alloc or block_until_ready is called - TASK_BUFN = 8, // task thread that is blocked until higher priority tasks start to succeed - TASK_SPLIT_THROW = 9, // task thread that should throw an exception to split input and retry - TASK_REMOVE_THROW = 10, // task thread that is being removed and needs to throw an exception - // to start the blocked thread running again. - SHUFFLE_RUNNING = 11, // shuffle thread that is running normally - SHUFFLE_ALLOC = 12, // shuffle thread that is in the middle of doing an alloc - SHUFFLE_ALLOC_FREE = 13, // shuffle thread that is doing an alloc and a free happened. - SHUFFLE_BLOCKED = 14, // shuffle thread that is temporarily blocked - SHUFFLE_THROW = 15, // shuffle thread that needs to throw an OOM - SHUFFLE_REMOVE_THROW = 16 // shuffle thread that is being removed and needs to throw an exception + THREAD_RUNNING = 0, // task thread running normally + THREAD_ALLOC = 1, // task thread in the middle of doing an allocation + THREAD_ALLOC_FREE = 2, // task thread in the middle of doing an allocation and a free happened + THREAD_BLOCKED = 3, // task thread that is temporarily blocked + THREAD_BUFN_THROW = 4, // task thread that should throw an exception to roll back before blocking + THREAD_BUFN_WAIT = 5, // task thread that threw an exception to roll back and now should + // block the next time alloc or block_until_ready is called + THREAD_BUFN = 6, // task thread that is blocked until higher priority tasks start to succeed + THREAD_SPLIT_THROW = 7, // task thread that should throw an exception to split input and retry + THREAD_REMOVE_THROW = 8, // task thread that is being removed and needs to throw an exception }; /** @@ -72,23 +100,15 @@ enum thread_state { const char* as_str(thread_state state) { switch (state) { - case TASK_RUNNING: return "TASK_RUNNING"; - case TASK_WAIT_ON_SHUFFLE: return "TASK_WAIT_ON_SHUFFLE"; - case TASK_BUFN_WAIT_ON_SHUFFLE: return "TASK_BUFN_WAIT_ON_SHUFFLE"; - case TASK_ALLOC: return "TASK_ALLOC"; - case TASK_ALLOC_FREE: return "TASK_ALLOC_FREE"; - case TASK_BLOCKED: return "TASK_BLOCKED"; - case TASK_BUFN_THROW: return "TASK_BUFN_THROW"; - case TASK_BUFN_WAIT: return "TASK_BUFN_WAIT"; - case TASK_BUFN: return "TASK_BUFN"; - case TASK_SPLIT_THROW: return "TASK_SPLIT_THROW"; - case TASK_REMOVE_THROW: return "TASK_REMOVE_THROW"; - case SHUFFLE_RUNNING: return "SHUFFLE_RUNNING"; - case SHUFFLE_ALLOC: return "SHUFFLE_ALLOC"; - case SHUFFLE_ALLOC_FREE: return "SHUFFLE_ALLOC_FREE"; - case SHUFFLE_BLOCKED: return "SHUFFLE_BLOCKED"; - case SHUFFLE_THROW: return "SHUFFLE_THROW"; - case SHUFFLE_REMOVE_THROW: return "SHUFFLE_REMOVE_THROW"; + case thread_state::THREAD_RUNNING: return 
"THREAD_RUNNING"; + case thread_state::THREAD_ALLOC: return "THREAD_ALLOC"; + case thread_state::THREAD_ALLOC_FREE: return "THREAD_ALLOC_FREE"; + case thread_state::THREAD_BLOCKED: return "THREAD_BLOCKED"; + case thread_state::THREAD_BUFN_THROW: return "THREAD_BUFN_THROW"; + case thread_state::THREAD_BUFN_WAIT: return "THREAD_BUFN_WAIT"; + case thread_state::THREAD_BUFN: return "THREAD_BUFN"; + case thread_state::THREAD_SPLIT_THROW: return "THREAD_SPLIT_THROW"; + case thread_state::THREAD_REMOVE_THROW: return "THREAD_REMOVE_THROW"; default: return "UNKNOWN"; } } @@ -120,13 +140,13 @@ static auto make_logger(std::string const& filename) * the lowest priority task and is constantly retried while newer tasks move to the front of the * line. So a higher task_id should be a lower priority. * - * We also want all shuffle threads to have the highest priority possible. So we assign them + * We also want all non-task threads to have the highest priority possible. So we assign them * a task id of -1. The problem is overflow on a long, so for the priority of a task the formula * will be MAX_LONG - (task_id + 1). */ class thread_priority { public: - thread_priority(long tsk_id, long t_id) : task_id(tsk_id), thread_id(t_id) {} + thread_priority(long const tsk_id, long const t_id) : task_id(tsk_id), thread_id(t_id) {} long get_thread_id() const { return thread_id; } @@ -169,6 +189,43 @@ class thread_priority { long task_priority() const { return std::numeric_limits::max() - (task_id + 1); } }; +/** + * Holds metrics for a given task/thread about retry counts and times. It is here + * because the mapping between tasks and threads can be complicated and can span + * different time ranges too. + */ +struct task_metrics { + // metric for being able to report how many times each type of exception was thrown, + // and some timings + int num_times_retry_throw = 0; + int num_times_split_retry_throw = 0; + long time_blocked_nanos = 0; + // The amount of time that this thread has lost due to retries (not including blocked time) + long time_lost_nanos = 0; + + void take_from(task_metrics& other) + { + add(other); + other.clear(); + } + + void add(task_metrics const& other) + { + this->num_times_retry_throw += other.num_times_retry_throw; + this->num_times_split_retry_throw += other.num_times_split_retry_throw; + this->time_blocked_nanos += other.time_blocked_nanos; + this->time_lost_nanos += other.time_lost_nanos; + } + + void clear() + { + num_times_retry_throw = 0; + num_times_split_retry_throw = 0; + time_blocked_nanos = 0; + time_lost_nanos = 0; + } +}; + /** * This is the full state of a thread. Some things like the thread_id and task_id * should not change after the state is set up. Everything else is up for change, @@ -177,35 +234,37 @@ class thread_priority { */ class full_thread_state { public: - full_thread_state(thread_state state, long thread_id) : state(state), thread_id(thread_id) {} - full_thread_state(thread_state state, long thread_id, long task_id) + full_thread_state(thread_state const state, long const thread_id) + : state(state), thread_id(thread_id) + { + } + full_thread_state(thread_state const state, long const thread_id, long const task_id) : state(state), thread_id(thread_id), task_id(task_id) { } thread_state state; long thread_id; - long task_id = -1; + long task_id = -1; + bool is_for_shuffle = false; + std::unordered_set pool_task_ids; + bool is_cpu_alloc = false; + // Is the thread transitively blocked on a pool or not. 
+ bool pool_blocked = false; int retry_oom_injected = 0; int split_and_retry_oom_injected = 0; int cudf_exception_injected = 0; // watchdog limit on maximum number of retries to avoid unexpected live lock situations int num_times_retried = 0; - // metric for being able to report how many times each type of exception was thrown, - // and some timings - int num_times_retry_throw = 0; - int num_times_split_retry_throw = 0; - long time_blocked_nanos = 0; - // The amount of time that this thread has lost due to retries (not inclduing blocked time) - long time_lost_nanos = 0; - // The amount of time that this thread has spent in the current retry block (not inclucing block - // time) - long time_retry_running_nanos = 0; // When did the retry time for this thread start, or when did the block time end. std::chrono::time_point retry_start_or_block_end; // Is this thread currently in a marked retry block. This is only used for metrics. bool is_in_retry = false; - + // The amount of time that this thread has spent in the current retry block (not including block + // time) + long time_retry_running_nanos = 0; std::chrono::time_point block_start; + // metrics for the current thread + task_metrics metrics; std::unique_ptr wake_condition = std::make_unique(); @@ -214,7 +273,7 @@ class full_thread_state { * Transition to a new state. Ideally this is what is called when doing a state transition instead * of setting the state directly. */ - void transition_to(thread_state new_state) + void transition_to(thread_state const new_state) { if (new_state == thread_state::UNKNOWN) { throw std::runtime_error( @@ -232,24 +291,18 @@ class full_thread_state { void after_block() { - auto end = std::chrono::steady_clock::now(); - auto diff = end - block_start; - time_blocked_nanos += std::chrono::duration_cast(diff).count(); + auto const end = std::chrono::steady_clock::now(); + auto const diff = end - block_start; + metrics.time_blocked_nanos += + std::chrono::duration_cast(diff).count(); if (is_in_retry) { retry_start_or_block_end = end; } } - long get_and_reset_failed_retry_time() - { - long ret = time_lost_nanos; - time_lost_nanos = 0; - return ret; - } - void record_failed_retry_time() { if (is_in_retry) { record_and_reset_pending_retry_time(); - time_lost_nanos += time_retry_running_nanos; + metrics.time_lost_nanos += time_retry_running_nanos; time_retry_running_nanos = 0; } } @@ -257,15 +310,15 @@ class full_thread_state { void record_and_reset_pending_retry_time() { if (is_in_retry) { - auto end = std::chrono::steady_clock::now(); - auto diff = end - retry_start_or_block_end; + auto const end = std::chrono::steady_clock::now(); + auto const diff = end - retry_start_or_block_end; time_retry_running_nanos += std::chrono::duration_cast(diff).count(); retry_start_or_block_end = end; } } - void reset_retry_state(bool is_in_retry) + void reset_retry_state(bool const is_in_retry) { time_retry_running_nanos = 0; if (is_in_retry) { retry_start_or_block_end = std::chrono::steady_clock::now(); } @@ -275,7 +328,21 @@ class full_thread_state { /** * Get the priority of this thread. */ - thread_priority priority() { return thread_priority(task_id, thread_id); } + thread_priority priority() const + { + if (task_id < 0 && !is_for_shuffle) { + // The task id for a non-shuffle pool thread is the same as the lowest task id + // it is currently working on. 
+ auto const min_id = std::min_element(pool_task_ids.begin(), pool_task_ids.end()); + if (min_id != pool_task_ids.end()) { + return thread_priority(*min_id, thread_id); + } else { + return thread_priority(-1, thread_id); + } + } else { + return thread_priority(task_id, thread_id); + } + } }; /** @@ -289,8 +356,9 @@ class spark_resource_adaptor final : public rmm::mr::device_memory_resource { public: spark_resource_adaptor(JNIEnv* env, rmm::mr::device_memory_resource* mr, - std::shared_ptr& logger) - : resource{mr}, logger{logger} + std::shared_ptr& logger, + bool const is_log_enabled) + : resource{mr}, logger{logger}, is_log_enabled{is_log_enabled} { if (env->GetJavaVM(&jvm) < 0) { throw std::runtime_error("GetJavaVM failed"); } logger->flush_on(spdlog::level::info); @@ -306,38 +374,69 @@ class spark_resource_adaptor final : public rmm::mr::device_memory_resource { bool supports_streams() const noexcept override { return resource->supports_streams(); } /** - * Update the internal state so that a specific thread is associated with a task. + * Update the internal state so that a specific thread is dedicated to a task. * This may be called multiple times for a given thread and if the thread is already - * associated with the task, then most of the time this is a noop. The only exception + * dedicated to the task, then most of the time this is a noop. The only exception * is if the thread is marked that it is shutting down, but has not completed yet. * This should never happen in practice with Spark because the only time we would * shut down a task thread on a thread that is different from itself is if there * was an error and the entire executor is shutting down. So there should be no * reuse. */ - void associate_thread_with_task(long thread_id, long task_id) + void start_dedicated_task_thread(long const thread_id, long const task_id) { std::unique_lock lock(state_mutex); if (shutting_down) { throw std::runtime_error("spark_resource_adaptor is shutting down"); } - auto was_threads_inserted = - threads.emplace(thread_id, full_thread_state(thread_state::TASK_RUNNING, thread_id, task_id)); + auto const found = threads.find(thread_id); + if (found != threads.end()) { + if (found->second.task_id >= 0 && found->second.task_id != task_id) { + if (is_log_enabled) { + std::stringstream ss; + ss << "desired task_id " << task_id; + + log_status("FIXUP", thread_id, found->second.task_id, found->second.state, ss.str()); + } + remove_thread_association(thread_id, found->second.task_id, lock); + } + } + auto const was_threads_inserted = threads.emplace( + thread_id, full_thread_state(thread_state::THREAD_RUNNING, thread_id, task_id)); if (was_threads_inserted.second == false) { - if (was_threads_inserted.first->second.task_id != task_id) { - throw std::invalid_argument("a thread can only be associated with a single task."); + if (was_threads_inserted.first->second.state == thread_state::THREAD_REMOVE_THROW) { + std::stringstream ss; + ss << "A thread " << thread_id << " is shutting down " + << was_threads_inserted.first->second.task_id << " vs " << task_id; + + auto const msg = ss.str(); + log_status("ERROR", + thread_id, + was_threads_inserted.first->second.task_id, + was_threads_inserted.first->second.state, + msg); + throw std::invalid_argument(msg); } - if (was_threads_inserted.first->second.state == thread_state::TASK_REMOVE_THROW) { - throw std::invalid_argument("the thread is in the process of shutting down."); + if (was_threads_inserted.first->second.task_id != task_id) { + std::stringstream ss; + 
ss << "A thread " << thread_id << " can only be dedicated to a single task." + << was_threads_inserted.first->second.task_id << " != " << task_id; + auto const msg = ss.str(); + log_status("ERROR", + thread_id, + was_threads_inserted.first->second.task_id, + was_threads_inserted.first->second.state, + msg); + throw std::invalid_argument(msg); } } try { - auto was_inserted = task_to_threads.insert({task_id, {thread_id}}); + auto const was_inserted = task_to_threads.insert({task_id, {thread_id}}); if (was_inserted.second == false) { // task_to_threads already has a task_id for this, so insert the thread_id was_inserted.first->second.insert(thread_id); } - } catch (const std::exception&) { + } catch (std::exception const&) { if (was_threads_inserted.second == true) { // roll back the thread insertion threads.erase(thread_id); @@ -345,60 +444,100 @@ class spark_resource_adaptor final : public rmm::mr::device_memory_resource { throw; } if (was_threads_inserted.second == true) { - log_transition(thread_id, task_id, thread_state::UNKNOWN, thread_state::TASK_RUNNING); + log_transition(thread_id, task_id, thread_state::UNKNOWN, thread_state::THREAD_RUNNING); } } - void start_retry_block(long thread_id) + void start_retry_block(long const thread_id) { std::unique_lock lock(state_mutex); - auto thread = threads.find(thread_id); + auto const thread = threads.find(thread_id); if (thread != threads.end()) { thread->second.reset_retry_state(true); } } - void end_retry_block(long thread_id) + void end_retry_block(long const thread_id) { std::unique_lock lock(state_mutex); - auto thread = threads.find(thread_id); + auto const thread = threads.find(thread_id); if (thread != threads.end()) { thread->second.reset_retry_state(false); } } - long get_and_reset_lost_time(long task_id) + /** + * Update the internal state so that a specific thread is associated with transitive + * thread pools and is working on a set of tasks. + * This may be called multiple times for a given thread and the set of tasks will be + * updated accordingly. 
+ */ + void pool_thread_working_on_tasks(bool const is_for_shuffle, + long const thread_id, + std::unordered_set const& task_ids) { std::unique_lock lock(state_mutex); - long ret = 0; - auto task_at = task_to_threads.find(task_id); - if (task_at != task_to_threads.end()) { - for (auto thread_id : task_at->second) { - auto threads_at = threads.find(thread_id); - if (threads_at != threads.end()) { - ret += threads_at->second.get_and_reset_failed_retry_time(); - } + if (shutting_down) { throw std::runtime_error("spark_resource_adaptor is shutting down"); } + + auto const was_inserted = + threads.emplace(thread_id, full_thread_state(thread_state::THREAD_RUNNING, thread_id)); + if (was_inserted.second == true) { + was_inserted.first->second.is_for_shuffle = is_for_shuffle; + log_transition(thread_id, -1, thread_state::UNKNOWN, thread_state::THREAD_RUNNING); + } else if (was_inserted.first->second.task_id != -1) { + throw std::invalid_argument("the thread is associated with a non-pool task already"); + } else if (was_inserted.first->second.state == thread_state::THREAD_REMOVE_THROW) { + throw std::invalid_argument("the thread is in the process of shutting down."); + } else if (was_inserted.first->second.is_for_shuffle != is_for_shuffle) { + if (is_for_shuffle) { + throw std::invalid_argument( + "the thread is marked as a non-shuffle thread, and we cannot change it while there are " + "active tasks"); + } else { + throw std::invalid_argument( + "the thread is marked as a shuffle thread, and we cannot change it while there are " + "active tasks"); } } - return ret; + + // save the metrics for all tasks before we add any new ones. + checkpoint_metrics(was_inserted.first->second); + + was_inserted.first->second.pool_task_ids.insert(task_ids.begin(), task_ids.end()); + if (is_log_enabled) { + std::stringstream ss; + ss << "CURRENT IDs "; + for (const auto& task_id : was_inserted.first->second.pool_task_ids) { + ss << task_id << " "; + } + log_status("ADD_TASKS", thread_id, -1, was_inserted.first->second.state, ss.str()); + } } - /** - * Update the internal state so that a specific thread is associated with shuffle. - * This may be called multiple times for a given thread and if the thread is already - * associated with shuffle, the this is a noop in most cases. The only time - * this is an error is if the thread is already marked as shutting down and has - * not completed that transition yet. - */ - void associate_thread_with_shuffle(long thread_id) + void pool_thread_finished_for_tasks(long const thread_id, + std::unordered_set const& task_ids) { std::unique_lock lock(state_mutex); if (shutting_down) { throw std::runtime_error("spark_resource_adaptor is shutting down"); } - auto was_inserted = - threads.emplace(thread_id, full_thread_state(thread_state::SHUFFLE_RUNNING, thread_id)); - if (was_inserted.second == true) { - log_transition(thread_id, -1, thread_state::UNKNOWN, thread_state::SHUFFLE_RUNNING); - } else if (was_inserted.first->second.task_id != -1) { - throw std::invalid_argument("the thread is associated with a non-shuffle task already"); - } else if (was_inserted.first->second.state == thread_state::SHUFFLE_REMOVE_THROW) { - throw std::invalid_argument("the thread is in the process of shutting down."); + auto const thread = threads.find(thread_id); + if (thread != threads.end()) { + // save the metrics for all tasks before we remove any of them. 
+ checkpoint_metrics(thread->second); + + // Now drop the tasks from the pool + for (auto const& id : task_ids) { + thread->second.pool_task_ids.erase(id); + } + if (is_log_enabled) { + std::stringstream ss; + ss << "CURRENT IDs "; + for (const auto& id : thread->second.pool_task_ids) { + ss << id << " "; + } + log_status("REMOVE_TASKS", thread_id, -1, thread->second.state, ss.str()); + } + if (thread->second.pool_task_ids.empty()) { + if (remove_thread_association(thread_id, -1, lock)) { + wake_up_threads_after_task_finishes(lock); + } + } } } @@ -409,10 +548,12 @@ class spark_resource_adaptor final : public rmm::mr::device_memory_resource { * up and throw an exception. At that point the thread's state will be completely * removed. */ - void remove_thread_association(long thread_id) + void remove_thread_association(long const thread_id, long const task_id) { std::unique_lock lock(state_mutex); - if (remove_thread_association(thread_id, lock)) { wake_up_threads_after_task_finishes(lock); } + if (remove_thread_association(thread_id, task_id, lock)) { + wake_up_threads_after_task_finishes(lock); + } } /** @@ -421,22 +562,65 @@ class spark_resource_adaptor final : public rmm::mr::device_memory_resource { * threads are currently blocked/waiting then the state will not be totally * removed until the thread is woken. */ - void task_done(long task_id) + void task_done(long const task_id) { std::unique_lock lock(state_mutex); - auto task_at = task_to_threads.find(task_id); + bool run_checks = false; + auto const task_at = task_to_threads.find(task_id); if (task_at != task_to_threads.end()) { // we want to make a copy so there is no conflict here... - std::set threads_to_remove = task_at->second; - bool run_checks = false; - for (auto thread_id : threads_to_remove) { - run_checks = remove_thread_association(thread_id, lock) || run_checks; + std::set const threads_to_remove = task_at->second; + for (auto const thread_id : threads_to_remove) { + run_checks = remove_thread_association(thread_id, task_id, lock) || run_checks; } - if (run_checks) { wake_up_threads_after_task_finishes(lock); } } + std::unordered_set thread_ids; + for (auto const& [thread_id, ignored] : threads) { + thread_ids.insert(thread_id); + } + for (auto const& thread_id : thread_ids) { + auto const thread = threads.find(thread_id); + if (thread != threads.end()) { + if (thread->second.pool_task_ids.erase(task_id) != 0) { + if (is_log_enabled) { + std::stringstream ss; + ss << "CURRENT IDs "; + for (const auto& id : thread->second.pool_task_ids) { + ss << id << " "; + } + log_status("REMOVE_TASKS", thread_id, -1, thread->second.state, ss.str()); + } + if (thread->second.pool_task_ids.empty()) { + run_checks = remove_thread_association(thread_id, task_id, lock) || run_checks; + } + } + } + } + + if (run_checks) { wake_up_threads_after_task_finishes(lock); } task_to_threads.erase(task_id); } + /** + * A dedicated task thread is submitting to a pool. + */ + void submitting_to_pool(long const thread_id) { waiting_on_pool_status_changed(thread_id, true); } + + /** + * A dedicated task thread is waiting on a result from a pool. + */ + void waiting_on_pool(long const thread_id) { waiting_on_pool_status_changed(thread_id, true); } + + /** + * A dedicated task thread is no longer blocked on a pool. + * It got the answer, an exception, or it submitted the + * work successfully. 
+ */ + void done_waiting_on_pool(long const thread_id) + { + waiting_on_pool_status_changed(thread_id, false); + } + /** * This should be called before shutting down the adaptor. It will try * to shut down everything in an orderly way and wait for all of the @@ -449,12 +633,12 @@ class spark_resource_adaptor final : public rmm::mr::device_memory_resource { // 1. Mark all threads that need to be removed as such // make a copy of the ids so we don't modify threads while walking it std::vector threads_to_remove; - for (auto thread = threads.begin(); thread != threads.end(); thread++) { - threads_to_remove.push_back(thread->first); + for (auto const& thread : threads) { + threads_to_remove.push_back(thread.first); } - for (auto thread_id : threads_to_remove) { - remove_thread_association(thread_id, lock); + for (auto const thread_id : threads_to_remove) { + remove_thread_association(thread_id, -1, lock); } shutting_down = true; } @@ -479,10 +663,10 @@ class spark_resource_adaptor final : public rmm::mr::device_memory_resource { * Force a specific thread to throw one or more RetryOOM exceptions when an * alloc is called. This is intended only for testing. */ - void force_retry_oom(long thread_id, int num_ooms) + void force_retry_oom(long const thread_id, int const num_ooms) { std::unique_lock lock(state_mutex); - auto threads_at = threads.find(thread_id); + auto const threads_at = threads.find(thread_id); if (threads_at != threads.end()) { threads_at->second.retry_oom_injected = num_ooms; } else { @@ -494,10 +678,10 @@ class spark_resource_adaptor final : public rmm::mr::device_memory_resource { * Force a specific thread to throw one or more SplitAndRetryOOM exceptions * when an alloc is called. This is intended only for testing. */ - void force_split_and_retry_oom(long thread_id, int num_ooms) + void force_split_and_retry_oom(long const thread_id, int const num_ooms) { std::unique_lock lock(state_mutex); - auto threads_at = threads.find(thread_id); + auto const threads_at = threads.find(thread_id); if (threads_at != threads.end()) { threads_at->second.split_and_retry_oom_injected = num_ooms; } else { @@ -509,10 +693,10 @@ class spark_resource_adaptor final : public rmm::mr::device_memory_resource { * force a specific thread to throw one or more CudfExceptions when an * alloc is called. This is intended only for testing. */ - void force_cudf_exception(long thread_id, int num_times) + void force_cudf_exception(long const thread_id, int const num_times) { std::unique_lock lock(state_mutex); - auto threads_at = threads.find(thread_id); + auto const threads_at = threads.find(thread_id); if (threads_at != threads.end()) { threads_at->second.cudf_exception_injected = num_times; } else { @@ -520,130 +704,107 @@ class spark_resource_adaptor final : public rmm::mr::device_memory_resource { } } - /** - * get the number of times a retry was thrown and reset the value to 0. - */ - int get_and_reset_num_retry(long task_id) + // Some C++ magic to get and reset a single metric. + // Metrics are recorded on a per-thread basis, but are reported per-task + // But the life time of threads and tasks are not directly tied together + // so they are check-pointed periodically. 
This reads and resets + // the metric for both the threads and the tasks + template + T get_and_reset_metric(long const task_id, T task_metrics::*MetricPtr) { std::unique_lock lock(state_mutex); - int ret = 0; - auto task_at = task_to_threads.find(task_id); + T ret = 0; + auto const task_at = task_to_threads.find(task_id); if (task_at != task_to_threads.end()) { - for (auto thread_id : task_at->second) { - auto threads_at = threads.find(thread_id); + for (auto const thread_id : task_at->second) { + auto const threads_at = threads.find(thread_id); if (threads_at != threads.end()) { - ret += threads_at->second.num_times_retry_throw; - threads_at->second.num_times_retry_throw = 0; + ret += (threads_at->second.metrics.*MetricPtr); + (threads_at->second.metrics.*MetricPtr) = 0; } } } + + auto const metrics_at = task_to_metrics.find(task_id); + if (metrics_at != task_to_metrics.end()) { + ret += (metrics_at->second.*MetricPtr); + (metrics_at->second.*MetricPtr) = 0; + } return ret; } + /** + * get the number of times a retry was thrown and reset the value to 0. + */ + int get_and_reset_num_retry(long const task_id) + { + return get_and_reset_metric(task_id, &task_metrics::num_times_retry_throw); + } + /** * get the number of times a split and retry was thrown and reset the value to 0. */ - int get_and_reset_num_split_retry(long task_id) + int get_and_reset_num_split_retry(long const task_id) { - std::unique_lock lock(state_mutex); - int ret = 0; - auto task_at = task_to_threads.find(task_id); - if (task_at != task_to_threads.end()) { - for (auto thread_id : task_at->second) { - auto threads_at = threads.find(thread_id); - if (threads_at != threads.end()) { - ret += threads_at->second.num_times_split_retry_throw; - threads_at->second.num_times_split_retry_throw = 0; - } - } - } - return ret; + return get_and_reset_metric(task_id, &task_metrics::num_times_split_retry_throw); } /** * get the time in ns that the task was blocked for. */ - long get_and_reset_block_time(long task_id) + long get_and_reset_block_time(long const task_id) { - std::unique_lock lock(state_mutex); - long ret = 0; - auto task_at = task_to_threads.find(task_id); - if (task_at != task_to_threads.end()) { - for (auto thread_id : task_at->second) { - auto threads_at = threads.find(thread_id); - if (threads_at != threads.end()) { - ret += threads_at->second.time_blocked_nanos; - threads_at->second.time_blocked_nanos = 0; - } - } - } - return ret; + return get_and_reset_metric(task_id, &task_metrics::time_blocked_nanos); } /** - * Update the internal state so that this thread is known that it is going to enter a - * shuffle stage and could indirectly block on a shuffle thread (UCX). + * get the time in ns that was lost because a retry was thrown. */ - void thread_could_block_on_shuffle(long thread_id) + long get_and_reset_lost_time(long const task_id) + { + return get_and_reset_metric(task_id, &task_metrics::time_lost_nanos); + } + + void check_and_break_deadlocks() { std::unique_lock lock(state_mutex); - auto threads_at = threads.find(thread_id); - if (threads_at != threads.end()) { - switch (threads_at->second.state) { - case TASK_RUNNING: - transition(threads_at->second, thread_state::TASK_WAIT_ON_SHUFFLE); - break; - case TASK_BUFN_WAIT: - transition(threads_at->second, thread_state::TASK_BUFN_WAIT_ON_SHUFFLE); - break; - case TASK_WAIT_ON_SHUFFLE: - // fall through - case TASK_BUFN_WAIT_ON_SHUFFLE: - // noop already in an expected state... 
- break; - default: { - std::stringstream ss; - ss << "thread " << thread_id << " is in an unexpected state " - << as_str(threads_at->second.state) << " to start shuffle"; - throw std::invalid_argument(ss.str()); - } - } - check_and_update_for_bufn(lock); - } else { - throw std::invalid_argument("the thread is not associated with any task/shuffle"); - } + check_and_update_for_bufn(lock); } - /** - * Indicate that the thread no longer will block indirectly on a shuffle thread. - */ - void thread_done_with_shuffle(long thread_id) + bool cpu_prealloc(size_t const amount, bool const blocking) { + // amount is not used yet, but is here in case we want it in the future. std::unique_lock lock(state_mutex); - auto threads_at = threads.find(thread_id); - if (threads_at != threads.end()) { - switch (threads_at->second.state) { - case TASK_WAIT_ON_SHUFFLE: - transition(threads_at->second, thread_state::TASK_RUNNING); - break; - case TASK_BUFN_WAIT_ON_SHUFFLE: - transition(threads_at->second, thread_state::TASK_BUFN_WAIT); - break; - case TASK_RUNNING: - // fall through - case TASK_BUFN_WAIT: - // noop already in an expected state... - break; - default: { - std::stringstream ss; - ss << "thread " << thread_id << " is in an unexpected state " - << as_str(threads_at->second.state) << " to end shuffle"; - throw std::invalid_argument(ss.str()); - } - } - } else { - throw std::invalid_argument("the thread is not associated with any task/shuffle"); - } + auto const thread_id = static_cast(pthread_self()); + return pre_alloc_core(thread_id, true, blocking, lock); + } + + void cpu_postalloc_success(void const* addr, + size_t const amount, + bool const blocking, + bool const was_recursive) + { + // addr is not used yet, but is here in case we want it in the future. + // amount is not used yet, but is here in case we want it for debugging/metrics. + // blocking is not used yet. It could be used for some debugging so we are keeping it. + std::unique_lock lock(state_mutex); + auto const thread_id = static_cast(pthread_self()); + post_alloc_success_core(thread_id, true, was_recursive, lock); + } + + bool cpu_postalloc_failed(bool const was_oom, bool const blocking, bool const was_recursive) + { + std::unique_lock lock(state_mutex); + auto const thread_id = static_cast(pthread_self()); + return post_alloc_failed_core(thread_id, true, was_oom, blocking, was_recursive, lock); + } + + void cpu_dealloc(void const* addr, size_t const amount) + { + // addr is not used yet, but is here in case we want it in the future. + // amount is not used yet, but is here in case we want it for debugging/metrics. + std::unique_lock lock(state_mutex); + dealloc_core(true, lock); } /** @@ -654,7 +815,7 @@ class spark_resource_adaptor final : public rmm::mr::device_memory_resource { */ void block_thread_until_ready() { - auto thread_id = static_cast(pthread_self()); + auto const thread_id = static_cast(pthread_self()); std::unique_lock lock(state_mutex); block_thread_until_ready(thread_id, lock); } @@ -663,10 +824,10 @@ class spark_resource_adaptor final : public rmm::mr::device_memory_resource { * This is really here just for testing. It provides a way to look at the * current state of a thread. 
*/ - int get_thread_state_as_int(long thread_id) + int get_thread_state_as_int(long const thread_id) { std::unique_lock lock(state_mutex); - auto threads_at = threads.find(thread_id); + auto const threads_at = threads.find(thread_id); if (threads_at != threads.end()) { return static_cast(threads_at->second.state); } else { @@ -677,6 +838,7 @@ class spark_resource_adaptor final : public rmm::mr::device_memory_resource { private: rmm::mr::device_memory_resource* const resource; std::shared_ptr logger; ///< spdlog logger object + bool const is_log_enabled; // The state mutex must be held when modifying the state of threads or tasks // it must never be held when calling into the child resource or after returning @@ -685,39 +847,41 @@ class spark_resource_adaptor final : public rmm::mr::device_memory_resource { std::condition_variable task_has_woken_condition; std::map threads; std::map> task_to_threads; + // Metrics are a little complicated. Spark reports metrics at a task level + // but we track and collect them at a thread level. The life time of a thread + // and a task are not tied to each other, and a thread can work on things for + // multiple tasks at the same time. So whenever a thread changes status + // the metrics for the tasks it is working on are aggregated here. When a task + // finishes the metrics for that task are then deleted. + std::map task_to_metrics; bool shutting_down = false; JavaVM* jvm; /** * log a status change that does not involve a state transition. */ - void log_status( - const char* op, long thread_id, long task_id, thread_state state, const char* notes = nullptr) + void log_status(std::string const& op, + long const thread_id, + long const task_id, + thread_state const state, + std::string const& notes = "") const { - auto this_id = static_cast(pthread_self()); - logger->info("{},{},{},{},{},,{}", - op, - this_id, - thread_id, - task_id, - as_str(state), - (notes == nullptr ? "" : notes)); + auto const this_id = static_cast(pthread_self()); + logger->info("{},{},{},{},{},,{}", op, this_id, thread_id, task_id, as_str(state), notes); } /** * log that a state transition happened. */ - void log_transition( - long thread_id, long task_id, thread_state from, thread_state to, const char* notes = nullptr) + void log_transition(long const thread_id, + long const task_id, + thread_state const from, + thread_state const to, + std::string const& notes = "") const { - auto this_id = static_cast(pthread_self()); - logger->info("TRANSITION,{},{},{},{},{},{}", - this_id, - thread_id, - task_id, - as_str(from), - as_str(to), - (notes == nullptr ? "" : notes)); + auto const this_id = static_cast(pthread_self()); + logger->info( + "TRANSITION,{},{},{},{},{},{}", this_id, thread_id, task_id, as_str(from), as_str(to), notes); } /** @@ -725,7 +889,9 @@ class spark_resource_adaptor final : public rmm::mr::device_memory_resource { * of setting the state directly. This will log the transition and do a little bit of * verification. */ - void transition(full_thread_state& state, thread_state new_state, const char* message = nullptr) + void transition(full_thread_state& state, + thread_state const new_state, + std::string const& message = "") { thread_state original = state.state; state.transition_to(new_state); @@ -735,17 +901,51 @@ class spark_resource_adaptor final : public rmm::mr::device_memory_resource { /** * throw a java exception using the cached jvm/env. 
   */
-  void throw_java_exception(const char* ex_class_name, const char* msg)
+  void throw_java_exception(char const* ex_class_name, char const* msg)
   {
     JNIEnv* env = cudf::jni::get_jni_env(jvm);
     cudf::jni::throw_java_exception(env, ex_class_name, msg);
   }
 
+  void waiting_on_pool_status_changed(long const thread_id, bool const pool_blocked)
+  {
+    std::unique_lock<std::mutex> lock(state_mutex);
+    auto const thread = threads.find(thread_id);
+    long task_id = -1;
+    if (thread != threads.end()) { task_id = thread->second.task_id; }
+
+    if (task_id < 0) {
+      std::stringstream ss;
+      ss << "thread " << thread_id << " is not a dedicated task thread";
+      throw std::invalid_argument(ss.str());
+    }
+
+    thread->second.pool_blocked = pool_blocked;
+  }
+
+  /**
+   * Checkpoint all of the metrics for a thread.
+   */
+  void checkpoint_metrics(full_thread_state& state)
+  {
+    if (state.task_id < 0) {
+      // save the metrics for all tasks before we add any new ones.
+      for (auto const task_id : state.pool_task_ids) {
+        auto const metrics_at = task_to_metrics.try_emplace(task_id, task_metrics());
+        metrics_at.first->second.add(state.metrics);
+      }
+      state.metrics.clear();
+    } else {
+      auto const metrics_at = task_to_metrics.try_emplace(state.task_id, task_metrics());
+      metrics_at.first->second.take_from(state.metrics);
+    }
+  }
+
   /**
    * This is a watchdog to prevent us from live locking. It should be called before we throw an
    * RetryOOM or a SplitAndRetryOOM to know if we actually should throw something else.
    */
-  void check_before_oom(full_thread_state& state, const std::unique_lock<std::mutex>& lock)
+  void check_before_oom(full_thread_state& state, std::unique_lock<std::mutex> const& lock)
   {
     // The limit is an arbitrary number, large enough that we should not hit it in "normal"
     // operation, but also small enough that we can detect a livelock fairly quickly.
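The watchdog above caps how many times a single thread can cycle through retry OOMs before the adaptor stops retrying and throws a plain OOM instead. On the Java side the contract is the mirror image: callers catch the retry exceptions this patch introduces, roll back to a point where all of the task's memory is spillable, and try again. A minimal caller-side sketch follows; only the exception classes (GpuRetryOOM, GpuSplitAndRetryOOM) come from this patch, while runWithRetry, rollback, split, and work are hypothetical names for the surrounding task code.

    import com.nvidia.spark.rapids.jni.GpuRetryOOM;
    import com.nvidia.spark.rapids.jni.GpuSplitAndRetryOOM;

    final class RetryLoopSketch {
      // Hedged sketch of the rollback/retry loop the adaptor assumes its callers run.
      // No retry cap is needed here: the C++ watchdog converts a livelocked retry
      // storm into a regular OOM after a bounded number of attempts.
      static void runWithRetry(Runnable rollback, Runnable split, Runnable work) {
        while (true) {
          try {
            work.run();
            return;
          } catch (GpuRetryOOM oom) {
            rollback.run();  // make all task memory spillable again, then retry as-is
          } catch (GpuSplitAndRetryOOM oom) {
            rollback.run();
            split.run();     // retry with the input split to use less memory overall
          }
        }
      }
    }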
@@ -758,34 +958,40 @@ class spark_resource_adaptor final : public rmm::mr::device_memory_resource { state.num_times_retried++; } - void throw_retry_oom(const char* msg, + void throw_retry_oom(char const* msg, full_thread_state& state, - const std::unique_lock& lock) + std::unique_lock const& lock) { - state.num_times_retry_throw++; + state.metrics.num_times_retry_throw++; check_before_oom(state, lock); state.record_failed_retry_time(); - throw_java_exception(RETRY_OOM_CLASS, "GPU OutOfMemory"); + if (state.is_cpu_alloc) { + throw_java_exception(CPU_RETRY_OOM_CLASS, "CPU OutOfMemory"); + } else { + throw_java_exception(GPU_RETRY_OOM_CLASS, "GPU OutOfMemory"); + } } - void throw_split_and_retry_oom(const char* msg, + void throw_split_and_retry_oom(char const* msg, full_thread_state& state, - const std::unique_lock& lock) + std::unique_lock const& lock) { - state.num_times_split_retry_throw++; + state.metrics.num_times_split_retry_throw++; check_before_oom(state, lock); state.record_failed_retry_time(); - throw_java_exception(SPLIT_AND_RETRY_OOM_CLASS, "GPU OutOfMemory"); + if (state.is_cpu_alloc) { + throw_java_exception(CPU_SPLIT_AND_RETRY_OOM_CLASS, "CPU OutOfMemory"); + } else { + throw_java_exception(GPU_SPLIT_AND_RETRY_OOM_CLASS, "GPU OutOfMemory"); + } } - bool is_blocked(thread_state state) + bool is_blocked(thread_state state) const { switch (state) { - case TASK_BLOCKED: - // fall through - case TASK_BUFN: + case thread_state::THREAD_BLOCKED: // fall through - case SHUFFLE_BLOCKED: return true; + case thread_state::THREAD_BUFN: return true; default: return false; } } @@ -793,23 +999,21 @@ class spark_resource_adaptor final : public rmm::mr::device_memory_resource { /** * Internal implementation that will block a thread until it is ready to continue. */ - void block_thread_until_ready(long thread_id, std::unique_lock& lock) + void block_thread_until_ready(long const thread_id, std::unique_lock& lock) { bool done = false; bool first_time = true; // Because this is called from alloc as well as from the public facing block_thread_until_ready // there are states that should only show up in relation to alloc failing. These include - // TASK_BUFN_THROW and TASK_SPLIT_THROW. They should never happen unless this is being called - // from within an alloc. + // THREAD_BUFN_THROW and THREAD_SPLIT_THROW. They should never happen unless this is being + // called from within an alloc. 
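+    // Each pass of the loop below inspects this thread's current state and either
+    // sleeps on the wake condition (THREAD_BLOCKED / THREAD_BUFN), converts a
+    // transient *_THROW state into the matching retry or split-and-retry OOM,
+    // unwinds if the thread was removed, or finishes once the thread is runnable.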
while (!done) { auto thread = threads.find(thread_id); if (thread != threads.end()) { switch (thread->second.state) { - case TASK_BLOCKED: + case thread_state::THREAD_BLOCKED: // fall through - case TASK_BUFN: - // fall through - case SHUFFLE_BLOCKED: + case thread_state::THREAD_BUFN: log_status("WAITING", thread_id, thread->second.task_id, thread->second.state); thread->second.before_block(); do { @@ -819,19 +1023,13 @@ class spark_resource_adaptor final : public rmm::mr::device_memory_resource { thread->second.after_block(); task_has_woken_condition.notify_all(); break; - case SHUFFLE_THROW: - transition(thread->second, thread_state::SHUFFLE_RUNNING); - thread->second.record_failed_retry_time(); - throw_java_exception(cudf::jni::OOM_CLASS, - "GPU OutOfMemory: could not allocate enough for shuffle"); - break; - case TASK_BUFN_THROW: - transition(thread->second, thread_state::TASK_BUFN_WAIT); + case thread_state::THREAD_BUFN_THROW: + transition(thread->second, thread_state::THREAD_BUFN_WAIT); thread->second.record_failed_retry_time(); throw_retry_oom("rollback and retry operation", thread->second, lock); break; - case TASK_BUFN_WAIT: - transition(thread->second, thread_state::TASK_BUFN); + case thread_state::THREAD_BUFN_WAIT: + transition(thread->second, thread_state::THREAD_BUFN); // Before we can wait it is possible that the throw didn't release anything // and the other threads didn't get unblocked by this, so we need to // check again to see if this was fixed or not. @@ -849,15 +1047,13 @@ class spark_resource_adaptor final : public rmm::mr::device_memory_resource { task_has_woken_condition.notify_all(); } break; - case TASK_SPLIT_THROW: - transition(thread->second, thread_state::TASK_RUNNING); + case thread_state::THREAD_SPLIT_THROW: + transition(thread->second, thread_state::THREAD_RUNNING); thread->second.record_failed_retry_time(); throw_split_and_retry_oom( "rollback, split input, and retry operation", thread->second, lock); break; - case TASK_REMOVE_THROW: - // fall through - case SHUFFLE_REMOVE_THROW: + case thread_state::THREAD_REMOVE_THROW: log_transition( thread_id, thread->second.task_id, thread->second.state, thread_state::UNKNOWN); // don't need to record failed time metric the thread is already gone... @@ -888,32 +1084,28 @@ class spark_resource_adaptor final : public rmm::mr::device_memory_resource { void wake_up_threads_after_task_finishes(const std::unique_lock& lock) { bool are_any_tasks_just_blocked = false; - for (auto thread = threads.begin(); thread != threads.end(); thread++) { - switch (thread->second.state) { - case TASK_BLOCKED: - transition(thread->second, thread_state::TASK_RUNNING); - thread->second.wake_condition->notify_all(); + for (auto& [thread_id, t_state] : threads) { + switch (t_state.state) { + case thread_state::THREAD_BLOCKED: + transition(t_state, thread_state::THREAD_RUNNING); + t_state.wake_condition->notify_all(); are_any_tasks_just_blocked = true; break; - case SHUFFLE_BLOCKED: - transition(thread->second, thread_state::SHUFFLE_RUNNING); - thread->second.wake_condition->notify_all(); - break; default: break; } } if (!are_any_tasks_just_blocked) { // wake up all of the BUFN tasks. 
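+      // (BUFN is short for "blocked until further notice": these threads already
+      // threw a retry OOM and rolled back, and are waiting for memory to free up.)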
- for (auto thread = threads.begin(); thread != threads.end(); thread++) { - switch (thread->second.state) { - case TASK_BUFN: + for (auto& [thread_id, t_state] : threads) { + switch (t_state.state) { + case thread_state::THREAD_BUFN: // fall through - case TASK_BUFN_THROW: + case thread_state::THREAD_BUFN_THROW: // fall through - case TASK_BUFN_WAIT: - transition(thread->second, thread_state::TASK_RUNNING); - thread->second.wake_condition->notify_all(); + case thread_state::THREAD_BUFN_WAIT: + transition(t_state, thread_state::THREAD_RUNNING); + t_state.wake_condition->notify_all(); break; default: break; } @@ -926,35 +1118,57 @@ class spark_resource_adaptor final : public rmm::mr::device_memory_resource { * returns true if the thread that ended was a normally running task thread. * This should be used to decide if wake_up_threads_after_task_finishes is called or not. */ - bool remove_thread_association(long thread_id, const std::unique_lock& lock) + bool remove_thread_association(long thread_id, + long remove_task_id, + const std::unique_lock& lock) { - bool ret = false; - auto threads_at = threads.find(thread_id); + bool thread_should_be_removed = false; + bool ret = false; + auto const threads_at = threads.find(thread_id); if (threads_at != threads.end()) { - auto task_id = threads_at->second.task_id; - if (task_id >= 0) { - auto task_at = task_to_threads.find(task_id); - if (task_at != task_to_threads.end()) { task_at->second.erase(thread_id); } + // save the metrics no matter what + checkpoint_metrics(threads_at->second); + + if (remove_task_id < 0) { + thread_should_be_removed = true; + } else { + auto const task_id = threads_at->second.task_id; + if (task_id >= 0) { + if (task_id == remove_task_id) { thread_should_be_removed = true; } + } else { + threads_at->second.pool_task_ids.erase(remove_task_id); + if (threads_at->second.pool_task_ids.empty()) { thread_should_be_removed = true; } + } } - switch (threads_at->second.state) { - case TASK_BLOCKED: - // fall through - case TASK_BUFN: - transition(threads_at->second, thread_state::TASK_REMOVE_THROW); - threads_at->second.wake_condition->notify_all(); - break; - case SHUFFLE_BLOCKED: - transition(threads_at->second, thread_state::SHUFFLE_REMOVE_THROW); - threads_at->second.wake_condition->notify_all(); - break; - case TASK_RUNNING: - ret = true; - // fall through; - default: - log_transition( - thread_id, threads_at->second.task_id, threads_at->second.state, thread_state::UNKNOWN); - threads.erase(threads_at); + if (thread_should_be_removed) { + JNIEnv* env = nullptr; + if (jvm->GetEnv(reinterpret_cast(&env), cudf::jni::MINIMUM_JNI_VERSION) == JNI_OK) { + cache_thread_reg_jni(env); + env->CallStaticVoidMethod(ThreadStateRegistry_jclass, removeThread_method, thread_id); + } + if (remove_task_id >= 0) { + auto const task_at = task_to_threads.find(remove_task_id); + if (task_at != task_to_threads.end()) { task_at->second.erase(thread_id); } + } + + switch (threads_at->second.state) { + case thread_state::THREAD_BLOCKED: + // fall through + case thread_state::THREAD_BUFN: + transition(threads_at->second, thread_state::THREAD_REMOVE_THROW); + threads_at->second.wake_condition->notify_all(); + break; + case thread_state::THREAD_RUNNING: + ret = true; + // fall through; + default: + log_transition(thread_id, + threads_at->second.task_id, + threads_at->second.state, + thread_state::UNKNOWN); + threads.erase(threads_at); + } } } return ret; @@ -969,30 +1183,61 @@ class spark_resource_adaptor final : public rmm::mr::device_memory_resource { * 
entered the state machine. The only known case is GPU memory required for setup in * cuDF for a spill operation. */ - bool pre_alloc(long thread_id) + bool pre_alloc(long const thread_id) { std::unique_lock lock(state_mutex); + return pre_alloc_core(thread_id, false, true, lock); + } - auto thread = threads.find(thread_id); + /** + * Called prior to processing an alloc attempt (CPU or GPU). This will throw any injected + * exception and wait until the thread is ready to actually do/retry the allocation (if + * the allocation is blocking). That blocking API may throw other exceptions if rolling + * back or splitting the input is considered needed. + * + * @return true if the call finds our thread in an ALLOC state, meaning that we recursively + * entered the state machine. This happens when we need to spill in a few cases for + * the CPU. + */ + bool pre_alloc_core(long const thread_id, + bool const is_for_cpu, + bool const blocking, + std::unique_lock& lock) + { + auto const thread = threads.find(thread_id); if (thread != threads.end()) { switch (thread->second.state) { // If the thread is in one of the ALLOC or ALLOC_FREE states, we have detected a loop // likely due to spill setup required in cuDF. We will treat this allocation differently // and skip transitions. - case TASK_ALLOC: - case SHUFFLE_ALLOC: - case TASK_ALLOC_FREE: - case SHUFFLE_ALLOC_FREE: return true; + case thread_state::THREAD_ALLOC: + // fall through + case thread_state::THREAD_ALLOC_FREE: + if (is_for_cpu && blocking) { + // On the CPU we want the spill code to be explicit so we don't have to detect it + // on the GPU we detect it and adjust dynamically + std::stringstream ss; + ss << "thread " << thread_id + << " is trying to do a blocking allocate while already in the state " + << as_str(thread->second.state); + throw std::invalid_argument(ss.str()); + } + // We are in a recursive allocation. 
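+          // (The allocator re-entered this state machine while an allocation for
+          // this thread was already in flight, which is what spill code triggers.)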
+ return true; default: break; } if (thread->second.retry_oom_injected > 0) { thread->second.retry_oom_injected--; - thread->second.num_times_retry_throw++; + thread->second.metrics.num_times_retry_throw++; log_status("INJECTED_RETRY_OOM", thread_id, thread->second.task_id, thread->second.state); thread->second.record_failed_retry_time(); - throw_java_exception(RETRY_OOM_CLASS, "injected RetryOOM"); + if (is_for_cpu) { + throw_java_exception(CPU_RETRY_OOM_CLASS, "injected RetryOOM"); + } else { + throw_java_exception(GPU_RETRY_OOM_CLASS, "injected RetryOOM"); + } } if (thread->second.cudf_exception_injected > 0) { @@ -1005,21 +1250,24 @@ class spark_resource_adaptor final : public rmm::mr::device_memory_resource { if (thread->second.split_and_retry_oom_injected > 0) { thread->second.split_and_retry_oom_injected--; - thread->second.num_times_split_retry_throw++; + thread->second.metrics.num_times_split_retry_throw++; log_status( "INJECTED_SPLIT_AND_RETRY_OOM", thread_id, thread->second.task_id, thread->second.state); thread->second.record_failed_retry_time(); - throw_java_exception(SPLIT_AND_RETRY_OOM_CLASS, "injected SplitAndRetryOOM"); + if (is_for_cpu) { + throw_java_exception(CPU_SPLIT_AND_RETRY_OOM_CLASS, "injected SplitAndRetryOOM"); + } else { + throw_java_exception(GPU_SPLIT_AND_RETRY_OOM_CLASS, "injected SplitAndRetryOOM"); + } } - block_thread_until_ready(thread_id, lock); + if (blocking) { block_thread_until_ready(thread_id, lock); } switch (thread->second.state) { - case TASK_RUNNING: transition(thread->second, thread_state::TASK_ALLOC); break; - case SHUFFLE_RUNNING: transition(thread->second, thread_state::SHUFFLE_ALLOC); break; - - // TODO I don't think there are other states that we need to handle, but - // this needs more testing. + case thread_state::THREAD_RUNNING: + transition(thread->second, thread_state::THREAD_ALLOC); + thread->second.is_cpu_alloc = is_for_cpu; + break; default: { std::stringstream ss; ss << "thread " << thread_id << " in unexpected state pre alloc " @@ -1029,6 +1277,7 @@ class spark_resource_adaptor final : public rmm::mr::device_memory_resource { } } } + // Not a recursive allocation return false; } @@ -1042,22 +1291,37 @@ class spark_resource_adaptor final : public rmm::mr::device_memory_resource { * `likely_spill` if this allocation should be treated differently, because * we detected recursion while handling a prior allocation in this thread. 
*/ - void post_alloc_success(long thread_id, bool likely_spill) + void post_alloc_success(long const thread_id, bool const likely_spill) { std::unique_lock lock(state_mutex); + post_alloc_success_core(thread_id, false, likely_spill, lock); + } + + void post_alloc_success_core(long const thread_id, + bool const is_for_cpu, + bool const was_recursive, + std::unique_lock& lock) + { // pre allocate checks - auto thread = threads.find(thread_id); - if (!likely_spill && thread != threads.end()) { + auto const thread = threads.find(thread_id); + if (!was_recursive && thread != threads.end()) { switch (thread->second.state) { - case TASK_ALLOC: - // fall through - case TASK_ALLOC_FREE: transition(thread->second, thread_state::TASK_RUNNING); break; - case SHUFFLE_ALLOC: + case thread_state::THREAD_ALLOC: // fall through - case SHUFFLE_ALLOC_FREE: transition(thread->second, thread_state::SHUFFLE_RUNNING); break; + case thread_state::THREAD_ALLOC_FREE: + if (thread->second.is_cpu_alloc != is_for_cpu) { + std::stringstream ss; + ss << "thread " << thread_id << " has a mismatch on CPU vs GPU post alloc " + << as_str(thread->second.state); + + throw std::invalid_argument(ss.str()); + } + transition(thread->second, thread_state::THREAD_RUNNING); + thread->second.is_cpu_alloc = false; + break; default: break; } - wake_next_highest_priority_blocked(lock, false); + wake_next_highest_priority_blocked(lock, false, is_for_cpu); } } @@ -1068,17 +1332,19 @@ class spark_resource_adaptor final : public rmm::mr::device_memory_resource { * * This is typically called when a free happens, or an alloc succeeds. * @param is_from_free true if a free happen. + * @param is_for_cpu true if it was a CPU operation (free or alloc) */ - void wake_next_highest_priority_blocked(const std::unique_lock& lock, - bool is_from_free) + void wake_next_highest_priority_blocked(std::unique_lock const& lock, + bool const is_from_free, + bool const is_for_cpu) { - // 1. Find the highest priority blocked thread, including shuffle. + // 1. Find the highest priority blocked thread, for the alloc that matches thread_priority to_wake(-1, -1); bool is_to_wake_set = false; - for (auto thread = threads.begin(); thread != threads.end(); thread++) { - thread_state state = thread->second.state; - if (state == thread_state::TASK_BLOCKED || state == thread_state::SHUFFLE_BLOCKED) { - thread_priority current = thread->second.priority(); + for (auto const& [thread_d, t_state] : threads) { + thread_state const& state = t_state.state; + if (state == thread_state::THREAD_BLOCKED && is_for_cpu == t_state.is_cpu_alloc) { + thread_priority current = t_state.priority(); if (!is_to_wake_set || to_wake < current) { to_wake = current; is_to_wake_set = true; @@ -1086,17 +1352,13 @@ class spark_resource_adaptor final : public rmm::mr::device_memory_resource { } } // 2. 
wake up that thread - long thread_id_to_wake = to_wake.get_thread_id(); + long const thread_id_to_wake = to_wake.get_thread_id(); if (thread_id_to_wake > 0) { - auto thread = threads.find(thread_id_to_wake); + auto const thread = threads.find(thread_id_to_wake); if (thread != threads.end()) { switch (thread->second.state) { - case TASK_BLOCKED: - transition(thread->second, thread_state::TASK_RUNNING); - thread->second.wake_condition->notify_all(); - break; - case SHUFFLE_BLOCKED: - transition(thread->second, thread_state::SHUFFLE_RUNNING); + case thread_state::THREAD_BLOCKED: + transition(thread->second, thread_state::THREAD_RUNNING); thread->second.wake_condition->notify_all(); break; default: { @@ -1115,67 +1377,175 @@ class spark_resource_adaptor final : public rmm::mr::device_memory_resource { // instead of trying to split its input. But we only do this if it // is a different thread that is freeing memory from the one we want to wake up. // This is because if the threads are the same no new memory is being added - // to what that task has access to and the task may never thow a retry and split. + // to what that task has access to and the task may never throw a retry and split. // Instead it would just keep retrying and freeing the same memory each time. - std::set tasks_with_threads; - std::set tasks_with_threads_bufn; + std::map pool_bufn_task_thread_count; + std::map pool_task_thread_count; + std::unordered_set bufn_task_ids; + std::unordered_set all_task_ids; + is_in_deadlock( + pool_bufn_task_thread_count, pool_task_thread_count, bufn_task_ids, all_task_ids, lock); + bool const all_bufn = all_task_ids.size() == bufn_task_ids.size(); + if (all_bufn) { + thread_priority to_wake(-1, -1); + bool is_to_wake_set = false; + for (auto const& [thread_id, t_state] : threads) { + switch (t_state.state) { + case thread_state::THREAD_BUFN: { + if (is_for_cpu == t_state.is_cpu_alloc) { + thread_priority current = t_state.priority(); + if (!is_to_wake_set || to_wake < current) { + to_wake = current; + is_to_wake_set = true; + } + } + } break; + default: break; + } + } + // 4. Wake up the BUFN thread if we should + if (is_to_wake_set) { + long const thread_id_to_wake = to_wake.get_thread_id(); + if (thread_id_to_wake > 0) { + // Don't wake up yourself on a free. It is not adding more memory for this thread + // to use on a retry and we might need a split instead to break a deadlock + auto const this_id = static_cast(pthread_self()); + auto const thread = threads.find(thread_id_to_wake); + if (thread != threads.end() && thread->first != this_id) { + switch (thread->second.state) { + case thread_state::THREAD_BUFN: + transition(thread->second, thread_state::THREAD_RUNNING); + thread->second.wake_condition->notify_all(); + break; + case thread_state::THREAD_BUFN_WAIT: + transition(thread->second, thread_state::THREAD_RUNNING); + // no need to notify anyone, we will just retry without blocking... + break; + case thread_state::THREAD_BUFN_THROW: + // This should really never happen, this is a temporary state that is here only + // while the lock is held, but just in case we don't want to mess it up, or throw + // an exception. 
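+              // The thread that owns this state will move itself to THREAD_BUFN_WAIT
+              // and throw the retry OOM from block_thread_until_ready.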
+                break;
+              default: {
+                std::stringstream ss;
+                ss << "internal error expected to only wake up blocked threads "
+                   << thread_id_to_wake << " " << as_str(thread->second.state);
+                throw std::runtime_error(ss.str());
+              }
+            }
+          }
+        }
+      }
+    }
+  }
+
+  bool is_thread_bufn_or_above(JNIEnv* env, full_thread_state const& state)
+  {
+    bool ret = false;
+    if (state.pool_blocked) {
+      ret = true;
+    } else {
+      switch (state.state) {
+        case thread_state::THREAD_BLOCKED: ret = false; break;
+        case thread_state::THREAD_BUFN:
+          // we are looking for even a single thread that is not blocked
+          ret = true;
+          break;
+        default:
+          ret = env->CallStaticBooleanMethod(
+            ThreadStateRegistry_jclass, isThreadBlocked_method, state.thread_id);
+          break;
+      }
+    }
+    return ret;
+  }
+
+  bool is_in_deadlock(std::map<long, int>& pool_bufn_task_thread_count,
+                      std::map<long, int>& pool_task_thread_count,
+                      std::unordered_set<long>& bufn_task_ids,
+                      std::unordered_set<long>& all_task_ids,
+                      std::unique_lock<std::mutex> const& lock)
+  {
+    JNIEnv* env = nullptr;
+    if (jvm->GetEnv(reinterpret_cast<void**>(&env), cudf::jni::MINIMUM_JNI_VERSION) != JNI_OK) {
+      throw std::runtime_error("Could not init JNI callbacks");
+    }
+    cache_thread_reg_jni(env);
+
+    // If all of the tasks are blocked, then we are in a deadlock situation
+    // and we need to wake something up. In theory if any one thread is still
+    // doing something, then we are not deadlocked. But the problem is detecting
+    // if a thread is blocked cheaply and accurately. We can tell if this code has
+    // blocked a thread. We can also have code we control inform us if a thread is
+    // blocked. We even have a callback to the JVM to see if the state of the java
+    // thread indicates if it is blocked or not. But I/O in java most of the time
+    // shows the thread as RUNNABLE. We also don't want to look at stack traces if
+    // we can avoid it as it is expensive. The reason this matters is because of
+    // python UDFs. When a python process runs to execute UDFs at least two dedicated
+    // task threads are used for a single task. One will write data to the python
+    // process and another will read results from it. Because both involve
+    // I/O we need a solution. For now we assume that a task is blocked if any
+    // one of the dedicated task threads is blocked and if all of the pool
+    // threads working on that task are also blocked. This is because the pool
+    // threads, even if they are blocked on I/O, will eventually finish without
+    // needing to worry about it.
+    //
+    // We also need a way to detect if we need to split the input and retry.
+    // This happens when all of the tasks are also blocked until
+    // further notice. So we are going to treat a task as blocked until
+    // further notice if any of the dedicated threads for it are blocked until
+    // further notice, or all of the pool threads working on things for it are
+    // blocked until further notice.
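+    // The four collections passed in are out parameters. check_and_update_for_bufn
+    // later compares pool_bufn_task_thread_count with pool_task_thread_count, and
+    // bufn_task_ids with all_task_ids, to decide when a split-and-retry is needed.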
+ std::unordered_set blocked_task_ids; + + // We are going to do two passes through the threads to deal with this. + // First pass is to look at the dedicated task threads + for (auto const& [thread_id, t_state] : threads) { + long const task_id = t_state.task_id; + if (task_id >= 0) { + all_task_ids.insert(task_id); + bool const is_bufn_plus = is_thread_bufn_or_above(env, t_state); + if (is_bufn_plus) { bufn_task_ids.insert(task_id); } + if (is_bufn_plus || t_state.state == thread_state::THREAD_BLOCKED) { + blocked_task_ids.insert(task_id); } } + } - // 4. Wake up the BUFN thread if we should - if (tasks_with_threads.size() == tasks_with_threads_bufn.size() && is_to_wake_set) { - long thread_id_to_wake = to_wake.get_thread_id(); - if (thread_id_to_wake > 0) { - // Don't wake up yourself on a free. It is not adding more memory for this thread - // to use on a retry and we might need a split instead to break a deadlock - auto this_id = static_cast(pthread_self()); - auto thread = threads.find(thread_id_to_wake); - if (thread != threads.end() && thread->first != this_id) { - switch (thread->second.state) { - case TASK_BUFN: - transition(thread->second, thread_state::TASK_RUNNING); - thread->second.wake_condition->notify_all(); - break; - case TASK_BUFN_WAIT: - transition(thread->second, thread_state::TASK_RUNNING); - // no need to notify anyone, we will just retry without blocking... - break; - case TASK_BUFN_THROW: - // This should really never happen, this is a temporary state that is here only - // while the lock is held, but just in case we don't want to mess it up, or throw - // an exception. - break; - default: { - std::stringstream ss; - ss << "internal error expected to only wake up blocked threads " - << thread_id_to_wake << " " << as_str(thread->second.state); - throw std::runtime_error(ss.str()); - } + // Second pass is to look at the pool threads + for (auto const& [thread_id, t_state] : threads) { + long const is_pool_thread = t_state.task_id < 0; + if (is_pool_thread) { + for (auto const& task_id : t_state.pool_task_ids) { + auto const it = pool_task_thread_count.find(task_id); + if (it != pool_task_thread_count.end()) { + it->second += 1; + } else { + pool_task_thread_count[task_id] = 1; + } + } + + bool const is_bufn_plus = is_thread_bufn_or_above(env, t_state); + if (is_bufn_plus) { + for (auto const& task_id : t_state.pool_task_ids) { + auto const it = pool_bufn_task_thread_count.find(task_id); + if (it != pool_bufn_task_thread_count.end()) { + it->second += 1; + } else { + pool_bufn_task_thread_count[task_id] = 1; } } } + if (!is_bufn_plus && t_state.state != thread_state::THREAD_BLOCKED) { + for (auto const& task_id : t_state.pool_task_ids) { + blocked_task_ids.erase(task_id); + } + } } } + // Now if all of the tasks are blocked, then we need to break a deadlock + return all_task_ids.size() == blocked_task_ids.size(); } /** @@ -1185,54 +1555,20 @@ class spark_resource_adaptor final : public rmm::mr::device_memory_resource { */ void check_and_update_for_bufn(const std::unique_lock& lock) { - // We want to know if all active tasks have at least one thread that - // is effectively blocked or not. We could change the definitions here, - // but for now this sounds like a good starting point. - std::set tasks_with_threads; - std::set tasks_with_threads_effectively_blocked; - bool is_any_shuffle_thread_blocked = false; - - // To keep things simple we are going to do multiple passes through - // the state. 
The first is to find out if any shuffle thread is blocked - // because if it is, then there is a possibility that any task thread - // in a shuffle could also be blocked. - for (auto thread = threads.begin(); thread != threads.end(); thread++) { - switch (thread->second.state) { - case SHUFFLE_BLOCKED: is_any_shuffle_thread_blocked = true; break; - default: break; - } - } - - for (auto thread = threads.begin(); thread != threads.end(); thread++) { - if (thread->second.task_id >= 0) { tasks_with_threads.insert(thread->second.task_id); } - - switch (thread->second.state) { - case TASK_WAIT_ON_SHUFFLE: - // fall through - case TASK_BUFN_WAIT_ON_SHUFFLE: - if (is_any_shuffle_thread_blocked) { - tasks_with_threads_effectively_blocked.insert(thread->second.task_id); - } - break; - case TASK_BLOCKED: - // fall through - case TASK_BUFN: - tasks_with_threads_effectively_blocked.insert(thread->second.task_id); - break; - default: break; - } - } - - bool need_to_break_deadlock = - tasks_with_threads.size() == tasks_with_threads_effectively_blocked.size(); + std::map pool_bufn_task_thread_count; + std::map pool_task_thread_count; + std::unordered_set bufn_task_ids; + std::unordered_set all_task_ids; + bool const need_to_break_deadlock = is_in_deadlock( + pool_bufn_task_thread_count, pool_task_thread_count, bufn_task_ids, all_task_ids, lock); if (need_to_break_deadlock) { // Find the task thread with the lowest priority that is not already BUFN thread_priority to_bufn(-1, -1); bool is_to_bufn_set = false; - for (auto thread = threads.begin(); thread != threads.end(); thread++) { - switch (thread->second.state) { - case TASK_BLOCKED: { - thread_priority current = thread->second.priority(); + for (auto const& [thread_id, t_state] : threads) { + switch (t_state.state) { + case thread_state::THREAD_BLOCKED: { + thread_priority const& current = t_state.priority(); if (!is_to_bufn_set || current < to_bufn) { to_bufn = current; is_to_bufn_set = true; @@ -1242,56 +1578,52 @@ class spark_resource_adaptor final : public rmm::mr::device_memory_resource { } } if (is_to_bufn_set) { - long thread_id_to_bufn = to_bufn.get_thread_id(); - auto thread = threads.find(thread_id_to_bufn); + long const thread_id_to_bufn = to_bufn.get_thread_id(); + auto const thread = threads.find(thread_id_to_bufn); if (thread != threads.end()) { - transition(thread->second, thread_state::TASK_BUFN_THROW); + transition(thread->second, thread_state::THREAD_BUFN_THROW); thread->second.wake_condition->notify_all(); + // We are explicitly not going to update the state around BUFN + // here, because we really want to wait for the retry to run + // it's course instead of doing a split right away. + } + } + // We now need a way to detect if we need to split the input and retry. + // This happens when all of the tasks are also blocked until + // further notice. So we are going to treat a task as blocked until + // further notice if any of the dedicated threads for it are blocked until + // further notice, or all of the pool threads working on things for it are + // blocked until further notice. + + for (auto const& [task_id, bufn_count] : pool_bufn_task_thread_count) { + auto const pttc = pool_task_thread_count.find(task_id); + if (pttc != pool_task_thread_count.end() && pttc->second <= bufn_count) { + bufn_task_ids.insert(task_id); } } - // Now we need to check if all of the threads are BUFN - // Are all BUFN?? 
- bool all_bufn_or_shuffle = true; - thread_priority to_wake(-1, -1); - bool is_to_wake_set = false; - for (auto thread = threads.begin(); thread != threads.end(); thread++) { - if (thread->second.task_id >= 0) { - switch (thread->second.state) { - case TASK_BUFN: { - thread_priority current = thread->second.priority(); + bool const all_bufn = all_task_ids.size() == bufn_task_ids.size(); + + if (all_bufn) { + thread_priority to_wake(-1, -1); + bool is_to_wake_set = false; + for (auto const& [thread_id, t_state] : threads) { + switch (t_state.state) { + case thread_state::THREAD_BUFN: { + thread_priority const& current = t_state.priority(); if (!is_to_wake_set || to_wake < current) { to_wake = current; is_to_wake_set = true; } } break; - case TASK_WAIT_ON_SHUFFLE: - if (!is_any_shuffle_thread_blocked) { all_bufn_or_shuffle = false; } - break; - default: all_bufn_or_shuffle = false; break; + default: break; } } - } - if (all_bufn_or_shuffle) { - long thread_id = to_wake.get_thread_id(); - auto found_thread = threads.find(thread_id); + long const thread_id = to_wake.get_thread_id(); + auto const found_thread = threads.find(thread_id); if (found_thread != threads.end()) { - transition(found_thread->second, thread_state::TASK_SPLIT_THROW); + transition(found_thread->second, thread_state::THREAD_SPLIT_THROW); found_thread->second.wake_condition->notify_all(); - } else { - // the only threads left are blocked on shuffle. No way for shuffle - // to split and throw, and ideally all of the data for those threads - // should already be spillable, so at this point shuffle needs to - // throw an OOM. - for (auto thread = threads.begin(); thread != threads.end(); thread++) { - switch (thread->second.state) { - case SHUFFLE_BLOCKED: - transition(thread->second, thread_state::SHUFFLE_THROW); - thread->second.wake_condition->notify_all(); - break; - default: break; - } - } } } } @@ -1302,30 +1634,41 @@ class spark_resource_adaptor final : public rmm::mr::device_memory_resource { * typically happen after this has run, and we loop around to retry the alloc * if the state says we should. */ - bool post_alloc_failed(long thread_id, bool is_oom, bool likely_spill) + bool post_alloc_failed(long const thread_id, bool const is_oom, bool const likely_spill) { std::unique_lock lock(state_mutex); - auto thread = threads.find(thread_id); + return post_alloc_failed_core(thread_id, false, is_oom, true, likely_spill, lock); + } + + bool post_alloc_failed_core(long const thread_id, + bool const is_for_cpu, + bool const is_oom, + bool const blocking, + bool const was_recursive, + std::unique_lock& lock) + { + auto const thread = threads.find(thread_id); // only retry if this was due to an out of memory exception. 
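+    // Only an OOM on a blocking allocation parks the thread in THREAD_BLOCKED for a
+    // retry; any other failure just resets the state and propagates to the caller.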
bool ret = true; - if (!likely_spill && thread != threads.end()) { + if (!was_recursive && thread != threads.end()) { + if (thread->second.is_cpu_alloc != is_for_cpu) { + std::stringstream ss; + ss << "thread " << thread_id << " has a mismatch on CPU vs GPU post alloc " + << as_str(thread->second.state); + + throw std::invalid_argument(ss.str()); + } + switch (thread->second.state) { - case TASK_ALLOC_FREE: transition(thread->second, thread_state::TASK_RUNNING); break; - case TASK_ALLOC: - if (is_oom) { - transition(thread->second, thread_state::TASK_BLOCKED); - } else { - // don't block unless it is OOM - transition(thread->second, thread_state::TASK_RUNNING); - } + case thread_state::THREAD_ALLOC_FREE: + transition(thread->second, thread_state::THREAD_RUNNING); break; - case SHUFFLE_ALLOC_FREE: transition(thread->second, thread_state::SHUFFLE_RUNNING); break; - case SHUFFLE_ALLOC: - if (is_oom) { - transition(thread->second, thread_state::SHUFFLE_BLOCKED); + case thread_state::THREAD_ALLOC: + if (is_oom && blocking) { + transition(thread->second, thread_state::THREAD_BLOCKED); } else { - // don't block unless it is OOM - transition(thread->second, thread_state::SHUFFLE_RUNNING); + // don't block unless it is OOM on a blocking allocation + transition(thread->second, thread_state::THREAD_RUNNING); } break; default: { @@ -1343,21 +1686,21 @@ class spark_resource_adaptor final : public rmm::mr::device_memory_resource { return ret; } - void* do_allocate(std::size_t num_bytes, rmm::cuda_stream_view stream) override + void* do_allocate(std::size_t const num_bytes, rmm::cuda_stream_view stream) override { - auto tid = static_cast(pthread_self()); + auto const tid = static_cast(pthread_self()); while (true) { - bool likely_spill = pre_alloc(tid); + bool const likely_spill = pre_alloc(tid); try { void* ret = resource->allocate(num_bytes, stream); post_alloc_success(tid, likely_spill); return ret; - } catch (const rmm::out_of_memory& e) { + } catch (rmm::out_of_memory const& e) { // rmm::out_of_memory is what is thrown when an allocation failed // but there are other rmm::bad_alloc exceptions that could be // thrown as well, which are handled by the std::exception case. if (!post_alloc_failed(tid, true, likely_spill)) { throw; } - } catch (const std::exception& e) { + } catch (std::exception const& e) { post_alloc_failed(tid, false, likely_spill); throw; } @@ -1366,42 +1709,49 @@ class spark_resource_adaptor final : public rmm::mr::device_memory_resource { throw rmm::bad_alloc("Internal Error"); } + void dealloc_core(bool const is_for_cpu, std::unique_lock& lock) + { + auto const tid = static_cast(pthread_self()); + auto const thread = threads.find(tid); + if (thread != threads.end()) { + log_status("DEALLOC", tid, thread->second.task_id, thread->second.state); + } else { + log_status("DEALLOC", tid, -2, thread_state::UNKNOWN); + } + + for (auto& [thread_id, t_state] : threads) { + // Only update state for _other_ threads. We update only other threads, for the case + // where we are handling a free from the recursive case: when an allocation/free + // happened while handling an allocation failure in onAllocFailed. + // + // If we moved all threads to *_ALLOC_FREE, after we exit the recursive state and + // are back handling the original allocation failure, we are left with a thread + // in a state that won't be retried in `post_alloc_failed`. 
+ // + // By not changing our thread's state to THREAD_ALLOC_FREE, we keep the state + // the same, but we still let other threads know that there was a free and they should + // handle accordingly. + if (t_state.thread_id != tid) { + switch (t_state.state) { + case thread_state::THREAD_ALLOC: + if (is_for_cpu == t_state.is_cpu_alloc) { + transition(t_state, thread_state::THREAD_ALLOC_FREE); + } + break; + default: break; + } + } + } + wake_next_highest_priority_blocked(lock, true, is_for_cpu); + } + void do_deallocate(void* p, std::size_t size, rmm::cuda_stream_view stream) override { resource->deallocate(p, size, stream); // deallocate success if (size > 0) { std::unique_lock lock(state_mutex); - - auto tid = static_cast(pthread_self()); - auto thread = threads.find(tid); - if (thread != threads.end()) { - log_status("DEALLOC", tid, thread->second.task_id, thread->second.state); - } else { - log_status("DEALLOC", tid, -2, thread_state::UNKNOWN); - } - - for (auto thread = threads.begin(); thread != threads.end(); thread++) { - // Only update state for _other_ threads. We update only other threads, for the case - // where we are handling a free from the recursive case: when an allocation/free - // happened while handling an allocation failure in onAllocFailed. - // - // If we moved all threads to *_ALLOC_FREE, after we exit the recursive state and - // are back handling the original allocation failure, we are left with a thread - // in a state that won't be retried in `post_alloc_failed`. - // - // By not changing our thread's state to TASK_ALLOC_FREE, we keep the state - // the same, but we still let other threads know that there was a free and they should - // handle accordingly. - if (thread->second.thread_id != tid) { - switch (thread->second.state) { - case TASK_ALLOC: transition(thread->second, thread_state::TASK_ALLOC_FREE); break; - case SHUFFLE_ALLOC: transition(thread->second, thread_state::SHUFFLE_ALLOC_FREE); break; - default: break; - } - } - } - wake_next_highest_priority_blocked(lock, true); + dealloc_core(false, lock); } } @@ -1434,9 +1784,12 @@ JNIEXPORT jlong JNICALL Java_com_nvidia_spark_rapids_jni_SparkResourceAdaptor_cr auto wrapped = reinterpret_cast(child); cudf::jni::native_jstring nlogloc(env, log_loc); std::shared_ptr logger; + bool is_log_enabled; if (nlogloc.is_null()) { - logger = make_logger(); + logger = make_logger(); + is_log_enabled = false; } else { + is_log_enabled = true; std::string slog_loc(nlogloc.get()); if (slog_loc == "stderr") { logger = make_logger(std::cerr); @@ -1447,7 +1800,7 @@ JNIEXPORT jlong JNICALL Java_com_nvidia_spark_rapids_jni_SparkResourceAdaptor_cr } } - auto ret = new spark_resource_adaptor(env, wrapped, logger); + auto ret = new spark_resource_adaptor(env, wrapped, logger, is_log_enabled); return cudf::jni::ptr_as_jlong(ret); } CATCH_STD(env, 0) @@ -1466,44 +1819,59 @@ Java_com_nvidia_spark_rapids_jni_SparkResourceAdaptor_releaseAdaptor(JNIEnv* env } JNIEXPORT void JNICALL -Java_com_nvidia_spark_rapids_jni_SparkResourceAdaptor_associateThreadWithTask( +Java_com_nvidia_spark_rapids_jni_SparkResourceAdaptor_startDedicatedTaskThread( JNIEnv* env, jclass, jlong ptr, jlong thread_id, jlong task_id) { JNI_NULL_CHECK(env, ptr, "resource_adaptor is null", ); try { cudf::jni::auto_set_device(env); auto mr = reinterpret_cast(ptr); - mr->associate_thread_with_task(thread_id, task_id); + mr->start_dedicated_task_thread(thread_id, task_id); } CATCH_STD(env, ) } JNIEXPORT void JNICALL 
-Java_com_nvidia_spark_rapids_jni_SparkResourceAdaptor_associateThreadWithShuffle(JNIEnv* env, - jclass, - jlong ptr, - jlong thread_id) +Java_com_nvidia_spark_rapids_jni_SparkResourceAdaptor_poolThreadWorkingOnTasks( + JNIEnv* env, jclass, jlong ptr, jboolean is_for_shuffle, jlong thread_id, jlongArray task_ids) { JNI_NULL_CHECK(env, ptr, "resource_adaptor is null", ); + JNI_NULL_CHECK(env, task_ids, "task_ids is null", ); try { cudf::jni::auto_set_device(env); + cudf::jni::native_jlongArray jtask_ids(env, task_ids); + std::unordered_set task_set(jtask_ids.begin(), jtask_ids.end()); auto mr = reinterpret_cast(ptr); - mr->associate_thread_with_shuffle(thread_id); + mr->pool_thread_working_on_tasks(is_for_shuffle, thread_id, task_set); } CATCH_STD(env, ) } JNIEXPORT void JNICALL -Java_com_nvidia_spark_rapids_jni_SparkResourceAdaptor_removeThreadAssociation(JNIEnv* env, - jclass, - jlong ptr, - jlong thread_id) +Java_com_nvidia_spark_rapids_jni_SparkResourceAdaptor_poolThreadFinishedForTasks( + JNIEnv* env, jclass, jlong ptr, jlong thread_id, jlongArray task_ids) { JNI_NULL_CHECK(env, ptr, "resource_adaptor is null", ); + JNI_NULL_CHECK(env, task_ids, "task_ids is null", ); try { cudf::jni::auto_set_device(env); + cudf::jni::native_jlongArray jtask_ids(env, task_ids); + std::unordered_set task_set(jtask_ids.begin(), jtask_ids.end()); auto mr = reinterpret_cast(ptr); - mr->remove_thread_association(thread_id); + mr->pool_thread_finished_for_tasks(thread_id, task_set); + } + CATCH_STD(env, ) +} + +JNIEXPORT void JNICALL +Java_com_nvidia_spark_rapids_jni_SparkResourceAdaptor_removeThreadAssociation( + JNIEnv* env, jclass, jlong ptr, jlong thread_id, jlong task_id) +{ + JNI_NULL_CHECK(env, ptr, "resource_adaptor is null", ); + try { + cudf::jni::auto_set_device(env); + auto mr = reinterpret_cast(ptr); + mr->remove_thread_association(thread_id, task_id); } CATCH_STD(env, ) } @@ -1522,29 +1890,38 @@ JNIEXPORT void JNICALL Java_com_nvidia_spark_rapids_jni_SparkResourceAdaptor_tas CATCH_STD(env, ) } -JNIEXPORT void JNICALL -Java_com_nvidia_spark_rapids_jni_SparkResourceAdaptor_threadCouldBlockOnShuffle(JNIEnv* env, - jclass, - jlong ptr, - jlong thread_id) +JNIEXPORT void JNICALL Java_com_nvidia_spark_rapids_jni_SparkResourceAdaptor_submittingToPool( + JNIEnv* env, jclass, jlong ptr, jlong thread_id) +{ + JNI_NULL_CHECK(env, ptr, "resource_adaptor is null", ); + try { + cudf::jni::auto_set_device(env); + auto mr = reinterpret_cast(ptr); + mr->submitting_to_pool(thread_id); + } + CATCH_STD(env, ) +} + +JNIEXPORT void JNICALL Java_com_nvidia_spark_rapids_jni_SparkResourceAdaptor_waitingOnPool( + JNIEnv* env, jclass, jlong ptr, jlong thread_id) { JNI_NULL_CHECK(env, ptr, "resource_adaptor is null", ); try { cudf::jni::auto_set_device(env); auto mr = reinterpret_cast(ptr); - mr->thread_could_block_on_shuffle(thread_id); + mr->waiting_on_pool(thread_id); } CATCH_STD(env, ) } -JNIEXPORT void JNICALL Java_com_nvidia_spark_rapids_jni_SparkResourceAdaptor_threadDoneWithShuffle( +JNIEXPORT void JNICALL Java_com_nvidia_spark_rapids_jni_SparkResourceAdaptor_doneWaitingOnPool( JNIEnv* env, jclass, jlong ptr, jlong thread_id) { JNI_NULL_CHECK(env, ptr, "resource_adaptor is null", ); try { cudf::jni::auto_set_device(env); auto mr = reinterpret_cast(ptr); - mr->thread_done_with_shuffle(thread_id); + mr->done_waiting_on_pool(thread_id); } CATCH_STD(env, ) } @@ -1688,4 +2065,70 @@ JNIEXPORT void JNICALL Java_com_nvidia_spark_rapids_jni_SparkResourceAdaptor_end } CATCH_STD(env, ) } + +JNIEXPORT void JNICALL 
Java_com_nvidia_spark_rapids_jni_SparkResourceAdaptor_checkAndBreakDeadlocks( + JNIEnv* env, jclass, jlong ptr) +{ + JNI_NULL_CHECK(env, ptr, "resource_adaptor is null", ); + try { + cudf::jni::auto_set_device(env); + auto mr = reinterpret_cast(ptr); + mr->check_and_break_deadlocks(); + } + CATCH_STD(env, ) +} + +JNIEXPORT jboolean JNICALL Java_com_nvidia_spark_rapids_jni_SparkResourceAdaptor_preCpuAlloc( + JNIEnv* env, jclass, jlong ptr, jlong amount, jboolean blocking) +{ + JNI_NULL_CHECK(env, ptr, "resource_adaptor is null", 0); + try { + cudf::jni::auto_set_device(env); + auto mr = reinterpret_cast(ptr); + return mr->cpu_prealloc(amount, blocking); + } + CATCH_STD(env, 0) +} + +JNIEXPORT void JNICALL +Java_com_nvidia_spark_rapids_jni_SparkResourceAdaptor_postCpuAllocSuccess(JNIEnv* env, + jclass, + jlong ptr, + jlong addr, + jlong amount, + jboolean blocking, + jboolean was_recursive) +{ + JNI_NULL_CHECK(env, ptr, "resource_adaptor is null", ); + try { + cudf::jni::auto_set_device(env); + auto mr = reinterpret_cast(ptr); + mr->cpu_postalloc_success(reinterpret_cast(addr), amount, blocking, was_recursive); + } + CATCH_STD(env, ) +} + +JNIEXPORT jboolean JNICALL Java_com_nvidia_spark_rapids_jni_SparkResourceAdaptor_postCpuAllocFailed( + JNIEnv* env, jclass, jlong ptr, jboolean was_oom, jboolean blocking, jboolean was_recursive) +{ + JNI_NULL_CHECK(env, ptr, "resource_adaptor is null", 0); + try { + cudf::jni::auto_set_device(env); + auto mr = reinterpret_cast(ptr); + return mr->cpu_postalloc_failed(was_oom, blocking, was_recursive); + } + CATCH_STD(env, 0) +} + +JNIEXPORT void JNICALL Java_com_nvidia_spark_rapids_jni_SparkResourceAdaptor_cpuDeallocate( + JNIEnv* env, jclass, jlong ptr, jlong addr, jlong amount) +{ + JNI_NULL_CHECK(env, ptr, "resource_adaptor is null", ); + try { + cudf::jni::auto_set_device(env); + auto mr = reinterpret_cast(ptr); + mr->cpu_dealloc(reinterpret_cast(addr), amount); + } + CATCH_STD(env, ) +} } diff --git a/src/main/java/com/nvidia/spark/rapids/jni/CpuRetryOOM.java b/src/main/java/com/nvidia/spark/rapids/jni/CpuRetryOOM.java new file mode 100644 index 0000000000..a8fb42390a --- /dev/null +++ b/src/main/java/com/nvidia/spark/rapids/jni/CpuRetryOOM.java @@ -0,0 +1,32 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.nvidia.spark.rapids.jni; + +/** + * A special version of an out of memory error that indicates we ran out of off heap memory, but + * should roll back to a point when all memory for the task is spillable and then retry the + * operation. 
+ */ +public class CpuRetryOOM extends OffHeapOOM { + public CpuRetryOOM() { + super(); + } + + public CpuRetryOOM(String message) { + super(message); + } +} diff --git a/src/main/java/com/nvidia/spark/rapids/jni/CpuSplitAndRetryOOM.java b/src/main/java/com/nvidia/spark/rapids/jni/CpuSplitAndRetryOOM.java new file mode 100644 index 0000000000..16e6e7239f --- /dev/null +++ b/src/main/java/com/nvidia/spark/rapids/jni/CpuSplitAndRetryOOM.java @@ -0,0 +1,32 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.nvidia.spark.rapids.jni; + +/** + * A special version of an out of memory error that indicates we ran out of off heap memory, but + * should roll back to a point when all memory for the task is spillable and then retry the + * operation with the input data split to make it ideally use less off heap memory overall. + */ +public class CpuSplitAndRetryOOM extends OffHeapOOM { + public CpuSplitAndRetryOOM() { + super(); + } + + public CpuSplitAndRetryOOM(String message) { + super(message); + } +} diff --git a/src/main/java/com/nvidia/spark/rapids/jni/RetryOOM.java b/src/main/java/com/nvidia/spark/rapids/jni/GpuRetryOOM.java similarity index 85% rename from src/main/java/com/nvidia/spark/rapids/jni/RetryOOM.java rename to src/main/java/com/nvidia/spark/rapids/jni/GpuRetryOOM.java index 62d5e28cca..f7eec8be46 100644 --- a/src/main/java/com/nvidia/spark/rapids/jni/RetryOOM.java +++ b/src/main/java/com/nvidia/spark/rapids/jni/GpuRetryOOM.java @@ -17,15 +17,15 @@ package com.nvidia.spark.rapids.jni; /** - * A special version of an out of memory error that indicates we ran out of memory, but should + * A special version of an out of memory error that indicates we ran out of GPU memory, but should * roll back to a point when all memory for the task is spillable and then retry the operation. 
*/ -public class RetryOOM extends GpuOOM { - public RetryOOM() { +public class GpuRetryOOM extends GpuOOM { + public GpuRetryOOM() { super(); } - public RetryOOM(String message) { + public GpuRetryOOM(String message) { super(message); } } diff --git a/src/main/java/com/nvidia/spark/rapids/jni/SplitAndRetryOOM.java b/src/main/java/com/nvidia/spark/rapids/jni/GpuSplitAndRetryOOM.java similarity index 85% rename from src/main/java/com/nvidia/spark/rapids/jni/SplitAndRetryOOM.java rename to src/main/java/com/nvidia/spark/rapids/jni/GpuSplitAndRetryOOM.java index 022c6952a1..4c3b3baeba 100644 --- a/src/main/java/com/nvidia/spark/rapids/jni/SplitAndRetryOOM.java +++ b/src/main/java/com/nvidia/spark/rapids/jni/GpuSplitAndRetryOOM.java @@ -17,16 +17,16 @@ package com.nvidia.spark.rapids.jni; /** - * A special version of an out of memory error that indicates we ran out of memory, but should + * A special version of an out of memory error that indicates we ran out of GPU memory, but should * roll back to a point when all memory for the task is spillable and then retry the operation * with the input data split to make it ideally use less GPU memory overall. */ -public class SplitAndRetryOOM extends GpuOOM { - public SplitAndRetryOOM() { +public class GpuSplitAndRetryOOM extends GpuOOM { + public GpuSplitAndRetryOOM() { super(); } - public SplitAndRetryOOM(String message) { + public GpuSplitAndRetryOOM(String message) { super(message); } } diff --git a/src/main/java/com/nvidia/spark/rapids/jni/OffHeapOOM.java b/src/main/java/com/nvidia/spark/rapids/jni/OffHeapOOM.java new file mode 100644 index 0000000000..9379775072 --- /dev/null +++ b/src/main/java/com/nvidia/spark/rapids/jni/OffHeapOOM.java @@ -0,0 +1,32 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.nvidia.spark.rapids.jni; + +/** + * A special version of an out of memory error that indicates we ran out of off heap CPU memory. + * This is mostly to avoid a fatal error that would force the worker process to restart. This + * should be recoverable. + */ +public class OffHeapOOM extends RuntimeException { + public OffHeapOOM() { + super(); + } + + public OffHeapOOM(String message) { + super(message); + } +} diff --git a/src/main/java/com/nvidia/spark/rapids/jni/RmmSpark.java b/src/main/java/com/nvidia/spark/rapids/jni/RmmSpark.java index 3132dd9cd0..558124e2fe 100644 --- a/src/main/java/com/nvidia/spark/rapids/jni/RmmSpark.java +++ b/src/main/java/com/nvidia/spark/rapids/jni/RmmSpark.java @@ -114,47 +114,122 @@ public static long getCurrentThreadId() { } /** - * Associate a thread with a given task id. + * Indicate that a given thread is dedicated to a specific task. This thread can be part of a + * thread pool, but if it blocks it can never transitively block another active task. * @param threadId the thread ID to use - * @param taskId the task ID this thread is associated with. + * @param taskId the task ID this thread is working on. 
*/ - public static void associateThreadWithTask(long threadId, long taskId) { + public static void startDedicatedTaskThread(long threadId, long taskId, Thread thread) { synchronized (Rmm.class) { if (sra != null && sra.isOpen()) { - sra.associateThreadWithTask(threadId, taskId); + ThreadStateRegistry.addThread(threadId, thread); + sra.startDedicatedTaskThread(threadId, taskId); } } } /** - * Associate the current thread with a given task id. - * @param taskId the task ID this thread is associated with. + * Indicate that the current thread is dedicated to a specific task. This thread can be part of + * a thread pool, but if this blocks it can never transitively block another active task. + * @param taskId the task ID this thread is working on. */ - public static void associateCurrentThreadWithTask(long taskId) { - associateThreadWithTask(getCurrentThreadId(), taskId); + public static void currentThreadIsDedicatedToTask(long taskId) { + startDedicatedTaskThread(getCurrentThreadId(), taskId, Thread.currentThread()); } /** - * Associate a thread with shuffle. - * @param threadId the thread ID to associate (not java thread id). + * A shuffle thread has started to work on some tasks. + * @param threadId the thread ID (not java thread id). + * @param thread the java thread + * @param taskIds the IDs of tasks that this is starting work on. */ - public static void associateThreadWithShuffle(long threadId) { + public static void shuffleThreadWorkingTasks(long threadId, Thread thread, long[] taskIds) { synchronized (Rmm.class) { if (sra != null && sra.isOpen()) { - sra.associateThreadWithShuffle(threadId); + ThreadStateRegistry.addThread(threadId, thread); + sra.poolThreadWorkingOnTasks(true, threadId, taskIds); } } } /** - * Associate the current thread with shuffle. + * The current thread is a shuffle thread and has started to work on some tasks. + * @param taskIds the IDs of the tasks that this is starting work on. */ - public static void associateCurrentThreadWithShuffle() { - associateThreadWithShuffle(getCurrentThreadId()); + public static void shuffleThreadWorkingOnTasks(long[] taskIds) { + shuffleThreadWorkingTasks(getCurrentThreadId(), Thread.currentThread(), taskIds); } + /** + * The current thread which is in a thread pool that could transitively block other tasks has + * started to work on a task. + * @param taskId the ID of the task that this is starting work on. + */ + public static void poolThreadWorkingOnTask(long taskId) { + long threadId = getCurrentThreadId(); + Thread thread = Thread.currentThread(); + long[] taskIds = new long[]{taskId}; + synchronized (Rmm.class) { + if (sra != null && sra.isOpen()) { + ThreadStateRegistry.addThread(threadId, thread); + sra.poolThreadWorkingOnTasks(false, threadId, taskIds); + } + } + } + + /** + * A thread in a thread pool that could transitively block other tasks has finished work + * on some tasks. + * @param threadId the thread ID (not java thread id). + * @param taskIds the IDs of the tasks that are done. + */ + public static void poolThreadFinishedForTasks(long threadId, long[] taskIds) { + synchronized (Rmm.class) { + if (sra != null && sra.isOpen()) { + sra.poolThreadFinishedForTasks(threadId, taskIds); + } + } + } + + /** + * A shuffle thread has finished work on some tasks. + * @param threadId the thread ID (not java thread id). + * @param taskIds the IDs of the tasks that are done. 
+ */ + private static void shuffleThreadFinishedForTasks(long threadId, long[] taskIds) { + poolThreadFinishedForTasks(threadId, taskIds); + } + + /** + * The current thread which is in a thread pool that could transitively block other tasks + * has finished work on some tasks. + * @param taskIds the IDs of the tasks that are done. + */ + public static void poolThreadFinishedForTasks(long[] taskIds) { + poolThreadFinishedForTasks(getCurrentThreadId(), taskIds); + } + /** + * The current shuffle thread has finished work on some tasks. + * @param taskIds the IDs of the tasks that are done. + */ + public static void shuffleThreadFinishedForTasks(long[] taskIds) { + shuffleThreadFinishedForTasks(getCurrentThreadId(), taskIds); + } + /** + * The current thread which is in a thread pool that could transitively block other tasks + * has finished work on a task. + * @param taskId the ID of the task that is done. + */ + public static void poolThreadFinishedForTask(long taskId) { + poolThreadFinishedForTasks(getCurrentThreadId(), new long[]{taskId}); + } + + /** + * Indicate that a retry block has started for a given thread. + * @param threadId the id of the thread, not the java ID. + */ public static void startRetryBlock(long threadId) { synchronized (Rmm.class) { if (sra != null && sra.isOpen()) { @@ -163,10 +238,17 @@ public static void startRetryBlock(long threadId) { } } + /** + * Indicate that the current thread is entering a retry block. + */ public static void currentThreadStartRetryBlock() { startRetryBlock(getCurrentThreadId()); } + /** + * Indicate that a retry block has ended for a given thread. + * @param threadId the id of the thread, not the java ID. + */ public static void endRetryBlock(long threadId) { synchronized (Rmm.class) { if (sra != null && sra.isOpen()) { @@ -175,28 +257,62 @@ public static void endRetryBlock(long threadId) { } } + /** + * Indicate that the current thread is exiting a retry block. + */ public static void currentThreadEndRetryBlock() { - startRetryBlock(getCurrentThreadId()); + endRetryBlock(getCurrentThreadId()); + } + + private static void checkAndBreakDeadlocks() { + synchronized (Rmm.class) { + if (sra != null && sra.isOpen()) { + sra.checkAndBreakDeadlocks(); + } + } } /** - * Remove the given thread ID from any association. + * Remove the given thread ID from being associated with a given task * @param threadId the ID of the thread that is no longer a part of a task or shuffle * (not java thread id). */ - public static void removeThreadAssociation(long threadId) { + public static void removeDedicatedThreadAssociation(long threadId, long taskId) { + synchronized (Rmm.class) { + if (sra != null && sra.isOpen()) { + sra.removeThreadAssociation(threadId, taskId); + } + } + } + + /** + * Remove the current thread from being associated with the given task. + */ + public static void removeCurrentDedicatedThreadAssociation(long taskId) { + removeDedicatedThreadAssociation(getCurrentThreadId(), taskId); + } + + /** + * Remove all task associations for a given thread. This is intended to be used as a part + * of tests when a thread is shutting down, or for a pool thread when it is fully done. + * Dedicated task thread typically are cleaned when the task itself completes. 
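+   * For example (an illustrative sketch, not a required calling sequence):
+   * <pre>{@code
+   * long poolThreadId = RmmSpark.getCurrentThreadId();
+   * // ... the pool thread works on tasks until it is shut down ...
+   * RmmSpark.removeAllThreadAssociation(poolThreadId);
+   * }</pre>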
+ * @param threadId the id of the thread to clean up + */ + public static void removeAllThreadAssociation(long threadId) { synchronized (Rmm.class) { if (sra != null && sra.isOpen()) { - sra.removeThreadAssociation(threadId); + sra.removeThreadAssociation(threadId, -1); } } } /** - * Remove any association the current thread has. + * Remove all task associations for the current thread. This is intended to be used as a part + * of tests when a thread is shutting down, or for a pool thread when it is fully done. + * Dedicated task thread typically are cleaned when the task itself completes. */ - public static void removeCurrentThreadAssociation() { - removeThreadAssociation(getCurrentThreadId()); + public static void removeAllCurrentThreadAssociation() { + removeAllThreadAssociation(getCurrentThreadId()); } /** @@ -213,51 +329,75 @@ public static void taskDone(long taskId) { } /** - * Indicate that the given thread could block on shuffle. - * @param threadId the id of the thread that could block (not java thread id). + * A dedicated task thread is about to submit work to a pool that could transitively block it. + * @param threadId the ID of the thread that is about to submit the work. + */ + public static void submittingToPool(long threadId) { + synchronized (Rmm.class) { + if (sra != null && sra.isOpen()) { + sra.submittingToPool(threadId); + } + } + } + + /** + * The current thread is about to submit work to a thread pool that might transitively block + * this thread. This thread must be a dedicated task thread. + */ + public static void submittingToPool() { + submittingToPool(getCurrentThreadId()); + } + + /** + * A dedicated task thread is about to wait on work done on a pool that could transitively + * block it. + * @param threadId the ID of the thread that is about to wait. */ - public static void threadCouldBlockOnShuffle(long threadId) { + public static void waitingOnPool(long threadId) { synchronized (Rmm.class) { if (sra != null && sra.isOpen()) { - sra.threadCouldBlockOnShuffle(threadId); + sra.waitingOnPool(threadId); } } } /** - * Indicate that the current thread could block on shuffle. + * The current thread is about to wait on work done on a thread pool that might transitively block + * this thread. This thread must be a dedicated task thread. */ - public static void threadCouldBlockOnShuffle() { - threadCouldBlockOnShuffle(getCurrentThreadId()); + public static void waitingOnPool() { + waitingOnPool(getCurrentThreadId()); } /** - * Indicate that the given thread can no longer block on shuffle. - * @param threadId the ID of the thread that o longer can block on shuffle (not java thread id). + * A dedicated task thread is done waiting on a pool, either for a result or after submitting + * something to the pool. + * @param threadId the ID of the thread that is done. */ - public static void threadDoneWithShuffle(long threadId) { + public static void doneWaitingOnPool(long threadId) { synchronized (Rmm.class) { if (sra != null && sra.isOpen()) { - sra.threadDoneWithShuffle(threadId); + sra.doneWaitingOnPool(threadId); } } } /** - * Indicate that the current thread can no longer block on shuffle. + * The current thread is done waiting on a pool either for a result or after submitting something + * to the pool. This thread must be a dedicated task thread. 
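+   * For example, the expected pattern on a dedicated task thread (only a sketch; {@code pool}
+   * and {@code work} are assumed to exist):
+   * <pre>{@code
+   * RmmSpark.submittingToPool();
+   * Future<?> f = pool.submit(work);
+   * RmmSpark.doneWaitingOnPool();
+   * // ... later, when the result is needed ...
+   * RmmSpark.waitingOnPool();
+   * f.get();
+   * RmmSpark.doneWaitingOnPool();
+   * }</pre>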
   */
-  public static void threadDoneWithShuffle() {
-    threadDoneWithShuffle(getCurrentThreadId());
+  public static void doneWaitingOnPool() {
+    doneWaitingOnPool(getCurrentThreadId());
   }
 
   /**
-   * This should be called as a part of handling any RetryOOM or SplitAndRetryOOM exception.
+   * This should be called as a part of handling any GpuRetryOOM or GpuSplitAndRetryOOM exception.
    * The order should be something like.
    * <ol>
    *  <li>Catch Exception</li>
    *  <li>Mark any GPU input as spillable, (should have already had contig split called on it)</li>
    *  <li>call blockUntilReady</li>
-   *  <li>split the input data if SplitAndRetryOOM</li>
+   *  <li>split the input data if GpuSplitAndRetryOOM</li>
    *  <li>retry processing with the data</li>
    * </ol>
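+   * For example, a sketch of the retry idiom ({@code processBatch} and {@code splitInput} are
+   * assumed placeholder methods, not part of this API):
+   * <pre>{@code
+   * while (true) {
+   *   try {
+   *     processBatch(input); // the input should already be spillable here
+   *     break;
+   *   } catch (GpuRetryOOM oom) {
+   *     RmmSpark.blockThreadUntilReady();
+   *   } catch (GpuSplitAndRetryOOM oom) {
+   *     RmmSpark.blockThreadUntilReady();
+   *     input = splitInput(input); // retry with smaller pieces
+   *   }
+   * }
+   * }</pre>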
* This should be a NOOP if the thread is not in a state where it would need to block. Note @@ -279,7 +419,8 @@ public static void blockThreadUntilReady() { } /** - * Force the thread with the given ID to throw a RetryOOM on their next allocation attempt. + * Force the thread with the given ID to throw a GpuRetryOOM or CpuRetryOOM on their next + * allocation attempt, depending on the type of allocation being done. * @param threadId the ID of the thread to throw the exception (not java thread id). */ public static void forceRetryOOM(long threadId) { @@ -287,9 +428,10 @@ public static void forceRetryOOM(long threadId) { } /** - * Force the thread with the given ID to throw a RetryOOM on their next allocation attempt. + * Force the thread with the given ID to throw a GpuRetryOOM or CpuRetryOOM on their next + * allocation attempt, depending on the type of allocation being done. * @param threadId the ID of the thread to throw the exception (not java thread id). - * @param numOOMs the number of times the RetryOOM should be thrown + * @param numOOMs the number of times the *RetryOOM should be thrown */ public static void forceRetryOOM(long threadId, int numOOMs) { synchronized (Rmm.class) { @@ -302,7 +444,8 @@ public static void forceRetryOOM(long threadId, int numOOMs) { } /** - * Force the thread with the given ID to throw a SplitAndRetryOOM on their next allocation attempt. + * Force the thread with the given ID to throw a GpuSplitAndRetryOOM of CpuSplitAndRetryOOM + * on their next allocation attempt, depending on the allocation being done. * @param threadId the ID of the thread to throw the exception (not java thread id). */ public static void forceSplitAndRetryOOM(long threadId) { @@ -310,9 +453,10 @@ public static void forceSplitAndRetryOOM(long threadId) { } /** - * Force the thread with the given ID to throw a SplitAndRetryOOM on their next allocation attempt. + * Force the thread with the given ID to throw a GpuSplitAndRetryOOM or CpuSplitAndRetryOOm + * on their next allocation attempt, depending on the allocation being done. * @param threadId the ID of the thread to throw the exception (not java thread id). - * @param numOOMs the number of times the SplitAndRetryOOM should be thrown + * @param numOOMs the number of times the *SplitAndRetryOOM should be thrown */ public static void forceSplitAndRetryOOM(long threadId, int numOOMs) { synchronized (Rmm.class) { @@ -423,4 +567,79 @@ public static long getAndResetComputeTimeLostToRetryNs(long taskId) { } } } + + /** + * Called before doing an allocation on the CPU. This could throw an injected exception to help + * with testing. + * @param amount the amount of memory being requested + * @param blocking is this for a blocking allocate or a non-blocking one. + * @return a boolean that indicates if the allocation is recursive. Note that recursive + * allocations on the CPU are only allowed with non-blocking allocations. This must be passed + * back into the post allocations calls. + */ + public static boolean preCpuAlloc(long amount, boolean blocking) { + SparkResourceAdaptor local; + synchronized (Rmm.class) { + local = sra; + } + if (local != null && local.isOpen()) { + return local.preCpuAlloc(amount, blocking); + } else { + return false; + } + } + + /** + * The allocation that was going to be done succeeded. + * @param ptr a pointer to the memory that was allocated. + * @param amount the amount of memory that was allocated. + * @param blocking is this for a blocking allocate or a non-blocking one. 
+ * @param wasRecursive the boolean that was returned from `preCpuAlloc`. + */ + public static void postCpuAllocSuccess(long ptr, long amount, boolean blocking, + boolean wasRecursive) { + SparkResourceAdaptor local; + synchronized (Rmm.class) { + local = sra; + } + if (local != null && local.isOpen()) { + local.postCpuAllocSuccess(ptr, amount, blocking, wasRecursive); + } + } + + /** + * The allocation failed, and spilling didn't save it. + * @param wasOom was the failure caused by an OOM or something else. + * @param blocking is this for a blocking allocate or a non-blocking one. + * @param wasRecursive the boolean that was returned from `preCpuAlloc`. + * @return true if the allocation should be retried else false if the state machine + * thinks that a retry would not help. + */ + public static boolean postCpuAllocFailed(boolean wasOom, boolean blocking, boolean wasRecursive) { + SparkResourceAdaptor local; + synchronized (Rmm.class) { + local = sra; + } + if (local != null && local.isOpen()) { + return local.postCpuAllocFailed(wasOom, blocking, wasRecursive); + } else { + return false; + } + } + + /** + * Some CPU memory was freed. + * @param ptr a pointer to the memory being deallocated. + * @param amount the amount that was made available. + */ + public static void cpuDeallocate(long ptr, long amount) { + SparkResourceAdaptor local; + synchronized (Rmm.class) { + local = sra; + } + if (local != null && local.isOpen()) { + local.cpuDeallocate(ptr, amount); + } + } + } diff --git a/src/main/java/com/nvidia/spark/rapids/jni/RmmSparkThreadState.java b/src/main/java/com/nvidia/spark/rapids/jni/RmmSparkThreadState.java index 8cd35f1a40..1a6a61b783 100644 --- a/src/main/java/com/nvidia/spark/rapids/jni/RmmSparkThreadState.java +++ b/src/main/java/com/nvidia/spark/rapids/jni/RmmSparkThreadState.java @@ -22,24 +22,16 @@ */ public enum RmmSparkThreadState { UNKNOWN(-1), // thread is not associated with anything... 
-  TASK_RUNNING(0), // task thread running normally
-  TASK_WAIT_ON_SHUFFLE(1), // task thread waiting on shuffle
-  TASK_BUFN_WAIT_ON_SHUFFLE(2), // task thread waiting on shuffle, but marked as BUFN
-  TASK_ALLOC(3), // task thread in the middle of doing an allocation
-  TASK_ALLOC_FREE(4), // task thread in the middle of doing an allocation and a free happened
-  TASK_BLOCKED(5), // task thread that is temporarily blocked
-  TASK_BUFN_THROW(6), // task thread that should throw an exception to roll back before blocking
-  TASK_BUFN_WAIT(7), // task thread that threw an exception to roll back and now should
+  THREAD_RUNNING(0), // task thread running normally
+  THREAD_ALLOC(1), // task thread in the middle of doing an allocation
+  THREAD_ALLOC_FREE(2), // task thread in the middle of doing an allocation and a free happened
+  THREAD_BLOCKED(3), // task thread that is temporarily blocked
+  THREAD_BUFN_THROW(4), // task thread that should throw an exception to roll back before blocking
+  THREAD_BUFN_WAIT(5), // task thread that threw an exception to roll back and now should
   // block the next time alloc is called
-  TASK_BUFN(8), // task thread that is blocked until higher priority tasks start to succeed
-  TASK_SPLIT_THROW(9), // task thread that should throw an exception to split input and retry
-  TASK_REMOVE_THROW(10), // task thread that is being removed and needs to throw an exception
-  SHUFFLE_RUNNING(11), // shuffle thread that is running normally
-  SHUFFLE_ALLOC(12), // shuffle thread that is in the middle of doing an alloc
-  SHUFFLE_ALLOC_FREE(13), // shuffle thread that is doing an alloc and a free happened.
-  SHUFFLE_BLOCKED(14), // shuffle thread that is temporarily blocked
-  SHUFFLE_THROW(15), // shuffle thread that needs to throw an OOM
-  SHUFFLE_REMOVE_THROW(16); // shuffle thread that is being removed and needs to throw an exception
+  THREAD_BUFN(6), // task thread that is blocked until higher priority tasks start to succeed
+  THREAD_SPLIT_THROW(7), // task thread that should throw an exception to split input and retry
+  THREAD_REMOVE_THROW(8); // task thread that is being removed and needs to throw an exception
 
   private final int nativeId;
 
diff --git a/src/main/java/com/nvidia/spark/rapids/jni/SparkResourceAdaptor.java b/src/main/java/com/nvidia/spark/rapids/jni/SparkResourceAdaptor.java
index 8d98729dfc..74f1946748 100644
--- a/src/main/java/com/nvidia/spark/rapids/jni/SparkResourceAdaptor.java
+++ b/src/main/java/com/nvidia/spark/rapids/jni/SparkResourceAdaptor.java
@@ -26,7 +26,15 @@ public class SparkResourceAdaptor
     NativeDepsLoader.loadNativeDeps();
   }
 
+  /**
+   * How often (in milliseconds) the SparkResourceAdaptor polls thread states as a watchdog to
+   * break up potential deadlocks.
+   */
+  private static final long pollingPeriod = Long.getLong(
+      "ai.rapids.cudf.spark.rmmWatchdogPollingPeriod", 100);
+
   private long handle = 0;
+  private Thread watchDog;
 
   /**
    * Create a new tracking resource adaptor.
@@ -46,6 +54,17 @@ public SparkResourceAdaptor(RmmEventHandlerResourceAdaptor wrapped,
       String logLoc) {
     super(wrapped);
+    watchDog = new Thread(() -> {
+      try {
+        while (handle > 0) {
+          checkAndBreakDeadlocks();
+          Thread.sleep(pollingPeriod);
+        }
+      } catch (InterruptedException e) {
+        // We are going to exit, so ignore the exception
+        Thread.currentThread().interrupt();
+      }
+    }, "SparkResourceAdaptor WatchDog");
     // Do a little normalization before setting up logging...
if ("stderr".equalsIgnoreCase(logLoc)) { logLoc = "stderr"; @@ -53,6 +72,8 @@ public SparkResourceAdaptor(RmmEventHandlerResourceAdaptor 0) { + poolThreadWorkingOnTasks(getHandle(), isForShuffle, threadId, taskIds); + } + } + + public void poolThreadFinishedForTasks(long threadId, long[] taskIds) { + if (taskIds.length > 0) { + poolThreadFinishedForTasks(getHandle(), threadId, taskIds); + } } /** * Remove the given thread ID from any association. * @param threadId the ID of the thread that is no longer a part of a task or shuffle (not java thread id). + * @param taskId the task that is being removed. If the task id is -1, then any/all tasks are removed. */ - public void removeThreadAssociation(long threadId) { - removeThreadAssociation(getHandle(), threadId); + public void removeThreadAssociation(long threadId, long taskId) { + removeThreadAssociation(getHandle(), threadId, taskId); } /** @@ -117,41 +158,50 @@ public void taskDone(long taskId) { } /** - * Indicate that the given thread could block on shuffle. - * @param threadId the id of the thread that could block (not java thread id). + * A dedicated task thread is going to submit work to a pool. + * @param threadId the ID of the thread that will submit the work. */ - public void threadCouldBlockOnShuffle(long threadId) { - threadCouldBlockOnShuffle(getHandle(), threadId); + public void submittingToPool(long threadId) { + submittingToPool(getHandle(), threadId); } /** - * Indicate that the given thread can no longer block on shuffle. - * @param threadId the ID of the thread that o longer can block on shuffle (not java thread id). + * A dedicated task thread is going to wait on work in a pool to complete. + * @param threadId the ID of the thread that will submit the work. */ - public void threadDoneWithShuffle(long threadId) { - threadDoneWithShuffle(getHandle(), threadId); + public void waitingOnPool(long threadId) { + waitingOnPool(getHandle(), threadId); } /** - * Force the thread with the given ID to throw a RetryOOM on their next allocation attempt. + * A dedicated task thread is done waiting on a pool. This could be because of submitting + * something to the pool or waiting on a result from the pool. + * @param threadId the ID of the thread that is done. + */ + public void doneWaitingOnPool(long threadId) { + doneWaitingOnPool(getHandle(), threadId); + } + + /** + * Force the thread with the given ID to throw a GpuRetryOOM on their next allocation attempt. * @param threadId the ID of the thread to throw the exception (not java thread id). - * @param numOOMs the number of times the RetryOOM should be thrown + * @param numOOMs the number of times the GpuRetryOOM should be thrown */ public void forceRetryOOM(long threadId, int numOOMs) { forceRetryOOM(getHandle(), threadId, numOOMs); } /** - * Force the thread with the given ID to throw a SplitAndRetryOOM on their next allocation attempt. + * Force the thread with the given ID to throw a GpuSplitAndRetryOOM on their next allocation attempt. * @param threadId the ID of the thread to throw the exception (not java thread id). - * @param numOOMs the number of times the SplitAndRetryOOM should be thrown + * @param numOOMs the number of times the GpuSplitAndRetryOOM should be thrown */ public void forceSplitAndRetryOOM(long threadId, int numOOMs) { forceSplitAndRetryOOM(getHandle(), threadId, numOOMs); } /** - * Force the thread with the given ID to throw a SplitAndRetryOOM on their next allocation attempt. 
+ * Force the thread with the given ID to throw a GpuSplitAndRetryOOM on their next allocation attempt. * @param threadId the ID of the thread to throw the exception (not java thread id). * @param numTimes the number of times the CudfException should be thrown */ @@ -186,6 +236,49 @@ public long getAndResetComputeTimeLostToRetry(long taskId) { return getAndResetComputeTimeLostToRetry(getHandle(), taskId); } + + /** + * Called before doing an allocation on the CPU. This could throw an injected exception to help + * with testing. + * @param amount the amount of memory being requested + * @param blocking is this for a blocking allocate or a non-blocking one. + */ + public boolean preCpuAlloc(long amount, boolean blocking) { + return preCpuAlloc(getHandle(), amount, blocking); + } + + /** + * The allocation that was going to be done succeeded. + * @param ptr a pointer to the memory that was allocated. + * @param amount the amount of memory that was allocated. + * @param blocking is this for a blocking allocate or a non-blocking one. + * @param wasRecursive the result of calling preCpuAlloc. + */ + public void postCpuAllocSuccess(long ptr, long amount, boolean blocking, boolean wasRecursive) { + postCpuAllocSuccess(getHandle(), ptr, amount, blocking, wasRecursive); + } + + /** + * The allocation failed, and spilling didn't save it. + * @param wasOom was the failure caused by an OOM or something else. + * @param blocking is this for a blocking allocate or a non-blocking one. + * @param wasRecursive the result of calling preCpuAlloc + * @return true if the allocation should be retried else false if the state machine + * thinks that a retry would not help. + */ + public boolean postCpuAllocFailed(boolean wasOom, boolean blocking, boolean wasRecursive) { + return postCpuAllocFailed(getHandle(), wasOom, blocking, wasRecursive); + } + + /** + * Some CPU memory was freed. + * @param ptr a pointer to the memory being deallocated. + * @param amount the amount that was made available. + */ + public void cpuDeallocate(long ptr, long amount) { + cpuDeallocate(getHandle(), ptr, amount); + } + /** * Get the ID of the current thread that can be used with the other SparkResourceAdaptor APIs. * Don't use the java thread ID. They are not related. 
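+   * For example:
+   * <pre>{@code
+   * long nativeThreadId = RmmSpark.getCurrentThreadId(); // not Thread.currentThread().getId()
+   * }</pre>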
@@ -194,12 +287,14 @@ public long getAndResetComputeTimeLostToRetry(long taskId) { private native static long createNewAdaptor(long wrappedHandle, String logLoc); private native static void releaseAdaptor(long handle); - private static native void associateThreadWithTask(long handle, long threadId, long taskId); - private static native void associateThreadWithShuffle(long handle, long threadId); - private static native void removeThreadAssociation(long handle, long threadId); + private static native void startDedicatedTaskThread(long handle, long threadId, long taskId); + private static native void poolThreadWorkingOnTasks(long handle, boolean isForShuffle, long threadId, long[] taskIds); + private static native void poolThreadFinishedForTasks(long handle, long threadId, long[] taskIds); + private static native void removeThreadAssociation(long handle, long threadId, long taskId); private static native void taskDone(long handle, long taskId); - private static native void threadCouldBlockOnShuffle(long handle, long threadId); - private static native void threadDoneWithShuffle(long handle, long threadId); + private static native void submittingToPool(long handle, long threadId); + private static native void waitingOnPool(long handle, long threadId); + private static native void doneWaitingOnPool(long handle, long threadId); private static native void forceRetryOOM(long handle, long threadId, int numOOMs); private static native void forceSplitAndRetryOOM(long handle, long threadId, int numOOMs); private static native void forceCudfException(long handle, long threadId, int numTimes); @@ -211,4 +306,11 @@ public long getAndResetComputeTimeLostToRetry(long taskId) { private static native long getAndResetComputeTimeLostToRetry(long handle, long taskId); private static native void startRetryBlock(long handle, long threadId); private static native void endRetryBlock(long handle, long threadId); + private static native void checkAndBreakDeadlocks(long handle); + private static native boolean preCpuAlloc(long handle, long amount, boolean blocking); + private static native void postCpuAllocSuccess(long handle, long ptr, long amount, + boolean blocking, boolean wasRecursive); + private static native boolean postCpuAllocFailed(long handle, boolean wasOom, + boolean blocking, boolean wasRecursive); + private static native void cpuDeallocate(long handle, long ptr, long amount); } diff --git a/src/main/java/com/nvidia/spark/rapids/jni/ThreadStateRegistry.java b/src/main/java/com/nvidia/spark/rapids/jni/ThreadStateRegistry.java new file mode 100644 index 0000000000..4e7021e6ea --- /dev/null +++ b/src/main/java/com/nvidia/spark/rapids/jni/ThreadStateRegistry.java @@ -0,0 +1,67 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package com.nvidia.spark.rapids.jni; + +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.util.HashMap; +import java.util.HashSet; + +/** + * This is used to allow us to map a native thread id to a java thread so we can look at the + * state from a java perspective. + */ +class ThreadStateRegistry { + private static final Logger LOG = LoggerFactory.getLogger(ThreadStateRegistry.class); + + private static final HashMap knownThreads = new HashMap<>(); + + public static synchronized void addThread(long nativeId, Thread t) { + knownThreads.put(nativeId, t); + } + + // Typically called from JNI + public static synchronized void removeThread(long threadId) { + knownThreads.remove(threadId); + } + + // This is likely called from JNI + public static synchronized boolean isThreadBlocked(long nativeId) { + Thread t = knownThreads.get(nativeId); + if (t == null || !t.isAlive()) { + // Dead is as good as blocked. This is mostly for tests, not so much for + // production + return true; + } + Thread.State state = t.getState(); + switch (state) { + case BLOCKED: + // fall through + case WAITING: + // fall through + case TIMED_WAITING: + return true; + case TERMINATED: + // Technically there is a race with `!t.isAlive` check above, and dead is as good as + // blocked. + return true; + default: + return false; + } + } +} diff --git a/src/test/java/com/nvidia/spark/rapids/jni/LimitingOffHeapAllocForTests.java b/src/test/java/com/nvidia/spark/rapids/jni/LimitingOffHeapAllocForTests.java new file mode 100644 index 0000000000..eb32667dc7 --- /dev/null +++ b/src/test/java/com/nvidia/spark/rapids/jni/LimitingOffHeapAllocForTests.java @@ -0,0 +1,90 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.nvidia.spark.rapids.jni; + +import ai.rapids.cudf.HostMemoryBuffer; + +import java.util.Optional; + +/** + * This provides a way to allocate and deallocate off heap buffers using the RmmSpark APIs for + * retry on allocations. 
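+ * For example (illustrative only):
+ * <pre>{@code
+ * LimitingOffHeapAllocForTests.setLimit(1024 * 1024);
+ * try (HostMemoryBuffer buff = LimitingOffHeapAllocForTests.alloc(4 * 1024)) {
+ *   // use the buffer; the thread may block here, or see an injected CpuRetryOOM
+ * }
+ * }</pre>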
+ */ +public class LimitingOffHeapAllocForTests { + private static long limit; + private static long amountAllocated = 0; + public static synchronized void setLimit(long limit) { + LimitingOffHeapAllocForTests.limit = limit; + if (amountAllocated > 0) { + throw new IllegalStateException("PREVIOUS TEST LEAKED MEMORY!!!"); + } + } + + private static Optional allocInternal(long amount, boolean blocking) { + Optional ret = Optional.empty(); + boolean wasOom = true; + boolean isRecursive = RmmSpark.preCpuAlloc(amount, blocking); + try { + synchronized (LimitingOffHeapAllocForTests.class) { + if (amountAllocated + amount <= limit) { + amountAllocated += amount; + wasOom = false; + HostMemoryBuffer buff = HostMemoryBuffer.allocate(amount); + final long ptr = buff.getAddress(); + buff.setEventHandler(refCount -> { + if (refCount == 0) { + synchronized (LimitingOffHeapAllocForTests.class) { + amountAllocated -= amount; + } + RmmSpark.cpuDeallocate(ptr, amount); + } + }); + ret = Optional.of(buff); + } + } + } finally { + if (ret.isPresent()) { + RmmSpark.postCpuAllocSuccess(ret.get().getAddress(), amount, blocking, isRecursive); + } else { + RmmSpark.postCpuAllocFailed(wasOom, blocking, isRecursive); + } + } + return ret; + } + + /** + * Do a non-blocking allocation + * @param amount the amount to allocate + * @return the allocated buffer or not. + */ + public static Optional tryAlloc(long amount) { + return allocInternal(amount, false); + } + + /** + * Do a blocking allocation + * @param amount the amount to allocate + * @return the allocated buffer + */ + public static HostMemoryBuffer alloc(long amount) { + Optional ret = Optional.empty(); + while (!ret.isPresent()) { + ret = allocInternal(amount, true); + } + return ret.get(); + } +} \ No newline at end of file diff --git a/src/test/java/com/nvidia/spark/rapids/jni/RmmSparkMonteCarlo.java b/src/test/java/com/nvidia/spark/rapids/jni/RmmSparkMonteCarlo.java index e7d4c2a4da..1d1626935e 100644 --- a/src/test/java/com/nvidia/spark/rapids/jni/RmmSparkMonteCarlo.java +++ b/src/test/java/com/nvidia/spark/rapids/jni/RmmSparkMonteCarlo.java @@ -329,6 +329,7 @@ public void run() { if (runner.debugOoms) { System.err.println("OOM for task: " + t.taskId + " and thread: " + RmmSpark.getCurrentThreadId() + " " + oom); + oom.printStackTrace(System.err); } // ignored } @@ -360,7 +361,7 @@ public void run() { } } } catch (Throwable e) { - System.err.println("ERROR: " + e); + System.err.println("ERROR: TID: " + RmmSpark.getCurrentThreadId() + " " + e); e.printStackTrace(System.err); hadOtherFailures = true; } @@ -373,18 +374,10 @@ public boolean hadOtherFailures() { static class ShuffleThreadFactory implements ThreadFactory { static final AtomicLong idGen = new AtomicLong(0); - long id = idGen.getAndIncrement(); @Override public Thread newThread(Runnable runnable) { - Runnable wrapped = () -> { - RmmSpark.associateCurrentThreadWithShuffle(); - try { - runnable.run(); - } finally { - RmmSpark.removeCurrentThreadAssociation(); - } - }; - Thread t = new Thread(wrapped); + long id = idGen.getAndIncrement(); + Thread t = new Thread(runnable); t.setDaemon(true); t.setName("SHUFFLE-THREAD-" + id); return t; @@ -543,6 +536,37 @@ public synchronized void setSitFailed() { } interface MemoryOp { + default void doIt(DeviceMemoryBuffer[] buffers, long taskId) { + long threadId = RmmSpark.getCurrentThreadId(); + RmmSpark.shuffleThreadWorkingOnTasks(new long[]{taskId}); + RmmSpark.startRetryBlock(threadId); + try { + int tries = 0; + while (tries < 100 && tries >= 0) { + try { 
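+          // After the first failed attempt, wait until this thread is allowed to retry.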
+ if (tries > 0) { + RmmSpark.blockThreadUntilReady(); + } + tries++; + doIt(buffers); + tries = -1; + } catch (GpuRetryOOM oom) { + // Don't need to clear the buffers, because there is only one buffer. + numRetry.incrementAndGet(); + } catch (CpuRetryOOM oom) { + // Don't need to clear the buffers, because there is only one buffer. + numRetry.incrementAndGet(); + } + } + if (tries >= 100) { + throw new OutOfMemoryError("Could not make shuffle work after " + tries + " tries"); + } + } finally { + RmmSpark.endRetryBlock(threadId); + RmmSpark.poolThreadFinishedForTask(taskId); + } + } + void doIt(DeviceMemoryBuffer[] buffers); MemoryOp[] split(); @@ -748,7 +772,7 @@ private void cleanBuffers() { } } - public void run(ExecutorService shuffle) { + public void run(ExecutorService shuffle, long taskId) { buffers = new DeviceMemoryBuffer[numBuffers]; allocatedBeforeError = 0; boolean isForShuffle = shuffle != null; @@ -757,28 +781,31 @@ public void run(ExecutorService shuffle) { try { for (MemoryOp op: operations) { if (isForShuffle) { - // If shuffle is enabled the first allocation will happen on the shuffle thread... - RmmSpark.threadCouldBlockOnShuffle(); try { - Future f = shuffle.submit(() -> op.doIt(buffers)); + RmmSpark.submittingToPool(); + Future f = shuffle.submit(() -> op.doIt(buffers, taskId)); + RmmSpark.doneWaitingOnPool(); + RmmSpark.waitingOnPool(); f.get(1000, TimeUnit.SECONDS); } finally { isForShuffle = false; - RmmSpark.threadDoneWithShuffle(); + RmmSpark.doneWaitingOnPool(); } } else { op.doIt(buffers); } } done = true; - } catch (RetryOOM room) { + } catch (GpuRetryOOM room) { + numRetry.incrementAndGet(); + cleanBuffers(); + RmmSpark.blockThreadUntilReady(); + } catch (CpuRetryOOM room) { numRetry.incrementAndGet(); cleanBuffers(); RmmSpark.blockThreadUntilReady(); } catch (ExecutionException ee) { - // We are not able to do split and retry/etc from a shuffle - // so just bubble the exception on up - OutOfMemoryError oom = new OutOfMemoryError(""); + OutOfMemoryError oom = new OutOfMemoryError("Came From Shuffle"); oom.addSuppressed(ee); throw oom; } catch (InterruptedException | TimeoutException e) { @@ -844,14 +871,19 @@ public long getTimeLost() { public void run(ExecutorService shuffle) { Thread.currentThread().setName("TASK RUNNER FOR " + taskId); - RmmSpark.associateCurrentThreadWithTask(taskId); + RmmSpark.currentThreadIsDedicatedToTask(taskId); try { RmmSpark.currentThreadStartRetryBlock(); while (!toDo.isEmpty()) { TaskOpSet tos = toDo.pollFirst(); try { - tos.run(shuffle); - } catch (SplitAndRetryOOM soom) { + tos.run(shuffle, taskId); + } catch (GpuSplitAndRetryOOM soom) { + TaskOpSet[] split = tos.split(); + toDo.push(split[1]); + toDo.push(split[0]); + numSplitAndRetry.incrementAndGet(); + } catch (CpuSplitAndRetryOOM soom) { TaskOpSet[] split = tos.split(); toDo.push(split[1]); toDo.push(split[0]); diff --git a/src/test/java/com/nvidia/spark/rapids/jni/RmmSparkTest.java b/src/test/java/com/nvidia/spark/rapids/jni/RmmSparkTest.java index cd11da05ae..373deb9ca0 100644 --- a/src/test/java/com/nvidia/spark/rapids/jni/RmmSparkTest.java +++ b/src/test/java/com/nvidia/spark/rapids/jni/RmmSparkTest.java @@ -18,6 +18,8 @@ import ai.rapids.cudf.CudfException; import ai.rapids.cudf.DeviceMemoryBuffer; +import ai.rapids.cudf.HostMemoryBuffer; +import ai.rapids.cudf.MemoryBuffer; import ai.rapids.cudf.Rmm; import ai.rapids.cudf.RmmAllocationMode; import ai.rapids.cudf.RmmCudaMemoryResource; @@ -25,7 +27,6 @@ import ai.rapids.cudf.RmmEventHandler; import 
ai.rapids.cudf.RmmLimitingResourceAdaptor; import ai.rapids.cudf.RmmTrackingResourceAdaptor; -import ai.rapids.cudf.ColumnVector.EventHandler; import org.junit.jupiter.api.AfterEach; import org.junit.jupiter.api.BeforeEach; @@ -62,7 +63,7 @@ public interface TaskThreadOp { public static class TaskThread extends Thread { private final String name; - private final boolean isShuffle; + private final boolean isForPool; private long threadId = -1; private long taskId = 100; @@ -71,10 +72,10 @@ public TaskThread(String name, long taskId) { this.taskId = taskId; } - public TaskThread(String name, boolean isShuffle) { + public TaskThread(String name, boolean isForPool) { super(name); this.name = name; - this.isShuffle = isShuffle; + this.isForPool = isForPool; } public synchronized long getThreadId() { @@ -89,17 +90,15 @@ public void initialize() throws ExecutionException, InterruptedException, Timeou Future waitForStart = doIt(new TaskThreadOp() { @Override public Void doIt() { - if (isShuffle) { - RmmSpark.associateCurrentThreadWithShuffle(); - } else { - RmmSpark.associateCurrentThreadWithTask(taskId); + if (!isForPool) { + RmmSpark.currentThreadIsDedicatedToTask(taskId); } return null; } @Override public String toString() { - return "INIT TASK " + name + " " + (isShuffle ? "SHUFFLE" : ("TASK " + taskId)); + return "INIT TASK " + name + " " + (isForPool ? "POOL" : ("TASK " + taskId)); } }); System.err.println("WAITING FOR STARTUP (" + name + ")"); @@ -278,13 +277,16 @@ public void run() { } System.err.println("INSIDE THREAD RUNNING (" + name + ")"); while (true) { - TaskThreadOp op = queue.poll(1000, TimeUnit.MILLISECONDS); - System.err.println("GOT '" + op + "' ON " + name); - if (op instanceof TaskThreadDoneOp) { - return; - } - // null is returned from the queue on a timeout + // Because of how our deadlock detection code works we don't want to + // block this thread, so we do this in a busy loop. 
It is not ideal, + // but works, and is more accurate to what the Spark is likely to do + TaskThreadOp op = queue.poll(); + // null is returned from the queue if it is empty if (op != null) { + System.err.println("GOT '" + op + "' ON " + name); + if (op instanceof TaskThreadDoneOp) { + return; + } op.doIt(); System.err.println("'" + op + "' FINISHED ON " + name); } @@ -293,7 +295,6 @@ public void run() { System.err.println("THROWABLE CAUGHT IN " + name); t.printStackTrace(System.err); } finally { - RmmSpark.removeCurrentThreadAssociation(); System.err.println("THREAD EXITING " + name); } } @@ -306,22 +307,23 @@ public void testBasicInitAndTeardown() { } @Test - public void testInsertOOMs() { + public void testInsertOOMsGpu() { Rmm.initialize(RmmAllocationMode.CUDA_DEFAULT, null, 512 * 1024 * 1024); RmmSpark.setEventHandler(new BaseRmmEventHandler(), "stderr"); long threadId = RmmSpark.getCurrentThreadId(); long taskid = 0; // This is arbitrary + Thread t = Thread.currentThread(); assertEquals(RmmSparkThreadState.UNKNOWN, RmmSpark.getStateOf(threadId)); assertEquals(0, RmmSpark.getAndResetNumRetryThrow(taskid)); assertEquals(0, RmmSpark.getAndResetNumSplitRetryThrow(taskid)); assertEquals(0, RmmSpark.getAndResetComputeTimeLostToRetryNs(taskid)); - RmmSpark.associateThreadWithTask(threadId, taskid); - assertEquals(RmmSparkThreadState.TASK_RUNNING, RmmSpark.getStateOf(threadId)); + RmmSpark.startDedicatedTaskThread(threadId, taskid, t); + assertEquals(RmmSparkThreadState.THREAD_RUNNING, RmmSpark.getStateOf(threadId)); try { RmmSpark.startRetryBlock(threadId); // Allocate something small and verify that it works... Rmm.alloc(100).close(); - assertEquals(RmmSparkThreadState.TASK_RUNNING, RmmSpark.getStateOf(threadId)); + assertEquals(RmmSparkThreadState.THREAD_RUNNING, RmmSpark.getStateOf(threadId)); try { Thread.sleep(1); // Just in case we run on a really fast system in the future where @@ -332,36 +334,101 @@ public void testInsertOOMs() { // Force an exception RmmSpark.forceRetryOOM(threadId); // No change in the state after a force - assertEquals(RmmSparkThreadState.TASK_RUNNING, RmmSpark.getStateOf(threadId)); - assertThrows(RetryOOM.class, () -> Rmm.alloc(100).close()); + assertEquals(RmmSparkThreadState.THREAD_RUNNING, RmmSpark.getStateOf(threadId)); + assertThrows(GpuRetryOOM.class, () -> Rmm.alloc(100).close()); assert(RmmSpark.getAndResetComputeTimeLostToRetryNs(taskid) > 0); // Verify that injecting OOM does not cause the block to actually happen or // the state to change - assertEquals(RmmSparkThreadState.TASK_RUNNING, RmmSpark.getStateOf(threadId)); + assertEquals(RmmSparkThreadState.THREAD_RUNNING, RmmSpark.getStateOf(threadId)); assertEquals(1, RmmSpark.getAndResetNumRetryThrow(taskid)); assertEquals(0, RmmSpark.getAndResetNumSplitRetryThrow(taskid)); RmmSpark.blockThreadUntilReady(); // Allocate something small and verify that it works... 
Rmm.alloc(100).close(); - assertEquals(RmmSparkThreadState.TASK_RUNNING, RmmSpark.getStateOf(threadId)); + assertEquals(RmmSparkThreadState.THREAD_RUNNING, RmmSpark.getStateOf(threadId)); // Force another exception RmmSpark.forceSplitAndRetryOOM(threadId); // No change in state after force - assertEquals(RmmSparkThreadState.TASK_RUNNING, RmmSpark.getStateOf(threadId)); - assertThrows(SplitAndRetryOOM.class, () -> Rmm.alloc(100).close()); + assertEquals(RmmSparkThreadState.THREAD_RUNNING, RmmSpark.getStateOf(threadId)); + assertThrows(GpuSplitAndRetryOOM.class, () -> Rmm.alloc(100).close()); assertEquals(0, RmmSpark.getAndResetNumRetryThrow(taskid)); assertEquals(1, RmmSpark.getAndResetNumSplitRetryThrow(taskid)); // Verify that injecting OOM does not cause the block to actually happen - assertEquals(RmmSparkThreadState.TASK_RUNNING, RmmSpark.getStateOf(threadId)); + assertEquals(RmmSparkThreadState.THREAD_RUNNING, RmmSpark.getStateOf(threadId)); RmmSpark.blockThreadUntilReady(); // Allocate something small and verify that it works... Rmm.alloc(100).close(); - assertEquals(RmmSparkThreadState.TASK_RUNNING, RmmSpark.getStateOf(threadId)); + assertEquals(RmmSparkThreadState.THREAD_RUNNING, RmmSpark.getStateOf(threadId)); + } finally { + RmmSpark.taskDone(taskid); + } + assertEquals(RmmSparkThreadState.UNKNOWN, RmmSpark.getStateOf(threadId)); + } + + @Test + public void testInsertOOMsCpu() { + Rmm.initialize(RmmAllocationMode.CUDA_DEFAULT, null, 512 * 1024 * 1024); + RmmSpark.setEventHandler(new BaseRmmEventHandler(), "stderr"); + LimitingOffHeapAllocForTests.setLimit(512 * 1024 * 1024); + long threadId = RmmSpark.getCurrentThreadId(); + long taskid = 0; // This is arbitrary + Thread t = Thread.currentThread(); + assertEquals(RmmSparkThreadState.UNKNOWN, RmmSpark.getStateOf(threadId)); + assertEquals(0, RmmSpark.getAndResetNumRetryThrow(taskid)); + assertEquals(0, RmmSpark.getAndResetNumSplitRetryThrow(taskid)); + assertEquals(0, RmmSpark.getAndResetComputeTimeLostToRetryNs(taskid)); + RmmSpark.startDedicatedTaskThread(threadId, taskid, t); + assertEquals(RmmSparkThreadState.THREAD_RUNNING, RmmSpark.getStateOf(threadId)); + try { + RmmSpark.startRetryBlock(threadId); + // Allocate something small and verify that it works... + LimitingOffHeapAllocForTests.alloc(100).close(); + assertEquals(RmmSparkThreadState.THREAD_RUNNING, RmmSpark.getStateOf(threadId)); + + try { + Thread.sleep(1); // Just in case we run on a really fast system in the future where + // all of this is sub-nanosecond... + } catch (InterruptedException e) { + // Ignored + } + // Force an exception + RmmSpark.forceRetryOOM(threadId); + // No change in the state after a force + assertEquals(RmmSparkThreadState.THREAD_RUNNING, RmmSpark.getStateOf(threadId)); + assertThrows(CpuRetryOOM.class, () -> LimitingOffHeapAllocForTests.alloc(100).close()); + assert(RmmSpark.getAndResetComputeTimeLostToRetryNs(taskid) > 0); + + // Verify that injecting OOM does not cause the block to actually happen or + // the state to change + assertEquals(RmmSparkThreadState.THREAD_RUNNING, RmmSpark.getStateOf(threadId)); + assertEquals(1, RmmSpark.getAndResetNumRetryThrow(taskid)); + assertEquals(0, RmmSpark.getAndResetNumSplitRetryThrow(taskid)); + RmmSpark.blockThreadUntilReady(); + + // Allocate something small and verify that it works... 
+ LimitingOffHeapAllocForTests.alloc(100).close(); + assertEquals(RmmSparkThreadState.THREAD_RUNNING, RmmSpark.getStateOf(threadId)); + + // Force another exception + RmmSpark.forceSplitAndRetryOOM(threadId); + // No change in state after force + assertEquals(RmmSparkThreadState.THREAD_RUNNING, RmmSpark.getStateOf(threadId)); + assertThrows(CpuSplitAndRetryOOM.class, () -> LimitingOffHeapAllocForTests.alloc(100).close()); + assertEquals(0, RmmSpark.getAndResetNumRetryThrow(taskid)); + assertEquals(1, RmmSpark.getAndResetNumSplitRetryThrow(taskid)); + + // Verify that injecting OOM does not cause the block to actually happen + assertEquals(RmmSparkThreadState.THREAD_RUNNING, RmmSpark.getStateOf(threadId)); + RmmSpark.blockThreadUntilReady(); + + // Allocate something small and verify that it works... + LimitingOffHeapAllocForTests.alloc(100).close(); + assertEquals(RmmSparkThreadState.THREAD_RUNNING, RmmSpark.getStateOf(threadId)); } finally { RmmSpark.taskDone(taskid); } @@ -374,16 +441,18 @@ public void testReentrantAssociateThread() { RmmSpark.setEventHandler(new BaseRmmEventHandler(), "stderr"); long threadId = 100; long taskId = 1; + long[] taskIds = new long[] {taskId}; + Thread t = Thread.currentThread(); try { - RmmSpark.associateThreadWithTask(threadId, taskId); - RmmSpark.associateThreadWithTask(threadId, taskId); - RmmSpark.removeThreadAssociation(threadId); + RmmSpark.startDedicatedTaskThread(threadId, taskId, t); + RmmSpark.startDedicatedTaskThread(threadId, taskId, t); + RmmSpark.removeDedicatedThreadAssociation(threadId, taskId); // Not removing twice because we don't have to match up the counts so it fits with how // the GPU semaphore is used. - RmmSpark.associateThreadWithShuffle(threadId); - RmmSpark.associateThreadWithShuffle(threadId); - RmmSpark.removeThreadAssociation(threadId); - RmmSpark.removeThreadAssociation(threadId); + RmmSpark.shuffleThreadWorkingTasks(threadId, t, taskIds); + RmmSpark.shuffleThreadWorkingTasks(threadId, t, taskIds); + RmmSpark.removeDedicatedThreadAssociation(threadId, taskId); + RmmSpark.removeDedicatedThreadAssociation(threadId, taskId); } finally { RmmSpark.taskDone(taskId); } @@ -397,17 +466,21 @@ public void testAssociateThread() { long threadIdTwo = 300; long taskId = 2; long otherTaskId = 3; + long[] taskIds = new long[] {taskId, otherTaskId}; + Thread t = Thread.currentThread(); try { - RmmSpark.associateThreadWithTask(threadIdOne, taskId); - assertThrows(CudfException.class, () -> RmmSpark.associateThreadWithShuffle(threadIdOne)); - assertThrows(CudfException.class, () -> RmmSpark.associateThreadWithTask(threadIdOne, otherTaskId)); + RmmSpark.startDedicatedTaskThread(threadIdOne, taskId, t); + assertThrows(CudfException.class, () -> RmmSpark.shuffleThreadWorkingTasks(threadIdOne, t, taskIds)); + // There can be races when a thread goes from one task to another, so we just make it safe to do. 
+ RmmSpark.startDedicatedTaskThread(threadIdOne, otherTaskId, t); - RmmSpark.associateThreadWithShuffle(threadIdTwo); - assertThrows(CudfException.class, () -> RmmSpark.associateThreadWithTask(threadIdTwo, otherTaskId)); + RmmSpark.shuffleThreadWorkingTasks(threadIdTwo, t, taskIds); + assertThrows(CudfException.class, () -> RmmSpark.startDedicatedTaskThread(threadIdTwo, otherTaskId, t)); // Remove the association - RmmSpark.removeThreadAssociation(threadIdTwo); + RmmSpark.removeDedicatedThreadAssociation(threadIdTwo, taskId); + RmmSpark.removeDedicatedThreadAssociation(threadIdTwo, otherTaskId); // Add in a new association - RmmSpark.associateThreadWithTask(threadIdTwo, taskId); + RmmSpark.startDedicatedTaskThread(threadIdTwo, taskId, t); } finally { RmmSpark.taskDone(taskId); RmmSpark.taskDone(otherTaskId); @@ -415,19 +488,40 @@ public void testAssociateThread() { } - static class AllocOnAnotherThread implements AutoCloseable { + static abstract class AllocOnAnotherThread implements AutoCloseable { final TaskThread thread; final long size; - DeviceMemoryBuffer b = null; + final long taskId; + MemoryBuffer b = null; Future fb; Future fc = null; public AllocOnAnotherThread(TaskThread thread, long size) { this.thread = thread; this.size = size; + this.taskId = -1; + fb = thread.doIt(new TaskThreadOp() { + @Override + public Void doIt() { + doAlloc(); + return null; + } + + @Override + public String toString() { + return "ALLOC(" + size + ")"; + } + }); + } + + public AllocOnAnotherThread(TaskThread thread, long size, long taskId) { + this.thread = thread; + this.size = size; + this.taskId = taskId; fb = thread.doIt(new TaskThreadOp() { @Override public Void doIt() { + RmmSpark.shuffleThreadWorkingOnTasks(new long[]{taskId}); doAlloc(); return null; } @@ -443,7 +537,7 @@ public void waitForAlloc() throws ExecutionException, InterruptedException, Time fb.get(1000, TimeUnit.MILLISECONDS); } - public void freeOnThread() throws ExecutionException, InterruptedException, TimeoutException { + public void freeOnThread() { if (fc != null) { throw new IllegalStateException("free called multiple times"); } @@ -473,20 +567,60 @@ public void freeAndWait() throws ExecutionException, InterruptedException, Timeo waitForFree(); } - private Void doAlloc() { + abstract protected Void doAlloc(); + + @Override + public synchronized void close() { + if (b != null) { + try { + b.close(); + b = null; + } finally { + if (this.taskId > 0) { + RmmSpark.poolThreadFinishedForTasks(thread.threadId, new long[]{taskId}); + } + } + } + } + } + + public static class GpuAllocOnAnotherThread extends AllocOnAnotherThread { + + public GpuAllocOnAnotherThread(TaskThread thread, long size) { + super(thread, size); + } + + public GpuAllocOnAnotherThread(TaskThread thread, long size, long taskId) { + super(thread, size, taskId); + } + + @Override + protected Void doAlloc() { DeviceMemoryBuffer tmp = Rmm.alloc(size); synchronized (this) { b = tmp; } return null; } + } + + public static class CpuAllocOnAnotherThread extends AllocOnAnotherThread { + + public CpuAllocOnAnotherThread(TaskThread thread, long size) { + super(thread, size); + } + + public CpuAllocOnAnotherThread(TaskThread thread, long size, long taskId) { + super(thread, size, taskId); + } @Override - public synchronized void close() { - if (b != null) { - b.close(); - b = null; + protected Void doAlloc() { + HostMemoryBuffer tmp = LimitingOffHeapAllocForTests.alloc(size); + synchronized (this) { + b = tmp; } + return null; } } @@ -513,6 +647,51 @@ void 
setupRmmForTestingWithLimits(long maxAllocSize, RmmEventHandler eventHandle RmmSpark.setEventHandler(eventHandler, "stderr"); } + @Test + public void testNonBlockingCpuAlloc() { + // We are not going to use the GPU here, but this sets it all up for us. + setupRmmForTestingWithLimits(10 * 1024 * 1024); + // We are just going to pretend that we are doing an allocations + long taskId = 0; + long threadId = RmmSpark.getCurrentThreadId(); + RmmSpark.currentThreadIsDedicatedToTask(taskId); + assertEquals(RmmSparkThreadState.THREAD_RUNNING, RmmSpark.getStateOf(threadId)); + try { + boolean wasRecursive = RmmSpark.preCpuAlloc(100, false); + assertEquals(RmmSparkThreadState.THREAD_ALLOC, RmmSpark.getStateOf(threadId)); + long address; + try (HostMemoryBuffer buffer = HostMemoryBuffer.allocate(100)) { + address = buffer.getAddress(); + RmmSpark.postCpuAllocSuccess(address, 100, false, wasRecursive); + assertEquals(RmmSparkThreadState.THREAD_RUNNING, RmmSpark.getStateOf(threadId)); + } + RmmSpark.cpuDeallocate(address, 100); + assertEquals(RmmSparkThreadState.THREAD_RUNNING, RmmSpark.getStateOf(threadId)); + } finally { + RmmSpark.removeDedicatedThreadAssociation(threadId, taskId); + } + } + + @Test + public void testNonBlockingCpuAllocFailedOOM() { + // We are not going to use the GPU here, but this sets it all up for us. + setupRmmForTestingWithLimits(10 * 1024 * 1024); + // We are just going to pretend that we are doing an allocations + long taskId = 0; + long threadId = RmmSpark.getCurrentThreadId(); + RmmSpark.currentThreadIsDedicatedToTask(taskId); + assertEquals(RmmSparkThreadState.THREAD_RUNNING, RmmSpark.getStateOf(threadId)); + try { + boolean wasRecursive = RmmSpark.preCpuAlloc(100, false); + assertEquals(RmmSparkThreadState.THREAD_ALLOC, RmmSpark.getStateOf(threadId)); + // TODO put this on a background thread so we can time out if it blocks. + RmmSpark.postCpuAllocFailed(true, false, wasRecursive); + assertEquals(RmmSparkThreadState.THREAD_RUNNING, RmmSpark.getStateOf(threadId)); + } finally { + RmmSpark.removeDedicatedThreadAssociation(threadId, taskId); + } + } + @Test public void testBasicBlocking() throws ExecutionException, InterruptedException, TimeoutException { // 10 MiB @@ -523,16 +702,49 @@ public void testBasicBlocking() throws ExecutionException, InterruptedException, taskTwo.initialize(); try { long tOneId = taskOne.getThreadId(); - assertEquals(RmmSparkThreadState.TASK_RUNNING, RmmSpark.getStateOf(tOneId)); + assertEquals(RmmSparkThreadState.THREAD_RUNNING, RmmSpark.getStateOf(tOneId)); + + long tTwoId = taskTwo.getThreadId(); + assertEquals(RmmSparkThreadState.THREAD_RUNNING, RmmSpark.getStateOf(tTwoId)); + + try (AllocOnAnotherThread firstOne = new GpuAllocOnAnotherThread(taskOne, 5 * 1024 * 1024)) { + firstOne.waitForAlloc(); + // This one should block + try (AllocOnAnotherThread secondOne = new GpuAllocOnAnotherThread(taskTwo, 6 * 1024 * 1024)) { + taskTwo.pollForState(RmmSparkThreadState.THREAD_BLOCKED, 1000, TimeUnit.MILLISECONDS); + // Free the first allocation to wake up the second task... 
+ firstOne.freeAndWait(); + secondOne.waitForAlloc(); + secondOne.freeAndWait(); + } + } + } finally { + taskOne.done(); + taskTwo.done(); + } + } + + @Test + public void testBasicCpuBlocking() throws ExecutionException, InterruptedException, TimeoutException { + // 10 MiB + setupRmmForTestingWithLimits(10 * 1024 * 1024); + LimitingOffHeapAllocForTests.setLimit(10 * 1024 * 1024); + TaskThread taskOne = new TaskThread("TEST THREAD ONE", 1); + TaskThread taskTwo = new TaskThread("TEST THREAD TWO", 2); + taskOne.initialize(); + taskTwo.initialize(); + try { + long tOneId = taskOne.getThreadId(); + assertEquals(RmmSparkThreadState.THREAD_RUNNING, RmmSpark.getStateOf(tOneId)); long tTwoId = taskTwo.getThreadId(); - assertEquals(RmmSparkThreadState.TASK_RUNNING, RmmSpark.getStateOf(tTwoId)); + assertEquals(RmmSparkThreadState.THREAD_RUNNING, RmmSpark.getStateOf(tTwoId)); - try (AllocOnAnotherThread firstOne = new AllocOnAnotherThread(taskOne, 5 * 1024 * 1024)) { + try (AllocOnAnotherThread firstOne = new CpuAllocOnAnotherThread(taskOne, 5 * 1024 * 1024)) { firstOne.waitForAlloc(); // This one should block - try (AllocOnAnotherThread secondOne = new AllocOnAnotherThread(taskTwo, 6 * 1024 * 1024)) { - taskTwo.pollForState(RmmSparkThreadState.TASK_BLOCKED, 1000, TimeUnit.MILLISECONDS); + try (AllocOnAnotherThread secondOne = new CpuAllocOnAnotherThread(taskTwo, 6 * 1024 * 1024)) { + taskTwo.pollForState(RmmSparkThreadState.THREAD_BLOCKED, 1000, TimeUnit.MILLISECONDS); // Free the first allocation to wake up the second task... firstOne.freeAndWait(); secondOne.waitForAlloc(); @@ -546,6 +758,68 @@ public void testBasicBlocking() throws ExecutionException, InterruptedException, } } + @Test + public void testBasicMixedBlocking() throws ExecutionException, InterruptedException, TimeoutException { + // 10 MiB + setupRmmForTestingWithLimits(10 * 1024 * 1024); + LimitingOffHeapAllocForTests.setLimit(10 * 1024 * 1024); + TaskThread taskOne = new TaskThread("TEST THREAD ONE", 1); + TaskThread taskTwo = new TaskThread("TEST THREAD TWO", 2); + TaskThread taskThree = new TaskThread("TEST THREAD THREE", 3); + TaskThread taskFour = new TaskThread("TEST THREAD FOUR", 4); + taskOne.initialize(); + taskTwo.initialize(); + taskThree.initialize(); + taskFour.initialize(); + try { + long tOneId = taskOne.getThreadId(); + assertEquals(RmmSparkThreadState.THREAD_RUNNING, RmmSpark.getStateOf(tOneId)); + + long tTwoId = taskTwo.getThreadId(); + assertEquals(RmmSparkThreadState.THREAD_RUNNING, RmmSpark.getStateOf(tTwoId)); + + long tThreeId = taskThree.getThreadId(); + assertEquals(RmmSparkThreadState.THREAD_RUNNING, RmmSpark.getStateOf(tThreeId)); + + long tFourId = taskFour.getThreadId(); + assertEquals(RmmSparkThreadState.THREAD_RUNNING, RmmSpark.getStateOf(tFourId)); + + try (AllocOnAnotherThread firstGpuAlloc = new GpuAllocOnAnotherThread(taskOne, 5 * 1024 * 1024)) { + firstGpuAlloc.waitForAlloc(); + + try (AllocOnAnotherThread firstCpuAlloc = new CpuAllocOnAnotherThread(taskTwo, 5 * 1024 * 1024)) { + firstCpuAlloc.waitForAlloc(); + + // Blocking GPU Alloc + try (AllocOnAnotherThread secondGpuAlloc = new GpuAllocOnAnotherThread(taskThree, 6 * 1024 * 1024)) { + taskThree.pollForState(RmmSparkThreadState.THREAD_BLOCKED, 1000, TimeUnit.MILLISECONDS); + + // Blocking CPU Alloc + try (AllocOnAnotherThread secondCpuAlloc = new CpuAllocOnAnotherThread(taskFour, 6 * 1024 * 1024)) { + taskFour.pollForState(RmmSparkThreadState.THREAD_BLOCKED, 1000, TimeUnit.MILLISECONDS); + + // We want to make sure that the order of wakeup 
corresponds to the location of the data that was released + // Not necessarily the priority of the task/thread. + firstCpuAlloc.freeAndWait(); + secondCpuAlloc.waitForAlloc(); + secondCpuAlloc.freeAndWait(); + } + + // Now do the GPU frees + firstGpuAlloc.freeAndWait(); + secondGpuAlloc.waitForAlloc(); + secondGpuAlloc.freeAndWait(); + } + } + } + } finally { + taskOne.done(); + taskTwo.done(); + taskThree.done(); + taskFour.done(); + } + } + @Test public void testShuffleBlocking() throws ExecutionException, InterruptedException, TimeoutException { // 10 MiB @@ -559,30 +833,112 @@ public void testShuffleBlocking() throws ExecutionException, InterruptedExceptio taskTwo.initialize(); try { long sOneId = shuffleOne.getThreadId(); - assertEquals(RmmSparkThreadState.SHUFFLE_RUNNING, RmmSpark.getStateOf(sOneId)); + // It is not in a running state until it has something to do. long tOneId = taskOne.getThreadId(); - assertEquals(RmmSparkThreadState.TASK_RUNNING, RmmSpark.getStateOf(tOneId)); + assertEquals(RmmSparkThreadState.THREAD_RUNNING, RmmSpark.getStateOf(tOneId)); long tTwoId = taskTwo.getThreadId(); - assertEquals(RmmSparkThreadState.TASK_RUNNING, RmmSpark.getStateOf(tTwoId)); + assertEquals(RmmSparkThreadState.THREAD_RUNNING, RmmSpark.getStateOf(tTwoId)); - try (AllocOnAnotherThread firstOne = new AllocOnAnotherThread(taskOne, 5 * 1024 * 1024)) { + try (AllocOnAnotherThread firstOne = new GpuAllocOnAnotherThread(taskOne, 5 * 1024 * 1024)) { firstOne.waitForAlloc(); // This one should block - try (AllocOnAnotherThread secondOne = new AllocOnAnotherThread(taskTwo, 6 * 1024 * 1024)) { - taskTwo.pollForState(RmmSparkThreadState.TASK_BLOCKED, 1000, TimeUnit.MILLISECONDS); + try (AllocOnAnotherThread secondOne = new GpuAllocOnAnotherThread(taskTwo, 6 * 1024 * 1024)) { + taskTwo.pollForState(RmmSparkThreadState.THREAD_BLOCKED, 1000, TimeUnit.MILLISECONDS); + // Make sure that shuffle has higher priority than tasks... + try (AllocOnAnotherThread thirdOne = new GpuAllocOnAnotherThread(shuffleOne, 6 * 1024 * 1024, 2)) { + shuffleOne.pollForState(RmmSparkThreadState.THREAD_BLOCKED, 1000, TimeUnit.MILLISECONDS); + // But taskOne is not blocked, so there will be no retry until it is blocked, or else + // it is making progress + taskOne.doIt((TaskThreadOp) () -> { + try { + Thread.sleep(200); + } catch (InterruptedException e) { + throw new RuntimeException(e); + } + return null; + }); + + try { + secondOne.waitForAlloc(); + fail("SHOULD HAVE THROWN..."); + } catch (ExecutionException ee) { + assert (ee.getCause() instanceof GpuRetryOOM); + } + secondOne.freeAndWait(); - // Make sure that shuffle has higher priority than do tasks... - try (AllocOnAnotherThread thirdOne = new AllocOnAnotherThread(shuffleOne, 6 * 1024 * 1024)) { - shuffleOne.pollForState(RmmSparkThreadState.SHUFFLE_BLOCKED, 1000, TimeUnit.MILLISECONDS); // Free the first allocation to wake up the shuffle thread, but not the second task yet... 
firstOne.freeAndWait(); + + thirdOne.waitForAlloc(); + thirdOne.freeAndWait(); + } + } + } + } finally { + shuffleOne.done(); + taskOne.done(); + taskTwo.done(); + } + } + + + @Test + public void testShuffleBlockingCpu() throws ExecutionException, InterruptedException, TimeoutException { + // 10 MiB + setupRmmForTestingWithLimits(10 * 1024 * 1024); + LimitingOffHeapAllocForTests.setLimit(10 * 1024 * 1024); + TaskThread shuffleOne = new TaskThread("TEST THREAD SHUFFLE", true); + TaskThread taskOne = new TaskThread("TEST THREAD ONE", 1); + TaskThread taskTwo = new TaskThread("TEST THREAD TWO", 2); + + shuffleOne.initialize(); + taskOne.initialize(); + taskTwo.initialize(); + try { + long sOneId = shuffleOne.getThreadId(); + // It is not in a running state until it has something to do. + + long tOneId = taskOne.getThreadId(); + assertEquals(RmmSparkThreadState.THREAD_RUNNING, RmmSpark.getStateOf(tOneId)); + + long tTwoId = taskTwo.getThreadId(); + assertEquals(RmmSparkThreadState.THREAD_RUNNING, RmmSpark.getStateOf(tTwoId)); + + try (AllocOnAnotherThread firstOne = new CpuAllocOnAnotherThread(taskOne, 5 * 1024 * 1024)) { + firstOne.waitForAlloc(); + // This one should block + try (AllocOnAnotherThread secondOne = new CpuAllocOnAnotherThread(taskTwo, 6 * 1024 * 1024)) { + taskTwo.pollForState(RmmSparkThreadState.THREAD_BLOCKED, 1000, TimeUnit.MILLISECONDS); + // Make sure that shuffle has higher priority than tasks... + try (AllocOnAnotherThread thirdOne = new CpuAllocOnAnotherThread(shuffleOne, 6 * 1024 * 1024, 2)) { + shuffleOne.pollForState(RmmSparkThreadState.THREAD_BLOCKED, 1000, TimeUnit.MILLISECONDS); + // But taskOne is not blocked, so there will be no retry until it is blocked, or else + // it is making progress + taskOne.doIt((TaskThreadOp) () -> { + try { + Thread.sleep(200); + } catch (InterruptedException e) { + throw new RuntimeException(e); + } + return null; + }); + + try { + secondOne.waitForAlloc(); + fail("SHOULD HAVE THROWN..."); + } catch (ExecutionException ee) { + assert (ee.getCause() instanceof CpuRetryOOM); + } + secondOne.freeAndWait(); + + // Free the first allocation to wake up the shuffle thread, but not the second task yet... 
+ firstOne.freeAndWait(); + thirdOne.waitForAlloc(); thirdOne.freeAndWait(); } - secondOne.waitForAlloc(); - secondOne.freeAndWait(); } } } finally { @@ -604,47 +960,110 @@ public void testBasicBUFN() throws ExecutionException, InterruptedException, Tim taskTwo.initialize(); try { long tThreeId = taskThree.getThreadId(); - assertEquals(RmmSparkThreadState.TASK_RUNNING, RmmSpark.getStateOf(tThreeId)); + assertEquals(RmmSparkThreadState.THREAD_RUNNING, RmmSpark.getStateOf(tThreeId)); long tTwoId = taskTwo.getThreadId(); - assertEquals(RmmSparkThreadState.TASK_RUNNING, RmmSpark.getStateOf(tTwoId)); + assertEquals(RmmSparkThreadState.THREAD_RUNNING, RmmSpark.getStateOf(tTwoId)); - try (AllocOnAnotherThread allocThreeOne = new AllocOnAnotherThread(taskThree, 5 * 1024 * 1024)) { + try (AllocOnAnotherThread allocThreeOne = new GpuAllocOnAnotherThread(taskThree, 5 * 1024 * 1024)) { allocThreeOne.waitForAlloc(); - try (AllocOnAnotherThread allocTwoOne = new AllocOnAnotherThread(taskTwo, 3 * 1024 * 1024)) { + try (AllocOnAnotherThread allocTwoOne = new GpuAllocOnAnotherThread(taskTwo, 3 * 1024 * 1024)) { allocTwoOne.waitForAlloc(); - // This one should block - try (AllocOnAnotherThread allocTwoTwo = new AllocOnAnotherThread(taskTwo, 3 * 1024 * 1024)) { - taskTwo.pollForState(RmmSparkThreadState.TASK_BLOCKED, 1000, TimeUnit.MILLISECONDS); + try (AllocOnAnotherThread allocTwoTwo = new GpuAllocOnAnotherThread(taskTwo, 3 * 1024 * 1024)) { + taskTwo.pollForState(RmmSparkThreadState.THREAD_BLOCKED, 1000, TimeUnit.MILLISECONDS); - try (AllocOnAnotherThread allocThreeTwo = new AllocOnAnotherThread(taskThree, 4 * 1024 * 1024)) { + try (AllocOnAnotherThread allocThreeTwo = new GpuAllocOnAnotherThread(taskThree, 4 * 1024 * 1024)) { // This one should be able to allocate because there is not enough memory, but // now all the threads would be blocked, so the lowest priority thread is going to // become BUFN - taskThree.pollForState(RmmSparkThreadState.TASK_BUFN_WAIT, 1000, TimeUnit.MILLISECONDS); + taskThree.pollForState(RmmSparkThreadState.THREAD_BUFN_WAIT, 1000, TimeUnit.MILLISECONDS); try { allocThreeTwo.waitForAlloc(); fail("ALLOC AFTER BUFN SHOULD HAVE THROWN..."); } catch (ExecutionException ee) { - assert(ee.getCause() instanceof RetryOOM); + assert(ee.getCause() instanceof GpuRetryOOM); } // allocOneTwo cannot be freed, nothing was allocated because it threw an exception. allocThreeOne.freeAndWait(); Future f = taskThree.blockUntilReady(); - taskThree.pollForState(RmmSparkThreadState.TASK_BUFN, 1000, TimeUnit.MILLISECONDS); + taskThree.pollForState(RmmSparkThreadState.THREAD_BUFN, 1000, TimeUnit.MILLISECONDS); // taskOne should only wake up after we finish task 2 // Task two is now able to alloc allocTwoTwo.freeAndWait(); allocTwoOne.freeAndWait(); // Task two has freed things, but is still not done, so task one will stay blocked... - taskTwo.pollForState(RmmSparkThreadState.TASK_RUNNING, 1000, TimeUnit.MILLISECONDS); - taskThree.pollForState(RmmSparkThreadState.TASK_BUFN, 1000, TimeUnit.MILLISECONDS); + taskTwo.pollForState(RmmSparkThreadState.THREAD_RUNNING, 1000, TimeUnit.MILLISECONDS); + taskThree.pollForState(RmmSparkThreadState.THREAD_BUFN, 1000, TimeUnit.MILLISECONDS); taskTwo.done().get(1000, TimeUnit.MILLISECONDS); // Now that task two is done see if task one is running again... 
-          taskThree.pollForState(RmmSparkThreadState.TASK_RUNNING, 1000, TimeUnit.MILLISECONDS);
+          taskThree.pollForState(RmmSparkThreadState.THREAD_RUNNING, 1000, TimeUnit.MILLISECONDS);
+          // Now we could finish trying our allocations, but this is good enough...
+            }
+          }
+        }
+      }
+    } finally {
+      taskThree.done();
+      taskTwo.done();
+    }
+  }
+
+  @Test
+  public void testBasicBUFNCpu() throws ExecutionException, InterruptedException, TimeoutException {
+    // 10 MiB
+    setupRmmForTestingWithLimits(10 * 1024 * 1024);
+    LimitingOffHeapAllocForTests.setLimit(10 * 1024 * 1024);
+    // A task id of 3 is higher than a task id of 2, so it should be a lower
+    // priority and become BUFN ahead of taskTwo.
+    TaskThread taskThree = new TaskThread("TEST THREAD THREE", 3);
+    TaskThread taskTwo = new TaskThread("TEST THREAD TWO", 2);
+    taskThree.initialize();
+    taskTwo.initialize();
+    try {
+      long tThreeId = taskThree.getThreadId();
+      assertEquals(RmmSparkThreadState.THREAD_RUNNING, RmmSpark.getStateOf(tThreeId));
+
+      long tTwoId = taskTwo.getThreadId();
+      assertEquals(RmmSparkThreadState.THREAD_RUNNING, RmmSpark.getStateOf(tTwoId));
+
+      try (AllocOnAnotherThread allocThreeOne = new CpuAllocOnAnotherThread(taskThree, 5 * 1024 * 1024)) {
+        allocThreeOne.waitForAlloc();
+        try (AllocOnAnotherThread allocTwoOne = new CpuAllocOnAnotherThread(taskTwo, 3 * 1024 * 1024)) {
+          allocTwoOne.waitForAlloc();
+
+          try (AllocOnAnotherThread allocTwoTwo = new CpuAllocOnAnotherThread(taskTwo, 3 * 1024 * 1024)) {
+            taskTwo.pollForState(RmmSparkThreadState.THREAD_BLOCKED, 1000, TimeUnit.MILLISECONDS);
+
+            try (AllocOnAnotherThread allocThreeTwo = new CpuAllocOnAnotherThread(taskThree, 4 * 1024 * 1024)) {
+              // This one should not be able to allocate because there is not enough memory, but
+              // now all the threads would be blocked, so the lowest priority thread is going to
+              // become BUFN
+              taskThree.pollForState(RmmSparkThreadState.THREAD_BUFN_WAIT, 1000, TimeUnit.MILLISECONDS);
+              try {
+                allocThreeTwo.waitForAlloc();
+                fail("ALLOC AFTER BUFN SHOULD HAVE THROWN...");
+              } catch (ExecutionException ee) {
+                assert(ee.getCause() instanceof CpuRetryOOM);
+              }
+              // allocThreeTwo cannot be freed, nothing was allocated because it threw an exception.
+              allocThreeOne.freeAndWait();
+              Future f = taskThree.blockUntilReady();
+              taskThree.pollForState(RmmSparkThreadState.THREAD_BUFN, 1000, TimeUnit.MILLISECONDS);
+
+              // taskThree should only wake up after we finish taskTwo
+              // Task two is now able to alloc
+              allocTwoTwo.freeAndWait();
+              allocTwoOne.freeAndWait();
+              // Task two has freed things, but is still not done, so task three will stay blocked...
+              taskTwo.pollForState(RmmSparkThreadState.THREAD_RUNNING, 1000, TimeUnit.MILLISECONDS);
+              taskThree.pollForState(RmmSparkThreadState.THREAD_BUFN, 1000, TimeUnit.MILLISECONDS);
+
+              taskTwo.done().get(1000, TimeUnit.MILLISECONDS);
+              // Now that task two is done see if task three is running again...
+              taskThree.pollForState(RmmSparkThreadState.THREAD_RUNNING, 1000, TimeUnit.MILLISECONDS);
+              // Now we could finish trying our allocations, but this is good enough...
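+              // The sequence of states exercised by these BUFN tests is:
+              //   THREAD_RUNNING -> THREAD_BLOCKED -> THREAD_BUFN_WAIT -> THREAD_BUFN -> THREAD_RUNNING
+              // where the lowest priority thread (the one with the highest task id,
+              // taskThree here) is the one that is rolled back to BUFN.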
} } @@ -666,24 +1085,24 @@ public void testBUFNSplitAndRetrySingleThread() throws ExecutionException, Inter taskOne.initialize(); try { long threadId = taskOne.getThreadId(); - assertEquals(RmmSparkThreadState.TASK_RUNNING, RmmSpark.getStateOf(threadId)); - try (AllocOnAnotherThread one = new AllocOnAnotherThread(taskOne, 5 * 1024 * 1024)) { + assertEquals(RmmSparkThreadState.THREAD_RUNNING, RmmSpark.getStateOf(threadId)); + try (AllocOnAnotherThread one = new GpuAllocOnAnotherThread(taskOne, 5 * 1024 * 1024)) { one.waitForAlloc(); - try (AllocOnAnotherThread two = new AllocOnAnotherThread(taskOne, 6 * 1024 * 1024)) { + try (AllocOnAnotherThread two = new GpuAllocOnAnotherThread(taskOne, 6 * 1024 * 1024)) { two.waitForAlloc(); fail("Expect that allocating more memory than is allowed would fail"); } catch (ExecutionException oom) { - assert oom.getCause() instanceof RetryOOM : oom.toString(); + assert oom.getCause() instanceof GpuRetryOOM : oom.toString(); } try { taskOne.blockUntilReady().get(1000, TimeUnit.MILLISECONDS); fail("Expect split and retry after all tasks blocked."); } catch (ExecutionException oom) { - assert oom.getCause() instanceof SplitAndRetryOOM : oom.toString(); + assert oom.getCause() instanceof GpuSplitAndRetryOOM : oom.toString(); } - assertEquals(RmmSparkThreadState.TASK_RUNNING, RmmSpark.getStateOf(threadId)); + assertEquals(RmmSparkThreadState.THREAD_RUNNING, RmmSpark.getStateOf(threadId)); // Now we try to allocate with half the data. - try (AllocOnAnotherThread secondTry = new AllocOnAnotherThread(taskOne, 3 * 1024 * 1024)) { + try (AllocOnAnotherThread secondTry = new GpuAllocOnAnotherThread(taskOne, 3 * 1024 * 1024)) { secondTry.waitForAlloc(); } } @@ -697,8 +1116,9 @@ public void testInsertMultipleOOMs() { Rmm.initialize(RmmAllocationMode.CUDA_DEFAULT, null, 10 * 1024 * 1024); RmmSpark.setEventHandler(new BaseRmmEventHandler(), "stderr"); long threadId = RmmSpark.getCurrentThreadId(); - long taskid = 0; // This is arbitrary - RmmSpark.associateThreadWithTask(threadId, taskid); + long taskId = 0; // This is arbitrary + Thread t = Thread.currentThread(); + RmmSpark.startDedicatedTaskThread(threadId, taskId, t); try { // Allocate something small and verify that it works... Rmm.alloc(100).close(); @@ -707,7 +1127,7 @@ public void testInsertMultipleOOMs() { int numRetryOOMs = 3; RmmSpark.forceRetryOOM(threadId, numRetryOOMs); for (int i = 0; i < numRetryOOMs; i++) { - assertThrows(RetryOOM.class, () -> Rmm.alloc(100).close()); + assertThrows(GpuRetryOOM.class, () -> Rmm.alloc(100).close()); // Verify that injecting OOM does not cause the block to actually happen RmmSpark.blockThreadUntilReady(); } @@ -719,7 +1139,7 @@ public void testInsertMultipleOOMs() { int numSplitAndRetryOOMs = 5; RmmSpark.forceSplitAndRetryOOM(threadId, numSplitAndRetryOOMs); for (int i = 0; i < numSplitAndRetryOOMs; i++) { - assertThrows(SplitAndRetryOOM.class, () -> Rmm.alloc(100).close()); + assertThrows(GpuSplitAndRetryOOM.class, () -> Rmm.alloc(100).close()); // Verify that injecting OOM does not cause the block to actually happen RmmSpark.blockThreadUntilReady(); } @@ -727,7 +1147,7 @@ public void testInsertMultipleOOMs() { // Allocate something small and verify that it works... 
Rmm.alloc(100).close(); } finally { - RmmSpark.removeThreadAssociation(threadId); + RmmSpark.removeDedicatedThreadAssociation(threadId, taskId); } } @@ -736,8 +1156,9 @@ public void testCudfException() { Rmm.initialize(RmmAllocationMode.CUDA_DEFAULT, null, 10 * 1024 * 1024); RmmSpark.setEventHandler(new BaseRmmEventHandler(), "stderr"); long threadId = RmmSpark.getCurrentThreadId(); - long taskid = 0; // This is arbitrary - RmmSpark.associateThreadWithTask(threadId, taskid); + long taskId = 0; // This is arbitrary + Thread t = Thread.currentThread(); + RmmSpark.startDedicatedTaskThread(threadId, taskId, t); try { // Allocate something small and verify that it works... Rmm.alloc(100).close(); @@ -754,7 +1175,7 @@ public void testCudfException() { // Allocate something small and verify that it works... Rmm.alloc(100).close(); } finally { - RmmSpark.removeThreadAssociation(threadId); + RmmSpark.removeDedicatedThreadAssociation(threadId, taskId); } } @@ -763,23 +1184,24 @@ public void retryWatchdog() { // 10 MiB setupRmmForTestingWithLimits(10 * 1024 * 1024); long threadId = RmmSpark.getCurrentThreadId(); - long taskid = 0; // This is arbitrary + long taskId = 0; // This is arbitrary long numRetries = 0; - RmmSpark.associateThreadWithTask(threadId, taskid); + Thread t = Thread.currentThread(); + RmmSpark.startDedicatedTaskThread(threadId, taskId, t); long startTime = System.nanoTime(); try (DeviceMemoryBuffer filler = Rmm.alloc(9 * 1024 * 1024)) { while (numRetries < 10000) { try { Rmm.alloc(2 * 1024 * 1024).close(); fail("overallocation should have failed"); - } catch (RetryOOM room) { + } catch (GpuRetryOOM room) { numRetries++; try { RmmSpark.blockThreadUntilReady(); - } catch (SplitAndRetryOOM sroom) { + } catch (GpuSplitAndRetryOOM sroom) { numRetries++; } - } catch (SplitAndRetryOOM sroom) { + } catch (GpuSplitAndRetryOOM sroom) { fail("retry should be thrown before split and retry..."); } } @@ -788,7 +1210,7 @@ public void retryWatchdog() { // The 500 is hard coded in the code below assertEquals(500, numRetries); } finally { - RmmSpark.removeThreadAssociation(threadId); + RmmSpark.removeDedicatedThreadAssociation(threadId, taskId); } long endTime = System.nanoTime(); System.err.println("Took " + (endTime - startTime) + "ns to retry 500 times..."); @@ -812,14 +1234,15 @@ public void testAllocationDuringSpill() { // 10 MiB setupRmmForTestingWithLimits(10 * 1024 * 1024, rmmEventHandler); long threadId = RmmSpark.getCurrentThreadId(); - long taskid = 0; // This is arbitrary - RmmSpark.associateThreadWithTask(threadId, taskid); + long taskId = 0; // This is arbitrary + Thread t = Thread.currentThread(); + RmmSpark.startDedicatedTaskThread(threadId, taskId, t); assertThrows(GpuOOM.class, () -> { try (DeviceMemoryBuffer filler = Rmm.alloc(9 * 1024 * 1024)) { try (DeviceMemoryBuffer shouldFail = Rmm.alloc(2 * 1024 * 1024)) {} fail("overallocation should have failed"); } finally { - RmmSpark.removeThreadAssociation(threadId); + RmmSpark.removeDedicatedThreadAssociation(threadId, taskId); } }); assertEquals(11, rmmEventHandler.getAllocationCount()); @@ -832,14 +1255,15 @@ public void testAllocationFailedDuringSpill() { // 10 MiB setupRmmForTestingWithLimits(10 * 1024 * 1024, rmmEventHandler); long threadId = RmmSpark.getCurrentThreadId(); - long taskid = 0; // This is arbitrary - RmmSpark.associateThreadWithTask(threadId, taskid); + long taskId = 0; // This is arbitrary + Thread t = Thread.currentThread(); + RmmSpark.startDedicatedTaskThread(threadId, taskId, t); assertThrows(GpuOOM.class, () -> 
{ try (DeviceMemoryBuffer filler = Rmm.alloc(9 * 1024 * 1024)) { try (DeviceMemoryBuffer shouldFail = Rmm.alloc(2 * 1024 * 1024)) {} fail("overallocation should have failed"); } finally { - RmmSpark.removeThreadAssociation(threadId); + RmmSpark.removeDedicatedThreadAssociation(threadId, taskId); } }); assertEquals(0, rmmEventHandler.getAllocationCount()); From ab9ed762ace5d9a8aceb89871f7fdc3d86345a99 Mon Sep 17 00:00:00 2001 From: Jenkins Automation <70000568+nvauto@users.noreply.github.com> Date: Thu, 30 Nov 2023 06:04:07 +0800 Subject: [PATCH 025/127] Update submodule cudf to e15290a373ff0c84c85c2c0e940e69377a66cf96 (#1605) Signed-off-by: spark-rapids automation <70000568+nvauto@users.noreply.github.com> --- thirdparty/cudf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/thirdparty/cudf b/thirdparty/cudf index 8da62049ae..e15290a373 160000 --- a/thirdparty/cudf +++ b/thirdparty/cudf @@ -1 +1 @@ -Subproject commit 8da62049aee750b391ff6d8ca4937428f94fd10c +Subproject commit e15290a373ff0c84c85c2c0e940e69377a66cf96 From ddc2410324dccf9e50f5af0476748c5363014608 Mon Sep 17 00:00:00 2001 From: Jenkins Automation <70000568+nvauto@users.noreply.github.com> Date: Thu, 30 Nov 2023 22:02:01 +0800 Subject: [PATCH 026/127] Update submodule cudf to d528c95beb471d5e95a9b24b9d54351496fef11a (#1606) Signed-off-by: spark-rapids automation <70000568+nvauto@users.noreply.github.com> --- thirdparty/cudf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/thirdparty/cudf b/thirdparty/cudf index e15290a373..d528c95beb 160000 --- a/thirdparty/cudf +++ b/thirdparty/cudf @@ -1 +1 @@ -Subproject commit e15290a373ff0c84c85c2c0e940e69377a66cf96 +Subproject commit d528c95beb471d5e95a9b24b9d54351496fef11a From 9c3c7a6d05bcaeee6e502203a13b167ec1748869 Mon Sep 17 00:00:00 2001 From: Jason Lowe Date: Fri, 1 Dec 2023 13:29:30 -0600 Subject: [PATCH 027/127] Fix faultinj build error after spdlog/fmt upgrade (#1608) Signed-off-by: Jason Lowe --- src/main/cpp/faultinj/faultinj.cu | 7 +++++-- thirdparty/cudf | 2 +- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/src/main/cpp/faultinj/faultinj.cu b/src/main/cpp/faultinj/faultinj.cu index 19902783b8..6903ebf446 100644 --- a/src/main/cpp/faultinj/faultinj.cu +++ b/src/main/cpp/faultinj/faultinj.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022, NVIDIA CORPORATION. + * Copyright (c) 2022-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -31,6 +31,9 @@ #include #include +// Format enums for logging +auto format_as(CUpti_CallbackDomain domain) { return fmt::underlying(domain); } + namespace { #define CUPTI_CALL(call) \ @@ -392,7 +395,7 @@ void readFaultInjectorConfig(void) std::srand(seed); const spdlog::level::level_enum logLevelEnum = static_cast(logLevel); - spdlog::info("changed log level to {}", logLevelEnum); + spdlog::info("changed log level to {}", logLevel); spdlog::set_level(logLevelEnum); traceConfig(globalControl.configRoot); diff --git a/thirdparty/cudf b/thirdparty/cudf index d528c95beb..c8074b5176 160000 --- a/thirdparty/cudf +++ b/thirdparty/cudf @@ -1 +1 @@ -Subproject commit d528c95beb471d5e95a9b24b9d54351496fef11a +Subproject commit c8074b5176a74630101c78c43c24b66141352b24 From df72c289fedd0497ccbf0ea0e7a5b986fb94239e Mon Sep 17 00:00:00 2001 From: Mike Wilson Date: Mon, 4 Dec 2023 15:29:47 -0500 Subject: [PATCH 028/127] Updating `parse_uri` to properly validate the URI before returning results (#1554) * Radical rewrite to properly validate URI * Updated to add support for utf8 and escaped hex * Fixing IPv6 issue and code cleanup * Fixing incorrect port string generation. Thanks Haoyang! Signed-off-by: Mike Wilson --- src/main/cpp/src/parse_uri.cu | 920 +++++++++++++----- src/main/cpp/tests/parse_uri.cpp | 113 ++- .../nvidia/spark/rapids/jni/ParseURITest.java | 112 ++- 3 files changed, 902 insertions(+), 243 deletions(-) diff --git a/src/main/cpp/src/parse_uri.cu b/src/main/cpp/src/parse_uri.cu index 54e79ab022..d0629cb71f 100644 --- a/src/main/cpp/src/parse_uri.cu +++ b/src/main/cpp/src/parse_uri.cu @@ -18,11 +18,13 @@ #include #include +#include #include #include #include #include #include +#include #include #include @@ -34,251 +36,728 @@ namespace spark_rapids_jni { using namespace cudf; namespace detail { + +struct uri_parts { + string_view scheme; + string_view host; + string_view authority; + string_view path; + string_view fragment; + string_view query; + string_view userinfo; + string_view port; + string_view opaque; + bool valid{false}; +}; + +enum class URI_chunks : int8_t { PROTOCOL, HOST, AUTHORITY, PATH, QUERY, USERINFO }; + +enum class chunk_validity : int8_t { VALID, INVALID, FATAL }; + namespace { -// utility to validate a character is valid in a URI -constexpr bool is_valid_character(char ch, bool alphanum_only) +// Some parsing errors are fatal and some parsing errors simply mean this +// thing doesn't exist or is invalid. For example, just because 280.0.1.16 is +// not a valid IPv4 address simply means if asking for the host the host is null +// but the authority is still 280.0.1.16 and the uri is not considered invalid. +// By contrast, the URI https://[15:6:g:invalid] will not return https for the +// scheme and is considered completely invalid. + +constexpr bool is_alpha(char c) { return (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z'); } + +constexpr bool is_numeric(char c) { return c >= '0' && c <= '9'; } + +constexpr bool is_alphanum(char c) { return is_alpha(c) || is_numeric(c); } + +constexpr bool is_hex(char c) { - if (alphanum_only) { - if (ch >= '-' && ch <= '9' && ch != '/') return true; // 0-9 and .- - if (ch >= 'A' && ch <= 'Z') return true; // A-Z - if (ch >= 'a' && ch <= 'z') return true; // a-z - } else { - if (ch >= '!' 
&& ch <= ';' && ch != '"') return true; // 0-9 and !#%&'()*+,-./ - if (ch >= '=' && ch <= 'Z' && ch != '>') return true; // A-Z and =?@ - if (ch >= '_' && ch <= 'z' && ch != '`') return true; // a-z and _ + return is_numeric(c) || (c >= 'a' && c <= 'f') || (c >= 'A' && c <= 'F'); +} + +__device__ thrust::pair skip_and_validate_special( + string_view::const_iterator iter, + string_view::const_iterator end, + bool allow_invalid_escapes = false) +{ + while (iter != end) { + auto const c = *iter; + auto const num_bytes = cudf::strings::detail::bytes_in_char_utf8(*iter); + if (*iter == '%' && !allow_invalid_escapes) { + // verify following two characters are hexadecimal + for (int i = 0; i < 2; ++i) { + ++iter; + if (iter == end) { return {false, iter}; } + + if (!is_hex(*iter)) { return {false, iter}; } + } + } else if (num_bytes > 1) { + // UTF8 validation means it isn't whitespace and not a control character + // the normal validation will handle anything single byte, this checks for multiple byte + // whitespace + auto const c = *iter; + // There are multi-byte looking things like extended ASCII characters that are not valid UTF8. + // Check that here. + if ((c & 0xC0) != 0x80) { return {false, iter}; } + if (num_bytes > 2 && ((c & 0xC000) != 0x8000)) { return {false, iter}; } + if (num_bytes > 3 && ((c & 0xC00000) != 0x800000)) { return {false, iter}; } + + // Validate it isn't a whitespace or control unicode character. + if ((c >= 0xc280 && c <= 0xc2a0) || c == 0xe19a80 || (c >= 0xe28080 && c <= 0xe2808a) || + c == 0xe280af || c == 0xe280a8 || c == 0xe2819f || c == 0xe38080) { + return {false, iter}; + } + } else { + break; + } + ++iter; } - return false; + + return {true, iter}; } -/** - * @brief Count the number of characters of each string after parsing the protocol. - * - * @tparam num_warps_per_threadblock Number of warps in a threadblock. This template argument must - * match the launch configuration, i.e. the kernel must be launched with - * `num_warps_per_threadblock * cudf::detail::warp_size` threads per threadblock. - * @tparam char_block_size Number of characters which will be loaded into the shared memory at a - * time. - * - * @param in_strings Input string column - * @param out_counts Number of characters in each decode URL - * @param out_validity Bitmask of validity data, updated in funcion - */ -template -__global__ void parse_uri_protocol_char_counter(column_device_view const in_strings, - size_type* const out_counts, - bitmask_type* out_validity) +template +__device__ bool validate_chunk(string_view s, Predicate fn, bool allow_invalid_escapes = false) { - __shared__ char temporary_buffer[num_warps_per_threadblock][char_block_size]; - __shared__ typename cub::WarpScan::TempStorage cub_storage[num_warps_per_threadblock]; - __shared__ bool found_token[num_warps_per_threadblock]; - - auto const global_thread_id = cudf::detail::grid_1d::global_thread_id(); - auto const global_warp_id = static_cast(global_thread_id / cudf::detail::warp_size); - auto const local_warp_id = static_cast(threadIdx.x / cudf::detail::warp_size); - auto const warp_lane = static_cast(threadIdx.x % cudf::detail::warp_size); - auto const nwarps = static_cast(gridDim.x * blockDim.x / cudf::detail::warp_size); - char* in_chars_shared = temporary_buffer[local_warp_id]; - - // Loop through strings, and assign each string to a warp. 
- for (thread_index_type tidx = global_warp_id; tidx < in_strings.size(); tidx += nwarps) { - auto const row_idx = static_cast(tidx); - if (in_strings.is_null(row_idx)) { - if (warp_lane == 0) out_counts[row_idx] = 0; + auto iter = s.begin(); + { + auto [valid, iter_] = skip_and_validate_special(iter, s.end(), allow_invalid_escapes); + iter = std::move(iter_); + if (!valid) { return false; } + } + while (iter != s.end()) { + if (!fn(iter)) { return false; } + + iter++; + auto [valid, iter_] = skip_and_validate_special(iter, s.end(), allow_invalid_escapes); + iter = std::move(iter_); + if (!valid) { return false; } + } + return true; +} + +bool __device__ validate_scheme(string_view scheme) +{ + // A scheme simply needs to be an alpha character followed by alphanumeric + auto iter = scheme.begin(); + if (!is_alpha(*iter)) { return false; } + while (++iter != scheme.end()) { + auto const c = *iter; + if (!is_alphanum(c) && c != '+' && c != '-' && c != '.') { return false; } + } + return true; +} + +bool __device__ validate_ipv6(string_view s) +{ + constexpr auto max_colons{8}; + + if (s.size_bytes() < 2) { return false; } + + bool found_double_colon{false}; + int open_bracket_count{0}; + int close_bracket_count{0}; + int period_count{0}; + int colon_count{0}; + int percent_count{0}; + char previous_char{0}; + int address{0}; + int address_char_count{0}; + bool address_has_hex{false}; + + auto const leading_double_colon = [&]() { + auto iter = s.begin(); + if (*iter == '[') iter++; + return *iter++ == ':' && *iter == ':'; + }(); + + for (auto iter = s.begin(); iter < s.end(); ++iter) { + auto const c = *iter; + + switch (c) { + case '[': + open_bracket_count++; + if (open_bracket_count > 1) { return false; } + break; + case ']': + close_bracket_count++; + if (close_bracket_count > 1) { return false; } + if ((period_count > 0) && (address_has_hex || address > 255)) { return false; } + break; + case ':': + colon_count++; + if (previous_char == ':') { + if (found_double_colon) { return false; } + found_double_colon = true; + } + address = 0; + address_has_hex = false; + address_char_count = 0; + if (colon_count > max_colons || (colon_count == max_colons && !found_double_colon)) { + return false; + } + // Periods before a colon don't work, periods can be an IPv4 address after this IPv6 address + // like [1:2:3:4:5:6:d.d.d.d] + if (period_count > 0 || percent_count > 0) { return false; } + break; + case '.': + period_count++; + if (percent_count > 0) { return false; } + if (period_count > 3) { return false; } + if (address_has_hex) { return false; } + if (address > 255) { return false; } + if (colon_count != 6 && !found_double_colon) { return false; } + // Special case of ::1:2:3:4:5:d.d.d.d has 7 colons - but spark says this is invalid + // if (colon_count == max_colons && !leading_double_colon) { return false; } + if (colon_count >= max_colons) { return false; } + address = 0; + address_has_hex = false; + address_char_count = 0; + break; + case '%': + // IPv6 can define a device to use for the routing. This is expressed as '%eth0' at the end + // of the address. 
+        percent_count++;
+        if (percent_count > 1) { return false; }
+        if ((period_count > 0) && (address_has_hex || address > 255)) { return false; }
+        address = 0;
+        address_has_hex = false;
+        address_char_count = 0;
+        break;
+      default:
+        // after % all bets are off as the device name can be nearly anything
+        if (percent_count == 0) {
+          if (address_char_count > 3) { return false; }
+          address_char_count++;
+          address *= 10;
+          if (c >= 'a' && c <= 'f') {
+            address += 10;
+            address += c - 'a';
+            address_has_hex = true;
+          } else if (c >= 'A' && c <= 'Z') {
+            address += 10;
+            address += c - 'A';
+            address_has_hex = true;
+          } else if (c >= '0' && c <= '9') {
+            address += c - '0';
+          } else {
+            return false;
+          }
+        }
+        break;
+    }
+    previous_char = c;
+  }
+
+  return true;
+}
+
+bool __device__ validate_ipv4(string_view s)
+{
+  // dotted quad (0-255).(0-255).(0-255).(0-255)
+  int address = 0;
+  int address_char_count = 0;
+  int dot_count = 0;
+  for (auto iter = s.begin(); iter < s.end(); ++iter) {
+    auto const c = *iter;
+
+    // can't lead with a .
+    if ((c < '0' || c > '9') && (iter == s.begin() || c != '.')) { return false; }
+
+    if (c == '.') {
+      // verify we saw at least one character and reset values
+      if (address_char_count == 0) { return false; }
+      address = 0;
+      address_char_count = 0;
+      dot_count++;
+      continue;
+    }
+
+    address_char_count++;
+    address *= 10;
+    address += c - '0';
+
+    if (address > 255) { return false; }
+  }
+
+  // can't end with a .
+  if (address_char_count == 0) { return false; }
+
+  // must be 4 portions separated by 3 dots.
+  if (dot_count != 3) { return false; }
+
+  return true;
+}
+
+bool __device__ validate_domain_name(string_view name)
+{
+  // domain name can be alphanum or -.
+  // a '-' can not be the first or last character of the domain name or around a .
+  bool last_was_slash = false;
+  bool last_was_period = false;
+  bool numeric_start = false;
+  for (auto iter = name.begin(); iter < name.end(); ++iter) {
+    auto const c = *iter;
+    if (!is_alphanum(c) && c != '-' && c != '.') { return false; }
+
+    // the final section can't start with a digit
+    if (last_was_period && c >= '0' && c <= '9') {
+      numeric_start = true;
+    } else {
+      numeric_start = false;
+    }
+
+    if (c == '-') {
+      if (last_was_period || iter == name.begin() || iter == --name.end()) { return false; }
+      last_was_slash = true;
+      last_was_period = false;
+    } else if (c == '.') {
+      if (last_was_slash) { return false; }
+      last_was_period = true;
+      last_was_slash = false;
+    } else {
+      last_was_period = false;
+      last_was_slash = false;
+    }
+  }
+
+  // numeric start to last part of domain isn't allowed.
+  if (numeric_start) { return false; }
+
+  return true;
+}
+
+chunk_validity __device__ validate_host(string_view host)
+{
+  // This can be IPv4, IPv6, or a domain name.
+  if (*host.begin() == '[') {
+    // If last character is a ], this is IPv6 or invalid.
+    if (*(host.end() - 1) != ']') {
+      // invalid
+      return chunk_validity::FATAL;
+    }
+    if (!validate_ipv6(host)) { return chunk_validity::FATAL; }
+
+    return chunk_validity::VALID;
+  }
-
-    // valid until proven otherwise
-    bool valid{true};
+  // If there are more [ or ] characters this is invalid.
+  // Also need to find the last .
+ int last_open_bracket = -1; + int last_close_bracket = -1; + int last_period = -1; + + // The original plan on this loop was to get fancy and use a reverse iterator and exit when + // everything was found, but the expectation is there are no brackets in this string, so we have + // to traverse the entire thing anyway to verify that. The math is easier with a forward iterator, + // so we're back here. + for (auto iter = host.begin(); iter < host.end(); ++iter) { + auto const c = *iter; + if (c == '[') { + last_open_bracket = iter.position(); + } else if (c == ']') { + last_close_bracket = iter.position(); + } else if (c == '.') { + last_period = iter.position(); + } + } - // Use the last thread of the warp to initialize `found_token` to false. - if (warp_lane == cudf::detail::warp_size - 1) { found_token[local_warp_id] = false; } + if (last_open_bracket >= 0 || last_close_bracket >= 0) { return chunk_validity::FATAL; } - for (size_type block_idx = 0; block_idx < nblocks && valid; block_idx++) { - auto const string_length_block = - std::min(char_block_size, string_length - char_block_size * block_idx); + // If we didn't find a period or if the last character is a period or the character after the last + // period is non numeric + if (last_period < 0 || last_period == host.length() - 1 || host[last_period + 1] < '0' || + host[last_period + 1] > '9') { + // must be domain name or it is invalid + if (validate_domain_name(host)) { return chunk_validity::VALID; } - // Each warp collectively loads input characters of the current block to the shared memory. - for (auto char_idx = warp_lane; char_idx < string_length_block; - char_idx += cudf::detail::warp_size) { - auto const in_idx = block_idx * char_block_size + char_idx; - in_chars_shared[char_idx] = in_idx < string_length ? in_chars[in_idx] : 0; + // the only other option is that this is a IPv4 address + } else if (validate_ipv4(host)) { + return chunk_validity::VALID; + } + + return chunk_validity::INVALID; +} + +bool __device__ validate_query(string_view query) +{ + // query can be alphanum and _-!.~'()*,;:$&+=?/[]@" + return validate_chunk(query, [] __device__(string_view::const_iterator iter) { + auto const c = *iter; + if (c != '!' && c != '"' && c != '$' && !(c >= '&' && c <= ';') && c != '=' && + !(c >= '?' && c <= ']' && c != '\\') && !(c >= 'a' && c <= 'z') && c != '_' && c != '~') { + return false; + } + return true; + }); +} + +bool __device__ validate_authority(string_view authority, bool allow_invalid_escapes) +{ + // authority needs to be alphanum and @[]_-!.'()*,;:$&+= + return validate_chunk( + authority, + [allow_invalid_escapes] __device__(string_view::const_iterator iter) { + auto const c = *iter; + if (c != '!' && c != '$' && !(c >= '&' && c <= ';' && c != '/') && c != '=' && + !(c >= '@' && c <= '_' && c != '^' && c != '\\') && !(c >= 'a' && c <= 'z') && c != '~' && + (!allow_invalid_escapes || c != '%')) { + return false; } + return true; + }, + allow_invalid_escapes); +} - __syncwarp(); - - // `char_idx_start` represents the start character index of the current warp. - for (size_type char_idx_start = 0; char_idx_start < string_length_block; - char_idx_start += cudf::detail::warp_size) { - auto const char_idx = char_idx_start + warp_lane; - char const* const ch_ptr = in_chars_shared + char_idx; - - // need to know if the character we are validating is before or after the token - // as valid characters changes. Default to 1 to handle the case where we have - // alreayd found the token and do not search for it again. 
- int8_t out_tokens{1}; - if (!found_token[local_warp_id]) { - // Warp-wise prefix sum to establish tokens of string. - // All threads in the warp participate in the prefix sum, even if `char_idx` is beyond - // `string_length_block`. - int8_t const is_token = (char_idx < string_length_block && *ch_ptr == ':') ? 1 : 0; - cub::WarpScan(cub_storage[local_warp_id]).InclusiveSum(is_token, out_tokens); - } +bool __device__ validate_userinfo(string_view userinfo) +{ + // can't be ] or [ in here + return validate_chunk(userinfo, [] __device__(string_view::const_iterator iter) { + auto const c = *iter; + if (c == '[' || c == ']') { return false; } + return true; + }); +} - auto const before_token = out_tokens == 0; - valid = valid && __ballot_sync(0xffffffff, - (char_idx >= string_length_block || - is_valid_character(*ch_ptr, before_token)) - ? 0 - : 1) == 0; - if (!valid) { - // last thread in warp sets validity - if (warp_lane == cudf::detail::warp_size - 1) { - clear_bit(out_validity, row_idx); - out_counts[row_idx] = 0; - } +bool __device__ validate_port(string_view port) +{ + // port is positive numeric >=0 according to spark...shrug + return validate_chunk(port, [] __device__(string_view::const_iterator iter) { + auto const c = *iter; + if (c < '0' && c > '9') { return false; } + return true; + }); +} + +bool __device__ validate_path(string_view path) +{ + // path can be alphanum and @[]_-!.~'()*?/&,;:$+= + return validate_chunk(path, [] __device__(string_view::const_iterator iter) { + auto const c = *iter; + if (c != '!' && c != '$' && !(c >= '&' && c <= ';') && c != '=' && !(c >= '@' && c <= 'Z') && + c != '_' && !(c >= 'a' && c <= 'z') && c != '~') { + return false; + } + return true; + }); +} + +bool __device__ validate_opaque(string_view opaque) +{ + // opaque can be alphanum and @[]_-!.~'()*?/,;:$@+= + return validate_chunk(opaque, [] __device__(string_view::const_iterator iter) { + auto const c = *iter; + if (c != '!' && c != '$' && !(c >= '&' && c <= ';') && c != '=' && + !(c >= '?' && c <= ']' && c != '\\') && c != '_' && c != '~' && !(c >= 'a' && c <= 'z')) { + return false; + } + return true; + }); +} + +bool __device__ validate_fragment(string_view fragment) +{ + // fragment can be alphanum and @[]_-!.~'()*?/,;:$&+= + return validate_chunk(fragment, [] __device__(string_view::const_iterator iter) { + auto const c = *iter; + if (c != '!' && c != '$' && !(c >= '&' && c <= ';') && c != '=' && + !(c >= '?' && c <= ']' && c != '\\') && c != '_' && c != '~' && !(c >= 'a' && c <= 'z')) { + return false; + } + return true; + }); +} + +uri_parts __device__ validate_uri(const char* str, int len) +{ + uri_parts ret; + + // look for :/# characters. 
+  int col = -1;
+  int slash = -1;
+  int hash = -1;
+  int question = -1;
+  for (const char* c = str;
+       c - str < len && (col == -1 || slash == -1 || hash == -1 || question == -1);
+       ++c) {
+    switch (*c) {
+      case ':':
+        if (col == -1) col = c - str;
+        break;
+      case '/':
+        if (slash == -1) slash = c - str;
+        break;
+      case '#':
+        if (hash == -1) hash = c - str;
+        break;
+      case '?':
+        if (question == -1) question = c - str;
+        break;
+      default: break;
+    }
+  }
+
+  // anything after the hash is part of the fragment and ignored for this part
+  if (hash >= 0) {
+    ret.fragment = {str + hash + 1, len - hash - 1};
+    if (!validate_fragment(ret.fragment)) {
+      ret.valid = false;
+      return ret;
+    }
+
+    len = hash;
+
+    if (col > hash) col = -1;
+    if (slash > hash) slash = -1;
+    if (question > hash) question = -1;
+  }
+
+  // if the first ':' is after the other tokens, this doesn't have a scheme or it is invalid
+  if (col != -1 && (slash == -1 || col < slash) && (hash == -1 || col < hash)) {
+    // we have a scheme up to the :
+    ret.scheme = {str, col};
+    if (!validate_scheme(ret.scheme)) {
+      ret.valid = false;
+      return ret;
+    }
+
+    // skip over scheme
+    auto const skip = col + 1;
+    str += skip;
+    len -= skip;
+    question -= skip;
+    hash -= skip;
+    slash -= skip;
+  }
+
+  // no more string to parse is an error
+  if (len <= 0) {
+    ret.valid = false;
+    return ret;
+  }
+
+  // If we have a '/' as the next character, we have a hierarchical uri. If not it is opaque.
+  bool const hierarchical = str[0] == '/';
+  if (hierarchical) {
+    // a '?' will break this into query and path/authority
+    if (question >= 0) {
+      ret.query = {str + question + 1, len - question - 1};
+      if (!validate_query(ret.query)) {
+        ret.valid = false;
+        return ret;
+      }
+    }
+    auto const path_len = question >= 0 ? question : len;
+
+    if (str[0] == '/' && str[1] == '/') {
+      // If we find another '/', we have //authority/path; otherwise we have //authority with no path.
+      int next_slash = -1;
+      for (int i = 2; i < path_len; ++i) {
+        if (str[i] == '/') {
+          next_slash = i;
          break;
        }
      }
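+      // Everything between the leading "//" and the next '/' (or the end of the
+      // path portion when no '/' follows) is treated as the authority below.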
- int8_t out_offset; - cub::WarpScan(cub_storage[local_warp_id]).InclusiveSum(out_size, out_offset); - - // last thread of the warp updates offsets and token since it has the last offset and - // token value - if (warp_lane == cudf::detail::warp_size - 1) { - output_string_size += out_offset; - found_token[local_warp_id] = out_tokens > 0; + // Inspect the authority for userinfo, host, and port + const char* auth = ret.authority.data(); + auto auth_size = ret.authority.size_bytes(); + int amp = -1; + int closingbracket = -1; + int last_colon = -1; + for (int i = 0; i < auth_size; ++i) { + switch (auth[i]) { + case '@': + if (amp == -1) { + amp = i; + if (last_colon > 0) { last_colon = -1; } + if (closingbracket > 0) { closingbracket = -1; } + } + break; + case ':': last_colon = amp > 0 ? i - amp - 1 : i; break; + case ']': + if (closingbracket == -1) closingbracket = amp > 0 ? i - amp : i; + break; } } - __syncwarp(); - } - } + if (amp > 0) { + ret.userinfo = {auth, amp}; + if (!validate_userinfo(ret.userinfo)) { + ret.valid = false; + return ret; + } + // skip over the @ + amp++; - // last thread of the warp sets output size - if (warp_lane == cudf::detail::warp_size - 1) { - if (!found_token[local_warp_id]) { - clear_bit(out_validity, row_idx); - out_counts[row_idx] = 0; - } else if (valid) { - out_counts[row_idx] = output_string_size; + auth += amp; + auth_size -= amp; + } + if (last_colon > 0 && last_colon > closingbracket) { + // Found a port, attempt to parse it + ret.port = {auth + last_colon + 1, auth_size - last_colon - 1}; + if (!validate_port(ret.port)) { + ret.valid = false; + return ret; + } + ret.host = {auth, last_colon}; + } else { + ret.host = {auth, auth_size}; + } + auto host_ret = validate_host(ret.host); + switch (host_ret) { + case chunk_validity::FATAL: ret.valid = false; return ret; + case chunk_validity::INVALID: ret.host = {}; break; + } } + } else { + // path with no authority + ret.path = {str, len}; + } + if (!validate_path(ret.path)) { + ret.valid = false; + return ret; + } + } else { + ret.opaque = {str, len}; + if (!validate_opaque(ret.opaque)) { + ret.valid = false; + return ret; } } + + ret.valid = true; + return ret; } +// A URI is broken into parts or chunks. There are optional chunks and required chunks. A simple URI +// such as `https://www.nvidia.com` is easy to reason about, but it could also be written as +// `www.nvidia.com`, which is still valid. On top of that, there are characters which are allowed in +// certain chunks that are not allowed in others. There have been a multitude of methods attempted +// to get this correct, but at the end of the day, we have to validate the URI completely. This +// means even the simplest task of pulling off every character before the : still requires +// understanding how to validate an ipv6 address. This kernel was originally conceived as a two-pass +// kernel that ran the same code and either filled in offsets or filled in actual data. The problem +// is that to know what characters you need to copy, you need to have parsed the entire string as a +// 2 meg string could have `:/a` at the very end and everything up to that point is protocol or it +// could end in `.com` and now it is a hostname. To prevent the code from parsing it completely for +// length and then parsing it completely to copy the data, we will store off the offset of the +// string of question. 
The length is already stored in the offset column, so we then have a pointer +// and a number of bytes to copy and the second pass boils down to a series of memcpy calls. + /** - * @brief Parse protocol and copy from the input string column to the output char buffer. - * - * @tparam num_warps_per_threadblock Number of warps in a threadblock. This template argument must - * match the launch configuration, i.e. the kernel must be launched with - * `num_warps_per_threadblock * cudf::detail::warp_size` threads per threadblock. - * @tparam char_block_size Number of characters which will be loaded into the shared memory at a - * time. + * @brief Count the number of characters of each string after parsing the protocol. * * @param in_strings Input string column - * @param in_validity Validity vector of output column - * @param out_chars Character buffer for the output string column - * @param out_offsets Offset value of each string associated with `out_chars` + * @param chunk Chunk of URI to return + * @param out_lengths Number of characters in each decode URL + * @param out_offsets Offsets to the start of the chunks + * @param out_validity Bitmask of validity data, updated in function */ -template -__global__ void parse_uri_to_protocol(column_device_view const in_strings, - bitmask_type* in_validity, - char* const out_chars, - size_type const* const out_offsets) +__global__ void parse_uri_char_counter(column_device_view const in_strings, + URI_chunks chunk, + size_type* const out_lengths, + size_type* const out_offsets, + bitmask_type* out_validity) { - __shared__ char temporary_buffer[num_warps_per_threadblock][char_block_size]; - __shared__ typename cub::WarpScan::TempStorage cub_storage[num_warps_per_threadblock]; - __shared__ size_type out_idx[num_warps_per_threadblock]; - __shared__ bool found_token[num_warps_per_threadblock]; - - auto const global_thread_id = cudf::detail::grid_1d::global_thread_id(); - auto const global_warp_id = static_cast(global_thread_id / cudf::detail::warp_size); - auto const local_warp_id = static_cast(threadIdx.x / cudf::detail::warp_size); - auto const warp_lane = static_cast(threadIdx.x % cudf::detail::warp_size); - auto const nwarps = static_cast(gridDim.x * blockDim.x / cudf::detail::warp_size); - char* in_chars_shared = temporary_buffer[local_warp_id]; - - // Loop through strings, and assign each string to a warp - for (thread_index_type tidx = global_warp_id; tidx < in_strings.size(); tidx += nwarps) { + // thread per row + auto const tid = cudf::detail::grid_1d::global_thread_id(); + auto const base_ptr = in_strings.child(strings_column_view::chars_column_index).data(); + + for (thread_index_type tidx = tid; tidx < in_strings.size(); + tidx += cudf::detail::grid_1d::grid_stride()) { auto const row_idx = static_cast(tidx); - if (!bit_is_set(in_validity, row_idx)) { continue; } + if (in_strings.is_null(row_idx)) { + out_lengths[row_idx] = 0; + continue; + } auto const in_string = in_strings.element(row_idx); auto const in_chars = in_string.data(); auto const string_length = in_string.size_bytes(); - auto out_chars_string = out_chars + out_offsets[row_idx]; - auto const nblocks = cudf::util::div_rounding_up_unsafe(string_length, char_block_size); - - // Use the last thread of the warp to initialize `out_idx` to 0 and `found_token` to false. 
- if (warp_lane == cudf::detail::warp_size - 1) { - out_idx[local_warp_id] = 0; - found_token[local_warp_id] = false; - } - __syncwarp(); - - for (size_type block_idx = 0; block_idx < nblocks && !found_token[local_warp_id]; block_idx++) { - auto const string_length_block = - std::min(char_block_size, string_length - char_block_size * block_idx); + auto const uri = validate_uri(in_chars, string_length); + if (!uri.valid) { + out_lengths[row_idx] = 0; + clear_bit(out_validity, row_idx); + } else { + // stash output offsets and lengths for next kernel to do the copy + switch (chunk) { + case URI_chunks::PROTOCOL: + out_lengths[row_idx] = uri.scheme.size_bytes(); + out_offsets[row_idx] = uri.scheme.data() - base_ptr; + break; + case URI_chunks::HOST: + out_lengths[row_idx] = uri.host.size_bytes(); + out_offsets[row_idx] = uri.host.data() - base_ptr; + break; + case URI_chunks::AUTHORITY: + out_lengths[row_idx] = uri.authority.size_bytes(); + out_offsets[row_idx] = uri.authority.data() - base_ptr; + break; + case URI_chunks::PATH: + out_lengths[row_idx] = uri.path.size_bytes(); + out_offsets[row_idx] = uri.path.data() - base_ptr; + break; + case URI_chunks::QUERY: + out_lengths[row_idx] = uri.query.size_bytes(); + out_offsets[row_idx] = uri.query.data() - base_ptr; + break; + case URI_chunks::USERINFO: + out_lengths[row_idx] = uri.userinfo.size_bytes(); + out_offsets[row_idx] = uri.userinfo.data() - base_ptr; + break; + } - // Each warp collectively loads input characters of the current block to shared memory. - for (auto char_idx = warp_lane; char_idx < string_length_block; - char_idx += cudf::detail::warp_size) { - auto const in_idx = block_idx * char_block_size + char_idx; - in_chars_shared[char_idx] = in_idx >= 0 && in_idx < string_length ? in_chars[in_idx] : 0; + if (out_lengths[row_idx] == 0) { + // A URI can be valid, but still have no data for a specific chunk + clear_bit(out_validity, row_idx); } + } + } +} - __syncwarp(); - - // `char_idx_start` represents the start character index of the current warp. - for (size_type char_idx_start = 0; - char_idx_start < string_length_block && !found_token[local_warp_id]; - char_idx_start += cudf::detail::warp_size) { - auto const char_idx = char_idx_start + warp_lane; - char const* const ch_ptr = in_chars_shared + char_idx; - - // Warp-wise prefix sum to establish tokens of string. - // All threads in the warp participate in the prefix sum, even if `char_idx` is beyond - // `string_length_block`. - int8_t const is_token = (char_idx < string_length_block && *ch_ptr == ':') ? 1 : 0; - int8_t out_tokens; - cub::WarpScan(cub_storage[local_warp_id]).InclusiveSum(is_token, out_tokens); - - // If the current character is before the token we will output the character. - int8_t const out_size = (char_idx >= string_length_block || out_tokens > 0) ? 0 : 1; - - // Warp-wise prefix sum to establish output location of the current thread. - // All threads in the warp participate in the prefix sum, even if `char_idx` is beyond - // `string_length_block`. 
- int8_t out_offset; - cub::WarpScan(cub_storage[local_warp_id]).ExclusiveSum(out_size, out_offset); - - // out_size of 1 means this thread writes a byte - if (out_size == 1) { out_chars_string[out_idx[local_warp_id] + out_offset] = *ch_ptr; } - - // last thread of the warp updates the offset and the token - if (warp_lane == cudf::detail::warp_size - 1) { - out_idx[local_warp_id] += (out_offset + out_size); - found_token[local_warp_id] = out_tokens > 0; - } +/** + * @brief Parse protocol and copy from the input string column to the output char buffer. + * + * @param in_strings Input string column + * @param src_offsets Offset value of source strings in in_strings + * @param offsets Offset value of each string associated with `out_chars` + * @param out_chars Character buffer for the output string column + */ +__global__ void parse_uri(column_device_view const in_strings, + size_type const* const src_offsets, + size_type const* const offsets, + char* const out_chars) +{ + auto const tid = cudf::detail::grid_1d::global_thread_id(); + auto const base_ptr = in_strings.child(strings_column_view::chars_column_index).data(); + + for (thread_index_type tidx = tid; tidx < in_strings.size(); + tidx += cudf::detail::grid_1d::grid_stride()) { + auto const row_idx = static_cast(tidx); + auto const len = offsets[row_idx + 1] - offsets[row_idx]; - __syncwarp(); + if (len > 0) { + for (int i = 0; i < len; i++) { + out_chars[offsets[row_idx] + i] = base_ptr[src_offsets[row_idx] + i]; } } } @@ -286,16 +765,16 @@ __global__ void parse_uri_to_protocol(column_device_view const in_strings, } // namespace -std::unique_ptr parse_uri_to_protocol(strings_column_view const& input, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) +std::unique_ptr parse_uri(strings_column_view const& input, + URI_chunks chunk, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { size_type strings_count = input.size(); - if (strings_count == 0) return make_empty_column(type_id::STRING); + if (strings_count == 0) { return make_empty_column(type_id::STRING); } constexpr size_type num_warps_per_threadblock = 4; constexpr size_type threadblock_size = num_warps_per_threadblock * cudf::detail::warp_size; - constexpr size_type char_block_size = 256; auto const num_threadblocks = std::min(65536, cudf::util::div_rounding_up_unsafe(strings_count, num_warps_per_threadblock)); @@ -306,6 +785,9 @@ std::unique_ptr parse_uri_to_protocol(strings_column_view const& input, auto offsets_column = make_numeric_column( data_type{type_to_id()}, offset_count, mask_state::UNALLOCATED, stream, mr); + // build src offsets buffer + auto src_offsets = rmm::device_uvector(strings_count, stream); + // copy null mask rmm::device_buffer null_mask = input.parent().nullable() @@ -315,11 +797,12 @@ std::unique_ptr parse_uri_to_protocol(strings_column_view const& input, // count number of bytes in each string after parsing and store it in offsets_column auto offsets_view = offsets_column->view(); auto offsets_mutable_view = offsets_column->mutable_view(); - parse_uri_protocol_char_counter - <<>>( - *d_strings, - offsets_mutable_view.begin(), - reinterpret_cast(null_mask.data())); + parse_uri_char_counter<<>>( + *d_strings, + chunk, + offsets_mutable_view.begin(), + reinterpret_cast(src_offsets.data()), + reinterpret_cast(null_mask.data())); // use scan to transform number of bytes into offsets thrust::exclusive_scan(rmm::exec_policy(stream), @@ -335,13 +818,12 @@ std::unique_ptr parse_uri_to_protocol(strings_column_view const& 
input, auto chars_column = cudf::strings::detail::create_chars_child_column(out_chars_bytes, stream, mr); auto d_out_chars = chars_column->mutable_view().data(); - // parse and copy the characters from the input column to the output column - parse_uri_to_protocol - <<>>( - *d_strings, - reinterpret_cast(null_mask.data()), - d_out_chars, - offsets_column->view().begin()); + // copy the characters from the input column to the output column + parse_uri<<>>( + *d_strings, + reinterpret_cast(src_offsets.data()), + offsets_column->view().begin(), + d_out_chars); auto null_count = cudf::null_count(reinterpret_cast(null_mask.data()), 0, strings_count); @@ -362,7 +844,7 @@ std::unique_ptr parse_uri_to_protocol(strings_column_view const& input, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::parse_uri_to_protocol(input, stream, mr); + return detail::parse_uri(input, detail::URI_chunks::PROTOCOL, stream, mr); } } // namespace spark_rapids_jni \ No newline at end of file diff --git a/src/main/cpp/tests/parse_uri.cpp b/src/main/cpp/tests/parse_uri.cpp index 3ff14a6075..6f522829b6 100644 --- a/src/main/cpp/tests/parse_uri.cpp +++ b/src/main/cpp/tests/parse_uri.cpp @@ -71,7 +71,33 @@ TEST_F(ParseURIProtocolTests, SparkEdges) "/absolute/path", "http://%77%77%77.%4EV%49%44%49%41.com", "https:://broken.url", - "https://www.nvidia.com/q/This%20is%20a%20query"}); + "https://www.nvidia.com/q/This%20is%20a%20query", + "https://www.nvidia.com/\x93path/path/to/file", + "http://?", + "http://??", + "http://\?\?/", + "http://#", + "http://user:pass@host/file;param?query;p2", + "http://[1:2:3:4:5:6:7::]", + "http://[::2:3:4:5:6:7:8]", + "http://[fe80::7:8%eth0]", + "http://[fe80::7:8%1]", + "http://foo.bar/abc/\\\\\\http://foo.bar/abc.gif\\\\\\", + "www.nvidia.com:8100/servlet/" + "impc.DisplayCredits?primekey_in=2000041100:05:14115240636", + "https://nvidia.com/2Ru15Ss ", + "http://www.nvidia.com/plugins//##", + "www.nvidia.com:81/Free.fr/L7D9qw9X4S-aC0&D4X0/Panels&solutionId=0X54a/" + "cCdyncharset=UTF-8&t=01wx58Tab&ps=solution/" + "ccmd=_help&locale0X1&countrycode=MA/", + "http://www.nvidia.com/tags.php?%2F88\323\351\300ึณวน\331\315\370%2F", + "http://www.nvidia.com//wp-admin/includes/index.html#9389#123", + "http://www.nvidia.com/" + "object.php?object=ะก-\320%9Fะฑ-ะฟ-ะก\321%82\321%80ะตะป\321%8Cะฝะฐ-\321%83ะป-\320%" + "97ะฐะฒะพะด\321%81ะบะฐ\321%8F.html&sid=5", + "http://www.nvidia.com/picshow.asp?id=106&mnid=5080&classname=\271\253ืฐฦช", + "http://-.~_!$&'()*+,;=:%40:80%2f::::::@example.com:443", + "http://userid:password@example.com:8080/"}); auto result = spark_rapids_jni::parse_uri_to_protocol(cudf::strings_column_view{col}); @@ -88,8 +114,87 @@ TEST_F(ParseURIProtocolTests, SparkEdges) "", "http", "https", - "https"}, - {1, 1, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 1}); + "https", + "", + "http", + "http", + "http", + "http", + "http", + "http", + "http", + "http", + "http", + "", + "www.nvidia.com", + "", + "", + "www.nvidia.com", + "", + "", + "", + "", + "http", + "http"}, + {1, 1, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1}); CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(result->view(), expected); -} \ No newline at end of file +} + +TEST_F(ParseURIProtocolTests, IP6) +{ + cudf::test::strings_column_wrapper col({ + "https://[fe80::]", + "https://[2001:0db8:85a3:0000:0000:8a2e:0370:7334]", + "https://[2001:0DB8:85A3:0000:0000:8A2E:0370:7334]", + "https://[2001:db8::1:0]", + "http://[2001:db8::2:1]", + "https://[::1]", + 
"https://[2001:db8:85a3:8d3:1319:8a2e:370:7348]:443", + "https://[2001:db8:3333:4444:5555:6666:1.2.3.4]/path/to/file", + "https://[2001:db8:3333:4444:5555:6666:7777:8888:1.2.3.4]/path/to/file", + "https://[::db8:3333:4444:5555:6666:1.2.3.4]/path/to/file]", // this is valid, but spark + // doesn't think so + }); + auto result = spark_rapids_jni::parse_uri_to_protocol(cudf::strings_column_view{col}); + + cudf::test::strings_column_wrapper expected( + {"https", "https", "https", "https", "http", "https", "https", "https", "", ""}, + {1, 1, 1, 1, 1, 1, 1, 1, 0, 0}); + + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(result->view(), expected); +} + +TEST_F(ParseURIProtocolTests, IP4) +{ + cudf::test::strings_column_wrapper col({ + "https://192.168.1.100/", + "https://192.168.1.100:8443/", + "https://192.168.1.100.5/", + "https://192.168.1/", + "https://280.100.1.1/", + "https://182.168..100/path/to/file", + }); + auto result = spark_rapids_jni::parse_uri_to_protocol(cudf::strings_column_view{col}); + + cudf::test::strings_column_wrapper expected( + {"https", "https", "https", "https", "https", "https"}); + + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(result->view(), expected); +} + +TEST_F(ParseURIProtocolTests, UTF8) +{ + cudf::test::strings_column_wrapper col({ + "https://nvidia.com/%4EV%49%44%49%41", + "http://%77%77%77.%4EV%49%44%49%41.com", + "http://✪↩d⁚f„⁈.ws/123", + "https:// /path/to/file", + }); + auto result = spark_rapids_jni::parse_uri_to_protocol(cudf::strings_column_view{col}); + + cudf::test::strings_column_wrapper expected({"https", "http", "http", ""}, {1, 1, 1, 0}); + + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(result->view(), expected); +} diff --git a/src/test/java/com/nvidia/spark/rapids/jni/ParseURITest.java b/src/test/java/com/nvidia/spark/rapids/jni/ParseURITest.java index 7289d110b2..5e90111f21 100644 --- a/src/test/java/com/nvidia/spark/rapids/jni/ParseURITest.java +++ b/src/test/java/com/nvidia/spark/rapids/jni/ParseURITest.java @@ -25,10 +25,33 @@ import ai.rapids.cudf.ColumnVector; public class ParseURITest { + void buildExpectedAndRun(String[] testData) { + String[] expectedStrings = new String[testData.length]; + for (int i=0; i Date: Tue, 5 Dec 2023 05:20:59 +0800 Subject: [PATCH 029/127] Update submodule cudf to 39431db46e718c98f29bfaaf429c7cf40dc95a57 (#1610) Signed-off-by: spark-rapids automation <70000568+nvauto@users.noreply.github.com> --- thirdparty/cudf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/thirdparty/cudf b/thirdparty/cudf index c8074b5176..39431db46e 160000 --- a/thirdparty/cudf +++ b/thirdparty/cudf @@ -1 +1 @@ -Subproject commit c8074b5176a74630101c78c43c24b66141352b24 +Subproject commit 39431db46e718c98f29bfaaf429c7cf40dc95a57 From 5579dade1034d496e561c1739b010fd0d24a2705 Mon Sep 17 00:00:00 2001 From: spark-rapids automation <70000568+nvauto@users.noreply.github.com> Date: Mon, 4 Dec 2023 23:05:24 +0000 Subject: [PATCH 030/127] Auto-merge use submodule in BASE ref Signed-off-by: spark-rapids automation <70000568+nvauto@users.noreply.github.com> --- thirdparty/cudf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/thirdparty/cudf b/thirdparty/cudf index 0a56305696..39431db46e 160000 --- a/thirdparty/cudf +++ b/thirdparty/cudf @@ -1 +1 @@ -Subproject commit 0a56305696a37870495867cb76941699c3b53fe6 +Subproject commit 39431db46e718c98f29bfaaf429c7cf40dc95a57 From d7ef4219d40bf6a90b93cf2fe72400b5887115ea Mon Sep 17 00:00:00 2001 From: Jenkins Automation <70000568+nvauto@users.noreply.github.com> Date: Tue, 5 Dec 2023 11:23:40 +0800 
Subject: [PATCH 031/127] Update submodule cudf to 1c46d7d2b6eb9aea543be596495cda4972ec7887 (#1613)

Signed-off-by: spark-rapids automation <70000568+nvauto@users.noreply.github.com>
---
 thirdparty/cudf | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/thirdparty/cudf b/thirdparty/cudf
index 39431db46e..1c46d7d2b6 160000
--- a/thirdparty/cudf
+++ b/thirdparty/cudf
@@ -1 +1 @@
-Subproject commit 39431db46e718c98f29bfaaf429c7cf40dc95a57
+Subproject commit 1c46d7d2b6eb9aea543be596495cda4972ec7887

From 9b680d75836c3c831ebacca56fa4c43e9b0c980e Mon Sep 17 00:00:00 2001
From: Jenkins Automation <70000568+nvauto@users.noreply.github.com>
Date: Tue, 5 Dec 2023 21:23:11 +0800
Subject: [PATCH 032/127] Update submodule cudf to 8f7cbe69d4c2f670b97decc63e73b08e0eef7329 (#1615)

Signed-off-by: spark-rapids automation <70000568+nvauto@users.noreply.github.com>
---
 thirdparty/cudf | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/thirdparty/cudf b/thirdparty/cudf
index 1c46d7d2b6..8f7cbe69d4 160000
--- a/thirdparty/cudf
+++ b/thirdparty/cudf
@@ -1 +1 @@
-Subproject commit 1c46d7d2b6eb9aea543be596495cda4972ec7887
+Subproject commit 8f7cbe69d4c2f670b97decc63e73b08e0eef7329

From 554645df617d1fa36b32a0171a762a920390385a Mon Sep 17 00:00:00 2001
From: Mike Wilson
Date: Tue, 5 Dec 2023 12:46:37 -0500
Subject: [PATCH 033/127] Adding support for parse URI for hostnames (#1569)

* Adding host support for parse_uri

Signed-off-by: Mike Wilson
---
 src/main/cpp/src/ParseURIJni.cpp                |  14 +
 src/main/cpp/src/parse_uri.cu                   |  23 +-
 src/main/cpp/src/parse_uri.hpp                  |  13 +
 src/main/cpp/tests/parse_uri.cpp                | 402 ++++++++++++------
 .../com/nvidia/spark/rapids/jni/ParseURI.java   |  11 +
 .../nvidia/spark/rapids/jni/ParseURITest.java   |  34 +-
 6 files changed, 345 insertions(+), 152 deletions(-)

diff --git a/src/main/cpp/src/ParseURIJni.cpp b/src/main/cpp/src/ParseURIJni.cpp
index 0d2d245108..9079d99b9d 100644
--- a/src/main/cpp/src/ParseURIJni.cpp
+++ b/src/main/cpp/src/ParseURIJni.cpp
@@ -33,4 +33,18 @@ JNIEXPORT jlong JNICALL Java_com_nvidia_spark_rapids_jni_ParseURI_parseProtocol(
   }
   CATCH_STD(env, 0);
 }
+
+JNIEXPORT jlong JNICALL Java_com_nvidia_spark_rapids_jni_ParseURI_parseHost(JNIEnv* env,
+                                                                            jclass,
+                                                                            jlong input_column)
+{
+  JNI_NULL_CHECK(env, input_column, "input column is null", 0);
+
+  try {
+    cudf::jni::auto_set_device(env);
+    auto const input = reinterpret_cast<cudf::column_view const*>(input_column);
+    return cudf::jni::ptr_as_jlong(spark_rapids_jni::parse_uri_to_host(*input).release());
+  }
+  CATCH_STD(env, 0);
+}
 }
diff --git a/src/main/cpp/src/parse_uri.cu b/src/main/cpp/src/parse_uri.cu
index d0629cb71f..13a8effb37 100644
--- a/src/main/cpp/src/parse_uri.cu
+++ b/src/main/cpp/src/parse_uri.cu
@@ -292,9 +292,10 @@ bool __device__ validate_domain_name(string_view name)
 {
   // domain name can be alphanum or '-'.
   // a dash can not be the first or last character of the domain name or around a '.'
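  // e.g. "a-b.nvidia.com" is accepted, while "a-.b.com" (dash before a period),
  // "a..b.com" (empty label), and ".a.com" (leading period) are rejected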
-  bool last_was_slash       = false;
-  bool last_was_period      = false;
-  bool numeric_start        = false;
+  bool last_was_slash           = false;
+  bool last_was_period          = false;
+  bool numeric_start            = false;
+  int characters_before_period  = 0;
   for (auto iter = name.begin(); iter < name.end(); ++iter) {
     auto const c = *iter;
     if (!is_alphanum(c) && c != '-' && c != '.') { return false; }
@@ -311,12 +312,14 @@ bool __device__ validate_domain_name(string_view name)
       last_was_slash  = true;
       last_was_period = false;
     } else if (c == '.') {
-      if (last_was_slash) { return false; }
-      last_was_period = true;
-      last_was_slash  = false;
+      if (last_was_slash || last_was_period || characters_before_period == 0) { return false; }
+      last_was_period           = true;
+      last_was_slash            = false;
+      characters_before_period  = 0;
     } else {
       last_was_period = false;
       last_was_slash  = false;
+      characters_before_period++;
     }
   }
 
@@ -847,4 +850,12 @@ std::unique_ptr<column> parse_uri_to_protocol(strings_column_view const& input,
   return detail::parse_uri(input, detail::URI_chunks::PROTOCOL, stream, mr);
 }
 
+std::unique_ptr<column> parse_uri_to_host(strings_column_view const& input,
+                                          rmm::cuda_stream_view stream,
+                                          rmm::mr::device_memory_resource* mr)
+{
+  CUDF_FUNC_RANGE();
+  return detail::parse_uri(input, detail::URI_chunks::HOST, stream, mr);
+}
+
 }  // namespace spark_rapids_jni
\ No newline at end of file
diff --git a/src/main/cpp/src/parse_uri.hpp b/src/main/cpp/src/parse_uri.hpp
index c65d06d80a..0a76cec1b4 100644
--- a/src/main/cpp/src/parse_uri.hpp
+++ b/src/main/cpp/src/parse_uri.hpp
@@ -39,4 +39,17 @@ std::unique_ptr<cudf::column> parse_uri_to_protocol(
   rmm::cuda_stream_view stream        = cudf::get_default_stream(),
   rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
 
+/**
+ * @brief Parse host and copy from the input string column to the output char buffer.
+ *
+ * @param input Input string column of URIs to parse
+ * @param stream Stream on which to operate.
+ * @param mr Memory resource for returned column
+ * @return std::unique_ptr<cudf::column> String column of hosts parsed.
+ */ +std::unique_ptr parse_uri_to_host( + cudf::strings_column_view const& input, + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + } // namespace spark_rapids_jni diff --git a/src/main/cpp/tests/parse_uri.cpp b/src/main/cpp/tests/parse_uri.cpp index 6f522829b6..1112fea232 100644 --- a/src/main/cpp/tests/parse_uri.cpp +++ b/src/main/cpp/tests/parse_uri.cpp @@ -22,144 +22,178 @@ #include struct ParseURIProtocolTests : public cudf::test::BaseFixture {}; +struct ParseURIHostTests : public cudf::test::BaseFixture {}; -TEST_F(ParseURIProtocolTests, Simple) +enum class test_types { + SIMPLE, + SPARK_EDGES, + IPv6, + IPv4, + UTF8, +}; + +namespace { +cudf::test::strings_column_wrapper get_test_data(test_types t) { - cudf::test::strings_column_wrapper col({ - "https://www.nvidia.com/s/uri?param1=2", - "http://www.nvidia.com", - "file://path/to/a/cool/file", - "smb://network/path/to/file", - "http:/www.nvidia.com", - "file:path/to/a/cool/file", - }); - auto result = spark_rapids_jni::parse_uri_to_protocol(cudf::strings_column_view{col}); - - cudf::test::strings_column_wrapper expected({"https", "http", "file", "smb", "http", "file"}); + switch (t) { + case test_types::SIMPLE: + return cudf::test::strings_column_wrapper({ + "https://www.nvidia.com/s/uri?param1=2", + "http://www.nvidia.com", + "file://path/to/a/cool/file", + "smb://network/path/to/file", + "http:/www.nvidia.com", + "file:path/to/a/cool/file", + "/network/path/to/file", + "nvidia.com", + "www.nvidia.com/s/uri", + }); - CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(result->view(), expected); + case test_types::SPARK_EDGES: + return cudf::test::strings_column_wrapper( + {"https://nvidia.com/https&#://nvidia.com", + "https://http://www.nvidia.com", + "filesystemmagicthing://bob.yaml", + "nvidia.com:8080", + "http://thisisinvalid.data/due/to-the_character%s/inside*the#url`~", + "file:/absolute/path", + "//www.nvidia.com", + "#bob", + "#this%doesnt#make//sense://to/me", + "HTTP:&bob", + "/absolute/path", + "http://%77%77%77.%4EV%49%44%49%41.com", + "https:://broken.url", + "https://www.nvidia.com/q/This%20is%20a%20query", + "https://www.nvidia.com/\x93path/path/to/file", + "http://?", + "http://??", + "http://\?\?/", + "http://#", + "http://user:pass@host/file;param?query;p2", + "http://[1:2:3:4:5:6:7::]", + "http://[::2:3:4:5:6:7:8]", + "http://[fe80::7:8%eth0]", + "http://[fe80::7:8%1]", + "http://foo.bar/abc/\\\\\\http://foo.bar/abc.gif\\\\\\", + "www.nvidia.com:8100/servlet/" + "impc.DisplayCredits?primekey_in=2000041100:05:14115240636", + "https://nvidia.com/2Ru15Ss ", + "http://www.nvidia.com/plugins//##", + "www.nvidia.com:81/Free.fr/L7D9qw9X4S-aC0&D4X0/Panels&solutionId=0X54a/" + "cCdyncharset=UTF-8&t=01wx58Tab&ps=solution/" + "ccmd=_help&locale0X1&countrycode=MA/", + "http://www.nvidia.com/tags.php?%2F88\323\351\300ึณวน\331\315\370%2F", + "http://www.nvidia.com//wp-admin/includes/index.html#9389#123", + "http://www.nvidia.com/" + "object.php?object=ะก-\320%9Fะฑ-ะฟ-ะก\321%82\321%80ะตะป\321%8Cะฝะฐ-\321%83ะป-\320%" + "97ะฐะฒะพะด\321%81ะบะฐ\321%8F.html&sid=5", + "http://www.nvidia.com/picshow.asp?id=106&mnid=5080&classname=\271\253ืฐฦช", + "http://-.~_!$&'()*+,;=:%40:80%2f::::::@nvidia.com:443", + "http://userid:password@example.com:8080/", + "http://.www.nvidia.com./", + "http://www.nvidia..com/"}); + case test_types::IPv6: + return cudf::test::strings_column_wrapper({ + "https://[fe80::]", + "https://[2001:0db8:85a3:0000:0000:8a2e:0370:7334]", + 
"https://[2001:0DB8:85A3:0000:0000:8A2E:0370:7334]", + "https://[2001:db8::1:0]", + "http://[2001:db8::2:1]", + "https://[::1]", + "https://[2001:db8:85a3:8d3:1319:8a2e:370:7348]:443", + "https://[2001:db8:3333:4444:5555:6666:1.2.3.4]/path/to/file", + "https://[2001:db8:3333:4444:5555:6666:7777:8888:1.2.3.4]/path/to/file", + "https://[::db8:3333:4444:5555:6666:1.2.3.4]/path/to/file]", // this is valid, but spark + // doesn't think so + }); + case test_types::IPv4: + return cudf::test::strings_column_wrapper({ + "https://192.168.1.100/", + "https://192.168.1.100:8443/", + "https://192.168.1.100.5/", + "https://192.168.1/", + "https://280.100.1.1/", + "https://182.168..100/path/to/file", + }); + case test_types::UTF8: + return cudf::test::strings_column_wrapper({ + "https://nvidia.com/%4EV%49%44%49%41", + "http://%77%77%77.%4EV%49%44%49%41.com", + "http://✪↩d⁚f„⁈.ws/123", + "https:// /path/to/file", + }); + default: CUDF_FAIL("Test type unsupported!"); return cudf::test::strings_column_wrapper(); + } } +} // namespace -TEST_F(ParseURIProtocolTests, Negatives) +TEST_F(ParseURIProtocolTests, Simple) { - cudf::test::strings_column_wrapper col({ - "https//www.nvidia.com/s/uri?param1=2", - "/network/path/to/file", - "nvidia.com", - "www.nvidia.com/s/uri", - }); - auto result = spark_rapids_jni::parse_uri_to_protocol(cudf::strings_column_view{col}); + auto const col = get_test_data(test_types::SIMPLE); + auto const result = spark_rapids_jni::parse_uri_to_protocol(cudf::strings_column_view{col}); - cudf::test::strings_column_wrapper expected({"", "", "", ""}, {0, 0, 0, 0}); + cudf::test::strings_column_wrapper const expected( + {"https", "http", "file", "smb", "http", "file", "", "", ""}, {1, 1, 1, 1, 1, 1, 0, 0, 0}); CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(result->view(), expected); } TEST_F(ParseURIProtocolTests, SparkEdges) { - cudf::test::strings_column_wrapper col( - {"https://nvidia.com/https&#://nvidia.com", - "https://http://www.nvidia.com", - "filesystemmagicthing://bob.yaml", - "nvidia.com:8080", - "http://thisisinvalid.data/due/to-the_character%s/inside*the#url`~", - "file:/absolute/path", - "//www.nvidia.com", - "#bob", - "#this%doesnt#make//sense://to/me", - "HTTP:&bob", - "/absolute/path", - "http://%77%77%77.%4EV%49%44%49%41.com", - "https:://broken.url", - "https://www.nvidia.com/q/This%20is%20a%20query", - "https://www.nvidia.com/\x93path/path/to/file", - "http://?", - "http://??", - "http://\?\?/", - "http://#", - "http://user:pass@host/file;param?query;p2", - "http://[1:2:3:4:5:6:7::]", - "http://[::2:3:4:5:6:7:8]", - "http://[fe80::7:8%eth0]", - "http://[fe80::7:8%1]", - "http://foo.bar/abc/\\\\\\http://foo.bar/abc.gif\\\\\\", - "www.nvidia.com:8100/servlet/" - "impc.DisplayCredits?primekey_in=2000041100:05:14115240636", - "https://nvidia.com/2Ru15Ss ", - "http://www.nvidia.com/plugins//##", - "www.nvidia.com:81/Free.fr/L7D9qw9X4S-aC0&D4X0/Panels&solutionId=0X54a/" - "cCdyncharset=UTF-8&t=01wx58Tab&ps=solution/" - "ccmd=_help&locale0X1&countrycode=MA/", - "http://www.nvidia.com/tags.php?%2F88\323\351\300ึณวน\331\315\370%2F", - "http://www.nvidia.com//wp-admin/includes/index.html#9389#123", - "http://www.nvidia.com/" - "object.php?object=ะก-\320%9Fะฑ-ะฟ-ะก\321%82\321%80ะตะป\321%8Cะฝะฐ-\321%83ะป-\320%" - "97ะฐะฒะพะด\321%81ะบะฐ\321%8F.html&sid=5", - "http://www.nvidia.com/picshow.asp?id=106&mnid=5080&classname=\271\253ืฐฦช", - "http://-.~_!$&'()*+,;=:%40:80%2f::::::@example.com:443", - "http://userid:password@example.com:8080/"}); - - auto result = 
spark_rapids_jni::parse_uri_to_protocol(cudf::strings_column_view{col}); - - cudf::test::strings_column_wrapper expected({"https", - "https", - "filesystemmagicthing", - "nvidia.com", - "", - "file", - "", - "", - "", - "HTTP", - "", - "http", - "https", - "https", - "", - "http", - "http", - "http", - "http", - "http", - "http", - "http", - "http", - "http", - "", - "www.nvidia.com", - "", - "", - "www.nvidia.com", - "", - "", - "", - "", - "http", - "http"}, - {1, 1, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1}); + auto const col = get_test_data(test_types::SPARK_EDGES); + auto const result = spark_rapids_jni::parse_uri_to_protocol(cudf::strings_column_view{col}); + + cudf::test::strings_column_wrapper const expected( + {"https", + "https", + "filesystemmagicthing", + "nvidia.com", + "", + "file", + "", + "", + "", + "HTTP", + "", + "http", + "https", + "https", + "", + "http", + "http", + "http", + "http", + "http", + "http", + "http", + "http", + "http", + "", + "www.nvidia.com", + "", + "", + "www.nvidia.com", + "", + "", + "", + "", + "http", + "http", + "http", + "http"}, + {1, 1, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1}); CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(result->view(), expected); } TEST_F(ParseURIProtocolTests, IP6) { - cudf::test::strings_column_wrapper col({ - "https://[fe80::]", - "https://[2001:0db8:85a3:0000:0000:8a2e:0370:7334]", - "https://[2001:0DB8:85A3:0000:0000:8A2E:0370:7334]", - "https://[2001:db8::1:0]", - "http://[2001:db8::2:1]", - "https://[::1]", - "https://[2001:db8:85a3:8d3:1319:8a2e:370:7348]:443", - "https://[2001:db8:3333:4444:5555:6666:1.2.3.4]/path/to/file", - "https://[2001:db8:3333:4444:5555:6666:7777:8888:1.2.3.4]/path/to/file", - "https://[::db8:3333:4444:5555:6666:1.2.3.4]/path/to/file]", // this is valid, but spark - // doesn't think so - }); - auto result = spark_rapids_jni::parse_uri_to_protocol(cudf::strings_column_view{col}); - - cudf::test::strings_column_wrapper expected( + auto const col = get_test_data(test_types::IPv6); + auto const result = spark_rapids_jni::parse_uri_to_protocol(cudf::strings_column_view{col}); + + cudf::test::strings_column_wrapper const expected( {"https", "https", "https", "https", "http", "https", "https", "https", "", ""}, {1, 1, 1, 1, 1, 1, 1, 1, 0, 0}); @@ -168,17 +202,10 @@ TEST_F(ParseURIProtocolTests, IP6) TEST_F(ParseURIProtocolTests, IP4) { - cudf::test::strings_column_wrapper col({ - "https://192.168.1.100/", - "https://192.168.1.100:8443/", - "https://192.168.1.100.5/", - "https://192.168.1/", - "https://280.100.1.1/", - "https://182.168..100/path/to/file", - }); - auto result = spark_rapids_jni::parse_uri_to_protocol(cudf::strings_column_view{col}); - - cudf::test::strings_column_wrapper expected( + auto const col = get_test_data(test_types::IPv4); + auto const result = spark_rapids_jni::parse_uri_to_protocol(cudf::strings_column_view{col}); + + cudf::test::strings_column_wrapper const expected( {"https", "https", "https", "https", "https", "https"}); CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(result->view(), expected); @@ -186,15 +213,112 @@ TEST_F(ParseURIProtocolTests, IP4) TEST_F(ParseURIProtocolTests, UTF8) { - cudf::test::strings_column_wrapper col({ - "https://nvidia.com/%4EV%49%44%49%41", - "http://%77%77%77.%4EV%49%44%49%41.com", - "http://✪↩d⁚f„⁈.ws/123", - "https:// /path/to/file", - }); - auto result = spark_rapids_jni::parse_uri_to_protocol(cudf::strings_column_view{col}); - - 
cudf::test::strings_column_wrapper expected({"https", "http", "http", ""}, {1, 1, 1, 0}); + auto const col = get_test_data(test_types::UTF8); + auto const result = spark_rapids_jni::parse_uri_to_protocol(cudf::strings_column_view{col}); + + cudf::test::strings_column_wrapper const expected({"https", "http", "http", ""}, {1, 1, 1, 0}); + + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(result->view(), expected); +} + +TEST_F(ParseURIHostTests, Simple) +{ + auto const col = get_test_data(test_types::SIMPLE); + auto const result = spark_rapids_jni::parse_uri_to_host(cudf::strings_column_view{col}); + + cudf::test::strings_column_wrapper const expected( + {"www.nvidia.com", "www.nvidia.com", "path", "network", "", "", "", "", ""}, + {1, 1, 1, 1, 0, 0, 0, 0, 0}); + + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(result->view(), expected); +} + +TEST_F(ParseURIHostTests, SparkEdges) +{ + auto const col = get_test_data(test_types::SPARK_EDGES); + auto const result = spark_rapids_jni::parse_uri_to_host(cudf::strings_column_view{col}); + + cudf::test::strings_column_wrapper const expected( + {"nvidia.com", + "http", + "bob.yaml", + "", + "", + "", + "www.nvidia.com", + "", + "", + "", + "", + "", + "", + "www.nvidia.com", + "", + "", + "", + "", + "", + "host", + "[1:2:3:4:5:6:7::]", + "[::2:3:4:5:6:7:8]", + "[fe80::7:8%eth0]", + "[fe80::7:8%1]", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "nvidia.com", + "example.com", + "", + ""}, + {1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, + 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0}); + + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(result->view(), expected); +} + +TEST_F(ParseURIHostTests, IP6) +{ + auto const col = get_test_data(test_types::IPv6); + auto const result = spark_rapids_jni::parse_uri_to_host(cudf::strings_column_view{col}); + + cudf::test::strings_column_wrapper const expected({"[fe80::]", + "[2001:0db8:85a3:0000:0000:8a2e:0370:7334]", + "[2001:0DB8:85A3:0000:0000:8A2E:0370:7334]", + "[2001:db8::1:0]", + "[2001:db8::2:1]", + "[::1]", + "[2001:db8:85a3:8d3:1319:8a2e:370:7348]", + "[2001:db8:3333:4444:5555:6666:1.2.3.4]", + "", + ""}, + {1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0}); + + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(result->view(), expected); +} + +TEST_F(ParseURIHostTests, IP4) +{ + auto const col = get_test_data(test_types::IPv4); + auto const result = spark_rapids_jni::parse_uri_to_host(cudf::strings_column_view{col}); + + cudf::test::strings_column_wrapper const expected( + {"192.168.1.100", "192.168.1.100", "", "", "", ""}, {1, 1, 0, 0, 0, 0}); + + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(result->view(), expected); +} + +TEST_F(ParseURIHostTests, UTF8) +{ + auto const col = get_test_data(test_types::UTF8); + auto const result = spark_rapids_jni::parse_uri_to_host(cudf::strings_column_view{col}); + + cudf::test::strings_column_wrapper const expected({"nvidia.com", "", "", ""}, {1, 0, 0, 0}); CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(result->view(), expected); } diff --git a/src/main/java/com/nvidia/spark/rapids/jni/ParseURI.java b/src/main/java/com/nvidia/spark/rapids/jni/ParseURI.java index 0c0b046f15..0e14f388d4 100644 --- a/src/main/java/com/nvidia/spark/rapids/jni/ParseURI.java +++ b/src/main/java/com/nvidia/spark/rapids/jni/ParseURI.java @@ -38,7 +38,18 @@ public static ColumnVector parseURIProtocol(ColumnView uriColumn) { return new ColumnVector(parseProtocol(uriColumn.getNativeView())); } + /** + * Parse host for each URI from the incoming column. + * + * @param URIColumn The input strings column in which each row contains a URI. 
+ * @return A string column with host data extracted. + */ + public static ColumnVector parseURIHost(ColumnView uriColumn) { + assert uriColumn.getType().equals(DType.STRING) : "Input type must be String"; + return new ColumnVector(parseHost(uriColumn.getNativeView())); + } private static native long parseProtocol(long jsonColumnHandle); + private static native long parseHost(long jsonColumnHandle); } diff --git a/src/test/java/com/nvidia/spark/rapids/jni/ParseURITest.java b/src/test/java/com/nvidia/spark/rapids/jni/ParseURITest.java index 5e90111f21..c6e3b06ed1 100644 --- a/src/test/java/com/nvidia/spark/rapids/jni/ParseURITest.java +++ b/src/test/java/com/nvidia/spark/rapids/jni/ParseURITest.java @@ -26,7 +26,8 @@ public class ParseURITest { void buildExpectedAndRun(String[] testData) { - String[] expectedStrings = new String[testData.length]; + String[] expectedProtocolStrings = new String[testData.length]; + String[] expectedHostStrings = new String[testData.length]; for (int i=0; i Date: Wed, 6 Dec 2023 05:24:49 +0800 Subject: [PATCH 034/127] Update submodule cudf to a2d2ef4829a1616764a929d4c4d0d60a8debd4e5 (#1616) Signed-off-by: spark-rapids automation <70000568+nvauto@users.noreply.github.com> --- thirdparty/cudf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/thirdparty/cudf b/thirdparty/cudf index 8f7cbe69d4..a2d2ef4829 160000 --- a/thirdparty/cudf +++ b/thirdparty/cudf @@ -1 +1 @@ -Subproject commit 8f7cbe69d4c2f670b97decc63e73b08e0eef7329 +Subproject commit a2d2ef4829a1616764a929d4c4d0d60a8debd4e5 From bbf5732aecd4224a93b8e9dd948bdcf8e85bd3bd Mon Sep 17 00:00:00 2001 From: spark-rapids automation <70000568+nvauto@users.noreply.github.com> Date: Tue, 5 Dec 2023 23:49:05 +0000 Subject: [PATCH 035/127] Auto-merge use submodule in BASE ref Signed-off-by: spark-rapids automation <70000568+nvauto@users.noreply.github.com> --- thirdparty/cudf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/thirdparty/cudf b/thirdparty/cudf index 31aedf2ddc..a2d2ef4829 160000 --- a/thirdparty/cudf +++ b/thirdparty/cudf @@ -1 +1 @@ -Subproject commit 31aedf2ddcd99cb4b572f8685f7790b743500149 +Subproject commit a2d2ef4829a1616764a929d4c4d0d60a8debd4e5 From da8be0b20f6ebff279c50d9f4dab0f8d0a5f8d4b Mon Sep 17 00:00:00 2001 From: Jenkins Automation <70000568+nvauto@users.noreply.github.com> Date: Wed, 6 Dec 2023 11:23:26 +0800 Subject: [PATCH 036/127] Update submodule cudf to d97b3e091778987562508612d216a36207f5cd7c (#1619) Signed-off-by: spark-rapids automation <70000568+nvauto@users.noreply.github.com> --- thirdparty/cudf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/thirdparty/cudf b/thirdparty/cudf index a2d2ef4829..d97b3e0917 160000 --- a/thirdparty/cudf +++ b/thirdparty/cudf @@ -1 +1 @@ -Subproject commit a2d2ef4829a1616764a929d4c4d0d60a8debd4e5 +Subproject commit d97b3e091778987562508612d216a36207f5cd7c From 5ad7fe44fcd60a55bc5cb2c2e3fa1dde4afd3b11 Mon Sep 17 00:00:00 2001 From: spark-rapids automation <70000568+nvauto@users.noreply.github.com> Date: Wed, 6 Dec 2023 18:00:43 +0000 Subject: [PATCH 037/127] Auto-merge use submodule in BASE ref Signed-off-by: spark-rapids automation <70000568+nvauto@users.noreply.github.com> --- thirdparty/cudf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/thirdparty/cudf b/thirdparty/cudf index 8eacf8f2ec..d97b3e0917 160000 --- a/thirdparty/cudf +++ b/thirdparty/cudf @@ -1 +1 @@ -Subproject commit 8eacf8f2ecb70eedf917fec2dfca4403810399d1 +Subproject commit d97b3e091778987562508612d216a36207f5cd7c 
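The Java entry points added by PATCH 033 above are exercised roughly as follows. This is a
minimal sketch rather than code from the patch: the input strings and expected outputs are
lifted from the new unit tests, and it assumes ai.rapids.cudf and the spark-rapids-jni
classes are on the classpath.

    try (ColumnVector uris = ColumnVector.fromStrings(
             "https://www.nvidia.com/s/uri?param1=2",
             "http://[fe80::7:8%eth0]");
         ColumnVector protocols = ParseURI.parseURIProtocol(uris);
         ColumnVector hosts = ParseURI.parseURIHost(uris)) {
      // protocols holds ["https", "http"]; hosts holds ["www.nvidia.com", "[fe80::7:8%eth0]"]
    }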
From 9e64d262f0c47ed5852dbbe55f8bb9036e6059fc Mon Sep 17 00:00:00 2001 From: Jenkins Automation <70000568+nvauto@users.noreply.github.com> Date: Thu, 7 Dec 2023 06:02:17 +0800 Subject: [PATCH 038/127] Update submodule cudf to fe612b3eaa30cd0cc6f0f49f99dce8785e0258f6 (#1622) Signed-off-by: spark-rapids automation <70000568+nvauto@users.noreply.github.com> --- thirdparty/cudf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/thirdparty/cudf b/thirdparty/cudf index d97b3e0917..fe612b3eaa 160000 --- a/thirdparty/cudf +++ b/thirdparty/cudf @@ -1 +1 @@ -Subproject commit d97b3e091778987562508612d216a36207f5cd7c +Subproject commit fe612b3eaa30cd0cc6f0f49f99dce8785e0258f6 From cd55b989eac33435da1d9ceeef189fb5c0a7ed3c Mon Sep 17 00:00:00 2001 From: spark-rapids automation <70000568+nvauto@users.noreply.github.com> Date: Thu, 7 Dec 2023 01:48:56 +0000 Subject: [PATCH 039/127] Auto-merge use submodule in BASE ref Signed-off-by: spark-rapids automation <70000568+nvauto@users.noreply.github.com> --- thirdparty/cudf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/thirdparty/cudf b/thirdparty/cudf index c1d307396f..fe612b3eaa 160000 --- a/thirdparty/cudf +++ b/thirdparty/cudf @@ -1 +1 @@ -Subproject commit c1d307396fd357a9b6b68d626c2b17813d1181b8 +Subproject commit fe612b3eaa30cd0cc6f0f49f99dce8785e0258f6 From b3c23091740d3a493157bc6bf6064a967d71f345 Mon Sep 17 00:00:00 2001 From: Jenkins Automation <70000568+nvauto@users.noreply.github.com> Date: Thu, 7 Dec 2023 11:27:41 +0800 Subject: [PATCH 040/127] Update submodule cudf to d8f49750c76694c5093c22b415308ffc1ae1172f (#1626) Signed-off-by: spark-rapids automation <70000568+nvauto@users.noreply.github.com> --- thirdparty/cudf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/thirdparty/cudf b/thirdparty/cudf index fe612b3eaa..d8f49750c7 160000 --- a/thirdparty/cudf +++ b/thirdparty/cudf @@ -1 +1 @@ -Subproject commit fe612b3eaa30cd0cc6f0f49f99dce8785e0258f6 +Subproject commit d8f49750c76694c5093c22b415308ffc1ae1172f From 776990f2517ee817dda624b9a020ebb048582f1a Mon Sep 17 00:00:00 2001 From: Jenkins Automation <70000568+nvauto@users.noreply.github.com> Date: Thu, 7 Dec 2023 17:29:45 +0800 Subject: [PATCH 041/127] Update submodule cudf to f5dca59b0066427e3fa6e73570f4cd3b96fe3043 (#1627) Signed-off-by: spark-rapids automation <70000568+nvauto@users.noreply.github.com> --- thirdparty/cudf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/thirdparty/cudf b/thirdparty/cudf index d8f49750c7..f5dca59b00 160000 --- a/thirdparty/cudf +++ b/thirdparty/cudf @@ -1 +1 @@ -Subproject commit d8f49750c76694c5093c22b415308ffc1ae1172f +Subproject commit f5dca59b0066427e3fa6e73570f4cd3b96fe3043 From aeb0f5cb0075c0ad90395e242673a76610eb3bc7 Mon Sep 17 00:00:00 2001 From: Jenkins Automation <70000568+nvauto@users.noreply.github.com> Date: Thu, 7 Dec 2023 23:27:28 +0800 Subject: [PATCH 042/127] Update submodule cudf to a253826fbce0a81ee2b35f48174f002f66c228a6 (#1628) Signed-off-by: spark-rapids automation <70000568+nvauto@users.noreply.github.com> --- thirdparty/cudf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/thirdparty/cudf b/thirdparty/cudf index f5dca59b00..a253826fbc 160000 --- a/thirdparty/cudf +++ b/thirdparty/cudf @@ -1 +1 @@ -Subproject commit f5dca59b0066427e3fa6e73570f4cd3b96fe3043 +Subproject commit a253826fbce0a81ee2b35f48174f002f66c228a6 From 46472b8755a455e2aca92c41d9d599d6b883389f Mon Sep 17 00:00:00 2001 From: Jenkins Automation <70000568+nvauto@users.noreply.github.com> 
Date: Fri, 8 Dec 2023 05:25:08 +0800 Subject: [PATCH 043/127] Update submodule cudf to 6fc230ab8fd545ac1018664086af582fff2abd68 (#1629) Signed-off-by: spark-rapids automation <70000568+nvauto@users.noreply.github.com> --- thirdparty/cudf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/thirdparty/cudf b/thirdparty/cudf index a253826fbc..6fc230ab8f 160000 --- a/thirdparty/cudf +++ b/thirdparty/cudf @@ -1 +1 @@ -Subproject commit a253826fbce0a81ee2b35f48174f002f66c228a6 +Subproject commit 6fc230ab8fd545ac1018664086af582fff2abd68 From 844a336b31c17c9355f7c9c2f9639522c4bdca62 Mon Sep 17 00:00:00 2001 From: Jenkins Automation <70000568+nvauto@users.noreply.github.com> Date: Fri, 8 Dec 2023 11:24:36 +0800 Subject: [PATCH 044/127] Update submodule cudf to 248aa2c8873c12e41f1e6ea2660740a0a4ddaf68 (#1630) Signed-off-by: spark-rapids automation <70000568+nvauto@users.noreply.github.com> --- thirdparty/cudf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/thirdparty/cudf b/thirdparty/cudf index 6fc230ab8f..248aa2c887 160000 --- a/thirdparty/cudf +++ b/thirdparty/cudf @@ -1 +1 @@ -Subproject commit 6fc230ab8fd545ac1018664086af582fff2abd68 +Subproject commit 248aa2c8873c12e41f1e6ea2660740a0a4ddaf68 From 4c20e3ab78ce28e759ee57f6327049e9748fae6c Mon Sep 17 00:00:00 2001 From: Haoyang Li Date: Fri, 8 Dec 2023 13:23:08 +0800 Subject: [PATCH 045/127] Adding float to string kernel (#1508) * wip Signed-off-by: Haoyang Li * wip Signed-off-by: Haoyang Li * Add float to string kernel Signed-off-by: Haoyang Li * Update src/main/cpp/src/cast_float_to_string.cu Co-authored-by: Mike Wilson * Update src/main/cpp/src/cast_float_to_string.cu Co-authored-by: Mike Wilson * address comments and use different precision for float Signed-off-by: Haoyang Li * rewrite the solution with ryu Signed-off-by: Haoyang Li * update license Signed-off-by: Haoyang Li * clean up Signed-off-by: Haoyang Li * Split ftos_converter out Signed-off-by: Haoyang Li * clean up Signed-off-by: Haoyang Li * resolve cudf conflicts Signed-off-by: Haoyang Li * resolve cudf conflicts Signed-off-by: Haoyang Li * resolve cudf conflicts Signed-off-by: Haoyang Li * resolve cudf conflicts Signed-off-by: Haoyang Li * remove cudf changes Signed-off-by: Haoyang Li * remove cudf changes Signed-off-by: Haoyang Li * Add copyright and notice Signed-off-by: Haoyang Li * Fix copyrights and license Signed-off-by: Haoyang Li * cudf conflict resolve Signed-off-by: Haoyang Li * Add nv apache license to ftos_converter Signed-off-by: Haoyang Li * Update src/main/cpp/src/ftos_converter.cu Co-authored-by: Jason Lowe * address some comments Signed-off-by: Haoyang Li * cudf conflict Signed-off-by: Haoyang Li * Update src/main/cpp/src/cast_float_to_string.cu Co-authored-by: Nghia Truong <7416935+ttnghia@users.noreply.github.com> * addressed comments Signed-off-by: Haoyang Li * clang format Signed-off-by: Haoyang Li * Address comments Signed-off-by: Haoyang Li * Address comments Signed-off-by: Haoyang Li * sync Signed-off-by: Haoyang Li * address comments Signed-off-by: Haoyang Li --------- Signed-off-by: Haoyang Li Co-authored-by: Mike Wilson Co-authored-by: Jason Lowe Co-authored-by: Nghia Truong <7416935+ttnghia@users.noreply.github.com> --- NOTICE | 20 + src/main/cpp/CMakeLists.txt | 1 + src/main/cpp/src/CastStringJni.cpp | 20 +- src/main/cpp/src/cast_float_to_string.cu | 127 ++ src/main/cpp/src/cast_string.hpp | 7 +- src/main/cpp/src/ftos_converter.cuh | 1179 +++++++++++++++++ src/main/cpp/tests/CMakeLists.txt | 3 + 
src/main/cpp/tests/cast_float_to_string.cpp | 83 ++ .../nvidia/spark/rapids/jni/CastStrings.java | 13 +- 9 files changed, 1449 insertions(+), 4 deletions(-) create mode 100644 NOTICE create mode 100644 src/main/cpp/src/cast_float_to_string.cu create mode 100644 src/main/cpp/src/ftos_converter.cuh create mode 100644 src/main/cpp/tests/cast_float_to_string.cpp diff --git a/NOTICE b/NOTICE new file mode 100644 index 0000000000..a0975c00c8 --- /dev/null +++ b/NOTICE @@ -0,0 +1,20 @@ +RAPIDS Accelerator JNI For Apache Spark +Copyright (c) 2022-2023, NVIDIA CORPORATION + +-------------------------------------------------------------------------------- + +This project includes code from ryu (https://github.com/ulfjack/ryu). + +Copyright (2018) Ulf Adams and contributors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. \ No newline at end of file diff --git a/src/main/cpp/CMakeLists.txt b/src/main/cpp/CMakeLists.txt index 29b6a795a3..18c0cd12e8 100644 --- a/src/main/cpp/CMakeLists.txt +++ b/src/main/cpp/CMakeLists.txt @@ -164,6 +164,7 @@ add_library( src/ZOrderJni.cpp src/bloom_filter.cu src/cast_decimal_to_string.cu + src/cast_float_to_string.cu src/cast_string.cu src/cast_string_to_float.cu src/datetime_rebase.cu diff --git a/src/main/cpp/src/CastStringJni.cpp b/src/main/cpp/src/CastStringJni.cpp index d09bc33e4c..933fc15e34 100644 --- a/src/main/cpp/src/CastStringJni.cpp +++ b/src/main/cpp/src/CastStringJni.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022, NVIDIA CORPORATION. + * Copyright (c) 2022-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -109,6 +109,22 @@ JNIEXPORT jlong JNICALL Java_com_nvidia_spark_rapids_jni_CastStrings_toFloat( CATCH_CAST_EXCEPTION(env, 0); } +JNIEXPORT jlong JNICALL Java_com_nvidia_spark_rapids_jni_CastStrings_fromFloat(JNIEnv* env, + jclass, + jlong input_column) +{ + JNI_NULL_CHECK(env, input_column, "input column is null", 0); + + try { + cudf::jni::auto_set_device(env); + + auto const& cv = *reinterpret_cast(input_column); + return cudf::jni::release_as_jlong( + spark_rapids_jni::float_to_string(cv, cudf::get_default_stream())); + } + CATCH_CAST_EXCEPTION(env, 0); +} + JNIEXPORT jlong JNICALL Java_com_nvidia_spark_rapids_jni_CastStrings_fromDecimal(JNIEnv* env, jclass, jlong input_column) @@ -118,7 +134,7 @@ JNIEXPORT jlong JNICALL Java_com_nvidia_spark_rapids_jni_CastStrings_fromDecimal try { cudf::jni::auto_set_device(env); - cudf::column_view cv{*reinterpret_cast(input_column)}; + auto const& cv = *reinterpret_cast(input_column); return cudf::jni::release_as_jlong( spark_rapids_jni::decimal_to_non_ansi_string(cv, cudf::get_default_stream())); } diff --git a/src/main/cpp/src/cast_float_to_string.cu b/src/main/cpp/src/cast_float_to_string.cu new file mode 100644 index 0000000000..6fc4d20f79 --- /dev/null +++ b/src/main/cpp/src/cast_float_to_string.cu @@ -0,0 +1,127 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. 
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "cast_string.hpp"
+#include "ftos_converter.cuh"
+
+#include
+#include
+#include
+#include
+#include
+
+#include
+#include
+
+namespace spark_rapids_jni {
+
+namespace detail {
+namespace {
+
+template <typename FloatType>
+struct float_to_string_fn {
+  cudf::column_device_view d_floats;
+  cudf::size_type* d_offsets;
+  char* d_chars;
+
+  __device__ cudf::size_type compute_output_size(cudf::size_type idx) const
+  {
+    auto const value        = d_floats.element<FloatType>(idx);
+    bool constexpr is_float = std::is_same_v<FloatType, float>;
+    return static_cast<cudf::size_type>(
+      ftos_converter::compute_ftos_size(static_cast<double>(value), is_float));
+  }
+
+  __device__ void float_to_string(cudf::size_type idx) const
+  {
+    auto const value        = d_floats.element<FloatType>(idx);
+    bool constexpr is_float = std::is_same_v<FloatType, float>;
+    auto const output       = d_chars + d_offsets[idx];
+    ftos_converter::float_to_string(static_cast<double>(value), is_float, output);
+  }
+
+  __device__ void operator()(cudf::size_type idx) const
+  {
+    if (d_floats.is_null(idx)) {
+      if (d_chars == nullptr) { d_offsets[idx] = 0; }
+      return;
+    }
+    if (d_chars != nullptr) {
+      float_to_string(idx);
+    } else {
+      d_offsets[idx] = compute_output_size(idx);
+    }
+  }
+};
+
+/**
+ * @brief This dispatch method is for converting floats into strings.
+ *
+ * The template function declaration ensures only float types are allowed.
+ */
+struct dispatch_float_to_string_fn {
+  template <typename FloatType, CUDF_ENABLE_IF(std::is_floating_point_v<FloatType>)>
+  std::unique_ptr<cudf::column> operator()(cudf::column_view const& floats,
+                                           rmm::cuda_stream_view stream,
+                                           rmm::mr::device_memory_resource* mr)
+  {
+    auto const strings_count = floats.size();
+    if (strings_count == 0) { return cudf::make_empty_column(cudf::type_id::STRING); }
+
+    auto const input_ptr = cudf::column_device_view::create(floats, stream);
+
+    auto [offsets, chars] = cudf::strings::detail::make_strings_children(
+      float_to_string_fn<FloatType>{*input_ptr}, strings_count, stream, mr);
+
+    return make_strings_column(strings_count,
+                               std::move(offsets),
+                               std::move(chars),
+                               floats.null_count(),
+                               cudf::detail::copy_bitmask(floats, stream, mr));
+  }
+
+  // non-float types throw an exception
+  template <typename FloatType, CUDF_ENABLE_IF(not std::is_floating_point_v<FloatType>)>
+  std::unique_ptr<cudf::column> operator()(cudf::column_view const&,
+                                           rmm::cuda_stream_view,
+                                           rmm::mr::device_memory_resource*)
+  {
+    CUDF_FAIL("Values for float_to_string function must be a float type.");
+  }
+};
+
+}  // namespace
+
+// This will convert all float column types into a strings column.
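+// The formatting is intended to match Java's Float/Double.toString (e.g. 1.0f -> "1.0",
+// 1e7f -> "1.0E7"), which is what Spark produces when casting floating point values to
+// string; see the ryu attribution in NOTICE.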
+std::unique_ptr<cudf::column> float_to_string(cudf::column_view const& floats,
+                                              rmm::cuda_stream_view stream,
+                                              rmm::mr::device_memory_resource* mr)
+{
+  return type_dispatcher(floats.type(), dispatch_float_to_string_fn{}, floats, stream, mr);
+}
+
+}  // namespace detail
+
+// external API
+std::unique_ptr<cudf::column> float_to_string(cudf::column_view const& floats,
+                                              rmm::cuda_stream_view stream,
+                                              rmm::mr::device_memory_resource* mr)
+{
+  CUDF_FUNC_RANGE();
+  return detail::float_to_string(floats, stream, mr);
+}
+
+}  // namespace spark_rapids_jni
\ No newline at end of file
diff --git a/src/main/cpp/src/cast_string.hpp b/src/main/cpp/src/cast_string.hpp
index df74407355..c4f850b47f 100644
--- a/src/main/cpp/src/cast_string.hpp
+++ b/src/main/cpp/src/cast_string.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022, NVIDIA CORPORATION.
+ * Copyright (c) 2022-2023, NVIDIA CORPORATION.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -115,6 +115,11 @@ std::unique_ptr<cudf::column> string_to_float(
   rmm::cuda_stream_view stream,
   rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
 
+std::unique_ptr<cudf::column> float_to_string(
+  cudf::column_view const& input,
+  rmm::cuda_stream_view stream,
+  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+
 std::unique_ptr<cudf::column> decimal_to_non_ansi_string(
   cudf::column_view const& input,
   rmm::cuda_stream_view stream,
diff --git a/src/main/cpp/src/ftos_converter.cuh b/src/main/cpp/src/ftos_converter.cuh
new file mode 100644
index 0000000000..444f790d3c
--- /dev/null
+++ b/src/main/cpp/src/ftos_converter.cuh
@@ -0,0 +1,1179 @@
+/*
+ * Copyright 2018 Ulf Adams
+ * Copyright (c) 2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include
+#include
+#include
+#include
+#include
+
+namespace spark_rapids_jni::ftos_converter {
+
+namespace {
+
+// d2s.c from ryu
+// A floating decimal representing m * 10^e.
+typedef struct floating_decimal_64 {
+  uint64_t mantissa;
+  // Decimal exponent's range is -324 to 308
+  // inclusive, and can fit in a short if needed.
+  int32_t exponent;
+} floating_decimal_64;
+
+// f2s.c from ryu
+// A floating decimal representing m * 10^e.
+typedef struct floating_decimal_32 {
+  uint32_t mantissa;
+  // Decimal exponent's range is -45 to 38
+  // inclusive, and can fit in a short if needed.
+  int32_t exponent;
+} floating_decimal_32;
+
+//===== constants from ryu =====
+
+// These tables are generated by PrintDoubleLookupTable.
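+// (PrintDoubleLookupTable is a table generator in the upstream ryu project credited in
+// NOTICE; the constants below are taken from ryu's generated sources.)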
+constexpr unsigned int DOUBLE_POW5_INV_BITCOUNT = 125; +constexpr unsigned int DOUBLE_POW5_BITCOUNT = 125; +constexpr unsigned int FLOAT_POW5_INV_BITCOUNT = (DOUBLE_POW5_INV_BITCOUNT - 64); +constexpr unsigned int FLOAT_POW5_BITCOUNT = (DOUBLE_POW5_BITCOUNT - 64); +constexpr unsigned int DOUBLE_MANTISSA_BITS = 52; +constexpr unsigned int DOUBLE_EXPONENT_BITS = 11; +constexpr unsigned int DOUBLE_BIAS = 1023; +constexpr unsigned int FLOAT_MANTISSA_BITS = 23; +constexpr unsigned int FLOAT_EXPONENT_BITS = 8; +constexpr unsigned int FLOAT_BIAS = 127; + +__constant__ uint64_t const DOUBLE_POW5_INV_SPLIT2[15][2] = { + {1u, 2305843009213693952u}, + {5955668970331000884u, 1784059615882449851u}, + {8982663654677661702u, 1380349269358112757u}, + {7286864317269821294u, 2135987035920910082u}, + {7005857020398200553u, 1652639921975621497u}, + {17965325103354776697u, 1278668206209430417u}, + {8928596168509315048u, 1978643211784836272u}, + {10075671573058298858u, 1530901034580419511u}, + {597001226353042382u, 1184477304306571148u}, + {1527430471115325346u, 1832889850782397517u}, + {12533209867169019542u, 1418129833677084982u}, + {5577825024675947042u, 2194449627517475473u}, + {11006974540203867551u, 1697873161311732311u}, + {10313493231639821582u, 1313665730009899186u}, + {12701016819766672773u, 2032799256770390445u}}; + +__constant__ uint32_t const POW5_INV_OFFSETS[19] = {0x54544554, + 0x04055545, + 0x10041000, + 0x00400414, + 0x40010000, + 0x41155555, + 0x00000454, + 0x00010044, + 0x40000000, + 0x44000041, + 0x50454450, + 0x55550054, + 0x51655554, + 0x40004000, + 0x01000001, + 0x00010500, + 0x51515411, + 0x05555554, + 0x00000000}; + +__constant__ uint64_t const DOUBLE_POW5_SPLIT2[13][2] = { + {0u, 1152921504606846976u}, + {0u, 1490116119384765625u}, + {1032610780636961552u, 1925929944387235853u}, + {7910200175544436838u, 1244603055572228341u}, + {16941905809032713930u, 1608611746708759036u}, + {13024893955298202172u, 2079081953128979843u}, + {6607496772837067824u, 1343575221513417750u}, + {17332926989895652603u, 1736530273035216783u}, + {13037379183483547984u, 2244412773384604712u}, + {1605989338741628675u, 1450417759929778918u}, + {9630225068416591280u, 1874621017369538693u}, + {665883850346957067u, 1211445438634777304u}, + {14931890668723713708u, 1565756531257009982u}}; + +__constant__ uint32_t const POW5_OFFSETS[21] = { + 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x40000000, 0x59695995, 0x55545555, + 0x56555515, 0x41150504, 0x40555410, 0x44555145, 0x44504540, 0x45555550, 0x40004000, + 0x96440440, 0x55565565, 0x54454045, 0x40154151, 0x55559155, 0x51405555, 0x00000105}; + +constexpr uint32_t POW5_TABLE_SIZE = 26; + +__constant__ uint64_t const DOUBLE_POW5_TABLE[POW5_TABLE_SIZE] = { + 1ull, + 5ull, + 25ull, + 125ull, + 625ull, + 3125ull, + 15625ull, + 78125ull, + 390625ull, + 1953125ull, + 9765625ull, + 48828125ull, + 244140625ull, + 1220703125ull, + 6103515625ull, + 30517578125ull, + 152587890625ull, + 762939453125ull, + 3814697265625ull, + 19073486328125ull, + 95367431640625ull, + 476837158203125ull, + 2384185791015625ull, + 11920928955078125ull, + 59604644775390625ull, + 298023223876953125ull //, 1490116119384765625ull +}; + +//===== common.h from ryu ===== + +// Returns the number of decimal digits in v, which must not contain more than 9 digits. +__device__ inline uint32_t decimalLength9(uint32_t const v) +{ + // Function precondition: v is not a 10-digit number. + // (f2s: 9 digits are sufficient for round-tripping.) + // (d2fixed: We print 9-digit blocks.) 
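+  // e.g. decimalLength9(7) == 1, decimalLength9(42) == 2, decimalLength9(999999999) == 9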
+ assert(v < 1000000000); + if (v >= 100000000) { return 9; } + if (v >= 10000000) { return 8; } + if (v >= 1000000) { return 7; } + if (v >= 100000) { return 6; } + if (v >= 10000) { return 5; } + if (v >= 1000) { return 4; } + if (v >= 100) { return 3; } + if (v >= 10) { return 2; } + return 1; +} + +// Returns e == 0 ? 1 : ceil(log_2(5^e)); requires 0 <= e <= 3528. +__device__ inline int32_t pow5bits(int32_t const e) +{ + // This approximation works up to the point that the multiplication overflows at e = 3529. + // If the multiplication were done in 64 bits, it would fail at 5^4004 which is just greater + // than 2^9297. + assert(e >= 0); + assert(e <= 3528); + return (int32_t)(((((uint32_t)e) * 1217359) >> 19) + 1); +} + +// Returns floor(log_10(2^e)); requires 0 <= e <= 1650. +__device__ inline uint32_t log10Pow2(int32_t const e) +{ + // The first value this approximation fails for is 2^1651 which is just greater than 10^297. + assert(e >= 0); + assert(e <= 1650); + return (((uint32_t)e) * 78913) >> 18; +} + +// Returns floor(log_10(5^e)); requires 0 <= e <= 2620. +__device__ inline uint32_t log10Pow5(int32_t const e) +{ + // The first value this approximation fails for is 5^2621 which is just greater than 10^1832. + assert(e >= 0); + assert(e <= 2620); + return (((uint32_t)e) * 732923) >> 20; +} + +__device__ inline uint32_t pow5factor_32(uint32_t value) +{ + uint32_t count = 0; + for (;;) { + assert(value != 0); + uint32_t const q = value / 5; + uint32_t const r = value % 5; + if (r != 0) { break; } + value = q; + ++count; + } + return count; +} + +// Returns true if value is divisible by 5^p. +__device__ inline bool multipleOfPowerOf5_32(uint32_t const value, uint32_t const p) +{ + return pow5factor_32(value) >= p; +} + +// Returns true if value is divisible by 2^p. +__device__ inline bool multipleOfPowerOf2_32(uint32_t const value, uint32_t const p) +{ + // __builtin_ctz doesn't appear to be faster here. + return (value & ((1u << p) - 1)) == 0; +} + +// It seems to be slightly faster to avoid uint128_t here, although the +// generated code for uint128_t looks slightly nicer. +__device__ inline uint32_t mulShift32(uint32_t const m, uint64_t const factor, int32_t const shift) +{ + assert(shift > 32); + + // The casts here help MSVC to avoid calls to the __allmul library + // function. 
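+  // Computes (m * factor) >> shift with two 32x32 -> 64-bit partial products; dropping the
+  // low 32 bits of bits0 before the final shift is exact because shift > 32.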
+ uint32_t const factorLo = (uint32_t)(factor); + uint32_t const factorHi = (uint32_t)(factor >> 32); + uint64_t const bits0 = (uint64_t)m * factorLo; + uint64_t const bits1 = (uint64_t)m * factorHi; + + uint64_t const sum = (bits0 >> 32) + bits1; + uint64_t const shiftedSum = sum >> (shift - 32); + assert(shiftedSum <= UINT32_MAX); + return (uint32_t)shiftedSum; +} + +__device__ inline int copy_special_str(char* const result, + bool const sign, + bool const exponent, + bool const mantissa) +{ + if (mantissa) { + memcpy(result, "NaN", 3); + return 3; + } + if (sign) { result[0] = '-'; } + if (exponent) { + memcpy(result + sign, "Infinity", 8); + return sign + 8; + } + memcpy(result + sign, "0.0", 3); + return sign + 3; +} + +__device__ inline int special_str_size(bool const sign, bool const exponent, bool const mantissa) +{ + if (mantissa) { return 3; } + if (exponent) { return sign + 8; } + return sign + 3; +} + +__device__ inline uint32_t float_to_bits(float const f) +{ + uint32_t bits = 0; + memcpy(&bits, &f, sizeof(float)); + return bits; +} + +__device__ inline uint64_t double_to_bits(double const d) +{ + uint64_t bits = 0; + memcpy(&bits, &d, sizeof(double)); + return bits; +} + +//===== d2s_intrinsics.h from ryu ===== + +__device__ inline uint64_t umul128(uint64_t const a, uint64_t const b, uint64_t* const productHi) +{ + // The casts here help MSVC to avoid calls to the __allmul library function. + uint32_t const aLo = (uint32_t)a; + uint32_t const aHi = (uint32_t)(a >> 32); + uint32_t const bLo = (uint32_t)b; + uint32_t const bHi = (uint32_t)(b >> 32); + + uint64_t const b00 = (uint64_t)aLo * bLo; + uint64_t const b01 = (uint64_t)aLo * bHi; + uint64_t const b10 = (uint64_t)aHi * bLo; + uint64_t const b11 = (uint64_t)aHi * bHi; + + uint32_t const b00Lo = (uint32_t)b00; + uint32_t const b00Hi = (uint32_t)(b00 >> 32); + + uint64_t const mid1 = b10 + b00Hi; + uint32_t const mid1Lo = (uint32_t)(mid1); + uint32_t const mid1Hi = (uint32_t)(mid1 >> 32); + + uint64_t const mid2 = b01 + mid1Lo; + uint32_t const mid2Lo = (uint32_t)(mid2); + uint32_t const mid2Hi = (uint32_t)(mid2 >> 32); + + uint64_t const pHi = b11 + mid1Hi + mid2Hi; + uint64_t const pLo = ((uint64_t)mid2Lo << 32) | b00Lo; + + *productHi = pHi; + return pLo; +} + +__device__ inline uint64_t shiftright128(uint64_t const lo, uint64_t const hi, uint32_t const dist) +{ + // We don't need to handle the case dist >= 64 here (see above). + assert(dist < 64); + assert(dist > 0); + return (hi << (64 - dist)) | (lo >> dist); +} + +__device__ inline uint64_t div5(uint64_t const x) { return x / 5; } + +__device__ inline uint64_t div10(uint64_t const x) { return x / 10; } + +__device__ inline uint64_t div100(uint64_t const x) { return x / 100; } + +__device__ inline uint32_t pow5Factor(uint64_t value) +{ + uint64_t const m_inv_5 = 14757395258967641293u; // 5 * m_inv_5 = 1 (mod 2^64) + uint64_t const n_div_5 = 3689348814741910323u; // #{ n | n = 0 (mod 2^64) } = 2^64 / 5 + uint32_t count = 0; + for (;;) { + assert(value != 0); + value *= m_inv_5; + if (value > n_div_5) break; + ++count; + } + return count; +} + +// Returns true if value is divisible by 5^p. +__device__ inline bool multipleOfPowerOf5(uint64_t const value, uint32_t const p) +{ + // I tried a case distinction on p, but there was no performance difference. + return pow5Factor(value) >= p; +} + +// Returns true if value is divisible by 2^p. 
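+// e.g. multipleOfPowerOf2(40, 3) is true (40 == 5 * 2^3), multipleOfPowerOf2(40, 4) is false.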
+__device__ inline bool multipleOfPowerOf2(uint64_t const value, uint32_t const p) +{ + assert(value != 0); + assert(p < 64); + // __builtin_ctzll doesn't appear to be faster here. + return (value & ((1ull << p) - 1)) == 0; +} + +__device__ inline uint64_t mulShift64(uint64_t const m, uint64_t const* const mul, int32_t const j) +{ + // m is maximum 55 bits + uint64_t high1; // 128 + uint64_t const low1 = umul128(m, mul[1], &high1); // 64 + uint64_t high0; // 64 + umul128(m, mul[0], &high0); // 0 + uint64_t const sum = high0 + low1; + if (sum < high0) { + ++high1; // overflow into high1 + } + return shiftright128(sum, high1, j - 64); +} + +__device__ inline uint64_t mulShiftAll64(uint64_t const m, + uint64_t const* const mul, + int32_t const j, + uint64_t* const vp, + uint64_t* const vm, + uint32_t const mmShift) +{ + *vp = mulShift64(4 * m + 2, mul, j); + *vm = mulShift64(4 * m - 1 - mmShift, mul, j); + return mulShift64(4 * m, mul, j); +} + +//===== d2s_small_table.h from ryu ===== + +// Computes 5^i in the form required by Ryu, and stores it in the given pointer. +__device__ inline void double_computePow5(uint32_t const i, uint64_t* const result) +{ + uint32_t const base = i / POW5_TABLE_SIZE; + uint32_t const base2 = base * POW5_TABLE_SIZE; + uint32_t const offset = i - base2; + uint64_t const* const mul = DOUBLE_POW5_SPLIT2[base]; + if (offset == 0) { + result[0] = mul[0]; + result[1] = mul[1]; + return; + } + uint64_t const m = DOUBLE_POW5_TABLE[offset]; + uint64_t high1; + uint64_t const low1 = umul128(m, mul[1], &high1); + uint64_t high0; + uint64_t const low0 = umul128(m, mul[0], &high0); + uint64_t const sum = high0 + low1; + if (sum < high0) { + ++high1; // overflow into high1 + } + // high1 | sum | low0 + uint32_t const delta = pow5bits(i) - pow5bits(base2); + result[0] = shiftright128(low0, sum, delta) + ((POW5_OFFSETS[i / 16] >> ((i % 16) << 1)) & 3); + result[1] = shiftright128(sum, high1, delta); +} + +// Computes 5^-i in the form required by Ryu, and stores it in the given pointer. +__device__ inline void double_computeInvPow5(uint32_t const i, uint64_t* const result) +{ + uint32_t const base = (i + POW5_TABLE_SIZE - 1) / POW5_TABLE_SIZE; + uint32_t const base2 = base * POW5_TABLE_SIZE; + uint32_t const offset = base2 - i; + uint64_t const* const mul = DOUBLE_POW5_INV_SPLIT2[base]; // 1/5^base2 + if (offset == 0) { + result[0] = mul[0]; + result[1] = mul[1]; + return; + } + uint64_t const m = DOUBLE_POW5_TABLE[offset]; + uint64_t high1; + uint64_t const low1 = umul128(m, mul[1], &high1); + uint64_t high0; + uint64_t const low0 = umul128(m, mul[0] - 1, &high0); + uint64_t const sum = high0 + low1; + if (sum < high0) { + ++high1; // overflow into high1 + } + // high1 | sum | low0 + uint32_t const delta = pow5bits(base2) - pow5bits(i); + result[0] = + shiftright128(low0, sum, delta) + 1 + ((POW5_INV_OFFSETS[i / 16] >> ((i % 16) << 1)) & 3); + result[1] = shiftright128(sum, high1, delta); +} + +//===== f2s_intrinsics.h from ryu ===== + +__device__ inline uint32_t mulPow5InvDivPow2(uint32_t const m, uint32_t const q, int32_t const j) +{ + // The inverse multipliers are defined as [2^x / 5^y] + 1; the upper 64 bits from the double + // lookup table are the correct bits for [2^x / 5^y], so we have to add 1 here. Note that we rely + // on the fact that the added 1 that's already stored in the table never overflows into the upper + // 64 bits. 
+ uint64_t pow5[2]; + double_computeInvPow5(q, pow5); + return mulShift32(m, pow5[1] + 1, j); +} + +__device__ inline uint32_t mulPow5divPow2(uint32_t const m, uint32_t const i, int32_t const j) +{ + uint64_t pow5[2]; + double_computePow5(i, pow5); + return mulShift32(m, pow5[1], j); +} + +//===== d2s.c and f2s.c from ryu ===== + +__device__ inline uint32_t decimalLength17(uint64_t const v) +{ + // This is slightly faster than a loop. + // The average output length is 16.38 digits, so we check high-to-low. + // Function precondition: v is not an 18, 19, or 20-digit number. + // (17 digits are sufficient for round-tripping.) + assert(v < 100000000000000000L); + if (v >= 10000000000000000L) { return 17; } + if (v >= 1000000000000000L) { return 16; } + if (v >= 100000000000000L) { return 15; } + if (v >= 10000000000000L) { return 14; } + if (v >= 1000000000000L) { return 13; } + if (v >= 100000000000L) { return 12; } + if (v >= 10000000000L) { return 11; } + if (v >= 1000000000L) { return 10; } + if (v >= 100000000L) { return 9; } + if (v >= 10000000L) { return 8; } + if (v >= 1000000L) { return 7; } + if (v >= 100000L) { return 6; } + if (v >= 10000L) { return 5; } + if (v >= 1000L) { return 4; } + if (v >= 100L) { return 3; } + if (v >= 10L) { return 2; } + return 1; +} + +__device__ inline floating_decimal_64 d2d(uint64_t const ieeeMantissa, uint32_t const ieeeExponent) +{ + int32_t e2; + uint64_t m2; + if (ieeeExponent == 0) { + // We subtract 2 so that the bounds computation has 2 additional bits. + e2 = 1 - DOUBLE_BIAS - DOUBLE_MANTISSA_BITS - 2; + m2 = ieeeMantissa; + } else { + e2 = (int32_t)ieeeExponent - DOUBLE_BIAS - DOUBLE_MANTISSA_BITS - 2; + m2 = (1ull << DOUBLE_MANTISSA_BITS) | ieeeMantissa; + } + bool const even = (m2 & 1) == 0; + bool const acceptBounds = even; + + // Step 2: Determine the interval of valid decimal representations. + uint64_t const mv = 4 * m2; + // Implicit bool -> int conversion. True is 1, false is 0. + uint32_t const mmShift = ieeeMantissa != 0 || ieeeExponent <= 1; + // We would compute mp and mm like this: + // uint64_t mp = 4 * m2 + 2; + // uint64_t mm = mv - 1 - mmShift; + + // Step 3: Convert to a decimal power base using 128-bit arithmetic. + uint64_t vr, vp, vm; + int32_t e10; + bool vmIsTrailingZeros = false; + bool vrIsTrailingZeros = false; + if (e2 >= 0) { + // I tried special-casing q == 0, but there was no effect on performance. + // This expression is slightly faster than max(0, log10Pow2(e2) - 1). + uint32_t const q = log10Pow2(e2) - (e2 > 3); + e10 = (int32_t)q; + int32_t const k = DOUBLE_POW5_INV_BITCOUNT + pow5bits((int32_t)q) - 1; + int32_t const i = -e2 + (int32_t)q + k; + uint64_t pow5[2]; + double_computeInvPow5(q, pow5); + vr = mulShiftAll64(m2, pow5, i, &vp, &vm, mmShift); + + if (q <= 21) { + // This should use q <= 22, but I think 21 is also safe. Smaller values + // may still be safe, but it's more difficult to reason about them. + // Only one of mp, mv, and mm can be a multiple of 5, if any. + uint32_t const mvMod5 = ((uint32_t)mv) - 5 * ((uint32_t)div5(mv)); + if (mvMod5 == 0) { + vrIsTrailingZeros = multipleOfPowerOf5(mv, q); + } else if (acceptBounds) { + // Same as min(e2 + (~mm & 1), pow5Factor(mm)) >= q + // <=> e2 + (~mm & 1) >= q && pow5Factor(mm) >= q + // <=> true && pow5Factor(mm) >= q, since e2 >= q. + vmIsTrailingZeros = multipleOfPowerOf5(mv - 1 - mmShift, q); + } else { + // Same as min(e2 + 1, pow5Factor(mp)) >= q. 
+ vp -= multipleOfPowerOf5(mv + 2, q); + } + } + } else { + // This expression is slightly faster than max(0, log10Pow5(-e2) - 1). + uint32_t const q = log10Pow5(-e2) - (-e2 > 1); + e10 = (int32_t)q + e2; + int32_t const i = -e2 - (int32_t)q; + int32_t const k = pow5bits(i) - DOUBLE_POW5_BITCOUNT; + int32_t const j = (int32_t)q - k; + + uint64_t pow5[2]; + double_computePow5(i, pow5); + vr = mulShiftAll64(m2, pow5, j, &vp, &vm, mmShift); + + if (q <= 1) { + // {vr,vp,vm} is trailing zeros if {mv,mp,mm} has at least q trailing 0 bits. + // mv = 4 * m2, so it always has at least two trailing 0 bits. + vrIsTrailingZeros = true; + if (acceptBounds) { + // mm = mv - 1 - mmShift, so it has 1 trailing 0 bit iff mmShift == 1. + vmIsTrailingZeros = mmShift == 1; + } else { + // mp = mv + 2, so it always has at least one trailing 0 bit. + --vp; + } + } else if (q < 63) { // TODO(ulfjack): Use a tighter bound here. + // We want to know if the full product has at least q trailing zeros. + // We need to compute min(p2(mv), p5(mv) - e2) >= q + // <=> p2(mv) >= q && p5(mv) - e2 >= q + // <=> p2(mv) >= q (because -e2 >= q) + vrIsTrailingZeros = multipleOfPowerOf2(mv, q); + } + } + + // Step 4: Find the shortest decimal representation in the interval of valid representations. + int32_t removed = 0; + uint8_t lastRemovedDigit = 0; + uint64_t output; + // On average, we remove ~2 digits. + if (vmIsTrailingZeros || vrIsTrailingZeros) { + // General case, which happens rarely (~0.7%). + for (;;) { + uint64_t const vpDiv10 = div10(vp); + uint64_t const vmDiv10 = div10(vm); + if (vpDiv10 <= vmDiv10) { break; } + uint32_t const vmMod10 = ((uint32_t)vm) - 10 * ((uint32_t)vmDiv10); + uint64_t const vrDiv10 = div10(vr); + uint32_t const vrMod10 = ((uint32_t)vr) - 10 * ((uint32_t)vrDiv10); + vmIsTrailingZeros &= vmMod10 == 0; + vrIsTrailingZeros &= lastRemovedDigit == 0; + lastRemovedDigit = (uint8_t)vrMod10; + vr = vrDiv10; + vp = vpDiv10; + vm = vmDiv10; + ++removed; + } + + if (vmIsTrailingZeros) { + for (;;) { + uint64_t const vmDiv10 = div10(vm); + uint32_t const vmMod10 = ((uint32_t)vm) - 10 * ((uint32_t)vmDiv10); + if (vmMod10 != 0) { break; } + uint64_t const vpDiv10 = div10(vp); + uint64_t const vrDiv10 = div10(vr); + uint32_t const vrMod10 = ((uint32_t)vr) - 10 * ((uint32_t)vrDiv10); + vrIsTrailingZeros &= lastRemovedDigit == 0; + lastRemovedDigit = (uint8_t)vrMod10; + vr = vrDiv10; + vp = vpDiv10; + vm = vmDiv10; + ++removed; + } + } + + if (vrIsTrailingZeros && lastRemovedDigit == 5 && vr % 2 == 0) { + // Round even if the exact number is .....50..0. + lastRemovedDigit = 4; + } + // We need to take vr + 1 if vr is outside bounds or we need to round up. + output = vr + ((vr == vm && (!acceptBounds || !vmIsTrailingZeros)) || lastRemovedDigit >= 5); + } else { + // Specialized for the common case (~99.3%). Percentages below are relative to this. + bool roundUp = false; + uint64_t const vpDiv100 = div100(vp); + uint64_t const vmDiv100 = div100(vm); + if (vpDiv100 > vmDiv100) { // Optimization: remove two digits at a time (~86.2%). 
+ uint64_t const vrDiv100 = div100(vr); + uint32_t const vrMod100 = ((uint32_t)vr) - 100 * ((uint32_t)vrDiv100); + roundUp = vrMod100 >= 50; + vr = vrDiv100; + vp = vpDiv100; + vm = vmDiv100; + removed += 2; + } + // Loop iterations below (approximately), without optimization above: + // 0: 0.03%, 1: 13.8%, 2: 70.6%, 3: 14.0%, 4: 1.40%, 5: 0.14%, 6+: 0.02% + // Loop iterations below (approximately), with optimization above: + // 0: 70.6%, 1: 27.8%, 2: 1.40%, 3: 0.14%, 4+: 0.02% + for (;;) { + uint64_t const vpDiv10 = div10(vp); + uint64_t const vmDiv10 = div10(vm); + if (vpDiv10 <= vmDiv10) { break; } + uint64_t const vrDiv10 = div10(vr); + uint32_t const vrMod10 = ((uint32_t)vr) - 10 * ((uint32_t)vrDiv10); + roundUp = vrMod10 >= 5; + vr = vrDiv10; + vp = vpDiv10; + vm = vmDiv10; + ++removed; + } + + // We need to take vr + 1 if vr is outside bounds or we need to round up. + output = vr + (vr == vm || roundUp); + } + int32_t const exp = e10 + removed; + + floating_decimal_64 fd; + fd.exponent = exp; + fd.mantissa = output; + return fd; +} + +__device__ inline floating_decimal_32 f2d(uint32_t const ieeeMantissa, uint32_t const ieeeExponent) +{ + int32_t e2; + uint32_t m2; + if (ieeeExponent == 0) { + // We subtract 2 so that the bounds computation has 2 additional bits. + e2 = 1 - FLOAT_BIAS - FLOAT_MANTISSA_BITS - 2; + m2 = ieeeMantissa; + } else { + e2 = (int32_t)ieeeExponent - FLOAT_BIAS - FLOAT_MANTISSA_BITS - 2; + m2 = (1u << FLOAT_MANTISSA_BITS) | ieeeMantissa; + } + bool const even = (m2 & 1) == 0; + bool const acceptBounds = even; + + // Step 2: Determine the interval of valid decimal representations. + uint32_t const mv = 4 * m2; + uint32_t const mp = 4 * m2 + 2; + // Implicit bool -> int conversion. True is 1, false is 0. + uint32_t const mmShift = ieeeMantissa != 0 || ieeeExponent <= 1; + uint32_t const mm = 4 * m2 - 1 - mmShift; + + // Step 3: Convert to a decimal power base using 64-bit arithmetic. + uint32_t vr, vp, vm; + int32_t e10; + bool vmIsTrailingZeros = false; + bool vrIsTrailingZeros = false; + uint8_t lastRemovedDigit = 0; + if (e2 >= 0) { + uint32_t const q = log10Pow2(e2); + e10 = (int32_t)q; + int32_t const k = FLOAT_POW5_INV_BITCOUNT + pow5bits((int32_t)q) - 1; + int32_t const i = -e2 + (int32_t)q + k; + vr = mulPow5InvDivPow2(mv, q, i); + vp = mulPow5InvDivPow2(mp, q, i); + vm = mulPow5InvDivPow2(mm, q, i); + if (q != 0 && (vp - 1) / 10 <= vm / 10) { + // We need to know one removed digit even if we are not going to loop below. We could use + // q = X - 1 above, except that would require 33 bits for the result, and we've found that + // 32-bit arithmetic is faster even on 64-bit machines. + int32_t const l = FLOAT_POW5_INV_BITCOUNT + pow5bits((int32_t)(q - 1)) - 1; + lastRemovedDigit = (uint8_t)(mulPow5InvDivPow2(mv, q - 1, -e2 + (int32_t)q - 1 + l) % 10); + } + if (q <= 9) { + // The largest power of 5 that fits in 24 bits is 5^10, but q <= 9 seems to be safe as well. + // Only one of mp, mv, and mm can be a multiple of 5, if any. 
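+      // (mm, mv, and mp all fall within the five consecutive integers mv-2..mv+2, which
+      //  contain exactly one multiple of 5, so at most one of the three can match.)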
+ if (mv % 5 == 0) { + vrIsTrailingZeros = multipleOfPowerOf5_32(mv, q); + } else if (acceptBounds) { + vmIsTrailingZeros = multipleOfPowerOf5_32(mm, q); + } else { + vp -= multipleOfPowerOf5_32(mp, q); + } + } + } else { + uint32_t const q = log10Pow5(-e2); + e10 = (int32_t)q + e2; + int32_t const i = -e2 - (int32_t)q; + int32_t const k = pow5bits(i) - FLOAT_POW5_BITCOUNT; + int32_t j = (int32_t)q - k; + vr = mulPow5divPow2(mv, (uint32_t)i, j); + vp = mulPow5divPow2(mp, (uint32_t)i, j); + vm = mulPow5divPow2(mm, (uint32_t)i, j); + if (q != 0 && (vp - 1) / 10 <= vm / 10) { + j = (int32_t)q - 1 - (pow5bits(i + 1) - FLOAT_POW5_BITCOUNT); + lastRemovedDigit = (uint8_t)(mulPow5divPow2(mv, (uint32_t)(i + 1), j) % 10); + } + if (q <= 1) { + // {vr,vp,vm} is trailing zeros if {mv,mp,mm} has at least q trailing 0 bits. + // mv = 4 * m2, so it always has at least two trailing 0 bits. + vrIsTrailingZeros = true; + if (acceptBounds) { + // mm = mv - 1 - mmShift, so it has 1 trailing 0 bit iff mmShift == 1. + vmIsTrailingZeros = mmShift == 1; + } else { + // mp = mv + 2, so it always has at least one trailing 0 bit. + --vp; + } + } else if (q < 31) { // TODO(ulfjack): Use a tighter bound here. + vrIsTrailingZeros = multipleOfPowerOf2_32(mv, q - 1); + } + } + + // Step 4: Find the shortest decimal representation in the interval of valid representations. + int32_t removed = 0; + uint32_t output; + if (vmIsTrailingZeros || vrIsTrailingZeros) { + // General case, which happens rarely (~4.0%). + while (vp / 10 > vm / 10) { + vmIsTrailingZeros &= vm % 10 == 0; + vrIsTrailingZeros &= lastRemovedDigit == 0; + lastRemovedDigit = (uint8_t)(vr % 10); + vr /= 10; + vp /= 10; + vm /= 10; + ++removed; + } + if (vmIsTrailingZeros) { + while (vm % 10 == 0) { + vrIsTrailingZeros &= lastRemovedDigit == 0; + lastRemovedDigit = (uint8_t)(vr % 10); + vr /= 10; + vp /= 10; + vm /= 10; + ++removed; + } + } + if (vrIsTrailingZeros && lastRemovedDigit == 5 && vr % 2 == 0) { + // Round even if the exact number is .....50..0. + lastRemovedDigit = 4; + } + // We need to take vr + 1 if vr is outside bounds or we need to round up. + output = vr + ((vr == vm && (!acceptBounds || !vmIsTrailingZeros)) || lastRemovedDigit >= 5); + } else { + // Specialized for the common case (~96.0%). Percentages below are relative to this. + // Loop iterations below (approximately): + // 0: 13.6%, 1: 70.7%, 2: 14.1%, 3: 1.39%, 4: 0.14%, 5+: 0.01% + while (vp / 10 > vm / 10) { + lastRemovedDigit = (uint8_t)(vr % 10); + vr /= 10; + vp /= 10; + vm /= 10; + ++removed; + } + // We need to take vr + 1 if vr is outside bounds or we need to round up. + output = vr + (vr == vm || lastRemovedDigit >= 5); + } + int32_t const exp = e10 + removed; + + floating_decimal_32 fd; + fd.exponent = exp; + fd.mantissa = output; + return fd; +} + +__device__ inline int to_chars(floating_decimal_64 const v, bool const sign, char* const result) +{ + // Step 5: Print the decimal representation. + int index = 0; + if (sign) { result[index++] = '-'; } + + uint64_t output = v.mantissa; + uint32_t const olength = decimalLength17(output); + int32_t exp = v.exponent + (int32_t)olength - 1; + bool scientificNotation = (exp < -3) || (exp >= 7); + + // Values in the interval [1E-3, 1E7) are special. + if (scientificNotation) { + // Print in the format x.xxxxxE-yy. 
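+    // (For example, mantissa 12345 with exponent -8 prints as "1.2345E-4", since
+    //  exp = -8 + 5 - 1 = -4.)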
+ for (uint32_t i = 0; i < olength - 1; ++i) { + uint32_t const c = output % 10; + output /= 10; + result[index + olength - i] = (char)('0' + c); + } + result[index] = '0' + output % 10; + result[index + 1] = '.'; + index += olength + 1; + if (olength == 1) { result[index++] = '0'; } + // Print 'E', the exponent sign, and the exponent, which has at most three digits. + result[index++] = 'E'; + if (exp < 0) { + result[index++] = '-'; + exp = -exp; + } + if (exp >= 100) { + result[index++] = (char)('0' + exp / 100); + exp %= 100; + result[index++] = (char)('0' + exp / 10); + } else if (exp >= 10) { + result[index++] = (char)('0' + exp / 10); + } + result[index++] = (char)('0' + exp % 10); + } else { + // Otherwise follow the Java spec for values in the interval [1E-3, 1E7). + if (exp < 0) { + // Decimal dot is before any of the digits. + result[index++] = '0'; + result[index++] = '.'; + for (int i = -1; i > exp; i--) { + result[index++] = '0'; + } + int current = index; + for (int i = 0; i < olength; i++) { + result[current + olength - i - 1] = (char)('0' + output % 10); + output /= 10; + index++; + } + } else if (exp + 1 >= olength) { + // Decimal dot is after any of the digits. + for (int i = 0; i < olength; i++) { + result[index + olength - i - 1] = (char)('0' + output % 10); + output /= 10; + } + index += olength; + for (int i = olength; i < exp + 1; i++) { + result[index++] = '0'; + } + result[index++] = '.'; + result[index++] = '0'; + } else { + // Decimal dot is somewhere between the digits. + int current = index + 1; + for (int i = 0; i < olength; i++) { + if (olength - i - 1 == exp) { + result[current + olength - i - 1] = '.'; + current--; + } + result[current + olength - i - 1] = (char)('0' + output % 10); + output /= 10; + } + index += olength + 1; + } + } + return index; +} + +__device__ inline int d2s_size(floating_decimal_64 const v, bool const sign) +{ + int index = 0; + if (sign) { index++; } + + uint64_t output = v.mantissa; + uint32_t const olength = decimalLength17(output); + int32_t exp = v.exponent + (int32_t)olength - 1; + bool scientificNotation = (exp < -3) || (exp >= 7); + + if (scientificNotation) { + index += olength + 1; + if (olength == 1) { index++; } + // 'E' + index++; + if (exp < 0) { + exp = -exp; + index++; + } + if (exp >= 100) { + index += 3; + } else if (exp >= 10) { + index += 2; + } else { + index++; + } + } else { + // Otherwise follow the Java spec for values in the interval [1E-3, 1E7). + if (exp < 0) { + index += 1 - exp + olength; + } else if (exp + 1 >= olength) { + index += exp + 3; + } else { + index += olength + 1; + } + } + return index; +} + +__device__ inline int to_chars(floating_decimal_32 const v, bool const sign, char* const result) +{ + // Step 5: Print the decimal representation. + int index = 0; + if (sign) { result[index++] = '-'; } + + uint32_t output = v.mantissa; + uint32_t const olength = decimalLength9(output); + int32_t exp = v.exponent + olength - 1; + bool scientificNotation = (exp < -3) || (exp >= 7); + + if (scientificNotation) { + // Print in the format x.xxxxxE-yy. + for (int i = 0; i < olength - 1; i++) { + int c = output % 10; + output /= 10; + result[index + olength - i] = (char)('0' + c); + } + result[index] = (char)('0' + output % 10); + result[index + 1] = '.'; + index += olength + 1; + if (olength == 1) { result[index++] = '0'; } + + // Print 'E', the exponent sign, and the exponent, which has at most two digits. 
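+    // (Finite floats span roughly 1.4E-45 to 3.4E38, so two exponent digits always suffice.)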
+ result[index++] = 'E'; + if (exp < 0) { + result[index++] = '-'; + exp = -exp; + } + if (exp >= 10) { result[index++] = (char)('0' + exp / 10); } + result[index++] = (char)('0' + exp % 10); + } else { + // Otherwise follow the Java spec for values in the interval [1E-3, 1E7). + if (exp < 0) { + // Decimal dot is before any of the digits. + result[index++] = '0'; + result[index++] = '.'; + for (int i = -1; i > exp; i--) { + result[index++] = '0'; + } + int current = index; + for (int i = 0; i < olength; i++) { + result[current + olength - i - 1] = (char)('0' + output % 10); + output /= 10; + index++; + } + } else if (exp + 1 >= olength) { + // Decimal dot is after any of the digits. + for (int i = 0; i < olength; i++) { + result[index + olength - i - 1] = (char)('0' + output % 10); + output /= 10; + } + index += olength; + for (int i = olength; i < exp + 1; i++) { + result[index++] = '0'; + } + result[index++] = '.'; + result[index++] = '0'; + } else { + // Decimal dot is somewhere between the digits. + int current = index + 1; + for (int i = 0; i < olength; i++) { + if (olength - i - 1 == exp) { + result[current + olength - i - 1] = '.'; + current--; + } + result[current + olength - i - 1] = (char)('0' + output % 10); + output /= 10; + } + index += olength + 1; + } + } + return index; +} + +__device__ inline int f2s_size(floating_decimal_32 const v, bool const sign) +{ + // Step 5: Print the decimal representation. + int index = 0; + if (sign) { index++; } + + uint32_t output = v.mantissa; + uint32_t const olength = decimalLength9(output); + int32_t exp = v.exponent + olength - 1; + bool scientificNotation = (exp < -3) || (exp >= 7); + + if (scientificNotation) { + index += olength + 1; + if (olength == 1) { index++; } + // 'E' + index++; + if (exp < 0) { + index++; + exp = -exp; + } + if (exp >= 10) { index++; } + index++; + } else { + // Otherwise follow the Java spec for values in the interval [1E-3, 1E7). + if (exp < 0) { + // Decimal dot is before any of the digits. + index += 1 - exp + olength; + } else if (exp + 1 >= olength) { + // Decimal dot is after any of the digits. + index += exp + 3; + } else { + // Decimal dot is somewhere between the digits. + index += olength + 1; + } + } + return index; +} + +__device__ inline bool d2d_small_int(uint64_t const ieeeMantissa, + uint32_t const ieeeExponent, + floating_decimal_64* const v) +{ + uint64_t const m2 = (1ull << DOUBLE_MANTISSA_BITS) | ieeeMantissa; + int32_t const e2 = (int32_t)ieeeExponent - DOUBLE_BIAS - DOUBLE_MANTISSA_BITS; + + if (e2 > 0) { + // f = m2 * 2^e2 >= 2^53 is an integer. + // Ignore this case for now. + return false; + } + + if (e2 < -52) { + // f < 1. + return false; + } + + // Since 2^52 <= m2 < 2^53 and 0 <= -e2 <= 52: 1 <= f = m2 / 2^-e2 < 2^53. + // Test if the lower -e2 bits of the significand are 0, i.e. whether the fraction is 0. + uint64_t const mask = (1ull << -e2) - 1; + uint64_t const fraction = m2 & mask; + if (fraction != 0) { return false; } + + // f is an integer in the range [1, 2^53). + // Note: mantissa might contain trailing (decimal) 0's. + // Note: since 2^53 < 10^16, there is no need to adjust decimalLength17(). + v->mantissa = m2 >> -e2; + v->exponent = 0; + return true; +} + +__device__ floating_decimal_64 d2d(double f, bool& ieeeSign, bool& special) +{ + // Step 1: Decode the floating-point number, and unify normalized and subnormal cases. + uint64_t const bits = double_to_bits(f); + + // Decode bits into sign, mantissa, and exponent. 
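+  // (IEEE 754 binary64 layout, high bit to low: 1 sign bit, 11 exponent bits, 52 mantissa bits.)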
+ ieeeSign = ((bits >> (DOUBLE_MANTISSA_BITS + DOUBLE_EXPONENT_BITS)) & 1) != 0; + uint64_t const ieeeMantissa = bits & ((1ull << DOUBLE_MANTISSA_BITS) - 1); + uint32_t const ieeeExponent = + (uint32_t)((bits >> DOUBLE_MANTISSA_BITS) & ((1u << DOUBLE_EXPONENT_BITS) - 1)); + // Case distinction; exit early for the easy cases. + if (ieeeExponent == ((1u << DOUBLE_EXPONENT_BITS) - 1u) || + (ieeeExponent == 0 && ieeeMantissa == 0)) { + special = true; + return floating_decimal_64{ieeeMantissa, (int32_t)ieeeExponent}; + } + special = false; + floating_decimal_64 v; + bool const isSmallInt = d2d_small_int(ieeeMantissa, ieeeExponent, &v); + if (isSmallInt) { + // For small integers in the range [1, 2^53), v.mantissa might contain trailing (decimal) zeros. + // For scientific notation we need to move these zeros into the exponent. + // (This is not needed for fixed-point notation, so it might be beneficial to trim + // trailing zeros in to_chars only if needed - once fixed-point notation output is implemented.) + for (;;) { + uint64_t const q = div10(v.mantissa); + uint32_t const r = ((uint32_t)v.mantissa) - 10 * ((uint32_t)q); + if (r != 0) { break; } + v.mantissa = q; + ++v.exponent; + } + } else { + v = d2d(ieeeMantissa, ieeeExponent); + } + return v; +} + +__device__ int d2s_buffered_n(double f, char* result) +{ + bool sign = false, special = false; + floating_decimal_64 v = d2d(f, sign, special); + if (special) { return copy_special_str(result, sign, v.exponent, v.mantissa); } + return to_chars(v, sign, result); +} + +__device__ floating_decimal_32 f2d(float f, bool& ieeeSign, bool& special) +{ + // Step 1: Decode the floating-point number, and unify normalized and subnormal cases. + uint32_t const bits = float_to_bits(f); + + // Decode bits into sign, mantissa, and exponent. + ieeeSign = ((bits >> (FLOAT_MANTISSA_BITS + FLOAT_EXPONENT_BITS)) & 1) != 0; + uint32_t const ieeeMantissa = bits & ((1u << FLOAT_MANTISSA_BITS) - 1); + uint32_t const ieeeExponent = (bits >> FLOAT_MANTISSA_BITS) & ((1u << FLOAT_EXPONENT_BITS) - 1); + + // Case distinction; exit early for the easy cases. 
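+  // (An all-ones exponent encodes Inf/NaN; a zero exponent with a zero mantissa encodes +/-0.)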
+ if (ieeeExponent == ((1u << FLOAT_EXPONENT_BITS) - 1u) || + (ieeeExponent == 0 && ieeeMantissa == 0)) { + special = true; + return floating_decimal_32{ieeeMantissa, (int32_t)ieeeExponent}; + } + special = false; + return f2d(ieeeMantissa, ieeeExponent); +} + +__device__ int f2s_buffered_n(float f, char* result) +{ + bool sign = false, special = false; + floating_decimal_32 v = f2d(f, sign, special); + if (special) { return copy_special_str(result, sign, v.exponent, v.mantissa); } + return to_chars(v, sign, result); +} + +//===== compute float to string size ===== + +__device__ int compute_d2s_size(double value) +{ + bool sign = false, special = false; + floating_decimal_64 v = d2d(value, sign, special); + if (special) { return special_str_size(sign, v.exponent, v.mantissa); } + return d2s_size(v, sign); +} + +__device__ int compute_f2s_size(float value) +{ + bool sign = false, special = false; + floating_decimal_32 v = f2d(value, sign, special); + if (special) { return special_str_size(sign, v.exponent, v.mantissa); } + return f2s_size(v, sign); +} + +} // namespace + +//===== APIs ===== + +__device__ int compute_ftos_size(double value, bool is_float) +{ + if (is_float) { + return compute_f2s_size(value); + } else { + return compute_d2s_size(value); + } +} + +__device__ int float_to_string(double value, bool is_float, char* output) +{ + if (is_float) { + return f2s_buffered_n(value, output); + } else { + return d2s_buffered_n(value, output); + } +} + +} // namespace spark_rapids_jni::ftos_converter diff --git a/src/main/cpp/tests/CMakeLists.txt b/src/main/cpp/tests/CMakeLists.txt index 5e16398145..c9bb13046f 100644 --- a/src/main/cpp/tests/CMakeLists.txt +++ b/src/main/cpp/tests/CMakeLists.txt @@ -51,6 +51,9 @@ ConfigureTest(CAST_STRING ConfigureTest(CAST_DECIMAL_TO_STRING cast_decimal_to_string.cpp) +ConfigureTest(CAST_FLOAT_TO_STRING + cast_float_to_string.cpp) + ConfigureTest(DATETIME_REBASE datetime_rebase.cpp) diff --git a/src/main/cpp/tests/cast_float_to_string.cpp b/src/main/cpp/tests/cast_float_to_string.cpp new file mode 100644 index 0000000000..1ae066fe42 --- /dev/null +++ b/src/main/cpp/tests/cast_float_to_string.cpp @@ -0,0 +1,83 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+
+#include <cast_string.hpp>
+
+#include <cudf_test/base_fixture.hpp>
+#include <cudf_test/column_wrapper.hpp>
+
+#include <cudf_test/column_utilities.hpp>
+
+#include <limits>
+
+using namespace cudf;
+
+constexpr cudf::test::debug_output_level verbosity{cudf::test::debug_output_level::FIRST_ERROR};
+
+struct FloatToStringTests : public cudf::test::BaseFixture {};
+
+TEST_F(FloatToStringTests, FromFloats32)
+{
+  auto const floats =
+    cudf::test::fixed_width_column_wrapper<float>{100.0f,
+                                                  654321.25f,
+                                                  -12761.125f,
+                                                  0.f,
+                                                  5.0f,
+                                                  -4.0f,
+                                                  std::numeric_limits<float>::quiet_NaN(),
+                                                  123456789012.34f,
+                                                  -0.0f};
+
+  auto results = spark_rapids_jni::float_to_string(floats, cudf::get_default_stream());
+
+  auto const expected = cudf::test::strings_column_wrapper{
+    "100.0", "654321.25", "-12761.125", "0.0", "5.0", "-4.0", "NaN", "1.2345679E11", "-0.0"};
+
+  CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected, verbosity);
+}
+
+TEST_F(FloatToStringTests, FromFloats64)
+{
+  auto const floats =
+    cudf::test::fixed_width_column_wrapper<double>{100.0d,
+                                                   654321.25d,
+                                                   -12761.125d,
+                                                   1.123456789123456789d,
+                                                   0.000000000000000000123456789123456789d,
+                                                   0.0d,
+                                                   5.0d,
+                                                   -4.0d,
+                                                   std::numeric_limits<double>::quiet_NaN(),
+                                                   839542223232.794248339d,
+                                                   -0.0d};
+
+  auto results = spark_rapids_jni::float_to_string(floats, cudf::get_default_stream());
+
+  auto const expected = cudf::test::strings_column_wrapper{"100.0",
+                                                           "654321.25",
+                                                           "-12761.125",
+                                                           "1.1234567891234568",
+                                                           "1.234567891234568E-19",
+                                                           "0.0",
+                                                           "5.0",
+                                                           "-4.0",
+                                                           "NaN",
+                                                           "8.395422232327942E11",
+                                                           "-0.0"};
+
+  CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected, verbosity);
+}
\ No newline at end of file

diff --git a/src/main/java/com/nvidia/spark/rapids/jni/CastStrings.java b/src/main/java/com/nvidia/spark/rapids/jni/CastStrings.java
index eab42c41f6..022cb93085 100644
--- a/src/main/java/com/nvidia/spark/rapids/jni/CastStrings.java
+++ b/src/main/java/com/nvidia/spark/rapids/jni/CastStrings.java
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022, NVIDIA CORPORATION.
+ * Copyright (c) 2022-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -80,6 +80,16 @@ public static ColumnVector toDecimal(ColumnView cv, boolean ansiMode, boolean st
     return new ColumnVector(toDecimal(cv.getNativeView(), ansiMode, strip, precision, scale));
   }
 
+  /**
+   * Convert a float column to a string column.
+   *
+   * @param cv the column data to process
+   * @return the converted column
+   */
+  public static ColumnVector fromFloat(ColumnView cv) {
+    return new ColumnVector(fromFloat(cv.getNativeView()));
+  }
+
   /**
    * Convert a decimal column to a string column.
* @@ -137,6 +147,7 @@ private static native long toDecimal(long nativeColumnView, boolean ansi_enabled int precision, int scale); private static native long toFloat(long nativeColumnView, boolean ansi_enabled, int dtype); private static native long fromDecimal(long nativeColumnView); + private static native long fromFloat(long nativeColumnView); private static native long toIntegersWithBase(long nativeColumnView, int base, boolean ansiEnabled, int dtype); private static native long fromIntegersWithBase(long nativeColumnView, int base); From b51fde8fdf3e8e52199044a9e3b02f8a1f3a085b Mon Sep 17 00:00:00 2001 From: Jenkins Automation <70000568+nvauto@users.noreply.github.com> Date: Fri, 8 Dec 2023 23:27:08 +0800 Subject: [PATCH 046/127] Update submodule cudf to dee47c7b55b3adb6c4f7545699112ddd6240441f (#1631) Signed-off-by: spark-rapids automation <70000568+nvauto@users.noreply.github.com> --- thirdparty/cudf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/thirdparty/cudf b/thirdparty/cudf index 248aa2c887..dee47c7b55 160000 --- a/thirdparty/cudf +++ b/thirdparty/cudf @@ -1 +1 @@ -Subproject commit 248aa2c8873c12e41f1e6ea2660740a0a4ddaf68 +Subproject commit dee47c7b55b3adb6c4f7545699112ddd6240441f From 4e67dd9ddea7783f8c18d4648b447656c15e9a56 Mon Sep 17 00:00:00 2001 From: Jenkins Automation <70000568+nvauto@users.noreply.github.com> Date: Sat, 9 Dec 2023 11:24:00 +0800 Subject: [PATCH 047/127] Update submodule cudf to 899e3923b4a25078f1274ef1ba85cc5ef90552d6 (#1632) Signed-off-by: spark-rapids automation <70000568+nvauto@users.noreply.github.com> --- thirdparty/cudf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/thirdparty/cudf b/thirdparty/cudf index dee47c7b55..899e3923b4 160000 --- a/thirdparty/cudf +++ b/thirdparty/cudf @@ -1 +1 @@ -Subproject commit dee47c7b55b3adb6c4f7545699112ddd6240441f +Subproject commit 899e3923b4a25078f1274ef1ba85cc5ef90552d6 From 2b4bc70bb78cb3164fe7dfd5dfd157553e1fa21e Mon Sep 17 00:00:00 2001 From: spark-rapids automation <70000568+nvauto@users.noreply.github.com> Date: Mon, 11 Dec 2023 02:27:36 +0000 Subject: [PATCH 048/127] Auto-merge use submodule in BASE ref Signed-off-by: spark-rapids automation <70000568+nvauto@users.noreply.github.com> --- thirdparty/cudf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/thirdparty/cudf b/thirdparty/cudf index 2ce46216b5..899e3923b4 160000 --- a/thirdparty/cudf +++ b/thirdparty/cudf @@ -1 +1 @@ -Subproject commit 2ce46216b5bc9926aec438b64fc490c31c526a31 +Subproject commit 899e3923b4a25078f1274ef1ba85cc5ef90552d6 From ba7c3ad2015e7ebb874bfe5a67b74dbb0e1f170d Mon Sep 17 00:00:00 2001 From: Jenkins Automation <70000568+nvauto@users.noreply.github.com> Date: Mon, 11 Dec 2023 17:27:15 +0800 Subject: [PATCH 049/127] Update submodule cudf to 759a1c867fda8b207154f024b63de89701b2dad6 (#1636) Signed-off-by: spark-rapids automation <70000568+nvauto@users.noreply.github.com> --- thirdparty/cudf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/thirdparty/cudf b/thirdparty/cudf index 899e3923b4..759a1c867f 160000 --- a/thirdparty/cudf +++ b/thirdparty/cudf @@ -1 +1 @@ -Subproject commit 899e3923b4a25078f1274ef1ba85cc5ef90552d6 +Subproject commit 759a1c867fda8b207154f024b63de89701b2dad6 From 9e88a5711c80df203148d0bc99f303e11534fa7a Mon Sep 17 00:00:00 2001 From: Jenkins Automation <70000568+nvauto@users.noreply.github.com> Date: Tue, 12 Dec 2023 05:31:56 +0800 Subject: [PATCH 050/127] Update submodule cudf to fcaebeba50c0f6ea1c98491ab232a7cd9e018c71 (#1638) 
Signed-off-by: spark-rapids automation <70000568+nvauto@users.noreply.github.com> --- thirdparty/cudf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/thirdparty/cudf b/thirdparty/cudf index 759a1c867f..fcaebeba50 160000 --- a/thirdparty/cudf +++ b/thirdparty/cudf @@ -1 +1 @@ -Subproject commit 759a1c867fda8b207154f024b63de89701b2dad6 +Subproject commit fcaebeba50c0f6ea1c98491ab232a7cd9e018c71 From feca82aa2461b6704a5e669535962848c055f133 Mon Sep 17 00:00:00 2001 From: Jenkins Automation <70000568+nvauto@users.noreply.github.com> Date: Tue, 12 Dec 2023 11:26:31 +0800 Subject: [PATCH 051/127] Update submodule cudf to 1c6f80dc630d3a18e216812d4d6bd912995971d0 (#1640) Signed-off-by: spark-rapids automation <70000568+nvauto@users.noreply.github.com> --- thirdparty/cudf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/thirdparty/cudf b/thirdparty/cudf index fcaebeba50..1c6f80dc63 160000 --- a/thirdparty/cudf +++ b/thirdparty/cudf @@ -1 +1 @@ -Subproject commit fcaebeba50c0f6ea1c98491ab232a7cd9e018c71 +Subproject commit 1c6f80dc630d3a18e216812d4d6bd912995971d0 From abfc5415acf68827623c49c8836865cc962e80ea Mon Sep 17 00:00:00 2001 From: Jenkins Automation <70000568+nvauto@users.noreply.github.com> Date: Tue, 12 Dec 2023 17:28:34 +0800 Subject: [PATCH 052/127] Update submodule cudf to f8e891fc551ff691ac62c6d4067cb1867ea6213c (#1641) Signed-off-by: spark-rapids automation <70000568+nvauto@users.noreply.github.com> --- thirdparty/cudf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/thirdparty/cudf b/thirdparty/cudf index 1c6f80dc63..f8e891fc55 160000 --- a/thirdparty/cudf +++ b/thirdparty/cudf @@ -1 +1 @@ -Subproject commit 1c6f80dc630d3a18e216812d4d6bd912995971d0 +Subproject commit f8e891fc551ff691ac62c6d4067cb1867ea6213c From c95098c7a740d53c73b1f487ba5e318075cbae44 Mon Sep 17 00:00:00 2001 From: Jenkins Automation <70000568+nvauto@users.noreply.github.com> Date: Tue, 12 Dec 2023 23:30:11 +0800 Subject: [PATCH 053/127] Update submodule cudf to ef11061911aa9ef77cf615fea042a2bfa6f6cdea (#1642) Signed-off-by: spark-rapids automation <70000568+nvauto@users.noreply.github.com> --- thirdparty/cudf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/thirdparty/cudf b/thirdparty/cudf index f8e891fc55..ef11061911 160000 --- a/thirdparty/cudf +++ b/thirdparty/cudf @@ -1 +1 @@ -Subproject commit f8e891fc551ff691ac62c6d4067cb1867ea6213c +Subproject commit ef11061911aa9ef77cf615fea042a2bfa6f6cdea From f6bb2052af9d3bce60d087c758c76b6d77b9936f Mon Sep 17 00:00:00 2001 From: Jenkins Automation <70000568+nvauto@users.noreply.github.com> Date: Wed, 13 Dec 2023 06:07:54 +0800 Subject: [PATCH 054/127] Update submodule cudf to 21c90d6a264ee4334084f513b34425d5fdd032f8 (#1644) Signed-off-by: spark-rapids automation <70000568+nvauto@users.noreply.github.com> --- thirdparty/cudf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/thirdparty/cudf b/thirdparty/cudf index ef11061911..21c90d6a26 160000 --- a/thirdparty/cudf +++ b/thirdparty/cudf @@ -1 +1 @@ -Subproject commit ef11061911aa9ef77cf615fea042a2bfa6f6cdea +Subproject commit 21c90d6a264ee4334084f513b34425d5fdd032f8 From 6f049a7bf3a2ec990766840f0b723d5e7d750efb Mon Sep 17 00:00:00 2001 From: Jenkins Automation <70000568+nvauto@users.noreply.github.com> Date: Wed, 13 Dec 2023 12:04:11 +0800 Subject: [PATCH 055/127] Update submodule cudf to 06984380b724d30565c8da40c6512ea62ba4a64f (#1645) Signed-off-by: spark-rapids automation <70000568+nvauto@users.noreply.github.com> --- thirdparty/cudf | 
2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/thirdparty/cudf b/thirdparty/cudf
index 21c90d6a26..06984380b7 160000
--- a/thirdparty/cudf
+++ b/thirdparty/cudf
@@ -1 +1 @@
-Subproject commit 21c90d6a264ee4334084f513b34425d5fdd032f8
+Subproject commit 06984380b724d30565c8da40c6512ea62ba4a64f

From 7340b17448ca140a996de5b4160de793db6626e1 Mon Sep 17 00:00:00 2001
From: Jenkins Automation <70000568+nvauto@users.noreply.github.com>
Date: Wed, 13 Dec 2023 18:04:22 +0800
Subject: [PATCH 056/127] Update submodule cudf to
 420dc5d787d4571c00266364f1a253e5ccffb094 (#1646)

Signed-off-by: spark-rapids automation <70000568+nvauto@users.noreply.github.com>
---
 thirdparty/cudf | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/thirdparty/cudf b/thirdparty/cudf
index 06984380b7..420dc5d787 160000
--- a/thirdparty/cudf
+++ b/thirdparty/cudf
@@ -1 +1 @@
-Subproject commit 06984380b724d30565c8da40c6512ea62ba4a64f
+Subproject commit 420dc5d787d4571c00266364f1a253e5ccffb094

From 6fc60a7ef8f7f480b218390e6209d7c203d713fc Mon Sep 17 00:00:00 2001
From: Jenkins Automation <70000568+nvauto@users.noreply.github.com>
Date: Thu, 14 Dec 2023 00:03:43 +0800
Subject: [PATCH 057/127] Update submodule cudf to
 a894ca03b18bd0304180f97882ccaaffa18028a0 (#1647)

Signed-off-by: spark-rapids automation <70000568+nvauto@users.noreply.github.com>
---
 thirdparty/cudf | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/thirdparty/cudf b/thirdparty/cudf
index 420dc5d787..a894ca03b1 160000
--- a/thirdparty/cudf
+++ b/thirdparty/cudf
@@ -1 +1 @@
-Subproject commit 420dc5d787d4571c00266364f1a253e5ccffb094
+Subproject commit a894ca03b18bd0304180f97882ccaaffa18028a0

From 5466b8fc6c07280a531519ed7de3584176f130e4 Mon Sep 17 00:00:00 2001
From: Gera Shegalov
Date: Wed, 13 Dec 2023 10:02:32 -0800
Subject: [PATCH 058/127] Make OOM injections separable by host & device
 (#1637)

This PR makes it possible to inject:
- host OOM only
- device OOM only
- in addition to the current mixed OOM injections

It also enables deferring OOM to the N+1st allocation by skipping the first N.

---------

Signed-off-by: Gera Shegalov
---
 src/main/cpp/src/SparkResourceAdaptorJni.cpp  | 108 +++++++++++++-----
 .../com/nvidia/spark/rapids/jni/RmmSpark.java |  29 ++++-
 .../rapids/jni/SparkResourceAdaptor.java      |  27 ++++-
 3 files changed, 123 insertions(+), 41 deletions(-)

diff --git a/src/main/cpp/src/SparkResourceAdaptorJni.cpp b/src/main/cpp/src/SparkResourceAdaptorJni.cpp
index d3821fcc18..b8fb337bf2 100644
--- a/src/main/cpp/src/SparkResourceAdaptorJni.cpp
+++ b/src/main/cpp/src/SparkResourceAdaptorJni.cpp
@@ -226,6 +226,36 @@ struct task_metrics {
   }
 };
 
+enum class oom_type {
+  CPU_OR_GPU = 0,
+  CPU,
+  GPU,
+};
+
+struct oom_state_type {
+  int hit_count   = 0;
+  int skip_count  = 0;
+  oom_type filter = oom_type::CPU_OR_GPU;
+
+  void init(int const num_ooms, int const skip_count, int const oom_type_id)
+  {
+    if (num_ooms < 0) { throw std::invalid_argument("num_ooms cannot be negative"); }
+    if (skip_count < 0) { throw std::invalid_argument("skip_count cannot be negative"); }
+    if (oom_type_id < 0 || oom_type_id > 2) {
+      throw std::invalid_argument("oom_filter must be between 0 and 2");
+    }
+    this->hit_count  = num_ooms;
+    this->skip_count = skip_count;
+    this->filter     = static_cast<oom_type>(oom_type_id);
+  }
+
+  bool matches(bool is_for_cpu)
+  {
+    return filter == oom_type::CPU_OR_GPU || (is_for_cpu && filter == oom_type::CPU) ||
+           ((!is_for_cpu) && filter == oom_type::GPU);
+  }
+};
+
 /**
 * This is the full state of a thread. Some things like the thread_id and task_id
 * should not change after the state is set up. Everything else is up for change,
@@ -249,10 +279,12 @@ class full_thread_state {
   std::unordered_set<long> pool_task_ids;
   bool is_cpu_alloc = false;
   // Is the thread transitively blocked on a pool or not.
-  bool pool_blocked = false;
-  int retry_oom_injected = 0;
-  int split_and_retry_oom_injected = 0;
-  int cudf_exception_injected = 0;
+  bool pool_blocked = false;
+
+  oom_state_type retry_oom;
+  oom_state_type split_and_retry_oom;
+
+  int cudf_exception_injected = 0;
   // watchdog limit on maximum number of retries to avoid unexpected live lock situations
   int num_times_retried = 0;
   // When did the retry time for this thread start, or when did the block time end.
@@ -663,12 +695,15 @@ class spark_resource_adaptor final : public rmm::mr::device_memory_resource {
    * Force a specific thread to throw one or more RetryOOM exceptions when an
    * alloc is called. This is intended only for testing.
    */
-  void force_retry_oom(long const thread_id, int const num_ooms)
+  void force_retry_oom(long const thread_id,
+                       int const num_ooms,
+                       int const oom_filter,
+                       int const skip_count)
   {
     std::unique_lock<std::mutex> lock(state_mutex);
     auto const threads_at = threads.find(thread_id);
     if (threads_at != threads.end()) {
-      threads_at->second.retry_oom_injected = num_ooms;
+      threads_at->second.retry_oom.init(num_ooms, skip_count, oom_filter);
     } else {
       throw std::invalid_argument("the thread is not associated with any task/shuffle");
     }
@@ -678,12 +713,15 @@ class spark_resource_adaptor final : public rmm::mr::device_memory_resource {
    * Force a specific thread to throw one or more SplitAndRetryOOM exceptions
    * when an alloc is called. This is intended only for testing.
    */
-  void force_split_and_retry_oom(long const thread_id, int const num_ooms)
+  void force_split_and_retry_oom(long const thread_id,
+                                 int const num_ooms,
+                                 int const oom_filter,
+                                 int const skip_count)
   {
     std::unique_lock<std::mutex> lock(state_mutex);
     auto const threads_at = threads.find(thread_id);
     if (threads_at != threads.end()) {
-      threads_at->second.split_and_retry_oom_injected = num_ooms;
+      threads_at->second.split_and_retry_oom.init(num_ooms, skip_count, oom_filter);
     } else {
       throw std::invalid_argument("the thread is not associated with any task/shuffle");
     }
@@ -1228,15 +1266,18 @@ class spark_resource_adaptor final : public rmm::mr::device_memory_resource {
       default: break;
     }
 
-    if (thread->second.retry_oom_injected > 0) {
-      thread->second.retry_oom_injected--;
-      thread->second.metrics.num_times_retry_throw++;
-      log_status("INJECTED_RETRY_OOM", thread_id, thread->second.task_id, thread->second.state);
-      thread->second.record_failed_retry_time();
-      if (is_for_cpu) {
-        throw_java_exception(CPU_RETRY_OOM_CLASS, "injected RetryOOM");
-      } else {
-        throw_java_exception(GPU_RETRY_OOM_CLASS, "injected RetryOOM");
+    if (thread->second.retry_oom.matches(is_for_cpu)) {
+      if (thread->second.retry_oom.skip_count > 0) {
+        thread->second.retry_oom.skip_count--;
+      } else if (thread->second.retry_oom.hit_count > 0) {
+        thread->second.retry_oom.hit_count--;
+        thread->second.metrics.num_times_retry_throw++;
+        std::string const op_prefix = "INJECTED_RETRY_OOM_";
+        std::string const op        = op_prefix + (is_for_cpu ? "CPU" : "GPU");
+        log_status(op, thread_id, thread->second.task_id, thread->second.state);
+        thread->second.record_failed_retry_time();
+        throw_java_exception(is_for_cpu ? CPU_RETRY_OOM_CLASS : GPU_RETRY_OOM_CLASS,
+                             "injected RetryOOM");
+      }
     }
 
@@ -1248,16 +1289,21 @@ class spark_resource_adaptor final : public rmm::mr::device_memory_resource {
       throw_java_exception(cudf::jni::CUDF_ERROR_CLASS, "injected CudfException");
     }
 
-    if (thread->second.split_and_retry_oom_injected > 0) {
-      thread->second.split_and_retry_oom_injected--;
-      thread->second.metrics.num_times_split_retry_throw++;
-      log_status(
-        "INJECTED_SPLIT_AND_RETRY_OOM", thread_id, thread->second.task_id, thread->second.state);
-      thread->second.record_failed_retry_time();
-      if (is_for_cpu) {
-        throw_java_exception(CPU_SPLIT_AND_RETRY_OOM_CLASS, "injected SplitAndRetryOOM");
-      } else {
-        throw_java_exception(GPU_SPLIT_AND_RETRY_OOM_CLASS, "injected SplitAndRetryOOM");
+    if (thread->second.split_and_retry_oom.matches(is_for_cpu)) {
+      if (thread->second.split_and_retry_oom.skip_count > 0) {
+        thread->second.split_and_retry_oom.skip_count--;
+      } else if (thread->second.split_and_retry_oom.hit_count > 0) {
+        thread->second.split_and_retry_oom.hit_count--;
+        thread->second.metrics.num_times_split_retry_throw++;
+        std::string const op_prefix = "INJECTED_SPLIT_AND_RETRY_OOM_";
+        std::string const op        = op_prefix + (is_for_cpu ? "CPU" : "GPU");
+        log_status(op, thread_id, thread->second.task_id, thread->second.state);
+        thread->second.record_failed_retry_time();
+        if (is_for_cpu) {
+          throw_java_exception(CPU_SPLIT_AND_RETRY_OOM_CLASS, "injected SplitAndRetryOOM");
+        } else {
+          throw_java_exception(GPU_SPLIT_AND_RETRY_OOM_CLASS, "injected SplitAndRetryOOM");
+        }
       }
     }
 
@@ -1927,25 +1973,25 @@ JNIEXPORT void JNICALL Java_com_nvidia_spark_rapids_jni_SparkResourceAdaptor_don
 }
 
 JNIEXPORT void JNICALL Java_com_nvidia_spark_rapids_jni_SparkResourceAdaptor_forceRetryOOM(
-  JNIEnv* env, jclass, jlong ptr, jlong thread_id, jint num_ooms)
+  JNIEnv* env, jclass, jlong ptr, jlong thread_id, jint num_ooms, jint oom_filter, jint skip_count)
 {
   JNI_NULL_CHECK(env, ptr, "resource_adaptor is null", );
   try {
     cudf::jni::auto_set_device(env);
     auto mr = reinterpret_cast<spark_resource_adaptor*>(ptr);
-    mr->force_retry_oom(thread_id, num_ooms);
+    mr->force_retry_oom(thread_id, num_ooms, oom_filter, skip_count);
   }
   CATCH_STD(env, )
 }
 
JNIEXPORT void JNICALL Java_com_nvidia_spark_rapids_jni_SparkResourceAdaptor_forceSplitAndRetryOOM(
-  JNIEnv* env, jclass, jlong ptr, jlong thread_id, jint num_ooms)
+  JNIEnv* env, jclass, jlong ptr, jlong thread_id, jint num_ooms, jint oom_filter, jint skip_count)
 {
   JNI_NULL_CHECK(env, ptr, "resource_adaptor is null", );
   try {
     cudf::jni::auto_set_device(env);
     auto mr = reinterpret_cast<spark_resource_adaptor*>(ptr);
-    mr->force_split_and_retry_oom(thread_id, num_ooms);
+    mr->force_split_and_retry_oom(thread_id, num_ooms, oom_filter, skip_count);
   }
   CATCH_STD(env, )
 }

diff --git a/src/main/java/com/nvidia/spark/rapids/jni/RmmSpark.java b/src/main/java/com/nvidia/spark/rapids/jni/RmmSpark.java
index 558124e2fe..e171894601 100644
--- a/src/main/java/com/nvidia/spark/rapids/jni/RmmSpark.java
+++ b/src/main/java/com/nvidia/spark/rapids/jni/RmmSpark.java
@@ -23,11 +23,20 @@
 import ai.rapids.cudf.RmmException;
 import ai.rapids.cudf.RmmTrackingResourceAdaptor;
 
+import java.util.Arrays;
+import java.util.Map;
+
 /**
  * Initialize RMM in ways that are specific to Spark.
  */
 public class RmmSpark {
 
+  public enum OomInjectionType {
+    CPU_OR_GPU,
+    CPU,
+    GPU;
+  }
+
   private static volatile SparkResourceAdaptor sra = null;
 
   /**
@@ -432,17 +441,23 @@ public static void forceRetryOOM(long threadId) {
    * allocation attempt, depending on the type of allocation being done.
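+   * For example, {@code forceRetryOOM(threadId, 1, OomInjectionType.GPU.ordinal(), 2)} skips
+   * the first two GPU allocations on the thread and then injects a GpuRetryOOM on the third.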
   * @param threadId the ID of the thread to throw the exception (not java thread id).
   * @param numOOMs the number of times the *RetryOOM should be thrown
+   * @param oomMode the ordinal of the OomInjectionType used to filter which allocations match
+   * @param skipCount how many matching allocations to skip
   */
-  public static void forceRetryOOM(long threadId, int numOOMs) {
+  public static void forceRetryOOM(long threadId, int numOOMs, int oomMode, int skipCount) {
    synchronized (Rmm.class) {
      if (sra != null && sra.isOpen()) {
-        sra.forceRetryOOM(threadId, numOOMs);
+        sra.forceRetryOOM(threadId, numOOMs, oomMode, skipCount);
      } else {
        throw new IllegalStateException("RMM has not been configured for OOM injection");
      }
    }
  }
 
+  public static void forceRetryOOM(long threadId, int numOOMs) {
+    forceRetryOOM(threadId, numOOMs, OomInjectionType.CPU_OR_GPU.ordinal(), 0);
+  }
+
   /**
    * Force the thread with the given ID to throw a GpuSplitAndRetryOOM or CpuSplitAndRetryOOM
    * on their next allocation attempt, depending on the allocation being done.
    * @param threadId the ID of the thread to throw the exception (not java thread id).
@@ -457,17 +472,23 @@ public static void forceSplitAndRetryOOM(long threadId) {
    * on their next allocation attempt, depending on the allocation being done.
    * @param threadId the ID of the thread to throw the exception (not java thread id).
    * @param numOOMs the number of times the *SplitAndRetryOOM should be thrown
+   * @param oomMode the ordinal of the OomInjectionType used to filter which allocations match
+   * @param skipCount how many matching allocations to skip
   */
-  public static void forceSplitAndRetryOOM(long threadId, int numOOMs) {
+  public static void forceSplitAndRetryOOM(long threadId, int numOOMs, int oomMode, int skipCount) {
    synchronized (Rmm.class) {
      if (sra != null && sra.isOpen()) {
-        sra.forceSplitAndRetryOOM(threadId, numOOMs);
+        sra.forceSplitAndRetryOOM(threadId, numOOMs, oomMode, skipCount);
      } else {
        throw new IllegalStateException("RMM has not been configured for OOM injection");
      }
    }
  }
 
+  public static void forceSplitAndRetryOOM(long threadId, int numOOMs) {
+    forceSplitAndRetryOOM(threadId, numOOMs, OomInjectionType.CPU_OR_GPU.ordinal(), 0);
+  }
+
   /**
    * Force the thread with the given ID to throw a CudfException on their next allocation attempt.
    * This is to simulate a cuDF exception being thrown from a kernel and test retry handling code.

diff --git a/src/main/java/com/nvidia/spark/rapids/jni/SparkResourceAdaptor.java b/src/main/java/com/nvidia/spark/rapids/jni/SparkResourceAdaptor.java
index 74f1946748..d766c34230 100644
--- a/src/main/java/com/nvidia/spark/rapids/jni/SparkResourceAdaptor.java
+++ b/src/main/java/com/nvidia/spark/rapids/jni/SparkResourceAdaptor.java
@@ -15,6 +15,8 @@
  */
 package com.nvidia.spark.rapids.jni;
 
+import com.nvidia.spark.rapids.jni.RmmSpark.OomInjectionType;
+
 import ai.rapids.cudf.NativeDepsLoader;
 import ai.rapids.cudf.RmmDeviceMemoryResource;
 import ai.rapids.cudf.RmmEventHandlerResourceAdaptor;
@@ -186,18 +188,31 @@ public void doneWaitingOnPool(long threadId) {
    * Force the thread with the given ID to throw a GpuRetryOOM on their next allocation attempt.
    * @param threadId the ID of the thread to throw the exception (not java thread id).
* @param numOOMs the number of times the GpuRetryOOM should be thrown + * @param oomMode ordinal of the corresponding RmmSpark.OomInjectionType + * @param skipCount the number of times a matching allocation is skipped before injecting the first OOM */ - public void forceRetryOOM(long threadId, int numOOMs) { - forceRetryOOM(getHandle(), threadId, numOOMs); + public void forceRetryOOM(long threadId, int numOOMs, int oomMode, int skipCount) { + validateOOMInjectionParams(numOOMs, oomMode, skipCount); + forceRetryOOM(getHandle(), threadId, numOOMs, oomMode, skipCount); + } + + private void validateOOMInjectionParams(int numOOMs, int oomMode, int skipCount) { + assert numOOMs >= 0 : "non-negative numOoms expected: actual=" + numOOMs; + assert skipCount >= 0 : "non-negative skipCount expected: actual=" + skipCount; + assert oomMode >= 0 && oomMode < OomInjectionType.values().length: + "non-negative oomMode<" + OomInjectionType.values().length + " expected: actual=" + oomMode; } /** * Force the thread with the given ID to throw a GpuSplitAndRetryOOM on their next allocation attempt. * @param threadId the ID of the thread to throw the exception (not java thread id). * @param numOOMs the number of times the GpuSplitAndRetryOOM should be thrown + * @param oomMode ordinal of the corresponding RmmSpark.OomInjectionType + * @param skipCount the number of times a matching allocation is skipped before injecting the first OOM */ - public void forceSplitAndRetryOOM(long threadId, int numOOMs) { - forceSplitAndRetryOOM(getHandle(), threadId, numOOMs); + public void forceSplitAndRetryOOM(long threadId, int numOOMs, int oomMode, int skipCount) { + validateOOMInjectionParams(numOOMs, oomMode, skipCount); + forceSplitAndRetryOOM(getHandle(), threadId, numOOMs, oomMode, skipCount); } /** @@ -295,8 +310,8 @@ public void cpuDeallocate(long ptr, long amount) { private static native void submittingToPool(long handle, long threadId); private static native void waitingOnPool(long handle, long threadId); private static native void doneWaitingOnPool(long handle, long threadId); - private static native void forceRetryOOM(long handle, long threadId, int numOOMs); - private static native void forceSplitAndRetryOOM(long handle, long threadId, int numOOMs); + private static native void forceRetryOOM(long handle, long threadId, int numOOMs, int oomMode, int skipCount); + private static native void forceSplitAndRetryOOM(long handle, long threadId, int numOOMs, int oomMode, int skipCount); private static native void forceCudfException(long handle, long threadId, int numTimes); private static native void blockThreadUntilReady(long handle); private static native int getStateOf(long handle, long threadId); From 56081f7472ec498f8b54f80627b3bf1b8deb7b0d Mon Sep 17 00:00:00 2001 From: Jenkins Automation <70000568+nvauto@users.noreply.github.com> Date: Thu, 14 Dec 2023 05:25:53 +0800 Subject: [PATCH 059/127] Update submodule cudf to 8136a16701f970b512d2bd35a45606f00263fd89 (#1648) Signed-off-by: spark-rapids automation <70000568+nvauto@users.noreply.github.com> --- thirdparty/cudf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/thirdparty/cudf b/thirdparty/cudf index a894ca03b1..8136a16701 160000 --- a/thirdparty/cudf +++ b/thirdparty/cudf @@ -1 +1 @@ -Subproject commit a894ca03b18bd0304180f97882ccaaffa18028a0 +Subproject commit 8136a16701f970b512d2bd35a45606f00263fd89 From 4d2fc033a0b1636adb44edace5134cf462493f21 Mon Sep 17 00:00:00 2001 From: Jenkins Automation <70000568+nvauto@users.noreply.github.com> Date: Thu, 
14 Dec 2023 11:28:24 +0800
Subject: [PATCH 060/127] Update submodule cudf to
 cee642916cfc3b8df73e819bf3bc50f1b9fc684f (#1649)

Signed-off-by: spark-rapids automation <70000568+nvauto@users.noreply.github.com>
---
 thirdparty/cudf | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/thirdparty/cudf b/thirdparty/cudf
index 8136a16701..cee642916c 160000
--- a/thirdparty/cudf
+++ b/thirdparty/cudf
@@ -1 +1 @@
-Subproject commit 8136a16701f970b512d2bd35a45606f00263fd89
+Subproject commit cee642916cfc3b8df73e819bf3bc50f1b9fc684f

From 19dc0595dd1fa99005f9150ac315040dcd622df8 Mon Sep 17 00:00:00 2001
From: Jenkins Automation <70000568+nvauto@users.noreply.github.com>
Date: Fri, 15 Dec 2023 11:30:56 +0800
Subject: [PATCH 061/127] Update submodule cudf to
 2cb8f3da3a3dd539301f90dcfbccadaf06963fd2 (#1653)

Signed-off-by: spark-rapids automation <70000568+nvauto@users.noreply.github.com>
---
 thirdparty/cudf | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/thirdparty/cudf b/thirdparty/cudf
index cee642916c..2cb8f3da3a 160000
--- a/thirdparty/cudf
+++ b/thirdparty/cudf
@@ -1 +1 @@
-Subproject commit cee642916cfc3b8df73e819bf3bc50f1b9fc684f
+Subproject commit 2cb8f3da3a3dd539301f90dcfbccadaf06963fd2

From 6320bbe5eb4f19c8a4f7781abd97351f5e3db18a Mon Sep 17 00:00:00 2001
From: Raza Jafri
Date: Fri, 15 Dec 2023 18:47:50 +0100
Subject: [PATCH 062/127] Handle Decimal-128 Multiplication For Newer Spark
 Versions (#1623)

* Added another multiplication method for decimal 128

* Signing off

Signed-off-by: Raza Jafri

* addressed review comments

* fixed clang

* addressed review comments

* ran pre-commit

* removed pass-by reference

* possible reason for CI failure, as locally it still builds

* addressed review comments

* formatted Java code

---------

Signed-off-by: Raza Jafri
---
 src/main/cpp/src/DecimalUtilsJni.cpp          | 11 ++--
 src/main/cpp/src/decimal_utils.cu             | 51 +++++++++++--------
 src/main/cpp/src/decimal_utils.hpp            |  1 +
 .../nvidia/spark/rapids/jni/DecimalUtils.java | 35 +++++++++++--
 .../spark/rapids/jni/DecimalUtilsTest.java    | 12 +++++
 5 files changed, 82 insertions(+), 28 deletions(-)

diff --git a/src/main/cpp/src/DecimalUtilsJni.cpp b/src/main/cpp/src/DecimalUtilsJni.cpp
index f732276817..6c7c1cc781 100644
--- a/src/main/cpp/src/DecimalUtilsJni.cpp
+++ b/src/main/cpp/src/DecimalUtilsJni.cpp
@@ -19,8 +19,13 @@
 
 extern "C" {
 
-JNIEXPORT jlongArray JNICALL Java_com_nvidia_spark_rapids_jni_DecimalUtils_multiply128(
-  JNIEnv* env, jclass, jlong j_view_a, jlong j_view_b, jint j_product_scale)
+JNIEXPORT jlongArray JNICALL
+Java_com_nvidia_spark_rapids_jni_DecimalUtils_multiply128(JNIEnv* env,
+                                                          jclass,
+                                                          jlong j_view_a,
+                                                          jlong j_view_b,
+                                                          jint j_product_scale,
+                                                          bool cast_interim_result)
 {
   JNI_NULL_CHECK(env, j_view_a, "column is null", 0);
   JNI_NULL_CHECK(env, j_view_b, "column is null", 0);
@@ -30,7 +35,7 @@
     auto view_a = reinterpret_cast<cudf::column_view const*>(j_view_a);
     auto view_b = reinterpret_cast<cudf::column_view const*>(j_view_b);
     auto scale  = static_cast<int32_t>(j_product_scale);
     return cudf::jni::convert_table_for_return(
-      env, cudf::jni::multiply_decimal128(*view_a, *view_b, scale));
+      env, cudf::jni::multiply_decimal128(*view_a, *view_b, scale, cast_interim_result));
   }
   CATCH_STD(env, 0);
 }

diff --git a/src/main/cpp/src/decimal_utils.cu b/src/main/cpp/src/decimal_utils.cu
index 392fb495b4..92273ff545 100644
--- a/src/main/cpp/src/decimal_utils.cu
+++ b/src/main/cpp/src/decimal_utils.cu
@@ -657,14 +657,16 @@ struct dec128_multiplier {
   dec128_multiplier(bool* overflows,
                     cudf::mutable_column_view
const& product_view, cudf::column_view const& a_col, - cudf::column_view const& b_col) + cudf::column_view const& b_col, + bool const cast_interim_result) : overflows(overflows), a_data(a_col.data<__int128_t>()), b_data(b_col.data<__int128_t>()), product_data(product_view.data<__int128_t>()), a_scale(a_col.type().scale()), b_scale(b_col.type().scale()), - prod_scale(product_view.type().scale()) + prod_scale(product_view.type().scale()), + cast_interim_result(cast_interim_result) { } @@ -675,22 +677,24 @@ struct dec128_multiplier { chunked256 product = multiply(a, b); - // Spark does some really odd things that I personally think are a bug - // https://issues.apache.org/jira/browse/SPARK-40129 - // But to match Spark we need to first round the result to a precision of 38 - // and this is specific to the value in the result of the multiply. - // Then we need to round the result to the final scale that we care about. - int dec_precision = precision10(product); - int first_div_precision = dec_precision - 38; - - int mult_scale = a_scale + b_scale; - if (first_div_precision > 0) { - auto const first_div_scale_divisor = pow_ten(first_div_precision).as_128_bits(); - product = divide_and_round(product, first_div_scale_divisor); - - // a_scale and b_scale are negative. first_div_precision is not - mult_scale = a_scale + b_scale + first_div_precision; - } + int const mult_scale = [&]() { + // According to https://issues.apache.org/jira/browse/SPARK-40129 + // and https://issues.apache.org/jira/browse/SPARK-45786, Spark has a bug in + // versions 3.2.4, 3.3.3, 3.4.1, 3.5.0 and 4.0.0 The bug is fixed for later versions but to + // match the legacy behavior we need to first round the result to a precision of 38 then we + // need to round the result to the final scale that we care about. + if (cast_interim_result) { + auto const first_div_precision = precision10(product) - 38; + if (first_div_precision > 0) { + auto const first_div_scale_divisor = pow_ten(first_div_precision).as_128_bits(); + product = divide_and_round(product, first_div_scale_divisor); + + // a_scale and b_scale are negative. 
first_div_precision is not
+          return a_scale + b_scale + first_div_precision;
+        }
+      }
+      return a_scale + b_scale;
+    }();
 
     int exponent = prod_scale - mult_scale;
     if (exponent < 0) {
@@ -718,6 +722,7 @@ struct dec128_multiplier {
  private:
   // output column for overflow detected
   bool* const overflows;
+  bool const cast_interim_result;
 
   // input data for multiply
   __int128_t const* const a_data;
@@ -968,6 +973,7 @@ namespace cudf::jni {
 
 std::unique_ptr<cudf::table> multiply_decimal128(cudf::column_view const& a,
                                                  cudf::column_view const& b,
                                                  int32_t product_scale,
+                                                 bool const cast_interim_result,
                                                  rmm::cuda_stream_view stream)
 {
   CUDF_EXPECTS(a.type().id() == cudf::type_id::DECIMAL128, "not a DECIMAL128 column");
@@ -992,10 +998,11 @@ std::unique_ptr<cudf::table> multiply_decimal128(cudf::column_view const& a,
   auto overflows_view = columns[0]->mutable_view();
   auto product_view   = columns[1]->mutable_view();
   check_scale_divisor(a.type().scale() + b.type().scale(), product_scale);
-  thrust::for_each(rmm::exec_policy(stream),
-                   thrust::make_counting_iterator<cudf::size_type>(0),
-                   thrust::make_counting_iterator<cudf::size_type>(num_rows),
-                   dec128_multiplier(overflows_view.begin<bool>(), product_view, a, b));
+  thrust::for_each(
+    rmm::exec_policy(stream),
+    thrust::make_counting_iterator<cudf::size_type>(0),
+    thrust::make_counting_iterator<cudf::size_type>(num_rows),
+    dec128_multiplier(overflows_view.begin<bool>(), product_view, a, b, cast_interim_result));
   return std::make_unique<cudf::table>(std::move(columns));
 }

diff --git a/src/main/cpp/src/decimal_utils.hpp b/src/main/cpp/src/decimal_utils.hpp
index 95c6c56c3d..9793e63445 100644
--- a/src/main/cpp/src/decimal_utils.hpp
+++ b/src/main/cpp/src/decimal_utils.hpp
@@ -30,6 +30,7 @@ std::unique_ptr<cudf::table> multiply_decimal128(
   cudf::column_view const& a,
   cudf::column_view const& b,
   int32_t product_scale,
+  bool const cast_interim_result,
   rmm::cuda_stream_view stream = cudf::get_default_stream());
 
 std::unique_ptr<cudf::table> divide_decimal128(

diff --git a/src/main/java/com/nvidia/spark/rapids/jni/DecimalUtils.java b/src/main/java/com/nvidia/spark/rapids/jni/DecimalUtils.java
index 389679965a..17337691c5 100644
--- a/src/main/java/com/nvidia/spark/rapids/jni/DecimalUtils.java
+++ b/src/main/java/com/nvidia/spark/rapids/jni/DecimalUtils.java
@@ -25,21 +25,50 @@ public class DecimalUtils {
     NativeDepsLoader.loadNativeDeps();
   }
 
+  /**
+   * Multiply two DECIMAL128 columns together into a DECIMAL128 product rounded to the specified
+   * scale with overflow detection. This method considers a precision greater than 38 as overflow
+   * even if the number still fits in a 128-bit representation.
+   *
+   * WARNING: This method has a bug that we keep to match Spark versions before 3.4.2, 3.5.1,
+   * and 4.0.0. Consider the following example using Decimal with a precision of 38 and scale of 10:
+   * -8533444864753048107770677711.1312637916 * -12.0000000000 = 102401338377036577293248132533.575166
+   * while the actual answer based on Java BigDecimal is 102401338377036577293248132533.575165
+   *
+   * @param a factor input, must match row count of the other factor input
+   * @param b factor input, must match row count of the other factor input
+   * @param productScale scale to use for the product type
+   * @return table containing a boolean column and a DECIMAL128 product column of the specified
+   *         scale. The boolean value will be true if an overflow was detected for that row's
+   *         DECIMAL128 product value. A null input row will result in a corresponding null output
+   *         row.
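+   *
+   * This overload is equivalent to calling {@code multiply128(a, b, productScale, true)}.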
+ */ + public static Table multiply128(ColumnView a, ColumnView b, int productScale) { + return new Table(multiply128(a.getNativeView(), b.getNativeView(), productScale, true)); + } /** * Multiply two DECIMAL128 columns together into a DECIMAL128 product rounded to the specified * scale with overflow detection. This method considers a precision greater than 38 as overflow * even if the number still fits in a 128-bit representation. + * + * WARNING: With interimCast set to true, this method has a bug which we match with Spark versions before 3.4.2, + * 4.0.0, 3.5.1. Consider the following example using Decimal with a precision of 38 and scale of 10: + * -8533444864753048107770677711.1312637916 * -12.0000000000 = 102401338377036577293248132533.575166 + * while the actual answer based on Java BigDecimal is 102401338377036577293248132533.575165 + * * @param a factor input, must match row count of the other factor input * @param b factor input, must match row count of the other factor input * @param productScale scale to use for the product type + * @param interimCast whether to cast the result of the division to 38 precision before casting it again to the final + * precision * @return table containing a boolean column and a DECIMAL128 product column of the specified * scale. The boolean value will be true if an overflow was detected for that row's * DECIMAL128 product value. A null input row will result in a corresponding null output * row. */ - public static Table multiply128(ColumnView a, ColumnView b, int productScale) { - return new Table(multiply128(a.getNativeView(), b.getNativeView(), productScale)); + public static Table multiply128(ColumnView a, ColumnView b, int productScale, boolean interimCast) { + return new Table(multiply128(a.getNativeView(), b.getNativeView(), productScale, interimCast)); } /** @@ -148,7 +177,7 @@ public static Table add128(ColumnView a, ColumnView b, int targetScale) { return new Table(add128(a.getNativeView(), b.getNativeView(), targetScale)); } - private static native long[] multiply128(long viewA, long viewB, int productScale); + private static native long[] multiply128(long viewA, long viewB, int productScale, boolean interimCast); private static native long[] divide128(long viewA, long viewB, int quotientScale, boolean isIntegerDivide); diff --git a/src/test/java/com/nvidia/spark/rapids/jni/DecimalUtilsTest.java b/src/test/java/com/nvidia/spark/rapids/jni/DecimalUtilsTest.java index 4698855f31..7f3079e825 100644 --- a/src/test/java/com/nvidia/spark/rapids/jni/DecimalUtilsTest.java +++ b/src/test/java/com/nvidia/spark/rapids/jni/DecimalUtilsTest.java @@ -86,6 +86,18 @@ void simplePosMultiplyZeroByNegOne() { } } + @Test + void multiply128WithoutInterimCast() { + try (ColumnVector lhs = makeDec128Column("-8533444864753048107770677711.1312637916"); + ColumnVector rhs = makeDec128Column("-12.0000000000"); + ColumnVector expectedBasic = makeDec128Column("102401338377036577293248132533.575165"); + ColumnVector expectedValid = ColumnVector.fromBooleans(false); + Table found = DecimalUtils.multiply128(lhs, rhs, -6, false)) { + assertColumnsAreEqual(expectedValid, found.getColumn(0)); + assertColumnsAreEqual(expectedBasic, found.getColumn(1)); + } + } + @Test void largePosMultiplyTenByTen() { try (ColumnVector lhs = From c58aa0b4b9fa064bcb34772de81e70a599740bc4 Mon Sep 17 00:00:00 2001 From: Jenkins Automation <70000568+nvauto@users.noreply.github.com> Date: Sat, 16 Dec 2023 05:27:47 +0800 Subject: [PATCH 063/127] Update submodule cudf to 
0762fbea9100421e8a0a826fb3c5704c2a3f6a31 (#1657) Signed-off-by: spark-rapids automation <70000568+nvauto@users.noreply.github.com> --- thirdparty/cudf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/thirdparty/cudf b/thirdparty/cudf index 2cb8f3da3a..0762fbea91 160000 --- a/thirdparty/cudf +++ b/thirdparty/cudf @@ -1 +1 @@ -Subproject commit 2cb8f3da3a3dd539301f90dcfbccadaf06963fd2 +Subproject commit 0762fbea9100421e8a0a826fb3c5704c2a3f6a31 From a9842f9eda1943e6a907dfe37afea121974b9e5c Mon Sep 17 00:00:00 2001 From: Jenkins Automation <70000568+nvauto@users.noreply.github.com> Date: Sat, 16 Dec 2023 12:07:13 +0800 Subject: [PATCH 064/127] Update submodule cudf to 9c16d895f509e1d4e9710651e57e4cd29defbcce (#1658) Signed-off-by: spark-rapids automation <70000568+nvauto@users.noreply.github.com> --- thirdparty/cudf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/thirdparty/cudf b/thirdparty/cudf index 0762fbea91..9c16d895f5 160000 --- a/thirdparty/cudf +++ b/thirdparty/cudf @@ -1 +1 @@ -Subproject commit 0762fbea9100421e8a0a826fb3c5704c2a3f6a31 +Subproject commit 9c16d895f509e1d4e9710651e57e4cd29defbcce From 6bdc68b1881a0bbaa399dfc79b20857b16b5dbde Mon Sep 17 00:00:00 2001 From: Jenkins Automation <70000568+nvauto@users.noreply.github.com> Date: Mon, 18 Dec 2023 23:29:28 +0800 Subject: [PATCH 065/127] Update submodule cudf to 8dca25c782bbe239ed6e9b6317cc3a01b15a2b42 (#1659) Signed-off-by: spark-rapids automation <70000568+nvauto@users.noreply.github.com> --- thirdparty/cudf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/thirdparty/cudf b/thirdparty/cudf index 9c16d895f5..8dca25c782 160000 --- a/thirdparty/cudf +++ b/thirdparty/cudf @@ -1 +1 @@ -Subproject commit 9c16d895f509e1d4e9710651e57e4cd29defbcce +Subproject commit 8dca25c782bbe239ed6e9b6317cc3a01b15a2b42 From 48d27360029bedc9002e468a68f51512bbb640d5 Mon Sep 17 00:00:00 2001 From: Haoyang Li Date: Tue, 19 Dec 2023 03:48:49 +0800 Subject: [PATCH 066/127] Adding format_float kernel (#1572) * wip Signed-off-by: Haoyang Li * wip Signed-off-by: Haoyang Li * Add float to string kernel Signed-off-by: Haoyang Li * Update src/main/cpp/src/cast_float_to_string.cu Co-authored-by: Mike Wilson * Update src/main/cpp/src/cast_float_to_string.cu Co-authored-by: Mike Wilson * address comments and use different precision for float Signed-off-by: Haoyang Li * a runnable format_number demo Signed-off-by: Haoyang Li * rewrite the solution with ryu Signed-off-by: Haoyang Li * update license Signed-off-by: Haoyang Li * clean up Signed-off-by: Haoyang Li * Split ftos_converter out Signed-off-by: Haoyang Li * clean up Signed-off-by: Haoyang Li * resolve cudf conflicts Signed-off-by: Haoyang Li * resolve cudf conflicts Signed-off-by: Haoyang Li * resolve cudf conflicts Signed-off-by: Haoyang Li * resolve cudf conflicts Signed-off-by: Haoyang Li * remove cudf changes Signed-off-by: Haoyang Li * remove cudf changes Signed-off-by: Haoyang Li * add ryu Signed-off-by: Haoyang Li * Add copyright and notice Signed-off-by: Haoyang Li * Fix copyrights and license Signed-off-by: Haoyang Li * cudf conflict resolve Signed-off-by: Haoyang Li * Add format_float kernel Signed-off-by: Haoyang Li * clean up Signed-off-by: Haoyang Li * Fixed two bugs Signed-off-by: Haoyang Li * Added a failed case back Signed-off-by: Haoyang Li * Refactor Signed-off-by: Haoyang Li * Handle d=0 case Signed-off-by: Haoyang Li * Add nv apache license to ftos_converter Signed-off-by: Haoyang Li * Add nv apache license to ftos_converter 
Signed-off-by: Haoyang Li * Fix an rounding bug Signed-off-by: Haoyang Li * Update src/main/cpp/src/ftos_converter.cu Co-authored-by: Jason Lowe * address some comments Signed-off-by: Haoyang Li * cudf conflict Signed-off-by: Haoyang Li * Update src/main/cpp/src/cast_float_to_string.cu Co-authored-by: Nghia Truong <7416935+ttnghia@users.noreply.github.com> * Make it runable again Signed-off-by: Haoyang Li * address some comments Signed-off-by: Haoyang Li * addressed comments Signed-off-by: Haoyang Li * Address comments Signed-off-by: Haoyang Li * clang format Signed-off-by: Haoyang Li * Address comments Signed-off-by: Haoyang Li * Address comments Signed-off-by: Haoyang Li * address comments Signed-off-by: Haoyang Li * fix build after upmerge Signed-off-by: Haoyang Li * move inf/nan replacement to kernel Signed-off-by: Haoyang Li * Apply suggestions from code review Co-authored-by: Mike Wilson Co-authored-by: Nghia Truong <7416935+ttnghia@users.noreply.github.com> * address comments Signed-off-by: Haoyang Li * Apply suggestions from code review Co-authored-by: Nghia Truong <7416935+ttnghia@users.noreply.github.com> * address comments Signed-off-by: Haoyang Li * cudf Signed-off-by: Haoyang Li * cudf Signed-off-by: Haoyang Li * format Signed-off-by: Haoyang Li * cudf reset Signed-off-by: Haoyang Li * Apply suggestions from code review Co-authored-by: Mike Wilson --------- Signed-off-by: Haoyang Li Co-authored-by: Mike Wilson Co-authored-by: Jason Lowe Co-authored-by: Nghia Truong <7416935+ttnghia@users.noreply.github.com> --- src/main/cpp/CMakeLists.txt | 1 + src/main/cpp/src/CastStringJni.cpp | 15 + src/main/cpp/src/cast_string.hpp | 6 + src/main/cpp/src/format_float.cu | 131 +++++ src/main/cpp/src/ftos_converter.cuh | 453 ++++++++++++++++-- src/main/cpp/tests/CMakeLists.txt | 3 + src/main/cpp/tests/cast_decimal_to_string.cpp | 3 +- src/main/cpp/tests/cast_string.cpp | 3 +- src/main/cpp/tests/format_float.cpp | 88 ++++ .../nvidia/spark/rapids/jni/CastStrings.java | 12 + 10 files changed, 677 insertions(+), 38 deletions(-) create mode 100644 src/main/cpp/src/format_float.cu create mode 100644 src/main/cpp/tests/format_float.cpp diff --git a/src/main/cpp/CMakeLists.txt b/src/main/cpp/CMakeLists.txt index 18c0cd12e8..fee3e60b8e 100644 --- a/src/main/cpp/CMakeLists.txt +++ b/src/main/cpp/CMakeLists.txt @@ -164,6 +164,7 @@ add_library( src/ZOrderJni.cpp src/bloom_filter.cu src/cast_decimal_to_string.cu + src/format_float.cu src/cast_float_to_string.cu src/cast_string.cu src/cast_string_to_float.cu diff --git a/src/main/cpp/src/CastStringJni.cpp b/src/main/cpp/src/CastStringJni.cpp index 933fc15e34..b7d898a0c8 100644 --- a/src/main/cpp/src/CastStringJni.cpp +++ b/src/main/cpp/src/CastStringJni.cpp @@ -125,6 +125,21 @@ JNIEXPORT jlong JNICALL Java_com_nvidia_spark_rapids_jni_CastStrings_fromFloat(J CATCH_CAST_EXCEPTION(env, 0); } +JNIEXPORT jlong JNICALL Java_com_nvidia_spark_rapids_jni_CastStrings_fromFloatWithFormat( + JNIEnv* env, jclass, jlong input_column, jint digits) +{ + JNI_NULL_CHECK(env, input_column, "input column is null", 0); + + try { + cudf::jni::auto_set_device(env); + + auto const& cv = *reinterpret_cast(input_column); + return cudf::jni::release_as_jlong( + spark_rapids_jni::format_float(cv, digits, cudf::get_default_stream())); + } + CATCH_CAST_EXCEPTION(env, 0); +} + JNIEXPORT jlong JNICALL Java_com_nvidia_spark_rapids_jni_CastStrings_fromDecimal(JNIEnv* env, jclass, jlong input_column) diff --git a/src/main/cpp/src/cast_string.hpp b/src/main/cpp/src/cast_string.hpp index 
c4f850b47f..43ec36e576 100644 --- a/src/main/cpp/src/cast_string.hpp +++ b/src/main/cpp/src/cast_string.hpp @@ -115,6 +115,12 @@ std::unique_ptr string_to_float( rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +std::unique_ptr format_float( + cudf::column_view const& input, + int const digits, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + std::unique_ptr float_to_string( cudf::column_view const& input, rmm::cuda_stream_view stream, diff --git a/src/main/cpp/src/format_float.cu b/src/main/cpp/src/format_float.cu new file mode 100644 index 0000000000..d9ecbe8206 --- /dev/null +++ b/src/main/cpp/src/format_float.cu @@ -0,0 +1,131 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "cast_string.hpp" +#include "ftos_converter.cuh" + +#include +#include +#include +#include +#include + +#include +#include + +namespace spark_rapids_jni { + +namespace detail { +namespace { + +template +struct format_float_fn { + cudf::column_device_view d_floats; + int digits; + cudf::size_type* d_offsets; + char* d_chars; + + __device__ cudf::size_type compute_output_size(FloatType const value) const + { + bool constexpr is_float = std::is_same_v; + return static_cast( + ftos_converter::compute_format_float_size(static_cast(value), digits, is_float)); + } + + __device__ void format_float(cudf::size_type const idx) const + { + auto const value = d_floats.element(idx); + bool constexpr is_float = std::is_same_v; + auto const output = d_chars + d_offsets[idx]; + ftos_converter::format_float(static_cast(value), digits, is_float, output); + } + + __device__ void operator()(cudf::size_type const idx) const + { + if (d_floats.is_null(idx)) { + if (d_chars == nullptr) { d_offsets[idx] = 0; } + return; + } + if (d_chars != nullptr) { + format_float(idx); + } else { + d_offsets[idx] = compute_output_size(d_floats.element(idx)); + } + } +}; + +/** + * @brief This dispatch method is for converting floats into strings. + * + * The template function declaration ensures only float types are allowed. 
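+ * Non-float input types are routed to the second overload below, which rejects the call
+ * with CUDF_FAIL.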
+ */ +struct dispatch_format_float_fn { + template )> + std::unique_ptr operator()(cudf::column_view const& floats, + int const digits, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) const + { + auto const strings_count = floats.size(); + if (strings_count == 0) { return cudf::make_empty_column(cudf::type_id::STRING); } + + auto const input_ptr = cudf::column_device_view::create(floats, stream); + + auto [offsets, chars] = cudf::strings::detail::make_strings_children( + format_float_fn{*input_ptr, digits}, strings_count, stream, mr); + + return cudf::make_strings_column(strings_count, + std::move(offsets), + std::move(chars), + floats.null_count(), + cudf::detail::copy_bitmask(floats, stream, mr)); + } + + // non-float types throw an exception + template )> + std::unique_ptr operator()(cudf::column_view const&, + int const, + rmm::cuda_stream_view, + rmm::mr::device_memory_resource*) const + { + CUDF_FAIL("Values for format_float function must be a float type."); + } +}; + +} // namespace + +// This will convert all float column types into a strings column. +std::unique_ptr format_float(cudf::column_view const& floats, + int const digits, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + return type_dispatcher(floats.type(), dispatch_format_float_fn{}, floats, digits, stream, mr); +} + +} // namespace detail + +// external API +std::unique_ptr format_float(cudf::column_view const& floats, + int const digits, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + CUDF_FUNC_RANGE(); + return detail::format_float(floats, digits, stream, mr); +} + +} // namespace spark_rapids_jni \ No newline at end of file diff --git a/src/main/cpp/src/ftos_converter.cuh b/src/main/cpp/src/ftos_converter.cuh index 444f790d3c..e684f73921 100644 --- a/src/main/cpp/src/ftos_converter.cuh +++ b/src/main/cpp/src/ftos_converter.cuh @@ -15,6 +15,8 @@ * limitations under the License. */ +#pragma once + #include #include #include @@ -116,34 +118,32 @@ __constant__ uint32_t const POW5_OFFSETS[21] = { constexpr uint32_t POW5_TABLE_SIZE = 26; -__constant__ uint64_t const DOUBLE_POW5_TABLE[POW5_TABLE_SIZE] = { - 1ull, - 5ull, - 25ull, - 125ull, - 625ull, - 3125ull, - 15625ull, - 78125ull, - 390625ull, - 1953125ull, - 9765625ull, - 48828125ull, - 244140625ull, - 1220703125ull, - 6103515625ull, - 30517578125ull, - 152587890625ull, - 762939453125ull, - 3814697265625ull, - 19073486328125ull, - 95367431640625ull, - 476837158203125ull, - 2384185791015625ull, - 11920928955078125ull, - 59604644775390625ull, - 298023223876953125ull //, 1490116119384765625ull -}; +__constant__ uint64_t const DOUBLE_POW5_TABLE[POW5_TABLE_SIZE] = {1ull, + 5ull, + 25ull, + 125ull, + 625ull, + 3125ull, + 15625ull, + 78125ull, + 390625ull, + 1953125ull, + 9765625ull, + 48828125ull, + 244140625ull, + 1220703125ull, + 6103515625ull, + 30517578125ull, + 152587890625ull, + 762939453125ull, + 3814697265625ull, + 19073486328125ull, + 95367431640625ull, + 476837158203125ull, + 2384185791015625ull, + 11920928955078125ull, + 59604644775390625ull, + 298023223876953125ull}; //===== common.h from ryu ===== @@ -1063,7 +1063,7 @@ __device__ inline bool d2d_small_int(uint64_t const ieeeMantissa, return true; } -__device__ floating_decimal_64 d2d(double f, bool& ieeeSign, bool& special) +__device__ inline floating_decimal_64 d2d(double f, bool& ieeeSign, bool& special) { // Step 1: Decode the floating-point number, and unify normalized and subnormal cases. 
uint64_t const bits = double_to_bits(f); @@ -1100,7 +1100,7 @@ __device__ floating_decimal_64 d2d(double f, bool& ieeeSign, bool& special) return v; } -__device__ int d2s_buffered_n(double f, char* result) +__device__ inline int d2s_buffered_n(double f, char* result) { bool sign = false, special = false; floating_decimal_64 v = d2d(f, sign, special); @@ -1108,7 +1108,7 @@ __device__ int d2s_buffered_n(double f, char* result) return to_chars(v, sign, result); } -__device__ floating_decimal_32 f2d(float f, bool& ieeeSign, bool& special) +__device__ inline floating_decimal_32 f2d(float f, bool& ieeeSign, bool& special) { // Step 1: Decode the floating-point number, and unify normalized and subnormal cases. uint32_t const bits = float_to_bits(f); @@ -1128,7 +1128,7 @@ __device__ floating_decimal_32 f2d(float f, bool& ieeeSign, bool& special) return f2d(ieeeMantissa, ieeeExponent); } -__device__ int f2s_buffered_n(float f, char* result) +__device__ inline int f2s_buffered_n(float f, char* result) { bool sign = false, special = false; floating_decimal_32 v = f2d(f, sign, special); @@ -1138,7 +1138,7 @@ __device__ int f2s_buffered_n(float f, char* result) //===== compute float to string size ===== -__device__ int compute_d2s_size(double value) +__device__ inline int compute_d2s_size(double value) { bool sign = false, special = false; floating_decimal_64 v = d2d(value, sign, special); @@ -1146,7 +1146,7 @@ __device__ int compute_d2s_size(double value) return d2s_size(v, sign); } -__device__ int compute_f2s_size(float value) +__device__ inline int compute_f2s_size(float value) { bool sign = false, special = false; floating_decimal_32 v = f2d(value, sign, special); @@ -1158,7 +1158,7 @@ __device__ int compute_f2s_size(float value) //===== APIs ===== -__device__ int compute_ftos_size(double value, bool is_float) +__device__ inline int compute_ftos_size(double value, bool is_float) { if (is_float) { return compute_f2s_size(value); @@ -1167,7 +1167,7 @@ __device__ int compute_ftos_size(double value, bool is_float) } } -__device__ int float_to_string(double value, bool is_float, char* output) +__device__ inline int float_to_string(double value, bool is_float, char* output) { if (is_float) { return f2s_buffered_n(value, output); @@ -1176,4 +1176,385 @@ __device__ int float_to_string(double value, bool is_float, char* output) } } +//===== format float ===== + +__constant__ uint64_t const POW10_TABLE[19] = {1ull, + 10ull, + 100ull, + 1000ull, + 10000ull, + 100000ull, + 1000000ull, + 10000000ull, + 100000000ull, + 1000000000ull, + 10000000000ull, + 100000000000ull, + 1000000000000ull, + 10000000000000ull, + 100000000000000ull, + 1000000000000000ull, + 10000000000000000ull, + 100000000000000000ull}; + +template +__device__ inline T round_half_even(T const input, int const olength, int const digits) +{ + // "round" a integer to digits digits, with the half-even rounding mode. 
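+  // Example: rounding input = 12345 (olength = 5) to digits = 3 uses div = 10^2, giving
+  // num = 123 and mod = 45; since 45 < div / 2 the value stays 123. A tie (mod == div / 2)
+  // increments num only when num is odd, so ties always round toward an even result.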
+ if (digits > olength) { + T num = input; + for (int i = 0; i < digits - olength; i++) { + num *= 10; + } + return num; + } + T div = POW10_TABLE[olength - digits]; + T mod = input % div; + T num = input / div; + if (mod > (div / 2) || ((mod == (div / 2) && (num % 2 == 1) && mod != 0))) { num++; } + return num; +} + +__device__ inline int to_formated_chars(floating_decimal_64 const v, + bool const sign, + char* const result, + int digits) +{ + int index = 0; + if (sign) { result[index++] = '-'; } + uint64_t output = v.mantissa; + const uint32_t olength = decimalLength17(output); + int32_t exp = v.exponent + (int32_t)olength - 1; + if (exp < 0) { + // Decimal dot is before any of the digits. + int index_for_carrier = index; + result[index++] = '0'; + if (digits == 0) { return index; } + result[index++] = '.'; + int actural_round = digits; + for (int i = -1; i > exp; i--) { + index_for_carrier = index; + result[index++] = '0'; + actural_round--; + if (actural_round == 0) { + if (i != exp + 1) { return index; } // else, possible carry + break; + } + } + int actural_olength = fmin(int(olength), actural_round); + uint64_t rounded_output = round_half_even(output, olength, actural_round); + // check if carry + if (rounded_output >= POW10_TABLE[actural_olength]) { + result[index_for_carrier] = '1'; + rounded_output -= POW10_TABLE[actural_olength]; + } + int current = index; + for (int i = 0; i < actural_olength; i++) { + result[current + actural_olength - i - 1] = (char)('0' + rounded_output % 10); + rounded_output /= 10; + index++; + } + actural_round -= actural_olength; + if (actural_round > 0) { + for (int i = 0; i < actural_round; i++) { + result[index++] = '0'; + } + } + } else if (exp + 1 >= olength) { + // Decimal dot is after any of the digits. + int integer_len = index + exp + 1 + exp / 3; + int sep_cnt = 0; + int rev_index = 0; + for (int i = olength; i < exp + 1; i++) { + result[integer_len - (rev_index++) - 1] = '0'; + sep_cnt++; + if (sep_cnt == 3) { + result[integer_len - (rev_index++) - 1] = ','; + sep_cnt = 0; + } + } + for (int i = 0; i < olength; i++) { + if (sep_cnt == 3) { + result[integer_len - (rev_index++) - 1] = ','; + sep_cnt = 0; + } + result[integer_len - (rev_index++) - 1] = (char)('0' + output % 10); + sep_cnt++; + output /= 10; + } + index = integer_len; + if (digits == 0) { return index; } + result[index++] = '.'; + for (int i = 0; i < digits; i++) { + result[index++] = '0'; + } + } else { + uint32_t temp_d = digits, tailing_zero = 0; + if (exp + digits > olength) { + temp_d = olength - exp; + tailing_zero = digits - temp_d; + } + uint64_t rounded_output = round_half_even(output, olength, exp + temp_d + 1); + uint64_t pow10 = POW10_TABLE[temp_d]; + uint64_t integer = rounded_output / pow10; + uint64_t decimal = rounded_output % pow10; + // calculate integer length after format to cover carry case + uint32_t integer_len = decimalLength17(integer); + uint32_t formated_integer_len = index + integer_len + (integer_len - 1) / 3; + uint32_t sep_cnt = 0; + int rev_index = 0; + for (int i = 0; i < integer_len; i++) { + if (sep_cnt == 3) { + result[formated_integer_len - (rev_index++) - 1] = ','; + sep_cnt = 0; + } + result[formated_integer_len - (rev_index++) - 1] = (char)('0' + integer % 10); + sep_cnt++; + integer /= 10; + } + index = formated_integer_len; + if (digits == 0) { return index; } + result[index++] = '.'; + int current = index; + for (int i = 0; i < tailing_zero; i++) { + result[current + digits - i - 1] = '0'; + index++; + } + for (int i = tailing_zero; i < 
digits; i++) { + result[current + digits - i - 1] = (char)('0' + decimal % 10); + decimal /= 10; + index++; + } + } + return index; +} + +__device__ inline int format_float_size(floating_decimal_64 const v, bool const sign, int digits) +{ + int index = 0; + if (sign) { index++; } + uint64_t output = v.mantissa; + const uint32_t olength = decimalLength17(output); + int32_t exp = v.exponent + (int32_t)olength - 1; + if (exp < 0) { + index += 2 + digits; + } else if (exp + 1 >= olength) { + index += exp + 1 + exp / 3 + 1 + digits; + } else { + uint32_t temp_d = digits; + if (exp + digits > olength) { temp_d = olength - exp; } + uint64_t rounded_output = round_half_even(output, olength, exp + temp_d + 1); + uint64_t pow10 = POW10_TABLE[temp_d]; + uint64_t integer = rounded_output / pow10; + uint32_t integer_len = decimalLength17(integer); + index += integer_len + (integer_len - 1) / 3 + 1 + digits; + } + if (digits == 0) { index--; } + return index; +} + +__device__ inline int to_formated_chars(floating_decimal_32 const v, + bool const sign, + char* const result, + int digits) +{ + int index = 0; + if (sign) { result[index++] = '-'; } + uint32_t output = v.mantissa; + uint32_t const olength = decimalLength9(output); + int32_t exp = v.exponent + (int32_t)olength - 1; + if (exp < 0) { + // Decimal dot is before any of the digits. + int index_for_carrier = index; + result[index++] = '0'; + if (digits == 0) { return index; } + result[index++] = '.'; + int actural_round = digits; + for (int i = -1; i > exp; i--) { + index_for_carrier = index; + result[index++] = '0'; + actural_round--; + if (actural_round == 0) { + if (i != exp + 1) { return index; } // else, possible carry + break; + } + } + int actural_olength = fmin(int(olength), actural_round); + uint64_t rounded_output = round_half_even(output, olength, actural_round); + // check if carry + if (rounded_output >= POW10_TABLE[actural_olength]) { + result[index_for_carrier] = '1'; + rounded_output -= POW10_TABLE[actural_olength]; + } + int current = index; + for (int i = 0; i < actural_olength; i++) { + result[current + actural_olength - i - 1] = (char)('0' + rounded_output % 10); + rounded_output /= 10; + index++; + } + actural_round -= actural_olength; + if (actural_round > 0) { + for (int i = 0; i < actural_round; i++) { + result[index++] = '0'; + } + } + } else if (exp + 1 >= olength) { + // Decimal dot is after any of the digits. 
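+    // The integer part has exp + 1 digits; exp / 3 extra slots are reserved so a ','
+    // separator can be written after every third digit while the value is emitted in
+    // reverse below.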
+ int integer_len = index + exp + 1 + exp / 3; + int sep_cnt = 0; + int rev_index = 0; + for (int i = olength; i < exp + 1; i++) { + result[integer_len - (rev_index++) - 1] = '0'; + sep_cnt++; + if (sep_cnt == 3) { + result[integer_len - (rev_index++) - 1] = ','; + sep_cnt = 0; + } + } + for (int i = 0; i < olength; i++) { + if (sep_cnt == 3) { + result[integer_len - (rev_index++) - 1] = ','; + sep_cnt = 0; + } + result[integer_len - (rev_index++) - 1] = (char)('0' + output % 10); + sep_cnt++; + output /= 10; + } + index = integer_len; + if (digits == 0) { return index; } + result[index++] = '.'; + for (int i = 0; i < digits; i++) { + result[index++] = '0'; + } + } else { + uint32_t temp_d = digits, tailing_zero = 0; + if (exp + digits > olength) { + temp_d = olength - exp; + tailing_zero = digits - temp_d; + } + uint32_t rounded_output = round_half_even(output, olength, exp + temp_d + 1); + uint32_t pow10 = POW10_TABLE[temp_d]; + uint32_t integer = rounded_output / pow10; + uint32_t decimal = rounded_output % pow10; + // calculate integer length after format to cover carry case + uint32_t integer_len = decimalLength9(integer); + uint32_t formated_integer_len = index + integer_len + (integer_len - 1) / 3; + uint32_t sep_cnt = 0; + int rev_index = 0; + for (int i = 0; i < integer_len; i++) { + if (sep_cnt == 3) { + result[formated_integer_len - (rev_index++) - 1] = ','; + sep_cnt = 0; + } + result[formated_integer_len - (rev_index++) - 1] = (char)('0' + integer % 10); + sep_cnt++; + integer /= 10; + } + index = formated_integer_len; + if (digits == 0) { return index; } + result[index++] = '.'; + int current = index; + for (int i = 0; i < tailing_zero; i++) { + result[current + digits - i - 1] = '0'; + index++; + } + for (int i = tailing_zero; i < digits; i++) { + result[current + digits - i - 1] = (char)('0' + decimal % 10); + decimal /= 10; + index++; + } + } + return index; +} + +__device__ inline int format_float_size(floating_decimal_32 const v, bool const sign, int digits) +{ + int index = 0; + if (sign) { index++; } + uint64_t output = v.mantissa; + uint32_t const olength = decimalLength9(output); + int32_t exp = v.exponent + (int32_t)olength - 1; + if (exp < 0) { + index += 2 + digits; + } else if (exp + 1 >= olength) { + index += exp + 1 + exp / 3 + 1 + digits; + } else { + uint32_t temp_d = digits; + if (exp + digits > olength) { temp_d = olength - exp; } + uint64_t rounded_output = round_half_even(output, olength, exp + temp_d + 1); + uint64_t pow10 = POW10_TABLE[temp_d]; + uint64_t integer = rounded_output / pow10; + uint32_t integer_len = decimalLength9(integer); + index += integer_len + (integer_len - 1) / 3 + 1 + digits; + } + if (digits == 0) { index--; } + return index; +} + +__device__ inline int copy_format_special_str(char* const result, + bool const sign, + bool const exponent, + bool const mantissa, + int const digits = 1) +{ + if (mantissa) { + memcpy(result, "\xEF\xBF\xBD", 3); // U+FFFD, replacement character, NaN + return 3; + } + if (sign) { result[0] = '-'; } + if (exponent) { + memcpy(result + sign, "\xE2\x88\x9E", 3); // U+221E, infinity symbol + return sign + 3; + } + result[sign] = '0'; + if (digits == 0) { + return sign + 1; + } else { + result[sign + 1] = '.'; + } + for (int i = 0; i < digits; i++) { + result[sign + 2 + i] = '0'; + } + return sign + 2 + digits; +} + +__device__ inline int special_format_str_size(bool const sign, + bool const exponent, + bool const mantissa, + int const digits = 1) +{ + if (mantissa) { return 3; } + if (exponent) { return 
sign + 3; } + if (digits == 0) { return sign + 1; } + return sign + 2 + digits; +} + +__device__ inline int compute_format_float_size(double value, int digits, bool is_float) +{ + bool sign = false, special = false; + if (is_float) { + floating_decimal_32 v = f2d(value, sign, special); + if (special) { return special_format_str_size(sign, v.exponent, v.mantissa, digits); } + return format_float_size(v, sign, digits); + } else { + floating_decimal_64 v = d2d(value, sign, special); + if (special) { return special_format_str_size(sign, v.exponent, v.mantissa, digits); } + return format_float_size(v, sign, digits); + } +} + +__device__ inline int format_float(double value, int digits, bool is_float, char* output) +{ + bool sign = false, special = false; + if (is_float) { + floating_decimal_32 v = f2d(value, sign, special); + if (special) { return copy_format_special_str(output, sign, v.exponent, v.mantissa, digits); } + return to_formated_chars(v, sign, output, digits); + } else { + floating_decimal_64 v = d2d(value, sign, special); + if (special) { return copy_format_special_str(output, sign, v.exponent, v.mantissa, digits); } + return to_formated_chars(v, sign, output, digits); + } +} + } // namespace spark_rapids_jni::ftos_converter diff --git a/src/main/cpp/tests/CMakeLists.txt b/src/main/cpp/tests/CMakeLists.txt index c9bb13046f..b34b1b8b01 100644 --- a/src/main/cpp/tests/CMakeLists.txt +++ b/src/main/cpp/tests/CMakeLists.txt @@ -51,6 +51,9 @@ ConfigureTest(CAST_STRING ConfigureTest(CAST_DECIMAL_TO_STRING cast_decimal_to_string.cpp) +ConfigureTest(FORMAT_FLOAT + format_float.cpp) + ConfigureTest(CAST_FLOAT_TO_STRING cast_float_to_string.cpp) diff --git a/src/main/cpp/tests/cast_decimal_to_string.cpp b/src/main/cpp/tests/cast_decimal_to_string.cpp index 1a93354339..ba1aaf05c8 100644 --- a/src/main/cpp/tests/cast_decimal_to_string.cpp +++ b/src/main/cpp/tests/cast_decimal_to_string.cpp @@ -24,9 +24,10 @@ #include -#include #include +#include + using namespace cudf; template diff --git a/src/main/cpp/tests/cast_string.cpp b/src/main/cpp/tests/cast_string.cpp index c736d5971f..1f7aaaad21 100644 --- a/src/main/cpp/tests/cast_string.cpp +++ b/src/main/cpp/tests/cast_string.cpp @@ -24,9 +24,10 @@ #include -#include #include +#include + using namespace cudf; template diff --git a/src/main/cpp/tests/format_float.cpp b/src/main/cpp/tests/format_float.cpp new file mode 100644 index 0000000000..b9d77593db --- /dev/null +++ b/src/main/cpp/tests/format_float.cpp @@ -0,0 +1,88 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "cast_string.hpp" + +#include +#include + +#include + +using namespace cudf; + +constexpr cudf::test::debug_output_level verbosity{cudf::test::debug_output_level::FIRST_ERROR}; + +struct FormatFloatTests : public cudf::test::BaseFixture {}; + +TEST_F(FormatFloatTests, FormatFloats32) +{ + auto const floats = + cudf::test::fixed_width_column_wrapper{100.0f, + 654321.25f, + -12761.125f, + 0.0f, + 5.0f, + -4.0f, + std::numeric_limits::quiet_NaN(), + 123456789012.34f, + -0.0f}; + + auto const expected = cudf::test::strings_column_wrapper{"100.00000", + "654,321.25000", + "-12,761.12500", + "0.00000", + "5.00000", + "-4.00000", + "\xEF\xBF\xBD", + "123,456,790,000.00000", + "-0.00000"}; + + auto results = spark_rapids_jni::format_float(floats, 5, cudf::get_default_stream()); + + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected, verbosity); +} + +TEST_F(FormatFloatTests, FormatFloats64) +{ + auto const floats = + cudf::test::fixed_width_column_wrapper{100.0d, + 654321.25d, + -12761.125d, + 1.123456789123456789d, + 0.000000000000000000123456789123456789d, + 0.0d, + 5.0d, + -4.0d, + std::numeric_limits::quiet_NaN(), + 839542223232.794248339d, + -0.0d}; + + auto const expected = cudf::test::strings_column_wrapper{"100.00000", + "654,321.25000", + "-12,761.12500", + "1.12346", + "0.00000", + "0.00000", + "5.00000", + "-4.00000", + "\xEF\xBF\xBD", + "839,542,223,232.79420", + "-0.00000"}; + + auto results = spark_rapids_jni::format_float(floats, 5, cudf::get_default_stream()); + + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected, verbosity); +} \ No newline at end of file diff --git a/src/main/java/com/nvidia/spark/rapids/jni/CastStrings.java b/src/main/java/com/nvidia/spark/rapids/jni/CastStrings.java index 022cb93085..2b2267f034 100644 --- a/src/main/java/com/nvidia/spark/rapids/jni/CastStrings.java +++ b/src/main/java/com/nvidia/spark/rapids/jni/CastStrings.java @@ -80,6 +80,17 @@ public static ColumnVector toDecimal(ColumnView cv, boolean ansiMode, boolean st return new ColumnVector(toDecimal(cv.getNativeView(), ansiMode, strip, precision, scale)); } + /** + * Convert a float column to a formatted string column. + * + * @param cv the column data to process + * @param digits the number of digits to display after the decimal point + * @return the converted column + */ + public static ColumnVector fromFloatWithFormat(ColumnView cv, int digits) { + return new ColumnVector(fromFloatWithFormat(cv.getNativeView(), digits)); + } + /** * Convert a float column to a string column. 
* @@ -147,6 +158,7 @@ private static native long toDecimal(long nativeColumnView, boolean ansi_enabled int precision, int scale); private static native long toFloat(long nativeColumnView, boolean ansi_enabled, int dtype); private static native long fromDecimal(long nativeColumnView); + private static native long fromFloatWithFormat(long nativeColumnView, int digits); private static native long fromFloat(long nativeColumnView); private static native long toIntegersWithBase(long nativeColumnView, int base, boolean ansiEnabled, int dtype); From 9dffe324e52d10261a571be059edf14225e862bf Mon Sep 17 00:00:00 2001 From: Jenkins Automation <70000568+nvauto@users.noreply.github.com> Date: Tue, 19 Dec 2023 05:29:51 +0800 Subject: [PATCH 067/127] Update submodule cudf to 90cccef3e3070b0f03df75c49aca64517d5a4cfa (#1660) Signed-off-by: spark-rapids automation <70000568+nvauto@users.noreply.github.com> --- thirdparty/cudf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/thirdparty/cudf b/thirdparty/cudf index 8dca25c782..90cccef3e3 160000 --- a/thirdparty/cudf +++ b/thirdparty/cudf @@ -1 +1 @@ -Subproject commit 8dca25c782bbe239ed6e9b6317cc3a01b15a2b42 +Subproject commit 90cccef3e3070b0f03df75c49aca64517d5a4cfa From b15b8391acaa1d67c59259247026227d700d6a26 Mon Sep 17 00:00:00 2001 From: Jenkins Automation <70000568+nvauto@users.noreply.github.com> Date: Tue, 19 Dec 2023 11:25:17 +0800 Subject: [PATCH 068/127] Update submodule cudf to bb047a230a805476f3008abb031741f8995c6f1e (#1661) Signed-off-by: spark-rapids automation <70000568+nvauto@users.noreply.github.com> --- thirdparty/cudf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/thirdparty/cudf b/thirdparty/cudf index 90cccef3e3..bb047a230a 160000 --- a/thirdparty/cudf +++ b/thirdparty/cudf @@ -1 +1 @@ -Subproject commit 90cccef3e3070b0f03df75c49aca64517d5a4cfa +Subproject commit bb047a230a805476f3008abb031741f8995c6f1e From dadc7a091c1350a040aec25d565f6ba97e8e80ae Mon Sep 17 00:00:00 2001 From: Jenkins Automation <70000568+nvauto@users.noreply.github.com> Date: Tue, 19 Dec 2023 17:30:39 +0800 Subject: [PATCH 069/127] Update submodule cudf to 8b695e340355d43261800a1cff876369e916ae90 (#1663) Signed-off-by: spark-rapids automation <70000568+nvauto@users.noreply.github.com> --- thirdparty/cudf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/thirdparty/cudf b/thirdparty/cudf index bb047a230a..8b695e3403 160000 --- a/thirdparty/cudf +++ b/thirdparty/cudf @@ -1 +1 @@ -Subproject commit bb047a230a805476f3008abb031741f8995c6f1e +Subproject commit 8b695e340355d43261800a1cff876369e916ae90 From 98dc423dfbacb68e0d5d8d15069455aaffad618f Mon Sep 17 00:00:00 2001 From: Mike Wilson Date: Tue, 19 Dec 2023 13:38:25 -0500 Subject: [PATCH 070/127] Adding query support to parseURI (#1652) * Adding query to parseuri Signed-off-by: Mike Wilson --- src/main/cpp/src/ParseURIJni.cpp | 14 +++ src/main/cpp/src/parse_uri.cu | 87 +++++++++++++------ src/main/cpp/src/parse_uri.hpp | 15 +++- src/main/cpp/tests/parse_uri.cpp | 69 ++++++++++++--- .../com/nvidia/spark/rapids/jni/ParseURI.java | 13 ++- .../nvidia/spark/rapids/jni/ParseURITest.java | 79 +++++++++++++---- 6 files changed, 223 insertions(+), 54 deletions(-) diff --git a/src/main/cpp/src/ParseURIJni.cpp b/src/main/cpp/src/ParseURIJni.cpp index 9079d99b9d..3af72687b6 100644 --- a/src/main/cpp/src/ParseURIJni.cpp +++ b/src/main/cpp/src/ParseURIJni.cpp @@ -47,4 +47,18 @@ JNIEXPORT jlong JNICALL Java_com_nvidia_spark_rapids_jni_ParseURI_parseHost(JNIE } CATCH_STD(env, 0); } + 
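+
+// parseQuery below follows the same JNI pattern as parseHost above: validate the handle,
+// set the device, run the spark_rapids_jni parser, and release the resulting strings
+// column back to Java as a jlong.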
+JNIEXPORT jlong JNICALL Java_com_nvidia_spark_rapids_jni_ParseURI_parseQuery(JNIEnv* env, + jclass, + jlong input_column) +{ + JNI_NULL_CHECK(env, input_column, "input column is null", 0); + + try { + cudf::jni::auto_set_device(env); + auto const input = reinterpret_cast(input_column); + return cudf::jni::ptr_as_jlong(spark_rapids_jni::parse_uri_to_query(*input).release()); + } + CATCH_STD(env, 0); +} } diff --git a/src/main/cpp/src/parse_uri.cu b/src/main/cpp/src/parse_uri.cu index 13a8effb37..d75dfc18c1 100644 --- a/src/main/cpp/src/parse_uri.cu +++ b/src/main/cpp/src/parse_uri.cu @@ -22,6 +22,7 @@ #include #include #include +#include #include #include #include @@ -47,10 +48,20 @@ struct uri_parts { string_view userinfo; string_view port; string_view opaque; - bool valid{false}; + uint32_t valid{0}; }; -enum class URI_chunks : int8_t { PROTOCOL, HOST, AUTHORITY, PATH, QUERY, USERINFO }; +enum class URI_chunks : int8_t { + PROTOCOL, + HOST, + AUTHORITY, + PATH, + FRAGMENT, + QUERY, + USERINFO, + PORT, + OPAQUE +}; enum class chunk_validity : int8_t { VALID, INVALID, FATAL }; @@ -436,7 +447,7 @@ bool __device__ validate_path(string_view path) // path can be alphanum and @[]_-!.~'()*?/&,;:$+= return validate_chunk(path, [] __device__(string_view::const_iterator iter) { auto const c = *iter; - if (c != '!' && c != '$' && !(c >= '&' && c <= ';') && c != '=' && !(c >= '@' && c <= 'Z') && + if (c != '!' && c != '$' && !(c >= '&' && c <= ';') && c != '=' && !(c >= '?' && c <= 'Z') && c != '_' && !(c >= 'a' && c <= 'z') && c != '~') { return false; } @@ -474,6 +485,7 @@ uri_parts __device__ validate_uri(const char* str, int len) { uri_parts ret; + auto const original_str = str; // look for :/# characters. int col = -1; int slash = -1; @@ -503,9 +515,10 @@ uri_parts __device__ validate_uri(const char* str, int len) if (hash >= 0) { ret.fragment = {str + hash + 1, len - hash - 1}; if (!validate_fragment(ret.fragment)) { - ret.valid = false; + ret.valid = 0; return ret; } + ret.valid |= (1 << static_cast(URI_chunks::FRAGMENT)); len = hash; @@ -519,9 +532,10 @@ uri_parts __device__ validate_uri(const char* str, int len) // we have a scheme up to the : ret.scheme = {str, col}; if (!validate_scheme(ret.scheme)) { - ret.valid = false; + ret.valid = 0; return ret; } + ret.valid |= (1 << static_cast(URI_chunks::PROTOCOL)); // skip over scheme auto const skip = col + 1; @@ -534,20 +548,22 @@ uri_parts __device__ validate_uri(const char* str, int len) // no more string to parse is an error if (len <= 0) { - ret.valid = false; + ret.valid = 0; return ret; } - // If we have a '/' as the next character, we have a heirarchical uri. If not it is opaque. - bool const heirarchical = str[0] == '/'; + // If we have a '/' as the next character or this is still the start of the string, we have a + // heirarchical uri. If not it is opaque. + bool const heirarchical = str[0] == '/' || str == original_str; if (heirarchical) { // a '?' will break this into query and path/authority if (question >= 0) { ret.query = {str + question + 1, len - question - 1}; if (!validate_query(ret.query)) { - ret.valid = false; + ret.valid = 0; return ret; } + ret.valid |= (1 << static_cast(URI_chunks::QUERY)); } auto const path_len = question >= 0 ? question : len; @@ -567,17 +583,17 @@ uri_parts __device__ validate_uri(const char* str, int len) if (next_slash == -1 && ret.authority.size_bytes() == 0 && ret.query.size_bytes() == 0 && ret.fragment.size_bytes() == 0) { // invalid! 
- but spark like to return things as long as you don't have illegal characters - // ret.valid = false; - ret.valid = true; + // ret.valid = 0; return ret; } if (ret.authority.size_bytes() > 0) { auto ipv6_address = ret.authority.size_bytes() > 2 && *ret.authority.begin() == '['; if (!validate_authority(ret.authority, ipv6_address)) { - ret.valid = false; + ret.valid = 0; return ret; } + ret.valid |= (1 << static_cast(URI_chunks::AUTHORITY)); // Inspect the authority for userinfo, host, and port const char* auth = ret.authority.data(); @@ -604,9 +620,11 @@ uri_parts __device__ validate_uri(const char* str, int len) if (amp > 0) { ret.userinfo = {auth, amp}; if (!validate_userinfo(ret.userinfo)) { - ret.valid = false; + ret.valid = 0; return ret; } + ret.valid |= (1 << static_cast(URI_chunks::USERINFO)); + // skip over the @ amp++; @@ -617,36 +635,39 @@ uri_parts __device__ validate_uri(const char* str, int len) // Found a port, attempt to parse it ret.port = {auth + last_colon + 1, auth_size - last_colon - 1}; if (!validate_port(ret.port)) { - ret.valid = false; + ret.valid = 0; return ret; } + ret.valid |= (1 << static_cast(URI_chunks::PORT)); ret.host = {auth, last_colon}; } else { ret.host = {auth, auth_size}; } auto host_ret = validate_host(ret.host); switch (host_ret) { - case chunk_validity::FATAL: ret.valid = false; return ret; + case chunk_validity::FATAL: ret.valid = 0; return ret; case chunk_validity::INVALID: ret.host = {}; break; + case chunk_validity::VALID: ret.valid |= (1 << static_cast(URI_chunks::HOST)); break; } } } else { // path with no authority - ret.path = {str, len}; + ret.path = {str, path_len}; } if (!validate_path(ret.path)) { - ret.valid = false; + ret.valid = 0; return ret; } + ret.valid |= (1 << static_cast(URI_chunks::PATH)); } else { ret.opaque = {str, len}; if (!validate_opaque(ret.opaque)) { - ret.valid = false; + ret.valid = 0; return ret; } + ret.valid |= (1 << static_cast(URI_chunks::OPAQUE)); } - ret.valid = true; return ret; } @@ -697,7 +718,7 @@ __global__ void parse_uri_char_counter(column_device_view const in_strings, auto const string_length = in_string.size_bytes(); auto const uri = validate_uri(in_chars, string_length); - if (!uri.valid) { + if ((uri.valid & (1 << static_cast(chunk))) == 0) { out_lengths[row_idx] = 0; clear_bit(out_validity, row_idx); } else { @@ -727,11 +748,18 @@ __global__ void parse_uri_char_counter(column_device_view const in_strings, out_lengths[row_idx] = uri.userinfo.size_bytes(); out_offsets[row_idx] = uri.userinfo.data() - base_ptr; break; - } - - if (out_lengths[row_idx] == 0) { - // A URI can be valid, but still have no data for a specific chunk - clear_bit(out_validity, row_idx); + case URI_chunks::PORT: + out_lengths[row_idx] = uri.port.size_bytes(); + out_offsets[row_idx] = uri.port.data() - base_ptr; + break; + case URI_chunks::FRAGMENT: + out_lengths[row_idx] = uri.fragment.size_bytes(); + out_offsets[row_idx] = uri.fragment.data() - base_ptr; + break; + case URI_chunks::OPAQUE: + out_lengths[row_idx] = uri.opaque.size_bytes(); + out_offsets[row_idx] = uri.opaque.data() - base_ptr; + break; } } } @@ -858,4 +886,13 @@ std::unique_ptr parse_uri_to_host(strings_column_view const& input, return detail::parse_uri(input, detail::URI_chunks::HOST, stream, mr); } +std::unique_ptr parse_uri_to_query(strings_column_view const& input, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + CUDF_FUNC_RANGE(); + return detail::parse_uri( + input, detail::URI_chunks::QUERY, stream, 
rmm::mr::get_current_device_resource()); +} + } // namespace spark_rapids_jni \ No newline at end of file diff --git a/src/main/cpp/src/parse_uri.hpp b/src/main/cpp/src/parse_uri.hpp index 0a76cec1b4..07f6f9cd46 100644 --- a/src/main/cpp/src/parse_uri.hpp +++ b/src/main/cpp/src/parse_uri.hpp @@ -49,7 +49,20 @@ std::unique_ptr parse_uri_to_protocol( */ std::unique_ptr parse_uri_to_host( cudf::strings_column_view const& input, - rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + +/** + * @brief Parse query and copy from the input string column to the output char buffer. + * + * @param input Input string column of URIs to parse + * @param stream Stream on which to operate. + * @param mr Memory resource for returned column + * @return std::unique_ptr String column of queries parsed. + */ +std::unique_ptr parse_uri_to_query( + cudf::strings_column_view const& input, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); } // namespace spark_rapids_jni diff --git a/src/main/cpp/tests/parse_uri.cpp b/src/main/cpp/tests/parse_uri.cpp index 1112fea232..36ebbeacc0 100644 --- a/src/main/cpp/tests/parse_uri.cpp +++ b/src/main/cpp/tests/parse_uri.cpp @@ -19,10 +19,12 @@ #include #include #include +#include #include struct ParseURIProtocolTests : public cudf::test::BaseFixture {}; struct ParseURIHostTests : public cudf::test::BaseFixture {}; +struct ParseURIQueryTests : public cudf::test::BaseFixture {}; enum class test_types { SIMPLE, @@ -30,6 +32,7 @@ enum class test_types { IPv6, IPv4, UTF8, + QUERY, }; namespace { @@ -123,6 +126,15 @@ cudf::test::strings_column_wrapper get_test_data(test_types t) "http://✪↩d⁚f„⁈.ws/123", "https:// /path/to/file", }); + case test_types::QUERY: + return cudf::test::strings_column_wrapper({ + "https://www.nvidia.com/path?param0=1¶m2=3¶m4=5", + "https:// /?params=5&cloth=0&metal=1", + "https://[2001:db8::2:1]:443/parms/in/the/uri?a=b", + "https://[::1]/?invalid=param&f„⁈.=7", + "https://[::1]/?invalid=param&~.=!@&^", + "userinfo@www.nvidia.com/path?query=1#Ref", + }); default: CUDF_FAIL("Test type unsupported!"); return cudf::test::strings_column_wrapper(); } } @@ -136,7 +148,7 @@ TEST_F(ParseURIProtocolTests, Simple) cudf::test::strings_column_wrapper const expected( {"https", "http", "file", "smb", "http", "file", "", "", ""}, {1, 1, 1, 1, 1, 1, 0, 0, 0}); - CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(result->view(), expected); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected, result->view()); } TEST_F(ParseURIProtocolTests, SparkEdges) @@ -185,7 +197,7 @@ TEST_F(ParseURIProtocolTests, SparkEdges) {1, 1, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1}); - CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(result->view(), expected); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected, result->view()); } TEST_F(ParseURIProtocolTests, IP6) @@ -197,7 +209,7 @@ TEST_F(ParseURIProtocolTests, IP6) {"https", "https", "https", "https", "http", "https", "https", "https", "", ""}, {1, 1, 1, 1, 1, 1, 1, 1, 0, 0}); - CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(result->view(), expected); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected, result->view()); } TEST_F(ParseURIProtocolTests, IP4) @@ -208,7 +220,7 @@ TEST_F(ParseURIProtocolTests, IP4) cudf::test::strings_column_wrapper const expected( {"https", "https", "https", "https", 
"https", "https"}); - CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(result->view(), expected); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected, result->view()); } TEST_F(ParseURIProtocolTests, UTF8) @@ -218,7 +230,7 @@ TEST_F(ParseURIProtocolTests, UTF8) cudf::test::strings_column_wrapper const expected({"https", "http", "http", ""}, {1, 1, 1, 0}); - CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(result->view(), expected); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected, result->view()); } TEST_F(ParseURIHostTests, Simple) @@ -230,7 +242,7 @@ TEST_F(ParseURIHostTests, Simple) {"www.nvidia.com", "www.nvidia.com", "path", "network", "", "", "", "", ""}, {1, 1, 1, 1, 0, 0, 0, 0, 0}); - CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(result->view(), expected); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected, result->view()); } TEST_F(ParseURIHostTests, SparkEdges) @@ -279,7 +291,7 @@ TEST_F(ParseURIHostTests, SparkEdges) {1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0}); - CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(result->view(), expected); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected, result->view()); } TEST_F(ParseURIHostTests, IP6) @@ -299,7 +311,7 @@ TEST_F(ParseURIHostTests, IP6) ""}, {1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0}); - CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(result->view(), expected); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected, result->view()); } TEST_F(ParseURIHostTests, IP4) @@ -310,7 +322,7 @@ TEST_F(ParseURIHostTests, IP4) cudf::test::strings_column_wrapper const expected( {"192.168.1.100", "192.168.1.100", "", "", "", ""}, {1, 1, 0, 0, 0, 0}); - CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(result->view(), expected); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected, result->view()); } TEST_F(ParseURIHostTests, UTF8) @@ -320,5 +332,42 @@ TEST_F(ParseURIHostTests, UTF8) cudf::test::strings_column_wrapper const expected({"nvidia.com", "", "", ""}, {1, 0, 0, 0}); - CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(result->view(), expected); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected, result->view()); +} + +TEST_F(ParseURIQueryTests, Simple) +{ + auto const col = get_test_data(test_types::SIMPLE); + auto const result = spark_rapids_jni::parse_uri_to_query(cudf::strings_column_view{col}); + + cudf::test::strings_column_wrapper const expected({"param1=2", "", "", "", "", "", "", "", ""}, + {1, 0, 0, 0, 0, 0, 0, 0, 0}); + + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected, result->view()); +} + +TEST_F(ParseURIQueryTests, SparkEdges) +{ + auto const col = get_test_data(test_types::SPARK_EDGES); + auto const result = spark_rapids_jni::parse_uri_to_query(cudf::strings_column_view{col}); + + cudf::test::strings_column_wrapper const expected( + {"", "", "", "", "", "", "", "", "", "", "", "", "", "", "", + "", // empty + "?", "?/", "", "query;p2", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", ""}, + {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, + 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected, result->view()); +} + +TEST_F(ParseURIQueryTests, Queries) +{ + auto const col = get_test_data(test_types::QUERY); + auto const result = spark_rapids_jni::parse_uri_to_query(cudf::strings_column_view{col}); + + cudf::test::strings_column_wrapper const expected( + {"param0=1¶m2=3¶m4=5", "", "a=b", "invalid=param&f„⁈.=7", "", "query=1"}, + {1, 0, 1, 1, 0, 1}); + + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected, result->view()); } diff --git a/src/main/java/com/nvidia/spark/rapids/jni/ParseURI.java 
b/src/main/java/com/nvidia/spark/rapids/jni/ParseURI.java index 0e14f388d4..8f82bfc908 100644 --- a/src/main/java/com/nvidia/spark/rapids/jni/ParseURI.java +++ b/src/main/java/com/nvidia/spark/rapids/jni/ParseURI.java @@ -49,7 +49,18 @@ public static ColumnVector parseURIHost(ColumnView uriColumn) { return new ColumnVector(parseHost(uriColumn.getNativeView())); } + /** + * Parse query for each URI from the incoming column. + * + * @param URIColumn The input strings column in which each row contains a URI. + * @return A string column with query data extracted. + */ + public static ColumnVector parseURIQuery(ColumnView uriColumn) { + assert uriColumn.getType().equals(DType.STRING) : "Input type must be String"; + return new ColumnVector(parseQuery(uriColumn.getNativeView())); + } + private static native long parseProtocol(long jsonColumnHandle); private static native long parseHost(long jsonColumnHandle); - + private static native long parseQuery(long jsonColumnHandle); } diff --git a/src/test/java/com/nvidia/spark/rapids/jni/ParseURITest.java b/src/test/java/com/nvidia/spark/rapids/jni/ParseURITest.java index c6e3b06ed1..ca76df2bf3 100644 --- a/src/test/java/com/nvidia/spark/rapids/jni/ParseURITest.java +++ b/src/test/java/com/nvidia/spark/rapids/jni/ParseURITest.java @@ -25,9 +25,8 @@ import ai.rapids.cudf.ColumnVector; public class ParseURITest { - void buildExpectedAndRun(String[] testData) { + void testProtocol(String[] testData) { String[] expectedProtocolStrings = new String[testData.length]; - String[] expectedHostStrings = new String[testData.length]; for (int i=0; i Date: Thu, 21 Dec 2023 13:08:51 -0500 Subject: [PATCH 071/127] Use cuda::proclaim_return_type on device lambdas (#1662) * adding proclaim_return_type to device lambdas Signed-off-by: Mike Wilson * clang-format Signed-off-by: Mike Wilson * No cuda::proclaim_return_type on non-device lambda Signed-off-by: Mike Wilson * Adding Mithun's changes for CCCL 2 Signed-off-by: Mike Wilson * linting Signed-off-by: Mike Wilson * updating return type Signed-off-by: Mike Wilson * Update src/main/cpp/CMakeLists.txt Co-authored-by: Bradley Dice * Update jni Signed-off-by: Nghia Truong * Apply suggestions from code review * Fix styles Signed-off-by: Nghia Truong * linting Signed-off-by: Mike Wilson * Update submodule manually Signed-off-by: Nghia Truong * Fix header Signed-off-by: Nghia Truong --------- Signed-off-by: Mike Wilson Signed-off-by: Nghia Truong Co-authored-by: Bradley Dice Co-authored-by: Nghia Truong --- src/main/cpp/CMakeLists.txt | 8 +- src/main/cpp/benchmarks/row_conversion.cpp | 16 +- src/main/cpp/src/RowConversionJni.cpp | 15 +- src/main/cpp/src/bloom_filter.cu | 19 +- src/main/cpp/src/datetime_rebase.cu | 192 +- src/main/cpp/src/map_utils.cu | 245 +- src/main/cpp/src/murmur_hash.cu | 11 +- src/main/cpp/src/parse_uri.cu | 117 +- src/main/cpp/src/row_conversion.cu | 2595 -------------------- src/main/cpp/src/row_conversion.hpp | 53 - src/main/cpp/src/utilities.cu | 27 +- src/main/cpp/src/xxhash64.cu | 11 +- src/main/cpp/src/zorder.cu | 28 +- src/main/cpp/tests/CMakeLists.txt | 3 - src/main/cpp/tests/row_conversion.cpp | 1043 -------- thirdparty/cudf | 2 +- 16 files changed, 367 insertions(+), 4018 deletions(-) delete mode 100644 src/main/cpp/src/row_conversion.cu delete mode 100644 src/main/cpp/src/row_conversion.hpp delete mode 100644 src/main/cpp/tests/row_conversion.cpp diff --git a/src/main/cpp/CMakeLists.txt b/src/main/cpp/CMakeLists.txt index fee3e60b8e..1ad65687e2 100644 --- a/src/main/cpp/CMakeLists.txt +++ 
b/src/main/cpp/CMakeLists.txt @@ -94,11 +94,8 @@ include(cmake/Modules/ConfigureCUDA.cmake) # set other CUDA compilation flags # ################################################################################################## # * dependencies ---------------------------------------------------------------------------------- -# find libcu++ -include(${rapids-cmake-dir}/cpm/libcudacxx.cmake) - -# find thrust/cub -include(${CUDF_DIR}/cpp/cmake/thirdparty/get_thrust.cmake) +# find CCCL +include(${CUDF_DIR}/cpp/cmake/thirdparty/get_cccl.cmake) # JNI find_package(JNI REQUIRED) @@ -174,7 +171,6 @@ add_library( src/map_utils.cu src/murmur_hash.cu src/parse_uri.cu - src/row_conversion.cu src/timezones.cu src/utilities.cu src/xxhash64.cu diff --git a/src/main/cpp/benchmarks/row_conversion.cpp b/src/main/cpp/benchmarks/row_conversion.cpp index c625342867..b8694fbcdf 100644 --- a/src/main/cpp/benchmarks/row_conversion.cpp +++ b/src/main/cpp/benchmarks/row_conversion.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022, NVIDIA CORPORATION. + * Copyright (c) 2022-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -48,15 +48,15 @@ void fixed_width(nvbench::state& state) bytes_per_row += cudf::size_of(t); } - auto rows = spark_rapids_jni::convert_to_rows_fixed_width_optimized(table->view()); + auto rows = cudf::convert_to_rows_fixed_width_optimized(table->view()); state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { if (direction == "to row") { - auto _rows = spark_rapids_jni::convert_to_rows_fixed_width_optimized(table->view()); + auto _rows = cudf::convert_to_rows_fixed_width_optimized(table->view()); } else { for (auto const& r : rows) { cudf::lists_column_view const l(r->view()); - auto out = spark_rapids_jni::convert_from_rows_fixed_width_optimized(l, schema); + auto out = cudf::convert_from_rows_fixed_width_optimized(l, schema); } } }); @@ -117,16 +117,16 @@ static void variable_or_fixed_width(nvbench::state& state) } } - auto rows = spark_rapids_jni::convert_to_rows(table->view()); + auto rows = cudf::convert_to_rows(table->view()); state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { - auto new_rows = spark_rapids_jni::convert_to_rows(table->view()); + auto new_rows = cudf::convert_to_rows(table->view()); if (direction == "to row") { - auto _rows = spark_rapids_jni::convert_to_rows(table->view()); + auto _rows = cudf::convert_to_rows(table->view()); } else { for (auto const& r : rows) { cudf::lists_column_view const l(r->view()); - auto out = spark_rapids_jni::convert_from_rows(l, schema); + auto out = cudf::convert_from_rows(l, schema); } } }); diff --git a/src/main/cpp/src/RowConversionJni.cpp b/src/main/cpp/src/RowConversionJni.cpp index 1fdb8a86b5..8e900691f1 100644 --- a/src/main/cpp/src/RowConversionJni.cpp +++ b/src/main/cpp/src/RowConversionJni.cpp @@ -16,7 +16,8 @@ #include "cudf_jni_apis.hpp" #include "dtype_utils.hpp" -#include "row_conversion.hpp" + +#include extern "C" { @@ -31,7 +32,7 @@ Java_com_nvidia_spark_rapids_jni_RowConversion_convertToRowsFixedWidthOptimized( cudf::jni::auto_set_device(env); cudf::table_view const* n_input_table = reinterpret_cast(input_table); std::vector> cols = - spark_rapids_jni::convert_to_rows_fixed_width_optimized(*n_input_table); + cudf::convert_to_rows_fixed_width_optimized(*n_input_table); int const num_columns = cols.size(); cudf::jni::native_jlongArray outcol_handles(env, num_columns); std::transform(cols.begin(), 
cols.end(), outcol_handles.begin(), [](auto& col) { @@ -50,9 +51,8 @@ Java_com_nvidia_spark_rapids_jni_RowConversion_convertToRows(JNIEnv* env, jclass try { cudf::jni::auto_set_device(env); cudf::table_view const* n_input_table = reinterpret_cast(input_table); - std::vector> cols = - spark_rapids_jni::convert_to_rows(*n_input_table); - int const num_columns = cols.size(); + std::vector> cols = cudf::convert_to_rows(*n_input_table); + int const num_columns = cols.size(); cudf::jni::native_jlongArray outcol_handles(env, num_columns); std::transform(cols.begin(), cols.end(), outcol_handles.begin(), [](auto& col) { return cudf::jni::release_as_jlong(col); @@ -84,7 +84,7 @@ Java_com_nvidia_spark_rapids_jni_RowConversion_convertFromRowsFixedWidthOptimize std::back_inserter(types_vec), [](jint type, jint scale) { return cudf::jni::make_data_type(type, scale); }); std::unique_ptr result = - spark_rapids_jni::convert_from_rows_fixed_width_optimized(list_input, types_vec); + cudf::convert_from_rows_fixed_width_optimized(list_input, types_vec); return cudf::jni::convert_table_for_return(env, result); } CATCH_STD(env, 0); @@ -110,8 +110,7 @@ JNIEXPORT jlongArray JNICALL Java_com_nvidia_spark_rapids_jni_RowConversion_conv n_scale.begin(), std::back_inserter(types_vec), [](jint type, jint scale) { return cudf::jni::make_data_type(type, scale); }); - std::unique_ptr result = - spark_rapids_jni::convert_from_rows(list_input, types_vec); + std::unique_ptr result = cudf::convert_from_rows(list_input, types_vec); return cudf::jni::convert_table_for_return(env, result); } CATCH_STD(env, 0); diff --git a/src/main/cpp/src/bloom_filter.cu b/src/main/cpp/src/bloom_filter.cu index 7637c85f10..6270705178 100644 --- a/src/main/cpp/src/bloom_filter.cu +++ b/src/main/cpp/src/bloom_filter.cu @@ -34,6 +34,8 @@ #include +#include + #include namespace spark_rapids_jni { @@ -316,14 +318,15 @@ std::unique_ptr bloom_filter_merge(cudf::column_view const& b thrust::make_counting_iterator(0), thrust::make_counting_iterator(0) + num_words, dst, - [src, num_buffers = bloom_filters.size(), stride = buf_size] __device__( - cudf::size_type word_index) { - cudf::bitmask_type out = (reinterpret_cast(src))[word_index]; - for (auto idx = 1; idx < num_buffers; idx++) { - out |= (reinterpret_cast(src + idx * stride))[word_index]; - } - return out; - }); + cuda::proclaim_return_type( + [src, num_buffers = bloom_filters.size(), stride = buf_size] __device__( + cudf::size_type word_index) { + cudf::bitmask_type out = (reinterpret_cast(src))[word_index]; + for (auto idx = 1; idx < num_buffers; idx++) { + out |= (reinterpret_cast(src + idx * stride))[word_index]; + } + return out; + })); // create the 1-row list column and move it into a scalar. return std::make_unique( diff --git a/src/main/cpp/src/datetime_rebase.cu b/src/main/cpp/src/datetime_rebase.cu index 9548d09dad..8963acf491 100644 --- a/src/main/cpp/src/datetime_rebase.cu +++ b/src/main/cpp/src/datetime_rebase.cu @@ -30,6 +30,8 @@ #include #include +#include + namespace { // Convert a date in Julian calendar to the number of days since epoch. 
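Nearly every hunk in this patch applies the same mechanical change: a __device__ lambda handed to a Thrust algorithm or transform iterator gets wrapped in cuda::proclaim_return_type<T>(...). The return type of an extended device lambda cannot be deduced from host code (its operator() exists only on the device), so under CCCL it has to be stated explicitly. A minimal, hypothetical illustration of the pattern, not taken from the patch (build with nvcc --extended-lambda):

#include <cuda/functional>   // cuda::proclaim_return_type (CCCL / libcu++)
#include <thrust/device_vector.h>
#include <thrust/transform.h>

int main()
{
  thrust::device_vector<int> in(8, 3);
  thrust::device_vector<int> out(8);

  // The lambda's operator() is __device__-only, so host code cannot deduce
  // its result type; proclaim_return_type<int> states it explicitly.
  thrust::transform(in.begin(), in.end(), out.begin(),
                    cuda::proclaim_return_type<int>(
                      [] __device__(int x) { return x * x; }));
  return 0;
}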
@@ -73,28 +75,29 @@ std::unique_ptr gregorian_to_julian_days(cudf::column_view const& thrust::make_counting_iterator(0), thrust::make_counting_iterator(input.size()), output->mutable_view().begin(), - [d_input = input.begin()] __device__(auto const idx) { - auto constexpr julian_end = cuda::std::chrono::year_month_day{ - cuda::std::chrono::year{1582}, cuda::std::chrono::month{10}, cuda::std::chrono::day{4}}; - auto constexpr gregorian_start = cuda::std::chrono::year_month_day{ - cuda::std::chrono::year{1582}, cuda::std::chrono::month{10}, cuda::std::chrono::day{15}}; - - auto const days_ts = d_input[idx].time_since_epoch().count(); - auto const days_since_epoch = cuda::std::chrono::sys_days(cudf::duration_D{days_ts}); - - // Convert the input into local date in Proleptic Gregorian calendar. - auto const ymd = cuda::std::chrono::year_month_day(days_since_epoch); - if (ymd > julian_end && ymd < gregorian_start) { - // This is the same as rebasing from the local date given at `gregorian_start`. - return cudf::timestamp_D{cudf::duration_D{-141427}}; - } - - // No change since this time. - if (ymd >= gregorian_start) { return d_input[idx]; } - - // Reinterpret year/month/day as in Julian calendar then compute the days since epoch. - return cudf::timestamp_D{cudf::duration_D{days_from_julian(ymd)}}; - }); + cuda::proclaim_return_type( + [d_input = input.begin()] __device__(auto const idx) { + auto constexpr julian_end = cuda::std::chrono::year_month_day{ + cuda::std::chrono::year{1582}, cuda::std::chrono::month{10}, cuda::std::chrono::day{4}}; + auto constexpr gregorian_start = cuda::std::chrono::year_month_day{ + cuda::std::chrono::year{1582}, cuda::std::chrono::month{10}, cuda::std::chrono::day{15}}; + + auto const days_ts = d_input[idx].time_since_epoch().count(); + auto const days_since_epoch = cuda::std::chrono::sys_days(cudf::duration_D{days_ts}); + + // Convert the input into local date in Proleptic Gregorian calendar. + auto const ymd = cuda::std::chrono::year_month_day(days_since_epoch); + if (ymd > julian_end && ymd < gregorian_start) { + // This is the same as rebasing from the local date given at `gregorian_start`. + return cudf::timestamp_D{cudf::duration_D{-141427}}; + } + + // No change since this time. + if (ymd >= gregorian_start) { return d_input[idx]; } + + // Reinterpret year/month/day as in Julian calendar then compute the days since epoch. + return cudf::timestamp_D{cudf::duration_D{days_from_julian(ymd)}}; + })); return output; } @@ -142,19 +145,20 @@ std::unique_ptr julian_to_gregorian_days(cudf::column_view const& thrust::make_counting_iterator(0), thrust::make_counting_iterator(input.size()), output->mutable_view().begin(), - [d_input = input.begin()] __device__(auto const idx) { - auto const days_ts = d_input[idx].time_since_epoch().count(); - if (days_ts >= -141427) { // Gregorian start day - return d_input[idx]; - } - - // Reinterpret year/month/day as in Gregorian calendar then compute the days - // since epoch. - auto const ymd = julian_from_days(days_ts); - auto const result = - cuda::std::chrono::local_days{ymd}.time_since_epoch().count(); - return cudf::timestamp_D{cudf::duration_D{result}}; - }); + cuda::proclaim_return_type( + [d_input = input.begin()] __device__(auto const idx) { + auto const days_ts = d_input[idx].time_since_epoch().count(); + if (days_ts >= -141427) { // Gregorian start day + return d_input[idx]; + } + + // Reinterpret year/month/day as in Gregorian calendar then compute the days + // since epoch. 
+ auto const ymd = julian_from_days(days_ts); + auto const result = + cuda::std::chrono::local_days{ymd}.time_since_epoch().count(); + return cudf::timestamp_D{cudf::duration_D{result}}; + })); return output; } @@ -242,39 +246,40 @@ std::unique_ptr gregorian_to_julian_micros(cudf::column_view const thrust::make_counting_iterator(0), thrust::make_counting_iterator(input.size()), output->mutable_view().begin(), - [d_input = input.begin()] __device__(auto const idx) { - // This timestamp corresponds to October 15th, 1582 UTC. - // After this day, there is no difference in microsecond values between Gregorian - // and Julian calendars. - int64_t constexpr last_switch_gregorian_ts = -12219292800000000L; - - auto const micros_ts = d_input[idx].time_since_epoch().count(); - if (micros_ts >= last_switch_gregorian_ts) { return d_input[idx]; } - - // Convert the input into local date-time in Proleptic Gregorian calendar. - auto const days_since_epoch = cuda::std::chrono::sys_days(static_cast( - cuda::std::chrono::floor(cudf::duration_us(micros_ts)))); - auto const ymd = cuda::std::chrono::year_month_day(days_since_epoch); - auto const timeparts = get_time_components(micros_ts); - - auto constexpr julian_end = cuda::std::chrono::year_month_day{ - cuda::std::chrono::year{1582}, cuda::std::chrono::month{10}, cuda::std::chrono::day{4}}; - auto constexpr gregorian_start = cuda::std::chrono::year_month_day{ - cuda::std::chrono::year{1582}, cuda::std::chrono::month{10}, cuda::std::chrono::day{15}}; - - // Reinterpret the local date-time as in Julian calendar and compute microseconds since - // the epoch from that Julian local date-time. - // If the input date is outside of both calendars, consider it as it is a local date - // given at `gregorian_start` (-141427 Julian days since epoch). - auto const julian_days = - (ymd > julian_end && ymd < gregorian_start) ? -141427 : days_from_julian(ymd); - int64_t result = (julian_days * 24L * 3600L) + (timeparts.hour * 3600L) + - (timeparts.minute * 60L) + timeparts.second; - result *= MICROS_PER_SECOND; // to microseconds - result += timeparts.subsecond; - - return cudf::timestamp_us{cudf::duration_us{result}}; - }); + cuda::proclaim_return_type( + [d_input = input.begin()] __device__(auto const idx) { + // This timestamp corresponds to October 15th, 1582 UTC. + // After this day, there is no difference in microsecond values between Gregorian + // and Julian calendars. + int64_t constexpr last_switch_gregorian_ts = -12219292800000000L; + + auto const micros_ts = d_input[idx].time_since_epoch().count(); + if (micros_ts >= last_switch_gregorian_ts) { return d_input[idx]; } + + // Convert the input into local date-time in Proleptic Gregorian calendar. + auto const days_since_epoch = cuda::std::chrono::sys_days(static_cast( + cuda::std::chrono::floor(cudf::duration_us(micros_ts)))); + auto const ymd = cuda::std::chrono::year_month_day(days_since_epoch); + auto const timeparts = get_time_components(micros_ts); + + auto constexpr julian_end = cuda::std::chrono::year_month_day{ + cuda::std::chrono::year{1582}, cuda::std::chrono::month{10}, cuda::std::chrono::day{4}}; + auto constexpr gregorian_start = cuda::std::chrono::year_month_day{ + cuda::std::chrono::year{1582}, cuda::std::chrono::month{10}, cuda::std::chrono::day{15}}; + + // Reinterpret the local date-time as in Julian calendar and compute microseconds since + // the epoch from that Julian local date-time. 
+ // If the input date is outside of both calendars, consider it as it is a local date + // given at `gregorian_start` (-141427 Julian days since epoch). + auto const julian_days = + (ymd > julian_end && ymd < gregorian_start) ? -141427 : days_from_julian(ymd); + int64_t result = (julian_days * 24L * 3600L) + (timeparts.hour * 3600L) + + (timeparts.minute * 60L) + timeparts.second; + result *= MICROS_PER_SECOND; // to microseconds + result += timeparts.subsecond; + + return cudf::timestamp_us{cudf::duration_us{result}}; + })); return output; } @@ -304,31 +309,32 @@ std::unique_ptr julian_to_gregorian_micros(cudf::column_view const thrust::make_counting_iterator(0), thrust::make_counting_iterator(input.size()), output->mutable_view().begin(), - [d_input = input.begin()] __device__(auto const idx) { - // This timestamp corresponds to October 15th, 1582 UTC. - // After this day, there is no difference in microsecond values between Gregorian - // and Julian calendars. - int64_t constexpr last_switch_gregorian_ts = -12219292800000000L; - - auto const micros_ts = d_input[idx].time_since_epoch().count(); - if (micros_ts >= last_switch_gregorian_ts) { return d_input[idx]; } - - // Convert the input into local date-time in Julian calendar. - auto const days_since_epoch = cuda::std::chrono::sys_days(static_cast( - cuda::std::chrono::floor(cudf::duration_us(micros_ts)))); - auto const ymd = julian_from_days(days_since_epoch.time_since_epoch().count()); - auto const timeparts = get_time_components(micros_ts); - - // Reinterpret the local date-time as in Gregorian calendar and compute microseconds since - // the epoch from that Gregorian local date-time. - auto const gregorian_days = cuda::std::chrono::local_days(ymd).time_since_epoch().count(); - int64_t result = (gregorian_days * 24L * 3600L) + (timeparts.hour * 3600L) + - (timeparts.minute * 60L) + timeparts.second; - result *= MICROS_PER_SECOND; // to microseconds - result += timeparts.subsecond; - - return cudf::timestamp_us{cudf::duration_us{result}}; - }); + cuda::proclaim_return_type( + [d_input = input.begin()] __device__(auto const idx) { + // This timestamp corresponds to October 15th, 1582 UTC. + // After this day, there is no difference in microsecond values between Gregorian + // and Julian calendars. + int64_t constexpr last_switch_gregorian_ts = -12219292800000000L; + + auto const micros_ts = d_input[idx].time_since_epoch().count(); + if (micros_ts >= last_switch_gregorian_ts) { return d_input[idx]; } + + // Convert the input into local date-time in Julian calendar. + auto const days_since_epoch = cuda::std::chrono::sys_days(static_cast( + cuda::std::chrono::floor(cudf::duration_us(micros_ts)))); + auto const ymd = julian_from_days(days_since_epoch.time_since_epoch().count()); + auto const timeparts = get_time_components(micros_ts); + + // Reinterpret the local date-time as in Gregorian calendar and compute microseconds since + // the epoch from that Gregorian local date-time. 
+ auto const gregorian_days = cuda::std::chrono::local_days(ymd).time_since_epoch().count(); + int64_t result = (gregorian_days * 24L * 3600L) + (timeparts.hour * 3600L) + + (timeparts.minute * 60L) + timeparts.second; + result *= MICROS_PER_SECOND; // to microseconds + result += timeparts.subsecond; + + return cudf::timestamp_us{cudf::duration_us{result}}; + })); return output; } diff --git a/src/main/cpp/src/map_utils.cu b/src/main/cpp/src/map_utils.cu index f8ac369973..a51a7de57b 100644 --- a/src/main/cpp/src/map_utils.cu +++ b/src/main/cpp/src/map_utils.cu @@ -54,6 +54,8 @@ // #include +#include + namespace spark_rapids_jni { using namespace cudf::io::json; @@ -179,29 +181,33 @@ rmm::device_uvector compute_node_levels(int64_t num_nodes, auto token_levels = rmm::device_uvector(tokens.size(), stream); // Whether the token pops from the parent node stack. - auto const does_pop = [] __device__(PdaTokenT const token) -> bool { - switch (token) { - case token_t::StructMemberEnd: - case token_t::StructEnd: - case token_t::ListEnd: return true; - default: return false; - }; - }; + auto const does_pop = + cuda::proclaim_return_type([] __device__(PdaTokenT const token) -> bool { + switch (token) { + case token_t::StructMemberEnd: + case token_t::StructEnd: + case token_t::ListEnd: return true; + default: return false; + }; + }); // Whether the token pushes onto the parent node stack. - auto const does_push = [] __device__(PdaTokenT const token) -> bool { - switch (token) { - case token_t::FieldNameBegin: - case token_t::StructBegin: - case token_t::ListBegin: return true; - default: return false; - }; - }; + auto const does_push = + cuda::proclaim_return_type([] __device__(PdaTokenT const token) -> bool { + switch (token) { + case token_t::FieldNameBegin: + case token_t::StructBegin: + case token_t::ListBegin: return true; + default: return false; + }; + }); auto const push_pop_it = thrust::make_transform_iterator( - tokens.begin(), [does_push, does_pop] __device__(PdaTokenT const token) -> cudf::size_type { - return does_push(token) - does_pop(token); - }); + tokens.begin(), + cuda::proclaim_return_type( + [does_push, does_pop] __device__(PdaTokenT const token) -> cudf::size_type { + return does_push(token) - does_pop(token); + })); thrust::exclusive_scan( rmm::exec_policy(stream), push_pop_it, push_pop_it + tokens.size(), token_levels.begin()); @@ -302,20 +308,20 @@ rmm::device_uvector compute_parent_node_ids( rmm::device_uvector const& node_token_ids, rmm::cuda_stream_view stream) { - auto const first_childs_parent_token_id = [tokens = - tokens.begin()] __device__(auto i) -> NodeIndexT { - if (i <= 0) { return -1; } - if (tokens[i - 1] == token_t::StructBegin || tokens[i - 1] == token_t::ListBegin) { - return i - 1; - } else if (tokens[i - 1] == token_t::FieldNameEnd) { - return i - 2; - } else if (tokens[i - 1] == token_t::StructMemberBegin && - (tokens[i - 2] == token_t::StructBegin || tokens[i - 2] == token_t::ListBegin)) { - return i - 2; - } else { - return -1; - } - }; + auto const first_childs_parent_token_id = cuda::proclaim_return_type( + [tokens = tokens.begin()] __device__(auto i) -> NodeIndexT { + if (i <= 0) { return -1; } + if (tokens[i - 1] == token_t::StructBegin || tokens[i - 1] == token_t::ListBegin) { + return i - 1; + } else if (tokens[i - 1] == token_t::FieldNameEnd) { + return i - 2; + } else if (tokens[i - 1] == token_t::StructMemberBegin && + (tokens[i - 2] == token_t::StructBegin || tokens[i - 2] == token_t::ListBegin)) { + return i - 2; + } else { + return -1; + } + 
}); auto parent_node_ids = rmm::device_uvector(num_nodes, stream); thrust::transform( @@ -323,14 +329,15 @@ rmm::device_uvector compute_parent_node_ids( node_token_ids.begin(), node_token_ids.end(), parent_node_ids.begin(), - [node_ids_gpu = node_token_ids.begin(), num_nodes, first_childs_parent_token_id] __device__( - NodeIndexT const tid) -> NodeIndexT { - auto const pid = first_childs_parent_token_id(tid); - return pid < 0 - ? cudf::io::json::parent_node_sentinel - : thrust::lower_bound(thrust::seq, node_ids_gpu, node_ids_gpu + num_nodes, pid) - - node_ids_gpu; - }); + cuda::proclaim_return_type( + [node_ids_gpu = node_token_ids.begin(), num_nodes, first_childs_parent_token_id] __device__( + NodeIndexT const tid) -> NodeIndexT { + auto const pid = first_childs_parent_token_id(tid); + return pid < 0 + ? cudf::io::json::parent_node_sentinel + : thrust::lower_bound(thrust::seq, node_ids_gpu, node_ids_gpu + num_nodes, pid) - + node_ids_gpu; + })); // Propagate parent node to siblings from first sibling - inplace. auto const node_levels = compute_node_levels(num_nodes, tokens, stream); @@ -356,20 +363,21 @@ rmm::device_uvector check_key_or_value_nodes( transform_it, transform_it + parent_node_ids.size(), key_or_value.begin(), - [key_sentinel = key_sentinel, - value_sentinel = value_sentinel, - parent_ids = parent_node_ids.begin()] __device__(auto const node_id) -> int8_t { - if (parent_ids[node_id] > 0) { - auto const grand_parent = parent_ids[parent_ids[node_id]]; - if (grand_parent == 0) { - return key_sentinel; - } else if (parent_ids[grand_parent] == 0) { - return value_sentinel; + cuda::proclaim_return_type( + [key_sentinel = key_sentinel, + value_sentinel = value_sentinel, + parent_ids = parent_node_ids.begin()] __device__(auto const node_id) -> int8_t { + if (parent_ids[node_id] > 0) { + auto const grand_parent = parent_ids[parent_ids[node_id]]; + if (grand_parent == 0) { + return key_sentinel; + } else if (parent_ids[grand_parent] == 0) { + return value_sentinel; + } } - } - return 0; - }); + return 0; + })); #ifdef DEBUG_FROM_JSON print_debug(key_or_value, "Nodes are key/value (1==key, 2==value)", ", ", stream); @@ -390,53 +398,58 @@ struct node_ranges_fn { __device__ thrust::pair operator()(cudf::size_type node_id) const { - [[maybe_unused]] auto const is_begin_of_section = [] __device__(PdaTokenT const token) { - switch (token) { - case token_t::StructBegin: - case token_t::ListBegin: - case token_t::StringBegin: - case token_t::ValueBegin: - case token_t::FieldNameBegin: return true; - default: return false; - }; - }; + [[maybe_unused]] auto const is_begin_of_section = + cuda::proclaim_return_type([] __device__(PdaTokenT const token) { + switch (token) { + case token_t::StructBegin: + case token_t::ListBegin: + case token_t::StringBegin: + case token_t::ValueBegin: + case token_t::FieldNameBegin: return true; + default: return false; + }; + }); // The end-of-* partner token for a given beginning-of-* token - auto const end_of_partner = [] __device__(PdaTokenT const token) { - switch (token) { - case token_t::StructBegin: return token_t::StructEnd; - case token_t::ListBegin: return token_t::ListEnd; - case token_t::StringBegin: return token_t::StringEnd; - case token_t::ValueBegin: return token_t::ValueEnd; - case token_t::FieldNameBegin: return token_t::FieldNameEnd; - default: return token_t::ErrorBegin; - }; - }; + auto const end_of_partner = + cuda::proclaim_return_type([] __device__(PdaTokenT const token) { + switch (token) { + case token_t::StructBegin: return 
token_t::StructEnd; + case token_t::ListBegin: return token_t::ListEnd; + case token_t::StringBegin: return token_t::StringEnd; + case token_t::ValueBegin: return token_t::ValueEnd; + case token_t::FieldNameBegin: return token_t::FieldNameEnd; + default: return token_t::ErrorBegin; + }; + }); // Encode a fixed value for nested node types (list+struct). - auto const nested_node_to_value = [] __device__(PdaTokenT const token) -> int32_t { - switch (token) { - case token_t::StructBegin: return 1; - case token_t::StructEnd: return -1; - case token_t::ListBegin: return 1 << 8; - case token_t::ListEnd: return -(1 << 8); - default: return 0; - }; - }; - - auto const get_token_index = [include_quote_char = include_quote_char] __device__( - PdaTokenT const token, SymbolOffsetT const token_index) { - constexpr SymbolOffsetT quote_char_size = 1; - switch (token) { - // Strip off quote char included for StringBegin - case token_t::StringBegin: return token_index + (include_quote_char ? 0 : quote_char_size); - // Strip off or Include trailing quote char for string values for StringEnd - case token_t::StringEnd: return token_index + (include_quote_char ? quote_char_size : 0); - // Strip off quote char included for FieldNameBegin - case token_t::FieldNameBegin: return token_index + quote_char_size; - default: return token_index; - }; - }; + auto const nested_node_to_value = + cuda::proclaim_return_type([] __device__(PdaTokenT const token) -> int32_t { + switch (token) { + case token_t::StructBegin: return 1; + case token_t::StructEnd: return -1; + case token_t::ListBegin: return 1 << 8; + case token_t::ListEnd: return -(1 << 8); + default: return 0; + }; + }); + + auto const get_token_index = cuda::proclaim_return_type( + [include_quote_char = include_quote_char] __device__(PdaTokenT const token, + SymbolOffsetT const token_index) { + constexpr SymbolOffsetT quote_char_size = 1; + switch (token) { + // Strip off quote char included for StringBegin + case token_t::StringBegin: + return token_index + (include_quote_char ? 0 : quote_char_size); + // Strip off or Include trailing quote char for string values for StringEnd + case token_t::StringEnd: return token_index + (include_quote_char ? 
quote_char_size : 0); + // Strip off quote char included for FieldNameBegin + case token_t::FieldNameBegin: return token_index + quote_char_size; + default: return token_index; + }; + }); if (key_or_value[node_id] != key_sentinel && key_or_value[node_id] != value_sentinel) { return thrust::make_pair(0, 0); @@ -529,13 +542,15 @@ std::unique_ptr extract_keys_or_values( rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { - auto const is_key = [key_or_value = key_or_value.begin()] __device__(auto const node_id) { - return key_or_value[node_id] == key_sentinel; - }; + auto const is_key = cuda::proclaim_return_type( + [key_or_value = key_or_value.begin()] __device__(auto const node_id) { + return key_or_value[node_id] == key_sentinel; + }); - auto const is_value = [key_or_value = key_or_value.begin()] __device__(auto const node_id) { - return key_or_value[node_id] == value_sentinel; - }; + auto const is_value = cuda::proclaim_return_type( + [key_or_value = key_or_value.begin()] __device__(auto const node_id) { + return key_or_value[node_id] == value_sentinel; + }); auto extract_ranges = rmm::device_uvector>(num_nodes, stream, mr); @@ -574,17 +589,19 @@ rmm::device_uvector compute_list_offsets( // For the nodes having parent_id == 0 (they are json object given by one input row), set their // child counts to zero. Otherwise, set child counts to `-1` (a sentinel number). - thrust::transform(rmm::exec_policy(stream), - parent_node_ids.begin(), - parent_node_ids.end(), - node_child_counts.begin(), - [] __device__(auto const parent_id) -> NodeIndexT { - return parent_id == 0 ? 0 : std::numeric_limits::lowest(); - }); - - auto const is_key = [key_or_value = key_or_value.begin()] __device__(auto const node_id) { - return key_or_value[node_id] == key_sentinel; - }; + thrust::transform( + rmm::exec_policy(stream), + parent_node_ids.begin(), + parent_node_ids.end(), + node_child_counts.begin(), + cuda::proclaim_return_type([] __device__(auto const parent_id) -> NodeIndexT { + return parent_id == 0 ? 0 : std::numeric_limits::lowest(); + })); + + auto const is_key = cuda::proclaim_return_type( + [key_or_value = key_or_value.begin()] __device__(auto const node_id) { + return key_or_value[node_id] == key_sentinel; + }); // Count the number of keys for each json object using `atomicAdd`. 
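The same wrapping applies to bool-returning predicates, such as the is_key helpers above and the copy_if filter used for the list offsets just below. A hypothetical, self-contained sketch of that shape, using a plain thrust::copy_if in place of the cudf helper:

#include <cuda/functional>
#include <thrust/copy.h>
#include <thrust/device_vector.h>
#include <vector>

int main()
{
  // Mirrors the sentinel scheme above: negative values mark non-root nodes.
  std::vector<int> h{3, -1, 0, -1, 7};
  thrust::device_vector<int> counts(h.begin(), h.end());
  thrust::device_vector<int> kept(counts.size());

  auto const end = thrust::copy_if(
    counts.begin(), counts.end(), kept.begin(),
    cuda::proclaim_return_type<bool>([] __device__(int c) { return c >= 0; }));

  kept.resize(end - kept.begin());  // kept == {3, 0, 7}
  return 0;
}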
auto const transform_it = thrust::counting_iterator(0); @@ -608,7 +625,7 @@ rmm::device_uvector compute_list_offsets( node_child_counts.begin(), node_child_counts.end(), list_offsets.begin(), - [] __device__(auto const count) { return count >= 0; }, + cuda::proclaim_return_type([] __device__(auto const count) { return count >= 0; }), stream); CUDF_EXPECTS(thrust::distance(list_offsets.begin(), copy_end) == static_cast(n_lists), "Invalid list size computation."); diff --git a/src/main/cpp/src/murmur_hash.cu b/src/main/cpp/src/murmur_hash.cu index 679f521e77..17ec120b5a 100644 --- a/src/main/cpp/src/murmur_hash.cu +++ b/src/main/cpp/src/murmur_hash.cu @@ -27,6 +27,8 @@ #include #include +#include + namespace spark_rapids_jni { namespace { @@ -77,10 +79,11 @@ class murmur_device_row_hasher { _table.begin(), _table.end(), _seed, - [row_index, nulls = this->_check_nulls] __device__(auto hash, auto column) { - return cudf::type_dispatcher( - column.type(), element_hasher_adapter{nulls, hash}, column, row_index); - }); + cuda::proclaim_return_type( + [row_index, nulls = this->_check_nulls] __device__(auto hash, auto column) { + return cudf::type_dispatcher( + column.type(), element_hasher_adapter{nulls, hash}, column, row_index); + })); } private: diff --git a/src/main/cpp/src/parse_uri.cu b/src/main/cpp/src/parse_uri.cu index d75dfc18c1..897ebe0208 100644 --- a/src/main/cpp/src/parse_uri.cu +++ b/src/main/cpp/src/parse_uri.cu @@ -30,6 +30,8 @@ #include #include +#include + #include namespace spark_rapids_jni { @@ -395,90 +397,97 @@ chunk_validity __device__ validate_host(string_view host) bool __device__ validate_query(string_view query) { // query can be alphanum and _-!.~'()*,;:$&+=?/[]@" - return validate_chunk(query, [] __device__(string_view::const_iterator iter) { - auto const c = *iter; - if (c != '!' && c != '"' && c != '$' && !(c >= '&' && c <= ';') && c != '=' && - !(c >= '?' && c <= ']' && c != '\\') && !(c >= 'a' && c <= 'z') && c != '_' && c != '~') { - return false; - } - return true; - }); -} - -bool __device__ validate_authority(string_view authority, bool allow_invalid_escapes) -{ - // authority needs to be alphanum and @[]_-!.'()*,;:$&+= return validate_chunk( - authority, - [allow_invalid_escapes] __device__(string_view::const_iterator iter) { + query, cuda::proclaim_return_type([] __device__(string_view::const_iterator iter) { auto const c = *iter; - if (c != '!' && c != '$' && !(c >= '&' && c <= ';' && c != '/') && c != '=' && - !(c >= '@' && c <= '_' && c != '^' && c != '\\') && !(c >= 'a' && c <= 'z') && c != '~' && - (!allow_invalid_escapes || c != '%')) { + if (c != '!' && c != '"' && c != '$' && !(c >= '&' && c <= ';') && c != '=' && + !(c >= '?' && c <= ']' && c != '\\') && !(c >= 'a' && c <= 'z') && c != '_' && c != '~') { return false; } return true; - }, - allow_invalid_escapes); + })); +} + +bool __device__ validate_authority(string_view authority, bool allow_invalid_escapes) +{ + // authority needs to be alphanum and @[]_-!.'()*,;:$&+= + return validate_chunk(authority, + cuda::proclaim_return_type( + [allow_invalid_escapes] __device__(string_view::const_iterator iter) { + auto const c = *iter; + if (c != '!' 
&& c != '$' && !(c >= '&' && c <= ';' && c != '/') && + c != '=' && !(c >= '@' && c <= '_' && c != '^' && c != '\\') && + !(c >= 'a' && c <= 'z') && c != '~' && + (!allow_invalid_escapes || c != '%')) { + return false; + } + return true; + }), + allow_invalid_escapes); } bool __device__ validate_userinfo(string_view userinfo) { // can't be ] or [ in here - return validate_chunk(userinfo, [] __device__(string_view::const_iterator iter) { - auto const c = *iter; - if (c == '[' || c == ']') { return false; } - return true; - }); + return validate_chunk( + userinfo, cuda::proclaim_return_type([] __device__(string_view::const_iterator iter) { + auto const c = *iter; + if (c == '[' || c == ']') { return false; } + return true; + })); } bool __device__ validate_port(string_view port) { // port is positive numeric >=0 according to spark...shrug - return validate_chunk(port, [] __device__(string_view::const_iterator iter) { - auto const c = *iter; - if (c < '0' && c > '9') { return false; } - return true; - }); + return validate_chunk( + port, cuda::proclaim_return_type([] __device__(string_view::const_iterator iter) { + auto const c = *iter; + if (c < '0' && c > '9') { return false; } + return true; + })); } bool __device__ validate_path(string_view path) { // path can be alphanum and @[]_-!.~'()*?/&,;:$+= - return validate_chunk(path, [] __device__(string_view::const_iterator iter) { - auto const c = *iter; - if (c != '!' && c != '$' && !(c >= '&' && c <= ';') && c != '=' && !(c >= '?' && c <= 'Z') && - c != '_' && !(c >= 'a' && c <= 'z') && c != '~') { - return false; - } - return true; - }); + return validate_chunk( + path, cuda::proclaim_return_type([] __device__(string_view::const_iterator iter) { + auto const c = *iter; + if (c != '!' && c != '$' && !(c >= '&' && c <= ';') && c != '=' && !(c >= '@' && c <= 'Z') && + c != '_' && !(c >= 'a' && c <= 'z') && c != '~') { + return false; + } + return true; + })); } bool __device__ validate_opaque(string_view opaque) { // opaque can be alphanum and @[]_-!.~'()*?/,;:$@+= - return validate_chunk(opaque, [] __device__(string_view::const_iterator iter) { - auto const c = *iter; - if (c != '!' && c != '$' && !(c >= '&' && c <= ';') && c != '=' && - !(c >= '?' && c <= ']' && c != '\\') && c != '_' && c != '~' && !(c >= 'a' && c <= 'z')) { - return false; - } - return true; - }); + return validate_chunk( + opaque, cuda::proclaim_return_type([] __device__(string_view::const_iterator iter) { + auto const c = *iter; + if (c != '!' && c != '$' && !(c >= '&' && c <= ';') && c != '=' && + !(c >= '?' && c <= ']' && c != '\\') && c != '_' && c != '~' && !(c >= 'a' && c <= 'z')) { + return false; + } + return true; + })); } bool __device__ validate_fragment(string_view fragment) { // fragment can be alphanum and @[]_-!.~'()*?/,;:$&+= - return validate_chunk(fragment, [] __device__(string_view::const_iterator iter) { - auto const c = *iter; - if (c != '!' && c != '$' && !(c >= '&' && c <= ';') && c != '=' && - !(c >= '?' && c <= ']' && c != '\\') && c != '_' && c != '~' && !(c >= 'a' && c <= 'z')) { - return false; - } - return true; - }); + return validate_chunk( + fragment, cuda::proclaim_return_type([] __device__(string_view::const_iterator iter) { + auto const c = *iter; + if (c != '!' && c != '$' && !(c >= '&' && c <= ';') && c != '=' && + !(c >= '?' 
&& c <= ']' && c != '\\') && c != '_' && c != '~' && !(c >= 'a' && c <= 'z')) { + return false; + } + return true; + })); } uri_parts __device__ validate_uri(const char* str, int len) diff --git a/src/main/cpp/src/row_conversion.cu b/src/main/cpp/src/row_conversion.cu deleted file mode 100644 index f2416fb3ab..0000000000 --- a/src/main/cpp/src/row_conversion.cu +++ /dev/null @@ -1,2595 +0,0 @@ -/* - * Copyright (c) 2020-2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "row_conversion.hpp" - -#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700 -#define ASYNC_MEMCPY_SUPPORTED -#endif - -#if !defined(__CUDA_ARCH__) || defined(ASYNC_MEMCPY_SUPPORTED) -#include -#endif // #if !defined(__CUDA_ARCH__) || defined(ASYNC_MEMCPY_SUPPORTED) - -#include -#include -#include -#include -#include -#include -#include -#include - -namespace { - -constexpr auto JCUDF_ROW_ALIGNMENT = 8; - -constexpr auto MAX_BATCH_SIZE = std::numeric_limits::max(); - -// Number of rows each block processes in the two kernels. Tuned via nsight -constexpr auto NUM_STRING_ROWS_PER_BLOCK_TO_ROWS = 1024; -constexpr auto NUM_STRING_ROWS_PER_BLOCK_FROM_ROWS = 64; -constexpr auto MIN_STRING_BLOCKS = 32; -constexpr auto MAX_STRING_BLOCKS = MAX_BATCH_SIZE; - -constexpr auto NUM_WARPS_IN_BLOCK = 32; - -} // anonymous namespace - -// needed to suppress warning about cuda::barrier -#pragma nv_diag_suppress static_var_with_dynamic_init - -using namespace cudf; -using detail::make_device_uvector_async; -using detail::make_device_uvector_sync; -using rmm::device_uvector; - -#ifdef ASYNC_MEMCPY_SUPPORTED -using cuda::aligned_size_t; -#else -template -using aligned_size_t = size_t; // Local stub for cuda::aligned_size_t. -#endif // ASYNC_MEMCPY_SUPPORTED - -namespace spark_rapids_jni { -namespace detail { - -/* - * This module converts data from row-major to column-major and from column-major to row-major. It - * is a transpose of the data of sorts, but there are a few complicating factors. They are spelled - * out below: - * - * Row Batches: - * The row data has to fit inside a cuDF column, which limits it to 2 gigs currently. The calling - * code attempts to keep the data size under 2 gigs, but due to padding this isn't always the case, - * so being able to break this up into multiple columns is necessary. Internally, this is referred - * to as the row batch, which is a group of rows that will fit into this 2 gig space requirement. - * There are typically 1 of these batches, but there can be 2. - * - * Async Memcpy: - * The CUDA blocks are using memcpy_async, which allows for the device to schedule memcpy operations - * and then wait on them to complete at a later time with a barrier. 
On Ampere or later hardware - * there is dedicated hardware to do this copy and on pre-Ampere it should generate the same code - * that a hand-rolled loop would generate, so performance should be the same or better than a - * hand-rolled kernel. - * - * Tile Info: - * Each CUDA block will work on a single tile info before exiting. This single tile consumes all - * available shared memory. The kernel reads data into shared memory and then back out from shared - * memory to device memory via memcpy_async. This kernel is completely memory bound. - * - * Batch Data: - * This structure contains all the row batches and some book-keeping data necessary for the batches - * such as row numbers for the batches. - * - * Tiles: - * The tile info describes a tile of data to process. In a GPU with 48KB this equates to about 221 - * bytes in each direction of a table. The tiles are kept as square as possible to attempt to - * coalesce memory operations. The taller a tile is the better coalescing of columns, but row - * coalescing suffers. The wider a tile is the better the row coalescing, but columns coalescing - * suffers. The code attempts to produce a square tile to balance the coalescing. It starts by - * figuring out the optimal byte length and then adding columns to the data until the tile is too - * large. Since rows are different width with different alignment requirements, this isn't typically - * exact. Once a width is found the tiles are generated vertically with that width and height and - * then the process repeats. This means all the tiles will be the same height, but will have - * different widths based on what columns they encompass. Tiles in a vertical row will all have the - * same dimensions. - * - * -------------------------------- - * | 4 5.0f || True 8 3 1 | - * | 3 6.0f || False 3 1 1 | - * | 2 7.0f || True 7 4 1 | - * | 1 8.0f || False 2 5 1 | - * -------------------------------- - * | 0 9.0f || True 6 7 1 | - * ... - */ - -/** - * @brief The CUDA blocks work on one tile_info struct of data. - * This structure defines the workspaces for the blocks. - * - */ -struct tile_info { - int start_col; - int start_row; - int end_col; - int end_row; - int batch_number; - - __device__ inline size_type get_shared_row_size(size_type const* const col_offsets, - size_type const* const col_sizes) const - { - // this calculation is invalid if there are holes in the data such as a variable-width column. - // It is wrong in a safe way in that it will say this row size is larger than it should be, so - // we are not losing data we are just not as efficient as we could be with shared memory. This - // may be a problem if the tile is computed without regard to variable width offset/length sizes - // in that we overrun shared memory. - return util::round_up_unsafe(col_offsets[end_col] + col_sizes[end_col] - col_offsets[start_col], - JCUDF_ROW_ALIGNMENT); - } - - __device__ inline size_type num_cols() const { return end_col - start_col + 1; } - - __device__ inline size_type num_rows() const { return end_row - start_row + 1; } -}; - -/** - * @brief Returning rows is done in a byte cudf column. This is limited in size by - * `size_type` and so output is broken into batches of rows that fit inside - * this limit. 
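The 2 GB ceiling comes from cudf's 32-bit size_type offsets: a single output column cannot address more bytes than numeric_limits<int32_t>::max(). A hedged, host-side sketch of how batch boundaries can be cut greedily from per-row sizes (names hypothetical; the deleted implementation is considerably more involved):

#include <cstdint>
#include <limits>
#include <vector>

// Given per-row byte sizes, cut batches so no batch exceeds the 2^31 - 1
// byte limit imposed by 32-bit size_type offsets.
std::vector<std::size_t> batch_boundaries(std::vector<std::size_t> const& row_sizes)
{
  constexpr std::size_t limit = std::numeric_limits<std::int32_t>::max();
  std::vector<std::size_t> boundaries{0};  // row numbers where batches start
  std::size_t bytes_in_batch = 0;
  for (std::size_t row = 0; row < row_sizes.size(); ++row) {
    if (bytes_in_batch + row_sizes[row] > limit) {
      boundaries.push_back(row);  // start a new batch at this row
      bytes_in_batch = 0;
    }
    bytes_in_batch += row_sizes[row];
  }
  boundaries.push_back(row_sizes.size());  // one-past-the-end sentinel
  return boundaries;
}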
- * - */ -struct row_batch { - size_type num_bytes; // number of bytes in this batch - size_type row_count; // number of rows in the batch - device_uvector row_offsets; // offsets column of output cudf column -}; - -/** - * @brief Holds information about the batches of data to be processed - * - */ -struct batch_data { - device_uvector batch_row_offsets; // offsets to each row in incoming data - device_uvector d_batch_row_boundaries; // row numbers for the start of each batch - std::vector - batch_row_boundaries; // row numbers for the start of each batch: 0, 1500, 2700 - std::vector row_batches; // information about each batch such as byte count -}; - -/** - * @brief builds row size information for tables that contain strings - * - * @param tbl table from which to compute row size information - * @param fixed_width_and_validity_size size of fixed-width and validity data in this table - * @param stream cuda stream on which to operate - * @return pair of device vector of size_types of the row sizes of the table and a device vector of - * offsets into the string column - */ -std::pair, rmm::device_uvector> -build_string_row_offsets(table_view const& tbl, - size_type fixed_width_and_validity_size, - rmm::cuda_stream_view stream) -{ - auto const num_rows = tbl.num_rows(); - rmm::device_uvector d_row_sizes(num_rows, stream); - thrust::uninitialized_fill(rmm::exec_policy(stream), d_row_sizes.begin(), d_row_sizes.end(), 0); - - auto d_offsets_iterators = [&]() { - std::vector offsets_iterators; - auto offsets_iter = thrust::make_transform_iterator( - tbl.begin(), [](auto const& col) -> strings_column_view::offset_iterator { - if (!is_fixed_width(col.type())) { - CUDF_EXPECTS(col.type().id() == type_id::STRING, "only string columns are supported!"); - return strings_column_view(col).offsets_begin(); - } else { - return nullptr; - } - }); - std::copy_if(offsets_iter, - offsets_iter + tbl.num_columns(), - std::back_inserter(offsets_iterators), - [](auto const& offset_ptr) { return offset_ptr != nullptr; }); - return make_device_uvector_sync( - offsets_iterators, stream, rmm::mr::get_current_device_resource()); - }(); - - auto const num_columns = static_cast(d_offsets_iterators.size()); - - thrust::for_each(rmm::exec_policy(stream), - thrust::make_counting_iterator(0), - thrust::make_counting_iterator(num_columns * num_rows), - [d_offsets_iterators = d_offsets_iterators.data(), - num_columns, - num_rows, - d_row_sizes = d_row_sizes.data()] __device__(auto element_idx) { - auto const row = element_idx % num_rows; - auto const col = element_idx / num_rows; - auto const val = - d_offsets_iterators[col][row + 1] - d_offsets_iterators[col][row]; - atomicAdd(&d_row_sizes[row], val); - }); - - // transform the row sizes to include fixed width size and alignment - thrust::transform(rmm::exec_policy(stream), - d_row_sizes.begin(), - d_row_sizes.end(), - d_row_sizes.begin(), - [fixed_width_and_validity_size] __device__(auto row_size) { - return util::round_up_unsafe(fixed_width_and_validity_size + row_size, - JCUDF_ROW_ALIGNMENT); - }); - - return {std::move(d_row_sizes), std::move(d_offsets_iterators)}; -} - -/** - * @brief functor to return the offset of a row in a table with string columns - * - */ -struct string_row_offset_functor { - string_row_offset_functor(device_span d_row_offsets) - : d_row_offsets(d_row_offsets){}; - - __device__ inline size_type operator()(int row_number, int) const - { - return d_row_offsets[row_number]; - } - - device_span d_row_offsets; -}; - -/** - * @brief functor to return 
the offset of a row in a table with only fixed-width columns - * - */ -struct fixed_width_row_offset_functor { - fixed_width_row_offset_functor(size_type fixed_width_only_row_size) - : _fixed_width_only_row_size(fixed_width_only_row_size){}; - - __device__ inline size_type operator()(int row_number, int tile_row_start) const - { - return (row_number - tile_row_start) * _fixed_width_only_row_size; - } - - size_type _fixed_width_only_row_size; -}; - -/** - * @brief Copies data from row-based JCUDF format to column-based cudf format. - * - * This optimized version of the conversion is faster for fixed-width tables that do not have more - * than 100 columns. - * - * @param num_rows number of rows in the incoming table - * @param num_columns number of columns in the incoming table - * @param row_size length in bytes of each row - * @param input_offset_in_row offset to each row of data - * @param num_bytes total number of bytes in the incoming data - * @param output_data array of pointers to the output data - * @param output_nm array of pointers to the output null masks - * @param input_data pointing to the incoming row data - */ -__global__ void copy_from_rows_fixed_width_optimized(const size_type num_rows, - const size_type num_columns, - const size_type row_size, - const size_type* input_offset_in_row, - const size_type* num_bytes, - int8_t** output_data, - bitmask_type** output_nm, - const int8_t* input_data) -{ - // We are going to copy the data in two passes. - // The first pass copies a chunk of data into shared memory. - // The second pass copies that chunk from shared memory out to the final location. - - // Because shared memory is limited we copy a subset of the rows at a time. - // For simplicity we will refer to this as a row_group - - // In practice we have found writing more than 4 columns of data per thread - // results in performance loss. As such we are using a 2 dimensional - // kernel in terms of threads, but not in terms of blocks. Columns are - // controlled by the y dimension (there is no y dimension in blocks). Rows - // are controlled by the x dimension (there are multiple blocks in the x - // dimension). 
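In miniature, and under the assumption that row_size is 8-byte aligned (which JCUDF_ROW_ALIGNMENT guarantees) and the input pointer is 8-byte aligned, the first pass amounts to the following staging kernel (hypothetical, much simpler than the real one):

#include <cstdint>

// Stage whole rows through shared memory as 8-byte words so global reads
// coalesce regardless of individual column widths; the per-column scatter
// then runs out of fast shared memory. Assumes the grid is sized so that
// first_row < num_rows for every block.
__global__ void stage_rows(int8_t const* input, int row_size, int num_rows)
{
  extern __shared__ int8_t shared_data[];
  int const rows_per_block = blockDim.x;
  int const first_row      = blockIdx.x * rows_per_block;
  int rows_here            = num_rows - first_row;
  if (rows_here > rows_per_block) { rows_here = rows_per_block; }

  auto const* in64 = reinterpret_cast<int64_t const*>(input + first_row * row_size);
  auto* sh64       = reinterpret_cast<int64_t*>(shared_data);
  int const words  = (row_size * rows_here) / static_cast<int>(sizeof(int64_t));

  for (int w = threadIdx.x; w < words; w += blockDim.x) { sh64[w] = in64[w]; }
  __syncthreads();  // pass two (the per-column scatter) would start here
}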
- - size_type const rows_per_group = blockDim.x; - size_type const row_group_start = blockIdx.x; - size_type const row_group_stride = gridDim.x; - size_type const row_group_end = (num_rows + rows_per_group - 1) / rows_per_group + 1; - - extern __shared__ int8_t shared_data[]; - - // Because we are copying fixed width only data and we stride the rows - // this thread will always start copying from shared data in the same place - int8_t* row_tmp = &shared_data[row_size * threadIdx.x]; - int8_t* row_vld_tmp = &row_tmp[input_offset_in_row[num_columns - 1] + num_bytes[num_columns - 1]]; - - for (auto row_group_index = row_group_start; row_group_index < row_group_end; - row_group_index += row_group_stride) { - // Step 1: Copy the data into shared memory - // We know row_size is always aligned with and a multiple of int64_t; - int64_t* long_shared = reinterpret_cast(shared_data); - int64_t const* long_input = reinterpret_cast(input_data); - - auto const shared_output_index = threadIdx.x + (threadIdx.y * blockDim.x); - auto const shared_output_stride = blockDim.x * blockDim.y; - auto const row_index_end = std::min(num_rows, ((row_group_index + 1) * rows_per_group)); - auto const num_rows_in_group = row_index_end - (row_group_index * rows_per_group); - auto const shared_length = row_size * num_rows_in_group; - - size_type const shared_output_end = shared_length / sizeof(int64_t); - - auto const start_input_index = (row_size * row_group_index * rows_per_group) / sizeof(int64_t); - - for (size_type shared_index = shared_output_index; shared_index < shared_output_end; - shared_index += shared_output_stride) { - long_shared[shared_index] = long_input[start_input_index + shared_index]; - } - // Wait for all of the data to be in shared memory - __syncthreads(); - - // Step 2 copy the data back out - - // Within the row group there should be 1 thread for each row. This is a - // requirement for launching the kernel - auto const row_index = (row_group_index * rows_per_group) + threadIdx.x; - // But we might not use all of the threads if the number of rows does not go - // evenly into the thread count. We don't want those threads to exit yet - // because we may need them to copy data in for the next row group. - uint32_t active_mask = __ballot_sync(0xffffffff, row_index < num_rows); - if (row_index < num_rows) { - auto const col_index_start = threadIdx.y; - auto const col_index_stride = blockDim.y; - for (auto col_index = col_index_start; col_index < num_columns; - col_index += col_index_stride) { - auto const col_size = num_bytes[col_index]; - int8_t const* col_tmp = &(row_tmp[input_offset_in_row[col_index]]); - int8_t* col_output = output_data[col_index]; - switch (col_size) { - case 1: { - col_output[row_index] = *col_tmp; - break; - } - case 2: { - int16_t* short_col_output = reinterpret_cast(col_output); - short_col_output[row_index] = *reinterpret_cast(col_tmp); - break; - } - case 4: { - int32_t* int_col_output = reinterpret_cast(col_output); - int_col_output[row_index] = *reinterpret_cast(col_tmp); - break; - } - case 8: { - int64_t* long_col_output = reinterpret_cast(col_output); - long_col_output[row_index] = *reinterpret_cast(col_tmp); - break; - } - default: { - auto const output_offset = col_size * row_index; - // TODO this should just not be supported for fixed width columns, but just in case... 
- for (auto b = 0; b < col_size; b++) { - col_output[b + output_offset] = col_tmp[b]; - } - break; - } - } - - bitmask_type* nm = output_nm[col_index]; - int8_t* valid_byte = &row_vld_tmp[col_index / 8]; - size_type byte_bit_offset = col_index % 8; - int predicate = *valid_byte & (1 << byte_bit_offset); - uint32_t bitmask = __ballot_sync(active_mask, predicate); - if (row_index % 32 == 0) { nm[word_index(row_index)] = bitmask; } - } // end column loop - } // end row copy - // wait for the row_group to be totally copied before starting on the next row group - __syncthreads(); - } -} - -__global__ void copy_to_rows_fixed_width_optimized(const size_type start_row, - const size_type num_rows, - const size_type num_columns, - const size_type row_size, - const size_type* output_offset_in_row, - const size_type* num_bytes, - const int8_t** input_data, - const bitmask_type** input_nm, - int8_t* output_data) -{ - // We are going to copy the data in two passes. - // The first pass copies a chunk of data into shared memory. - // The second pass copies that chunk from shared memory out to the final location. - - // Because shared memory is limited we copy a subset of the rows at a time. - // We do not support copying a subset of the columns in a row yet, so we don't - // currently support a row that is wider than shared memory. - // For simplicity we will refer to this as a row_group - - // In practice we have found reading more than 4 columns of data per thread - // results in performance loss. As such we are using a 2 dimensional - // kernel in terms of threads, but not in terms of blocks. Columns are - // controlled by the y dimension (there is no y dimension in blocks). Rows - // are controlled by the x dimension (there are multiple blocks in the x - // dimension). - - size_type rows_per_group = blockDim.x; - size_type row_group_start = blockIdx.x; - size_type row_group_stride = gridDim.x; - size_type row_group_end = (num_rows + rows_per_group - 1) / rows_per_group + 1; - - extern __shared__ int8_t shared_data[]; - - // Because we are copying fixed width only data and we stride the rows - // this thread will always start copying to shared data in the same place - int8_t* row_tmp = &shared_data[row_size * threadIdx.x]; - int8_t* row_vld_tmp = - &row_tmp[output_offset_in_row[num_columns - 1] + num_bytes[num_columns - 1]]; - - for (size_type row_group_index = row_group_start; row_group_index < row_group_end; - row_group_index += row_group_stride) { - // Within the row group there should be 1 thread for each row. This is a - // requirement for launching the kernel - size_type row_index = start_row + (row_group_index * rows_per_group) + threadIdx.x; - // But we might not use all of the threads if the number of rows does not go - // evenly into the thread count. We don't want those threads to exit yet - // because we may need them to copy data back out. 
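The reason threads are not allowed to exit early is the __syncthreads() at the end of the loop: it must be reached by every thread of the block, so out-of-range threads stay alive and simply skip the work. The safe shape, isolated in a hypothetical kernel:

__global__ void guarded_copy(int const* in, int* out, int n)
{
  int const i = blockIdx.x * blockDim.x + threadIdx.x;
  // WRONG: `if (i >= n) return;` -- returning threads never reach the
  // __syncthreads() below, which is undefined behavior for the block.
  if (i < n) { out[i] = in[i]; }  // RIGHT: guard the work instead
  __syncthreads();                // every thread of the block arrives here
}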
- if (row_index < (start_row + num_rows)) { - size_type col_index_start = threadIdx.y; - size_type col_index_stride = blockDim.y; - for (size_type col_index = col_index_start; col_index < num_columns; - col_index += col_index_stride) { - size_type col_size = num_bytes[col_index]; - int8_t* col_tmp = &(row_tmp[output_offset_in_row[col_index]]); - const int8_t* col_input = input_data[col_index]; - switch (col_size) { - case 1: { - *col_tmp = col_input[row_index]; - break; - } - case 2: { - const int16_t* short_col_input = reinterpret_cast(col_input); - *reinterpret_cast(col_tmp) = short_col_input[row_index]; - break; - } - case 4: { - const int32_t* int_col_input = reinterpret_cast(col_input); - *reinterpret_cast(col_tmp) = int_col_input[row_index]; - break; - } - case 8: { - const int64_t* long_col_input = reinterpret_cast(col_input); - *reinterpret_cast(col_tmp) = long_col_input[row_index]; - break; - } - default: { - size_type input_offset = col_size * row_index; - // TODO this should just not be supported for fixed width columns, but just in case... - for (size_type b = 0; b < col_size; b++) { - col_tmp[b] = col_input[b + input_offset]; - } - break; - } - } - // atomicOr only works on 32 bit or 64 bit aligned values, and not byte aligned - // so we have to rewrite the addresses to make sure that it is 4 byte aligned - int8_t* valid_byte = &row_vld_tmp[col_index / 8]; - size_type byte_bit_offset = col_index % 8; - uint64_t fixup_bytes = reinterpret_cast(valid_byte) % 4; - int32_t* valid_int = reinterpret_cast(valid_byte - fixup_bytes); - size_type int_bit_offset = byte_bit_offset + (fixup_bytes * 8); - // Now copy validity for the column - if (input_nm[col_index]) { - if (bit_is_set(input_nm[col_index], row_index)) { - atomicOr_block(valid_int, 1 << int_bit_offset); - } else { - atomicAnd_block(valid_int, ~(1 << int_bit_offset)); - } - } else { - // It is valid so just set the bit - atomicOr_block(valid_int, 1 << int_bit_offset); - } - } // end column loop - } // end row copy - // wait for the row_group to be totally copied into shared memory - __syncthreads(); - - // Step 2: Copy the data back out - // We know row_size is always aligned with and a multiple of int64_t; - int64_t* long_shared = reinterpret_cast(shared_data); - int64_t* long_output = reinterpret_cast(output_data); - - size_type shared_input_index = threadIdx.x + (threadIdx.y * blockDim.x); - size_type shared_input_stride = blockDim.x * blockDim.y; - size_type row_index_end = ((row_group_index + 1) * rows_per_group); - if (row_index_end > num_rows) { row_index_end = num_rows; } - size_type num_rows_in_group = row_index_end - (row_group_index * rows_per_group); - size_type shared_length = row_size * num_rows_in_group; - - size_type shared_input_end = shared_length / sizeof(int64_t); - - size_type start_output_index = (row_size * row_group_index * rows_per_group) / sizeof(int64_t); - - for (size_type shared_index = shared_input_index; shared_index < shared_input_end; - shared_index += shared_input_stride) { - long_output[start_output_index + shared_index] = long_shared[shared_index]; - } - __syncthreads(); - // Go for the next round - } -} - -#ifdef ASYNC_MEMCPY_SUPPORTED -#define MEMCPY(dst, src, size, barrier) cuda::memcpy_async(dst, src, size, barrier) -#else -#define MEMCPY(dst, src, size, barrier) memcpy(dst, src, size) -#endif // ASYNC_MEMCPY_SUPPORTED - -/** - * @brief copy data from cudf columns into JCUDF format, which is row-based - * - * @tparam RowOffsetFunctor iterator that gives the size of a specific row of the 
table. - * @param num_rows total number of rows in the table - * @param num_columns total number of columns in the table - * @param shmem_used_per_tile shared memory amount each `tile_info` is using - * @param tile_infos span of `tile_info` structs the define the work - * @param input_data pointer to raw table data - * @param col_sizes array of sizes for each element in a column - one per column - * @param col_offsets offset into input data row for each column's start - * @param row_offsets offset to a specific row in the output data - * @param batch_row_boundaries row numbers for batch starts - * @param output_data pointer to output data - * - */ -template -__global__ void copy_to_rows(const size_type num_rows, - const size_type num_columns, - const size_type shmem_used_per_tile, - device_span tile_infos, - const int8_t** input_data, - const size_type* col_sizes, - const size_type* col_offsets, - RowOffsetFunctor row_offsets, - size_type const* batch_row_boundaries, - int8_t** output_data) -{ - // We are going to copy the data in two passes. - // The first pass copies a chunk of data into shared memory. - // The second pass copies that chunk from shared memory out to the final location. - - // Because shared memory is limited we copy a subset of the rows at a time. - // This has been broken up for us in the tile_info struct, so we don't have - // any calculation to do here, but it is important to note. - - auto const group = cooperative_groups::this_thread_block(); - auto const warp = cooperative_groups::tiled_partition(group); - extern __shared__ int8_t shared_data[]; - -#ifdef ASYNC_MEMCPY_SUPPORTED - __shared__ cuda::barrier tile_barrier; - if (group.thread_rank() == 0) { init(&tile_barrier, group.size()); } - group.sync(); -#endif // ASYNC_MEMCPY_SUPPORTED - - auto const tile = tile_infos[blockIdx.x]; - auto const num_tile_cols = tile.num_cols(); - auto const num_tile_rows = tile.num_rows(); - auto const tile_row_size = tile.get_shared_row_size(col_offsets, col_sizes); - auto const starting_column_offset = col_offsets[tile.start_col]; - - // to do the copy we need to do n column copies followed by m element copies OR we have to do m - // element copies followed by r row copies. When going from column to row it is much easier to - // copy by elements first otherwise we would need a running total of the column sizes for our - // tile, which isn't readily available. This makes it more appealing to copy element-wise from - // input data into shared matching the end layout and do row-based memcopies out. 
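The row-wise copies out of shared memory go through the MEMCPY macro defined above: cuda::memcpy_async with a block-scoped cuda::barrier where the architecture supports it, a plain strided loop otherwise. That pattern, reduced to a hypothetical self-contained kernel:

#include <cooperative_groups.h>
#include <cstdint>
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700
#define ASYNC_MEMCPY_SUPPORTED
#endif
#if !defined(__CUDA_ARCH__) || defined(ASYNC_MEMCPY_SUPPORTED)
#include <cuda/barrier>
#endif
#pragma nv_diag_suppress static_var_with_dynamic_init  // shared cuda::barrier

namespace cg = cooperative_groups;

__global__ void block_copy(int8_t* dst, int8_t const* src, std::size_t bytes)
{
  auto const block = cg::this_thread_block();
#ifdef ASYNC_MEMCPY_SUPPORTED
  __shared__ cuda::barrier<cuda::thread_scope_block> barrier;
  if (block.thread_rank() == 0) { init(&barrier, block.size()); }
  block.sync();
  // On Ampere and later this may run on dedicated copy hardware.
  cuda::memcpy_async(block, dst, src, bytes, barrier);
  barrier.arrive_and_wait();  // copy is complete and visible after this
#else
  for (std::size_t b = block.thread_rank(); b < bytes; b += block.size()) {
    dst[b] = src[b];
  }
  block.sync();
#endif
}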
- - // read each column across the tile - // each warp takes a column with each thread of a warp taking a row this is done with cooperative - // groups where each column is chosen by the tiled partition and each thread in that partition - // works on a row - for (int relative_col = warp.meta_group_rank(); relative_col < num_tile_cols; - relative_col += warp.meta_group_size()) { - auto const absolute_col = relative_col + tile.start_col; - auto const col_size = col_sizes[absolute_col]; - auto const col_offset = col_offsets[absolute_col]; - auto const relative_col_offset = col_offset - starting_column_offset; - auto const col_ptr = input_data[absolute_col]; - - if (col_ptr == nullptr) { - // variable-width data column - continue; - } - - for (int relative_row = warp.thread_rank(); relative_row < num_tile_rows; - relative_row += warp.size()) { - if (relative_row >= num_tile_rows) { - // out of bounds - continue; - } - auto const absolute_row = relative_row + tile.start_row; - - auto const shared_offset = relative_row * tile_row_size + relative_col_offset; - auto const input_src = col_ptr + col_size * absolute_row; - - // copy the element from global memory - switch (col_size) { - case 2: { - const int16_t* short_col_input = reinterpret_cast(input_src); - *reinterpret_cast(&shared_data[shared_offset]) = *short_col_input; - break; - } - case 4: { - const int32_t* int_col_input = reinterpret_cast(input_src); - *reinterpret_cast(&shared_data[shared_offset]) = *int_col_input; - break; - } - case 8: { - const int64_t* long_col_input = reinterpret_cast(input_src); - *reinterpret_cast(&shared_data[shared_offset]) = *long_col_input; - break; - } - case 1: shared_data[shared_offset] = *input_src; break; - default: { - for (int i = 0; i < col_size; ++i) { - shared_data[shared_offset] = *input_src; - } - break; - } - } - } - } - - auto const tile_output_buffer = output_data[tile.batch_number]; - auto const row_batch_start = tile.batch_number == 0 ? 0 : batch_row_boundaries[tile.batch_number]; - - // no async copies above waiting on the barrier, so we sync the group here to ensure all copies to - // shared memory are completed before copying data out - group.sync(); - - // each warp takes a row - for (int copy_row = warp.meta_group_rank(); copy_row < tile.num_rows(); - copy_row += warp.meta_group_size()) { - auto const src = &shared_data[tile_row_size * copy_row]; - auto const dst = tile_output_buffer + row_offsets(copy_row + tile.start_row, row_batch_start) + - starting_column_offset; -#ifdef ASYNC_MEMCPY_SUPPORTED - cuda::memcpy_async(warp, dst, src, tile_row_size, tile_barrier); -#else - for (int b = warp.thread_rank(); b < tile_row_size; b += warp.size()) { - dst[b] = src[b]; - } -#endif - } - -#ifdef ASYNC_MEMCPY_SUPPORTED - // wait on the last copies to complete - tile_barrier.arrive_and_wait(); -#else - group.sync(); -#endif // ASYNC_MEMCPY_SUPPORTED -} - -/** - * @brief copy data from row-based format to cudf columns - * - * @tparam RowOffsetFunctor iterator that gives the size of a specific row of the table. 
- * @param num_rows total number of rows in the table
- * @param num_columns total number of columns in the table
- * @param shmem_used_per_tile amount of shared memory that is used by a tile
- * @param row_offsets offset to a specific row in the output data
- * @param batch_row_boundaries row numbers for batch starts
- * @param output_data pointer to output data, partitioned by data size
- * @param validity_offset offset into the output row for validity data
- * @param tile_infos information about the tiles of work
- * @param input_nm pointers to the null masks of the input columns - one per column
- *
- */
-template <typename RowOffsetFunctor>
-__global__ void copy_validity_to_rows(const size_type num_rows,
-                                      const size_type num_columns,
-                                      const size_type shmem_used_per_tile,
-                                      RowOffsetFunctor row_offsets,
-                                      size_type const* batch_row_boundaries,
-                                      int8_t** output_data,
-                                      const size_type validity_offset,
-                                      device_span<const tile_info> tile_infos,
-                                      const bitmask_type** input_nm)
-{
-  extern __shared__ int8_t shared_data[];
-
-  // each thread of a warp reads a single int32 of validity, so we read 128 bytes, then ballot_sync
-  // the bits and write the result to shmem. After we fill shared mem we memcpy it out in a blob.
-  auto const group = cooperative_groups::this_thread_block();
-  auto const warp  = cooperative_groups::tiled_partition<cudf::detail::warp_size>(group);
-
-#ifdef ASYNC_MEMCPY_SUPPORTED
-  // Initialize cuda barriers for each tile.
-  __shared__ cuda::barrier<cuda::thread_scope_block> shared_tile_barrier;
-  if (group.thread_rank() == 0) { init(&shared_tile_barrier, group.size()); }
-  group.sync();
-#endif  // ASYNC_MEMCPY_SUPPORTED
-
-  auto tile                = tile_infos[blockIdx.x];
-  auto const num_tile_cols = tile.num_cols();
-  auto const num_tile_rows = tile.num_rows();
-
-  auto const threads_per_warp = warp.size();
-  auto const rows_per_read    = cudf::detail::size_in_bits<bitmask_type>();
-
-  auto const num_sections_x = util::div_rounding_up_unsafe(num_tile_cols, threads_per_warp);
-  auto const num_sections_y = util::div_rounding_up_unsafe(num_tile_rows, rows_per_read);
-  auto const validity_data_row_length = util::round_up_unsafe(
-    util::div_rounding_up_unsafe(num_tile_cols, CHAR_BIT), JCUDF_ROW_ALIGNMENT);
-  auto const total_sections = num_sections_x * num_sections_y;
-
-  // the tile is divided into sections. A warp operates on a section at a time.
-  for (int my_section_idx = warp.meta_group_rank(); my_section_idx < total_sections;
-       my_section_idx += warp.meta_group_size()) {
-    // convert to rows and cols
-    auto const section_x          = my_section_idx % num_sections_x;
-    auto const section_y          = my_section_idx / num_sections_x;
-    auto const relative_col       = section_x * threads_per_warp + warp.thread_rank();
-    auto const relative_row       = section_y * rows_per_read;
-    auto const absolute_col       = relative_col + tile.start_col;
-    auto const absolute_row       = relative_row + tile.start_row;
-    auto const participating      = absolute_col < num_columns && absolute_row < num_rows;
-    auto const participation_mask = __ballot_sync(0xFFFFFFFF, participating);
-
-    if (participating) {
-      auto my_data = input_nm[absolute_col] != nullptr
-                       ? input_nm[absolute_col][word_index(absolute_row)]
-                       : std::numeric_limits<uint32_t>::max();
-
-      // every thread that is participating in the warp has 4 bytes, but it's column-based data and
-      // we need it in row-based. So we shuffle the bits around with ballot_sync to make the bytes
-      // we actually write.
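-
-      // [Editorial note] Illustrative sketch, not original code: __ballot_sync(mask, pred)
-      // returns a 32-bit word whose bit i is lane i's predicate. Since lane i holds column i's
-      // validity word here,
-      //
-      //   uint32_t row_bits = __ballot_sync(participation_mask, my_data & dw_mask);
-      //
-      // produces one row's validity with one bit per column, which is the row-major layout the
-      // loop below writes to shared memory.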
- bitmask_type dw_mask = 0x1; - for (int i = 0; i < threads_per_warp && relative_row + i < num_rows; ++i, dw_mask <<= 1) { - auto validity_data = __ballot_sync(participation_mask, my_data & dw_mask); - // lead thread in each warp writes data - auto const validity_write_offset = - validity_data_row_length * (relative_row + i) + (relative_col / CHAR_BIT); - if (warp.thread_rank() == 0) { - *reinterpret_cast(&shared_data[validity_write_offset]) = validity_data; - } - } - } - } - - auto const output_data_base = - output_data[tile.batch_number] + validity_offset + tile.start_col / CHAR_BIT; - - // each warp copies a row at a time - auto const row_bytes = util::div_rounding_up_unsafe(num_tile_cols, CHAR_BIT); - auto const row_batch_start = tile.batch_number == 0 ? 0 : batch_row_boundaries[tile.batch_number]; - - // make sure entire tile has finished copy - // Note that this was copied from above just under the for loop due to nsight complaints about - // divergent threads - group.sync(); - - for (int relative_row = warp.meta_group_rank(); relative_row < num_tile_rows; - relative_row += warp.meta_group_size()) { - auto const src = &shared_data[validity_data_row_length * relative_row]; - auto const dst = output_data_base + row_offsets(relative_row + tile.start_row, row_batch_start); -#ifdef ASYNC_MEMCPY_SUPPORTED - cuda::memcpy_async(warp, dst, src, row_bytes, shared_tile_barrier); -#else - for (int b = warp.thread_rank(); b < row_bytes; b += warp.size()) { - dst[b] = src[b]; - } -#endif - } - -#ifdef ASYNC_MEMCPY_SUPPORTED - // wait for tile of data to arrive - shared_tile_barrier.arrive_and_wait(); -#else - group.sync(); -#endif // ASYNC_MEMCPY_SUPPORTED -} - -/** - * @brief kernel to copy string data to JCUDF row format - * - * @tparam RowOffsetFunctor iterator for row offsets into the destination data - * @param num_rows number of rows in this portion of the table - * @param num_variable_columns number of columns of variable-width data - * @param variable_input_data variable width data column pointers - * @param variable_col_output_offsets output offset information for variable-width columns - * @param variable_col_offsets input offset information for variable-width columns - * @param fixed_width_row_size offset to variable-width data in a row - * @param row_offsets offsets for each row in output data - * @param batch_row_offset row start for this batch - * @param output_data pointer to output data for this batch - * - */ -template -__global__ void copy_strings_to_rows(size_type const num_rows, - size_type const num_variable_columns, - int8_t const** variable_input_data, - size_type const* variable_col_output_offsets, - size_type const** variable_col_offsets, - size_type fixed_width_row_size, - RowOffsetFunctor row_offsets, - size_type const batch_row_offset, - int8_t* output_data) -{ - // Each block will take a group of rows controlled by NUM_STRING_ROWS_PER_BLOCK_TO_ROWS. Each warp - // will copy a row at a time. The base thread will first go through column data and fill out - // offset/length information for the column. Then all threads of the warp will participate in the - // memcpy of the string data. 
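-
-  // [Editorial note] Illustrative layout sketch (names assumed, not original code): each output
-  // row is laid out as
-  //
-  //   [ fixed-width values, with an (offset, length) uint32 pair in place of each string column |
-  //     validity bytes | string bytes appended in column order ]
-  //
-  // so `offset` starts at fixed_width_row_size and is advanced by each string's length as the
-  // warp memcpys the characters to row_start + offset.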
-  auto const my_block = cooperative_groups::this_thread_block();
-  auto const warp     = cooperative_groups::tiled_partition<cudf::detail::warp_size>(my_block);
-#ifdef ASYNC_MEMCPY_SUPPORTED
-  cuda::barrier<cuda::thread_scope_block> block_barrier;
-#endif
-
-  auto const start_row =
-    blockIdx.x * NUM_STRING_ROWS_PER_BLOCK_TO_ROWS + warp.meta_group_rank() + batch_row_offset;
-  auto const end_row =
-    std::min(num_rows, static_cast<size_type>(start_row + NUM_STRING_ROWS_PER_BLOCK_TO_ROWS));
-
-  for (int row = start_row; row < end_row; row += warp.meta_group_size()) {
-    auto offset                = fixed_width_row_size;  // initial offset to variable-width data
-    auto const base_row_offset = row_offsets(row, 0);
-    for (int col = 0; col < num_variable_columns; ++col) {
-      auto const string_start_offset = variable_col_offsets[col][row];
-      auto const string_length       = variable_col_offsets[col][row + 1] - string_start_offset;
-      if (warp.thread_rank() == 0) {
-        // write the offset/length to column
-        uint32_t* output_dest = reinterpret_cast<uint32_t*>(
-          &output_data[base_row_offset + variable_col_output_offsets[col]]);
-        output_dest[0] = offset;
-        output_dest[1] = string_length;
-      }
-      auto string_output_dest = &output_data[base_row_offset + offset];
-      auto string_output_src  = &variable_input_data[col][string_start_offset];
-      warp.sync();
-#ifdef ASYNC_MEMCPY_SUPPORTED
-      cuda::memcpy_async(warp, string_output_dest, string_output_src, string_length, block_barrier);
-#else
-      for (int c = warp.thread_rank(); c < string_length; c += warp.size()) {
-        string_output_dest[c] = string_output_src[c];
-      }
-#endif
-      offset += string_length;
-    }
-  }
-}
-
-/**
- * @brief copy data from row-based format to cudf columns
- *
- * @tparam RowOffsetFunctor iterator that gives the size of a specific row of the table.
- * @param num_rows total number of rows in the table
- * @param num_columns total number of columns in the table
- * @param shmem_used_per_tile amount of shared memory that is used by a tile
- * @param row_offsets offset to a specific row in the input data
- * @param batch_row_boundaries row numbers for batch starts
- * @param output_data pointers to column data
- * @param col_sizes array of sizes for each element in a column - one per column
- * @param col_offsets offset into input data row for each column's start
- * @param tile_infos information about the tiles of work
- * @param input_data pointer to input data
- *
- */
-template <typename RowOffsetFunctor>
-__global__ void copy_from_rows(const size_type num_rows,
-                               const size_type num_columns,
-                               const size_type shmem_used_per_tile,
-                               RowOffsetFunctor row_offsets,
-                               size_type const* batch_row_boundaries,
-                               int8_t** output_data,
-                               const size_type* col_sizes,
-                               const size_type* col_offsets,
-                               device_span<const tile_info> tile_infos,
-                               const int8_t* input_data)
-{
-  // We are going to copy the data in two passes.
-  // The first pass copies a chunk of data into shared memory.
-  // The second pass copies that chunk from shared memory out to the final location.
-
-  // Because shared memory is limited we copy a subset of the rows at a time. This has been broken
-  // up for us in the tile_info struct, so we don't have any calculation to do here, but it is
-  // important to note.
-
-  // To speed up some of the random access memory we do, we copy col_sizes and col_offsets to shared
-  // memory for each of the tiles that we work on
-
-  auto const group = cooperative_groups::this_thread_block();
-  auto const warp  = cooperative_groups::tiled_partition<cudf::detail::warp_size>(group);
-  extern __shared__ int8_t shared[];
-
-#ifdef ASYNC_MEMCPY_SUPPORTED
-  // Initialize cuda barriers for each tile.
- __shared__ cuda::barrier tile_barrier; - if (group.thread_rank() == 0) { init(&tile_barrier, group.size()); } - group.sync(); -#endif // ASYNC_MEMCPY_SUPPORTED - - { - auto const fetch_tile = tile_infos[blockIdx.x]; - auto const fetch_tile_start_row = fetch_tile.start_row; - auto const starting_col_offset = col_offsets[fetch_tile.start_col]; - auto const fetch_tile_row_size = fetch_tile.get_shared_row_size(col_offsets, col_sizes); - auto const row_batch_start = - fetch_tile.batch_number == 0 ? 0 : batch_row_boundaries[fetch_tile.batch_number]; - - for (int absolute_row = warp.meta_group_rank() + fetch_tile.start_row; - absolute_row <= fetch_tile.end_row; - absolute_row += warp.meta_group_size()) { - warp.sync(); - auto shared_offset = (absolute_row - fetch_tile_start_row) * fetch_tile_row_size; - auto dst = &shared[shared_offset]; - auto src = &input_data[row_offsets(absolute_row, row_batch_start) + starting_col_offset]; - // copy the data -#ifdef ASYNC_MEMCPY_SUPPORTED - cuda::memcpy_async(warp, dst, src, fetch_tile_row_size, tile_barrier); -#else - for (int b = warp.thread_rank(); b < fetch_tile_row_size; b += warp.size()) { - dst[b] = src[b]; - } -#endif - } - } - - { - auto const tile = tile_infos[blockIdx.x]; - auto const rows_in_tile = tile.num_rows(); - auto const cols_in_tile = tile.num_cols(); - auto const tile_row_size = tile.get_shared_row_size(col_offsets, col_sizes); - -#ifdef ASYNC_MEMCPY_SUPPORTED - // ensure our data is ready - tile_barrier.arrive_and_wait(); -#else - group.sync(); -#endif // ASYNC_MEMCPY_SUPPORTED - - // Now we copy from shared memory to final destination. The data is laid out in rows in shared - // memory, so the reads for a column will be "vertical". Because of this and the different sizes - // for each column, this portion is handled on row/column basis. to prevent each thread working - // on a single row and also to ensure that all threads can do work in the case of more threads - // than rows, we do a global index instead of a double for loop with col/row. - for (int relative_row = warp.thread_rank(); relative_row < rows_in_tile; - relative_row += warp.size()) { - auto const absolute_row = relative_row + tile.start_row; - auto const shared_memory_row_offset = tile_row_size * relative_row; - - for (int relative_col = warp.meta_group_rank(); relative_col < cols_in_tile; - relative_col += warp.meta_group_size()) { - auto const absolute_col = relative_col + tile.start_col; - - auto const shared_memory_offset = - col_offsets[absolute_col] - col_offsets[tile.start_col] + shared_memory_row_offset; - auto const column_size = col_sizes[absolute_col]; - - int8_t* shmem_src = &shared[shared_memory_offset]; - int8_t* dst = &output_data[absolute_col][absolute_row * column_size]; - - MEMCPY(dst, shmem_src, column_size, tile_barrier); - } - } - } - -#ifdef ASYNC_MEMCPY_SUPPORTED - // wait on the last copies to complete - tile_barrier.arrive_and_wait(); -#else - group.sync(); -#endif // ASYNC_MEMCPY_SUPPORTED -} - -/** - * @brief copy data from row-based format to cudf columns - * - * @tparam RowOffsetFunctor iterator that gives the size of a specific row of the table. 
- * @param num_rows total number of rows in the table - * @param num_columns total number of columns in the table - * @param shmem_used_per_tile amount of shared memory that is used by a tile - * @param row_offsets offset to the first column a specific row in the input data - * @param batch_row_boundaries row numbers for batch starts - * @param output_nm pointers to null masks for columns - * @param validity_offsets offset into input data row for validity data - * @param tile_infos information about the tiles of work - * @param input_data pointer to input data - * - */ -template -__global__ void copy_validity_from_rows(const size_type num_rows, - const size_type num_columns, - const size_type shmem_used_per_tile, - RowOffsetFunctor row_offsets, - size_type const* batch_row_boundaries, - bitmask_type** output_nm, - const size_type validity_offset, - device_span tile_infos, - const int8_t* input_data) -{ - extern __shared__ int8_t shared[]; - - using cudf::detail::warp_size; - - // each thread of warp reads a single byte of validity - so we read 32 bytes then ballot_sync the - // bits and write the result to shmem after we fill shared mem memcpy it out in a blob. Probably - // need knobs for number of rows vs columns to balance read/write - - // C0 C1 C2 C3 C4 C5 C6 C7 - // R0 1 0 1 0 0 1 1 0 <-- thread 0 reads byte r0 - // R1 1 1 1 1 1 1 1 0 <-- thread 1 reads byte r1 - // R2 0 0 1 0 0 1 1 0 <-- thread 2 reads byte r2 - // ... - // R31 1 1 1 1 1 1 1 1 <-- thread 31 reads byte r31 - // ^ - // | 1 bit of each input byte, by column, are swizzled into a single 32 bit word via - // __ballot_sync, representing 32 rows of that column. - - auto const group = cooperative_groups::this_thread_block(); - auto const warp = cooperative_groups::tiled_partition(group); - -#ifdef ASYNC_MEMCPY_SUPPORTED - // Initialize cuda barriers for each tile. - __shared__ cuda::barrier shared_tile_barrier; - if (group.thread_rank() == 0) { init(&shared_tile_barrier, group.size()); } - group.sync(); -#endif // ASYNC_MEMCPY_SUPPORTED - - auto const tile = tile_infos[blockIdx.x]; - auto const tile_start_col = tile.start_col; - auto const tile_start_row = tile.start_row; - auto const num_tile_cols = tile.num_cols(); - auto const num_tile_rows = tile.num_rows(); - - auto const threads_per_warp = warp.size(); - auto const cols_per_read = CHAR_BIT; - - auto const rows_per_read = static_cast(threads_per_warp); - auto const num_sections_x = util::div_rounding_up_safe(num_tile_cols, cols_per_read); - auto const num_sections_y = util::div_rounding_up_safe(num_tile_rows, rows_per_read); - auto const validity_data_col_length = num_sections_y * 4; // words to bytes - auto const total_sections = num_sections_x * num_sections_y; - - // the tile is divided into sections. A warp operates on a section at a time. - for (int my_section_idx = warp.meta_group_rank(); my_section_idx < total_sections; - my_section_idx += warp.meta_group_size()) { - // convert section to row and col - auto const section_x = my_section_idx % num_sections_x; - auto const section_y = my_section_idx / num_sections_x; - auto const relative_col = section_x * cols_per_read; - auto const relative_row = section_y * rows_per_read + warp.thread_rank(); - auto const absolute_col = relative_col + tile_start_col; - auto const absolute_row = relative_row + tile_start_row; - auto const row_batch_start = - tile.batch_number == 0 ? 
0 : batch_row_boundaries[tile.batch_number]; - - auto const participation_mask = __ballot_sync(0xFFFFFFFF, absolute_row < num_rows); - - if (absolute_row < num_rows) { - auto const my_byte = input_data[row_offsets(absolute_row, row_batch_start) + validity_offset + - (absolute_col / cols_per_read)]; - - // so every thread that is participating in the warp has a byte, but it's row-based data and - // we need it in column-based. So we shuffle the bits around to make the bytes we actually - // write. - for (int i = 0, byte_mask = 0x1; (i < cols_per_read) && ((relative_col + i) < num_columns); - ++i, byte_mask <<= 1) { - auto const validity_data = __ballot_sync(participation_mask, my_byte & byte_mask); - // lead thread in each warp writes data - if (warp.thread_rank() == 0) { - auto const validity_write_offset = - validity_data_col_length * (relative_col + i) + relative_row / cols_per_read; - *reinterpret_cast(&shared[validity_write_offset]) = validity_data; - } - } - } - } - - // now memcpy the shared memory out to the final destination - auto const col_words = util::div_rounding_up_unsafe(num_tile_rows, CHAR_BIT * 4); - - // make sure entire tile has finished copy - group.sync(); - - for (int relative_col = warp.meta_group_rank(); relative_col < num_tile_cols; - relative_col += warp.meta_group_size()) { - auto const absolute_col = relative_col + tile_start_col; - auto dst = output_nm[absolute_col] + word_index(tile_start_row); - auto const src = - reinterpret_cast(&shared[validity_data_col_length * relative_col]); - -#ifdef ASYNC_MEMCPY_SUPPORTED - cuda::memcpy_async( - warp, dst, src, aligned_size_t<4>(validity_data_col_length), shared_tile_barrier); -#else - for (int b = warp.thread_rank(); b < col_words; b += warp.size()) { - dst[b] = src[b]; - } -#endif - } - -#ifdef ASYNC_MEMCPY_SUPPORTED - // wait for tile of data to arrive - shared_tile_barrier.arrive_and_wait(); -#else - group.sync(); -#endif // ASYNC_MEMCPY_SUPPORTED -} - -/** - * @brief copies string data from jcudf row format to cudf columns - * - * @tparam RowOffsetFunctor iterator for row offsets into the destination data - * @param row_offsets offsets for each row in input data - * @param string_row_offsets offset data into jcudf row data for each string - * @param string_lengths length of each incoming string in each column - * @param string_column_offsets offset column data for cudf column - * @param string_col_data output cudf string column data - * @param row_data jcudf row data - * @param num_rows number of rows in data - * @param num_string_columns number of string columns in the table - */ -template -__global__ void copy_strings_from_rows(RowOffsetFunctor row_offsets, - int32_t** string_row_offsets, - int32_t** string_lengths, - size_type** string_column_offsets, - char** string_col_data, - int8_t const* row_data, - size_type const num_rows, - size_type const num_string_columns) -{ - // Each warp takes a tile, which is a single column and up to ROWS_PER_BLOCK rows. A tile will not - // wrap around the bottom of the table. The warp will copy the strings for each row in the tile. - // Traversing in row-major order to coalesce the offsets and size reads. 
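-
-  // [Editorial note] Illustrative sketch, not original code: tiles are numbered column-major
-  // over a (tiles_per_col x num_string_columns) grid, so a flat tile index decomposes as
-  //
-  //   col          = my_tile / tiles_per_col;
-  //   starting_row = (my_tile % tiles_per_col) * ROWS_PER_BLOCK;
-  //
-  // which is exactly the arithmetic used in the loop below.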
- auto my_block = cooperative_groups::this_thread_block(); - auto warp = cooperative_groups::tiled_partition(my_block); -#ifdef ASYNC_MEMCPY_SUPPORTED - cuda::barrier block_barrier; -#endif - - // workaround for not being able to take a reference to a constexpr host variable - auto const ROWS_PER_BLOCK = NUM_STRING_ROWS_PER_BLOCK_FROM_ROWS; - auto const tiles_per_col = util::div_rounding_up_unsafe(num_rows, ROWS_PER_BLOCK); - auto const starting_tile = blockIdx.x * warp.meta_group_size() + warp.meta_group_rank(); - auto const num_tiles = tiles_per_col * num_string_columns; - auto const tile_stride = warp.meta_group_size() * gridDim.x; - // Each warp will copy strings in its tile. This is handled by all the threads of a warp passing - // the same parameters to async_memcpy and all threads in the warp participating in the copy. - for (auto my_tile = starting_tile; my_tile < num_tiles; my_tile += tile_stride) { - auto const starting_row = (my_tile % tiles_per_col) * ROWS_PER_BLOCK; - auto const col = my_tile / tiles_per_col; - auto const str_len = string_lengths[col]; - auto const str_row_off = string_row_offsets[col]; - auto const str_col_off = string_column_offsets[col]; - auto str_col_data = string_col_data[col]; - for (int row = starting_row; row < starting_row + ROWS_PER_BLOCK && row < num_rows; ++row) { - auto const src = &row_data[row_offsets(row, 0) + str_row_off[row]]; - auto dst = &str_col_data[str_col_off[row]]; - -#ifdef ASYNC_MEMCPY_SUPPORTED - cuda::memcpy_async(warp, dst, src, str_len[row], block_barrier); -#else - for (int c = warp.thread_rank(); c < str_len[row]; c += warp.size()) { - dst[c] = src[c]; - } -#endif - } - } -} - -/** - * @brief Calculate the dimensions of the kernel for fixed width only columns. - * - * @param [in] num_columns the number of columns being copied. - * @param [in] num_rows the number of rows being copied. - * @param [in] size_per_row the size each row takes up when padded. - * @param [out] blocks the size of the blocks for the kernel - * @param [out] threads the size of the threads for the kernel - * @return the size in bytes of shared memory needed for each block. - */ -static int calc_fixed_width_kernel_dims(const size_type num_columns, - const size_type num_rows, - const size_type size_per_row, - dim3& blocks, - dim3& threads) -{ - // We have found speed degrades when a thread handles more than 4 columns. - // Each block is 2 dimensional. The y dimension indicates the columns. - // We limit this to 32 threads in the y dimension so we can still - // have at least 32 threads in the x dimension (1 warp) which should - // result in better coalescing of memory operations. We also - // want to guarantee that we are processing a multiple of 32 threads - // in the x dimension because we use atomic operations at the block - // level when writing validity data out to main memory, and that would - // need to change if we split a word of validity data between blocks. - int const y_block_size = min(util::div_rounding_up_safe(num_columns, 4), 32); - int const x_possible_block_size = 1024 / y_block_size; - // 48KB is the default setting for shared memory per block according to the cuda tutorials - // If someone configures the GPU to only have 16 KB this might not work. 
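-
-  // [Editorial note] Worked example with assumed values, not original code: for num_columns = 8
-  // and size_per_row = 64 bytes, y_block_size = min(ceil(8 / 4), 32) = 2 and
-  // x_possible_block_size = 1024 / 2 = 512; the shared-memory cap below then allows
-  // min(512, 48 * 1024 / 64) = 512 threads in x, already a multiple of 32.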
-  int const max_shared_size = 48 * 1024;
-  // If we don't have enough shared memory there is no point in having more threads
-  // per block that will just sit idle
-  auto const max_block_size = std::min(x_possible_block_size, max_shared_size / size_per_row);
-  // Make sure that the x dimension is a multiple of 32; this not only helps
-  // coalesce memory access, it also lets us do a ballot sync for validity to write
-  // the data back out at the warp level. If x is a multiple of 32 then each thread in the y
-  // dimension is associated with one or more warps, that should correspond to the validity
-  // words directly.
-  int const block_size = (max_block_size / 32) * 32;
-  CUDF_EXPECTS(block_size != 0, "Row size is too large to fit in shared memory");
-
-  // The maximum number of blocks supported in the x dimension is 2 ^ 31 - 1
-  // but in practice having too many can cause some overhead that I don't totally
-  // understand. Playing around with this, having as little as 600 blocks appears
-  // to be able to saturate memory on V100, so this is an order of magnitude higher
-  // to try and future proof this a bit.
-  int const num_blocks = std::clamp((num_rows + block_size - 1) / block_size, 1, 10240);
-
-  blocks.x  = num_blocks;
-  blocks.y  = 1;
-  blocks.z  = 1;
-  threads.x = block_size;
-  threads.y = y_block_size;
-  threads.z = 1;
-  return size_per_row * block_size;
-}
-
-/**
- * When converting to rows it is possible that the size of the table was too big to fit
- * in a single column. This creates an output column for a subset of the rows in a table
- * going from start row and containing the next num_rows. Most of the parameters passed
- * into this function are common between runs and should be calculated once.
- */
-static std::unique_ptr<column> fixed_width_convert_to_rows(
-  const size_type start_row,
-  const size_type num_rows,
-  const size_type num_columns,
-  const size_type size_per_row,
-  rmm::device_uvector<size_type>& column_start,
-  rmm::device_uvector<size_type>& column_size,
-  rmm::device_uvector<const int8_t*>& input_data,
-  rmm::device_uvector<const bitmask_type*>& input_nm,
-  const scalar& zero,
-  const scalar& scalar_size_per_row,
-  rmm::cuda_stream_view stream,
-  rmm::mr::device_memory_resource* mr)
-{
-  int64_t const total_allocation = size_per_row * num_rows;
-  // We made a mistake in the split somehow
-  CUDF_EXPECTS(total_allocation < std::numeric_limits<size_type>::max(),
-               "Table is too large to fit!");
-
-  // Allocate and set the offsets row for the byte array
-  std::unique_ptr<column> offsets =
-    cudf::detail::sequence(num_rows + 1, zero, scalar_size_per_row, stream, mr);
-
-  std::unique_ptr<column> data = make_numeric_column(data_type(type_id::INT8),
-                                                     static_cast<size_type>(total_allocation),
-                                                     mask_state::UNALLOCATED,
-                                                     stream,
-                                                     mr);
-
-  dim3 blocks;
-  dim3 threads;
-  int shared_size =
-    detail::calc_fixed_width_kernel_dims(num_columns, num_rows, size_per_row, blocks, threads);
-
-  copy_to_rows_fixed_width_optimized<<<blocks, threads, shared_size, stream.value()>>>(
-    start_row,
-    num_rows,
-    num_columns,
-    size_per_row,
-    column_start.data(),
-    column_size.data(),
-    input_data.data(),
-    input_nm.data(),
-    data->mutable_view().data<int8_t>());
-
-  return make_lists_column(num_rows,
-                           std::move(offsets),
-                           std::move(data),
-                           0,
-                           rmm::device_buffer{0, cudf::get_default_stream(), mr},
-                           stream,
-                           mr);
-}
-
-static inline bool are_all_fixed_width(std::vector<data_type> const& schema)
-{
-  return std::all_of(
-    schema.begin(), schema.end(), [](const data_type& t) { return is_fixed_width(t); });
-}
-
-/**
- * @brief Given a set of fixed width columns, calculate how the data will be laid out in memory.
- * - * @param [in] schema the types of columns that need to be laid out. - * @param [out] column_start the byte offset where each column starts in the row. - * @param [out] column_size the size in bytes of the data for each columns in the row. - * @return the size in bytes each row needs. - */ -static inline int32_t compute_fixed_width_layout(std::vector const& schema, - std::vector& column_start, - std::vector& column_size) -{ - // We guarantee that the start of each column is 64-bit aligned so anything can go - // there, but to make the code simple we will still do an alignment for it. - int32_t at_offset = 0; - for (auto col = schema.begin(); col < schema.end(); col++) { - size_type s = size_of(*col); - column_size.emplace_back(s); - std::size_t allocation_needed = s; - std::size_t alignment_needed = allocation_needed; // They are the same for fixed width types - at_offset = util::round_up_unsafe(at_offset, static_cast(alignment_needed)); - column_start.emplace_back(at_offset); - at_offset += allocation_needed; - } - - // Now we need to add in space for validity - // Eventually we can think about nullable vs not nullable, but for now we will just always add - // it in - int32_t const validity_bytes_needed = - util::div_rounding_up_safe(schema.size(), CHAR_BIT); - // validity comes at the end and is byte aligned so we can pack more in. - at_offset += validity_bytes_needed; - // Now we need to pad the end so all rows are 64 bit aligned - return util::round_up_unsafe(at_offset, JCUDF_ROW_ALIGNMENT); -} - -/** - * @brief column sizes and column start offsets for a table - */ -struct column_info_s { - size_type size_per_row; - std::vector column_starts; - std::vector column_sizes; - std::vector variable_width_column_starts; - - column_info_s& operator=(column_info_s const& other) = delete; - column_info_s& operator=(column_info_s&& other) = delete; -}; - -/** - * @brief Compute information about a table such as bytes per row and offsets. - * - * @tparam iterator iterator of column schema data - * @param begin starting iterator of column schema - * @param end ending iterator of column schema - * @param column_starts column start offsets - * @param column_sizes size in bytes of each column - * @return size of the fixed_width data portion of a row. - */ -template -column_info_s compute_column_information(iterator begin, iterator end) -{ - size_type size_per_row = 0; - std::vector column_starts; - std::vector column_sizes; - std::vector variable_width_column_starts; - - column_starts.reserve(std::distance(begin, end) + 1); - column_sizes.reserve(std::distance(begin, end)); - - for (auto col_type = begin; col_type != end; ++col_type) { - bool const compound_type = is_compound(*col_type); - - // a list or string column will write a single uint64 of data here for offset/length - auto const col_size = compound_type ? sizeof(uint32_t) + sizeof(uint32_t) : size_of(*col_type); - - // align size for this type - They are the same for fixed width types and 4 bytes for variable - // width length/offset combos - size_type const alignment_needed = compound_type ? 
__alignof(uint32_t) : col_size;
-    size_per_row = util::round_up_unsafe(size_per_row, alignment_needed);
-    if (compound_type) { variable_width_column_starts.push_back(size_per_row); }
-    column_starts.push_back(size_per_row);
-    column_sizes.push_back(col_size);
-    size_per_row += col_size;
-  }
-
-  // add validity offset to the end of fixed_width offsets
-  auto validity_offset = size_per_row;
-  column_starts.push_back(validity_offset);
-
-  // validity is byte-aligned in the JCUDF format
-  size_per_row +=
-    util::div_rounding_up_safe(static_cast<size_type>(std::distance(begin, end)), CHAR_BIT);
-
-  return {size_per_row,
-          std::move(column_starts),
-          std::move(column_sizes),
-          std::move(variable_width_column_starts)};
-}
-
-/**
- * @brief Build `tile_info` for the validity data to break up the work.
- *
- * @param num_columns number of columns in the table
- * @param num_rows number of rows in the table
- * @param shmem_limit_per_tile size of shared memory available to a single gpu tile
- * @param row_batches batched row information for multiple output locations
- * @return vector of `tile_info` structs for validity data
- */
-std::vector<detail::tile_info> build_validity_tile_infos(size_type const& num_columns,
-                                                         size_type const& num_rows,
-                                                         size_type const& shmem_limit_per_tile,
-                                                         std::vector<row_batch> const& row_batches)
-{
-  auto const desired_rows_and_columns = static_cast<int>(sqrt(shmem_limit_per_tile));
-  auto const column_stride            = util::round_up_unsafe(
-    [&]() {
-      if (desired_rows_and_columns > num_columns) {
-        // not many columns, build a single tile for table width and ship it off
-        return num_columns;
-      } else {
-        return util::round_down_safe(desired_rows_and_columns, CHAR_BIT);
-      }
-    }(),
-    JCUDF_ROW_ALIGNMENT);
-
-  // we fit as much as we can given the column stride. Note that an element in the table takes just
-  // 1 bit, but a row with a single element still takes 8 bytes!
-  auto const bytes_per_row =
-    util::round_up_safe(util::div_rounding_up_unsafe(column_stride, CHAR_BIT), JCUDF_ROW_ALIGNMENT);
-  auto const row_stride =
-    std::min(num_rows, util::round_down_safe(shmem_limit_per_tile / bytes_per_row, 64));
-  std::vector<detail::tile_info> validity_tile_infos;
-  validity_tile_infos.reserve(num_columns / column_stride * num_rows / row_stride);
-  for (int col = 0; col < num_columns; col += column_stride) {
-    int current_tile_row_batch = 0;
-    int rows_left_in_batch     = row_batches[current_tile_row_batch].row_count;
-    int row                    = 0;
-    while (row < num_rows) {
-      if (rows_left_in_batch == 0) {
-        current_tile_row_batch++;
-        rows_left_in_batch = row_batches[current_tile_row_batch].row_count;
-      }
-      int const tile_height = std::min(row_stride, rows_left_in_batch);
-      validity_tile_infos.emplace_back(
-        detail::tile_info{col,
-                          row,
-                          std::min(col + column_stride - 1, num_columns - 1),
-                          row + tile_height - 1,
-                          current_tile_row_batch});
-      row += tile_height;
-      rows_left_in_batch -= tile_height;
-    }
-  }
-
-  return validity_tile_infos;
-}
-
-/**
- * @brief functor that returns the size of a row or 0 if the row is greater than the number of rows
- * in the table
- *
- * @tparam RowSize iterator that returns the size of a specific row
- */
-template <typename RowSize>
-struct row_size_functor {
-  row_size_functor(size_type row_end, RowSize row_sizes, size_type last_row_end)
-    : _row_end(row_end), _row_sizes(row_sizes), _last_row_end(last_row_end)
-  {
-  }
-
-  __device__ inline uint64_t operator()(int i) const
-  {
-    return i >= _row_end ?
0 : _row_sizes[i + _last_row_end]; - } - - size_type _row_end; - RowSize _row_sizes; - size_type _last_row_end; -}; - -/** - * @brief Builds batches of rows that will fit in the size limit of a column. - * - * @tparam RowSize iterator that gives the size of a specific row of the table. - * @param num_rows Total number of rows in the table - * @param row_sizes iterator that gives the size of a specific row of the table. - * @param all_fixed_width bool indicating all data in this table is fixed width - * @param stream stream to operate on for this work - * @param mr memory resource used to allocate any returned data - * @returns vector of size_type's that indicate row numbers for batch boundaries and a - * device_uvector of row offsets - */ -template -batch_data build_batches(size_type num_rows, - RowSize row_sizes, - bool all_fixed_width, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) -{ - auto const total_size = thrust::reduce(rmm::exec_policy(stream), row_sizes, row_sizes + num_rows); - auto const num_batches = static_cast( - util::div_rounding_up_safe(total_size, static_cast(MAX_BATCH_SIZE))); - auto const num_offsets = num_batches + 1; - std::vector row_batches; - std::vector batch_row_boundaries; - device_uvector batch_row_offsets(all_fixed_width ? 0 : num_rows, stream); - - // at most max gpu memory / 2GB iterations. - batch_row_boundaries.reserve(num_offsets); - batch_row_boundaries.push_back(0); - size_type last_row_end = 0; - device_uvector cumulative_row_sizes(num_rows, stream); - - // Evaluate the row size values before calling `inclusive_scan` to workaround - // memory issue in https://github.com/NVIDIA/spark-rapids-jni/issues/1567. - thrust::copy( - rmm::exec_policy(stream), row_sizes, row_sizes + num_rows, cumulative_row_sizes.begin()); - thrust::inclusive_scan(rmm::exec_policy(stream), - cumulative_row_sizes.begin(), - cumulative_row_sizes.end(), - cumulative_row_sizes.begin()); - - // This needs to be split this into 2 gig batches. Care must be taken to avoid a batch larger than - // 2 gigs. Imagine a table with 900 meg rows. The batches should occur every 2 rows, but if a - // lower bound is run at 2 gigs, 4 gigs, 6 gigs. the batches will be 2 rows, 2 rows, 3 rows, which - // will be invalid. The previous batch size must be taken into account when building a new batch. - // One way is to pull the batch size back to the host and add it to MAX_BATCH_SIZE for the lower - // bound search. The other method involves keeping everything on device, but subtracting the - // previous batch from cumulative_row_sizes based on index. This involves no synchronization - // between GPU and CPU, but involves more work on the GPU. These further need to be broken on a - // 32-row boundary to match the fixed_width optimized versions. - - while (last_row_end < num_rows) { - auto offset_row_sizes = thrust::make_transform_iterator( - cumulative_row_sizes.begin(), - [last_row_end, cumulative_row_sizes = cumulative_row_sizes.data()] __device__(auto i) { - return i - cumulative_row_sizes[last_row_end]; - }); - auto search_start = offset_row_sizes + last_row_end; - auto search_end = offset_row_sizes + num_rows; - - // find the next MAX_BATCH_SIZE boundary - auto const lb = - thrust::lower_bound(rmm::exec_policy(stream), search_start, search_end, MAX_BATCH_SIZE); - size_type const batch_size = lb - search_start; - - size_type const row_end = lb == search_end - ? 
batch_size + last_row_end - : last_row_end + util::round_down_safe(batch_size, 32); - - // build offset list for each row in this batch - auto const num_rows_in_batch = row_end - last_row_end; - - // build offset list for each row in this batch - auto const num_entries = row_end - last_row_end + 1; - device_uvector output_batch_row_offsets(num_entries, stream, mr); - - auto row_size_iter_bounded = cudf::detail::make_counting_transform_iterator( - 0, row_size_functor(row_end, row_sizes, last_row_end)); - - thrust::exclusive_scan(rmm::exec_policy(stream), - row_size_iter_bounded, - row_size_iter_bounded + num_entries, - output_batch_row_offsets.begin()); - - auto const batch_bytes = output_batch_row_offsets.element(num_rows_in_batch, stream); - - // The output_batch_row_offsets vector is used as the offset column of the returned data. This - // needs to be individually allocated, but the kernel needs a contiguous array of offsets or - // more global lookups are necessary. - if (!all_fixed_width) { - cudaMemcpy(batch_row_offsets.data() + last_row_end, - output_batch_row_offsets.data(), - num_rows_in_batch * sizeof(size_type), - cudaMemcpyDeviceToDevice); - } - - batch_row_boundaries.push_back(row_end); - row_batches.push_back({batch_bytes, num_rows_in_batch, std::move(output_batch_row_offsets)}); - - last_row_end = row_end; - } - - return { - std::move(batch_row_offsets), - make_device_uvector_async(batch_row_boundaries, stream, rmm::mr::get_current_device_resource()), - std::move(batch_row_boundaries), - std::move(row_batches)}; -} - -/** - * @brief Computes the number of tiles necessary given a tile height and batch offsets - * - * @param batch_row_boundaries row boundaries for each batch - * @param desired_tile_height height of each tile in the table - * @param stream stream to use - * @return number of tiles necessary - */ -int compute_tile_counts(device_span const& batch_row_boundaries, - int desired_tile_height, - rmm::cuda_stream_view stream) -{ - size_type const num_batches = batch_row_boundaries.size() - 1; - device_uvector num_tiles(num_batches, stream); - auto iter = thrust::make_counting_iterator(0); - thrust::transform( - rmm::exec_policy(stream), - iter, - iter + num_batches, - num_tiles.begin(), - [desired_tile_height, - batch_row_boundaries = batch_row_boundaries.data()] __device__(auto batch_index) -> size_type { - return util::div_rounding_up_unsafe( - batch_row_boundaries[batch_index + 1] - batch_row_boundaries[batch_index], - desired_tile_height); - }); - return thrust::reduce(rmm::exec_policy(stream), num_tiles.begin(), num_tiles.end()); -} - -/** - * @brief Builds the `tile_info` structs for a given table. 
- * - * @param tiles span of tiles to populate - * @param batch_row_boundaries boundary to row batches - * @param column_start starting column of the tile - * @param column_end ending column of the tile - * @param desired_tile_height height of the tile - * @param total_number_of_rows total number of rows in the table - * @param stream stream to use - * @return number of tiles created - */ -size_type build_tiles( - device_span tiles, - device_uvector const& batch_row_boundaries, // comes from build_batches - int column_start, - int column_end, - int desired_tile_height, - int total_number_of_rows, - rmm::cuda_stream_view stream) -{ - size_type const num_batches = batch_row_boundaries.size() - 1; - device_uvector num_tiles(num_batches, stream); - auto iter = thrust::make_counting_iterator(0); - thrust::transform( - rmm::exec_policy(stream), - iter, - iter + num_batches, - num_tiles.begin(), - [desired_tile_height, - batch_row_boundaries = batch_row_boundaries.data()] __device__(auto batch_index) -> size_type { - return util::div_rounding_up_unsafe( - batch_row_boundaries[batch_index + 1] - batch_row_boundaries[batch_index], - desired_tile_height); - }); - - size_type const total_tiles = - thrust::reduce(rmm::exec_policy(stream), num_tiles.begin(), num_tiles.end()); - - device_uvector tile_starts(num_batches + 1, stream); - auto tile_iter = cudf::detail::make_counting_transform_iterator( - 0, [num_tiles = num_tiles.data(), num_batches] __device__(auto i) { - return (i < num_batches) ? num_tiles[i] : 0; - }); - thrust::exclusive_scan(rmm::exec_policy(stream), - tile_iter, - tile_iter + num_batches + 1, - tile_starts.begin()); // in tiles - - thrust::transform( - rmm::exec_policy(stream), - iter, - iter + total_tiles, - tiles.begin(), - [ =, - tile_starts = tile_starts.data(), - batch_row_boundaries = batch_row_boundaries.data()] __device__(size_type tile_index) { - // what batch this tile falls in - auto const batch_index_iter = - thrust::upper_bound(thrust::seq, tile_starts, tile_starts + num_batches, tile_index); - auto const batch_index = std::distance(tile_starts, batch_index_iter) - 1; - // local index within the tile - int const local_tile_index = tile_index - tile_starts[batch_index]; - // the start row for this batch. - int const batch_row_start = batch_row_boundaries[batch_index]; - // the start row for this tile - int const tile_row_start = batch_row_start + (local_tile_index * desired_tile_height); - // the end row for this tile - int const max_row = std::min(total_number_of_rows - 1, - batch_index + 1 > num_batches - ? std::numeric_limits::max() - : static_cast(batch_row_boundaries[batch_index + 1]) - 1); - int const tile_row_end = - std::min(batch_row_start + ((local_tile_index + 1) * desired_tile_height) - 1, max_row); - - // stuff the tile - return tile_info{ - column_start, tile_row_start, column_end, tile_row_end, static_cast(batch_index)}; - }); - - return total_tiles; -} - -/** - * @brief Determines what data should be operated on by each tile for the incoming table. 
- * - * @tparam TileCallback Callback that receives the start and end columns of tiles - * @param column_sizes vector of the size of each column - * @param column_starts vector of the offset of each column - * @param first_row_batch_size size of the first row batch to limit max tile size since a tile - * is unable to span batches - * @param total_number_of_rows total number of rows in the table - * @param shmem_limit_per_tile shared memory allowed per tile - * @param f callback function called when building a tile - */ -template -void determine_tiles(std::vector const& column_sizes, - std::vector const& column_starts, - size_type const first_row_batch_size, - size_type const total_number_of_rows, - size_type const& shmem_limit_per_tile, - TileCallback f) -{ - // tile infos are organized with the tile going "down" the columns this provides the most - // coalescing of memory access - int current_tile_width = 0; - int current_tile_start_col = 0; - - // the ideal tile height has lots of 8-byte reads and 8-byte writes. The optimal read/write would - // be memory cache line sized access, but since other tiles will read/write the edges this may not - // turn out to be overly important. For now, we will attempt to build a square tile as far as byte - // sizes. x * y = shared_mem_size. Which translates to x^2 = shared_mem_size since we want them - // equal, so height and width are sqrt(shared_mem_size). The trick is that it's in bytes, not rows - // or columns. - auto const square_bias = 32; // bias towards columns for performance reasons - auto const optimal_square_len = static_cast(sqrt(shmem_limit_per_tile)); - auto const desired_tile_height = util::round_up_safe( - std::min(optimal_square_len / square_bias, total_number_of_rows), cudf::detail::warp_size); - auto const tile_height = std::clamp(desired_tile_height, 1, first_row_batch_size); - - int row_size = 0; - - // march each column and build the tiles of appropriate sizes - for (uint col = 0; col < column_sizes.size(); ++col) { - auto const col_size = column_sizes[col]; - - // align size for this type - auto const alignment_needed = col_size; // They are the same for fixed width types - auto const row_size_aligned = util::round_up_unsafe(row_size, alignment_needed); - auto const row_size_with_this_col = row_size_aligned + col_size; - auto const row_size_with_end_pad = - util::round_up_unsafe(row_size_with_this_col, JCUDF_ROW_ALIGNMENT); - - if (row_size_with_end_pad * tile_height > shmem_limit_per_tile) { - // too large, close this tile, generate vertical tiles and restart - f(current_tile_start_col, col == 0 ? 
col : col - 1, tile_height); - - row_size = - util::round_up_unsafe((column_starts[col] + column_sizes[col]) & 7, alignment_needed); - row_size += col_size; // alignment required for shared memory tile boundary to match - // alignment of output row - current_tile_start_col = col; - current_tile_width = 0; - } else { - row_size = row_size_with_this_col; - current_tile_width++; - } - } - - // build last set of tiles - if (current_tile_width > 0) { - f(current_tile_start_col, static_cast(column_sizes.size()) - 1, tile_height); - } -} - -/** - * @brief convert cudf table into JCUDF row format - * - * @tparam offsetFunctor functor type for offset functor - * @param tbl table to convert to JCUDF row format - * @param batch_info information about the batches of data - * @param offset_functor functor that returns the starting offset of each row - * @param column_info information about incoming columns - * @param variable_width_offsets optional vector of offsets for variable-with columns - * @param stream stream used - * @param mr selected memory resource for returned data - * @return vector of list columns containing byte columns of the JCUDF row data - */ -template -std::vector> convert_to_rows( - table_view const& tbl, - batch_data& batch_info, - offsetFunctor offset_functor, - column_info_s const& column_info, - std::optional> variable_width_offsets, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) -{ - int device_id; - CUDF_CUDA_TRY(cudaGetDevice(&device_id)); - int total_shmem_in_bytes; - CUDF_CUDA_TRY( - cudaDeviceGetAttribute(&total_shmem_in_bytes, cudaDevAttrMaxSharedMemoryPerBlock, device_id)); - -#ifndef __CUDA_ARCH__ // __host__ code. - // Need to reduce total shmem available by the size of barriers in the kernel's shared memory - total_shmem_in_bytes -= - util::round_up_unsafe(sizeof(cuda::barrier), 16ul); -#endif // __CUDA_ARCH__ - - auto const shmem_limit_per_tile = total_shmem_in_bytes; - - auto const num_rows = tbl.num_rows(); - auto const fixed_width_only = !variable_width_offsets.has_value(); - - auto select_columns = [](auto const& tbl, auto column_predicate) { - std::vector cols; - std::copy_if(tbl.begin(), tbl.end(), std::back_inserter(cols), [&](auto c) { - return column_predicate(c); - }); - return table_view(cols); - }; - - auto dev_col_sizes = make_device_uvector_async( - column_info.column_sizes, stream, rmm::mr::get_current_device_resource()); - auto dev_col_starts = make_device_uvector_async( - column_info.column_starts, stream, rmm::mr::get_current_device_resource()); - - // Get the pointers to the input columnar data ready - auto const data_begin = thrust::make_transform_iterator(tbl.begin(), [](auto const& c) { - return is_compound(c.type()) ? 
nullptr : c.template data(); - }); - std::vector input_data(data_begin, data_begin + tbl.num_columns()); - - // validity code handles variable and fixed-width data, so give it everything - auto const nm_begin = - thrust::make_transform_iterator(tbl.begin(), [](auto const& c) { return c.null_mask(); }); - std::vector input_nm(nm_begin, nm_begin + tbl.num_columns()); - - auto dev_input_data = - make_device_uvector_async(input_data, stream, rmm::mr::get_current_device_resource()); - auto dev_input_nm = - make_device_uvector_async(input_nm, stream, rmm::mr::get_current_device_resource()); - - // the first batch always exists unless we were sent an empty table - auto const first_batch_size = batch_info.row_batches[0].row_count; - - std::vector output_buffers; - std::vector output_data; - output_data.reserve(batch_info.row_batches.size()); - output_buffers.reserve(batch_info.row_batches.size()); - std::transform( - batch_info.row_batches.begin(), - batch_info.row_batches.end(), - std::back_inserter(output_buffers), - [&](auto const& batch) { return rmm::device_buffer(batch.num_bytes, stream, mr); }); - std::transform( - output_buffers.begin(), output_buffers.end(), std::back_inserter(output_data), [](auto& buf) { - return static_cast(buf.data()); - }); - - auto dev_output_data = make_device_uvector_async(output_data, stream, mr); - - int info_count = 0; - detail::determine_tiles( - column_info.column_sizes, - column_info.column_starts, - first_batch_size, - num_rows, - shmem_limit_per_tile, - [&gpu_batch_row_boundaries = batch_info.d_batch_row_boundaries, &info_count, &stream]( - int const start_col, int const end_col, int const tile_height) { - int i = detail::compute_tile_counts(gpu_batch_row_boundaries, tile_height, stream); - info_count += i; - }); - - // allocate space for tiles - device_uvector gpu_tile_infos(info_count, stream); - int tile_offset = 0; - - detail::determine_tiles( - column_info.column_sizes, - column_info.column_starts, - first_batch_size, - num_rows, - shmem_limit_per_tile, - [&gpu_batch_row_boundaries = batch_info.d_batch_row_boundaries, - &gpu_tile_infos, - num_rows, - &tile_offset, - stream](int const start_col, int const end_col, int const tile_height) { - tile_offset += detail::build_tiles( - {gpu_tile_infos.data() + tile_offset, gpu_tile_infos.size() - tile_offset}, - gpu_batch_row_boundaries, - start_col, - end_col, - tile_height, - num_rows, - stream); - }); - - // build validity tiles for ALL columns, variable and fixed width. 
- auto validity_tile_infos = detail::build_validity_tile_infos( - tbl.num_columns(), num_rows, shmem_limit_per_tile, batch_info.row_batches); - - auto dev_validity_tile_infos = - make_device_uvector_async(validity_tile_infos, stream, rmm::mr::get_current_device_resource()); - - auto const validity_offset = column_info.column_starts.back(); - - // blast through the entire table and convert it - detail::copy_to_rows<<>>(num_rows, - tbl.num_columns(), - shmem_limit_per_tile, - gpu_tile_infos, - dev_input_data.data(), - dev_col_sizes.data(), - dev_col_starts.data(), - offset_functor, - batch_info.d_batch_row_boundaries.data(), - reinterpret_cast(dev_output_data.data())); - - // note that validity gets the entire table and not the fixed-width portion - detail::copy_validity_to_rows<<>>(num_rows, - tbl.num_columns(), - shmem_limit_per_tile, - offset_functor, - batch_info.d_batch_row_boundaries.data(), - dev_output_data.data(), - validity_offset, - dev_validity_tile_infos, - dev_input_nm.data()); - - if (!fixed_width_only) { - // build table view for variable-width data only - auto const variable_width_table = - select_columns(tbl, [](auto col) { return is_compound(col.type()); }); - - CUDF_EXPECTS(!variable_width_table.is_empty(), "No variable-width columns when expected!"); - CUDF_EXPECTS(variable_width_offsets.has_value(), "No variable width offset data!"); - - auto const variable_data_begin = - thrust::make_transform_iterator(variable_width_table.begin(), [](auto const& c) { - strings_column_view const scv{c}; - return is_compound(c.type()) ? scv.chars().template data() : nullptr; - }); - std::vector variable_width_input_data( - variable_data_begin, variable_data_begin + variable_width_table.num_columns()); - - auto dev_variable_input_data = make_device_uvector_async( - variable_width_input_data, stream, rmm::mr::get_current_device_resource()); - auto dev_variable_col_output_offsets = make_device_uvector_async( - column_info.variable_width_column_starts, stream, rmm::mr::get_current_device_resource()); - - for (uint i = 0; i < batch_info.row_batches.size(); i++) { - auto const batch_row_offset = batch_info.batch_row_boundaries[i]; - auto const batch_num_rows = batch_info.row_batches[i].row_count; - - dim3 const string_blocks( - std::min(MAX_STRING_BLOCKS, - util::div_rounding_up_unsafe(batch_num_rows, NUM_STRING_ROWS_PER_BLOCK_TO_ROWS))); - - detail::copy_strings_to_rows<<>>(batch_num_rows, - variable_width_table.num_columns(), - dev_variable_input_data.data(), - dev_variable_col_output_offsets.data(), - variable_width_offsets->data(), - column_info.size_per_row, - offset_functor, - batch_row_offset, - reinterpret_cast(output_data[i])); - } - } - - // split up the output buffer into multiple buffers based on row batch sizes and create list of - // byte columns - std::vector> ret; - ret.reserve(batch_info.row_batches.size()); - auto counting_iter = thrust::make_counting_iterator(0); - std::transform(counting_iter, - counting_iter + batch_info.row_batches.size(), - std::back_inserter(ret), - [&](auto batch) { - auto const offset_count = batch_info.row_batches[batch].row_offsets.size(); - auto offsets = - std::make_unique(data_type{type_id::INT32}, - (size_type)offset_count, - batch_info.row_batches[batch].row_offsets.release(), - rmm::device_buffer{}, - 0); - auto data = std::make_unique(data_type{type_id::INT8}, - batch_info.row_batches[batch].num_bytes, - std::move(output_buffers[batch]), - rmm::device_buffer{}, - 0); - - return make_lists_column(batch_info.row_batches[batch].row_count, - 
std::move(offsets),
-                                         std::move(data),
-                                         0,
-                                         rmm::device_buffer{0, cudf::get_default_stream(), mr},
-                                         stream,
-                                         mr);
-                 });
-
-  return ret;
-}
-
-}  // namespace detail
-
-/**
- * @brief convert a cudf table to JCUDF row format
- *
- * @param tbl incoming table to convert
- * @param stream stream to use for operations
- * @param mr memory resource used for returned data
- * @return vector of list columns containing byte columns of the JCUDF row data
- */
-std::vector<std::unique_ptr<column>> convert_to_rows(table_view const& tbl,
-                                                     rmm::cuda_stream_view stream,
-                                                     rmm::mr::device_memory_resource* mr)
-{
-  auto const num_columns = tbl.num_columns();
-  auto const num_rows    = tbl.num_rows();
-
-  auto const fixed_width_only = std::all_of(
-    tbl.begin(), tbl.end(), [](column_view const& c) { return is_fixed_width(c.type()); });
-
-  // Break up the work into tiles, which are a starting and ending row/col #. This tile size is
-  // calculated based on the shared memory size available; we want a single tile to fill up the
-  // entire shared memory space available for the transpose-like conversion.
-
-  // There are two different processes going on here. The GPU conversion of the data and the writing
-  // of the data into the list of byte columns that are a maximum of 2 gigs each due to offset
-  // maximum size. The GPU conversion portion has to understand this limitation because the column
-  // must own the data inside and as a result it must be a distinct allocation for that column.
-  // Copying the data into these final buffers would be prohibitively expensive, so care is taken to
-  // ensure the GPU writes to the proper buffer. The tiles are broken at the boundaries of specific
-  // rows based on the row sizes up to that point. These are row batches and they are decided first
-  // before building the tiles so the tiles can be properly cut around them.
-
-  auto schema_column_iter =
-    thrust::make_transform_iterator(tbl.begin(), [](auto const& i) { return i.type(); });
-
-  auto column_info =
-    detail::compute_column_information(schema_column_iter, schema_column_iter + num_columns);
-  auto const size_per_row = column_info.size_per_row;
-  if (fixed_width_only) {
-    // total encoded row size. This includes fixed-width data and validity only. It does not include
-    // variable-width data since it isn't copied with the fixed-width and validity kernel.
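-
-    // [Editorial note] Worked example, not original code: for one INT32 and one INT64 column,
-    // compute_column_information places the INT32 at offset 0 and aligns the INT64 to offset 8,
-    // giving a 16-byte fixed section plus ceil(2 / 8) = 1 validity byte = 17 bytes, which the
-    // round-up below pads to a multiple of JCUDF_ROW_ALIGNMENT (8), i.e. 24 bytes per row.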
- auto row_size_iter = thrust::make_constant_iterator( - util::round_up_unsafe(size_per_row, JCUDF_ROW_ALIGNMENT)); - - auto batch_info = detail::build_batches(num_rows, row_size_iter, fixed_width_only, stream, mr); - - detail::fixed_width_row_offset_functor offset_functor( - util::round_up_unsafe(size_per_row, JCUDF_ROW_ALIGNMENT)); - - return detail::convert_to_rows( - tbl, batch_info, offset_functor, std::move(column_info), std::nullopt, stream, mr); - } else { - auto offset_data = detail::build_string_row_offsets(tbl, size_per_row, stream); - auto& row_sizes = std::get<0>(offset_data); - - auto row_size_iter = cudf::detail::make_counting_transform_iterator( - 0, detail::row_size_functor(num_rows, row_sizes.data(), 0)); - - auto batch_info = detail::build_batches(num_rows, row_size_iter, fixed_width_only, stream, mr); - - detail::string_row_offset_functor offset_functor(batch_info.batch_row_offsets); - - return detail::convert_to_rows(tbl, - batch_info, - offset_functor, - std::move(column_info), - std::make_optional(std::move(std::get<1>(offset_data))), - stream, - mr); - } -} - -std::vector> convert_to_rows_fixed_width_optimized( - table_view const& tbl, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) -{ - auto const num_columns = tbl.num_columns(); - - std::vector schema; - schema.resize(num_columns); - std::transform( - tbl.begin(), tbl.end(), schema.begin(), [](auto i) -> data_type { return i.type(); }); - - if (detail::are_all_fixed_width(schema)) { - std::vector column_start; - std::vector column_size; - - int32_t const size_per_row = - detail::compute_fixed_width_layout(schema, column_start, column_size); - auto dev_column_start = make_device_uvector_async(column_start, stream, mr); - auto dev_column_size = make_device_uvector_async(column_size, stream, mr); - - // Make the number of rows per batch a multiple of 32 so we don't have to worry about splitting - // validity at a specific row offset. This might change in the future. - auto const max_rows_per_batch = - util::round_down_safe(std::numeric_limits::max() / size_per_row, 32); - - auto const num_rows = tbl.num_rows(); - - // Get the pointers to the input columnar data ready - std::vector input_data; - std::vector input_nm; - for (size_type column_number = 0; column_number < num_columns; column_number++) { - column_view cv = tbl.column(column_number); - input_data.emplace_back(cv.data()); - input_nm.emplace_back(cv.null_mask()); - } - auto dev_input_data = make_device_uvector_async(input_data, stream, mr); - auto dev_input_nm = make_device_uvector_async(input_nm, stream, mr); - - using ScalarType = scalar_type_t; - auto zero = make_numeric_scalar(data_type(type_id::INT32), stream.value()); - zero->set_valid_async(true, stream); - static_cast(zero.get())->set_value(0, stream); - - auto step = make_numeric_scalar(data_type(type_id::INT32), stream.value()); - step->set_valid_async(true, stream); - static_cast(step.get())->set_value(static_cast(size_per_row), stream); - - std::vector> ret; - for (size_type row_start = 0; row_start < num_rows; row_start += max_rows_per_batch) { - size_type row_count = num_rows - row_start; - row_count = row_count > max_rows_per_batch ? 
max_rows_per_batch : row_count; - ret.emplace_back(detail::fixed_width_convert_to_rows(row_start, - row_count, - num_columns, - size_per_row, - dev_column_start, - dev_column_size, - dev_input_data, - dev_input_nm, - *zero, - *step, - stream, - mr)); - } - - return ret; - } else { - CUDF_FAIL("Only fixed width types are currently supported"); - } -} - -namespace { - -/// @brief Calculates and sets null counts for specified columns -void fixup_null_counts(std::vector>& output_columns, - rmm::cuda_stream_view stream) -{ - for (auto& col : output_columns) { - col->set_null_count(cudf::detail::null_count(col->view().null_mask(), 0, col->size(), stream)); - } -} - -} // namespace - -/** - * @brief convert from JCUDF row format to cudf columns - * - * @param input vector of list columns containing byte columns of the JCUDF row data - * @param schema incoming schema of the data - * @param stream stream to use for compute - * @param mr memory resource for returned data - * @return cudf table of the data - */ -std::unique_ptr convert_from_rows(lists_column_view const& input, - std::vector const& schema, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) -{ - // verify that the types are what we expect - column_view child = input.child(); - auto const list_type = child.type().id(); - CUDF_EXPECTS(list_type == type_id::INT8 || list_type == type_id::UINT8, - "Only a list of bytes is supported as input"); - - // convert any strings in the schema to two int32 columns - // This allows us to leverage the fixed-width copy code to fill in our offset and string length - // data. - std::vector string_schema; - string_schema.reserve(schema.size()); - for (auto i : schema) { - if (i.id() == type_id::STRING) { - string_schema.push_back(data_type(type_id::INT32)); - string_schema.push_back(data_type(type_id::INT32)); - } else { - string_schema.push_back(i); - } - } - - auto const num_columns = string_schema.size(); - auto const num_rows = input.parent().size(); - - int device_id; - CUDF_CUDA_TRY(cudaGetDevice(&device_id)); - int total_shmem_in_bytes; - CUDF_CUDA_TRY( - cudaDeviceGetAttribute(&total_shmem_in_bytes, cudaDevAttrMaxSharedMemoryPerBlock, device_id)); - -#ifndef __CUDA_ARCH__ // __host__ code. - // Need to reduce total shmem available by the size of barriers in the kernel's shared memory - total_shmem_in_bytes -= - util::round_up_unsafe(sizeof(cuda::barrier), 16ul); -#endif // __CUDA_ARCH__ - - auto const shmem_limit_per_tile = total_shmem_in_bytes; - - auto column_info = detail::compute_column_information(string_schema.begin(), string_schema.end()); - auto const size_per_row = util::round_up_unsafe(column_info.size_per_row, JCUDF_ROW_ALIGNMENT); - - // Ideally we would check that the offsets are all the same, etc. 
but for now this is probably - // fine - CUDF_EXPECTS(size_per_row * num_rows <= child.size(), "The layout of the data appears to be off"); - auto dev_col_starts = make_device_uvector_async( - column_info.column_starts, stream, rmm::mr::get_current_device_resource()); - auto dev_col_sizes = make_device_uvector_async( - column_info.column_sizes, stream, rmm::mr::get_current_device_resource()); - - // Allocate the columns we are going to write into - std::vector> output_columns; - std::vector> string_row_offset_columns; - std::vector> string_length_columns; - std::vector output_data; - std::vector output_nm; - std::vector string_row_offsets; - std::vector string_lengths; - for (auto i : schema) { - auto make_col = [&output_data, &output_nm](data_type type, - size_type num_rows, - bool include_nm, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) { - auto column = - make_fixed_width_column(type, - num_rows, - include_nm ? mask_state::UNINITIALIZED : mask_state::UNALLOCATED, - stream, - mr); - auto mut = column->mutable_view(); - output_data.emplace_back(mut.data()); - if (include_nm) { output_nm.emplace_back(mut.null_mask()); } - return column; - }; - if (i.id() == type_id::STRING) { - auto const int32type = data_type(type_id::INT32); - auto offset_col = - make_col(int32type, num_rows, true, stream, rmm::mr::get_current_device_resource()); - string_row_offsets.push_back(offset_col->mutable_view().data()); - string_row_offset_columns.emplace_back(std::move(offset_col)); - auto length_col = - make_col(int32type, num_rows, false, stream, rmm::mr::get_current_device_resource()); - string_lengths.push_back(length_col->mutable_view().data()); - string_length_columns.emplace_back(std::move(length_col)); - // placeholder - output_columns.emplace_back(make_empty_column(type_id::STRING)); - } else { - output_columns.emplace_back(make_col(i, num_rows, true, stream, mr)); - } - } - - auto dev_string_row_offsets = - make_device_uvector_async(string_row_offsets, stream, rmm::mr::get_current_device_resource()); - auto dev_string_lengths = - make_device_uvector_async(string_lengths, stream, rmm::mr::get_current_device_resource()); - - // build the row_batches from the passed in list column - std::vector row_batches; - row_batches.push_back( - {detail::row_batch{child.size(), num_rows, device_uvector(0, stream)}}); - - auto dev_output_data = - make_device_uvector_async(output_data, stream, rmm::mr::get_current_device_resource()); - auto dev_output_nm = - make_device_uvector_async(output_nm, stream, rmm::mr::get_current_device_resource()); - - // only ever get a single batch when going from rows, so boundaries are 0, num_rows - constexpr auto num_batches = 2; - device_uvector gpu_batch_row_boundaries(num_batches, stream); - - thrust::transform(rmm::exec_policy(stream), - thrust::make_counting_iterator(0), - thrust::make_counting_iterator(num_batches), - gpu_batch_row_boundaries.begin(), - [num_rows] __device__(auto i) { return i == 0 ? 
0 : num_rows; }); - - int info_count = 0; - detail::determine_tiles(column_info.column_sizes, - column_info.column_starts, - num_rows, - num_rows, - shmem_limit_per_tile, - [&gpu_batch_row_boundaries, &info_count, &stream]( - int const start_col, int const end_col, int const tile_height) { - info_count += detail::compute_tile_counts( - gpu_batch_row_boundaries, tile_height, stream); - }); - - // allocate space for tiles - device_uvector gpu_tile_infos(info_count, stream); - - int tile_offset = 0; - detail::determine_tiles( - column_info.column_sizes, - column_info.column_starts, - num_rows, - num_rows, - shmem_limit_per_tile, - [&gpu_batch_row_boundaries, &gpu_tile_infos, num_rows, &tile_offset, stream]( - int const start_col, int const end_col, int const tile_height) { - tile_offset += detail::build_tiles( - {gpu_tile_infos.data() + tile_offset, gpu_tile_infos.size() - tile_offset}, - gpu_batch_row_boundaries, - start_col, - end_col, - tile_height, - num_rows, - stream); - }); - - dim3 const blocks(gpu_tile_infos.size()); - - // validity needs to be calculated based on the actual number of final table columns - auto validity_tile_infos = - detail::build_validity_tile_infos(schema.size(), num_rows, shmem_limit_per_tile, row_batches); - - auto dev_validity_tile_infos = - make_device_uvector_async(validity_tile_infos, stream, rmm::mr::get_current_device_resource()); - - dim3 const validity_blocks(validity_tile_infos.size()); - - if (dev_string_row_offsets.size() == 0) { - detail::fixed_width_row_offset_functor offset_functor(size_per_row); - - detail::copy_from_rows<<>>(num_rows, - num_columns, - shmem_limit_per_tile, - offset_functor, - gpu_batch_row_boundaries.data(), - dev_output_data.data(), - dev_col_sizes.data(), - dev_col_starts.data(), - gpu_tile_infos, - child.data()); - - detail::copy_validity_from_rows<<>>(num_rows, - num_columns, - shmem_limit_per_tile, - offset_functor, - gpu_batch_row_boundaries.data(), - dev_output_nm.data(), - column_info.column_starts.back(), - dev_validity_tile_infos, - child.data()); - - } else { - detail::string_row_offset_functor offset_functor(device_span{input.offsets()}); - detail::copy_from_rows<<>>(num_rows, - num_columns, - shmem_limit_per_tile, - offset_functor, - gpu_batch_row_boundaries.data(), - dev_output_data.data(), - dev_col_sizes.data(), - dev_col_starts.data(), - gpu_tile_infos, - child.data()); - - detail::copy_validity_from_rows<<>>(num_rows, - num_columns, - shmem_limit_per_tile, - offset_functor, - gpu_batch_row_boundaries.data(), - dev_output_nm.data(), - column_info.column_starts.back(), - dev_validity_tile_infos, - child.data()); - - std::vector> string_col_offsets; - std::vector> string_data_cols; - std::vector string_col_offset_ptrs; - std::vector string_data_col_ptrs; - for (auto& col_string_lengths : string_lengths) { - device_uvector output_string_offsets(num_rows + 1, stream, mr); - auto tmp = [num_rows, col_string_lengths] __device__(auto const& i) { - return i < num_rows ? 
col_string_lengths[i] : 0; - }; - auto bounded_iter = cudf::detail::make_counting_transform_iterator(0, tmp); - thrust::exclusive_scan(rmm::exec_policy(stream), - bounded_iter, - bounded_iter + num_rows + 1, - output_string_offsets.begin()); - - // allocate destination string column - rmm::device_uvector string_data( - output_string_offsets.element(num_rows, stream), stream, mr); - - string_col_offset_ptrs.push_back(output_string_offsets.data()); - string_data_col_ptrs.push_back(string_data.data()); - string_col_offsets.push_back(std::move(output_string_offsets)); - string_data_cols.push_back(std::move(string_data)); - } - auto dev_string_col_offsets = make_device_uvector_async( - string_col_offset_ptrs, stream, rmm::mr::get_current_device_resource()); - auto dev_string_data_cols = make_device_uvector_async( - string_data_col_ptrs, stream, rmm::mr::get_current_device_resource()); - - dim3 const string_blocks( - std::min(std::max(MIN_STRING_BLOCKS, num_rows / NUM_STRING_ROWS_PER_BLOCK_FROM_ROWS), - MAX_STRING_BLOCKS)); - - detail::copy_strings_from_rows<<>>( - offset_functor, - dev_string_row_offsets.data(), - dev_string_lengths.data(), - dev_string_col_offsets.data(), - dev_string_data_cols.data(), - child.data(), - num_rows, - static_cast(string_col_offsets.size())); - - // merge strings back into output_columns - int string_idx = 0; - for (int i = 0; i < static_cast(schema.size()); ++i) { - if (schema[i].id() == type_id::STRING) { - // stuff real string column - auto string_data = string_row_offset_columns[string_idx].release()->release(); - output_columns[i] = make_strings_column(num_rows, - std::move(string_col_offsets[string_idx]), - std::move(string_data_cols[string_idx]), - std::move(*string_data.null_mask.release()), - 0); - // Null count set to 0, temporarily. Will be fixed up before return. - string_idx++; - } - } - } - - // Set null counts, because output_columns are modified via mutable-view, - // in the kernel above. - // TODO(future): Consider setting null count in the kernel itself. - fixup_null_counts(output_columns, stream); - - return std::make_unique
<table>(std::move(output_columns)); -} - -std::unique_ptr<table>
convert_from_rows_fixed_width_optimized(lists_column_view const& input, - std::vector const& schema, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) -{ - // verify that the types are what we expect - column_view child = input.child(); - auto const list_type = child.type().id(); - CUDF_EXPECTS(list_type == type_id::INT8 || list_type == type_id::UINT8, - "Only a list of bytes is supported as input"); - - auto const num_columns = schema.size(); - - if (detail::are_all_fixed_width(schema)) { - std::vector column_start; - std::vector column_size; - - auto const num_rows = input.parent().size(); - auto const size_per_row = detail::compute_fixed_width_layout(schema, column_start, column_size); - - // Ideally we would check that the offsets are all the same, etc. but for now this is probably - // fine - CUDF_EXPECTS(size_per_row * num_rows == child.size(), - "The layout of the data appears to be off"); - auto dev_column_start = - make_device_uvector_async(column_start, stream, rmm::mr::get_current_device_resource()); - auto dev_column_size = - make_device_uvector_async(column_size, stream, rmm::mr::get_current_device_resource()); - - // Allocate the columns we are going to write into - std::vector> output_columns; - std::vector output_data; - std::vector output_nm; - for (int i = 0; i < static_cast(num_columns); i++) { - auto column = - make_fixed_width_column(schema[i], num_rows, mask_state::UNINITIALIZED, stream, mr); - auto mut = column->mutable_view(); - output_data.emplace_back(mut.data()); - output_nm.emplace_back(mut.null_mask()); - output_columns.emplace_back(std::move(column)); - } - - auto dev_output_data = make_device_uvector_async(output_data, stream, mr); - auto dev_output_nm = make_device_uvector_async(output_nm, stream, mr); - - dim3 blocks; - dim3 threads; - int shared_size = - detail::calc_fixed_width_kernel_dims(num_columns, num_rows, size_per_row, blocks, threads); - - detail::copy_from_rows_fixed_width_optimized<<>>( - num_rows, - num_columns, - size_per_row, - dev_column_start.data(), - dev_column_size.data(), - dev_output_data.data(), - dev_output_nm.data(), - child.data()); - - // Set null counts, because output_columns are modified via mutable-view, - // in the kernel above. - // TODO(future): Consider setting null count in the kernel itself. - fixup_null_counts(output_columns, stream); - - return std::make_unique
(std::move(output_columns)); - } else { - CUDF_FAIL("Only fixed width types are currently supported"); - } -} - -} // namespace spark_rapids_jni diff --git a/src/main/cpp/src/row_conversion.hpp b/src/main/cpp/src/row_conversion.hpp deleted file mode 100644 index 6e9835e3d2..0000000000 --- a/src/main/cpp/src/row_conversion.hpp +++ /dev/null @@ -1,53 +0,0 @@ -/* - * Copyright (c) 2022-2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include -#include -#include - -#include - -#include - -namespace spark_rapids_jni { - -std::vector> convert_to_rows_fixed_width_optimized( - cudf::table_view const& tbl, - // TODO need something for validity - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); - -std::vector> convert_to_rows( - cudf::table_view const& tbl, - // TODO need something for validity - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); - -std::unique_ptr convert_from_rows_fixed_width_optimized( - cudf::lists_column_view const& input, - std::vector const& schema, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); - -std::unique_ptr convert_from_rows( - cudf::lists_column_view const& input, - std::vector const& schema, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); - -} // namespace spark_rapids_jni diff --git a/src/main/cpp/src/utilities.cu b/src/main/cpp/src/utilities.cu index c66ee5cbcb..7c202a1bec 100644 --- a/src/main/cpp/src/utilities.cu +++ b/src/main/cpp/src/utilities.cu @@ -25,6 +25,8 @@ #include #include +#include + namespace spark_rapids_jni { std::unique_ptr bitmask_bitwise_or( @@ -51,18 +53,19 @@ std::unique_ptr bitmask_bitwise_or( std::unique_ptr out = std::make_unique(mask_size * sizeof(cudf::bitmask_type), stream, mr); - thrust::transform( - rmm::exec_policy(stream), - thrust::make_counting_iterator(0), - thrust::make_counting_iterator(0) + mask_size, - static_cast(out->data()), - [buffers = d_input.data(), num_buffers = input.size()] __device__(cudf::size_type word_index) { - cudf::bitmask_type out = buffers[0][word_index]; - for (auto idx = 1; idx < num_buffers; idx++) { - out |= buffers[idx][word_index]; - } - return out; - }); + thrust::transform(rmm::exec_policy(stream), + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(0) + mask_size, + static_cast(out->data()), + cuda::proclaim_return_type( + [buffers = d_input.data(), + num_buffers = input.size()] __device__(cudf::size_type word_index) { + cudf::bitmask_type out = buffers[0][word_index]; + for (auto idx = 1; idx < num_buffers; idx++) { + out |= buffers[idx][word_index]; + } + return out; + })); return out; } diff --git a/src/main/cpp/src/xxhash64.cu b/src/main/cpp/src/xxhash64.cu 
index 561aa49862..8c0b1b8766 100644 --- a/src/main/cpp/src/xxhash64.cu +++ b/src/main/cpp/src/xxhash64.cu @@ -25,6 +25,8 @@ #include +#include + namespace spark_rapids_jni { namespace { @@ -286,10 +288,11 @@ class device_row_hasher { _table.begin(), _table.end(), _seed, - [row_index, nulls = _check_nulls] __device__(auto hash, auto column) { - return cudf::type_dispatcher( - column.type(), element_hasher_adapter{}, column, row_index, nulls, hash); - }); + cuda::proclaim_return_type( + [row_index, nulls = _check_nulls] __device__(auto hash, auto column) { + return cudf::type_dispatcher( + column.type(), element_hasher_adapter{}, column, row_index, nulls, hash); + })); } /** diff --git a/src/main/cpp/src/zorder.cu b/src/main/cpp/src/zorder.cu index c0f21b9b3a..f9c2d4da07 100644 --- a/src/main/cpp/src/zorder.cu +++ b/src/main/cpp/src/zorder.cu @@ -28,6 +28,8 @@ #include #include +#include + namespace { // pretends to be an array of uint32_t, but really only stores @@ -253,18 +255,20 @@ std::unique_ptr hilbert_index(int32_t const num_bits_per_entry, thrust::make_counting_iterator(0), thrust::make_counting_iterator(0) + num_rows, output_dv_ptr->begin(), - [num_bits_per_entry, num_columns, input = *input_dv] __device__(cudf::size_type row_index) { - uint_backed_array row(num_bits_per_entry); - for (cudf::size_type column_index = 0; column_index < num_columns; column_index++) { - auto const column = input.column(column_index); - uint32_t const data = column.is_valid(row_index) ? column.data()[row_index] : 0; - row.set(column_index, data); - } - - auto const transposed_index = hilbert_transposed_index(row, num_bits_per_entry, num_columns); - return static_cast( - to_hilbert_index(transposed_index, num_bits_per_entry, num_columns)); - }); + cuda::proclaim_return_type( + [num_bits_per_entry, num_columns, input = *input_dv] __device__(cudf::size_type row_index) { + uint_backed_array row(num_bits_per_entry); + for (cudf::size_type column_index = 0; column_index < num_columns; column_index++) { + auto const column = input.column(column_index); + uint32_t const data = column.is_valid(row_index) ? column.data()[row_index] : 0; + row.set(column_index, data); + } + + auto const transposed_index = + hilbert_transposed_index(row, num_bits_per_entry, num_columns); + return static_cast( + to_hilbert_index(transposed_index, num_bits_per_entry, num_columns)); + })); return output_data_col; } diff --git a/src/main/cpp/tests/CMakeLists.txt b/src/main/cpp/tests/CMakeLists.txt index b34b1b8b01..617df6dfde 100644 --- a/src/main/cpp/tests/CMakeLists.txt +++ b/src/main/cpp/tests/CMakeLists.txt @@ -60,9 +60,6 @@ ConfigureTest(CAST_FLOAT_TO_STRING ConfigureTest(DATETIME_REBASE datetime_rebase.cpp) -ConfigureTest(ROW_CONVERSION - row_conversion.cpp) - ConfigureTest(HASH hash.cpp) diff --git a/src/main/cpp/tests/row_conversion.cpp b/src/main/cpp/tests/row_conversion.cpp deleted file mode 100644 index 7e104c3871..0000000000 --- a/src/main/cpp/tests/row_conversion.cpp +++ /dev/null @@ -1,1043 +0,0 @@ -/* - * Copyright (c) 2022, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include - -#include -#include -#include - -#include - -struct ColumnToRowTests : public cudf::test::BaseFixture {}; -struct RowToColumnTests : public cudf::test::BaseFixture {}; - -TEST_F(ColumnToRowTests, Single) -{ - cudf::test::fixed_width_column_wrapper a({-1}); - cudf::table_view in(std::vector{a}); - std::vector schema = {cudf::data_type{cudf::type_id::INT32}}; - - auto old_rows = spark_rapids_jni::convert_to_rows_fixed_width_optimized(in); - auto new_rows = spark_rapids_jni::convert_to_rows(in); - - EXPECT_EQ(old_rows.size(), new_rows.size()); - for (uint i = 0; i < old_rows.size(); ++i) { - auto new_tbl = - spark_rapids_jni::convert_from_rows(cudf::lists_column_view(*new_rows[i]), schema); - auto old_tbl = spark_rapids_jni::convert_from_rows_fixed_width_optimized( - cudf::lists_column_view(*old_rows[i]), schema); - - CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*old_tbl, *new_tbl); - } -} - -TEST_F(ColumnToRowTests, SimpleString) -{ - cudf::test::fixed_width_column_wrapper a({-1, 0, 1, 0, -1}); - cudf::test::strings_column_wrapper b( - {"hello", "world", "this is a really long string to generate a longer row", "dlrow", "olleh"}); - cudf::table_view in(std::vector{a, b}); - std::vector schema = {cudf::data_type{cudf::type_id::INT32}}; - - auto new_rows = spark_rapids_jni::convert_to_rows(in); - - EXPECT_EQ(new_rows[0]->size(), 5); -} - -TEST_F(ColumnToRowTests, DoubleString) -{ - cudf::test::strings_column_wrapper a( - {"hello", "world", "this is a really long string to generate a longer row", "dlrow", "olleh"}); - cudf::test::fixed_width_column_wrapper b({0, 1, 2, 3, 4}); - cudf::test::strings_column_wrapper c({"world", - "hello", - "this string isn't as long", - "this one isn't so short though when you think about it", - "dlrow"}); - cudf::table_view in(std::vector{a, b, c}); - - auto new_rows = spark_rapids_jni::convert_to_rows(in); - - EXPECT_EQ(new_rows[0]->size(), 5); -} - -TEST_F(ColumnToRowTests, BigStrings) -{ - char const* TEST_STRINGS[] = { - "These", - "are", - "the", - "test", - "strings", - "that", - "we", - "have", - "some are really long", - "and some are kinda short", - "They are all over on purpose with different sizes for the strings in order to test the code " - "on all different lengths of strings", - "a", - "good test", - "is required to produce reasonable confidence that this is working"}; - auto num_generator = - cudf::detail::make_counting_transform_iterator(0, [](auto i) -> int32_t { return rand(); }); - auto string_generator = - cudf::detail::make_counting_transform_iterator(0, [&](auto i) -> char const* { - return TEST_STRINGS[rand() % (sizeof(TEST_STRINGS) / sizeof(TEST_STRINGS[0]))]; - }); - - auto const num_rows = 50; - auto const num_cols = 50; - std::vector schema; - - std::vector cols; - std::vector views; - - for (auto col = 0; col < num_cols; ++col) { - if (rand() % 2) { - cols.emplace_back( - cudf::test::fixed_width_column_wrapper(num_generator, num_generator + num_rows)); - views.push_back(cols.back()); - schema.emplace_back(cudf::data_type{cudf::type_id::INT32}); - } else { - cols.emplace_back( - cudf::test::strings_column_wrapper(string_generator, string_generator + num_rows)); - views.push_back(cols.back()); - schema.emplace_back(cudf::type_id::STRING); - } - } - - cudf::table_view in(views); - auto new_rows = 
spark_rapids_jni::convert_to_rows(in); - - EXPECT_EQ(new_rows[0]->size(), num_rows); -} - -TEST_F(ColumnToRowTests, ManyStrings) -{ - char const* TEST_STRINGS[] = { - "These", - "are", - "the", - "test", - "strings", - "that", - "we", - "have", - "some are really long", - "and some are kinda short", - "They are all over on purpose with different sizes for the strings in order to test the code " - "on all different lengths of strings", - "a", - "good test", - "is required to produce reasonable confidence that this is working", - "some strings", - "are split into multiple strings", - "some strings have all their data", - "lots of choices of strings and sizes is sure to test the offset calculation code to ensure " - "that even a really long string ends up in the correct spot for the final destination allowing " - "for even crazy run-on sentences to be inserted into the data"}; - auto num_generator = - cudf::detail::make_counting_transform_iterator(0, [](auto i) -> int32_t { return rand(); }); - auto string_generator = - cudf::detail::make_counting_transform_iterator(0, [&](auto i) -> char const* { - return TEST_STRINGS[rand() % (sizeof(TEST_STRINGS) / sizeof(TEST_STRINGS[0]))]; - }); - - auto const num_rows = 1000000; - auto const num_cols = 50; - std::vector schema; - - std::vector cols; - std::vector views; - - for (auto col = 0; col < num_cols; ++col) { - if (rand() % 2) { - cols.emplace_back( - cudf::test::fixed_width_column_wrapper(num_generator, num_generator + num_rows)); - views.push_back(cols.back()); - schema.emplace_back(cudf::data_type{cudf::type_id::INT32}); - } else { - cols.emplace_back( - cudf::test::strings_column_wrapper(string_generator, string_generator + num_rows)); - views.push_back(cols.back()); - schema.emplace_back(cudf::type_id::STRING); - } - } - - cudf::table_view in(views); - auto new_rows = spark_rapids_jni::convert_to_rows(in); - - EXPECT_EQ(new_rows[0]->size(), num_rows); -} - -TEST_F(ColumnToRowTests, Simple) -{ - cudf::test::fixed_width_column_wrapper a({-1, 0, 1}); - cudf::table_view in(std::vector{a}); - std::vector schema = {cudf::data_type{cudf::type_id::INT32}}; - - auto old_rows = spark_rapids_jni::convert_to_rows_fixed_width_optimized(in); - auto new_rows = spark_rapids_jni::convert_to_rows(in); - - EXPECT_EQ(old_rows.size(), new_rows.size()); - for (uint i = 0; i < old_rows.size(); ++i) { - auto old_tbl = spark_rapids_jni::convert_from_rows_fixed_width_optimized( - cudf::lists_column_view(*old_rows[i]), schema); - auto new_tbl = - spark_rapids_jni::convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); - - CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*old_tbl, *new_tbl); - } -} - -TEST_F(ColumnToRowTests, Tall) -{ - auto r = - cudf::detail::make_counting_transform_iterator(0, [](auto i) -> int32_t { return rand(); }); - cudf::test::fixed_width_column_wrapper a(r, r + (size_t)4096); - cudf::table_view in(std::vector{a}); - std::vector schema = {cudf::data_type{cudf::type_id::INT32}}; - - auto old_rows = spark_rapids_jni::convert_to_rows_fixed_width_optimized(in); - auto new_rows = spark_rapids_jni::convert_to_rows(in); - - EXPECT_EQ(old_rows.size(), new_rows.size()); - - for (uint i = 0; i < old_rows.size(); ++i) { - auto old_tbl = spark_rapids_jni::convert_from_rows_fixed_width_optimized( - cudf::lists_column_view(*old_rows[i]), schema); - auto new_tbl = - spark_rapids_jni::convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); - - CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*old_tbl, *new_tbl); - } -} - -TEST_F(ColumnToRowTests, Wide) -{ - 
std::vector> cols; - std::vector views; - std::vector schema; - - for (int i = 0; i < 256; ++i) { - cols.push_back(cudf::test::fixed_width_column_wrapper({rand()})); - views.push_back(cols.back()); - schema.push_back(cudf::data_type{cudf::type_id::INT32}); - } - cudf::table_view in(views); - - auto old_rows = spark_rapids_jni::convert_to_rows_fixed_width_optimized(in); - auto new_rows = spark_rapids_jni::convert_to_rows(in); - - EXPECT_EQ(old_rows.size(), new_rows.size()); - for (uint i = 0; i < old_rows.size(); ++i) { - auto old_tbl = spark_rapids_jni::convert_from_rows_fixed_width_optimized( - cudf::lists_column_view(*old_rows[i]), schema); - auto new_tbl = - spark_rapids_jni::convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); - - CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*old_tbl, *new_tbl); - } -} - -TEST_F(ColumnToRowTests, SingleByteWide) -{ - std::vector> cols; - std::vector views; - std::vector schema; - - for (int i = 0; i < 256; ++i) { - cols.push_back(cudf::test::fixed_width_column_wrapper({rand()})); - views.push_back(cols.back()); - - schema.push_back(cudf::data_type{cudf::type_id::INT8}); - } - cudf::table_view in(views); - - auto old_rows = spark_rapids_jni::convert_to_rows_fixed_width_optimized(in); - auto new_rows = spark_rapids_jni::convert_to_rows(in); - - EXPECT_EQ(old_rows.size(), new_rows.size()); - - for (uint i = 0; i < old_rows.size(); ++i) { - auto old_tbl = spark_rapids_jni::convert_from_rows_fixed_width_optimized( - cudf::lists_column_view(*old_rows[i]), schema); - auto new_tbl = - spark_rapids_jni::convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); - - CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*old_tbl, *new_tbl); - } -} - -TEST_F(ColumnToRowTests, Non2Power) -{ - auto r = - cudf::detail::make_counting_transform_iterator(0, [](auto i) -> int32_t { return rand(); }); - std::vector> cols; - std::vector views; - std::vector schema; - - constexpr auto num_rows = 6 * 1024 + 557; - for (int i = 0; i < 131; ++i) { - cols.push_back(cudf::test::fixed_width_column_wrapper(r + num_rows * i, - r + num_rows * i + num_rows)); - views.push_back(cols.back()); - schema.push_back(cudf::data_type{cudf::type_id::INT32}); - } - cudf::table_view in(views); - - auto old_rows = spark_rapids_jni::convert_to_rows_fixed_width_optimized(in); - auto new_rows = spark_rapids_jni::convert_to_rows(in); - - EXPECT_EQ(old_rows.size(), new_rows.size()); - - for (uint i = 0; i < old_rows.size(); ++i) { - auto old_tbl = spark_rapids_jni::convert_from_rows_fixed_width_optimized( - cudf::lists_column_view(*old_rows[i]), schema); - auto new_tbl = - spark_rapids_jni::convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); - - for (int j = 0; j < old_tbl->num_columns(); ++j) { - CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(old_tbl->get_column(j), new_tbl->get_column(j)); - } - - CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*old_tbl, *new_tbl); - } -} - -TEST_F(ColumnToRowTests, Big) -{ - auto r = - cudf::detail::make_counting_transform_iterator(0, [](auto i) -> int32_t { return rand(); }); - std::vector> cols; - std::vector views; - std::vector schema; - - // 28 columns of 1 million rows - constexpr auto num_rows = 1024 * 1024; - for (int i = 0; i < 28; ++i) { - cols.push_back(cudf::test::fixed_width_column_wrapper(r + num_rows * i, - r + num_rows * i + num_rows)); - views.push_back(cols.back()); - schema.push_back(cudf::data_type{cudf::type_id::INT32}); - } - cudf::table_view in(views); - - auto old_rows = spark_rapids_jni::convert_to_rows_fixed_width_optimized(in); - auto new_rows = 
spark_rapids_jni::convert_to_rows(in); - - EXPECT_EQ(old_rows.size(), new_rows.size()); - - for (uint i = 0; i < old_rows.size(); ++i) { - auto old_tbl = spark_rapids_jni::convert_from_rows_fixed_width_optimized( - cudf::lists_column_view(*old_rows[i]), schema); - auto new_tbl = - spark_rapids_jni::convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); - - for (int j = 0; j < old_tbl->num_columns(); ++j) { - CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(old_tbl->get_column(j), new_tbl->get_column(j)); - } - - CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*old_tbl, *new_tbl); - } -} - -TEST_F(ColumnToRowTests, Bigger) -{ - auto r = - cudf::detail::make_counting_transform_iterator(0, [](auto i) -> int32_t { return rand(); }); - std::vector> cols; - std::vector views; - std::vector schema; - - // 128 columns of 1 million rows - constexpr auto num_rows = 1024 * 1024; - for (int i = 0; i < 128; ++i) { - cols.push_back(cudf::test::fixed_width_column_wrapper(r + num_rows * i, - r + num_rows * i + num_rows)); - views.push_back(cols.back()); - schema.push_back(cudf::data_type{cudf::type_id::INT32}); - } - cudf::table_view in(views); - - auto old_rows = spark_rapids_jni::convert_to_rows_fixed_width_optimized(in); - auto new_rows = spark_rapids_jni::convert_to_rows(in); - - EXPECT_EQ(old_rows.size(), new_rows.size()); - for (uint i = 0; i < old_rows.size(); ++i) { - auto old_tbl = spark_rapids_jni::convert_from_rows_fixed_width_optimized( - cudf::lists_column_view(*old_rows[i]), schema); - auto new_tbl = - spark_rapids_jni::convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); - - for (int j = 0; j < old_tbl->num_columns(); ++j) { - CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(old_tbl->get_column(j), new_tbl->get_column(j)); - } - - CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*old_tbl, *new_tbl); - } -} - -TEST_F(ColumnToRowTests, Biggest) -{ - auto r = - cudf::detail::make_counting_transform_iterator(0, [](auto i) -> int32_t { return rand(); }); - std::vector> cols; - std::vector views; - std::vector schema; - - // 128 columns of 2 million rows - constexpr auto num_rows = 2 * 1024 * 1024; - for (int i = 0; i < 128; ++i) { - cols.push_back(cudf::test::fixed_width_column_wrapper(r + num_rows * i, - r + num_rows * i + num_rows)); - views.push_back(cols.back()); - schema.push_back(cudf::data_type{cudf::type_id::INT32}); - } - cudf::table_view in(views); - - auto old_rows = spark_rapids_jni::convert_to_rows_fixed_width_optimized(in); - auto new_rows = spark_rapids_jni::convert_to_rows(in); - - EXPECT_EQ(old_rows.size(), new_rows.size()); - - for (uint i = 0; i < old_rows.size(); ++i) { - auto old_tbl = spark_rapids_jni::convert_from_rows_fixed_width_optimized( - cudf::lists_column_view(*old_rows[i]), schema); - auto new_tbl = - spark_rapids_jni::convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); - - for (int j = 0; j < old_tbl->num_columns(); ++j) { - CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(old_tbl->get_column(j), new_tbl->get_column(j)); - } - - CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*old_tbl, *new_tbl); - } -} - -TEST_F(RowToColumnTests, Single) -{ - cudf::test::fixed_width_column_wrapper a({-1}); - cudf::table_view in(std::vector{a}); - - auto old_rows = spark_rapids_jni::convert_to_rows(in); - std::vector schema{cudf::data_type{cudf::type_id::INT32}}; - for (uint i = 0; i < old_rows.size(); ++i) { - auto old_tbl = spark_rapids_jni::convert_from_rows_fixed_width_optimized( - cudf::lists_column_view(*old_rows[i]), schema); - auto new_tbl = - 
spark_rapids_jni::convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); - - CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*old_tbl, *new_tbl); - } -} - -TEST_F(RowToColumnTests, Simple) -{ - cudf::test::fixed_width_column_wrapper a({-1, 0, 1}); - cudf::table_view in(std::vector{a}); - - auto old_rows = spark_rapids_jni::convert_to_rows_fixed_width_optimized(in); - std::vector schema{cudf::data_type{cudf::type_id::INT32}}; - for (uint i = 0; i < old_rows.size(); ++i) { - auto old_tbl = spark_rapids_jni::convert_from_rows_fixed_width_optimized( - cudf::lists_column_view(*old_rows[i]), schema); - auto new_tbl = - spark_rapids_jni::convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); - - CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*old_tbl, *new_tbl); - } -} - -TEST_F(RowToColumnTests, Tall) -{ - auto r = - cudf::detail::make_counting_transform_iterator(0, [](auto i) -> int32_t { return rand(); }); - cudf::test::fixed_width_column_wrapper a(r, r + (size_t)4096); - cudf::table_view in(std::vector{a}); - - auto old_rows = spark_rapids_jni::convert_to_rows_fixed_width_optimized(in); - std::vector schema; - schema.reserve(in.num_columns()); - for (auto col = in.begin(); col < in.end(); ++col) { - schema.push_back(col->type()); - } - for (uint i = 0; i < old_rows.size(); ++i) { - auto old_tbl = spark_rapids_jni::convert_from_rows_fixed_width_optimized( - cudf::lists_column_view(*old_rows[i]), schema); - auto new_tbl = - spark_rapids_jni::convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); - - CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*old_tbl, *new_tbl); - } -} - -TEST_F(RowToColumnTests, Wide) -{ - std::vector> cols; - std::vector views; - - for (int i = 0; i < 256; ++i) { - cols.push_back(cudf::test::fixed_width_column_wrapper({i})); // rand()})); - views.push_back(cols.back()); - } - cudf::table_view in(views); - - auto old_rows = spark_rapids_jni::convert_to_rows_fixed_width_optimized(in); - std::vector schema; - schema.reserve(in.num_columns()); - for (auto col = in.begin(); col < in.end(); ++col) { - schema.push_back(col->type()); - } - - for (uint i = 0; i < old_rows.size(); ++i) { - auto old_tbl = spark_rapids_jni::convert_from_rows_fixed_width_optimized( - cudf::lists_column_view(*old_rows[i]), schema); - auto new_tbl = - spark_rapids_jni::convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); - - CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*old_tbl, *new_tbl); - } -} - -TEST_F(RowToColumnTests, SingleByteWide) -{ - std::vector> cols; - std::vector views; - - for (int i = 0; i < 256; ++i) { - cols.push_back(cudf::test::fixed_width_column_wrapper({rand()})); - views.push_back(cols.back()); - } - cudf::table_view in(views); - - auto old_rows = spark_rapids_jni::convert_to_rows_fixed_width_optimized(in); - std::vector schema; - schema.reserve(in.num_columns()); - for (auto col = in.begin(); col < in.end(); ++col) { - schema.push_back(col->type()); - } - for (uint i = 0; i < old_rows.size(); ++i) { - auto old_tbl = spark_rapids_jni::convert_from_rows_fixed_width_optimized( - cudf::lists_column_view(*old_rows[i]), schema); - auto new_tbl = - spark_rapids_jni::convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); - - CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*old_tbl, *new_tbl); - } -} - -TEST_F(RowToColumnTests, AllTypes) -{ - std::vector> cols; - std::vector views; - std::vector schema{cudf::data_type{cudf::type_id::INT64}, - cudf::data_type{cudf::type_id::FLOAT64}, - cudf::data_type{cudf::type_id::INT8}, - cudf::data_type{cudf::type_id::BOOL8}, - 
cudf::data_type{cudf::type_id::FLOAT32}, - cudf::data_type{cudf::type_id::INT8}, - cudf::data_type{cudf::type_id::INT32}, - cudf::data_type{cudf::type_id::INT64}}; - - cudf::test::fixed_width_column_wrapper c0({3, 9, 4, 2, 20, 0}, {1, 1, 1, 1, 1, 0}); - cudf::test::fixed_width_column_wrapper c1({5.0, 9.5, 0.9, 7.23, 2.8, 0.0}, - {1, 1, 1, 1, 1, 0}); - cudf::test::fixed_width_column_wrapper c2({5, 1, 0, 2, 7, 0}, {1, 1, 1, 1, 1, 0}); - cudf::test::fixed_width_column_wrapper c3({true, false, false, true, false, false}, - {1, 1, 1, 1, 1, 0}); - cudf::test::fixed_width_column_wrapper c4({1.0f, 3.5f, 5.9f, 7.1f, 9.8f, 0.0f}, - {1, 1, 1, 1, 1, 0}); - cudf::test::fixed_width_column_wrapper c5({2, 3, 4, 5, 9, 0}, {1, 1, 1, 1, 1, 0}); - cudf::test::fixed_point_column_wrapper c6( - {-300, 500, 950, 90, 723, 0}, {1, 1, 1, 1, 1, 1, 1, 0}, numeric::scale_type{-2}); - cudf::test::fixed_point_column_wrapper c7( - {-80, 30, 90, 20, 200, 0}, {1, 1, 1, 1, 1, 1, 0}, numeric::scale_type{-1}); - - cudf::table_view in({c0, c1, c2, c3, c4, c5, c6, c7}); - - auto old_rows = spark_rapids_jni::convert_to_rows_fixed_width_optimized(in); - auto new_rows = spark_rapids_jni::convert_to_rows(in); - - for (uint i = 0; i < old_rows.size(); ++i) { - auto old_tbl = spark_rapids_jni::convert_from_rows_fixed_width_optimized( - cudf::lists_column_view(*old_rows[i]), schema); - auto new_tbl = - spark_rapids_jni::convert_from_rows(cudf::lists_column_view(*new_rows[i]), schema); - - CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*old_tbl, *new_tbl); - } -} - -TEST_F(RowToColumnTests, AllTypesLarge) -{ - std::vector cols; - std::vector schema{}; - - // 15 columns of each type with 1 million entries - constexpr int num_rows{1024 * 1024 * 1}; - - std::default_random_engine re; - std::uniform_real_distribution rand_double(std::numeric_limits::min(), - std::numeric_limits::max()); - std::uniform_int_distribution rand_int64(std::numeric_limits::min(), - std::numeric_limits::max()); - auto r = cudf::detail::make_counting_transform_iterator( - 0, [&](auto i) -> int64_t { return rand_int64(re); }); - auto d = cudf::detail::make_counting_transform_iterator( - 0, [&](auto i) -> double { return rand_double(re); }); - - auto all_valid = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return 1; }); - auto none_valid = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return 0; }); - auto most_valid = cudf::detail::make_counting_transform_iterator( - 0, [](auto i) { return rand() % 2 == 0 ? 0 : 1; }); - auto few_valid = cudf::detail::make_counting_transform_iterator( - 0, [](auto i) { return rand() % 13 == 0 ? 
1 : 0; }); - - for (int i = 0; i < 15; ++i) { - cols.push_back(*cudf::test::fixed_width_column_wrapper(r, r + num_rows, all_valid) - .release() - .release()); - schema.push_back(cudf::data_type{cudf::type_id::INT8}); - } - - for (int i = 0; i < 15; ++i) { - cols.push_back(*cudf::test::fixed_width_column_wrapper(r, r + num_rows, few_valid) - .release() - .release()); - schema.push_back(cudf::data_type{cudf::type_id::INT16}); - } - - for (int i = 0; i < 15; ++i) { - if (i < 5) { - cols.push_back(*cudf::test::fixed_width_column_wrapper(r, r + num_rows, few_valid) - .release() - .release()); - } else { - cols.push_back(*cudf::test::fixed_width_column_wrapper(r, r + num_rows, none_valid) - .release() - .release()); - } - schema.push_back(cudf::data_type{cudf::type_id::INT32}); - } - - for (int i = 0; i < 15; ++i) { - cols.push_back(*cudf::test::fixed_width_column_wrapper(d, d + num_rows, most_valid) - .release() - .release()); - schema.push_back(cudf::data_type{cudf::type_id::FLOAT32}); - } - - for (int i = 0; i < 15; ++i) { - cols.push_back(*cudf::test::fixed_width_column_wrapper(d, d + num_rows, most_valid) - .release() - .release()); - schema.push_back(cudf::data_type{cudf::type_id::FLOAT64}); - } - - for (int i = 0; i < 15; ++i) { - cols.push_back(*cudf::test::fixed_width_column_wrapper(r, r + num_rows, few_valid) - .release() - .release()); - schema.push_back(cudf::data_type{cudf::type_id::BOOL8}); - } - - for (int i = 0; i < 15; ++i) { - cols.push_back( - *cudf::test::fixed_width_column_wrapper( - r, r + num_rows, all_valid) - .release() - .release()); - schema.push_back(cudf::data_type{cudf::type_id::TIMESTAMP_MILLISECONDS}); - } - - for (int i = 0; i < 15; ++i) { - cols.push_back( - *cudf::test::fixed_width_column_wrapper( - r, r + num_rows, most_valid) - .release() - .release()); - schema.push_back(cudf::data_type{cudf::type_id::TIMESTAMP_DAYS}); - } - - for (int i = 0; i < 15; ++i) { - cols.push_back(*cudf::test::fixed_point_column_wrapper( - r, r + num_rows, all_valid, numeric::scale_type{-2}) - .release() - .release()); - schema.push_back(cudf::data_type{cudf::type_id::DECIMAL32}); - } - - for (int i = 0; i < 15; ++i) { - cols.push_back(*cudf::test::fixed_point_column_wrapper( - r, r + num_rows, most_valid, numeric::scale_type{-1}) - .release() - .release()); - schema.push_back(cudf::data_type{cudf::type_id::DECIMAL64}); - } - - std::vector views(cols.begin(), cols.end()); - cudf::table_view in(views); - - auto old_rows = spark_rapids_jni::convert_to_rows_fixed_width_optimized(in); - auto new_rows = spark_rapids_jni::convert_to_rows(in); - - for (uint i = 0; i < old_rows.size(); ++i) { - auto old_tbl = spark_rapids_jni::convert_from_rows_fixed_width_optimized( - cudf::lists_column_view(*old_rows[i]), schema); - auto new_tbl = - spark_rapids_jni::convert_from_rows(cudf::lists_column_view(*new_rows[i]), schema); - - CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*old_tbl, *new_tbl); - } -} - -TEST_F(RowToColumnTests, Non2Power) -{ - auto r = - cudf::detail::make_counting_transform_iterator(0, [](auto i) -> int32_t { return rand(); }); - std::vector> cols; - std::vector views; - std::vector schema; - - constexpr auto num_rows = 6 * 1024 + 557; - for (int i = 0; i < 131; ++i) { - cols.push_back(cudf::test::fixed_width_column_wrapper(r + num_rows * i, - r + num_rows * i + num_rows)); - views.push_back(cols.back()); - schema.push_back(cudf::data_type{cudf::type_id::INT32}); - } - cudf::table_view in(views); - - auto old_rows = spark_rapids_jni::convert_to_rows_fixed_width_optimized(in); - - for (uint 
i = 0; i < old_rows.size(); ++i) { - auto old_tbl = spark_rapids_jni::convert_from_rows_fixed_width_optimized( - cudf::lists_column_view(*old_rows[i]), schema); - auto new_tbl = - spark_rapids_jni::convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); - - CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*old_tbl, *new_tbl); - } -} - -TEST_F(RowToColumnTests, Big) -{ - auto r = - cudf::detail::make_counting_transform_iterator(0, [](auto i) -> int32_t { return rand(); }); - std::vector<cudf::test::fixed_width_column_wrapper<int32_t>> cols; - std::vector<cudf::column_view> views; - std::vector<cudf::data_type> schema; - - // 28 columns of 1 million rows - constexpr auto num_rows = 1024 * 1024; - for (int i = 0; i < 28; ++i) { - cols.push_back(cudf::test::fixed_width_column_wrapper<int32_t>(r + num_rows * i, - r + num_rows * i + num_rows)); - views.push_back(cols.back()); - schema.push_back(cudf::data_type{cudf::type_id::INT32}); - } - cudf::table_view in(views); - - auto old_rows = spark_rapids_jni::convert_to_rows_fixed_width_optimized(in); - - for (uint i = 0; i < old_rows.size(); ++i) { - auto old_tbl = spark_rapids_jni::convert_from_rows_fixed_width_optimized( - cudf::lists_column_view(*old_rows[i]), schema); - auto new_tbl = - spark_rapids_jni::convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); - - CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*old_tbl, *new_tbl); - } -} - -TEST_F(RowToColumnTests, Bigger) -{ - auto r = - cudf::detail::make_counting_transform_iterator(0, [](auto i) -> int32_t { return rand(); }); - std::vector<cudf::test::fixed_width_column_wrapper<int32_t>> cols; - std::vector<cudf::column_view> views; - std::vector<cudf::data_type> schema; - - // 128 columns of 1 million rows - constexpr auto num_rows = 1024 * 1024; - for (int i = 0; i < 128; ++i) { - cols.push_back(cudf::test::fixed_width_column_wrapper<int32_t>(r + num_rows * i, - r + num_rows * i + num_rows)); - views.push_back(cols.back()); - schema.push_back(cudf::data_type{cudf::type_id::INT32}); - } - cudf::table_view in(views); - - auto old_rows = spark_rapids_jni::convert_to_rows_fixed_width_optimized(in); - - for (uint i = 0; i < old_rows.size(); ++i) { - auto old_tbl = spark_rapids_jni::convert_from_rows_fixed_width_optimized( - cudf::lists_column_view(*old_rows[i]), schema); - auto new_tbl = - spark_rapids_jni::convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); - - CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*old_tbl, *new_tbl); - } -} - -TEST_F(RowToColumnTests, Biggest) -{ - auto r = - cudf::detail::make_counting_transform_iterator(0, [](auto i) -> int32_t { return rand(); }); - std::vector<cudf::test::fixed_width_column_wrapper<int32_t>> cols; - std::vector<cudf::column_view> views; - std::vector<cudf::data_type> schema; - - // 128 columns of 5 million rows - constexpr auto num_rows = 5 * 1024 * 1024; - for (int i = 0; i < 128; ++i) { - cols.push_back(cudf::test::fixed_width_column_wrapper<int32_t>(r + num_rows * i, - r + num_rows * i + num_rows)); - views.push_back(cols.back()); - schema.push_back(cudf::data_type{cudf::type_id::INT32}); - } - cudf::table_view in(views); - - auto old_rows = spark_rapids_jni::convert_to_rows_fixed_width_optimized(in); - auto new_rows = spark_rapids_jni::convert_to_rows(in); - - for (uint i = 0; i < old_rows.size(); ++i) { - auto old_tbl = spark_rapids_jni::convert_from_rows_fixed_width_optimized( - cudf::lists_column_view(*old_rows[i]), schema); - auto new_tbl = - spark_rapids_jni::convert_from_rows(cudf::lists_column_view(*new_rows[i]), schema); - - CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*old_tbl, *new_tbl); - } -} - -TEST_F(RowToColumnTests, SimpleString) -{ - cudf::test::fixed_width_column_wrapper<int32_t> a({-1, 0, 1, 0, -1}); - cudf::test::strings_column_wrapper b( - {"hello", "world", "this is a really long string to generate a longer row", 
"dlrow", "olleh"}); - cudf::table_view in(std::vector{a, b}); - std::vector schema = {cudf::data_type{cudf::type_id::INT32}, - cudf::data_type{cudf::type_id::STRING}}; - - auto new_rows = spark_rapids_jni::convert_to_rows(in); - EXPECT_EQ(new_rows.size(), 1); - for (auto& row : new_rows) { - auto new_cols = spark_rapids_jni::convert_from_rows(cudf::lists_column_view(*row), schema); - - EXPECT_EQ(row->size(), 5); - auto const num_columns = new_cols->num_columns(); - - cudf::strings_column_view str_col = new_cols->get_column(1).view(); - std::vector> col_data; - std::vector> offset_data; - for (int i = 0; i < num_columns; ++i) { - offset_data.emplace_back( - std::get<0>(cudf::test::to_host(str_col.offsets()))); - col_data.emplace_back(std::get<0>(cudf::test::to_host(str_col.chars()))); - } - - CUDF_TEST_EXPECT_TABLES_EQUIVALENT(in, *new_cols); - } -} - -TEST_F(RowToColumnTests, DoubleString) -{ - cudf::test::strings_column_wrapper a( - {"hello", "world", "this is a really long string to generate a longer row", "dlrow", "olleh"}); - cudf::test::fixed_width_column_wrapper b({0, 1, 2, 3, 4}); - cudf::test::strings_column_wrapper c({"world", - "hello", - "this string isn't as long", - "this one isn't so short though when you think about it", - "dlrow"}); - cudf::table_view in(std::vector{a, b, c}); - std::vector schema = {cudf::data_type{cudf::type_id::STRING}, - cudf::data_type{cudf::type_id::INT32}, - cudf::data_type{cudf::type_id::STRING}}; - - auto new_rows = spark_rapids_jni::convert_to_rows(in); - - for (uint i = 0; i < new_rows.size(); ++i) { - auto new_cols = - spark_rapids_jni::convert_from_rows(cudf::lists_column_view(*new_rows[i]), schema); - - EXPECT_EQ(new_rows[0]->size(), 5); - CUDF_TEST_EXPECT_TABLES_EQUIVALENT(in, *new_cols); - } -} - -TEST_F(RowToColumnTests, BigStrings) -{ - char const* TEST_STRINGS[] = { - "These", - "are", - "the", - "test", - "strings", - "that", - "we", - "have", - "some are really long", - "and some are kinda short", - "They are all over on purpose with different sizes for the strings in order to test the code " - "on all different lengths of strings", - "a", - "good test", - "is required to produce reasonable confidence that this is working"}; - auto num_generator = - cudf::detail::make_counting_transform_iterator(0, [](auto i) -> int32_t { return rand(); }); - auto string_generator = - cudf::detail::make_counting_transform_iterator(0, [&](auto i) -> char const* { - return TEST_STRINGS[rand() % (sizeof(TEST_STRINGS) / sizeof(TEST_STRINGS[0]))]; - }); - - auto const num_rows = 50; - auto const num_cols = 50; - std::vector schema; - - std::vector cols; - std::vector views; - - for (auto col = 0; col < num_cols; ++col) { - if (rand() % 2) { - cols.emplace_back( - cudf::test::fixed_width_column_wrapper(num_generator, num_generator + num_rows)); - views.push_back(cols.back()); - schema.emplace_back(cudf::data_type{cudf::type_id::INT32}); - } else { - cols.emplace_back( - cudf::test::strings_column_wrapper(string_generator, string_generator + num_rows)); - views.push_back(cols.back()); - schema.emplace_back(cudf::type_id::STRING); - } - } - - cudf::table_view in(views); - auto new_rows = spark_rapids_jni::convert_to_rows(in); - - for (auto& i : new_rows) { - auto new_cols = spark_rapids_jni::convert_from_rows(cudf::lists_column_view(*i), schema); - - auto in_view = cudf::slice(in, {0, new_cols->num_rows()}); - CUDF_TEST_EXPECT_TABLES_EQUIVALENT(in_view[0], *new_cols); - } -} - -TEST_F(RowToColumnTests, ManyStrings) -{ - char const* TEST_STRINGS[] = { - "These", 
- "are", - "the", - "test", - "strings", - "that", - "we", - "have", - "some are really long", - "and some are kinda short", - "They are all over on purpose with different sizes for the strings in order to test the code " - "on all different lengths of strings", - "this string is the longest string because it is duplicated more than you can imagine " - "this string is the longest string because it is duplicated more than you can imagine " - "this string is the longest string because it is duplicated more than you can imagine " - "this string is the longest string because it is duplicated more than you can imagine " - "this string is the longest string because it is duplicated more than you can imagine " - "this string is the longest string because it is duplicated more than you can imagine " - "this string is the longest string because it is duplicated more than you can imagine " - "this string is the longest string because it is duplicated more than you can imagine " - "this string is the longest string because it is duplicated more than you can imagine " - "this string is the longest string because it is duplicated more than you can imagine " - "this string is the longest string because it is duplicated more than you can imagine " - "this string is the longest string because it is duplicated more than you can imagine " - "this string is the longest string because it is duplicated more than you can imagine " - "this string is the longest string because it is duplicated more than you can imagine " - "this string is the longest string because it is duplicated more than you can imagine " - "this string is the longest string because it is duplicated more than you can imagine " - "this string is the longest string because it is duplicated more than you can imagine " - "this string is the longest string because it is duplicated more than you can imagine " - "this string is the longest string because it is duplicated more than you can imagine " - "this string is the longest string because it is duplicated more than you can imagine " - "this string is the longest string because it is duplicated more than you can imagine " - "this string is the longest string because it is duplicated more than you can imagine " - "this string is the longest string because it is duplicated more than you can imagine " - "this string is the longest string because it is duplicated more than you can imagine " - "this string is the longest string because it is duplicated more than you can imagine " - "this string is the longest string because it is duplicated more than you can imagine " - "a", - "good test", - "is required to produce reasonable confidence that this is working", - "some strings", - "are split into multiple strings", - "some strings have all their data", - "lots of choices of strings and sizes is sure to test the offset calculation code to ensure " - "that even a really long string ends up in the correct spot for the final destination allowing " - "for even crazy run-on sentences to be inserted into the data"}; - auto num_generator = - cudf::detail::make_counting_transform_iterator(0, [](auto i) -> int32_t { return rand(); }); - auto string_generator = - cudf::detail::make_counting_transform_iterator(0, [&](auto i) -> char const* { - return TEST_STRINGS[rand() % (sizeof(TEST_STRINGS) / sizeof(TEST_STRINGS[0]))]; - }); - - auto const num_rows = 500000; - auto const num_cols = 50; - std::vector schema; - - std::vector cols; - std::vector views; - - for (auto col = 0; col < num_cols; ++col) { - if (rand() % 
2) { - cols.emplace_back( - cudf::test::fixed_width_column_wrapper(num_generator, num_generator + num_rows)); - views.push_back(cols.back()); - schema.emplace_back(cudf::data_type{cudf::type_id::INT32}); - } else { - cols.emplace_back( - cudf::test::strings_column_wrapper(string_generator, string_generator + num_rows)); - views.push_back(cols.back()); - schema.emplace_back(cudf::type_id::STRING); - } - } - - cudf::table_view in(views); - auto new_rows = spark_rapids_jni::convert_to_rows(in); - - for (auto& i : new_rows) { - auto new_cols = spark_rapids_jni::convert_from_rows(cudf::lists_column_view(*i), schema); - - auto in_view = cudf::slice(in, {0, new_cols->num_rows()}); - CUDF_TEST_EXPECT_TABLES_EQUIVALENT(in_view[0], *new_cols); - } -} diff --git a/thirdparty/cudf b/thirdparty/cudf index 8b695e3403..36f56c97b9 160000 --- a/thirdparty/cudf +++ b/thirdparty/cudf @@ -1 +1 @@ -Subproject commit 8b695e340355d43261800a1cff876369e916ae90 +Subproject commit 36f56c97b94446f29fef5d2ddd8818275a28e406 From 38e503c5df1d9d8d6f84f3578abc2252e3b767f7 Mon Sep 17 00:00:00 2001 From: Jenkins Automation <70000568+nvauto@users.noreply.github.com> Date: Thu, 28 Dec 2023 11:30:23 +0800 Subject: [PATCH 072/127] Update submodule cudf to 72e6f9b08d3c52ca96ed64d963305ab9005ebff6 (#1669) Signed-off-by: spark-rapids automation <70000568+nvauto@users.noreply.github.com> --- thirdparty/cudf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/thirdparty/cudf b/thirdparty/cudf index 36f56c97b9..72e6f9b08d 160000 --- a/thirdparty/cudf +++ b/thirdparty/cudf @@ -1 +1 @@ -Subproject commit 36f56c97b94446f29fef5d2ddd8818275a28e406 +Subproject commit 72e6f9b08d3c52ca96ed64d963305ab9005ebff6 From 4358580f1f54e409786149049981b8bec16d2817 Mon Sep 17 00:00:00 2001 From: Navin Kumar <97137715+NVnavkumar@users.noreply.github.com> Date: Tue, 2 Jan 2024 07:49:12 -0800 Subject: [PATCH 073/127] Make the GpuTimeZoneDB class idempotent, such that when it is shutdown, it can be recovered and useable again (#1670) --- src/main/java/com/nvidia/spark/rapids/jni/GpuTimeZoneDB.java | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/main/java/com/nvidia/spark/rapids/jni/GpuTimeZoneDB.java b/src/main/java/com/nvidia/spark/rapids/jni/GpuTimeZoneDB.java index 0eb56100e4..b63a9dc282 100644 --- a/src/main/java/com/nvidia/spark/rapids/jni/GpuTimeZoneDB.java +++ b/src/main/java/com/nvidia/spark/rapids/jni/GpuTimeZoneDB.java @@ -90,6 +90,8 @@ public Thread newThread(Runnable r) { public static void shutdown() { if (instance.isLoaded()) { instance.close(); + // Recreate a new instance to reload the database if necessary + instance = new GpuTimeZoneDB(); } } From 2b50b5415cc5482fe121e23b217a5fe60f3944e8 Mon Sep 17 00:00:00 2001 From: Jason Lowe Date: Tue, 2 Jan 2024 14:19:53 -0600 Subject: [PATCH 074/127] Update copyright date in NOTICE file (#1673) Signed-off-by: Jason Lowe --- NOTICE | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/NOTICE b/NOTICE index a0975c00c8..5e01c7e14c 100644 --- a/NOTICE +++ b/NOTICE @@ -1,5 +1,5 @@ RAPIDS Accelerator JNI For Apache Spark -Copyright (c) 2022-2023, NVIDIA CORPORATION +Copyright (c) 2022-2024, NVIDIA CORPORATION -------------------------------------------------------------------------------- From cbbf553b4d8fe43b6c326bfbf1157f98b15cb4e1 Mon Sep 17 00:00:00 2001 From: Jenkins Automation <70000568+nvauto@users.noreply.github.com> Date: Wed, 3 Jan 2024 11:26:05 +0800 Subject: [PATCH 075/127] Update submodule cudf to af65d52c7d4ca41606482926bdcc001644b7d108 (#1674) 
Signed-off-by: spark-rapids automation <70000568+nvauto@users.noreply.github.com> --- thirdparty/cudf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/thirdparty/cudf b/thirdparty/cudf index 72e6f9b08d..af65d52c7d 160000 --- a/thirdparty/cudf +++ b/thirdparty/cudf @@ -1 +1 @@ -Subproject commit 72e6f9b08d3c52ca96ed64d963305ab9005ebff6 +Subproject commit af65d52c7d4ca41606482926bdcc001644b7d108 From fd95e5c2032daab28a812c37f70bc63fcbd2bb48 Mon Sep 17 00:00:00 2001 From: Jenkins Automation <70000568+nvauto@users.noreply.github.com> Date: Thu, 4 Jan 2024 12:05:34 +0800 Subject: [PATCH 076/127] Update submodule cudf to 4c01e9513c28ef590184d34c0c54292743562c8f (#1675) Signed-off-by: spark-rapids automation <70000568+nvauto@users.noreply.github.com> --- thirdparty/cudf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/thirdparty/cudf b/thirdparty/cudf index af65d52c7d..4c01e9513c 160000 --- a/thirdparty/cudf +++ b/thirdparty/cudf @@ -1 +1 @@ -Subproject commit af65d52c7d4ca41606482926bdcc001644b7d108 +Subproject commit 4c01e9513c28ef590184d34c0c54292743562c8f From 1c34077a5afee0fe707d6b45bccf255e6967218a Mon Sep 17 00:00:00 2001 From: Jenkins Automation <70000568+nvauto@users.noreply.github.com> Date: Fri, 5 Jan 2024 05:31:14 +0800 Subject: [PATCH 077/127] Update submodule cudf to fab5af24afd36a2a58fc18492bc79b4212762b96 (#1677) Signed-off-by: spark-rapids automation <70000568+nvauto@users.noreply.github.com> --- thirdparty/cudf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/thirdparty/cudf b/thirdparty/cudf index 4c01e9513c..fab5af24af 160000 --- a/thirdparty/cudf +++ b/thirdparty/cudf @@ -1 +1 @@ -Subproject commit 4c01e9513c28ef590184d34c0c54292743562c8f +Subproject commit fab5af24afd36a2a58fc18492bc79b4212762b96 From e3fe4158690ec584ac826fbc1d0a0a77387369cd Mon Sep 17 00:00:00 2001 From: Haoyang Li Date: Fri, 5 Jan 2024 09:10:08 +0800 Subject: [PATCH 078/127] Fix a bug in format_float kernel (#1676) Signed-off-by: Haoyang Li --- src/main/cpp/src/ftos_converter.cuh | 44 ++++++++++++++--------------- src/main/cpp/tests/format_float.cpp | 6 +++- 2 files changed, 26 insertions(+), 24 deletions(-) diff --git a/src/main/cpp/src/ftos_converter.cuh b/src/main/cpp/src/ftos_converter.cuh index e684f73921..cbbf28e749 100644 --- a/src/main/cpp/src/ftos_converter.cuh +++ b/src/main/cpp/src/ftos_converter.cuh @@ -1202,11 +1202,8 @@ __device__ inline T round_half_even(T const input, int const olength, int const { // "round" a integer to digits digits, with the half-even rounding mode. 
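// NOTE (illustrative sketch, not part of the upstream patch): `digits` here
// is the count of leading decimal digits to keep. Assuming d2d yields
// mantissa 3232794248339 (olength = 13) for the 3232.794248339 case in the
// test added below, formatting with 5 fractional digits reaches this helper
// as round_half_even(3232794248339, 13, 9):
//   div = POW10_TABLE[13 - 9] = 10000
//   mod = 3232794248339 % 10000 = 8339   // > div / 2, so round up
//   num = 3232794248339 / 10000 + 1 = 323279425
// which to_formated_double_chars then splits as "3,232.79425".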
if (digits > olength) { - T num = input; - for (int i = 0; i < digits - olength; i++) { - num *= 10; - } - return num; + // trailing zeros will be handled later + return input; } T div = POW10_TABLE[olength - digits]; T mod = input % div; @@ -1215,10 +1212,10 @@ __device__ inline T round_half_even(T const input, int const olength, int const return num; } -__device__ inline int to_formated_chars(floating_decimal_64 const v, - bool const sign, - char* const result, - int digits) +__device__ inline int to_formated_double_chars(floating_decimal_64 const v, + bool const sign, + char* const result, + int digits) { int index = 0; if (sign) { result[index++] = '-'; } @@ -1289,9 +1286,10 @@ __device__ inline int to_formated_chars(floating_decimal_64 const v, result[index++] = '0'; } } else { + // 0 <= exp < olength - 1 uint32_t temp_d = digits, tailing_zero = 0; - if (exp + digits > olength) { - temp_d = olength - exp; + if (exp + digits + 1 > olength) { + temp_d = olength - exp - 1; tailing_zero = digits - temp_d; } uint64_t rounded_output = round_half_even(output, olength, exp + temp_d + 1); @@ -1329,7 +1327,7 @@ __device__ inline int to_formated_chars(floating_decimal_64 const v, return index; } -__device__ inline int format_float_size(floating_decimal_64 const v, bool const sign, int digits) +__device__ inline int format_double_size(floating_decimal_64 const v, bool const sign, int digits) { int index = 0; if (sign) { index++; } @@ -1342,7 +1340,7 @@ __device__ inline int format_float_size(floating_decimal_64 const v, bool const index += exp + 1 + exp / 3 + 1 + digits; } else { uint32_t temp_d = digits; - if (exp + digits > olength) { temp_d = olength - exp; } + if (exp + digits + 1 > olength) { temp_d = olength - exp - 1; } uint64_t rounded_output = round_half_even(output, olength, exp + temp_d + 1); uint64_t pow10 = POW10_TABLE[temp_d]; uint64_t integer = rounded_output / pow10; @@ -1353,10 +1351,10 @@ __device__ inline int format_float_size(floating_decimal_64 const v, bool const return index; } -__device__ inline int to_formated_chars(floating_decimal_32 const v, - bool const sign, - char* const result, - int digits) +__device__ inline int to_formated_float_chars(floating_decimal_32 const v, + bool const sign, + char* const result, + int digits) { int index = 0; if (sign) { result[index++] = '-'; } @@ -1428,8 +1426,8 @@ __device__ inline int to_formated_chars(floating_decimal_32 const v, } } else { uint32_t temp_d = digits, tailing_zero = 0; - if (exp + digits > olength) { - temp_d = olength - exp; + if (exp + digits + 1 > olength) { + temp_d = olength - exp - 1; tailing_zero = digits - temp_d; } uint32_t rounded_output = round_half_even(output, olength, exp + temp_d + 1); @@ -1480,7 +1478,7 @@ __device__ inline int format_float_size(floating_decimal_32 const v, bool const index += exp + 1 + exp / 3 + 1 + digits; } else { uint32_t temp_d = digits; - if (exp + digits > olength) { temp_d = olength - exp; } + if (exp + digits + 1 > olength) { temp_d = olength - exp - 1; } uint64_t rounded_output = round_half_even(output, olength, exp + temp_d + 1); uint64_t pow10 = POW10_TABLE[temp_d]; uint64_t integer = rounded_output / pow10; @@ -1539,7 +1537,7 @@ __device__ inline int compute_format_float_size(double value, int digits, bool i } else { floating_decimal_64 v = d2d(value, sign, special); if (special) { return special_format_str_size(sign, v.exponent, v.mantissa, digits); } - return format_float_size(v, sign, digits); + return format_double_size(v, sign, digits); } } @@ -1549,11 +1547,11 @@ 
__device__ inline int format_float(double value, int digits, bool is_float, char if (is_float) { floating_decimal_32 v = f2d(value, sign, special); if (special) { return copy_format_special_str(output, sign, v.exponent, v.mantissa, digits); } - return to_formated_chars(v, sign, output, digits); + return to_formated_float_chars(v, sign, output, digits); } else { floating_decimal_64 v = d2d(value, sign, special); if (special) { return copy_format_special_str(output, sign, v.exponent, v.mantissa, digits); } - return to_formated_chars(v, sign, output, digits); + return to_formated_double_chars(v, sign, output, digits); } } diff --git a/src/main/cpp/tests/format_float.cpp b/src/main/cpp/tests/format_float.cpp index b9d77593db..20989a8c20 100644 --- a/src/main/cpp/tests/format_float.cpp +++ b/src/main/cpp/tests/format_float.cpp @@ -68,6 +68,8 @@ TEST_F(FormatFloatTests, FormatFloats64) -4.0d, std::numeric_limits::quiet_NaN(), 839542223232.794248339d, + 3232.794248339d, + 11234000000.0d, -0.0d}; auto const expected = cudf::test::strings_column_wrapper{"100.00000", @@ -80,9 +82,11 @@ TEST_F(FormatFloatTests, FormatFloats64) "-4.00000", "\xEF\xBF\xBD", "839,542,223,232.79420", + "3,232.79425", + "11,234,000,000.00000", "-0.00000"}; auto results = spark_rapids_jni::format_float(floats, 5, cudf::get_default_stream()); CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected, verbosity); -} \ No newline at end of file +} From ce9a6cc07f86df278c568313dbbce49c22189ce2 Mon Sep 17 00:00:00 2001 From: Jenkins Automation <70000568+nvauto@users.noreply.github.com> Date: Fri, 5 Jan 2024 17:30:56 +0800 Subject: [PATCH 079/127] Update submodule cudf to b83ab433fb1eb6eb832cf72cb3574909e270edaf (#1680) Signed-off-by: spark-rapids automation <70000568+nvauto@users.noreply.github.com> --- thirdparty/cudf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/thirdparty/cudf b/thirdparty/cudf index fab5af24af..b83ab433fb 160000 --- a/thirdparty/cudf +++ b/thirdparty/cudf @@ -1 +1 @@ -Subproject commit fab5af24afd36a2a58fc18492bc79b4212762b96 +Subproject commit b83ab433fb1eb6eb832cf72cb3574909e270edaf From b32430372cd63b0fc3449b000a93241e03b5e8ea Mon Sep 17 00:00:00 2001 From: Jenkins Automation <70000568+nvauto@users.noreply.github.com> Date: Fri, 5 Jan 2024 23:29:55 +0800 Subject: [PATCH 080/127] Update submodule cudf to 9c7b05b75de5ebdb438643ad1bdf4fa0b821a480 (#1681) Signed-off-by: spark-rapids automation <70000568+nvauto@users.noreply.github.com> --- thirdparty/cudf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/thirdparty/cudf b/thirdparty/cudf index b83ab433fb..9c7b05b75d 160000 --- a/thirdparty/cudf +++ b/thirdparty/cudf @@ -1 +1 @@ -Subproject commit b83ab433fb1eb6eb832cf72cb3574909e270edaf +Subproject commit 9c7b05b75de5ebdb438643ad1bdf4fa0b821a480 From 82de17bd73d28fe82f1918a5bef5518d69d4340f Mon Sep 17 00:00:00 2001 From: Jenkins Automation <70000568+nvauto@users.noreply.github.com> Date: Sat, 6 Jan 2024 11:25:36 +0800 Subject: [PATCH 081/127] Update submodule cudf to 6083efa73b3282a457c963f68a8cab94d41cdd70 (#1682) Signed-off-by: spark-rapids automation <70000568+nvauto@users.noreply.github.com> --- thirdparty/cudf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/thirdparty/cudf b/thirdparty/cudf index 9c7b05b75d..6083efa73b 160000 --- a/thirdparty/cudf +++ b/thirdparty/cudf @@ -1 +1 @@ -Subproject commit 9c7b05b75de5ebdb438643ad1bdf4fa0b821a480 +Subproject commit 6083efa73b3282a457c963f68a8cab94d41cdd70 From 473270bbfdfca5b0b8ceb914f1d06060003483ab Mon Sep 17 
00:00:00 2001 From: Jenkins Automation <70000568+nvauto@users.noreply.github.com> Date: Tue, 9 Jan 2024 05:29:53 +0800 Subject: [PATCH 082/127] Update submodule cudf to ba7550a17f57d17d6d6decec3b2f8a0a5f687aa8 (#1684) Signed-off-by: spark-rapids automation <70000568+nvauto@users.noreply.github.com> --- thirdparty/cudf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/thirdparty/cudf b/thirdparty/cudf index 6083efa73b..ba7550a17f 160000 --- a/thirdparty/cudf +++ b/thirdparty/cudf @@ -1 +1 @@ -Subproject commit 6083efa73b3282a457c963f68a8cab94d41cdd70 +Subproject commit ba7550a17f57d17d6d6decec3b2f8a0a5f687aa8 From 4c6de74cbb71accec464b6be97963fb3df85f680 Mon Sep 17 00:00:00 2001 From: Haoyang Li Date: Tue, 9 Jan 2024 09:24:56 +0800 Subject: [PATCH 083/127] Fix compile warnings and refactor ftos_converter.cuh (#1679) * Fix compile warnings and refactor ftos_converter.cuh Signed-off-by: Haoyang Li * Apply suggestions from code review Co-authored-by: Nghia Truong <7416935+ttnghia@users.noreply.github.com> * address comments Signed-off-by: Haoyang Li --------- Signed-off-by: Haoyang Li Co-authored-by: Nghia Truong <7416935+ttnghia@users.noreply.github.com> --- src/main/cpp/src/ftos_converter.cuh | 423 ++++++++++------------------ 1 file changed, 146 insertions(+), 277 deletions(-) diff --git a/src/main/cpp/src/ftos_converter.cuh b/src/main/cpp/src/ftos_converter.cuh index cbbf28e749..c2fa07377c 100644 --- a/src/main/cpp/src/ftos_converter.cuh +++ b/src/main/cpp/src/ftos_converter.cuh @@ -147,9 +147,25 @@ __constant__ uint64_t const DOUBLE_POW5_TABLE[POW5_TABLE_SIZE] = {1ull, //===== common.h from ryu ===== -// Returns the number of decimal digits in v, which must not contain more than 9 digits. -__device__ inline uint32_t decimalLength9(uint32_t const v) +// Returns the number of decimal digits in v, which must not contain more than 9/17 digits. +template +__device__ inline uint32_t decimal_length(T const v) { + static_assert(std::is_integral_v && !std::is_signed_v); + if constexpr (sizeof(T) == sizeof(int64_t)) { + // The average output length is 16.38 digits, so we check high-to-low. + // Function precondition: v is not an 18, 19, or 20-digit number. + // (17 digits are sufficient for round-tripping.) + assert(v < 100000000000000000L); + if (v >= 10000000000000000L) { return 17; } + if (v >= 1000000000000000L) { return 16; } + if (v >= 100000000000000L) { return 15; } + if (v >= 10000000000000L) { return 14; } + if (v >= 1000000000000L) { return 13; } + if (v >= 100000000000L) { return 12; } + if (v >= 10000000000L) { return 11; } + if (v >= 1000000000L) { return 10; } + } // Function precondition: v is not a 10-digit number. // (f2s: 9 digits are sufficient for round-tripping.) // (d2fixed: We print 9-digit blocks.) @@ -173,7 +189,7 @@ __device__ inline int32_t pow5bits(int32_t const e) // than 2^9297. assert(e >= 0); assert(e <= 3528); - return (int32_t)(((((uint32_t)e) * 1217359) >> 19) + 1); + return static_cast((((static_cast(e)) * 1217359) >> 19) + 1); } // Returns floor(log_10(2^e)); requires 0 <= e <= 1650. @@ -182,7 +198,7 @@ __device__ inline uint32_t log10Pow2(int32_t const e) // The first value this approximation fails for is 2^1651 which is just greater than 10^297. assert(e >= 0); assert(e <= 1650); - return (((uint32_t)e) * 78913) >> 18; + return ((static_cast(e)) * 78913) >> 18; } // Returns floor(log_10(5^e)); requires 0 <= e <= 2620. 
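The merged `decimal_length` template above replaces the separate `decimalLength9`/`decimalLength17` helpers. As a rough host-side restatement (a sketch only — the real routine is a `__device__` function, and the 17-digit precondition comes from ryu's round-trip guarantee), it behaves like:

```cpp
#include <cassert>
#include <cstdint>
#include <type_traits>

// Host-side sketch of the merged digit-count helper. 32-bit mantissas carry
// at most 9 digits (f2s), 64-bit mantissas at most 17 (d2s); the 64-bit
// ladder compiles away for uint32_t inputs via if constexpr.
template <typename T>
uint32_t decimal_length(T const v)
{
  static_assert(std::is_integral_v<T> && !std::is_signed_v<T>);
  if constexpr (sizeof(T) == sizeof(uint64_t)) {
    assert(v < 100000000000000000ULL);  // 17 digits round-trip a double
    if (v >= 10000000000000000ULL) { return 17; }
    if (v >= 1000000000000000ULL) { return 16; }
    if (v >= 100000000000000ULL) { return 15; }
    if (v >= 10000000000000ULL) { return 14; }
    if (v >= 1000000000000ULL) { return 13; }
    if (v >= 100000000000ULL) { return 12; }
    if (v >= 10000000000ULL) { return 11; }
    if (v >= 1000000000ULL) { return 10; }
  }
  // Shared tail: at this point v has at most 9 digits.
  if (v >= 100000000u) { return 9; }
  if (v >= 10000000u) { return 8; }
  if (v >= 1000000u) { return 7; }
  if (v >= 100000u) { return 6; }
  if (v >= 10000u) { return 5; }
  if (v >= 1000u) { return 4; }
  if (v >= 100u) { return 3; }
  if (v >= 10u) { return 2; }
  return 1;
}

int main()
{
  assert(decimal_length(uint32_t{794248339u}) == 9);
  assert(decimal_length(uint64_t{3232794248339ULL}) == 13);
  return 0;
}
```

Folding both ladders into one template lets the 9-digit tail be shared, which is what allows the duplicated `decimalLength17` to be deleted further down in this diff.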
@@ -191,7 +207,7 @@ __device__ inline uint32_t log10Pow5(int32_t const e) // The first value this approximation fails for is 5^2621 which is just greater than 10^1832. assert(e >= 0); assert(e <= 2620); - return (((uint32_t)e) * 732923) >> 20; + return (static_cast(e) * 732923) >> 20; } __device__ inline uint32_t pow5factor_32(uint32_t value) @@ -229,15 +245,15 @@ __device__ inline uint32_t mulShift32(uint32_t const m, uint64_t const factor, i // The casts here help MSVC to avoid calls to the __allmul library // function. - uint32_t const factorLo = (uint32_t)(factor); - uint32_t const factorHi = (uint32_t)(factor >> 32); - uint64_t const bits0 = (uint64_t)m * factorLo; - uint64_t const bits1 = (uint64_t)m * factorHi; + uint32_t const factorLo = static_cast(factor); + uint32_t const factorHi = static_cast(factor >> 32); + uint64_t const bits0 = static_cast(m) * factorLo; + uint64_t const bits1 = static_cast(m) * factorHi; uint64_t const sum = (bits0 >> 32) + bits1; uint64_t const shiftedSum = sum >> (shift - 32); assert(shiftedSum <= UINT32_MAX); - return (uint32_t)shiftedSum; + return static_cast(shiftedSum); } __device__ inline int copy_special_str(char* const result, @@ -284,29 +300,29 @@ __device__ inline uint64_t double_to_bits(double const d) __device__ inline uint64_t umul128(uint64_t const a, uint64_t const b, uint64_t* const productHi) { // The casts here help MSVC to avoid calls to the __allmul library function. - uint32_t const aLo = (uint32_t)a; - uint32_t const aHi = (uint32_t)(a >> 32); - uint32_t const bLo = (uint32_t)b; - uint32_t const bHi = (uint32_t)(b >> 32); + uint32_t const aLo = static_cast(a); + uint32_t const aHi = static_cast(a >> 32); + uint32_t const bLo = static_cast(b); + uint32_t const bHi = static_cast(b >> 32); - uint64_t const b00 = (uint64_t)aLo * bLo; - uint64_t const b01 = (uint64_t)aLo * bHi; - uint64_t const b10 = (uint64_t)aHi * bLo; - uint64_t const b11 = (uint64_t)aHi * bHi; + uint64_t const b00 = static_cast(aLo) * bLo; + uint64_t const b01 = static_cast(aLo) * bHi; + uint64_t const b10 = static_cast(aHi) * bLo; + uint64_t const b11 = static_cast(aHi) * bHi; - uint32_t const b00Lo = (uint32_t)b00; - uint32_t const b00Hi = (uint32_t)(b00 >> 32); + uint32_t const b00Lo = static_cast(b00); + uint32_t const b00Hi = static_cast(b00 >> 32); uint64_t const mid1 = b10 + b00Hi; - uint32_t const mid1Lo = (uint32_t)(mid1); - uint32_t const mid1Hi = (uint32_t)(mid1 >> 32); + uint32_t const mid1Lo = static_cast(mid1); + uint32_t const mid1Hi = static_cast(mid1 >> 32); uint64_t const mid2 = b01 + mid1Lo; - uint32_t const mid2Lo = (uint32_t)(mid2); - uint32_t const mid2Hi = (uint32_t)(mid2 >> 32); + uint32_t const mid2Lo = static_cast(mid2); + uint32_t const mid2Hi = static_cast(mid2 >> 32); uint64_t const pHi = b11 + mid1Hi + mid2Hi; - uint64_t const pLo = ((uint64_t)mid2Lo << 32) | b00Lo; + uint64_t const pLo = (static_cast(mid2Lo) << 32) | b00Lo; *productHi = pHi; return pLo; @@ -461,42 +477,16 @@ __device__ inline uint32_t mulPow5divPow2(uint32_t const m, uint32_t const i, in //===== d2s.c and f2s.c from ryu ===== -__device__ inline uint32_t decimalLength17(uint64_t const v) -{ - // This is slightly faster than a loop. - // The average output length is 16.38 digits, so we check high-to-low. - // Function precondition: v is not an 18, 19, or 20-digit number. - // (17 digits are sufficient for round-tripping.) 
- assert(v < 100000000000000000L); - if (v >= 10000000000000000L) { return 17; } - if (v >= 1000000000000000L) { return 16; } - if (v >= 100000000000000L) { return 15; } - if (v >= 10000000000000L) { return 14; } - if (v >= 1000000000000L) { return 13; } - if (v >= 100000000000L) { return 12; } - if (v >= 10000000000L) { return 11; } - if (v >= 1000000000L) { return 10; } - if (v >= 100000000L) { return 9; } - if (v >= 10000000L) { return 8; } - if (v >= 1000000L) { return 7; } - if (v >= 100000L) { return 6; } - if (v >= 10000L) { return 5; } - if (v >= 1000L) { return 4; } - if (v >= 100L) { return 3; } - if (v >= 10L) { return 2; } - return 1; -} - __device__ inline floating_decimal_64 d2d(uint64_t const ieeeMantissa, uint32_t const ieeeExponent) { int32_t e2; uint64_t m2; if (ieeeExponent == 0) { // We subtract 2 so that the bounds computation has 2 additional bits. - e2 = 1 - DOUBLE_BIAS - DOUBLE_MANTISSA_BITS - 2; + e2 = static_cast(1 - DOUBLE_BIAS - DOUBLE_MANTISSA_BITS - 2); m2 = ieeeMantissa; } else { - e2 = (int32_t)ieeeExponent - DOUBLE_BIAS - DOUBLE_MANTISSA_BITS - 2; + e2 = static_cast(ieeeExponent) - DOUBLE_BIAS - DOUBLE_MANTISSA_BITS - 2; m2 = (1ull << DOUBLE_MANTISSA_BITS) | ieeeMantissa; } bool const even = (m2 & 1) == 0; @@ -519,9 +509,9 @@ __device__ inline floating_decimal_64 d2d(uint64_t const ieeeMantissa, uint32_t // I tried special-casing q == 0, but there was no effect on performance. // This expression is slightly faster than max(0, log10Pow2(e2) - 1). uint32_t const q = log10Pow2(e2) - (e2 > 3); - e10 = (int32_t)q; - int32_t const k = DOUBLE_POW5_INV_BITCOUNT + pow5bits((int32_t)q) - 1; - int32_t const i = -e2 + (int32_t)q + k; + e10 = static_cast(q); + int32_t const k = DOUBLE_POW5_INV_BITCOUNT + pow5bits(static_cast(q)) - 1; + int32_t const i = -e2 + static_cast(q) + k; uint64_t pow5[2]; double_computeInvPow5(q, pow5); vr = mulShiftAll64(m2, pow5, i, &vp, &vm, mmShift); @@ -530,7 +520,7 @@ __device__ inline floating_decimal_64 d2d(uint64_t const ieeeMantissa, uint32_t // This should use q <= 22, but I think 21 is also safe. Smaller values // may still be safe, but it's more difficult to reason about them. // Only one of mp, mv, and mm can be a multiple of 5, if any. - uint32_t const mvMod5 = ((uint32_t)mv) - 5 * ((uint32_t)div5(mv)); + uint32_t const mvMod5 = (static_cast(mv)) - 5 * (static_cast(div5(mv))); if (mvMod5 == 0) { vrIsTrailingZeros = multipleOfPowerOf5(mv, q); } else if (acceptBounds) { @@ -546,10 +536,10 @@ __device__ inline floating_decimal_64 d2d(uint64_t const ieeeMantissa, uint32_t } else { // This expression is slightly faster than max(0, log10Pow5(-e2) - 1). 
uint32_t const q = log10Pow5(-e2) - (-e2 > 1); - e10 = (int32_t)q + e2; - int32_t const i = -e2 - (int32_t)q; + e10 = static_cast(q) + e2; + int32_t const i = -e2 - static_cast(q); int32_t const k = pow5bits(i) - DOUBLE_POW5_BITCOUNT; - int32_t const j = (int32_t)q - k; + int32_t const j = static_cast(q) - k; uint64_t pow5[2]; double_computePow5(i, pow5); @@ -586,12 +576,12 @@ __device__ inline floating_decimal_64 d2d(uint64_t const ieeeMantissa, uint32_t uint64_t const vpDiv10 = div10(vp); uint64_t const vmDiv10 = div10(vm); if (vpDiv10 <= vmDiv10) { break; } - uint32_t const vmMod10 = ((uint32_t)vm) - 10 * ((uint32_t)vmDiv10); + uint32_t const vmMod10 = (static_cast(vm)) - 10 * (static_cast(vmDiv10)); uint64_t const vrDiv10 = div10(vr); - uint32_t const vrMod10 = ((uint32_t)vr) - 10 * ((uint32_t)vrDiv10); + uint32_t const vrMod10 = (static_cast(vr)) - 10 * (static_cast(vrDiv10)); vmIsTrailingZeros &= vmMod10 == 0; vrIsTrailingZeros &= lastRemovedDigit == 0; - lastRemovedDigit = (uint8_t)vrMod10; + lastRemovedDigit = static_cast(vrMod10); vr = vrDiv10; vp = vpDiv10; vm = vmDiv10; @@ -601,13 +591,15 @@ __device__ inline floating_decimal_64 d2d(uint64_t const ieeeMantissa, uint32_t if (vmIsTrailingZeros) { for (;;) { uint64_t const vmDiv10 = div10(vm); - uint32_t const vmMod10 = ((uint32_t)vm) - 10 * ((uint32_t)vmDiv10); + uint32_t const vmMod10 = + (static_cast(vm)) - 10 * (static_cast(vmDiv10)); if (vmMod10 != 0) { break; } uint64_t const vpDiv10 = div10(vp); uint64_t const vrDiv10 = div10(vr); - uint32_t const vrMod10 = ((uint32_t)vr) - 10 * ((uint32_t)vrDiv10); + uint32_t const vrMod10 = + (static_cast(vr)) - 10 * (static_cast(vrDiv10)); vrIsTrailingZeros &= lastRemovedDigit == 0; - lastRemovedDigit = (uint8_t)vrMod10; + lastRemovedDigit = static_cast(vrMod10); vr = vrDiv10; vp = vpDiv10; vm = vmDiv10; @@ -628,11 +620,12 @@ __device__ inline floating_decimal_64 d2d(uint64_t const ieeeMantissa, uint32_t uint64_t const vmDiv100 = div100(vm); if (vpDiv100 > vmDiv100) { // Optimization: remove two digits at a time (~86.2%). uint64_t const vrDiv100 = div100(vr); - uint32_t const vrMod100 = ((uint32_t)vr) - 100 * ((uint32_t)vrDiv100); - roundUp = vrMod100 >= 50; - vr = vrDiv100; - vp = vpDiv100; - vm = vmDiv100; + uint32_t const vrMod100 = + (static_cast(vr)) - 100 * (static_cast(vrDiv100)); + roundUp = vrMod100 >= 50; + vr = vrDiv100; + vp = vpDiv100; + vm = vmDiv100; removed += 2; } // Loop iterations below (approximately), without optimization above: @@ -644,7 +637,7 @@ __device__ inline floating_decimal_64 d2d(uint64_t const ieeeMantissa, uint32_t uint64_t const vmDiv10 = div10(vm); if (vpDiv10 <= vmDiv10) { break; } uint64_t const vrDiv10 = div10(vr); - uint32_t const vrMod10 = ((uint32_t)vr) - 10 * ((uint32_t)vrDiv10); + uint32_t const vrMod10 = (static_cast(vr)) - 10 * (static_cast(vrDiv10)); roundUp = vrMod10 >= 5; vr = vrDiv10; vp = vpDiv10; @@ -669,10 +662,10 @@ __device__ inline floating_decimal_32 f2d(uint32_t const ieeeMantissa, uint32_t uint32_t m2; if (ieeeExponent == 0) { // We subtract 2 so that the bounds computation has 2 additional bits. 
- e2 = 1 - FLOAT_BIAS - FLOAT_MANTISSA_BITS - 2; + e2 = static_cast(1 - FLOAT_BIAS - FLOAT_MANTISSA_BITS - 2); m2 = ieeeMantissa; } else { - e2 = (int32_t)ieeeExponent - FLOAT_BIAS - FLOAT_MANTISSA_BITS - 2; + e2 = static_cast(ieeeExponent) - FLOAT_BIAS - FLOAT_MANTISSA_BITS - 2; m2 = (1u << FLOAT_MANTISSA_BITS) | ieeeMantissa; } bool const even = (m2 & 1) == 0; @@ -693,9 +686,9 @@ __device__ inline floating_decimal_32 f2d(uint32_t const ieeeMantissa, uint32_t uint8_t lastRemovedDigit = 0; if (e2 >= 0) { uint32_t const q = log10Pow2(e2); - e10 = (int32_t)q; - int32_t const k = FLOAT_POW5_INV_BITCOUNT + pow5bits((int32_t)q) - 1; - int32_t const i = -e2 + (int32_t)q + k; + e10 = static_cast(q); + int32_t const k = FLOAT_POW5_INV_BITCOUNT + pow5bits(static_cast(q)) - 1; + int32_t const i = -e2 + static_cast(q) + k; vr = mulPow5InvDivPow2(mv, q, i); vp = mulPow5InvDivPow2(mp, q, i); vm = mulPow5InvDivPow2(mm, q, i); @@ -703,8 +696,9 @@ __device__ inline floating_decimal_32 f2d(uint32_t const ieeeMantissa, uint32_t // We need to know one removed digit even if we are not going to loop below. We could use // q = X - 1 above, except that would require 33 bits for the result, and we've found that // 32-bit arithmetic is faster even on 64-bit machines. - int32_t const l = FLOAT_POW5_INV_BITCOUNT + pow5bits((int32_t)(q - 1)) - 1; - lastRemovedDigit = (uint8_t)(mulPow5InvDivPow2(mv, q - 1, -e2 + (int32_t)q - 1 + l) % 10); + int32_t const l = FLOAT_POW5_INV_BITCOUNT + pow5bits(static_cast(q - 1)) - 1; + lastRemovedDigit = static_cast( + mulPow5InvDivPow2(mv, q - 1, -e2 + static_cast(q) - 1 + l) % 10); } if (q <= 9) { // The largest power of 5 that fits in 24 bits is 5^10, but q <= 9 seems to be safe as well. @@ -719,16 +713,17 @@ __device__ inline floating_decimal_32 f2d(uint32_t const ieeeMantissa, uint32_t } } else { uint32_t const q = log10Pow5(-e2); - e10 = (int32_t)q + e2; - int32_t const i = -e2 - (int32_t)q; + e10 = static_cast(q) + e2; + int32_t const i = -e2 - static_cast(q); int32_t const k = pow5bits(i) - FLOAT_POW5_BITCOUNT; - int32_t j = (int32_t)q - k; - vr = mulPow5divPow2(mv, (uint32_t)i, j); - vp = mulPow5divPow2(mp, (uint32_t)i, j); - vm = mulPow5divPow2(mm, (uint32_t)i, j); + int32_t j = static_cast(q) - k; + vr = mulPow5divPow2(mv, static_cast(i), j); + vp = mulPow5divPow2(mp, static_cast(i), j); + vm = mulPow5divPow2(mm, static_cast(i), j); if (q != 0 && (vp - 1) / 10 <= vm / 10) { - j = (int32_t)q - 1 - (pow5bits(i + 1) - FLOAT_POW5_BITCOUNT); - lastRemovedDigit = (uint8_t)(mulPow5divPow2(mv, (uint32_t)(i + 1), j) % 10); + j = static_cast(q) - 1 - (pow5bits(i + 1) - FLOAT_POW5_BITCOUNT); + lastRemovedDigit = + static_cast(mulPow5divPow2(mv, static_cast(i + 1), j) % 10); } if (q <= 1) { // {vr,vp,vm} is trailing zeros if {mv,mp,mm} has at least q trailing 0 bits. 
@@ -754,7 +749,7 @@ __device__ inline floating_decimal_32 f2d(uint32_t const ieeeMantissa, uint32_t while (vp / 10 > vm / 10) { vmIsTrailingZeros &= vm % 10 == 0; vrIsTrailingZeros &= lastRemovedDigit == 0; - lastRemovedDigit = (uint8_t)(vr % 10); + lastRemovedDigit = static_cast(vr % 10); vr /= 10; vp /= 10; vm /= 10; @@ -763,7 +758,7 @@ __device__ inline floating_decimal_32 f2d(uint32_t const ieeeMantissa, uint32_t if (vmIsTrailingZeros) { while (vm % 10 == 0) { vrIsTrailingZeros &= lastRemovedDigit == 0; - lastRemovedDigit = (uint8_t)(vr % 10); + lastRemovedDigit = static_cast(vr % 10); vr /= 10; vp /= 10; vm /= 10; @@ -781,7 +776,7 @@ __device__ inline floating_decimal_32 f2d(uint32_t const ieeeMantissa, uint32_t // Loop iterations below (approximately): // 0: 13.6%, 1: 70.7%, 2: 14.1%, 3: 1.39%, 4: 0.14%, 5+: 0.01% while (vp / 10 > vm / 10) { - lastRemovedDigit = (uint8_t)(vr % 10); + lastRemovedDigit = static_cast(vr % 10); vr /= 10; vp /= 10; vm /= 10; @@ -805,8 +800,8 @@ __device__ inline int to_chars(floating_decimal_64 const v, bool const sign, cha if (sign) { result[index++] = '-'; } uint64_t output = v.mantissa; - uint32_t const olength = decimalLength17(output); - int32_t exp = v.exponent + (int32_t)olength - 1; + uint32_t const olength = decimal_length(output); + int32_t exp = v.exponent + static_cast(olength) - 1; bool scientificNotation = (exp < -3) || (exp >= 7); // Values in the interval [1E-3, 1E7) are special. @@ -885,8 +880,8 @@ __device__ inline int d2s_size(floating_decimal_64 const v, bool const sign) if (sign) { index++; } uint64_t output = v.mantissa; - uint32_t const olength = decimalLength17(output); - int32_t exp = v.exponent + (int32_t)olength - 1; + uint32_t const olength = decimal_length(output); + int32_t exp = v.exponent + static_cast(olength) - 1; bool scientificNotation = (exp < -3) || (exp >= 7); if (scientificNotation) { @@ -925,7 +920,7 @@ __device__ inline int to_chars(floating_decimal_32 const v, bool const sign, cha if (sign) { result[index++] = '-'; } uint32_t output = v.mantissa; - uint32_t const olength = decimalLength9(output); + uint32_t const olength = decimal_length(output); int32_t exp = v.exponent + olength - 1; bool scientificNotation = (exp < -3) || (exp >= 7); @@ -1000,7 +995,7 @@ __device__ inline int f2s_size(floating_decimal_32 const v, bool const sign) if (sign) { index++; } uint32_t output = v.mantissa; - uint32_t const olength = decimalLength9(output); + uint32_t const olength = decimal_length(output); int32_t exp = v.exponent + olength - 1; bool scientificNotation = (exp < -3) || (exp >= 7); @@ -1036,7 +1031,7 @@ __device__ inline bool d2d_small_int(uint64_t const ieeeMantissa, floating_decimal_64* const v) { uint64_t const m2 = (1ull << DOUBLE_MANTISSA_BITS) | ieeeMantissa; - int32_t const e2 = (int32_t)ieeeExponent - DOUBLE_BIAS - DOUBLE_MANTISSA_BITS; + int32_t const e2 = static_cast(ieeeExponent) - DOUBLE_BIAS - DOUBLE_MANTISSA_BITS; if (e2 > 0) { // f = m2 * 2^e2 >= 2^53 is an integer. @@ -1057,7 +1052,7 @@ __device__ inline bool d2d_small_int(uint64_t const ieeeMantissa, // f is an integer in the range [1, 2^53). // Note: mantissa might contain trailing (decimal) 0's. - // Note: since 2^53 < 10^16, there is no need to adjust decimalLength17(). + // Note: since 2^53 < 10^16, there is no need to adjust decimal_length(). 
v->mantissa = m2 >> -e2; v->exponent = 0; return true; @@ -1072,12 +1067,12 @@ __device__ inline floating_decimal_64 d2d(double f, bool& ieeeSign, bool& specia ieeeSign = ((bits >> (DOUBLE_MANTISSA_BITS + DOUBLE_EXPONENT_BITS)) & 1) != 0; uint64_t const ieeeMantissa = bits & ((1ull << DOUBLE_MANTISSA_BITS) - 1); uint32_t const ieeeExponent = - (uint32_t)((bits >> DOUBLE_MANTISSA_BITS) & ((1u << DOUBLE_EXPONENT_BITS) - 1)); + static_cast((bits >> DOUBLE_MANTISSA_BITS) & ((1u << DOUBLE_EXPONENT_BITS) - 1)); // Case distinction; exit early for the easy cases. if (ieeeExponent == ((1u << DOUBLE_EXPONENT_BITS) - 1u) || (ieeeExponent == 0 && ieeeMantissa == 0)) { special = true; - return floating_decimal_64{ieeeMantissa, (int32_t)ieeeExponent}; + return floating_decimal_64{ieeeMantissa, static_cast(ieeeExponent)}; } special = false; floating_decimal_64 v; @@ -1089,7 +1084,7 @@ __device__ inline floating_decimal_64 d2d(double f, bool& ieeeSign, bool& specia // trailing zeros in to_chars only if needed - once fixed-point notation output is implemented.) for (;;) { uint64_t const q = div10(v.mantissa); - uint32_t const r = ((uint32_t)v.mantissa) - 10 * ((uint32_t)q); + uint32_t const r = (static_cast(v.mantissa)) - 10 * (static_cast(q)); if (r != 0) { break; } v.mantissa = q; ++v.exponent; @@ -1122,7 +1117,7 @@ __device__ inline floating_decimal_32 f2d(float f, bool& ieeeSign, bool& special if (ieeeExponent == ((1u << FLOAT_EXPONENT_BITS) - 1u) || (ieeeExponent == 0 && ieeeMantissa == 0)) { special = true; - return floating_decimal_32{ieeeMantissa, (int32_t)ieeeExponent}; + return floating_decimal_32{ieeeMantissa, static_cast(ieeeExponent)}; } special = false; return f2d(ieeeMantissa, ieeeExponent); @@ -1212,48 +1207,57 @@ __device__ inline T round_half_even(T const input, int const olength, int const return num; } -__device__ inline int to_formated_double_chars(floating_decimal_64 const v, - bool const sign, - char* const result, - int digits) +/* + * Convert a floating_decimal_32/64 to a formatted string as the default format (#,###,###.##) + * of format_number in Spark. + * + * @param v The input floating_decimal_32/64 value + * @param sign Sign of the number + * @param result Output string + * @param digits Number of digits after decimal point + */ +template +__device__ inline int to_formatted_chars(T const v, bool const sign, char* const result, int digits) { + static_assert(std::is_same_v || std::is_same_v); + using U = std::conditional_t, uint32_t, uint64_t>; int index = 0; if (sign) { result[index++] = '-'; } - uint64_t output = v.mantissa; - const uint32_t olength = decimalLength17(output); - int32_t exp = v.exponent + (int32_t)olength - 1; + U output = v.mantissa; + uint32_t const olength = decimal_length(output); + int32_t exp = v.exponent + static_cast(olength) - 1; if (exp < 0) { // Decimal dot is before any of the digits. 
int index_for_carrier = index; result[index++] = '0'; if (digits == 0) { return index; } - result[index++] = '.'; - int actural_round = digits; + result[index++] = '.'; + int actual_round = digits; for (int i = -1; i > exp; i--) { index_for_carrier = index; result[index++] = '0'; - actural_round--; - if (actural_round == 0) { + actual_round--; + if (actual_round == 0) { if (i != exp + 1) { return index; } // else, possible carry break; } } - int actural_olength = fmin(int(olength), actural_round); - uint64_t rounded_output = round_half_even(output, olength, actural_round); + int actual_olength = fmin(int(olength), actual_round); + U rounded_output = round_half_even(output, olength, actual_round); // check if carry - if (rounded_output >= POW10_TABLE[actural_olength]) { + if (rounded_output >= POW10_TABLE[actual_olength]) { result[index_for_carrier] = '1'; - rounded_output -= POW10_TABLE[actural_olength]; + rounded_output -= POW10_TABLE[actual_olength]; } int current = index; - for (int i = 0; i < actural_olength; i++) { - result[current + actural_olength - i - 1] = (char)('0' + rounded_output % 10); + for (int i = 0; i < actual_olength; i++) { + result[current + actual_olength - i - 1] = (char)('0' + rounded_output % 10); rounded_output /= 10; index++; } - actural_round -= actural_olength; - if (actural_round > 0) { - for (int i = 0; i < actural_round; i++) { + actual_round -= actual_olength; + if (actual_round > 0) { + for (int i = 0; i < actual_round; i++) { result[index++] = '0'; } } @@ -1292,12 +1296,12 @@ __device__ inline int to_formated_double_chars(floating_decimal_64 const v, temp_d = olength - exp - 1; tailing_zero = digits - temp_d; } - uint64_t rounded_output = round_half_even(output, olength, exp + temp_d + 1); - uint64_t pow10 = POW10_TABLE[temp_d]; - uint64_t integer = rounded_output / pow10; - uint64_t decimal = rounded_output % pow10; + U rounded_output = round_half_even(output, olength, exp + temp_d + 1); + U pow10 = POW10_TABLE[temp_d]; + U integer = rounded_output / pow10; + U decimal = rounded_output % pow10; // calculate integer length after format to cover carry case - uint32_t integer_len = decimalLength17(integer); + uint32_t integer_len = decimal_length(integer); uint32_t formated_integer_len = index + integer_len + (integer_len - 1) / 3; uint32_t sep_cnt = 0; int rev_index = 0; @@ -1327,151 +1331,16 @@ __device__ inline int to_formated_double_chars(floating_decimal_64 const v, return index; } -__device__ inline int format_double_size(floating_decimal_64 const v, bool const sign, int digits) -{ - int index = 0; - if (sign) { index++; } - uint64_t output = v.mantissa; - const uint32_t olength = decimalLength17(output); - int32_t exp = v.exponent + (int32_t)olength - 1; - if (exp < 0) { - index += 2 + digits; - } else if (exp + 1 >= olength) { - index += exp + 1 + exp / 3 + 1 + digits; - } else { - uint32_t temp_d = digits; - if (exp + digits + 1 > olength) { temp_d = olength - exp - 1; } - uint64_t rounded_output = round_half_even(output, olength, exp + temp_d + 1); - uint64_t pow10 = POW10_TABLE[temp_d]; - uint64_t integer = rounded_output / pow10; - uint32_t integer_len = decimalLength17(integer); - index += integer_len + (integer_len - 1) / 3 + 1 + digits; - } - if (digits == 0) { index--; } - return index; -} - -__device__ inline int to_formated_float_chars(floating_decimal_32 const v, - bool const sign, - char* const result, - int digits) -{ - int index = 0; - if (sign) { result[index++] = '-'; } - uint32_t output = v.mantissa; - uint32_t const olength = 
decimalLength9(output); - int32_t exp = v.exponent + (int32_t)olength - 1; - if (exp < 0) { - // Decimal dot is before any of the digits. - int index_for_carrier = index; - result[index++] = '0'; - if (digits == 0) { return index; } - result[index++] = '.'; - int actural_round = digits; - for (int i = -1; i > exp; i--) { - index_for_carrier = index; - result[index++] = '0'; - actural_round--; - if (actural_round == 0) { - if (i != exp + 1) { return index; } // else, possible carry - break; - } - } - int actural_olength = fmin(int(olength), actural_round); - uint64_t rounded_output = round_half_even(output, olength, actural_round); - // check if carry - if (rounded_output >= POW10_TABLE[actural_olength]) { - result[index_for_carrier] = '1'; - rounded_output -= POW10_TABLE[actural_olength]; - } - int current = index; - for (int i = 0; i < actural_olength; i++) { - result[current + actural_olength - i - 1] = (char)('0' + rounded_output % 10); - rounded_output /= 10; - index++; - } - actural_round -= actural_olength; - if (actural_round > 0) { - for (int i = 0; i < actural_round; i++) { - result[index++] = '0'; - } - } - } else if (exp + 1 >= olength) { - // Decimal dot is after any of the digits. - int integer_len = index + exp + 1 + exp / 3; - int sep_cnt = 0; - int rev_index = 0; - for (int i = olength; i < exp + 1; i++) { - result[integer_len - (rev_index++) - 1] = '0'; - sep_cnt++; - if (sep_cnt == 3) { - result[integer_len - (rev_index++) - 1] = ','; - sep_cnt = 0; - } - } - for (int i = 0; i < olength; i++) { - if (sep_cnt == 3) { - result[integer_len - (rev_index++) - 1] = ','; - sep_cnt = 0; - } - result[integer_len - (rev_index++) - 1] = (char)('0' + output % 10); - sep_cnt++; - output /= 10; - } - index = integer_len; - if (digits == 0) { return index; } - result[index++] = '.'; - for (int i = 0; i < digits; i++) { - result[index++] = '0'; - } - } else { - uint32_t temp_d = digits, tailing_zero = 0; - if (exp + digits + 1 > olength) { - temp_d = olength - exp - 1; - tailing_zero = digits - temp_d; - } - uint32_t rounded_output = round_half_even(output, olength, exp + temp_d + 1); - uint32_t pow10 = POW10_TABLE[temp_d]; - uint32_t integer = rounded_output / pow10; - uint32_t decimal = rounded_output % pow10; - // calculate integer length after format to cover carry case - uint32_t integer_len = decimalLength9(integer); - uint32_t formated_integer_len = index + integer_len + (integer_len - 1) / 3; - uint32_t sep_cnt = 0; - int rev_index = 0; - for (int i = 0; i < integer_len; i++) { - if (sep_cnt == 3) { - result[formated_integer_len - (rev_index++) - 1] = ','; - sep_cnt = 0; - } - result[formated_integer_len - (rev_index++) - 1] = (char)('0' + integer % 10); - sep_cnt++; - integer /= 10; - } - index = formated_integer_len; - if (digits == 0) { return index; } - result[index++] = '.'; - int current = index; - for (int i = 0; i < tailing_zero; i++) { - result[current + digits - i - 1] = '0'; - index++; - } - for (int i = tailing_zero; i < digits; i++) { - result[current + digits - i - 1] = (char)('0' + decimal % 10); - decimal /= 10; - index++; - } - } - return index; -} - -__device__ inline int format_float_size(floating_decimal_32 const v, bool const sign, int digits) +template +__device__ inline int format_size(T const v, bool const sign, int digits) { + static_assert(std::is_same_v || std::is_same_v); + using U = std::conditional_t, uint32_t, uint64_t>; int index = 0; if (sign) { index++; } - uint64_t output = v.mantissa; - uint32_t const olength = decimalLength9(output); - int32_t 
exp = v.exponent + (int32_t)olength - 1; + U output = v.mantissa; + uint32_t const olength = decimal_length(output); + int32_t exp = v.exponent + static_cast(olength) - 1; if (exp < 0) { index += 2 + digits; } else if (exp + 1 >= olength) { @@ -1479,10 +1348,10 @@ __device__ inline int format_float_size(floating_decimal_32 const v, bool const } else { uint32_t temp_d = digits; if (exp + digits + 1 > olength) { temp_d = olength - exp - 1; } - uint64_t rounded_output = round_half_even(output, olength, exp + temp_d + 1); - uint64_t pow10 = POW10_TABLE[temp_d]; - uint64_t integer = rounded_output / pow10; - uint32_t integer_len = decimalLength9(integer); + U rounded_output = round_half_even(output, olength, exp + temp_d + 1); + U pow10 = POW10_TABLE[temp_d]; + U integer = rounded_output / pow10; + uint32_t integer_len = decimal_length(integer); index += integer_len + (integer_len - 1) / 3 + 1 + digits; } if (digits == 0) { index--; } @@ -1533,11 +1402,11 @@ __device__ inline int compute_format_float_size(double value, int digits, bool i if (is_float) { floating_decimal_32 v = f2d(value, sign, special); if (special) { return special_format_str_size(sign, v.exponent, v.mantissa, digits); } - return format_float_size(v, sign, digits); + return format_size(v, sign, digits); } else { floating_decimal_64 v = d2d(value, sign, special); if (special) { return special_format_str_size(sign, v.exponent, v.mantissa, digits); } - return format_double_size(v, sign, digits); + return format_size(v, sign, digits); } } @@ -1547,11 +1416,11 @@ __device__ inline int format_float(double value, int digits, bool is_float, char if (is_float) { floating_decimal_32 v = f2d(value, sign, special); if (special) { return copy_format_special_str(output, sign, v.exponent, v.mantissa, digits); } - return to_formated_float_chars(v, sign, output, digits); + return to_formatted_chars(v, sign, output, digits); } else { floating_decimal_64 v = d2d(value, sign, special); if (special) { return copy_format_special_str(output, sign, v.exponent, v.mantissa, digits); } - return to_formated_double_chars(v, sign, output, digits); + return to_formatted_chars(v, sign, output, digits); } } From f42872a4ad3443605030f7bc8f07f7aa9f7451da Mon Sep 17 00:00:00 2001 From: Tim Liu Date: Tue, 9 Jan 2024 10:06:09 +0800 Subject: [PATCH 084/127] Download boost package for CI jobs (#1672) * Download boost package for CI jobs To fix: https://github.com/NVIDIA/spark-rapids-jni/issues/1671 The old Boost package is not available to build CI Docker container. [boost_1_79_0.tar.gz] (https://github.com/NVIDIA/spark-rapids-jni/blob/branch-24.02/ci/Dockerfile#L64) not available. Update the correct Boost linkage for CI scripts, to PASS the spark-rapids-jni nightly build/test CI jobs. Signed-off-by: Tim Liu * Update copyright Signed-off-by: Tim Liu --------- Signed-off-by: Tim Liu --- ci/Dockerfile | 4 ++-- ci/Dockerfile.multi | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/ci/Dockerfile b/ci/Dockerfile index 7d59fef5f5..e3b703a11e 100755 --- a/ci/Dockerfile +++ b/ci/Dockerfile @@ -1,5 +1,5 @@ # -# Copyright (c) 2022-2023, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2022-2024, NVIDIA CORPORATION. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
@@ -61,7 +61,7 @@ RUN cd /tmp && wget --quiet https://github.com/ccache/ccache/releases/download/v rm -rf ccache-${CCACHE_VERSION} ## install a version of boost that is needed for arrow/parquet to work -RUN cd /usr/local && wget --quiet https://boostorg.jfrog.io/artifactory/main/release/1.79.0/source/boost_1_79_0.tar.gz && \ +RUN cd /usr/local && wget --quiet https://archives.boost.io/release/1.79.0/source/boost_1_79_0.tar.gz && \ tar -xzf boost_1_79_0.tar.gz && \ rm boost_1_79_0.tar.gz && \ cd boost_1_79_0 && \ diff --git a/ci/Dockerfile.multi b/ci/Dockerfile.multi index 720c9bc4df..d3b198530b 100644 --- a/ci/Dockerfile.multi +++ b/ci/Dockerfile.multi @@ -1,5 +1,5 @@ # -# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2023-2024, NVIDIA CORPORATION. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -63,7 +63,7 @@ RUN cd /tmp && wget --quiet https://github.com/ccache/ccache/releases/download/v rm -rf ccache-${CCACHE_VERSION} ## install a version of boost that is needed for arrow/parquet to work -RUN cd /usr/local && wget --quiet https://boostorg.jfrog.io/artifactory/main/release/1.79.0/source/boost_1_79_0.tar.gz && \ +RUN cd /usr/local && wget --quiet https://archives.boost.io/release/1.79.0/source/boost_1_79_0.tar.gz && \ tar -xzf boost_1_79_0.tar.gz && \ rm boost_1_79_0.tar.gz && \ cd boost_1_79_0 && \ From 4a27fdb7e57a405d5f48a023eb529d0e8dad5594 Mon Sep 17 00:00:00 2001 From: Jenkins Automation <70000568+nvauto@users.noreply.github.com> Date: Tue, 9 Jan 2024 11:31:18 +0800 Subject: [PATCH 085/127] Update submodule cudf to 3a1601d61437b339c47a015dab7a830998b182f9 (#1685) Signed-off-by: spark-rapids automation <70000568+nvauto@users.noreply.github.com> --- thirdparty/cudf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/thirdparty/cudf b/thirdparty/cudf index ba7550a17f..3a1601d614 160000 --- a/thirdparty/cudf +++ b/thirdparty/cudf @@ -1 +1 @@ -Subproject commit ba7550a17f57d17d6d6decec3b2f8a0a5f687aa8 +Subproject commit 3a1601d61437b339c47a015dab7a830998b182f9 From f6972922986eb5f0f23b3456fb3e8a39349709c3 Mon Sep 17 00:00:00 2001 From: Jenkins Automation <70000568+nvauto@users.noreply.github.com> Date: Wed, 10 Jan 2024 11:30:59 +0800 Subject: [PATCH 086/127] Update submodule cudf to 6a23775db29dc4b38820994297c94201c9287aaf (#1688) Signed-off-by: spark-rapids automation <70000568+nvauto@users.noreply.github.com> --- thirdparty/cudf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/thirdparty/cudf b/thirdparty/cudf index 3a1601d614..6a23775db2 160000 --- a/thirdparty/cudf +++ b/thirdparty/cudf @@ -1 +1 @@ -Subproject commit 3a1601d61437b339c47a015dab7a830998b182f9 +Subproject commit 6a23775db29dc4b38820994297c94201c9287aaf From 035dbd669d428a19c2b57b30e2ba3cb8064e909a Mon Sep 17 00:00:00 2001 From: Jenkins Automation <70000568+nvauto@users.noreply.github.com> Date: Thu, 11 Jan 2024 06:02:54 +0800 Subject: [PATCH 087/127] Update submodule cudf to fa37e13db360e0b685bc6af020aa7510f1fbbdbd (#1691) Signed-off-by: spark-rapids automation <70000568+nvauto@users.noreply.github.com> --- thirdparty/cudf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/thirdparty/cudf b/thirdparty/cudf index 6a23775db2..fa37e13db3 160000 --- a/thirdparty/cudf +++ b/thirdparty/cudf @@ -1 +1 @@ -Subproject commit 6a23775db29dc4b38820994297c94201c9287aaf +Subproject commit fa37e13db360e0b685bc6af020aa7510f1fbbdbd From 
681391435d43ffb82dfafed4bd433e4d38aa1c5b Mon Sep 17 00:00:00 2001 From: Jenkins Automation <70000568+nvauto@users.noreply.github.com> Date: Thu, 11 Jan 2024 11:25:10 +0800 Subject: [PATCH 088/127] Update submodule cudf to 1078326535c9989a2e904d78ceb708a097be989b (#1693) Signed-off-by: spark-rapids automation <70000568+nvauto@users.noreply.github.com> --- thirdparty/cudf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/thirdparty/cudf b/thirdparty/cudf index fa37e13db3..1078326535 160000 --- a/thirdparty/cudf +++ b/thirdparty/cudf @@ -1 +1 @@ -Subproject commit fa37e13db360e0b685bc6af020aa7510f1fbbdbd +Subproject commit 1078326535c9989a2e904d78ceb708a097be989b From ed7c814f14bd1f2dba6351c66da0e220cde2ea79 Mon Sep 17 00:00:00 2001 From: Jason Lowe Date: Thu, 11 Jan 2024 10:29:38 -0600 Subject: [PATCH 089/127] Add explicit mention of git submodule use at top of CONTRIBUTING.md (#1690) Signed-off-by: Jason Lowe --- CONTRIBUTING.md | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index e8f64a9ae9..5fb76d548e 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -28,6 +28,17 @@ There are two types of branches in this repository: is held here. `main` will change with new releases, but otherwise it should not change with every pull request merged, making it a more stable branch. +## Git Submodules + +This repository uses Git submodules. After cloning this repository or moving to a new commit +in this repository you will need to ensure the submodules are initialized and updated to the +expected submodule commits. This can be done by executing the following command at the top of +the repository: + +```commandline +git submodule update --init --recursive +``` + ## Building From Source [Maven](https://maven.apache.org) is used for most aspects of the build. 
For example, the From dd00ca2d9304388e668dba8429f05b076dbbf7b0 Mon Sep 17 00:00:00 2001 From: Jenkins Automation <70000568+nvauto@users.noreply.github.com> Date: Fri, 12 Jan 2024 05:26:18 +0800 Subject: [PATCH 090/127] Update submodule cudf to 85acdc640701940e47b3969b14a811f33e7faf5b (#1694) Signed-off-by: spark-rapids automation <70000568+nvauto@users.noreply.github.com> --- thirdparty/cudf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/thirdparty/cudf b/thirdparty/cudf index 1078326535..85acdc6407 160000 --- a/thirdparty/cudf +++ b/thirdparty/cudf @@ -1 +1 @@ -Subproject commit 1078326535c9989a2e904d78ceb708a097be989b +Subproject commit 85acdc640701940e47b3969b14a811f33e7faf5b From a525bafbdac7bbcc4d9e49bcd6b2aa4a7c70a3dc Mon Sep 17 00:00:00 2001 From: Jenkins Automation <70000568+nvauto@users.noreply.github.com> Date: Fri, 12 Jan 2024 12:06:39 +0800 Subject: [PATCH 091/127] Update submodule cudf to 7a42b8b57923b9515391cfe2c4668380b15ed118 (#1695) Signed-off-by: spark-rapids automation <70000568+nvauto@users.noreply.github.com> --- thirdparty/cudf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/thirdparty/cudf b/thirdparty/cudf index 85acdc6407..7a42b8b579 160000 --- a/thirdparty/cudf +++ b/thirdparty/cudf @@ -1 +1 @@ -Subproject commit 85acdc640701940e47b3969b14a811f33e7faf5b +Subproject commit 7a42b8b57923b9515391cfe2c4668380b15ed118 From 3187eebd684a6efa5924f5667791a96d6d1f4012 Mon Sep 17 00:00:00 2001 From: Jenkins Automation <70000568+nvauto@users.noreply.github.com> Date: Fri, 12 Jan 2024 17:25:48 +0800 Subject: [PATCH 092/127] Update submodule cudf to 27b106f832999afa5b3353aaa2adcdb695fb4a47 (#1696) Signed-off-by: spark-rapids automation <70000568+nvauto@users.noreply.github.com> --- thirdparty/cudf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/thirdparty/cudf b/thirdparty/cudf index 7a42b8b579..27b106f832 160000 --- a/thirdparty/cudf +++ b/thirdparty/cudf @@ -1 +1 @@ -Subproject commit 7a42b8b57923b9515391cfe2c4668380b15ed118 +Subproject commit 27b106f832999afa5b3353aaa2adcdb695fb4a47 From 922bca294d8ebd52560197d3c4d6fc8c6ae5e305 Mon Sep 17 00:00:00 2001 From: Jenkins Automation <70000568+nvauto@users.noreply.github.com> Date: Fri, 12 Jan 2024 21:30:37 +0800 Subject: [PATCH 093/127] Update submodule cudf to 5c78b7ea6b75f503d5df4abc828d80a0b470a284 (#1697) Signed-off-by: spark-rapids automation <70000568+nvauto@users.noreply.github.com> --- thirdparty/cudf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/thirdparty/cudf b/thirdparty/cudf index 27b106f832..5c78b7ea6b 160000 --- a/thirdparty/cudf +++ b/thirdparty/cudf @@ -1 +1 @@ -Subproject commit 27b106f832999afa5b3353aaa2adcdb695fb4a47 +Subproject commit 5c78b7ea6b75f503d5df4abc828d80a0b470a284 From d9d87a3464d343ef72751c8fec9d49d3cdeef9d4 Mon Sep 17 00:00:00 2001 From: Jenkins Automation <70000568+nvauto@users.noreply.github.com> Date: Sat, 13 Jan 2024 05:25:08 +0800 Subject: [PATCH 094/127] Update submodule cudf to 7ca988f207730a3ae936e90d0104c4e6a14749ff (#1698) Signed-off-by: spark-rapids automation <70000568+nvauto@users.noreply.github.com> --- thirdparty/cudf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/thirdparty/cudf b/thirdparty/cudf index 5c78b7ea6b..7ca988f207 160000 --- a/thirdparty/cudf +++ b/thirdparty/cudf @@ -1 +1 @@ -Subproject commit 5c78b7ea6b75f503d5df4abc828d80a0b470a284 +Subproject commit 7ca988f207730a3ae936e90d0104c4e6a14749ff From e264d32b3acfb5005b7e0240d704321b879f71a1 Mon Sep 17 00:00:00 2001 From: Jenkins 
Automation <70000568+nvauto@users.noreply.github.com> Date: Mon, 15 Jan 2024 21:29:37 +0800 Subject: [PATCH 095/127] Update submodule cudf to 07103355fea0fb3fd0e1115019bbac7d65bb132f (#1699) Signed-off-by: spark-rapids automation <70000568+nvauto@users.noreply.github.com> --- thirdparty/cudf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/thirdparty/cudf b/thirdparty/cudf index 7ca988f207..07103355fe 160000 --- a/thirdparty/cudf +++ b/thirdparty/cudf @@ -1 +1 @@ -Subproject commit 7ca988f207730a3ae936e90d0104c4e6a14749ff +Subproject commit 07103355fea0fb3fd0e1115019bbac7d65bb132f From e5c9657b2e216ff0d63b297c27710a5439544b7f Mon Sep 17 00:00:00 2001 From: Jenkins Automation <70000568+nvauto@users.noreply.github.com> Date: Tue, 16 Jan 2024 23:29:56 +0800 Subject: [PATCH 096/127] Update submodule cudf to 726a7f30757d1a06d74d86bb82cf311cb159f7fd (#1701) Signed-off-by: spark-rapids automation <70000568+nvauto@users.noreply.github.com> --- thirdparty/cudf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/thirdparty/cudf b/thirdparty/cudf index 07103355fe..726a7f3075 160000 --- a/thirdparty/cudf +++ b/thirdparty/cudf @@ -1 +1 @@ -Subproject commit 07103355fea0fb3fd0e1115019bbac7d65bb132f +Subproject commit 726a7f30757d1a06d74d86bb82cf311cb159f7fd From c3be4f471623c6e77bf5ec50936c8a069d344c44 Mon Sep 17 00:00:00 2001 From: Chong Gao Date: Wed, 17 Jan 2024 10:28:28 +0800 Subject: [PATCH 097/127] Fix memory leak in time zone DB (#1689) * Fix memory leak in time zone DB Signed-off-by: Chong Gao * Fix bug * Address comments * Refine the sync * Fix compile error --------- Signed-off-by: Chong Gao Co-authored-by: Chong Gao --- .../spark/rapids/jni/GpuTimeZoneDB.java | 417 ++++++++++-------- 1 file changed, 236 insertions(+), 181 deletions(-) diff --git a/src/main/java/com/nvidia/spark/rapids/jni/GpuTimeZoneDB.java b/src/main/java/com/nvidia/spark/rapids/jni/GpuTimeZoneDB.java index b63a9dc282..643db278df 100644 --- a/src/main/java/com/nvidia/spark/rapids/jni/GpuTimeZoneDB.java +++ b/src/main/java/com/nvidia/spark/rapids/jni/GpuTimeZoneDB.java @@ -1,5 +1,5 @@ /* -* Copyright (c) 2023, NVIDIA CORPORATION. +* Copyright (c) 2023-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -16,6 +16,13 @@ package com.nvidia.spark.rapids.jni; +import ai.rapids.cudf.ColumnVector; +import ai.rapids.cudf.DType; +import ai.rapids.cudf.HostColumnVector; +import ai.rapids.cudf.Table; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + import java.time.Instant; import java.time.ZoneId; import java.time.zone.ZoneOffsetTransition; @@ -26,72 +33,171 @@ import java.util.List; import java.util.Map; import java.util.TimeZone; -import java.util.concurrent.*; - -import ai.rapids.cudf.ColumnVector; -import ai.rapids.cudf.DType; -import ai.rapids.cudf.HostColumnVector; -import ai.rapids.cudf.Table; +import java.util.concurrent.Executors; +/** + * Gpu time zone utility. + * Provides two kinds of APIs + * - Time zone transitions cache APIs + * `cacheDatabaseAsync`, `cacheDatabase` and `shutdown` are synchronized. + * When cacheDatabaseAsync is running, the `shutdown` and `cacheDatabase` will wait; + * These APIs guarantee only one thread is loading transitions cache, + * And guarantee loading cache only occurs one time. + * - Rebase time zone APIs + * fromTimestampToUtcTimestamp, fromUtcTimestampToTimestamp ... 
+ */ public class GpuTimeZoneDB { - - public static final int TIMEOUT_SECS = 300; - + private static final Logger log = LoggerFactory.getLogger(GpuTimeZoneDB.class); // For the timezone database, we store the transitions in a ColumnVector that is a list of // structs. The type of this column vector is: // LIST> - private CompletableFuture> zoneIdToTableFuture; - private CompletableFuture fixedTransitionsFuture; + private Map zoneIdToTable; - private boolean closed = false; + // use this reference to indicate if time zone cache is initialized. + private HostColumnVector fixedTransitions; - GpuTimeZoneDB() { - zoneIdToTableFuture = new CompletableFuture<>(); - fixedTransitionsFuture = new CompletableFuture<>(); + // Guarantee singleton instance + private GpuTimeZoneDB() { } - - private static GpuTimeZoneDB instance = new GpuTimeZoneDB(); - // This method is default visibility for testing purposes only. The instance will be never be exposed publicly - // for this class. + + // singleton instance + private static final GpuTimeZoneDB instance = new GpuTimeZoneDB(); + + // This method is default visibility for testing purposes only. + // The instance will be never be exposed publicly for this class. static GpuTimeZoneDB getInstance() { return instance; } - + + static class LoadingLock { + Boolean isLoading = false; + + // record whether a shutdown is called ever. + // if `isCloseCalledEver` is true, then the following loading should be skipped. + Boolean isShutdownCalledEver = false; + } + + private static final LoadingLock lock = new LoadingLock(); + /** - * Start to cache the database. This should be called on startup of an executor. It should start - * to cache the data on the CPU in a background thread. It should return immediately and allow the - * other APIs to be called. Depending on what we want to do we can have the other APIs block - * until this is done caching, or we can have private APIs that would let us load and use specific - * parts of the database. I prefer the former solution at least until we see a performance hit - * where we are waiting on the database to finish loading. + * This should be called on startup of an executor. + * Runs in a thread asynchronously. + * If `shutdown` was called ever, then will not load the cache */ - public static void cacheDatabase() { - synchronized (instance) { - if (!instance.isLoaded()) { - Executor executor = Executors.newSingleThreadExecutor( - new ThreadFactory() { - private ThreadFactory defaultFactory = Executors.defaultThreadFactory(); - - @Override - public Thread newThread(Runnable r) { - Thread thread = defaultFactory.newThread(r); - thread.setName("gpu-timezone-database-0"); - thread.setDaemon(true); - return thread; - } - }); - instance.loadData(executor); + public static void cacheDatabaseAsync() { + synchronized (lock) { + if (lock.isShutdownCalledEver) { + // shutdown was called ever, will never load cache again. + return; + } + + if (lock.isLoading) { + // another thread is loading(), return + return; + } else { + lock.isLoading = true; } } + + // start a new thread to load + Runnable runnable = () -> { + try { + instance.cacheDatabaseImpl(); + } catch (Exception e) { + log.error("cache time zone transitions cache failed", e); + } finally { + synchronized (lock) { + // now loading is done + lock.isLoading = false; + // `cacheDatabase` and `shutdown` may wait loading is done. 
+          lock.notify();
+        }
+      }
+    };
+    Thread thread = Executors.defaultThreadFactory().newThread(runnable);
+    thread.setName("gpu-timezone-database-0");
+    thread.setDaemon(true);
+    thread.start();
   }
+  /**
+   * Cache the database. This can take several seconds.
+   * If one `cacheDatabase` call is running, other `cacheDatabase` calls will wait until caching is done.
+   * If the cache already exists, it is not loaded again.
+   */
+  public static void cacheDatabase() {
+    synchronized (lock) {
+      if (lock.isLoading) {
+        // another thread is loading; wait until loading is done
+        while (lock.isLoading) {
+          try {
+            lock.wait();
+          } catch (InterruptedException e) {
+            throw new IllegalStateException("caching time zone transitions failed", e);
+          }
+        }
+        return;
+      } else {
+        lock.isLoading = true;
+      }
+    }
+    try {
+      instance.cacheDatabaseImpl();
+    } finally {
+      // loading is done.
+      synchronized (lock) {
+        lock.isLoading = false;
+        // `cacheDatabase` and/or `shutdown` may be waiting until loading is done.
+        lock.notify();
+      }
+    }
+  }
+
+  /**
+   * Close the cache; used when the plugin is closing.
+   */
   public static void shutdown() {
-    if (instance.isLoaded()) {
-      instance.close();
-      // Recreate a new instance to reload the database if necessary
-      instance = new GpuTimeZoneDB();
+    synchronized (lock) {
+      lock.isShutdownCalledEver = true;
+      while (lock.isLoading) {
+        // wait until loading is done
+        try {
+          lock.wait();
+        } catch (InterruptedException e) {
+          throw new IllegalStateException("shutdown of the time zone transitions cache failed", e);
+        }
+      }
+      instance.shutdownImpl();
+      // `cacheDatabase` and/or `shutdown` may be waiting until loading is done.
+      lock.notify();
+    }
+  }
+
+  private void cacheDatabaseImpl() {
+    if (fixedTransitions == null) {
+      try {
+        loadData();
+      } catch (Exception e) {
+        closeResources();
+        throw e;
+      }
+    }
+  }
+
+  private void shutdownImpl() {
+    closeResources();
+  }
+
+  private void closeResources() {
+    if (zoneIdToTable != null) {
+      zoneIdToTable.clear();
+      zoneIdToTable = null;
+    }
+    if (fixedTransitions != null) {
+      fixedTransitions.close();
+      fixedTransitions = null;
     }
   }

@@ -102,15 +208,12 @@ public static ColumnVector fromTimestampToUtcTimestamp(ColumnVector input, ZoneI
       throw new IllegalArgumentException(String.format("Unsupported timezone: %s",
         currentTimeZone.toString()));
     }
-    if (!instance.isLoaded()) {
-      cacheDatabase(); // lazy load the database
-    }
+    cacheDatabase();
    Integer tzIndex = instance.getZoneIDMap().get(currentTimeZone.normalized().toString());
-    Table transitions = instance.getTransitions();
-    ColumnVector result = new ColumnVector(convertTimestampColumnToUTC(input.getNativeView(),
-      transitions.getNativeView(), tzIndex));
-    transitions.close();
-    return result;
+    try (Table transitions = instance.getTransitions()) {
+      return new ColumnVector(convertTimestampColumnToUTC(input.getNativeView(),
+        transitions.getNativeView(), tzIndex));
+    }
   }

   public static ColumnVector fromUtcTimestampToTimestamp(ColumnVector input, ZoneId desiredTimeZone) {
@@ -120,15 +223,12 @@ public static ColumnVector fromUtcTimestampToTimestamp(ColumnVector input, ZoneI
       throw new IllegalArgumentException(String.format("Unsupported timezone: %s",
        desiredTimeZone.toString()));
     }
-    if (!instance.isLoaded()) {
-      cacheDatabase(); // lazy load the database
-    }
+    cacheDatabase();
    Integer tzIndex = instance.getZoneIDMap().get(desiredTimeZone.normalized().toString());
-    Table transitions = instance.getTransitions();
-    ColumnVector result = new ColumnVector(convertUTCTimestampColumnToTimeZone(input.getNativeView(),
-
transitions.getNativeView(), tzIndex)); - transitions.close(); - return result; + try (Table transitions = instance.getTransitions()) { + return new ColumnVector(convertUTCTimestampColumnToTimeZone(input.getNativeView(), + transitions.getNativeView(), tzIndex)); + } } // TODO: Deprecate this API when we support all timezones @@ -157,128 +257,85 @@ public static ZoneId getZoneId(String timeZoneId) { return ZoneId.of(formattedZoneId, ZoneId.SHORT_IDS); } - private boolean isLoaded() { - return zoneIdToTableFuture.isDone(); - } - - private void loadData(Executor executor) throws IllegalStateException { - // Start loading the data in separate thread and return - try { - executor.execute(this::doLoadData); - } catch (RejectedExecutionException e) { - throw new IllegalStateException(e); - } - } - @SuppressWarnings("unchecked") - private void doLoadData() { - synchronized (this) { - try { - Map zoneIdToTable = new HashMap<>(); - List> masterTransitions = new ArrayList<>(); - for (String tzId : TimeZone.getAvailableIDs()) { - ZoneId zoneId; - try { - zoneId = ZoneId.of(tzId).normalized(); // we use the normalized form to dedupe - } catch (ZoneRulesException e) { - // Sometimes the list of getAvailableIDs() is one of the 3-letter abbreviations, however, - // this use is deprecated due to ambiguity reasons (same abbrevation can be used for - // multiple time zones). These are not supported by ZoneId.of(...) directly here. - continue; - } - ZoneRules zoneRules = zoneId.getRules(); - // Filter by non-repeating rules - if (!zoneRules.isFixedOffset() && !zoneRules.getTransitionRules().isEmpty()) { - continue; - } - if (!zoneIdToTable.containsKey(zoneId.getId())) { - List transitions = zoneRules.getTransitions(); - int idx = masterTransitions.size(); - List data = new ArrayList<>(); - if (zoneRules.isFixedOffset()) { - data.add( - new HostColumnVector.StructData(Long.MIN_VALUE, Long.MIN_VALUE, - zoneRules.getOffset(Instant.now()).getTotalSeconds()) - ); - } else { - // Capture the first official offset (before any transition) using Long min - ZoneOffsetTransition first = transitions.get(0); - data.add( - new HostColumnVector.StructData(Long.MIN_VALUE, Long.MIN_VALUE, - first.getOffsetBefore().getTotalSeconds()) - ); - transitions.forEach(t -> { - // Whether transition is an overlap vs gap. 
- // In Spark: - // if it's a gap, then we use the offset after *on* the instant - // If it's an overlap, then there are 2 sets of valid timestamps in that are overlapping - // So, for the transition to UTC, you need to compare to instant + {offset before} - // The time math still uses {offset after} - if (t.isGap()) { - data.add( - new HostColumnVector.StructData( - t.getInstant().getEpochSecond(), - t.getInstant().getEpochSecond() + t.getOffsetAfter().getTotalSeconds(), - t.getOffsetAfter().getTotalSeconds()) - ); - } else { - data.add( - new HostColumnVector.StructData( - t.getInstant().getEpochSecond(), - t.getInstant().getEpochSecond() + t.getOffsetBefore().getTotalSeconds(), - t.getOffsetAfter().getTotalSeconds()) - ); - } - }); - } - masterTransitions.add(data); - zoneIdToTable.put(zoneId.getId(), idx); + private void loadData() { + try { + List> masterTransitions = new ArrayList<>(); + zoneIdToTable = new HashMap<>(); + for (String tzId : TimeZone.getAvailableIDs()) { + ZoneId zoneId; + try { + zoneId = ZoneId.of(tzId).normalized(); // we use the normalized form to dedupe + } catch (ZoneRulesException e) { + // Sometimes the list of getAvailableIDs() is one of the 3-letter abbreviations, however, + // this use is deprecated due to ambiguity reasons (same abbrevation can be used for + // multiple time zones). These are not supported by ZoneId.of(...) directly here. + continue; + } + ZoneRules zoneRules = zoneId.getRules(); + // Filter by non-repeating rules + if (!zoneRules.isFixedOffset() && !zoneRules.getTransitionRules().isEmpty()) { + continue; + } + if (!zoneIdToTable.containsKey(zoneId.getId())) { + List transitions = zoneRules.getTransitions(); + int idx = masterTransitions.size(); + List data = new ArrayList<>(); + if (zoneRules.isFixedOffset()) { + data.add( + new HostColumnVector.StructData(Long.MIN_VALUE, Long.MIN_VALUE, + zoneRules.getOffset(Instant.now()).getTotalSeconds()) + ); + } else { + // Capture the first official offset (before any transition) using Long min + ZoneOffsetTransition first = transitions.get(0); + data.add( + new HostColumnVector.StructData(Long.MIN_VALUE, Long.MIN_VALUE, + first.getOffsetBefore().getTotalSeconds()) + ); + transitions.forEach(t -> { + // Whether transition is an overlap vs gap. 
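+          // (A gap means the clocks jumped forward, so some local times never exist;
+          // an overlap means the clocks fell back, so some local times occur twice.)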
+ // In Spark: + // if it's a gap, then we use the offset after *on* the instant + // If it's an overlap, then there are 2 sets of valid timestamps in that are overlapping + // So, for the transition to UTC, you need to compare to instant + {offset before} + // The time math still uses {offset after} + if (t.isGap()) { + data.add( + new HostColumnVector.StructData( + t.getInstant().getEpochSecond(), + t.getInstant().getEpochSecond() + t.getOffsetAfter().getTotalSeconds(), + t.getOffsetAfter().getTotalSeconds()) + ); + } else { + data.add( + new HostColumnVector.StructData( + t.getInstant().getEpochSecond(), + t.getInstant().getEpochSecond() + t.getOffsetBefore().getTotalSeconds(), + t.getOffsetAfter().getTotalSeconds()) + ); + } + }); } + masterTransitions.add(data); + zoneIdToTable.put(zoneId.getId(), idx); } - HostColumnVector.DataType childType = new HostColumnVector.StructType(false, - new HostColumnVector.BasicType(false, DType.INT64), - new HostColumnVector.BasicType(false, DType.INT64), - new HostColumnVector.BasicType(false, DType.INT32)); - HostColumnVector.DataType resultType = - new HostColumnVector.ListType(false, childType); - HostColumnVector fixedTransitions = HostColumnVector.fromLists(resultType, - masterTransitions.toArray(new List[0])); - fixedTransitionsFuture.complete(fixedTransitions); - zoneIdToTableFuture.complete(zoneIdToTable); - } catch (Exception e) { - fixedTransitionsFuture.completeExceptionally(e); - zoneIdToTableFuture.completeExceptionally(e); - throw e; } - } - } - - private void close() { - synchronized (this) { - if (closed) { - return; - } - try (HostColumnVector hcv = getHostFixedTransitions()) { - // automatically closed - closed = true; - } - } - } - - private HostColumnVector getHostFixedTransitions() { - try { - return fixedTransitionsFuture.get(TIMEOUT_SECS, TimeUnit.SECONDS); - } catch (InterruptedException | ExecutionException | TimeoutException e) { - throw new RuntimeException(e); + HostColumnVector.DataType childType = new HostColumnVector.StructType(false, + new HostColumnVector.BasicType(false, DType.INT64), + new HostColumnVector.BasicType(false, DType.INT64), + new HostColumnVector.BasicType(false, DType.INT32)); + HostColumnVector.DataType resultType = + new HostColumnVector.ListType(false, childType); + fixedTransitions = HostColumnVector.fromLists(resultType, + masterTransitions.toArray(new List[0])); + } catch (Exception e) { + throw new IllegalStateException("load time zone DB cache failed!", e); } } private Map getZoneIDMap() { - try { - return zoneIdToTableFuture.get(TIMEOUT_SECS, TimeUnit.SECONDS); - } catch (InterruptedException | ExecutionException | TimeoutException e) { - throw new RuntimeException(e); - } + return zoneIdToTable; } private Table getTransitions() { @@ -288,8 +345,7 @@ private Table getTransitions() { } private ColumnVector getFixedTransitions() { - HostColumnVector hostTransitions = getHostFixedTransitions(); - return hostTransitions.copyToDevice(); + return fixedTransitions.copyToDevice(); } /** @@ -308,8 +364,7 @@ List getHostFixedTransitions(String zoneId) { if (idx == null) { return null; } - HostColumnVector transitions = getHostFixedTransitions(); - return transitions.getList(idx); + return fixedTransitions.getList(idx); } From 92adf79c297034d484b75c207947699a03264286 Mon Sep 17 00:00:00 2001 From: Jenkins Automation <70000568+nvauto@users.noreply.github.com> Date: Wed, 17 Jan 2024 11:31:15 +0800 Subject: [PATCH 098/127] Update submodule cudf to 2bead955ce5f43887a6ccc9d9834ca57ce58029d (#1702) 
Signed-off-by: spark-rapids automation <70000568+nvauto@users.noreply.github.com> --- thirdparty/cudf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/thirdparty/cudf b/thirdparty/cudf index 726a7f3075..2bead955ce 160000 --- a/thirdparty/cudf +++ b/thirdparty/cudf @@ -1 +1 @@ -Subproject commit 726a7f30757d1a06d74d86bb82cf311cb159f7fd +Subproject commit 2bead955ce5f43887a6ccc9d9834ca57ce58029d From 277c032ed8393ba20fa18fe2766f8c2e0ea10390 Mon Sep 17 00:00:00 2001 From: Jenkins Automation <70000568+nvauto@users.noreply.github.com> Date: Wed, 17 Jan 2024 17:31:01 +0800 Subject: [PATCH 099/127] Update submodule cudf to 8f5e64ddcba788ddcc715fda7f2bf852166b7ee6 (#1705) Signed-off-by: spark-rapids automation <70000568+nvauto@users.noreply.github.com> --- thirdparty/cudf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/thirdparty/cudf b/thirdparty/cudf index 2bead955ce..8f5e64ddcb 160000 --- a/thirdparty/cudf +++ b/thirdparty/cudf @@ -1 +1 @@ -Subproject commit 2bead955ce5f43887a6ccc9d9834ca57ce58029d +Subproject commit 8f5e64ddcba788ddcc715fda7f2bf852166b7ee6 From ad5514ae8abd632232cc65f9f92009e8e3fe32f7 Mon Sep 17 00:00:00 2001 From: Jason Lowe Date: Wed, 17 Jan 2024 16:15:49 -0600 Subject: [PATCH 100/127] Update to new cudf strings where character data is no longer a child column (#1708) * Update to new cudf strings where character data is no longer a child column Signed-off-by: Jason Lowe * clang style fixes --------- Signed-off-by: Jason Lowe --- src/main/cpp/src/cast_string_to_float.cu | 8 ++++---- src/main/cpp/src/map_utils.cu | 18 +++++++++++------- src/main/cpp/src/parse_uri.cu | 23 +++++++++++++---------- thirdparty/cudf | 2 +- 4 files changed, 29 insertions(+), 22 deletions(-) diff --git a/src/main/cpp/src/cast_string_to_float.cu b/src/main/cpp/src/cast_string_to_float.cu index fe8a7f64db..75523cd360 100644 --- a/src/main/cpp/src/cast_string_to_float.cu +++ b/src/main/cpp/src/cast_string_to_float.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * Copyright (c) 2022-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -679,7 +679,7 @@ std::unique_ptr string_to_float(data_type dtype, out->mutable_view().null_mask(), ansi_mode ? static_cast(ansi_count.get())->data() : nullptr, static_cast(valid_count.get())->data(), - string_col.chars().begin(), + string_col.chars_begin(stream), string_col.offsets().begin(), string_col.null_mask(), num_rows); @@ -690,7 +690,7 @@ std::unique_ptr string_to_float(data_type dtype, out->mutable_view().null_mask(), ansi_mode ? static_cast(ansi_count.get())->data() : nullptr, static_cast(valid_count.get())->data(), - string_col.chars().begin(), + string_col.chars_begin(stream), string_col.offsets().begin(), string_col.null_mask(), num_rows); @@ -714,7 +714,7 @@ std::unique_ptr string_to_float(data_type dtype, dest.resize(string_bounds[1] - string_bounds[0]); cudaMemcpyAsync(dest.data(), - &string_col.chars().data()[string_bounds[0]], + &string_col.chars_begin(stream)[string_bounds[0]], string_bounds[1] - string_bounds[0], cudaMemcpyDeviceToHost, stream.value()); diff --git a/src/main/cpp/src/map_utils.cu b/src/main/cpp/src/map_utils.cu index a51a7de57b..002dadb0e3 100644 --- a/src/main/cpp/src/map_utils.cu +++ b/src/main/cpp/src/map_utils.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2023, NVIDIA CORPORATION. + * Copyright (c) 2023-2024, NVIDIA CORPORATION. 
* * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -75,24 +75,28 @@ rmm::device_uvector unify_json_strings(cudf::column_view const& input, } auto const d_strings = cudf::column_device_view::create(input, stream); - auto const chars_size = input.child(cudf::strings_column_view::chars_column_index).size(); + auto const input_scv = cudf::strings_column_view{input}; + auto const chars_size = input_scv.chars_size(stream); auto const output_size = 2l + // two extra bracket characters '[' and ']' static_cast(chars_size) + static_cast(input.size() - 1) + // append `,` character between input rows static_cast(input.null_count()) * 2l; // replace null with "{}" + // TODO: This assertion eventually needs to be removed. + // See https://github.com/NVIDIA/spark-rapids-jni/issues/1707 CUDF_EXPECTS(output_size <= static_cast(std::numeric_limits::max()), "The input json column is too large and causes overflow."); auto const joined_input = cudf::strings::detail::join_strings( - cudf::strings_column_view{input}, + input_scv, cudf::string_scalar(","), // append `,` character between the input rows cudf::string_scalar("{}"), // replacement for null rows stream, rmm::mr::get_current_device_resource()); - auto const joined_input_child = - joined_input->child(cudf::strings_column_view::chars_column_index); - auto const joined_input_size_bytes = joined_input_child.size(); + auto const joined_input_scv = cudf::strings_column_view{*joined_input}; + auto const joined_input_size_bytes = joined_input_scv.chars_size(stream); + // TODO: This assertion requires a stream synchronization, may want to remove at some point. + // See https://github.com/NVIDIA/spark-rapids-jni/issues/1707 CUDF_EXPECTS(joined_input_size_bytes + 2 == output_size, "Incorrect output size computation."); // We want to concatenate 3 strings: "[" + joined_input + "]". @@ -100,7 +104,7 @@ rmm::device_uvector unify_json_strings(cudf::column_view const& input, auto output = rmm::device_uvector(joined_input_size_bytes + 2, stream); CUDF_CUDA_TRY(cudaMemsetAsync(output.data(), static_cast('['), 1, stream.value())); CUDF_CUDA_TRY(cudaMemcpyAsync(output.data() + 1, - joined_input_child.view().data(), + joined_input_scv.chars_begin(stream), joined_input_size_bytes, cudaMemcpyDefault, stream.value())); diff --git a/src/main/cpp/src/parse_uri.cu b/src/main/cpp/src/parse_uri.cu index 897ebe0208..83b14ced9e 100644 --- a/src/main/cpp/src/parse_uri.cu +++ b/src/main/cpp/src/parse_uri.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2023, NVIDIA CORPORATION. + * Copyright (c) 2023-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -700,19 +700,20 @@ uri_parts __device__ validate_uri(const char* str, int len) * * @param in_strings Input string column * @param chunk Chunk of URI to return + * @param base_ptr Pointer to the start of the character data in the strings column * @param out_lengths Number of characters in each decode URL * @param out_offsets Offsets to the start of the chunks * @param out_validity Bitmask of validity data, updated in function */ __global__ void parse_uri_char_counter(column_device_view const in_strings, URI_chunks chunk, + char const* const base_ptr, size_type* const out_lengths, size_type* const out_offsets, bitmask_type* out_validity) { // thread per row - auto const tid = cudf::detail::grid_1d::global_thread_id(); - auto const base_ptr = in_strings.child(strings_column_view::chars_column_index).data(); + auto const tid = cudf::detail::grid_1d::global_thread_id(); for (thread_index_type tidx = tid; tidx < in_strings.size(); tidx += cudf::detail::grid_1d::grid_stride()) { @@ -778,17 +779,18 @@ __global__ void parse_uri_char_counter(column_device_view const in_strings, * @brief Parse protocol and copy from the input string column to the output char buffer. * * @param in_strings Input string column + * @param base_ptr Pointer to the start of the character data in the strings column * @param src_offsets Offset value of source strings in in_strings * @param offsets Offset value of each string associated with `out_chars` * @param out_chars Character buffer for the output string column */ __global__ void parse_uri(column_device_view const in_strings, + char const* const base_ptr, size_type const* const src_offsets, size_type const* const offsets, char* const out_chars) { - auto const tid = cudf::detail::grid_1d::global_thread_id(); - auto const base_ptr = in_strings.child(strings_column_view::chars_column_index).data(); + auto const tid = cudf::detail::grid_1d::global_thread_id(); for (thread_index_type tidx = tid; tidx < in_strings.size(); tidx += cudf::detail::grid_1d::grid_stride()) { @@ -840,6 +842,7 @@ std::unique_ptr parse_uri(strings_column_view const& input, parse_uri_char_counter<<>>( *d_strings, chunk, + input.chars_begin(stream), offsets_mutable_view.begin(), reinterpret_cast(src_offsets.data()), reinterpret_cast(null_mask.data())); @@ -854,23 +857,23 @@ std::unique_ptr parse_uri(strings_column_view const& input, // to the host memory auto out_chars_bytes = cudf::detail::get_value(offsets_view, offset_count - 1, stream); - // create the chars column - auto chars_column = cudf::strings::detail::create_chars_child_column(out_chars_bytes, stream, mr); - auto d_out_chars = chars_column->mutable_view().data(); + // create the chars buffer + auto d_out_chars = rmm::device_buffer(out_chars_bytes, stream, mr); // copy the characters from the input column to the output column parse_uri<<>>( *d_strings, + input.chars_begin(stream), reinterpret_cast(src_offsets.data()), offsets_column->view().begin(), - d_out_chars); + static_cast(d_out_chars.data())); auto null_count = cudf::null_count(reinterpret_cast(null_mask.data()), 0, strings_count); return make_strings_column(strings_count, std::move(offsets_column), - std::move(chars_column), + std::move(d_out_chars), null_count, std::move(null_mask)); } diff --git a/thirdparty/cudf b/thirdparty/cudf index 8f5e64ddcb..6abef4a474 160000 --- a/thirdparty/cudf +++ b/thirdparty/cudf @@ -1 +1 @@ -Subproject commit 8f5e64ddcba788ddcc715fda7f2bf852166b7ee6 +Subproject commit 6abef4a4746f1f9917711f372726023efdc21e85 From 
5ffd328274605392f12c1157b770067c6f22033f Mon Sep 17 00:00:00 2001 From: Jenkins Automation <70000568+nvauto@users.noreply.github.com> Date: Thu, 18 Jan 2024 12:05:48 +0800 Subject: [PATCH 101/127] [submodule-sync] bot-submodule-sync-branch-24.02 to branch-24.02 [skip ci] [bot] (#1706) * Update submodule cudf to c7acdaa231fb0ffe7751611590f9b85ba7508d4d Signed-off-by: spark-rapids automation <70000568+nvauto@users.noreply.github.com> * Update submodule cudf to c81198789be183e7e1eb288eb98dd16f65b57e44 Signed-off-by: spark-rapids automation <70000568+nvauto@users.noreply.github.com> * Update submodule cudf to 9acddc08cc209e8d6b94891be6131edd63ff5b43 Signed-off-by: spark-rapids automation <70000568+nvauto@users.noreply.github.com> --------- Signed-off-by: spark-rapids automation <70000568+nvauto@users.noreply.github.com> --- thirdparty/cudf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/thirdparty/cudf b/thirdparty/cudf index 6abef4a474..9acddc08cc 160000 --- a/thirdparty/cudf +++ b/thirdparty/cudf @@ -1 +1 @@ -Subproject commit 6abef4a4746f1f9917711f372726023efdc21e85 +Subproject commit 9acddc08cc209e8d6b94891be6131edd63ff5b43 From 8331658383d088c95759c6e3bf77bdc310978a0e Mon Sep 17 00:00:00 2001 From: Jenkins Automation <70000568+nvauto@users.noreply.github.com> Date: Fri, 19 Jan 2024 06:05:10 +0800 Subject: [PATCH 102/127] Update submodule cudf to 66c3e8e92f9c37dd909b78936addb463f1bd6011 (#1709) Signed-off-by: spark-rapids automation <70000568+nvauto@users.noreply.github.com> --- thirdparty/cudf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/thirdparty/cudf b/thirdparty/cudf index 9acddc08cc..66c3e8e92f 160000 --- a/thirdparty/cudf +++ b/thirdparty/cudf @@ -1 +1 @@ -Subproject commit 9acddc08cc209e8d6b94891be6131edd63ff5b43 +Subproject commit 66c3e8e92f9c37dd909b78936addb463f1bd6011 From fea7d22a6a9578f8cf960c909df34e204ed1873e Mon Sep 17 00:00:00 2001 From: Jenkins Automation <70000568+nvauto@users.noreply.github.com> Date: Fri, 19 Jan 2024 12:05:14 +0800 Subject: [PATCH 103/127] Update submodule cudf to eeee795c232e2811adeb5a3942f7a149d8b16d49 (#1710) Signed-off-by: spark-rapids automation <70000568+nvauto@users.noreply.github.com> --- thirdparty/cudf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/thirdparty/cudf b/thirdparty/cudf index 66c3e8e92f..eeee795c23 160000 --- a/thirdparty/cudf +++ b/thirdparty/cudf @@ -1 +1 @@ -Subproject commit 66c3e8e92f9c37dd909b78936addb463f1bd6011 +Subproject commit eeee795c232e2811adeb5a3942f7a149d8b16d49 From 91407929bea933bf53ab6b20a23843572b4f687f Mon Sep 17 00:00:00 2001 From: Jenkins Automation <70000568+nvauto@users.noreply.github.com> Date: Fri, 19 Jan 2024 17:27:17 +0800 Subject: [PATCH 104/127] Update submodule cudf to f785ed3ddebf8b225b9d7c07aab9d5f32eb39b05 (#1711) Signed-off-by: spark-rapids automation <70000568+nvauto@users.noreply.github.com> --- thirdparty/cudf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/thirdparty/cudf b/thirdparty/cudf index eeee795c23..f785ed3dde 160000 --- a/thirdparty/cudf +++ b/thirdparty/cudf @@ -1 +1 @@ -Subproject commit eeee795c232e2811adeb5a3942f7a149d8b16d49 +Subproject commit f785ed3ddebf8b225b9d7c07aab9d5f32eb39b05 From d45dca03ffa9eaa7aff8494fb98188787e654795 Mon Sep 17 00:00:00 2001 From: Jenkins Automation <70000568+nvauto@users.noreply.github.com> Date: Sat, 20 Jan 2024 06:05:25 +0800 Subject: [PATCH 105/127] Update submodule cudf to a38fc01a6b8cb8506753b6a7fd77c7444e25d52c (#1712) Signed-off-by: spark-rapids automation 
<70000568+nvauto@users.noreply.github.com> --- thirdparty/cudf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/thirdparty/cudf b/thirdparty/cudf index f785ed3dde..a38fc01a6b 160000 --- a/thirdparty/cudf +++ b/thirdparty/cudf @@ -1 +1 @@ -Subproject commit f785ed3ddebf8b225b9d7c07aab9d5f32eb39b05 +Subproject commit a38fc01a6b8cb8506753b6a7fd77c7444e25d52c From 93f4a38d33c7f40f326f32ea1a03636f4e244eb6 Mon Sep 17 00:00:00 2001 From: Jenkins Automation <70000568+nvauto@users.noreply.github.com> Date: Sat, 20 Jan 2024 11:24:51 +0800 Subject: [PATCH 106/127] Update submodule cudf to 1c37c780ced37d6084c90b815b274b598665d60e (#1713) Signed-off-by: spark-rapids automation <70000568+nvauto@users.noreply.github.com> --- thirdparty/cudf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/thirdparty/cudf b/thirdparty/cudf index a38fc01a6b..1c37c780ce 160000 --- a/thirdparty/cudf +++ b/thirdparty/cudf @@ -1 +1 @@ -Subproject commit a38fc01a6b8cb8506753b6a7fd77c7444e25d52c +Subproject commit 1c37c780ced37d6084c90b815b274b598665d60e From 13e4652011ce5dab8fed19bd2873931238515169 Mon Sep 17 00:00:00 2001 From: Jenkins Automation <70000568+nvauto@users.noreply.github.com> Date: Sat, 20 Jan 2024 17:23:39 +0800 Subject: [PATCH 107/127] Update submodule cudf to 19942809679e4675c296a38f90bfdbaa8574eee2 (#1714) Signed-off-by: spark-rapids automation <70000568+nvauto@users.noreply.github.com> --- thirdparty/cudf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/thirdparty/cudf b/thirdparty/cudf index 1c37c780ce..1994280967 160000 --- a/thirdparty/cudf +++ b/thirdparty/cudf @@ -1 +1 @@ -Subproject commit 1c37c780ced37d6084c90b815b274b598665d60e +Subproject commit 19942809679e4675c296a38f90bfdbaa8574eee2 From 5de3c3a9356005a8dbe4611e5045557072505d7c Mon Sep 17 00:00:00 2001 From: Jenkins Automation <70000568+nvauto@users.noreply.github.com> Date: Tue, 23 Jan 2024 06:03:16 +0800 Subject: [PATCH 108/127] Update submodule cudf to f24f0b528b16454a2b79182f77bb46a663ab2c25 (#1715) Signed-off-by: spark-rapids automation <70000568+nvauto@users.noreply.github.com> --- thirdparty/cudf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/thirdparty/cudf b/thirdparty/cudf index 1994280967..f24f0b528b 160000 --- a/thirdparty/cudf +++ b/thirdparty/cudf @@ -1 +1 @@ -Subproject commit 19942809679e4675c296a38f90bfdbaa8574eee2 +Subproject commit f24f0b528b16454a2b79182f77bb46a663ab2c25 From 96c4ebc23f333a3ca41b46b4f10d9845f092be6e Mon Sep 17 00:00:00 2001 From: Jenkins Automation <70000568+nvauto@users.noreply.github.com> Date: Tue, 23 Jan 2024 12:06:31 +0800 Subject: [PATCH 109/127] Update submodule cudf to ef3ce4bc8db008f58249241c16c80f7e6e600fa9 (#1716) Signed-off-by: spark-rapids automation <70000568+nvauto@users.noreply.github.com> --- thirdparty/cudf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/thirdparty/cudf b/thirdparty/cudf index f24f0b528b..ef3ce4bc8d 160000 --- a/thirdparty/cudf +++ b/thirdparty/cudf @@ -1 +1 @@ -Subproject commit f24f0b528b16454a2b79182f77bb46a663ab2c25 +Subproject commit ef3ce4bc8db008f58249241c16c80f7e6e600fa9 From e667df4c888e7fcda9f7e7cac0d8bc027e0b4eac Mon Sep 17 00:00:00 2001 From: Jenkins Automation <70000568+nvauto@users.noreply.github.com> Date: Tue, 23 Jan 2024 18:09:19 +0800 Subject: [PATCH 110/127] Update submodule cudf to a39897c108d44a4d5e027ca741be5462863eeefc (#1717) Signed-off-by: spark-rapids automation <70000568+nvauto@users.noreply.github.com> --- thirdparty/cudf | 2 +- 1 file changed, 1 insertion(+), 1 
deletion(-) diff --git a/thirdparty/cudf b/thirdparty/cudf index ef3ce4bc8d..a39897c108 160000 --- a/thirdparty/cudf +++ b/thirdparty/cudf @@ -1 +1 @@ -Subproject commit ef3ce4bc8db008f58249241c16c80f7e6e600fa9 +Subproject commit a39897c108d44a4d5e027ca741be5462863eeefc From 516c48c29334edea210c89f0498022e0fa708ecf Mon Sep 17 00:00:00 2001 From: Jenkins Automation <70000568+nvauto@users.noreply.github.com> Date: Wed, 24 Jan 2024 05:32:27 +0800 Subject: [PATCH 111/127] Update submodule cudf to 67a36a9104097cd6a8ae6efee1018e249f2fe441 (#1720) Signed-off-by: spark-rapids automation <70000568+nvauto@users.noreply.github.com> --- thirdparty/cudf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/thirdparty/cudf b/thirdparty/cudf index a39897c108..67a36a9104 160000 --- a/thirdparty/cudf +++ b/thirdparty/cudf @@ -1 +1 @@ -Subproject commit a39897c108d44a4d5e027ca741be5462863eeefc +Subproject commit 67a36a9104097cd6a8ae6efee1018e249f2fe441 From 7b6b25e470b6aa5594182fd5ccd654b14c043414 Mon Sep 17 00:00:00 2001 From: Tim Liu Date: Wed, 24 Jan 2024 17:18:56 +0800 Subject: [PATCH 112/127] Enable auto-merge from branch-24.02 to branch-24.04 (#1722) Signed-off-by: Tim Liu --- .github/workflows/auto-merge.yml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/workflows/auto-merge.yml b/.github/workflows/auto-merge.yml index 77a7701008..08ee4f34d8 100755 --- a/.github/workflows/auto-merge.yml +++ b/.github/workflows/auto-merge.yml @@ -1,4 +1,4 @@ -# Copyright (c) 2022-2023, NVIDIA CORPORATION. +# Copyright (c) 2022-2024, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -18,12 +18,12 @@ name: auto-merge HEAD to BASE on: pull_request_target: branches: - - branch-23.12 + - branch-24.02 types: [closed] env: - HEAD: branch-23.12 - BASE: branch-24.02 + HEAD: branch-24.02 + BASE: branch-24.04 jobs: auto-merge: From 9065404aa70232d5c7141228eaa49cd9c5c800ec Mon Sep 17 00:00:00 2001 From: Mike Wilson Date: Wed, 24 Jan 2024 16:25:14 -0500 Subject: [PATCH 113/127] Adding literal support for parse_uri for query (#1704) --- src/main/cpp/src/ParseURIJni.cpp | 16 +++ src/main/cpp/src/parse_uri.cu | 104 ++++++++++++++++-- src/main/cpp/src/parse_uri.hpp | 15 +++ src/main/cpp/tests/parse_uri.cpp | 53 ++++++--- .../com/nvidia/spark/rapids/jni/ParseURI.java | 13 +++ .../nvidia/spark/rapids/jni/ParseURITest.java | 38 +++++++ 6 files changed, 216 insertions(+), 23 deletions(-) diff --git a/src/main/cpp/src/ParseURIJni.cpp b/src/main/cpp/src/ParseURIJni.cpp index 3af72687b6..c688d10736 100644 --- a/src/main/cpp/src/ParseURIJni.cpp +++ b/src/main/cpp/src/ParseURIJni.cpp @@ -61,4 +61,20 @@ JNIEXPORT jlong JNICALL Java_com_nvidia_spark_rapids_jni_ParseURI_parseQuery(JNI } CATCH_STD(env, 0); } + +JNIEXPORT jlong JNICALL Java_com_nvidia_spark_rapids_jni_ParseURI_parseQueryWithLiteral( + JNIEnv* env, jclass, jlong input_column, jstring query) +{ + JNI_NULL_CHECK(env, input_column, "input column is null", 0); + JNI_NULL_CHECK(env, query, "query is null", 0); + + try { + cudf::jni::auto_set_device(env); + auto const input = reinterpret_cast(input_column); + cudf::jni::native_jstring native_query(env, query); + return cudf::jni::ptr_as_jlong( + spark_rapids_jni::parse_uri_to_query(*input, native_query.get()).release()); + } + CATCH_STD(env, 0); +} } diff --git a/src/main/cpp/src/parse_uri.cu b/src/main/cpp/src/parse_uri.cu index 83b14ced9e..4d21617fd7 100644 --- 
a/src/main/cpp/src/parse_uri.cu
+++ b/src/main/cpp/src/parse_uri.cu
@@ -19,6 +19,7 @@
 #include
 #include
 #include
+#include
 #include
 #include
 #include
@@ -33,6 +34,7 @@
 #include
 #include
+#include

 namespace spark_rapids_jni {

@@ -490,7 +492,55 @@ bool __device__ validate_fragment(string_view fragment)
   }));
 }

-uri_parts __device__ validate_uri(const char* str, int len)
+__device__ std::pair<string_view, bool> find_query_part(string_view haystack, string_view needle)
+{
+  auto const n_bytes     = needle.size_bytes();
+  auto const find_length = haystack.size_bytes() - n_bytes + 1;
+
+  auto h           = haystack.data();
+  auto const end_h = haystack.data() + find_length;
+  auto n           = needle.data();
+  while (h < end_h) {
+    bool match = true;
+    for (size_type jdx = 0; match && (jdx < n_bytes); ++jdx) {
+      match = (h[jdx] == n[jdx]);
+    }
+    if (match) { match = n_bytes < haystack.size_bytes() && h[n_bytes] == '='; }
+    if (match) {
+      // we don't care about the matched part; we want the string data after it.
+      h += n_bytes;
+      break;
+    } else {
+      // skip to the next param, which is after a &.
+      while (h < end_h && *h != '&') {
+        h++;
+      }
+    }
+    h++;
+  }
+
+  // if h is past the end of the haystack, no match.
+  if (haystack.data() + haystack.size_bytes() <= h || *h != '=') { return {{}, false}; }
+
+  // skip over the =
+  h++;
+
+  // the rest of the string, until the end or until '&', is the query match
+  auto const bytes_left = haystack.size_bytes() - (h - haystack.data());
+  int match_len         = 0;
+  auto start            = h;
+  while (*h != '&' && match_len < bytes_left) {
+    ++match_len;
+    ++h;
+  }
+
+  return {{start, match_len}, true};
+}
+
+uri_parts __device__ validate_uri(const char* str,
+                                  int len,
+                                  thrust::optional<column_device_view const> query_match,
+                                  size_type row_idx)
 {
   uri_parts ret;

@@ -572,6 +622,23 @@ uri_parts __device__ validate_uri(const char* str, int len)
       ret.valid = 0;
       return ret;
     }
+
+    // Maybe limit the query data if a literal or a column is passed as a filter. This alters the
+    // return from the entire query to just a specific parameter. For example, query for the URI
+    // http://www.nvidia.com/page?param0=5&param1=2 is param0=5&param1=2, but if the literal is
+    // passed as param0, the return would simply be 5.
+    if (query_match && query_match->size() > 0) {
+      auto const match_idx = row_idx % query_match->size();
+      auto in_match        = query_match->element<string_view>(match_idx);
+
+      auto const [query, valid] = find_query_part(ret.query, in_match);
+      if (!valid) {
+        ret.valid = 0;
+        return ret;
+      }
+      ret.query = query;
+    }
+    ret.valid |= (1 << static_cast<int>(URI_chunks::QUERY));
   }

   auto const path_len = question >= 0 ?
question : len; @@ -710,7 +777,8 @@ __global__ void parse_uri_char_counter(column_device_view const in_strings, char const* const base_ptr, size_type* const out_lengths, size_type* const out_offsets, - bitmask_type* out_validity) + bitmask_type* out_validity, + thrust::optional query_match) { // thread per row auto const tid = cudf::detail::grid_1d::global_thread_id(); @@ -727,7 +795,7 @@ __global__ void parse_uri_char_counter(column_device_view const in_strings, auto const in_chars = in_string.data(); auto const string_length = in_string.size_bytes(); - auto const uri = validate_uri(in_chars, string_length); + auto const uri = validate_uri(in_chars, string_length, query_match, row_idx); if ((uri.valid & (1 << static_cast(chunk))) == 0) { out_lengths[row_idx] = 0; clear_bit(out_validity, row_idx); @@ -809,6 +877,7 @@ __global__ void parse_uri(column_device_view const in_strings, std::unique_ptr parse_uri(strings_column_view const& input, URI_chunks chunk, + std::optional query_match, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { @@ -822,6 +891,9 @@ std::unique_ptr parse_uri(strings_column_view const& input, auto offset_count = strings_count + 1; auto const d_strings = column_device_view::create(input.parent(), stream); + auto const d_matches = + query_match ? column_device_view::create(query_match->parent(), stream) + : std::unique_ptr>{}; // build offsets column auto offsets_column = make_numeric_column( @@ -845,7 +917,8 @@ std::unique_ptr parse_uri(strings_column_view const& input, input.chars_begin(stream), offsets_mutable_view.begin(), reinterpret_cast(src_offsets.data()), - reinterpret_cast(null_mask.data())); + reinterpret_cast(null_mask.data()), + d_matches ? thrust::optional{*d_matches} : thrust::nullopt); // use scan to transform number of bytes into offsets thrust::exclusive_scan(rmm::exec_policy(stream), @@ -887,7 +960,7 @@ std::unique_ptr parse_uri_to_protocol(strings_column_view const& input, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::parse_uri(input, detail::URI_chunks::PROTOCOL, stream, mr); + return detail::parse_uri(input, detail::URI_chunks::PROTOCOL, std::nullopt, stream, mr); } std::unique_ptr parse_uri_to_host(strings_column_view const& input, @@ -895,7 +968,7 @@ std::unique_ptr parse_uri_to_host(strings_column_view const& input, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::parse_uri(input, detail::URI_chunks::HOST, stream, mr); + return detail::parse_uri(input, detail::URI_chunks::HOST, std::nullopt, stream, mr); } std::unique_ptr parse_uri_to_query(strings_column_view const& input, @@ -903,8 +976,21 @@ std::unique_ptr parse_uri_to_query(strings_column_view const& input, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::parse_uri( - input, detail::URI_chunks::QUERY, stream, rmm::mr::get_current_device_resource()); + return detail::parse_uri(input, detail::URI_chunks::QUERY, std::nullopt, stream, mr); +} + +std::unique_ptr parse_uri_to_query(cudf::strings_column_view const& input, + std::string const& query_match, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + CUDF_FUNC_RANGE(); + + // build string_column_view from incoming query_match string + auto d_scalar = make_string_scalar(query_match, stream); + auto col = make_column_from_scalar(*d_scalar, 1); + + return detail::parse_uri(input, detail::URI_chunks::QUERY, strings_column_view(*col), stream, mr); } -} // namespace spark_rapids_jni \ No newline at end of file +} // namespace 
spark_rapids_jni
diff --git a/src/main/cpp/src/parse_uri.hpp b/src/main/cpp/src/parse_uri.hpp
index 07f6f9cd46..bb001e3167 100644
--- a/src/main/cpp/src/parse_uri.hpp
+++ b/src/main/cpp/src/parse_uri.hpp
@@ -65,4 +65,19 @@ std::unique_ptr<cudf::column> parse_uri_to_query(
   rmm::cuda_stream_view stream        = cudf::get_default_stream(),
   rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

+/**
+ * @brief Parse query and copy from the input string column to the output string column.
+ *
+ * @param input Input string column of URIs to parse.
+ * @param query_match String to match in query.
+ * @param stream Stream on which to operate.
+ * @param mr Memory resource for returned column.
+ * @return std::unique_ptr<cudf::column> String column of queries parsed.
+ */
+std::unique_ptr<cudf::column> parse_uri_to_query(
+  cudf::strings_column_view const& input,
+  std::string const& query_match,
+  rmm::cuda_stream_view stream        = cudf::get_default_stream(),
+  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+
 } // namespace spark_rapids_jni
diff --git a/src/main/cpp/tests/parse_uri.cpp b/src/main/cpp/tests/parse_uri.cpp
index 36ebbeacc0..234ad380c7 100644
--- a/src/main/cpp/tests/parse_uri.cpp
+++ b/src/main/cpp/tests/parse_uri.cpp
@@ -127,14 +127,15 @@ cudf::test::strings_column_wrapper get_test_data(test_types t)
       "https:// /path/to/file",
     });
     case test_types::QUERY:
-      return cudf::test::strings_column_wrapper({
-        "https://www.nvidia.com/path?param0=1&param2=3&param4=5",
-        "https:// /?params=5&cloth=0&metal=1",
-        "https://[2001:db8::2:1]:443/parms/in/the/uri?a=b",
-        "https://[::1]/?invalid=param&f„⁈.=7",
-        "https://[::1]/?invalid=param&~.=!@&^",
-        "userinfo@www.nvidia.com/path?query=1#Ref",
-      });
+      return cudf::test::strings_column_wrapper(
+        {"https://www.nvidia.com/path?param0=1&param2=3&param4=5",
+         "https:// /?params=5&cloth=0&metal=1&param0=param3",
+         "https://[2001:db8::2:1]:443/parms/in/the/uri?a=b&param0=true",
+         "https://[::1]/?invalid=param&f„⁈.=7&param0=3",
+         "https://[::1]/?invalid=param&param0=f„⁈&~.=!@&^",
+         "userinfo@www.nvidia.com/path?query=1&param0=5#Ref",
+         "https://www.nvidia.com/path?brokenparam0=1&fakeparam0=5&param0=true",
+         "http://nvidia.com?CBA=CBA&C=C"});
     default: CUDF_FAIL("Test type unsupported!"); return cudf::test::strings_column_wrapper();
   }
 }
@@ -362,12 +363,36 @@ TEST_F(ParseURIQueryTests, SparkEdges)

 TEST_F(ParseURIQueryTests, Queries)
 {
-  auto const col = get_test_data(test_types::QUERY);
-  auto const result = spark_rapids_jni::parse_uri_to_query(cudf::strings_column_view{col});
+  auto const col = get_test_data(test_types::QUERY);
+
+  {
+    auto const result = spark_rapids_jni::parse_uri_to_query(cudf::strings_column_view{col});
+
+    cudf::test::strings_column_wrapper const expected({"param0=1&param2=3&param4=5",
+                                                       "",
+                                                       "a=b&param0=true",
+                                                       "invalid=param&f„⁈.=7&param0=3",
+                                                       "",
+                                                       "query=1&param0=5",
+                                                       "brokenparam0=1&fakeparam0=5&param0=true",
+                                                       "CBA=CBA&C=C"},
+                                                      {1, 0, 1, 1, 0, 1, 1, 1});
+
+    CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected, result->view());
+  }
+  {
+    auto const result =
+      spark_rapids_jni::parse_uri_to_query(cudf::strings_column_view{col}, "param0");
+    cudf::test::strings_column_wrapper const expected({"1", "", "true", "3", "", "5", "true", ""},
+                                                      {1, 0, 1, 1, 0, 1, 1, 0});
-  cudf::test::strings_column_wrapper const expected(
-    {"param0=1&param2=3&param4=5", "", "a=b", "invalid=param&f„⁈.=7", "", "query=1"},
-    {1, 0, 1, 1, 0, 1});
+    CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected, result->view());
+  }
+  {
+    auto const result = spark_rapids_jni::parse_uri_to_query(cudf::strings_column_view{col}, "C");
cudf::test::strings_column_wrapper const expected({"", "", "", "", "", "", "", "C"}, + {0, 0, 0, 0, 0, 0, 0, 1}); - CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected, result->view()); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected, result->view()); + } } diff --git a/src/main/java/com/nvidia/spark/rapids/jni/ParseURI.java b/src/main/java/com/nvidia/spark/rapids/jni/ParseURI.java index 8f82bfc908..e9908f9ea5 100644 --- a/src/main/java/com/nvidia/spark/rapids/jni/ParseURI.java +++ b/src/main/java/com/nvidia/spark/rapids/jni/ParseURI.java @@ -60,7 +60,20 @@ public static ColumnVector parseURIQuery(ColumnView uriColumn) { return new ColumnVector(parseQuery(uriColumn.getNativeView())); } + /** + * Parse query and return a specific parameter for each URI from the incoming column. + * + * @param URIColumn The input strings column in which each row contains a URI. + * @param String The parameter to extract from the query + * @return A string column with query data extracted. + */ + public static ColumnVector parseURIQueryWithLiteral(ColumnView uriColumn, String query) { + assert uriColumn.getType().equals(DType.STRING) : "Input type must be String"; + return new ColumnVector(parseQueryWithLiteral(uriColumn.getNativeView(), query)); + } + private static native long parseProtocol(long jsonColumnHandle); private static native long parseHost(long jsonColumnHandle); private static native long parseQuery(long jsonColumnHandle); + private static native long parseQueryWithLiteral(long jsonColumnHandle, String query); } diff --git a/src/test/java/com/nvidia/spark/rapids/jni/ParseURITest.java b/src/test/java/com/nvidia/spark/rapids/jni/ParseURITest.java index ca76df2bf3..c79633008c 100644 --- a/src/test/java/com/nvidia/spark/rapids/jni/ParseURITest.java +++ b/src/test/java/com/nvidia/spark/rapids/jni/ParseURITest.java @@ -90,6 +90,40 @@ void testQuery(String[] testData) { } } + void testQuery(String[] testData, String param) { + String[] expectedQueryStrings = new String[testData.length]; + for (int i=0; i 0 && pair.substring(0, idx).equals(param)) { + subquery = pair.substring(idx + 1); + break; + } + } + } + expectedQueryStrings[i] = subquery; + } + try (ColumnVector v0 = ColumnVector.fromStrings(testData); + ColumnVector expectedQuery = ColumnVector.fromStrings(expectedQueryStrings); + ColumnVector queryResult = ParseURI.parseURIQueryWithLiteral(v0, param)) { + AssertUtils.assertColumnsAreEqual(expectedQuery, queryResult); + } + } + @Test void parseURISparkTest() { String[] testData = { @@ -150,6 +184,7 @@ void parseURISparkTest() { testProtocol(testData); testHost(testData); testQuery(testData); + testQuery(testData, "query"); } @Test @@ -163,6 +198,7 @@ void parseURIUTF8Test() { testProtocol(testData); testHost(testData); testQuery(testData); + testQuery(testData, "query"); } @Test @@ -178,6 +214,7 @@ void parseURIIP4Test() { testProtocol(testData); testHost(testData); testQuery(testData); + testQuery(testData, "query"); } @Test @@ -206,5 +243,6 @@ void parseURIIP6Test() { testProtocol(testData); testHost(testData); testQuery(testData); + testQuery(testData, "query"); } } From 34536f8c10c18aa92d0f792e32218ed8cbd30083 Mon Sep 17 00:00:00 2001 From: Jenkins Automation <70000568+nvauto@users.noreply.github.com> Date: Thu, 25 Jan 2024 05:33:25 +0800 Subject: [PATCH 114/127] Update submodule cudf to f800f5a2fa9a961699345e6febe740b4b8f4760e (#1729) Signed-off-by: spark-rapids automation <70000568+nvauto@users.noreply.github.com> --- thirdparty/cudf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/thirdparty/cudf b/thirdparty/cudf index 67a36a9104..f800f5a2fa 160000 --- a/thirdparty/cudf +++ b/thirdparty/cudf @@ -1 +1 @@ -Subproject commit 67a36a9104097cd6a8ae6efee1018e249f2fe441 +Subproject commit f800f5a2fa9a961699345e6febe740b4b8f4760e From 1d27b332b1252031ff1efd412e4f98a556c37daf Mon Sep 17 00:00:00 2001 From: Mike Wilson Date: Wed, 24 Jan 2024 18:41:35 -0500 Subject: [PATCH 115/127] Add support for `parse_uri` to limit query with a column (#1719) * Adding support for parse uri for query with a column for keys Signed-off-by: Mike Wilson Co-authored-by: nvdbaranec <56695930+nvdbaranec@users.noreply.github.com> --- src/main/cpp/src/ParseURIJni.cpp | 15 +++ src/main/cpp/src/parse_uri.cu | 17 +++- src/main/cpp/src/parse_uri.hpp | 15 +++ src/main/cpp/tests/parse_uri.cpp | 11 +++ .../com/nvidia/spark/rapids/jni/ParseURI.java | 22 ++++- .../nvidia/spark/rapids/jni/ParseURITest.java | 92 +++++++++++++++++++ 6 files changed, 167 insertions(+), 5 deletions(-) diff --git a/src/main/cpp/src/ParseURIJni.cpp b/src/main/cpp/src/ParseURIJni.cpp index c688d10736..354d47c424 100644 --- a/src/main/cpp/src/ParseURIJni.cpp +++ b/src/main/cpp/src/ParseURIJni.cpp @@ -77,4 +77,19 @@ JNIEXPORT jlong JNICALL Java_com_nvidia_spark_rapids_jni_ParseURI_parseQueryWith } CATCH_STD(env, 0); } + +JNIEXPORT jlong JNICALL Java_com_nvidia_spark_rapids_jni_ParseURI_parseQueryWithColumn( + JNIEnv* env, jclass, jlong input_column, jlong query_column) +{ + JNI_NULL_CHECK(env, input_column, "input column is null", 0); + JNI_NULL_CHECK(env, query_column, "query column is null", 0); + + try { + cudf::jni::auto_set_device(env); + auto const input = reinterpret_cast(input_column); + auto const query = reinterpret_cast(query_column); + return cudf::jni::ptr_as_jlong(spark_rapids_jni::parse_uri_to_query(*input, *query).release()); + } + CATCH_STD(env, 0); +} } diff --git a/src/main/cpp/src/parse_uri.cu b/src/main/cpp/src/parse_uri.cu index 4d21617fd7..cd64c539ef 100644 --- a/src/main/cpp/src/parse_uri.cu +++ b/src/main/cpp/src/parse_uri.cu @@ -629,7 +629,11 @@ uri_parts __device__ validate_uri(const char* str, // passed as param0, the return would simply be 5. 
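The parameter-matching rule restated in the comment above can be mirrored host-side. A minimal Java sketch (illustrative only; `queryParam` is a hypothetical helper, not an API in this repository — it assumes a key matches only a whole parameter name followed by '=', and that a value runs to the next '&'):

    class QueryParamSketch {
      // Returns the value for `key` within `query` (e.g. "param0=5&param1=2"),
      // or null when the key is absent or empty; mirrors the device-side rule.
      static String queryParam(String query, String key) {
        for (String pair : query.split("&", -1)) {
          int eq = pair.indexOf('=');
          if (eq > 0 && pair.substring(0, eq).equals(key)) {
            return pair.substring(eq + 1);
          }
        }
        return null;
      }

      public static void main(String[] args) {
        System.out.println(queryParam("param0=5&param1=2", "param0")); // prints 5
        System.out.println(queryParam("param0=5&param1=2", "param"));  // prints null
      }
    }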
if (query_match && query_match->size() > 0) { auto const match_idx = row_idx % query_match->size(); - auto in_match = query_match->element(match_idx); + if (query_match->is_null(match_idx)) { + ret.valid = 0; + return ret; + } + auto in_match = query_match->element(match_idx); auto const [query, valid] = find_query_part(ret.query, in_match); if (!valid) { @@ -993,4 +997,15 @@ std::unique_ptr parse_uri_to_query(cudf::strings_column_view const return detail::parse_uri(input, detail::URI_chunks::QUERY, strings_column_view(*col), stream, mr); } +std::unique_ptr parse_uri_to_query(cudf::strings_column_view const& input, + cudf::strings_column_view const& query_match, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + CUDF_FUNC_RANGE(); + CUDF_EXPECTS(input.size() == query_match.size(), "Query column must be the same size as input!"); + + return detail::parse_uri(input, detail::URI_chunks::QUERY, query_match, stream, mr); +} + } // namespace spark_rapids_jni diff --git a/src/main/cpp/src/parse_uri.hpp b/src/main/cpp/src/parse_uri.hpp index bb001e3167..004d800ddb 100644 --- a/src/main/cpp/src/parse_uri.hpp +++ b/src/main/cpp/src/parse_uri.hpp @@ -80,4 +80,19 @@ std::unique_ptr parse_uri_to_query( rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +/** + * @brief Parse query and copy from the input string column to the output string column. + * + * @param input Input string column of URIs to parse. + * @param query_match string column to match in query. + * @param stream Stream on which to operate. + * @param mr Memory resource for returned column. + * @return std::unique_ptr String column of queries parsed. + */ +std::unique_ptr parse_uri_to_query( + cudf::strings_column_view const& input, + cudf::strings_column_view const& query_match, + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + } // namespace spark_rapids_jni diff --git a/src/main/cpp/tests/parse_uri.cpp b/src/main/cpp/tests/parse_uri.cpp index 234ad380c7..09f238e18c 100644 --- a/src/main/cpp/tests/parse_uri.cpp +++ b/src/main/cpp/tests/parse_uri.cpp @@ -395,4 +395,15 @@ TEST_F(ParseURIQueryTests, Queries) CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected, result->view()); } + { + cudf::test::strings_column_wrapper const query( + {"param0", "q", "a", "invalid", "test", "query", "fakeparam0", "C"}); + cudf::test::strings_column_wrapper const expected({"1", "", "b", "param", "", "1", "5", "C"}, + {1, 0, 1, 1, 0, 1, 1, 1}); + + auto const result = spark_rapids_jni::parse_uri_to_query(cudf::strings_column_view{col}, + cudf::strings_column_view{query}); + + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected, result->view()); + } } diff --git a/src/main/java/com/nvidia/spark/rapids/jni/ParseURI.java b/src/main/java/com/nvidia/spark/rapids/jni/ParseURI.java index e9908f9ea5..6de84ea519 100644 --- a/src/main/java/com/nvidia/spark/rapids/jni/ParseURI.java +++ b/src/main/java/com/nvidia/spark/rapids/jni/ParseURI.java @@ -72,8 +72,22 @@ public static ColumnVector parseURIQueryWithLiteral(ColumnView uriColumn, String return new ColumnVector(parseQueryWithLiteral(uriColumn.getNativeView(), query)); } - private static native long parseProtocol(long jsonColumnHandle); - private static native long parseHost(long jsonColumnHandle); - private static native long parseQuery(long jsonColumnHandle); - private static native long parseQueryWithLiteral(long 
jsonColumnHandle, String query); + /** + * Parse query and return a specific parameter for each URI from the incoming column. + * + * @param URIColumn The input strings column in which each row contains a URI. + * @param String The parameter to extract from the query + * @return A string column with query data extracted. + */ + public static ColumnVector parseURIQueryWithColumn(ColumnView uriColumn, ColumnView queryColumn) { + assert uriColumn.getType().equals(DType.STRING) : "Input type must be String"; + assert queryColumn.getType().equals(DType.STRING) : "Query type must be String"; + return new ColumnVector(parseQueryWithColumn(uriColumn.getNativeView(), queryColumn.getNativeView())); + } + + private static native long parseProtocol(long inputColumnHandle); + private static native long parseHost(long inputColumnHandle); + private static native long parseQuery(long inputColumnHandle); + private static native long parseQueryWithLiteral(long inputColumnHandle, String query); + private static native long parseQueryWithColumn(long inputColumnHandle, long queryColumnHandle); } diff --git a/src/test/java/com/nvidia/spark/rapids/jni/ParseURITest.java b/src/test/java/com/nvidia/spark/rapids/jni/ParseURITest.java index c79633008c..f8ed45c704 100644 --- a/src/test/java/com/nvidia/spark/rapids/jni/ParseURITest.java +++ b/src/test/java/com/nvidia/spark/rapids/jni/ParseURITest.java @@ -124,6 +124,41 @@ void testQuery(String[] testData, String param) { } } + void testQuery(String[] testData, String[] params) { + String[] expectedQueryStrings = new String[testData.length]; + for (int i=0; i 0 && pair.substring(0, idx).equals(params[i])) { + subquery = pair.substring(idx + 1); + break; + } + } + } + expectedQueryStrings[i] = subquery; + } + try (ColumnVector v0 = ColumnVector.fromStrings(testData); + ColumnVector p0 = ColumnVector.fromStrings(params); + ColumnVector expectedQuery = ColumnVector.fromStrings(expectedQueryStrings); + ColumnVector queryResult = ParseURI.parseURIQueryWithColumn(v0, p0)) { + AssertUtils.assertColumnsAreEqual(expectedQuery, queryResult); + } + } + @Test void parseURISparkTest() { String[] testData = { @@ -180,11 +215,68 @@ void parseURISparkTest() { "userinfo@www.nvidia.com/path?query=1#Ref", "", null}; + + + String[] queries = { + "a", + "h", + // commented out until https://github.com/NVIDIA/spark-rapids/issues/10036 is fixed + //"object", + "object", + "a", + "h", + "a", + "f", + "g", + "a", + "a", + "f", + "g", + "a", + "a", + "b", + "a", + "", + "a", + "a", + "a", + "a", + "b", + "a", + "q", + "b", + "a", + "query", + "a", + "primekey_in", + "a", + "q", + "ExpertId", + "query", + "solutionId", + "f", + "param", + "", + "q", + "a", + "f", + "mnid=5080", + "f", + "a", + "param4", + "cloth", + "a", + "invalid", + "invalid", + "query", + "a", + "f"}; testProtocol(testData); testHost(testData); testQuery(testData); testQuery(testData, "query"); + testQuery(testData, queries); } @Test From 8cb6913b5c672c82b97da0908f8af3d977fca821 Mon Sep 17 00:00:00 2001 From: Jenkins Automation <70000568+nvauto@users.noreply.github.com> Date: Thu, 25 Jan 2024 11:28:15 +0800 Subject: [PATCH 116/127] Update submodule cudf to 5b1eef31ed4c5935285ef780dc74d35cea086b49 (#1732) Signed-off-by: spark-rapids automation <70000568+nvauto@users.noreply.github.com> --- thirdparty/cudf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/thirdparty/cudf b/thirdparty/cudf index f800f5a2fa..5b1eef31ed 160000 --- a/thirdparty/cudf +++ b/thirdparty/cudf @@ -1 +1 @@ -Subproject commit 
f800f5a2fa9a961699345e6febe740b4b8f4760e +Subproject commit 5b1eef31ed4c5935285ef780dc74d35cea086b49 From 95fcad879802613c21d0ed89d8656970b8869cad Mon Sep 17 00:00:00 2001 From: Jenkins Automation <70000568+nvauto@users.noreply.github.com> Date: Fri, 26 Jan 2024 06:06:39 +0800 Subject: [PATCH 117/127] Update submodule cudf to 0cd58fbec63d5e461b487e7e37aa9942ebe0f116 (#1736) Signed-off-by: spark-rapids automation <70000568+nvauto@users.noreply.github.com> --- thirdparty/cudf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/thirdparty/cudf b/thirdparty/cudf index 5b1eef31ed..0cd58fbec6 160000 --- a/thirdparty/cudf +++ b/thirdparty/cudf @@ -1 +1 @@ -Subproject commit 5b1eef31ed4c5935285ef780dc74d35cea086b49 +Subproject commit 0cd58fbec63d5e461b487e7e37aa9942ebe0f116 From cd0b85761f2a0db353798a542d4b4589856fdfa0 Mon Sep 17 00:00:00 2001 From: Jenkins Automation <70000568+nvauto@users.noreply.github.com> Date: Fri, 26 Jan 2024 11:02:49 +0800 Subject: [PATCH 118/127] Update submodule cudf to 821f4dea107db6a51fcbffff997fa6844ab5565f (#1738) Signed-off-by: spark-rapids automation <70000568+nvauto@users.noreply.github.com> --- thirdparty/cudf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/thirdparty/cudf b/thirdparty/cudf index 0cd58fbec6..821f4dea10 160000 --- a/thirdparty/cudf +++ b/thirdparty/cudf @@ -1 +1 @@ -Subproject commit 0cd58fbec63d5e461b487e7e37aa9942ebe0f116 +Subproject commit 821f4dea107db6a51fcbffff997fa6844ab5565f From 9c34fae6e919d5d4ce76d7384b086b9ba69648ea Mon Sep 17 00:00:00 2001 From: Haoyang Li Date: Fri, 26 Jan 2024 20:57:28 +0800 Subject: [PATCH 119/127] Fix a parse_uri query bug (#1740) * Fix a parse_uri query bug Signed-off-by: Haoyang Li * Add comments Signed-off-by: Haoyang Li --------- Signed-off-by: Haoyang Li --- src/main/cpp/src/parse_uri.cu | 9 +++++---- .../com/nvidia/spark/rapids/jni/ParseURITest.java | 13 ++++++++++--- 2 files changed, 15 insertions(+), 7 deletions(-) diff --git a/src/main/cpp/src/parse_uri.cu b/src/main/cpp/src/parse_uri.cu index cd64c539ef..13c4050404 100644 --- a/src/main/cpp/src/parse_uri.cu +++ b/src/main/cpp/src/parse_uri.cu @@ -500,9 +500,10 @@ __device__ std::pair find_query_part(string_view haystack, st auto h = haystack.data(); auto const end_h = haystack.data() + find_length; auto n = needle.data(); + bool match = false; while (h < end_h) { - bool match = true; - for (size_type jdx = 0; match && (jdx < n_bytes); ++jdx) { + match = false; // initialize to false to prevent empty query key + for (size_type jdx = 0; (jdx == 0 || match) && (jdx < n_bytes); ++jdx) { match = (h[jdx] == n[jdx]); } if (match) { match = n_bytes < haystack.size_bytes() && h[n_bytes] == '='; } @@ -519,8 +520,8 @@ __device__ std::pair find_query_part(string_view haystack, st h++; } - // if h is past the end of the haystack, no match. 
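The initialization change above closes an edge case: with the old `bool match = true;`, an empty match key (`n_bytes == 0`) skipped the comparison loop entirely and fell through as a successful match, so a URI like `vote.php?=50` returned `50` for an empty key. Starting `match` at `false` and forcing the first loop iteration with `jdx == 0` keeps nonempty keys working while rejecting empty ones. A distilled Java sketch of the corrected predicate (illustrative only, not code from this repository):

    class EmptyKeyFix {
      // A key matches only when it is nonempty, equals the whole parameter name,
      // and is immediately followed by '='.
      static boolean matches(String pair, String key) {
        return !key.isEmpty() && pair.length() > key.length()
            && pair.startsWith(key) && pair.charAt(key.length()) == '=';
      }

      public static void main(String[] args) {
        System.out.println(matches("=50", ""));            // false (effectively matched before the fix)
        System.out.println(matches("param0=5", "param0")); // true
      }
    }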
From 9d50ce52e827fde86bae03ba1ab7b584e3a717dc Mon Sep 17 00:00:00 2001
From: Haoyang Li <haoyangl@nvidia.com>
Date: Mon, 29 Jan 2024 15:13:47 +0800
Subject: [PATCH 120/127] Fix build warnings of chars and make_strings_column
 (#1725)

Signed-off-by: Haoyang Li <haoyangl@nvidia.com>
---
 src/main/cpp/benchmarks/common/generate_input.cu | 2 +-
 src/main/cpp/src/cast_decimal_to_string.cu       | 2 +-
 src/main/cpp/src/cast_float_to_string.cu         | 2 +-
 src/main/cpp/src/cast_string.cu                  | 6 +++---
 src/main/cpp/src/format_float.cu                 | 2 +-
 src/main/cpp/src/map_utils.cu                    | 7 +++++--
 6 files changed, 12 insertions(+), 9 deletions(-)

diff --git a/src/main/cpp/benchmarks/common/generate_input.cu b/src/main/cpp/benchmarks/common/generate_input.cu
index 3b1376c89a..75f0a8fca0 100644
--- a/src/main/cpp/benchmarks/common/generate_input.cu
+++ b/src/main/cpp/benchmarks/common/generate_input.cu
@@ -520,7 +520,7 @@ std::unique_ptr<cudf::column> create_random_utf8_string_column(data_profile cons
   return cudf::make_strings_column(
     num_rows,
     std::move(offsets),
-    std::move(chars),
+    std::move(chars->release().data.release()[0]),
     profile.get_null_frequency().has_value() ? std::move(result_bitmask) : rmm::device_buffer{},
     null_count);
 }
diff --git a/src/main/cpp/src/cast_decimal_to_string.cu b/src/main/cpp/src/cast_decimal_to_string.cu
index 0cd2713a2f..01fc5c5a92 100644
--- a/src/main/cpp/src/cast_decimal_to_string.cu
+++ b/src/main/cpp/src/cast_decimal_to_string.cu
@@ -191,7 +191,7 @@ struct dispatch_decimal_to_non_ansi_string_fn {
 
     return make_strings_column(input.size(),
                                std::move(offsets),
-                               std::move(chars),
+                               std::move(chars->release().data.release()[0]),
                                input.null_count(),
                                cudf::detail::copy_bitmask(input, stream, mr));
   }
diff --git a/src/main/cpp/src/cast_float_to_string.cu b/src/main/cpp/src/cast_float_to_string.cu
index 6fc4d20f79..c0e0875914 100644
--- a/src/main/cpp/src/cast_float_to_string.cu
+++ b/src/main/cpp/src/cast_float_to_string.cu
@@ -88,7 +88,7 @@ struct dispatch_float_to_string_fn {
 
     return make_strings_column(strings_count,
                                std::move(offsets),
-                               std::move(chars),
+                               std::move(chars->release().data.release()[0]),
                                floats.null_count(),
                                cudf::detail::copy_bitmask(floats, stream, mr));
   }
diff --git a/src/main/cpp/src/cast_string.cu b/src/main/cpp/src/cast_string.cu
index 59a27a59b7..c2a8190062 100644
--- a/src/main/cpp/src/cast_string.cu
+++ b/src/main/cpp/src/cast_string.cu
@@ -624,7 +624,7 @@ void validate_ansi_column(column_view const& col,
     dest.resize(string_bounds[1] - string_bounds[0]);
 
     cudaMemcpyAsync(dest.data(),
-                    &source_col.chars().data()[string_bounds[0]],
+                    &source_col.chars_begin(stream)[string_bounds[0]],
                     string_bounds[1] - string_bounds[0],
                     cudaMemcpyDeviceToHost,
                     stream.value());
@@ -667,7 +667,7 @@ struct string_to_integer_impl {
     detail::string_to_integer_kernel<<>>(
       data.data(),
       null_mask.data(),
-      string_col.chars().data(),
+      string_col.chars_begin(stream),
       string_col.offsets().data(),
       string_col.null_mask(),
       string_col.size(),
@@ -736,7 +736,7 @@ struct string_to_decimal_impl {
     detail::string_to_decimal_kernel<<>>(
       data.data(),
       null_mask.data(),
-      string_col.chars().data(),
+      string_col.chars_begin(stream),
       string_col.offsets().data(),
       string_col.null_mask(),
       string_col.size(),
diff --git a/src/main/cpp/src/format_float.cu b/src/main/cpp/src/format_float.cu
index d9ecbe8206..3052d334aa 100644
--- a/src/main/cpp/src/format_float.cu
+++ b/src/main/cpp/src/format_float.cu
@@ -89,7 +89,7 @@ struct dispatch_format_float_fn {
 
     return cudf::make_strings_column(strings_count,
                                      std::move(offsets),
-                                     std::move(chars),
+                                     std::move(chars->release().data.release()[0]),
                                      floats.null_count(),
                                      cudf::detail::copy_bitmask(floats, stream, mr));
   }
diff --git a/src/main/cpp/src/map_utils.cu b/src/main/cpp/src/map_utils.cu
index 002dadb0e3..9b420e201f 100644
--- a/src/main/cpp/src/map_utils.cu
+++ b/src/main/cpp/src/map_utils.cu
@@ -575,8 +575,11 @@ std::unique_ptr<cudf::column> extract_keys_or_values(
   auto children = cudf::strings::detail::make_strings_children(
     substring_fn{unified_json_buff, extract_ranges}, num_extract, stream, mr);
 
-  return cudf::make_strings_column(
-    num_extract, std::move(children.first), std::move(children.second), 0, rmm::device_buffer{});
+  return cudf::make_strings_column(num_extract,
+                                   std::move(children.first),
+                                   std::move(children.second->release().data.release()[0]),
+                                   0,
+                                   rmm::device_buffer{});
 }
 
 // Compute the offsets for the final lists of Struct.
From c776bd954e53fea65a513bdda8456b39c84400c4 Mon Sep 17 00:00:00 2001
From: Haoyang Li <haoyangl@nvidia.com>
Date: Tue, 30 Jan 2024 03:37:28 +0800
Subject: [PATCH 121/127] Fix an out-of-bounds bug in parse_url query (#1746)

---
 src/main/cpp/src/parse_uri.cu                      | 64 +++++++++----------
 .../nvidia/spark/rapids/jni/ParseURITest.java      |  4 +-
 2 files changed, 32 insertions(+), 36 deletions(-)

diff --git a/src/main/cpp/src/parse_uri.cu b/src/main/cpp/src/parse_uri.cu
index 13c4050404..0e6ea2690d 100644
--- a/src/main/cpp/src/parse_uri.cu
+++ b/src/main/cpp/src/parse_uri.cu
@@ -494,48 +494,42 @@ bool __device__ validate_fragment(string_view fragment)
 
 __device__ std::pair<string_view, bool> find_query_part(string_view haystack, string_view needle)
 {
-  auto const n_bytes     = needle.size_bytes();
-  auto const find_length = haystack.size_bytes() - n_bytes + 1;
-
-  auto h           = haystack.data();
-  auto const end_h = haystack.data() + find_length;
-  auto n           = needle.data();
-  bool match       = false;
-  while (h < end_h) {
-    match = false;  // initialize to false to prevent empty query key
-    for (size_type jdx = 0; (jdx == 0 || match) && (jdx < n_bytes); ++jdx) {
-      match = (h[jdx] == n[jdx]);
+  auto const n_bytes = needle.size_bytes();
+  auto h             = haystack.data();
+  auto const h_end   = h + haystack.size_bytes();
+  auto n             = needle.data();
+
+  // stop matching early after it can no longer contain the string we are searching for
+  while (h + n_bytes < h_end) {
+    bool match_needle = true;
+    for (size_type jdx = 0; jdx < n_bytes; ++jdx) {
+      match_needle = (h[jdx] == n[jdx]);
+      if (!match_needle) { break; }
     }
-    if (match) { match = n_bytes < haystack.size_bytes() && h[n_bytes] == '='; }
-    if (match) {
-      // we don't care about the matched part, we want the string data after that.
-      h += n_bytes;
-      break;
-    } else {
-      // skip to the next param, which is after a &.
-      while (h < end_h && *h != '&') {
+
+    if (match_needle && h[n_bytes] == '=') {
+      // we don't care about the matched part, we want the string data after '='
+      h += n_bytes + 1;
+
+      // rest of string until end or until '&' is query match
+      int match_len = 0;
+      auto start = h;
+      while (h < h_end && *h != '&') {
+        match_len++;
         h++;
       }
-    }
-    h++;
-  }
-
-  // if not match or no value, return nothing
-  if (!match || *h != '=') { return {{}, false}; }
 
-  // skip over the =
-  h++;
+      return {{start, match_len}, true};
+    }
 
-  // rest of string until end or until '&' is query match
-  auto const bytes_left = haystack.size_bytes() - (h - haystack.data());
-  int match_len = 0;
-  auto start = h;
-  while (*h != '&' && match_len < bytes_left) {
-    ++match_len;
-    ++h;
+    // not match, skip to the next param if possible, which is after a &.
+    while (h + n_bytes < h_end && *h != '&') {
+      h++;
+    }
+    h++;  // skip over the & if has, or point to h_end +1
   }
 
-  return {{start, match_len}, true};
+  return {{}, false};
 }
 
 uri_parts __device__ validate_uri(const char* str,
diff --git a/src/test/java/com/nvidia/spark/rapids/jni/ParseURITest.java b/src/test/java/com/nvidia/spark/rapids/jni/ParseURITest.java
index 8f9fcfd903..ffe7e9e946 100644
--- a/src/test/java/com/nvidia/spark/rapids/jni/ParseURITest.java
+++ b/src/test/java/com/nvidia/spark/rapids/jni/ParseURITest.java
@@ -143,7 +143,7 @@ void testQuery(String[] testData, String[] params) {
         String[] pairs = query.split("&");
         for (String pair : pairs) {
           int idx = pair.indexOf("=");
-          if (idx > 0 && pair.substring(0, idx).equals(params[i])) {
+          if (idx >= 0 && pair.substring(0, idx).equals(params[i])) {
            subquery = pair.substring(idx + 1);
             break;
           }
@@ -218,6 +218,7 @@ void parseURISparkTest() {
       "https://www.nvidia.com/?cat=12",
       "www.nvidia.com/vote.php?pid=50",
       "https://www.nvidia.com/vote.php?=50",
+      "https://www.nvidia.com/vote.php?query=50"
       };
 
     String[] queries = {
@@ -276,6 +277,7 @@ void parseURISparkTest() {
       "f",
       "query",
       "query",
+      "",
       ""
       };
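Note: the rewrite above requires h + n_bytes < h_end before comparing, so the
look-ahead at h[n_bytes] can no longer read past the end of the string when
only a short tail of the query remains. A sketch of the expected positive case
(class name and values are illustrative only):

    import ai.rapids.cudf.ColumnVector;
    import com.nvidia.spark.rapids.jni.ParseURI;

    public class QueryBoundsExample {
      public static void main(String[] args) {
        try (ColumnVector uris = ColumnVector.fromStrings("https://www.nvidia.com/vote.php?query=50");
             ColumnVector params = ColumnVector.fromStrings("query");
             ColumnVector result = ParseURI.parseURIQueryWithColumn(uris, params)) {
          // result row 0 is "50": the key "query" matches, and the value scan
          // stops at h_end without touching memory past the string
        }
      }
    }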
From f0c63a84cdb8ba0f3b0e633cd804dd049fb04309 Mon Sep 17 00:00:00 2001
From: Jenkins Automation <70000568+nvauto@users.noreply.github.com>
Date: Tue, 30 Jan 2024 07:02:31 +0800
Subject: [PATCH 122/127] Update submodule cudf to
 fc2b9771f17644243817a339e218360aa97a1a79 (#1751)

Signed-off-by: spark-rapids automation <70000568+nvauto@users.noreply.github.com>
---
 thirdparty/cudf | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/thirdparty/cudf b/thirdparty/cudf
index 821f4dea10..fc2b9771f1 160000
--- a/thirdparty/cudf
+++ b/thirdparty/cudf
@@ -1 +1 @@
-Subproject commit 821f4dea107db6a51fcbffff997fa6844ab5565f
+Subproject commit fc2b9771f17644243817a339e218360aa97a1a79

From d44c477e0276f2032e5c9ec341cc4f452f2cc28f Mon Sep 17 00:00:00 2001
From: Tim Liu <timl@nvidia.com>
Date: Fri, 2 Feb 2024 16:32:51 +0800
Subject: [PATCH 123/127] Change version to 24.02.0

Signed-off-by: Tim Liu <timl@nvidia.com>
---
 pom.xml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pom.xml b/pom.xml
index 4f2d19e45c..5bedec16b6 100644
--- a/pom.xml
+++ b/pom.xml
@@ -21,7 +21,7 @@
 
   <groupId>com.nvidia</groupId>
   <artifactId>spark-rapids-jni</artifactId>
-  <version>24.02.0-SNAPSHOT</version>
+  <version>24.02.0</version>
   <packaging>jar</packaging>
   <name>RAPIDS Accelerator JNI for Apache Spark</name>

From f53ea02cd3f44544d57da2314feeb7f74529ef24 Mon Sep 17 00:00:00 2001
From: Tim Liu <timl@nvidia.com>
Date: Fri, 2 Feb 2024 17:10:06 +0800
Subject: [PATCH 124/127] Update copyright

Signed-off-by: Tim Liu <timl@nvidia.com>
---
 pom.xml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pom.xml b/pom.xml
index 5bedec16b6..f9b48e6c40 100644
--- a/pom.xml
+++ b/pom.xml
@@ -1,6 +1,6 @@