From 28bac3b995ae6c543f23df624e6de12e206ed7f0 Mon Sep 17 00:00:00 2001 From: spark-rapids automation <70000568+nvauto@users.noreply.github.com> Date: Tue, 14 Nov 2023 18:13:19 +0000 Subject: [PATCH 001/127] Auto-merge use submodule in BASE ref Signed-off-by: spark-rapids automation <70000568+nvauto@users.noreply.github.com> --- thirdparty/cudf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/thirdparty/cudf b/thirdparty/cudf index 7f3fba164c..4313cfa9b3 160000 --- a/thirdparty/cudf +++ b/thirdparty/cudf @@ -1 +1 @@ -Subproject commit 7f3fba164c4dd28c701ea2941d0525fc782a639c +Subproject commit 4313cfa9b3fcff41f67b48ac8797dc015d441ecc From 4520be90ff72bf7b8077cc236dcfbe27fc8f16f2 Mon Sep 17 00:00:00 2001 From: spark-rapids automation <70000568+nvauto@users.noreply.github.com> Date: Wed, 15 Nov 2023 00:22:23 +0000 Subject: [PATCH 002/127] Auto-merge use submodule in BASE ref Signed-off-by: spark-rapids automation <70000568+nvauto@users.noreply.github.com> --- thirdparty/cudf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/thirdparty/cudf b/thirdparty/cudf index b446a6f187..4313cfa9b3 160000 --- a/thirdparty/cudf +++ b/thirdparty/cudf @@ -1 +1 @@ -Subproject commit b446a6f187241e765c925da1053ece2679313a06 +Subproject commit 4313cfa9b3fcff41f67b48ac8797dc015d441ecc From a14a15832d3a2397e152e6746f454f53b62344bb Mon Sep 17 00:00:00 2001 From: spark-rapids automation <70000568+nvauto@users.noreply.github.com> Date: Wed, 15 Nov 2023 03:13:28 +0000 Subject: [PATCH 003/127] Auto-merge use submodule in BASE ref Signed-off-by: spark-rapids automation <70000568+nvauto@users.noreply.github.com> --- thirdparty/cudf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/thirdparty/cudf b/thirdparty/cudf index 330d389b26..4313cfa9b3 160000 --- a/thirdparty/cudf +++ b/thirdparty/cudf @@ -1 +1 @@ -Subproject commit 330d389b26a05676d9f079503a3d96b571762337 +Subproject commit 4313cfa9b3fcff41f67b48ac8797dc015d441ecc From 000b1cdf655e9591d07171c18e211199480ead43 Mon Sep 17 00:00:00 2001 From: spark-rapids automation <70000568+nvauto@users.noreply.github.com> Date: Wed, 15 Nov 2023 05:11:43 +0000 Subject: [PATCH 004/127] Auto-merge use submodule in BASE ref Signed-off-by: spark-rapids automation <70000568+nvauto@users.noreply.github.com> --- thirdparty/cudf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/thirdparty/cudf b/thirdparty/cudf index 8a0a08f34f..4313cfa9b3 160000 --- a/thirdparty/cudf +++ b/thirdparty/cudf @@ -1 +1 @@ -Subproject commit 8a0a08f34ff804a7329ea640aa1e0a9b188d2162 +Subproject commit 4313cfa9b3fcff41f67b48ac8797dc015d441ecc From 80fea2665b4418b802dfb42d20c75eda064bd930 Mon Sep 17 00:00:00 2001 From: spark-rapids automation <70000568+nvauto@users.noreply.github.com> Date: Tue, 21 Nov 2023 04:46:17 +0000 Subject: [PATCH 005/127] Auto-merge use submodule in BASE ref Signed-off-by: spark-rapids automation <70000568+nvauto@users.noreply.github.com> --- thirdparty/cudf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/thirdparty/cudf b/thirdparty/cudf index 8a0a08f34f..4313cfa9b3 160000 --- a/thirdparty/cudf +++ b/thirdparty/cudf @@ -1 +1 @@ -Subproject commit 8a0a08f34ff804a7329ea640aa1e0a9b188d2162 +Subproject commit 4313cfa9b3fcff41f67b48ac8797dc015d441ecc From 6a7991ea70370cd66c55eccd8fd8daf53cb30fed Mon Sep 17 00:00:00 2001 From: Peixin Date: Tue, 21 Nov 2023 13:58:47 +0800 Subject: [PATCH 006/127] Initiate Version 24.02.0-SNAPSHOT (#1562) * Initiate version 24.02.0-SNAPSHOT Signed-off-by: Peixin Li * update cudf
24.02 to latest commit * update cudf submodule ref * update cudf commit --------- Signed-off-by: Peixin Li --- .gitmodules | 2 +- CONTRIBUTING.md | 2 +- pom.xml | 2 +- src/main/cpp/CMakeLists.txt | 2 +- thirdparty/cudf | 2 +- 5 files changed, 5 insertions(+), 5 deletions(-) diff --git a/.gitmodules b/.gitmodules index 5051589232..6b6f69d695 100644 --- a/.gitmodules +++ b/.gitmodules @@ -1,4 +1,4 @@ [submodule "thirdparty/cudf"] path = thirdparty/cudf url = https://github.com/rapidsai/cudf.git - branch = branch-23.12 + branch = branch-24.02 diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 9c230cf1c6..13edca987a 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -149,7 +149,7 @@ $ ./build/build-in-docker install ... ``` Now cd to ~/repos/NVIDIA/spark-rapids and build with one of the options from -[spark-rapids instructions](https://github.com/NVIDIA/spark-rapids/blob/branch-23.12/CONTRIBUTING.md#building-from-source). +[spark-rapids instructions](https://github.com/NVIDIA/spark-rapids/blob/branch-24.02/CONTRIBUTING.md#building-from-source). ```bash $ ./build/buildall diff --git a/pom.xml b/pom.xml index ba03282637..4f2d19e45c 100644 --- a/pom.xml +++ b/pom.xml @@ -21,7 +21,7 @@ com.nvidia spark-rapids-jni - 23.12.0-SNAPSHOT + 24.02.0-SNAPSHOT jar RAPIDS Accelerator JNI for Apache Spark diff --git a/src/main/cpp/CMakeLists.txt b/src/main/cpp/CMakeLists.txt index 706bcfa30f..4c90b8de82 100644 --- a/src/main/cpp/CMakeLists.txt +++ b/src/main/cpp/CMakeLists.txt @@ -32,7 +32,7 @@ rapids_cuda_init_architectures(SPARK_RAPIDS_JNI) project( SPARK_RAPIDS_JNI - VERSION 23.12.00 + VERSION 24.02.00 LANGUAGES C CXX CUDA ) diff --git a/thirdparty/cudf b/thirdparty/cudf index 4313cfa9b3..947081f5b1 160000 --- a/thirdparty/cudf +++ b/thirdparty/cudf @@ -1 +1 @@ -Subproject commit 4313cfa9b3fcff41f67b48ac8797dc015d441ecc +Subproject commit 947081f5b10ca972826942b84c5c2530050325d8 From a94a28fd76fc4fe882ba968e19dd8f2ff2287297 Mon Sep 17 00:00:00 2001 From: spark-rapids automation <70000568+nvauto@users.noreply.github.com> Date: Tue, 21 Nov 2023 06:23:10 +0000 Subject: [PATCH 007/127] Auto-merge use submodule in BASE ref Signed-off-by: spark-rapids automation <70000568+nvauto@users.noreply.github.com> --- thirdparty/cudf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/thirdparty/cudf b/thirdparty/cudf index 8a0a08f34f..947081f5b1 160000 --- a/thirdparty/cudf +++ b/thirdparty/cudf @@ -1 +1 @@ -Subproject commit 8a0a08f34ff804a7329ea640aa1e0a9b188d2162 +Subproject commit 947081f5b10ca972826942b84c5c2530050325d8 From 213f986a6e0ae50f0511e06aa34f3ca12bc3c869 Mon Sep 17 00:00:00 2001 From: spark-rapids automation <70000568+nvauto@users.noreply.github.com> Date: Tue, 21 Nov 2023 11:03:12 +0000 Subject: [PATCH 008/127] Auto-merge use submodule in BASE ref Signed-off-by: spark-rapids automation <70000568+nvauto@users.noreply.github.com> --- thirdparty/cudf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/thirdparty/cudf b/thirdparty/cudf index 823d3214a9..947081f5b1 160000 --- a/thirdparty/cudf +++ b/thirdparty/cudf @@ -1 +1 @@ -Subproject commit 823d3214a9489e3c496aa31041b5d29f650e94b3 +Subproject commit 947081f5b10ca972826942b84c5c2530050325d8 From f18a5c2359932c7660f6ab8be6d28f9d23d7dc54 Mon Sep 17 00:00:00 2001 From: spark-rapids automation <70000568+nvauto@users.noreply.github.com> Date: Tue, 21 Nov 2023 15:39:37 +0000 Subject: [PATCH 009/127] Auto-merge use submodule in BASE ref Signed-off-by: spark-rapids automation <70000568+nvauto@users.noreply.github.com> --- 
thirdparty/cudf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/thirdparty/cudf b/thirdparty/cudf index 823d3214a9..947081f5b1 160000 --- a/thirdparty/cudf +++ b/thirdparty/cudf @@ -1 +1 @@ -Subproject commit 823d3214a9489e3c496aa31041b5d29f650e94b3 +Subproject commit 947081f5b10ca972826942b84c5c2530050325d8 From 1df862f55745918dde04769b4c69c5e67331326c Mon Sep 17 00:00:00 2001 From: Jenkins Automation <70000568+nvauto@users.noreply.github.com> Date: Wed, 22 Nov 2023 05:30:30 +0800 Subject: [PATCH 010/127] Update submodule cudf to fcc89503c1f1e15ec287519959013adcf2bf8a52 (#1586) Signed-off-by: spark-rapids automation <70000568+nvauto@users.noreply.github.com> --- thirdparty/cudf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/thirdparty/cudf b/thirdparty/cudf index 947081f5b1..fcc89503c1 160000 --- a/thirdparty/cudf +++ b/thirdparty/cudf @@ -1 +1 @@ -Subproject commit 947081f5b10ca972826942b84c5c2530050325d8 +Subproject commit fcc89503c1f1e15ec287519959013adcf2bf8a52 From 771857b6adc0a7ac270f9a06307610383fd01fce Mon Sep 17 00:00:00 2001 From: spark-rapids automation <70000568+nvauto@users.noreply.github.com> Date: Tue, 21 Nov 2023 22:56:16 +0000 Subject: [PATCH 011/127] Auto-merge use submodule in BASE ref Signed-off-by: spark-rapids automation <70000568+nvauto@users.noreply.github.com> --- thirdparty/cudf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/thirdparty/cudf b/thirdparty/cudf index 823d3214a9..fcc89503c1 160000 --- a/thirdparty/cudf +++ b/thirdparty/cudf @@ -1 +1 @@ -Subproject commit 823d3214a9489e3c496aa31041b5d29f650e94b3 +Subproject commit fcc89503c1f1e15ec287519959013adcf2bf8a52 From b17a2bf403a6aac9c99e8de008bfbbf7523d5235 Mon Sep 17 00:00:00 2001 From: spark-rapids automation <70000568+nvauto@users.noreply.github.com> Date: Wed, 22 Nov 2023 01:04:14 +0000 Subject: [PATCH 012/127] Auto-merge use submodule in BASE ref Signed-off-by: spark-rapids automation <70000568+nvauto@users.noreply.github.com> --- thirdparty/cudf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/thirdparty/cudf b/thirdparty/cudf index 823d3214a9..fcc89503c1 160000 --- a/thirdparty/cudf +++ b/thirdparty/cudf @@ -1 +1 @@ -Subproject commit 823d3214a9489e3c496aa31041b5d29f650e94b3 +Subproject commit fcc89503c1f1e15ec287519959013adcf2bf8a52 From 6eab7dbec2b41c904db58315e2c03f2fa84a40fe Mon Sep 17 00:00:00 2001 From: spark-rapids automation <70000568+nvauto@users.noreply.github.com> Date: Wed, 22 Nov 2023 16:36:10 +0000 Subject: [PATCH 013/127] Auto-merge use submodule in BASE ref Signed-off-by: spark-rapids automation <70000568+nvauto@users.noreply.github.com> --- thirdparty/cudf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/thirdparty/cudf b/thirdparty/cudf index 823d3214a9..fcc89503c1 160000 --- a/thirdparty/cudf +++ b/thirdparty/cudf @@ -1 +1 @@ -Subproject commit 823d3214a9489e3c496aa31041b5d29f650e94b3 +Subproject commit fcc89503c1f1e15ec287519959013adcf2bf8a52 From 77192a0ee883e8f8331a361838d751401cd143db Mon Sep 17 00:00:00 2001 From: Jenkins Automation <70000568+nvauto@users.noreply.github.com> Date: Thu, 23 Nov 2023 05:24:59 +0800 Subject: [PATCH 014/127] Update submodule cudf to f02fde9de9354a829d6f4425e086c84d36c076ae (#1593) Signed-off-by: spark-rapids automation <70000568+nvauto@users.noreply.github.com> --- thirdparty/cudf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/thirdparty/cudf b/thirdparty/cudf index fcc89503c1..f02fde9de9 160000 --- a/thirdparty/cudf +++ b/thirdparty/cudf @@ 
-1 +1 @@ -Subproject commit fcc89503c1f1e15ec287519959013adcf2bf8a52 +Subproject commit f02fde9de9354a829d6f4425e086c84d36c076ae From cbb3be51f2a7a6b2a8d0a19e91d45241bf81a5b2 Mon Sep 17 00:00:00 2001 From: Jenkins Automation <70000568+nvauto@users.noreply.github.com> Date: Thu, 23 Nov 2023 11:27:19 +0800 Subject: [PATCH 015/127] Update submodule cudf to 168533a8ad4086bd020be4f7bf9264a08b6d2243 (#1594) Signed-off-by: spark-rapids automation <70000568+nvauto@users.noreply.github.com> --- thirdparty/cudf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/thirdparty/cudf b/thirdparty/cudf index f02fde9de9..168533a8ad 160000 --- a/thirdparty/cudf +++ b/thirdparty/cudf @@ -1 +1 @@ -Subproject commit f02fde9de9354a829d6f4425e086c84d36c076ae +Subproject commit 168533a8ad4086bd020be4f7bf9264a08b6d2243 From face74b240be186fc589536eaa5f8d0da50ded41 Mon Sep 17 00:00:00 2001 From: Jenkins Automation <70000568+nvauto@users.noreply.github.com> Date: Sat, 25 Nov 2023 06:01:51 +0800 Subject: [PATCH 016/127] Update submodule cudf to db6745b5909233f0090d617c2eadb58a39c1348c (#1595) Signed-off-by: spark-rapids automation <70000568+nvauto@users.noreply.github.com> --- thirdparty/cudf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/thirdparty/cudf b/thirdparty/cudf index 168533a8ad..db6745b590 160000 --- a/thirdparty/cudf +++ b/thirdparty/cudf @@ -1 +1 @@ -Subproject commit 168533a8ad4086bd020be4f7bf9264a08b6d2243 +Subproject commit db6745b5909233f0090d617c2eadb58a39c1348c From d12c76014752a693cbbc2a6231acf5a82cdb51c3 Mon Sep 17 00:00:00 2001 From: Jenkins Automation <70000568+nvauto@users.noreply.github.com> Date: Tue, 28 Nov 2023 06:02:30 +0800 Subject: [PATCH 017/127] Update submodule cudf to c8d481e24a8cf6054cb9400213df00a4b42a1566 (#1596) Signed-off-by: spark-rapids automation <70000568+nvauto@users.noreply.github.com> --- thirdparty/cudf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/thirdparty/cudf b/thirdparty/cudf index db6745b590..c8d481e24a 160000 --- a/thirdparty/cudf +++ b/thirdparty/cudf @@ -1 +1 @@ -Subproject commit db6745b5909233f0090d617c2eadb58a39c1348c +Subproject commit c8d481e24a8cf6054cb9400213df00a4b42a1566 From 6f601cf599ad04193a6e6dd7f82a5192bd282817 Mon Sep 17 00:00:00 2001 From: spark-rapids automation <70000568+nvauto@users.noreply.github.com> Date: Tue, 28 Nov 2023 02:31:58 +0000 Subject: [PATCH 018/127] Auto-merge use submodule in BASE ref Signed-off-by: spark-rapids automation <70000568+nvauto@users.noreply.github.com> --- thirdparty/cudf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/thirdparty/cudf b/thirdparty/cudf index 823d3214a9..c8d481e24a 160000 --- a/thirdparty/cudf +++ b/thirdparty/cudf @@ -1 +1 @@ -Subproject commit 823d3214a9489e3c496aa31041b5d29f650e94b3 +Subproject commit c8d481e24a8cf6054cb9400213df00a4b42a1566 From 63f22a124f290beb13e3fe35c07264ffc5198f98 Mon Sep 17 00:00:00 2001 From: Jenkins Automation <70000568+nvauto@users.noreply.github.com> Date: Tue, 28 Nov 2023 12:02:23 +0800 Subject: [PATCH 019/127] Update submodule cudf to 5e58e71836fd69ead04fbed5fdccb5e2e2c4d95c (#1599) Signed-off-by: spark-rapids automation <70000568+nvauto@users.noreply.github.com> --- thirdparty/cudf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/thirdparty/cudf b/thirdparty/cudf index c8d481e24a..5e58e71836 160000 --- a/thirdparty/cudf +++ b/thirdparty/cudf @@ -1 +1 @@ -Subproject commit c8d481e24a8cf6054cb9400213df00a4b42a1566 +Subproject commit 5e58e71836fd69ead04fbed5fdccb5e2e2c4d95c From 
db0cc73dc17c0c0a7ce1f43c28043c404f04c592 Mon Sep 17 00:00:00 2001 From: spark-rapids automation <70000568+nvauto@users.noreply.github.com> Date: Tue, 28 Nov 2023 07:32:04 +0000 Subject: [PATCH 020/127] Auto-merge use submodule in BASE ref Signed-off-by: spark-rapids automation <70000568+nvauto@users.noreply.github.com> --- thirdparty/cudf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/thirdparty/cudf b/thirdparty/cudf index 823d3214a9..5e58e71836 160000 --- a/thirdparty/cudf +++ b/thirdparty/cudf @@ -1 +1 @@ -Subproject commit 823d3214a9489e3c496aa31041b5d29f650e94b3 +Subproject commit 5e58e71836fd69ead04fbed5fdccb5e2e2c4d95c From 10c74c37da4994ae912c202b44ed2575f81bcf75 Mon Sep 17 00:00:00 2001 From: Jenkins Automation <70000568+nvauto@users.noreply.github.com> Date: Wed, 29 Nov 2023 06:05:21 +0800 Subject: [PATCH 021/127] Update submodule cudf to 94ca0b11d94b07f991c53a9413156f90a4f73597 (#1601) Signed-off-by: spark-rapids automation <70000568+nvauto@users.noreply.github.com> --- thirdparty/cudf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/thirdparty/cudf b/thirdparty/cudf index 5e58e71836..94ca0b11d9 160000 --- a/thirdparty/cudf +++ b/thirdparty/cudf @@ -1 +1 @@ -Subproject commit 5e58e71836fd69ead04fbed5fdccb5e2e2c4d95c +Subproject commit 94ca0b11d94b07f991c53a9413156f90a4f73597 From e7dff0b3fc2a8eb6bae494e0fb4c6f82f31eb0ec Mon Sep 17 00:00:00 2001 From: spark-rapids automation <70000568+nvauto@users.noreply.github.com> Date: Tue, 28 Nov 2023 23:02:10 +0000 Subject: [PATCH 022/127] Auto-merge use submodule in BASE ref Signed-off-by: spark-rapids automation <70000568+nvauto@users.noreply.github.com> --- thirdparty/cudf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/thirdparty/cudf b/thirdparty/cudf index 68cb1d944b..94ca0b11d9 160000 --- a/thirdparty/cudf +++ b/thirdparty/cudf @@ -1 +1 @@ -Subproject commit 68cb1d944b8b2f1c7e3564dc66eacc7f0b19ecee +Subproject commit 94ca0b11d94b07f991c53a9413156f90a4f73597 From 76f6030ac732b3593f4164a8eb1785a08f24b08a Mon Sep 17 00:00:00 2001 From: Jenkins Automation <70000568+nvauto@users.noreply.github.com> Date: Wed, 29 Nov 2023 11:23:30 +0800 Subject: [PATCH 023/127] Update submodule cudf to 8da62049aee750b391ff6d8ca4937428f94fd10c (#1604) Signed-off-by: spark-rapids automation <70000568+nvauto@users.noreply.github.com> --- thirdparty/cudf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/thirdparty/cudf b/thirdparty/cudf index 94ca0b11d9..8da62049ae 160000 --- a/thirdparty/cudf +++ b/thirdparty/cudf @@ -1 +1 @@ -Subproject commit 94ca0b11d94b07f991c53a9413156f90a4f73597 +Subproject commit 8da62049aee750b391ff6d8ca4937428f94fd10c From e5cf1afec33cec3258b188f578f685f40acb6066 Mon Sep 17 00:00:00 2001 From: "Robert (Bobby) Evans" Date: Wed, 29 Nov 2023 08:13:30 -0600 Subject: [PATCH 024/127] Update state retry state machine for CPU alloc support (#1543) Signed-off-by: Robert (Bobby) Evans --- src/main/cpp/src/SparkResourceAdaptorJni.cpp | 1591 +++++++++++------ .../nvidia/spark/rapids/jni/CpuRetryOOM.java | 32 + .../spark/rapids/jni/CpuSplitAndRetryOOM.java | 32 + .../jni/{RetryOOM.java => GpuRetryOOM.java} | 8 +- ...RetryOOM.java => GpuSplitAndRetryOOM.java} | 8 +- .../nvidia/spark/rapids/jni/OffHeapOOM.java | 32 + .../com/nvidia/spark/rapids/jni/RmmSpark.java | 307 +++- .../spark/rapids/jni/RmmSparkThreadState.java | 26 +- .../rapids/jni/SparkResourceAdaptor.java | 156 +- .../spark/rapids/jni/ThreadStateRegistry.java | 67 + .../jni/LimitingOffHeapAllocForTests.java | 90 + 
.../spark/rapids/jni/RmmSparkMonteCarlo.java | 78 +- .../nvidia/spark/rapids/jni/RmmSparkTest.java | 638 +++++-- 13 files changed, 2265 insertions(+), 800 deletions(-) create mode 100644 src/main/java/com/nvidia/spark/rapids/jni/CpuRetryOOM.java create mode 100644 src/main/java/com/nvidia/spark/rapids/jni/CpuSplitAndRetryOOM.java rename src/main/java/com/nvidia/spark/rapids/jni/{RetryOOM.java => GpuRetryOOM.java} (85%) rename src/main/java/com/nvidia/spark/rapids/jni/{SplitAndRetryOOM.java => GpuSplitAndRetryOOM.java} (85%) create mode 100644 src/main/java/com/nvidia/spark/rapids/jni/OffHeapOOM.java create mode 100644 src/main/java/com/nvidia/spark/rapids/jni/ThreadStateRegistry.java create mode 100644 src/test/java/com/nvidia/spark/rapids/jni/LimitingOffHeapAllocForTests.java diff --git a/src/main/cpp/src/SparkResourceAdaptorJni.cpp b/src/main/cpp/src/SparkResourceAdaptorJni.cpp index 16c950d121..d3821fcc18 100644 --- a/src/main/cpp/src/SparkResourceAdaptorJni.cpp +++ b/src/main/cpp/src/SparkResourceAdaptorJni.cpp @@ -19,6 +19,7 @@ #include #include #include +#include #include @@ -32,8 +33,44 @@ namespace { -constexpr char const* RETRY_OOM_CLASS = "com/nvidia/spark/rapids/jni/RetryOOM"; -constexpr char const* SPLIT_AND_RETRY_OOM_CLASS = "com/nvidia/spark/rapids/jni/SplitAndRetryOOM"; +constexpr char const* GPU_RETRY_OOM_CLASS = "com/nvidia/spark/rapids/jni/GpuRetryOOM"; +constexpr char const* GPU_SPLIT_AND_RETRY_OOM_CLASS = + "com/nvidia/spark/rapids/jni/GpuSplitAndRetryOOM"; +constexpr char const* CPU_RETRY_OOM_CLASS = "com/nvidia/spark/rapids/jni/CpuRetryOOM"; +constexpr char const* CPU_SPLIT_AND_RETRY_OOM_CLASS = + "com/nvidia/spark/rapids/jni/CpuSplitAndRetryOOM"; +constexpr char const* THREAD_REG_CLASS = "com/nvidia/spark/rapids/jni/ThreadStateRegistry"; +constexpr char const* IS_THREAD_BLOCKED = "isThreadBlocked"; +constexpr char const* IS_THREAD_BLOCKED_SIG = "(J)Z"; +constexpr char const* REMOVE_THREAD = "removeThread"; +constexpr char const* REMOVE_THREAD_SIG = "(J)V"; + +// This is a bit of a hack to cache the methods because CUDF is getting java to do an onload +// there. +std::mutex jni_mutex; +bool is_jni_loaded = false; +jclass ThreadStateRegistry_jclass; +jmethodID removeThread_method; +jmethodID isThreadBlocked_method; + +void cache_thread_reg_jni(JNIEnv* env) +{ + std::unique_lock lock(jni_mutex); + if (is_jni_loaded) { return; } + jclass cls = env->FindClass(THREAD_REG_CLASS); + if (cls == nullptr) { return; } + + removeThread_method = env->GetStaticMethodID(cls, REMOVE_THREAD, REMOVE_THREAD_SIG); + if (removeThread_method == nullptr) { return; } + + isThreadBlocked_method = env->GetStaticMethodID(cls, IS_THREAD_BLOCKED, IS_THREAD_BLOCKED_SIG); + if (isThreadBlocked_method == nullptr) { return; } + + // Convert local reference to global so it cannot be garbage collected. + ThreadStateRegistry_jclass = static_cast(env->NewGlobalRef(cls)); + if (ThreadStateRegistry_jclass == nullptr) { return; } + is_jni_loaded = true; +} // In the task states BUFN means Block Until Further Notice. // Meaning the thread should be blocked until another task finishes. @@ -42,28 +79,19 @@ constexpr char const* SPLIT_AND_RETRY_OOM_CLASS = "com/nvidia/spark/rapids/jni/S // again until we know that progress has been made. We might add an API // in the future to know when a retry section has passed, which would // probably be a preferable time to restart all BUFN threads. 
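// [Editor's aside, not part of the patch] An illustrative walk through the
// renamed state machine defined below, for a dedicated task thread that loses
// an allocation race; the exact transition triggers are inferred from the
// per-state comments in this diff:
//   THREAD_RUNNING    -> THREAD_ALLOC         allocation in flight
//   THREAD_ALLOC      -> THREAD_BLOCKED       no memory available; wait for a free
//   THREAD_BLOCKED    -> THREAD_BUFN_THROW    lowest-priority thread wakes and
//                                             throws a retry OOM to roll back
//   THREAD_BUFN_THROW -> THREAD_BUFN_WAIT -> THREAD_BUFN
//                                             rolled back; blocked until another
//                                             task makes progress
//   THREAD_BUFN       -> THREAD_SPLIT_THROW   everyone is BUFN: the lowest-priority
//                                             thread splits its input and retries
// after which the thread returns to THREAD_RUNNING for the next attempt.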
-enum thread_state { +enum class thread_state { UNKNOWN = -1, // unknown state, this is really here for logging and anything transitioning to // this state should actually be accomplished by deleting the thread from the state - TASK_RUNNING = 0, // task thread running normally - TASK_WAIT_ON_SHUFFLE = 1, // task thread waiting on shuffle - TASK_BUFN_WAIT_ON_SHUFFLE = 2, // task thread waiting on shuffle, but marked as BUFN - TASK_ALLOC = 3, // task thread in the middle of doing an allocation - TASK_ALLOC_FREE = 4, // task thread in the middle of doing an allocation and a free happened - TASK_BLOCKED = 5, // task thread that is temporarily blocked - TASK_BUFN_THROW = 6, // task thread that should throw an exception to roll back before blocking - TASK_BUFN_WAIT = 7, // task thread that threw an exception to roll back and now should - // block the next time alloc or block_until_ready is called - TASK_BUFN = 8, // task thread that is blocked until higher priority tasks start to succeed - TASK_SPLIT_THROW = 9, // task thread that should throw an exception to split input and retry - TASK_REMOVE_THROW = 10, // task thread that is being removed and needs to throw an exception - // to start the blocked thread running again. - SHUFFLE_RUNNING = 11, // shuffle thread that is running normally - SHUFFLE_ALLOC = 12, // shuffle thread that is in the middle of doing an alloc - SHUFFLE_ALLOC_FREE = 13, // shuffle thread that is doing an alloc and a free happened. - SHUFFLE_BLOCKED = 14, // shuffle thread that is temporarily blocked - SHUFFLE_THROW = 15, // shuffle thread that needs to throw an OOM - SHUFFLE_REMOVE_THROW = 16 // shuffle thread that is being removed and needs to throw an exception + THREAD_RUNNING = 0, // task thread running normally + THREAD_ALLOC = 1, // task thread in the middle of doing an allocation + THREAD_ALLOC_FREE = 2, // task thread in the middle of doing an allocation and a free happened + THREAD_BLOCKED = 3, // task thread that is temporarily blocked + THREAD_BUFN_THROW = 4, // task thread that should throw an exception to roll back before blocking + THREAD_BUFN_WAIT = 5, // task thread that threw an exception to roll back and now should + // block the next time alloc or block_until_ready is called + THREAD_BUFN = 6, // task thread that is blocked until higher priority tasks start to succeed + THREAD_SPLIT_THROW = 7, // task thread that should throw an exception to split input and retry + THREAD_REMOVE_THROW = 8, // task thread that is being removed and needs to throw an exception }; /** @@ -72,23 +100,15 @@ enum thread_state { const char* as_str(thread_state state) { switch (state) { - case TASK_RUNNING: return "TASK_RUNNING"; - case TASK_WAIT_ON_SHUFFLE: return "TASK_WAIT_ON_SHUFFLE"; - case TASK_BUFN_WAIT_ON_SHUFFLE: return "TASK_BUFN_WAIT_ON_SHUFFLE"; - case TASK_ALLOC: return "TASK_ALLOC"; - case TASK_ALLOC_FREE: return "TASK_ALLOC_FREE"; - case TASK_BLOCKED: return "TASK_BLOCKED"; - case TASK_BUFN_THROW: return "TASK_BUFN_THROW"; - case TASK_BUFN_WAIT: return "TASK_BUFN_WAIT"; - case TASK_BUFN: return "TASK_BUFN"; - case TASK_SPLIT_THROW: return "TASK_SPLIT_THROW"; - case TASK_REMOVE_THROW: return "TASK_REMOVE_THROW"; - case SHUFFLE_RUNNING: return "SHUFFLE_RUNNING"; - case SHUFFLE_ALLOC: return "SHUFFLE_ALLOC"; - case SHUFFLE_ALLOC_FREE: return "SHUFFLE_ALLOC_FREE"; - case SHUFFLE_BLOCKED: return "SHUFFLE_BLOCKED"; - case SHUFFLE_THROW: return "SHUFFLE_THROW"; - case SHUFFLE_REMOVE_THROW: return "SHUFFLE_REMOVE_THROW"; + case thread_state::THREAD_RUNNING: return 
"THREAD_RUNNING"; + case thread_state::THREAD_ALLOC: return "THREAD_ALLOC"; + case thread_state::THREAD_ALLOC_FREE: return "THREAD_ALLOC_FREE"; + case thread_state::THREAD_BLOCKED: return "THREAD_BLOCKED"; + case thread_state::THREAD_BUFN_THROW: return "THREAD_BUFN_THROW"; + case thread_state::THREAD_BUFN_WAIT: return "THREAD_BUFN_WAIT"; + case thread_state::THREAD_BUFN: return "THREAD_BUFN"; + case thread_state::THREAD_SPLIT_THROW: return "THREAD_SPLIT_THROW"; + case thread_state::THREAD_REMOVE_THROW: return "THREAD_REMOVE_THROW"; default: return "UNKNOWN"; } } @@ -120,13 +140,13 @@ static auto make_logger(std::string const& filename) * the lowest priority task and is constantly retried while newer tasks move to the front of the * line. So a higher task_id should be a lower priority. * - * We also want all shuffle threads to have the highest priority possible. So we assign them + * We also want all non-task threads to have the highest priority possible. So we assign them * a task id of -1. The problem is overflow on a long, so for the priority of a task the formula * will be MAX_LONG - (task_id + 1). */ class thread_priority { public: - thread_priority(long tsk_id, long t_id) : task_id(tsk_id), thread_id(t_id) {} + thread_priority(long const tsk_id, long const t_id) : task_id(tsk_id), thread_id(t_id) {} long get_thread_id() const { return thread_id; } @@ -169,6 +189,43 @@ class thread_priority { long task_priority() const { return std::numeric_limits::max() - (task_id + 1); } }; +/** + * Holds metrics for a given task/thread about retry counts and times. It is here + * because the mapping between tasks and threads can be complicated and can span + * different time ranges too. + */ +struct task_metrics { + // metric for being able to report how many times each type of exception was thrown, + // and some timings + int num_times_retry_throw = 0; + int num_times_split_retry_throw = 0; + long time_blocked_nanos = 0; + // The amount of time that this thread has lost due to retries (not including blocked time) + long time_lost_nanos = 0; + + void take_from(task_metrics& other) + { + add(other); + other.clear(); + } + + void add(task_metrics const& other) + { + this->num_times_retry_throw += other.num_times_retry_throw; + this->num_times_split_retry_throw += other.num_times_split_retry_throw; + this->time_blocked_nanos += other.time_blocked_nanos; + this->time_lost_nanos += other.time_lost_nanos; + } + + void clear() + { + num_times_retry_throw = 0; + num_times_split_retry_throw = 0; + time_blocked_nanos = 0; + time_lost_nanos = 0; + } +}; + /** * This is the full state of a thread. Some things like the thread_id and task_id * should not change after the state is set up. Everything else is up for change, @@ -177,35 +234,37 @@ class thread_priority { */ class full_thread_state { public: - full_thread_state(thread_state state, long thread_id) : state(state), thread_id(thread_id) {} - full_thread_state(thread_state state, long thread_id, long task_id) + full_thread_state(thread_state const state, long const thread_id) + : state(state), thread_id(thread_id) + { + } + full_thread_state(thread_state const state, long const thread_id, long const task_id) : state(state), thread_id(thread_id), task_id(task_id) { } thread_state state; long thread_id; - long task_id = -1; + long task_id = -1; + bool is_for_shuffle = false; + std::unordered_set pool_task_ids; + bool is_cpu_alloc = false; + // Is the thread transitively blocked on a pool or not. 
+ bool pool_blocked = false; int retry_oom_injected = 0; int split_and_retry_oom_injected = 0; int cudf_exception_injected = 0; // watchdog limit on maximum number of retries to avoid unexpected live lock situations int num_times_retried = 0; - // metric for being able to report how many times each type of exception was thrown, - // and some timings - int num_times_retry_throw = 0; - int num_times_split_retry_throw = 0; - long time_blocked_nanos = 0; - // The amount of time that this thread has lost due to retries (not inclduing blocked time) - long time_lost_nanos = 0; - // The amount of time that this thread has spent in the current retry block (not inclucing block - // time) - long time_retry_running_nanos = 0; // When did the retry time for this thread start, or when did the block time end. std::chrono::time_point retry_start_or_block_end; // Is this thread currently in a marked retry block. This is only used for metrics. bool is_in_retry = false; - + // The amount of time that this thread has spent in the current retry block (not including block + // time) + long time_retry_running_nanos = 0; std::chrono::time_point block_start; + // metrics for the current thread + task_metrics metrics; std::unique_ptr wake_condition = std::make_unique(); @@ -214,7 +273,7 @@ class full_thread_state { * Transition to a new state. Ideally this is what is called when doing a state transition instead * of setting the state directly. */ - void transition_to(thread_state new_state) + void transition_to(thread_state const new_state) { if (new_state == thread_state::UNKNOWN) { throw std::runtime_error( @@ -232,24 +291,18 @@ class full_thread_state { void after_block() { - auto end = std::chrono::steady_clock::now(); - auto diff = end - block_start; - time_blocked_nanos += std::chrono::duration_cast(diff).count(); + auto const end = std::chrono::steady_clock::now(); + auto const diff = end - block_start; + metrics.time_blocked_nanos += + std::chrono::duration_cast(diff).count(); if (is_in_retry) { retry_start_or_block_end = end; } } - long get_and_reset_failed_retry_time() - { - long ret = time_lost_nanos; - time_lost_nanos = 0; - return ret; - } - void record_failed_retry_time() { if (is_in_retry) { record_and_reset_pending_retry_time(); - time_lost_nanos += time_retry_running_nanos; + metrics.time_lost_nanos += time_retry_running_nanos; time_retry_running_nanos = 0; } } @@ -257,15 +310,15 @@ class full_thread_state { void record_and_reset_pending_retry_time() { if (is_in_retry) { - auto end = std::chrono::steady_clock::now(); - auto diff = end - retry_start_or_block_end; + auto const end = std::chrono::steady_clock::now(); + auto const diff = end - retry_start_or_block_end; time_retry_running_nanos += std::chrono::duration_cast(diff).count(); retry_start_or_block_end = end; } } - void reset_retry_state(bool is_in_retry) + void reset_retry_state(bool const is_in_retry) { time_retry_running_nanos = 0; if (is_in_retry) { retry_start_or_block_end = std::chrono::steady_clock::now(); } @@ -275,7 +328,21 @@ class full_thread_state { /** * Get the priority of this thread. */ - thread_priority priority() { return thread_priority(task_id, thread_id); } + thread_priority priority() const + { + if (task_id < 0 && !is_for_shuffle) { + // The task id for a non-shuffle pool thread is the same as the lowest task id + // it is currently working on. 
+ auto const min_id = std::min_element(pool_task_ids.begin(), pool_task_ids.end()); + if (min_id != pool_task_ids.end()) { + return thread_priority(*min_id, thread_id); + } else { + return thread_priority(-1, thread_id); + } + } else { + return thread_priority(task_id, thread_id); + } + } }; /** @@ -289,8 +356,9 @@ class spark_resource_adaptor final : public rmm::mr::device_memory_resource { public: spark_resource_adaptor(JNIEnv* env, rmm::mr::device_memory_resource* mr, - std::shared_ptr& logger) - : resource{mr}, logger{logger} + std::shared_ptr& logger, + bool const is_log_enabled) + : resource{mr}, logger{logger}, is_log_enabled{is_log_enabled} { if (env->GetJavaVM(&jvm) < 0) { throw std::runtime_error("GetJavaVM failed"); } logger->flush_on(spdlog::level::info); @@ -306,38 +374,69 @@ class spark_resource_adaptor final : public rmm::mr::device_memory_resource { bool supports_streams() const noexcept override { return resource->supports_streams(); } /** - * Update the internal state so that a specific thread is associated with a task. + * Update the internal state so that a specific thread is dedicated to a task. * This may be called multiple times for a given thread and if the thread is already - * associated with the task, then most of the time this is a noop. The only exception + * dedicated to the task, then most of the time this is a noop. The only exception * is if the thread is marked that it is shutting down, but has not completed yet. * This should never happen in practice with Spark because the only time we would * shut down a task thread on a thread that is different from itself is if there * was an error and the entire executor is shutting down. So there should be no * reuse. */ - void associate_thread_with_task(long thread_id, long task_id) + void start_dedicated_task_thread(long const thread_id, long const task_id) { std::unique_lock lock(state_mutex); if (shutting_down) { throw std::runtime_error("spark_resource_adaptor is shutting down"); } - auto was_threads_inserted = - threads.emplace(thread_id, full_thread_state(thread_state::TASK_RUNNING, thread_id, task_id)); + auto const found = threads.find(thread_id); + if (found != threads.end()) { + if (found->second.task_id >= 0 && found->second.task_id != task_id) { + if (is_log_enabled) { + std::stringstream ss; + ss << "desired task_id " << task_id; + + log_status("FIXUP", thread_id, found->second.task_id, found->second.state, ss.str()); + } + remove_thread_association(thread_id, found->second.task_id, lock); + } + } + auto const was_threads_inserted = threads.emplace( + thread_id, full_thread_state(thread_state::THREAD_RUNNING, thread_id, task_id)); if (was_threads_inserted.second == false) { - if (was_threads_inserted.first->second.task_id != task_id) { - throw std::invalid_argument("a thread can only be associated with a single task."); + if (was_threads_inserted.first->second.state == thread_state::THREAD_REMOVE_THROW) { + std::stringstream ss; + ss << "A thread " << thread_id << " is shutting down " + << was_threads_inserted.first->second.task_id << " vs " << task_id; + + auto const msg = ss.str(); + log_status("ERROR", + thread_id, + was_threads_inserted.first->second.task_id, + was_threads_inserted.first->second.state, + msg); + throw std::invalid_argument(msg); } - if (was_threads_inserted.first->second.state == thread_state::TASK_REMOVE_THROW) { - throw std::invalid_argument("the thread is in the process of shutting down."); + if (was_threads_inserted.first->second.task_id != task_id) { + std::stringstream ss; + 
ss << "A thread " << thread_id << " can only be dedicated to a single task." + << was_threads_inserted.first->second.task_id << " != " << task_id; + auto const msg = ss.str(); + log_status("ERROR", + thread_id, + was_threads_inserted.first->second.task_id, + was_threads_inserted.first->second.state, + msg); + throw std::invalid_argument(msg); } } try { - auto was_inserted = task_to_threads.insert({task_id, {thread_id}}); + auto const was_inserted = task_to_threads.insert({task_id, {thread_id}}); if (was_inserted.second == false) { // task_to_threads already has a task_id for this, so insert the thread_id was_inserted.first->second.insert(thread_id); } - } catch (const std::exception&) { + } catch (std::exception const&) { if (was_threads_inserted.second == true) { // roll back the thread insertion threads.erase(thread_id); @@ -345,60 +444,100 @@ class spark_resource_adaptor final : public rmm::mr::device_memory_resource { throw; } if (was_threads_inserted.second == true) { - log_transition(thread_id, task_id, thread_state::UNKNOWN, thread_state::TASK_RUNNING); + log_transition(thread_id, task_id, thread_state::UNKNOWN, thread_state::THREAD_RUNNING); } } - void start_retry_block(long thread_id) + void start_retry_block(long const thread_id) { std::unique_lock lock(state_mutex); - auto thread = threads.find(thread_id); + auto const thread = threads.find(thread_id); if (thread != threads.end()) { thread->second.reset_retry_state(true); } } - void end_retry_block(long thread_id) + void end_retry_block(long const thread_id) { std::unique_lock lock(state_mutex); - auto thread = threads.find(thread_id); + auto const thread = threads.find(thread_id); if (thread != threads.end()) { thread->second.reset_retry_state(false); } } - long get_and_reset_lost_time(long task_id) + /** + * Update the internal state so that a specific thread is associated with transitive + * thread pools and is working on a set of tasks. + * This may be called multiple times for a given thread and the set of tasks will be + * updated accordingly. 
+ */ + void pool_thread_working_on_tasks(bool const is_for_shuffle, + long const thread_id, + std::unordered_set const& task_ids) { std::unique_lock lock(state_mutex); - long ret = 0; - auto task_at = task_to_threads.find(task_id); - if (task_at != task_to_threads.end()) { - for (auto thread_id : task_at->second) { - auto threads_at = threads.find(thread_id); - if (threads_at != threads.end()) { - ret += threads_at->second.get_and_reset_failed_retry_time(); - } + if (shutting_down) { throw std::runtime_error("spark_resource_adaptor is shutting down"); } + + auto const was_inserted = + threads.emplace(thread_id, full_thread_state(thread_state::THREAD_RUNNING, thread_id)); + if (was_inserted.second == true) { + was_inserted.first->second.is_for_shuffle = is_for_shuffle; + log_transition(thread_id, -1, thread_state::UNKNOWN, thread_state::THREAD_RUNNING); + } else if (was_inserted.first->second.task_id != -1) { + throw std::invalid_argument("the thread is associated with a non-pool task already"); + } else if (was_inserted.first->second.state == thread_state::THREAD_REMOVE_THROW) { + throw std::invalid_argument("the thread is in the process of shutting down."); + } else if (was_inserted.first->second.is_for_shuffle != is_for_shuffle) { + if (is_for_shuffle) { + throw std::invalid_argument( + "the thread is marked as a non-shuffle thread, and we cannot change it while there are " + "active tasks"); + } else { + throw std::invalid_argument( + "the thread is marked as a shuffle thread, and we cannot change it while there are " + "active tasks"); } } - return ret; + + // save the metrics for all tasks before we add any new ones. + checkpoint_metrics(was_inserted.first->second); + + was_inserted.first->second.pool_task_ids.insert(task_ids.begin(), task_ids.end()); + if (is_log_enabled) { + std::stringstream ss; + ss << "CURRENT IDs "; + for (const auto& task_id : was_inserted.first->second.pool_task_ids) { + ss << task_id << " "; + } + log_status("ADD_TASKS", thread_id, -1, was_inserted.first->second.state, ss.str()); + } } - /** - * Update the internal state so that a specific thread is associated with shuffle. - * This may be called multiple times for a given thread and if the thread is already - * associated with shuffle, the this is a noop in most cases. The only time - * this is an error is if the thread is already marked as shutting down and has - * not completed that transition yet. - */ - void associate_thread_with_shuffle(long thread_id) + void pool_thread_finished_for_tasks(long const thread_id, + std::unordered_set const& task_ids) { std::unique_lock lock(state_mutex); if (shutting_down) { throw std::runtime_error("spark_resource_adaptor is shutting down"); } - auto was_inserted = - threads.emplace(thread_id, full_thread_state(thread_state::SHUFFLE_RUNNING, thread_id)); - if (was_inserted.second == true) { - log_transition(thread_id, -1, thread_state::UNKNOWN, thread_state::SHUFFLE_RUNNING); - } else if (was_inserted.first->second.task_id != -1) { - throw std::invalid_argument("the thread is associated with a non-shuffle task already"); - } else if (was_inserted.first->second.state == thread_state::SHUFFLE_REMOVE_THROW) { - throw std::invalid_argument("the thread is in the process of shutting down."); + auto const thread = threads.find(thread_id); + if (thread != threads.end()) { + // save the metrics for all tasks before we remove any of them. 
+ checkpoint_metrics(thread->second); + + // Now drop the tasks from the pool + for (auto const& id : task_ids) { + thread->second.pool_task_ids.erase(id); + } + if (is_log_enabled) { + std::stringstream ss; + ss << "CURRENT IDs "; + for (const auto& id : thread->second.pool_task_ids) { + ss << id << " "; + } + log_status("REMOVE_TASKS", thread_id, -1, thread->second.state, ss.str()); + } + if (thread->second.pool_task_ids.empty()) { + if (remove_thread_association(thread_id, -1, lock)) { + wake_up_threads_after_task_finishes(lock); + } + } } } @@ -409,10 +548,12 @@ class spark_resource_adaptor final : public rmm::mr::device_memory_resource { * up and throw an exception. At that point the thread's state will be completely * removed. */ - void remove_thread_association(long thread_id) + void remove_thread_association(long const thread_id, long const task_id) { std::unique_lock lock(state_mutex); - if (remove_thread_association(thread_id, lock)) { wake_up_threads_after_task_finishes(lock); } + if (remove_thread_association(thread_id, task_id, lock)) { + wake_up_threads_after_task_finishes(lock); + } } /** @@ -421,22 +562,65 @@ class spark_resource_adaptor final : public rmm::mr::device_memory_resource { * threads are currently blocked/waiting then the state will not be totally * removed until the thread is woken. */ - void task_done(long task_id) + void task_done(long const task_id) { std::unique_lock lock(state_mutex); - auto task_at = task_to_threads.find(task_id); + bool run_checks = false; + auto const task_at = task_to_threads.find(task_id); if (task_at != task_to_threads.end()) { // we want to make a copy so there is no conflict here... - std::set threads_to_remove = task_at->second; - bool run_checks = false; - for (auto thread_id : threads_to_remove) { - run_checks = remove_thread_association(thread_id, lock) || run_checks; + std::set const threads_to_remove = task_at->second; + for (auto const thread_id : threads_to_remove) { + run_checks = remove_thread_association(thread_id, task_id, lock) || run_checks; } - if (run_checks) { wake_up_threads_after_task_finishes(lock); } } + std::unordered_set thread_ids; + for (auto const& [thread_id, ignored] : threads) { + thread_ids.insert(thread_id); + } + for (auto const& thread_id : thread_ids) { + auto const thread = threads.find(thread_id); + if (thread != threads.end()) { + if (thread->second.pool_task_ids.erase(task_id) != 0) { + if (is_log_enabled) { + std::stringstream ss; + ss << "CURRENT IDs "; + for (const auto& id : thread->second.pool_task_ids) { + ss << id << " "; + } + log_status("REMOVE_TASKS", thread_id, -1, thread->second.state, ss.str()); + } + if (thread->second.pool_task_ids.empty()) { + run_checks = remove_thread_association(thread_id, task_id, lock) || run_checks; + } + } + } + } + + if (run_checks) { wake_up_threads_after_task_finishes(lock); } task_to_threads.erase(task_id); } + /** + * A dedicated task thread is submitting to a pool. + */ + void submitting_to_pool(long const thread_id) { waiting_on_pool_status_changed(thread_id, true); } + + /** + * A dedicated task thread is waiting on a result from a pool. + */ + void waiting_on_pool(long const thread_id) { waiting_on_pool_status_changed(thread_id, true); } + + /** + * A dedicated task thread is no longer blocked on a pool. + * It got the answer, an exception, or it submitted the + * work successfully. 
+ */ + void done_waiting_on_pool(long const thread_id) + { + waiting_on_pool_status_changed(thread_id, false); + } + /** * This should be called before shutting down the adaptor. It will try * to shut down everything in an orderly way and wait for all of the @@ -449,12 +633,12 @@ class spark_resource_adaptor final : public rmm::mr::device_memory_resource { // 1. Mark all threads that need to be removed as such // make a copy of the ids so we don't modify threads while walking it std::vector threads_to_remove; - for (auto thread = threads.begin(); thread != threads.end(); thread++) { - threads_to_remove.push_back(thread->first); + for (auto const& thread : threads) { + threads_to_remove.push_back(thread.first); } - for (auto thread_id : threads_to_remove) { - remove_thread_association(thread_id, lock); + for (auto const thread_id : threads_to_remove) { + remove_thread_association(thread_id, -1, lock); } shutting_down = true; } @@ -479,10 +663,10 @@ class spark_resource_adaptor final : public rmm::mr::device_memory_resource { * Force a specific thread to throw one or more RetryOOM exceptions when an * alloc is called. This is intended only for testing. */ - void force_retry_oom(long thread_id, int num_ooms) + void force_retry_oom(long const thread_id, int const num_ooms) { std::unique_lock lock(state_mutex); - auto threads_at = threads.find(thread_id); + auto const threads_at = threads.find(thread_id); if (threads_at != threads.end()) { threads_at->second.retry_oom_injected = num_ooms; } else { @@ -494,10 +678,10 @@ class spark_resource_adaptor final : public rmm::mr::device_memory_resource { * Force a specific thread to throw one or more SplitAndRetryOOM exceptions * when an alloc is called. This is intended only for testing. */ - void force_split_and_retry_oom(long thread_id, int num_ooms) + void force_split_and_retry_oom(long const thread_id, int const num_ooms) { std::unique_lock lock(state_mutex); - auto threads_at = threads.find(thread_id); + auto const threads_at = threads.find(thread_id); if (threads_at != threads.end()) { threads_at->second.split_and_retry_oom_injected = num_ooms; } else { @@ -509,10 +693,10 @@ class spark_resource_adaptor final : public rmm::mr::device_memory_resource { * force a specific thread to throw one or more CudfExceptions when an * alloc is called. This is intended only for testing. */ - void force_cudf_exception(long thread_id, int num_times) + void force_cudf_exception(long const thread_id, int const num_times) { std::unique_lock lock(state_mutex); - auto threads_at = threads.find(thread_id); + auto const threads_at = threads.find(thread_id); if (threads_at != threads.end()) { threads_at->second.cudf_exception_injected = num_times; } else { @@ -520,130 +704,107 @@ class spark_resource_adaptor final : public rmm::mr::device_memory_resource { } } - /** - * get the number of times a retry was thrown and reset the value to 0. - */ - int get_and_reset_num_retry(long task_id) + // Some C++ magic to get and reset a single metric. + // Metrics are recorded on a per-thread basis, but are reported per-task + // But the life time of threads and tasks are not directly tied together + // so they are check-pointed periodically. 
This reads and resets + // the metric for both the threads and the tasks + template + T get_and_reset_metric(long const task_id, T task_metrics::*MetricPtr) { std::unique_lock lock(state_mutex); - int ret = 0; - auto task_at = task_to_threads.find(task_id); + T ret = 0; + auto const task_at = task_to_threads.find(task_id); if (task_at != task_to_threads.end()) { - for (auto thread_id : task_at->second) { - auto threads_at = threads.find(thread_id); + for (auto const thread_id : task_at->second) { + auto const threads_at = threads.find(thread_id); if (threads_at != threads.end()) { - ret += threads_at->second.num_times_retry_throw; - threads_at->second.num_times_retry_throw = 0; + ret += (threads_at->second.metrics.*MetricPtr); + (threads_at->second.metrics.*MetricPtr) = 0; } } } + + auto const metrics_at = task_to_metrics.find(task_id); + if (metrics_at != task_to_metrics.end()) { + ret += (metrics_at->second.*MetricPtr); + (metrics_at->second.*MetricPtr) = 0; + } return ret; } + /** + * get the number of times a retry was thrown and reset the value to 0. + */ + int get_and_reset_num_retry(long const task_id) + { + return get_and_reset_metric(task_id, &task_metrics::num_times_retry_throw); + } + /** * get the number of times a split and retry was thrown and reset the value to 0. */ - int get_and_reset_num_split_retry(long task_id) + int get_and_reset_num_split_retry(long const task_id) { - std::unique_lock lock(state_mutex); - int ret = 0; - auto task_at = task_to_threads.find(task_id); - if (task_at != task_to_threads.end()) { - for (auto thread_id : task_at->second) { - auto threads_at = threads.find(thread_id); - if (threads_at != threads.end()) { - ret += threads_at->second.num_times_split_retry_throw; - threads_at->second.num_times_split_retry_throw = 0; - } - } - } - return ret; + return get_and_reset_metric(task_id, &task_metrics::num_times_split_retry_throw); } /** * get the time in ns that the task was blocked for. */ - long get_and_reset_block_time(long task_id) + long get_and_reset_block_time(long const task_id) { - std::unique_lock lock(state_mutex); - long ret = 0; - auto task_at = task_to_threads.find(task_id); - if (task_at != task_to_threads.end()) { - for (auto thread_id : task_at->second) { - auto threads_at = threads.find(thread_id); - if (threads_at != threads.end()) { - ret += threads_at->second.time_blocked_nanos; - threads_at->second.time_blocked_nanos = 0; - } - } - } - return ret; + return get_and_reset_metric(task_id, &task_metrics::time_blocked_nanos); } /** - * Update the internal state so that this thread is known that it is going to enter a - * shuffle stage and could indirectly block on a shuffle thread (UCX). + * get the time in ns that was lost because a retry was thrown. */ - void thread_could_block_on_shuffle(long thread_id) + long get_and_reset_lost_time(long const task_id) + { + return get_and_reset_metric(task_id, &task_metrics::time_lost_nanos); + } + + void check_and_break_deadlocks() { std::unique_lock lock(state_mutex); - auto threads_at = threads.find(thread_id); - if (threads_at != threads.end()) { - switch (threads_at->second.state) { - case TASK_RUNNING: - transition(threads_at->second, thread_state::TASK_WAIT_ON_SHUFFLE); - break; - case TASK_BUFN_WAIT: - transition(threads_at->second, thread_state::TASK_BUFN_WAIT_ON_SHUFFLE); - break; - case TASK_WAIT_ON_SHUFFLE: - // fall through - case TASK_BUFN_WAIT_ON_SHUFFLE: - // noop already in an expected state... 
- break; - default: { - std::stringstream ss; - ss << "thread " << thread_id << " is in an unexpected state " - << as_str(threads_at->second.state) << " to start shuffle"; - throw std::invalid_argument(ss.str()); - } - } - check_and_update_for_bufn(lock); - } else { - throw std::invalid_argument("the thread is not associated with any task/shuffle"); - } + check_and_update_for_bufn(lock); } - /** - * Indicate that the thread no longer will block indirectly on a shuffle thread. - */ - void thread_done_with_shuffle(long thread_id) + bool cpu_prealloc(size_t const amount, bool const blocking) { + // amount is not used yet, but is here in case we want it in the future. std::unique_lock lock(state_mutex); - auto threads_at = threads.find(thread_id); - if (threads_at != threads.end()) { - switch (threads_at->second.state) { - case TASK_WAIT_ON_SHUFFLE: - transition(threads_at->second, thread_state::TASK_RUNNING); - break; - case TASK_BUFN_WAIT_ON_SHUFFLE: - transition(threads_at->second, thread_state::TASK_BUFN_WAIT); - break; - case TASK_RUNNING: - // fall through - case TASK_BUFN_WAIT: - // noop already in an expected state... - break; - default: { - std::stringstream ss; - ss << "thread " << thread_id << " is in an unexpected state " - << as_str(threads_at->second.state) << " to end shuffle"; - throw std::invalid_argument(ss.str()); - } - } - } else { - throw std::invalid_argument("the thread is not associated with any task/shuffle"); - } + auto const thread_id = static_cast(pthread_self()); + return pre_alloc_core(thread_id, true, blocking, lock); + } + + void cpu_postalloc_success(void const* addr, + size_t const amount, + bool const blocking, + bool const was_recursive) + { + // addr is not used yet, but is here in case we want it in the future. + // amount is not used yet, but is here in case we want it for debugging/metrics. + // blocking is not used yet. It could be used for some debugging so we are keeping it. + std::unique_lock lock(state_mutex); + auto const thread_id = static_cast(pthread_self()); + post_alloc_success_core(thread_id, true, was_recursive, lock); + } + + bool cpu_postalloc_failed(bool const was_oom, bool const blocking, bool const was_recursive) + { + std::unique_lock lock(state_mutex); + auto const thread_id = static_cast(pthread_self()); + return post_alloc_failed_core(thread_id, true, was_oom, blocking, was_recursive, lock); + } + + void cpu_dealloc(void const* addr, size_t const amount) + { + // addr is not used yet, but is here in case we want it in the future. + // amount is not used yet, but is here in case we want it for debugging/metrics. + std::unique_lock lock(state_mutex); + dealloc_core(true, lock); } /** @@ -654,7 +815,7 @@ class spark_resource_adaptor final : public rmm::mr::device_memory_resource { */ void block_thread_until_ready() { - auto thread_id = static_cast(pthread_self()); + auto const thread_id = static_cast(pthread_self()); std::unique_lock lock(state_mutex); block_thread_until_ready(thread_id, lock); } @@ -663,10 +824,10 @@ class spark_resource_adaptor final : public rmm::mr::device_memory_resource { * This is really here just for testing. It provides a way to look at the * current state of a thread. 
*/ - int get_thread_state_as_int(long thread_id) + int get_thread_state_as_int(long const thread_id) { std::unique_lock lock(state_mutex); - auto threads_at = threads.find(thread_id); + auto const threads_at = threads.find(thread_id); if (threads_at != threads.end()) { return static_cast(threads_at->second.state); } else { @@ -677,6 +838,7 @@ class spark_resource_adaptor final : public rmm::mr::device_memory_resource { private: rmm::mr::device_memory_resource* const resource; std::shared_ptr logger; ///< spdlog logger object + bool const is_log_enabled; // The state mutex must be held when modifying the state of threads or tasks // it must never be held when calling into the child resource or after returning @@ -685,39 +847,41 @@ class spark_resource_adaptor final : public rmm::mr::device_memory_resource { std::condition_variable task_has_woken_condition; std::map threads; std::map> task_to_threads; + // Metrics are a little complicated. Spark reports metrics at a task level + // but we track and collect them at a thread level. The life time of a thread + // and a task are not tied to each other, and a thread can work on things for + // multiple tasks at the same time. So whenever a thread changes status + // the metrics for the tasks it is working on are aggregated here. When a task + // finishes the metrics for that task are then deleted. + std::map task_to_metrics; bool shutting_down = false; JavaVM* jvm; /** * log a status change that does not involve a state transition. */ - void log_status( - const char* op, long thread_id, long task_id, thread_state state, const char* notes = nullptr) + void log_status(std::string const& op, + long const thread_id, + long const task_id, + thread_state const state, + std::string const& notes = "") const { - auto this_id = static_cast(pthread_self()); - logger->info("{},{},{},{},{},,{}", - op, - this_id, - thread_id, - task_id, - as_str(state), - (notes == nullptr ? "" : notes)); + auto const this_id = static_cast(pthread_self()); + logger->info("{},{},{},{},{},,{}", op, this_id, thread_id, task_id, as_str(state), notes); } /** * log that a state transition happened. */ - void log_transition( - long thread_id, long task_id, thread_state from, thread_state to, const char* notes = nullptr) + void log_transition(long const thread_id, + long const task_id, + thread_state const from, + thread_state const to, + std::string const& notes = "") const { - auto this_id = static_cast(pthread_self()); - logger->info("TRANSITION,{},{},{},{},{},{}", - this_id, - thread_id, - task_id, - as_str(from), - as_str(to), - (notes == nullptr ? "" : notes)); + auto const this_id = static_cast(pthread_self()); + logger->info( + "TRANSITION,{},{},{},{},{},{}", this_id, thread_id, task_id, as_str(from), as_str(to), notes); } /** @@ -725,7 +889,9 @@ class spark_resource_adaptor final : public rmm::mr::device_memory_resource { * of setting the state directly. This will log the transition and do a little bit of * verification. */ - void transition(full_thread_state& state, thread_state new_state, const char* message = nullptr) + void transition(full_thread_state& state, + thread_state const new_state, + std::string const& message = "") { thread_state original = state.state; state.transition_to(new_state); @@ -735,17 +901,51 @@ class spark_resource_adaptor final : public rmm::mr::device_memory_resource { /** * throw a java exception using the cached jvm/env. 
   */
-  void throw_java_exception(const char* ex_class_name, const char* msg)
+  void throw_java_exception(char const* ex_class_name, char const* msg)
   {
     JNIEnv* env = cudf::jni::get_jni_env(jvm);
     cudf::jni::throw_java_exception(env, ex_class_name, msg);
   }
 
+  void waiting_on_pool_status_changed(long const thread_id, bool const pool_blocked)
+  {
+    std::unique_lock<std::mutex> lock(state_mutex);
+    auto const thread = threads.find(thread_id);
+    long task_id = -1;
+    if (thread != threads.end()) { task_id = thread->second.task_id; }
+
+    if (task_id < 0) {
+      std::stringstream ss;
+      ss << "thread " << thread_id << " is not a dedicated task thread";
+      throw std::invalid_argument(ss.str());
+    }
+
+    thread->second.pool_blocked = pool_blocked;
+  }
+
+  /**
+   * Checkpoint all of the metrics for a thread.
+   */
+  void checkpoint_metrics(full_thread_state& state)
+  {
+    if (state.task_id < 0) {
+      // save the metrics for all tasks before we add any new ones.
+      for (auto const task_id : state.pool_task_ids) {
+        auto const metrics_at = task_to_metrics.try_emplace(task_id, task_metrics());
+        metrics_at.first->second.add(state.metrics);
+      }
+      state.metrics.clear();
+    } else {
+      auto const metrics_at = task_to_metrics.try_emplace(state.task_id, task_metrics());
+      metrics_at.first->second.take_from(state.metrics);
+    }
+  }
+
   /**
    * This is a watchdog to prevent us from live locking. It should be called before we throw an
    * RetryOOM or a SplitAndRetryOOM to know if we actually should throw something else.
    */
-  void check_before_oom(full_thread_state& state, const std::unique_lock<std::mutex>& lock)
+  void check_before_oom(full_thread_state& state, std::unique_lock<std::mutex> const& lock)
   {
     // The limit is an arbitrary number, large enough that we should not hit it in "normal"
     // operation, but also small enough that we can detect a livelock fairly quickly.
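The watchdog above caps how many times a single thread can cycle through retry OOMs before the adaptor stops retrying and throws a plain OOM instead. On the Java side the contract is the mirror image: callers catch the retry exceptions this patch introduces, roll back to a point where all of the task's memory is spillable, and try again. A minimal caller-side sketch follows; only the exception classes (GpuRetryOOM, GpuSplitAndRetryOOM) come from this patch, while runWithRetry, rollback, split, and work are hypothetical names for the surrounding task code.

    import com.nvidia.spark.rapids.jni.GpuRetryOOM;
    import com.nvidia.spark.rapids.jni.GpuSplitAndRetryOOM;

    final class RetryLoopSketch {
      // Hedged sketch of the rollback/retry loop the adaptor assumes its callers run.
      // No retry cap is needed here: the C++ watchdog converts a livelocked retry
      // storm into a regular OOM after a bounded number of attempts.
      static void runWithRetry(Runnable rollback, Runnable split, Runnable work) {
        while (true) {
          try {
            work.run();
            return;
          } catch (GpuRetryOOM oom) {
            rollback.run();  // make all task memory spillable again, then retry as-is
          } catch (GpuSplitAndRetryOOM oom) {
            rollback.run();
            split.run();     // retry with the input split to use less memory overall
          }
        }
      }
    }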
@@ -758,34 +958,40 @@ class spark_resource_adaptor final : public rmm::mr::device_memory_resource { state.num_times_retried++; } - void throw_retry_oom(const char* msg, + void throw_retry_oom(char const* msg, full_thread_state& state, - const std::unique_lock& lock) + std::unique_lock const& lock) { - state.num_times_retry_throw++; + state.metrics.num_times_retry_throw++; check_before_oom(state, lock); state.record_failed_retry_time(); - throw_java_exception(RETRY_OOM_CLASS, "GPU OutOfMemory"); + if (state.is_cpu_alloc) { + throw_java_exception(CPU_RETRY_OOM_CLASS, "CPU OutOfMemory"); + } else { + throw_java_exception(GPU_RETRY_OOM_CLASS, "GPU OutOfMemory"); + } } - void throw_split_and_retry_oom(const char* msg, + void throw_split_and_retry_oom(char const* msg, full_thread_state& state, - const std::unique_lock& lock) + std::unique_lock const& lock) { - state.num_times_split_retry_throw++; + state.metrics.num_times_split_retry_throw++; check_before_oom(state, lock); state.record_failed_retry_time(); - throw_java_exception(SPLIT_AND_RETRY_OOM_CLASS, "GPU OutOfMemory"); + if (state.is_cpu_alloc) { + throw_java_exception(CPU_SPLIT_AND_RETRY_OOM_CLASS, "CPU OutOfMemory"); + } else { + throw_java_exception(GPU_SPLIT_AND_RETRY_OOM_CLASS, "GPU OutOfMemory"); + } } - bool is_blocked(thread_state state) + bool is_blocked(thread_state state) const { switch (state) { - case TASK_BLOCKED: - // fall through - case TASK_BUFN: + case thread_state::THREAD_BLOCKED: // fall through - case SHUFFLE_BLOCKED: return true; + case thread_state::THREAD_BUFN: return true; default: return false; } } @@ -793,23 +999,21 @@ class spark_resource_adaptor final : public rmm::mr::device_memory_resource { /** * Internal implementation that will block a thread until it is ready to continue. */ - void block_thread_until_ready(long thread_id, std::unique_lock& lock) + void block_thread_until_ready(long const thread_id, std::unique_lock& lock) { bool done = false; bool first_time = true; // Because this is called from alloc as well as from the public facing block_thread_until_ready // there are states that should only show up in relation to alloc failing. These include - // TASK_BUFN_THROW and TASK_SPLIT_THROW. They should never happen unless this is being called - // from within an alloc. + // THREAD_BUFN_THROW and THREAD_SPLIT_THROW. They should never happen unless this is being + // called from within an alloc. 
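+    // Each pass of the loop below inspects this thread's current state and either
+    // sleeps on the wake condition (THREAD_BLOCKED / THREAD_BUFN), converts a
+    // transient *_THROW state into the matching retry or split-and-retry OOM,
+    // unwinds if the thread was removed, or finishes once the thread is runnable.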
while (!done) { auto thread = threads.find(thread_id); if (thread != threads.end()) { switch (thread->second.state) { - case TASK_BLOCKED: + case thread_state::THREAD_BLOCKED: // fall through - case TASK_BUFN: - // fall through - case SHUFFLE_BLOCKED: + case thread_state::THREAD_BUFN: log_status("WAITING", thread_id, thread->second.task_id, thread->second.state); thread->second.before_block(); do { @@ -819,19 +1023,13 @@ class spark_resource_adaptor final : public rmm::mr::device_memory_resource { thread->second.after_block(); task_has_woken_condition.notify_all(); break; - case SHUFFLE_THROW: - transition(thread->second, thread_state::SHUFFLE_RUNNING); - thread->second.record_failed_retry_time(); - throw_java_exception(cudf::jni::OOM_CLASS, - "GPU OutOfMemory: could not allocate enough for shuffle"); - break; - case TASK_BUFN_THROW: - transition(thread->second, thread_state::TASK_BUFN_WAIT); + case thread_state::THREAD_BUFN_THROW: + transition(thread->second, thread_state::THREAD_BUFN_WAIT); thread->second.record_failed_retry_time(); throw_retry_oom("rollback and retry operation", thread->second, lock); break; - case TASK_BUFN_WAIT: - transition(thread->second, thread_state::TASK_BUFN); + case thread_state::THREAD_BUFN_WAIT: + transition(thread->second, thread_state::THREAD_BUFN); // Before we can wait it is possible that the throw didn't release anything // and the other threads didn't get unblocked by this, so we need to // check again to see if this was fixed or not. @@ -849,15 +1047,13 @@ class spark_resource_adaptor final : public rmm::mr::device_memory_resource { task_has_woken_condition.notify_all(); } break; - case TASK_SPLIT_THROW: - transition(thread->second, thread_state::TASK_RUNNING); + case thread_state::THREAD_SPLIT_THROW: + transition(thread->second, thread_state::THREAD_RUNNING); thread->second.record_failed_retry_time(); throw_split_and_retry_oom( "rollback, split input, and retry operation", thread->second, lock); break; - case TASK_REMOVE_THROW: - // fall through - case SHUFFLE_REMOVE_THROW: + case thread_state::THREAD_REMOVE_THROW: log_transition( thread_id, thread->second.task_id, thread->second.state, thread_state::UNKNOWN); // don't need to record failed time metric the thread is already gone... @@ -888,32 +1084,28 @@ class spark_resource_adaptor final : public rmm::mr::device_memory_resource { void wake_up_threads_after_task_finishes(const std::unique_lock& lock) { bool are_any_tasks_just_blocked = false; - for (auto thread = threads.begin(); thread != threads.end(); thread++) { - switch (thread->second.state) { - case TASK_BLOCKED: - transition(thread->second, thread_state::TASK_RUNNING); - thread->second.wake_condition->notify_all(); + for (auto& [thread_id, t_state] : threads) { + switch (t_state.state) { + case thread_state::THREAD_BLOCKED: + transition(t_state, thread_state::THREAD_RUNNING); + t_state.wake_condition->notify_all(); are_any_tasks_just_blocked = true; break; - case SHUFFLE_BLOCKED: - transition(thread->second, thread_state::SHUFFLE_RUNNING); - thread->second.wake_condition->notify_all(); - break; default: break; } } if (!are_any_tasks_just_blocked) { // wake up all of the BUFN tasks. 
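+      // (BUFN is short for "blocked until further notice": these threads already
+      // threw a retry OOM and rolled back, and are waiting for memory to free up.)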
- for (auto thread = threads.begin(); thread != threads.end(); thread++) { - switch (thread->second.state) { - case TASK_BUFN: + for (auto& [thread_id, t_state] : threads) { + switch (t_state.state) { + case thread_state::THREAD_BUFN: // fall through - case TASK_BUFN_THROW: + case thread_state::THREAD_BUFN_THROW: // fall through - case TASK_BUFN_WAIT: - transition(thread->second, thread_state::TASK_RUNNING); - thread->second.wake_condition->notify_all(); + case thread_state::THREAD_BUFN_WAIT: + transition(t_state, thread_state::THREAD_RUNNING); + t_state.wake_condition->notify_all(); break; default: break; } @@ -926,35 +1118,57 @@ class spark_resource_adaptor final : public rmm::mr::device_memory_resource { * returns true if the thread that ended was a normally running task thread. * This should be used to decide if wake_up_threads_after_task_finishes is called or not. */ - bool remove_thread_association(long thread_id, const std::unique_lock& lock) + bool remove_thread_association(long thread_id, + long remove_task_id, + const std::unique_lock& lock) { - bool ret = false; - auto threads_at = threads.find(thread_id); + bool thread_should_be_removed = false; + bool ret = false; + auto const threads_at = threads.find(thread_id); if (threads_at != threads.end()) { - auto task_id = threads_at->second.task_id; - if (task_id >= 0) { - auto task_at = task_to_threads.find(task_id); - if (task_at != task_to_threads.end()) { task_at->second.erase(thread_id); } + // save the metrics no matter what + checkpoint_metrics(threads_at->second); + + if (remove_task_id < 0) { + thread_should_be_removed = true; + } else { + auto const task_id = threads_at->second.task_id; + if (task_id >= 0) { + if (task_id == remove_task_id) { thread_should_be_removed = true; } + } else { + threads_at->second.pool_task_ids.erase(remove_task_id); + if (threads_at->second.pool_task_ids.empty()) { thread_should_be_removed = true; } + } } - switch (threads_at->second.state) { - case TASK_BLOCKED: - // fall through - case TASK_BUFN: - transition(threads_at->second, thread_state::TASK_REMOVE_THROW); - threads_at->second.wake_condition->notify_all(); - break; - case SHUFFLE_BLOCKED: - transition(threads_at->second, thread_state::SHUFFLE_REMOVE_THROW); - threads_at->second.wake_condition->notify_all(); - break; - case TASK_RUNNING: - ret = true; - // fall through; - default: - log_transition( - thread_id, threads_at->second.task_id, threads_at->second.state, thread_state::UNKNOWN); - threads.erase(threads_at); + if (thread_should_be_removed) { + JNIEnv* env = nullptr; + if (jvm->GetEnv(reinterpret_cast(&env), cudf::jni::MINIMUM_JNI_VERSION) == JNI_OK) { + cache_thread_reg_jni(env); + env->CallStaticVoidMethod(ThreadStateRegistry_jclass, removeThread_method, thread_id); + } + if (remove_task_id >= 0) { + auto const task_at = task_to_threads.find(remove_task_id); + if (task_at != task_to_threads.end()) { task_at->second.erase(thread_id); } + } + + switch (threads_at->second.state) { + case thread_state::THREAD_BLOCKED: + // fall through + case thread_state::THREAD_BUFN: + transition(threads_at->second, thread_state::THREAD_REMOVE_THROW); + threads_at->second.wake_condition->notify_all(); + break; + case thread_state::THREAD_RUNNING: + ret = true; + // fall through; + default: + log_transition(thread_id, + threads_at->second.task_id, + threads_at->second.state, + thread_state::UNKNOWN); + threads.erase(threads_at); + } } } return ret; @@ -969,30 +1183,61 @@ class spark_resource_adaptor final : public rmm::mr::device_memory_resource { * 
entered the state machine. The only known case is GPU memory required for setup in * cuDF for a spill operation. */ - bool pre_alloc(long thread_id) + bool pre_alloc(long const thread_id) { std::unique_lock lock(state_mutex); + return pre_alloc_core(thread_id, false, true, lock); + } - auto thread = threads.find(thread_id); + /** + * Called prior to processing an alloc attempt (CPU or GPU). This will throw any injected + * exception and wait until the thread is ready to actually do/retry the allocation (if + * the allocation is blocking). That blocking API may throw other exceptions if rolling + * back or splitting the input is considered needed. + * + * @return true if the call finds our thread in an ALLOC state, meaning that we recursively + * entered the state machine. This happens when we need to spill in a few cases for + * the CPU. + */ + bool pre_alloc_core(long const thread_id, + bool const is_for_cpu, + bool const blocking, + std::unique_lock& lock) + { + auto const thread = threads.find(thread_id); if (thread != threads.end()) { switch (thread->second.state) { // If the thread is in one of the ALLOC or ALLOC_FREE states, we have detected a loop // likely due to spill setup required in cuDF. We will treat this allocation differently // and skip transitions. - case TASK_ALLOC: - case SHUFFLE_ALLOC: - case TASK_ALLOC_FREE: - case SHUFFLE_ALLOC_FREE: return true; + case thread_state::THREAD_ALLOC: + // fall through + case thread_state::THREAD_ALLOC_FREE: + if (is_for_cpu && blocking) { + // On the CPU we want the spill code to be explicit so we don't have to detect it + // on the GPU we detect it and adjust dynamically + std::stringstream ss; + ss << "thread " << thread_id + << " is trying to do a blocking allocate while already in the state " + << as_str(thread->second.state); + throw std::invalid_argument(ss.str()); + } + // We are in a recursive allocation. 
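+          // (The allocator re-entered this state machine while an allocation for
+          // this thread was already in flight, which is what spill code triggers.)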
+ return true; default: break; } if (thread->second.retry_oom_injected > 0) { thread->second.retry_oom_injected--; - thread->second.num_times_retry_throw++; + thread->second.metrics.num_times_retry_throw++; log_status("INJECTED_RETRY_OOM", thread_id, thread->second.task_id, thread->second.state); thread->second.record_failed_retry_time(); - throw_java_exception(RETRY_OOM_CLASS, "injected RetryOOM"); + if (is_for_cpu) { + throw_java_exception(CPU_RETRY_OOM_CLASS, "injected RetryOOM"); + } else { + throw_java_exception(GPU_RETRY_OOM_CLASS, "injected RetryOOM"); + } } if (thread->second.cudf_exception_injected > 0) { @@ -1005,21 +1250,24 @@ class spark_resource_adaptor final : public rmm::mr::device_memory_resource { if (thread->second.split_and_retry_oom_injected > 0) { thread->second.split_and_retry_oom_injected--; - thread->second.num_times_split_retry_throw++; + thread->second.metrics.num_times_split_retry_throw++; log_status( "INJECTED_SPLIT_AND_RETRY_OOM", thread_id, thread->second.task_id, thread->second.state); thread->second.record_failed_retry_time(); - throw_java_exception(SPLIT_AND_RETRY_OOM_CLASS, "injected SplitAndRetryOOM"); + if (is_for_cpu) { + throw_java_exception(CPU_SPLIT_AND_RETRY_OOM_CLASS, "injected SplitAndRetryOOM"); + } else { + throw_java_exception(GPU_SPLIT_AND_RETRY_OOM_CLASS, "injected SplitAndRetryOOM"); + } } - block_thread_until_ready(thread_id, lock); + if (blocking) { block_thread_until_ready(thread_id, lock); } switch (thread->second.state) { - case TASK_RUNNING: transition(thread->second, thread_state::TASK_ALLOC); break; - case SHUFFLE_RUNNING: transition(thread->second, thread_state::SHUFFLE_ALLOC); break; - - // TODO I don't think there are other states that we need to handle, but - // this needs more testing. + case thread_state::THREAD_RUNNING: + transition(thread->second, thread_state::THREAD_ALLOC); + thread->second.is_cpu_alloc = is_for_cpu; + break; default: { std::stringstream ss; ss << "thread " << thread_id << " in unexpected state pre alloc " @@ -1029,6 +1277,7 @@ class spark_resource_adaptor final : public rmm::mr::device_memory_resource { } } } + // Not a recursive allocation return false; } @@ -1042,22 +1291,37 @@ class spark_resource_adaptor final : public rmm::mr::device_memory_resource { * `likely_spill` if this allocation should be treated differently, because * we detected recursion while handling a prior allocation in this thread. 
*/ - void post_alloc_success(long thread_id, bool likely_spill) + void post_alloc_success(long const thread_id, bool const likely_spill) { std::unique_lock lock(state_mutex); + post_alloc_success_core(thread_id, false, likely_spill, lock); + } + + void post_alloc_success_core(long const thread_id, + bool const is_for_cpu, + bool const was_recursive, + std::unique_lock& lock) + { // pre allocate checks - auto thread = threads.find(thread_id); - if (!likely_spill && thread != threads.end()) { + auto const thread = threads.find(thread_id); + if (!was_recursive && thread != threads.end()) { switch (thread->second.state) { - case TASK_ALLOC: - // fall through - case TASK_ALLOC_FREE: transition(thread->second, thread_state::TASK_RUNNING); break; - case SHUFFLE_ALLOC: + case thread_state::THREAD_ALLOC: // fall through - case SHUFFLE_ALLOC_FREE: transition(thread->second, thread_state::SHUFFLE_RUNNING); break; + case thread_state::THREAD_ALLOC_FREE: + if (thread->second.is_cpu_alloc != is_for_cpu) { + std::stringstream ss; + ss << "thread " << thread_id << " has a mismatch on CPU vs GPU post alloc " + << as_str(thread->second.state); + + throw std::invalid_argument(ss.str()); + } + transition(thread->second, thread_state::THREAD_RUNNING); + thread->second.is_cpu_alloc = false; + break; default: break; } - wake_next_highest_priority_blocked(lock, false); + wake_next_highest_priority_blocked(lock, false, is_for_cpu); } } @@ -1068,17 +1332,19 @@ class spark_resource_adaptor final : public rmm::mr::device_memory_resource { * * This is typically called when a free happens, or an alloc succeeds. * @param is_from_free true if a free happen. + * @param is_for_cpu true if it was a CPU operation (free or alloc) */ - void wake_next_highest_priority_blocked(const std::unique_lock& lock, - bool is_from_free) + void wake_next_highest_priority_blocked(std::unique_lock const& lock, + bool const is_from_free, + bool const is_for_cpu) { - // 1. Find the highest priority blocked thread, including shuffle. + // 1. Find the highest priority blocked thread, for the alloc that matches thread_priority to_wake(-1, -1); bool is_to_wake_set = false; - for (auto thread = threads.begin(); thread != threads.end(); thread++) { - thread_state state = thread->second.state; - if (state == thread_state::TASK_BLOCKED || state == thread_state::SHUFFLE_BLOCKED) { - thread_priority current = thread->second.priority(); + for (auto const& [thread_d, t_state] : threads) { + thread_state const& state = t_state.state; + if (state == thread_state::THREAD_BLOCKED && is_for_cpu == t_state.is_cpu_alloc) { + thread_priority current = t_state.priority(); if (!is_to_wake_set || to_wake < current) { to_wake = current; is_to_wake_set = true; @@ -1086,17 +1352,13 @@ class spark_resource_adaptor final : public rmm::mr::device_memory_resource { } } // 2. 
wake up that thread - long thread_id_to_wake = to_wake.get_thread_id(); + long const thread_id_to_wake = to_wake.get_thread_id(); if (thread_id_to_wake > 0) { - auto thread = threads.find(thread_id_to_wake); + auto const thread = threads.find(thread_id_to_wake); if (thread != threads.end()) { switch (thread->second.state) { - case TASK_BLOCKED: - transition(thread->second, thread_state::TASK_RUNNING); - thread->second.wake_condition->notify_all(); - break; - case SHUFFLE_BLOCKED: - transition(thread->second, thread_state::SHUFFLE_RUNNING); + case thread_state::THREAD_BLOCKED: + transition(thread->second, thread_state::THREAD_RUNNING); thread->second.wake_condition->notify_all(); break; default: { @@ -1115,67 +1377,175 @@ class spark_resource_adaptor final : public rmm::mr::device_memory_resource { // instead of trying to split its input. But we only do this if it // is a different thread that is freeing memory from the one we want to wake up. // This is because if the threads are the same no new memory is being added - // to what that task has access to and the task may never thow a retry and split. + // to what that task has access to and the task may never throw a retry and split. // Instead it would just keep retrying and freeing the same memory each time. - std::set tasks_with_threads; - std::set tasks_with_threads_bufn; + std::map pool_bufn_task_thread_count; + std::map pool_task_thread_count; + std::unordered_set bufn_task_ids; + std::unordered_set all_task_ids; + is_in_deadlock( + pool_bufn_task_thread_count, pool_task_thread_count, bufn_task_ids, all_task_ids, lock); + bool const all_bufn = all_task_ids.size() == bufn_task_ids.size(); + if (all_bufn) { + thread_priority to_wake(-1, -1); + bool is_to_wake_set = false; + for (auto const& [thread_id, t_state] : threads) { + switch (t_state.state) { + case thread_state::THREAD_BUFN: { + if (is_for_cpu == t_state.is_cpu_alloc) { + thread_priority current = t_state.priority(); + if (!is_to_wake_set || to_wake < current) { + to_wake = current; + is_to_wake_set = true; + } + } + } break; + default: break; + } + } + // 4. Wake up the BUFN thread if we should + if (is_to_wake_set) { + long const thread_id_to_wake = to_wake.get_thread_id(); + if (thread_id_to_wake > 0) { + // Don't wake up yourself on a free. It is not adding more memory for this thread + // to use on a retry and we might need a split instead to break a deadlock + auto const this_id = static_cast(pthread_self()); + auto const thread = threads.find(thread_id_to_wake); + if (thread != threads.end() && thread->first != this_id) { + switch (thread->second.state) { + case thread_state::THREAD_BUFN: + transition(thread->second, thread_state::THREAD_RUNNING); + thread->second.wake_condition->notify_all(); + break; + case thread_state::THREAD_BUFN_WAIT: + transition(thread->second, thread_state::THREAD_RUNNING); + // no need to notify anyone, we will just retry without blocking... + break; + case thread_state::THREAD_BUFN_THROW: + // This should really never happen, this is a temporary state that is here only + // while the lock is held, but just in case we don't want to mess it up, or throw + // an exception. 
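+              // The thread that owns this state will move itself to THREAD_BUFN_WAIT
+              // and throw the retry OOM from block_thread_until_ready.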
+                break;
+              default: {
+                std::stringstream ss;
+                ss << "internal error expected to only wake up blocked threads "
+                   << thread_id_to_wake << " " << as_str(thread->second.state);
+                throw std::runtime_error(ss.str());
+              }
+            }
+          }
+        }
+      }
+    }
+  }
+
+  bool is_thread_bufn_or_above(JNIEnv* env, full_thread_state const& state)
+  {
+    bool ret = false;
+    if (state.pool_blocked) {
+      ret = true;
+    } else {
+      switch (state.state) {
+        case thread_state::THREAD_BLOCKED: ret = false; break;
+        case thread_state::THREAD_BUFN:
+          // we are looking for even a single thread that is not blocked
+          ret = true;
+          break;
+        default:
+          ret = env->CallStaticBooleanMethod(
+            ThreadStateRegistry_jclass, isThreadBlocked_method, state.thread_id);
+          break;
+      }
+    }
+    return ret;
+  }
+
+  bool is_in_deadlock(std::map<long, int>& pool_bufn_task_thread_count,
+                      std::map<long, int>& pool_task_thread_count,
+                      std::unordered_set<long>& bufn_task_ids,
+                      std::unordered_set<long>& all_task_ids,
+                      std::unique_lock<std::mutex> const& lock)
+  {
+    JNIEnv* env = nullptr;
+    if (jvm->GetEnv(reinterpret_cast<void**>(&env), cudf::jni::MINIMUM_JNI_VERSION) != JNI_OK) {
+      throw std::runtime_error("Could not init JNI callbacks");
+    }
+    cache_thread_reg_jni(env);
+
+    // If all of the tasks are blocked, then we are in a deadlock situation
+    // and we need to wake something up. In theory if any one thread is still
+    // doing something, then we are not deadlocked. But the problem is detecting
+    // if a thread is blocked cheaply and accurately. We can tell if this code has
+    // blocked a thread. We can also have code we control inform us if a thread is
+    // blocked. We even have a callback to the JVM to see if the state of the java
+    // thread indicates if it is blocked or not. But I/O in java most of the time
+    // shows the thread as RUNNABLE. We also don't want to look at stack traces if
+    // we can avoid it as it is expensive. The reason this matters is because of
+    // python UDFs. When a python process runs to execute UDFs at least two dedicated
+    // task threads are used for a single task. One will write data to the python
+    // process and another will read results from it. Because both involve
+    // I/O we need a solution. For now we assume that a task is blocked if any
+    // one of the dedicated task threads is blocked and if all of the pool
+    // threads working on that task are also blocked. This is because the pool
+    // threads, even if they are blocked on I/O, will eventually finish without
+    // needing to worry about it.
+    //
+    // We also need a way to detect if we need to split the input and retry.
+    // This happens when all of the tasks are also blocked until
+    // further notice. So we are going to treat a task as blocked until
+    // further notice if any of the dedicated threads for it are blocked until
+    // further notice, or all of the pool threads working on things for it are
+    // blocked until further notice.
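+    // The four collections passed in are out parameters. check_and_update_for_bufn
+    // later compares pool_bufn_task_thread_count with pool_task_thread_count, and
+    // bufn_task_ids with all_task_ids, to decide when a split-and-retry is needed.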
+ std::unordered_set blocked_task_ids; + + // We are going to do two passes through the threads to deal with this. + // First pass is to look at the dedicated task threads + for (auto const& [thread_id, t_state] : threads) { + long const task_id = t_state.task_id; + if (task_id >= 0) { + all_task_ids.insert(task_id); + bool const is_bufn_plus = is_thread_bufn_or_above(env, t_state); + if (is_bufn_plus) { bufn_task_ids.insert(task_id); } + if (is_bufn_plus || t_state.state == thread_state::THREAD_BLOCKED) { + blocked_task_ids.insert(task_id); } } + } - // 4. Wake up the BUFN thread if we should - if (tasks_with_threads.size() == tasks_with_threads_bufn.size() && is_to_wake_set) { - long thread_id_to_wake = to_wake.get_thread_id(); - if (thread_id_to_wake > 0) { - // Don't wake up yourself on a free. It is not adding more memory for this thread - // to use on a retry and we might need a split instead to break a deadlock - auto this_id = static_cast(pthread_self()); - auto thread = threads.find(thread_id_to_wake); - if (thread != threads.end() && thread->first != this_id) { - switch (thread->second.state) { - case TASK_BUFN: - transition(thread->second, thread_state::TASK_RUNNING); - thread->second.wake_condition->notify_all(); - break; - case TASK_BUFN_WAIT: - transition(thread->second, thread_state::TASK_RUNNING); - // no need to notify anyone, we will just retry without blocking... - break; - case TASK_BUFN_THROW: - // This should really never happen, this is a temporary state that is here only - // while the lock is held, but just in case we don't want to mess it up, or throw - // an exception. - break; - default: { - std::stringstream ss; - ss << "internal error expected to only wake up blocked threads " - << thread_id_to_wake << " " << as_str(thread->second.state); - throw std::runtime_error(ss.str()); - } + // Second pass is to look at the pool threads + for (auto const& [thread_id, t_state] : threads) { + long const is_pool_thread = t_state.task_id < 0; + if (is_pool_thread) { + for (auto const& task_id : t_state.pool_task_ids) { + auto const it = pool_task_thread_count.find(task_id); + if (it != pool_task_thread_count.end()) { + it->second += 1; + } else { + pool_task_thread_count[task_id] = 1; + } + } + + bool const is_bufn_plus = is_thread_bufn_or_above(env, t_state); + if (is_bufn_plus) { + for (auto const& task_id : t_state.pool_task_ids) { + auto const it = pool_bufn_task_thread_count.find(task_id); + if (it != pool_bufn_task_thread_count.end()) { + it->second += 1; + } else { + pool_bufn_task_thread_count[task_id] = 1; } } } + if (!is_bufn_plus && t_state.state != thread_state::THREAD_BLOCKED) { + for (auto const& task_id : t_state.pool_task_ids) { + blocked_task_ids.erase(task_id); + } + } } } + // Now if all of the tasks are blocked, then we need to break a deadlock + return all_task_ids.size() == blocked_task_ids.size(); } /** @@ -1185,54 +1555,20 @@ class spark_resource_adaptor final : public rmm::mr::device_memory_resource { */ void check_and_update_for_bufn(const std::unique_lock& lock) { - // We want to know if all active tasks have at least one thread that - // is effectively blocked or not. We could change the definitions here, - // but for now this sounds like a good starting point. - std::set tasks_with_threads; - std::set tasks_with_threads_effectively_blocked; - bool is_any_shuffle_thread_blocked = false; - - // To keep things simple we are going to do multiple passes through - // the state. 
The first is to find out if any shuffle thread is blocked - // because if it is, then there is a possibility that any task thread - // in a shuffle could also be blocked. - for (auto thread = threads.begin(); thread != threads.end(); thread++) { - switch (thread->second.state) { - case SHUFFLE_BLOCKED: is_any_shuffle_thread_blocked = true; break; - default: break; - } - } - - for (auto thread = threads.begin(); thread != threads.end(); thread++) { - if (thread->second.task_id >= 0) { tasks_with_threads.insert(thread->second.task_id); } - - switch (thread->second.state) { - case TASK_WAIT_ON_SHUFFLE: - // fall through - case TASK_BUFN_WAIT_ON_SHUFFLE: - if (is_any_shuffle_thread_blocked) { - tasks_with_threads_effectively_blocked.insert(thread->second.task_id); - } - break; - case TASK_BLOCKED: - // fall through - case TASK_BUFN: - tasks_with_threads_effectively_blocked.insert(thread->second.task_id); - break; - default: break; - } - } - - bool need_to_break_deadlock = - tasks_with_threads.size() == tasks_with_threads_effectively_blocked.size(); + std::map pool_bufn_task_thread_count; + std::map pool_task_thread_count; + std::unordered_set bufn_task_ids; + std::unordered_set all_task_ids; + bool const need_to_break_deadlock = is_in_deadlock( + pool_bufn_task_thread_count, pool_task_thread_count, bufn_task_ids, all_task_ids, lock); if (need_to_break_deadlock) { // Find the task thread with the lowest priority that is not already BUFN thread_priority to_bufn(-1, -1); bool is_to_bufn_set = false; - for (auto thread = threads.begin(); thread != threads.end(); thread++) { - switch (thread->second.state) { - case TASK_BLOCKED: { - thread_priority current = thread->second.priority(); + for (auto const& [thread_id, t_state] : threads) { + switch (t_state.state) { + case thread_state::THREAD_BLOCKED: { + thread_priority const& current = t_state.priority(); if (!is_to_bufn_set || current < to_bufn) { to_bufn = current; is_to_bufn_set = true; @@ -1242,56 +1578,52 @@ class spark_resource_adaptor final : public rmm::mr::device_memory_resource { } } if (is_to_bufn_set) { - long thread_id_to_bufn = to_bufn.get_thread_id(); - auto thread = threads.find(thread_id_to_bufn); + long const thread_id_to_bufn = to_bufn.get_thread_id(); + auto const thread = threads.find(thread_id_to_bufn); if (thread != threads.end()) { - transition(thread->second, thread_state::TASK_BUFN_THROW); + transition(thread->second, thread_state::THREAD_BUFN_THROW); thread->second.wake_condition->notify_all(); + // We are explicitly not going to update the state around BUFN + // here, because we really want to wait for the retry to run + // it's course instead of doing a split right away. + } + } + // We now need a way to detect if we need to split the input and retry. + // This happens when all of the tasks are also blocked until + // further notice. So we are going to treat a task as blocked until + // further notice if any of the dedicated threads for it are blocked until + // further notice, or all of the pool threads working on things for it are + // blocked until further notice. + + for (auto const& [task_id, bufn_count] : pool_bufn_task_thread_count) { + auto const pttc = pool_task_thread_count.find(task_id); + if (pttc != pool_task_thread_count.end() && pttc->second <= bufn_count) { + bufn_task_ids.insert(task_id); } } - // Now we need to check if all of the threads are BUFN - // Are all BUFN?? 
- bool all_bufn_or_shuffle = true; - thread_priority to_wake(-1, -1); - bool is_to_wake_set = false; - for (auto thread = threads.begin(); thread != threads.end(); thread++) { - if (thread->second.task_id >= 0) { - switch (thread->second.state) { - case TASK_BUFN: { - thread_priority current = thread->second.priority(); + bool const all_bufn = all_task_ids.size() == bufn_task_ids.size(); + + if (all_bufn) { + thread_priority to_wake(-1, -1); + bool is_to_wake_set = false; + for (auto const& [thread_id, t_state] : threads) { + switch (t_state.state) { + case thread_state::THREAD_BUFN: { + thread_priority const& current = t_state.priority(); if (!is_to_wake_set || to_wake < current) { to_wake = current; is_to_wake_set = true; } } break; - case TASK_WAIT_ON_SHUFFLE: - if (!is_any_shuffle_thread_blocked) { all_bufn_or_shuffle = false; } - break; - default: all_bufn_or_shuffle = false; break; + default: break; } } - } - if (all_bufn_or_shuffle) { - long thread_id = to_wake.get_thread_id(); - auto found_thread = threads.find(thread_id); + long const thread_id = to_wake.get_thread_id(); + auto const found_thread = threads.find(thread_id); if (found_thread != threads.end()) { - transition(found_thread->second, thread_state::TASK_SPLIT_THROW); + transition(found_thread->second, thread_state::THREAD_SPLIT_THROW); found_thread->second.wake_condition->notify_all(); - } else { - // the only threads left are blocked on shuffle. No way for shuffle - // to split and throw, and ideally all of the data for those threads - // should already be spillable, so at this point shuffle needs to - // throw an OOM. - for (auto thread = threads.begin(); thread != threads.end(); thread++) { - switch (thread->second.state) { - case SHUFFLE_BLOCKED: - transition(thread->second, thread_state::SHUFFLE_THROW); - thread->second.wake_condition->notify_all(); - break; - default: break; - } - } } } } @@ -1302,30 +1634,41 @@ class spark_resource_adaptor final : public rmm::mr::device_memory_resource { * typically happen after this has run, and we loop around to retry the alloc * if the state says we should. */ - bool post_alloc_failed(long thread_id, bool is_oom, bool likely_spill) + bool post_alloc_failed(long const thread_id, bool const is_oom, bool const likely_spill) { std::unique_lock lock(state_mutex); - auto thread = threads.find(thread_id); + return post_alloc_failed_core(thread_id, false, is_oom, true, likely_spill, lock); + } + + bool post_alloc_failed_core(long const thread_id, + bool const is_for_cpu, + bool const is_oom, + bool const blocking, + bool const was_recursive, + std::unique_lock& lock) + { + auto const thread = threads.find(thread_id); // only retry if this was due to an out of memory exception. 
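+    // Only an OOM on a blocking allocation parks the thread in THREAD_BLOCKED for a
+    // retry; any other failure just resets the state and propagates to the caller.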
bool ret = true; - if (!likely_spill && thread != threads.end()) { + if (!was_recursive && thread != threads.end()) { + if (thread->second.is_cpu_alloc != is_for_cpu) { + std::stringstream ss; + ss << "thread " << thread_id << " has a mismatch on CPU vs GPU post alloc " + << as_str(thread->second.state); + + throw std::invalid_argument(ss.str()); + } + switch (thread->second.state) { - case TASK_ALLOC_FREE: transition(thread->second, thread_state::TASK_RUNNING); break; - case TASK_ALLOC: - if (is_oom) { - transition(thread->second, thread_state::TASK_BLOCKED); - } else { - // don't block unless it is OOM - transition(thread->second, thread_state::TASK_RUNNING); - } + case thread_state::THREAD_ALLOC_FREE: + transition(thread->second, thread_state::THREAD_RUNNING); break; - case SHUFFLE_ALLOC_FREE: transition(thread->second, thread_state::SHUFFLE_RUNNING); break; - case SHUFFLE_ALLOC: - if (is_oom) { - transition(thread->second, thread_state::SHUFFLE_BLOCKED); + case thread_state::THREAD_ALLOC: + if (is_oom && blocking) { + transition(thread->second, thread_state::THREAD_BLOCKED); } else { - // don't block unless it is OOM - transition(thread->second, thread_state::SHUFFLE_RUNNING); + // don't block unless it is OOM on a blocking allocation + transition(thread->second, thread_state::THREAD_RUNNING); } break; default: { @@ -1343,21 +1686,21 @@ class spark_resource_adaptor final : public rmm::mr::device_memory_resource { return ret; } - void* do_allocate(std::size_t num_bytes, rmm::cuda_stream_view stream) override + void* do_allocate(std::size_t const num_bytes, rmm::cuda_stream_view stream) override { - auto tid = static_cast(pthread_self()); + auto const tid = static_cast(pthread_self()); while (true) { - bool likely_spill = pre_alloc(tid); + bool const likely_spill = pre_alloc(tid); try { void* ret = resource->allocate(num_bytes, stream); post_alloc_success(tid, likely_spill); return ret; - } catch (const rmm::out_of_memory& e) { + } catch (rmm::out_of_memory const& e) { // rmm::out_of_memory is what is thrown when an allocation failed // but there are other rmm::bad_alloc exceptions that could be // thrown as well, which are handled by the std::exception case. if (!post_alloc_failed(tid, true, likely_spill)) { throw; } - } catch (const std::exception& e) { + } catch (std::exception const& e) { post_alloc_failed(tid, false, likely_spill); throw; } @@ -1366,42 +1709,49 @@ class spark_resource_adaptor final : public rmm::mr::device_memory_resource { throw rmm::bad_alloc("Internal Error"); } + void dealloc_core(bool const is_for_cpu, std::unique_lock& lock) + { + auto const tid = static_cast(pthread_self()); + auto const thread = threads.find(tid); + if (thread != threads.end()) { + log_status("DEALLOC", tid, thread->second.task_id, thread->second.state); + } else { + log_status("DEALLOC", tid, -2, thread_state::UNKNOWN); + } + + for (auto& [thread_id, t_state] : threads) { + // Only update state for _other_ threads. We update only other threads, for the case + // where we are handling a free from the recursive case: when an allocation/free + // happened while handling an allocation failure in onAllocFailed. + // + // If we moved all threads to *_ALLOC_FREE, after we exit the recursive state and + // are back handling the original allocation failure, we are left with a thread + // in a state that won't be retried in `post_alloc_failed`. 
+ // + // By not changing our thread's state to THREAD_ALLOC_FREE, we keep the state + // the same, but we still let other threads know that there was a free and they should + // handle accordingly. + if (t_state.thread_id != tid) { + switch (t_state.state) { + case thread_state::THREAD_ALLOC: + if (is_for_cpu == t_state.is_cpu_alloc) { + transition(t_state, thread_state::THREAD_ALLOC_FREE); + } + break; + default: break; + } + } + } + wake_next_highest_priority_blocked(lock, true, is_for_cpu); + } + void do_deallocate(void* p, std::size_t size, rmm::cuda_stream_view stream) override { resource->deallocate(p, size, stream); // deallocate success if (size > 0) { std::unique_lock lock(state_mutex); - - auto tid = static_cast(pthread_self()); - auto thread = threads.find(tid); - if (thread != threads.end()) { - log_status("DEALLOC", tid, thread->second.task_id, thread->second.state); - } else { - log_status("DEALLOC", tid, -2, thread_state::UNKNOWN); - } - - for (auto thread = threads.begin(); thread != threads.end(); thread++) { - // Only update state for _other_ threads. We update only other threads, for the case - // where we are handling a free from the recursive case: when an allocation/free - // happened while handling an allocation failure in onAllocFailed. - // - // If we moved all threads to *_ALLOC_FREE, after we exit the recursive state and - // are back handling the original allocation failure, we are left with a thread - // in a state that won't be retried in `post_alloc_failed`. - // - // By not changing our thread's state to TASK_ALLOC_FREE, we keep the state - // the same, but we still let other threads know that there was a free and they should - // handle accordingly. - if (thread->second.thread_id != tid) { - switch (thread->second.state) { - case TASK_ALLOC: transition(thread->second, thread_state::TASK_ALLOC_FREE); break; - case SHUFFLE_ALLOC: transition(thread->second, thread_state::SHUFFLE_ALLOC_FREE); break; - default: break; - } - } - } - wake_next_highest_priority_blocked(lock, true); + dealloc_core(false, lock); } } @@ -1434,9 +1784,12 @@ JNIEXPORT jlong JNICALL Java_com_nvidia_spark_rapids_jni_SparkResourceAdaptor_cr auto wrapped = reinterpret_cast(child); cudf::jni::native_jstring nlogloc(env, log_loc); std::shared_ptr logger; + bool is_log_enabled; if (nlogloc.is_null()) { - logger = make_logger(); + logger = make_logger(); + is_log_enabled = false; } else { + is_log_enabled = true; std::string slog_loc(nlogloc.get()); if (slog_loc == "stderr") { logger = make_logger(std::cerr); @@ -1447,7 +1800,7 @@ JNIEXPORT jlong JNICALL Java_com_nvidia_spark_rapids_jni_SparkResourceAdaptor_cr } } - auto ret = new spark_resource_adaptor(env, wrapped, logger); + auto ret = new spark_resource_adaptor(env, wrapped, logger, is_log_enabled); return cudf::jni::ptr_as_jlong(ret); } CATCH_STD(env, 0) @@ -1466,44 +1819,59 @@ Java_com_nvidia_spark_rapids_jni_SparkResourceAdaptor_releaseAdaptor(JNIEnv* env } JNIEXPORT void JNICALL -Java_com_nvidia_spark_rapids_jni_SparkResourceAdaptor_associateThreadWithTask( +Java_com_nvidia_spark_rapids_jni_SparkResourceAdaptor_startDedicatedTaskThread( JNIEnv* env, jclass, jlong ptr, jlong thread_id, jlong task_id) { JNI_NULL_CHECK(env, ptr, "resource_adaptor is null", ); try { cudf::jni::auto_set_device(env); auto mr = reinterpret_cast(ptr); - mr->associate_thread_with_task(thread_id, task_id); + mr->start_dedicated_task_thread(thread_id, task_id); } CATCH_STD(env, ) } JNIEXPORT void JNICALL 
-Java_com_nvidia_spark_rapids_jni_SparkResourceAdaptor_associateThreadWithShuffle(JNIEnv* env, - jclass, - jlong ptr, - jlong thread_id) +Java_com_nvidia_spark_rapids_jni_SparkResourceAdaptor_poolThreadWorkingOnTasks( + JNIEnv* env, jclass, jlong ptr, jboolean is_for_shuffle, jlong thread_id, jlongArray task_ids) { JNI_NULL_CHECK(env, ptr, "resource_adaptor is null", ); + JNI_NULL_CHECK(env, task_ids, "task_ids is null", ); try { cudf::jni::auto_set_device(env); + cudf::jni::native_jlongArray jtask_ids(env, task_ids); + std::unordered_set task_set(jtask_ids.begin(), jtask_ids.end()); auto mr = reinterpret_cast(ptr); - mr->associate_thread_with_shuffle(thread_id); + mr->pool_thread_working_on_tasks(is_for_shuffle, thread_id, task_set); } CATCH_STD(env, ) } JNIEXPORT void JNICALL -Java_com_nvidia_spark_rapids_jni_SparkResourceAdaptor_removeThreadAssociation(JNIEnv* env, - jclass, - jlong ptr, - jlong thread_id) +Java_com_nvidia_spark_rapids_jni_SparkResourceAdaptor_poolThreadFinishedForTasks( + JNIEnv* env, jclass, jlong ptr, jlong thread_id, jlongArray task_ids) { JNI_NULL_CHECK(env, ptr, "resource_adaptor is null", ); + JNI_NULL_CHECK(env, task_ids, "task_ids is null", ); try { cudf::jni::auto_set_device(env); + cudf::jni::native_jlongArray jtask_ids(env, task_ids); + std::unordered_set task_set(jtask_ids.begin(), jtask_ids.end()); auto mr = reinterpret_cast(ptr); - mr->remove_thread_association(thread_id); + mr->pool_thread_finished_for_tasks(thread_id, task_set); + } + CATCH_STD(env, ) +} + +JNIEXPORT void JNICALL +Java_com_nvidia_spark_rapids_jni_SparkResourceAdaptor_removeThreadAssociation( + JNIEnv* env, jclass, jlong ptr, jlong thread_id, jlong task_id) +{ + JNI_NULL_CHECK(env, ptr, "resource_adaptor is null", ); + try { + cudf::jni::auto_set_device(env); + auto mr = reinterpret_cast(ptr); + mr->remove_thread_association(thread_id, task_id); } CATCH_STD(env, ) } @@ -1522,29 +1890,38 @@ JNIEXPORT void JNICALL Java_com_nvidia_spark_rapids_jni_SparkResourceAdaptor_tas CATCH_STD(env, ) } -JNIEXPORT void JNICALL -Java_com_nvidia_spark_rapids_jni_SparkResourceAdaptor_threadCouldBlockOnShuffle(JNIEnv* env, - jclass, - jlong ptr, - jlong thread_id) +JNIEXPORT void JNICALL Java_com_nvidia_spark_rapids_jni_SparkResourceAdaptor_submittingToPool( + JNIEnv* env, jclass, jlong ptr, jlong thread_id) +{ + JNI_NULL_CHECK(env, ptr, "resource_adaptor is null", ); + try { + cudf::jni::auto_set_device(env); + auto mr = reinterpret_cast(ptr); + mr->submitting_to_pool(thread_id); + } + CATCH_STD(env, ) +} + +JNIEXPORT void JNICALL Java_com_nvidia_spark_rapids_jni_SparkResourceAdaptor_waitingOnPool( + JNIEnv* env, jclass, jlong ptr, jlong thread_id) { JNI_NULL_CHECK(env, ptr, "resource_adaptor is null", ); try { cudf::jni::auto_set_device(env); auto mr = reinterpret_cast(ptr); - mr->thread_could_block_on_shuffle(thread_id); + mr->waiting_on_pool(thread_id); } CATCH_STD(env, ) } -JNIEXPORT void JNICALL Java_com_nvidia_spark_rapids_jni_SparkResourceAdaptor_threadDoneWithShuffle( +JNIEXPORT void JNICALL Java_com_nvidia_spark_rapids_jni_SparkResourceAdaptor_doneWaitingOnPool( JNIEnv* env, jclass, jlong ptr, jlong thread_id) { JNI_NULL_CHECK(env, ptr, "resource_adaptor is null", ); try { cudf::jni::auto_set_device(env); auto mr = reinterpret_cast(ptr); - mr->thread_done_with_shuffle(thread_id); + mr->done_waiting_on_pool(thread_id); } CATCH_STD(env, ) } @@ -1688,4 +2065,70 @@ JNIEXPORT void JNICALL Java_com_nvidia_spark_rapids_jni_SparkResourceAdaptor_end } CATCH_STD(env, ) } + +JNIEXPORT void JNICALL 
Java_com_nvidia_spark_rapids_jni_SparkResourceAdaptor_checkAndBreakDeadlocks( + JNIEnv* env, jclass, jlong ptr) +{ + JNI_NULL_CHECK(env, ptr, "resource_adaptor is null", ); + try { + cudf::jni::auto_set_device(env); + auto mr = reinterpret_cast(ptr); + mr->check_and_break_deadlocks(); + } + CATCH_STD(env, ) +} + +JNIEXPORT jboolean JNICALL Java_com_nvidia_spark_rapids_jni_SparkResourceAdaptor_preCpuAlloc( + JNIEnv* env, jclass, jlong ptr, jlong amount, jboolean blocking) +{ + JNI_NULL_CHECK(env, ptr, "resource_adaptor is null", 0); + try { + cudf::jni::auto_set_device(env); + auto mr = reinterpret_cast(ptr); + return mr->cpu_prealloc(amount, blocking); + } + CATCH_STD(env, 0) +} + +JNIEXPORT void JNICALL +Java_com_nvidia_spark_rapids_jni_SparkResourceAdaptor_postCpuAllocSuccess(JNIEnv* env, + jclass, + jlong ptr, + jlong addr, + jlong amount, + jboolean blocking, + jboolean was_recursive) +{ + JNI_NULL_CHECK(env, ptr, "resource_adaptor is null", ); + try { + cudf::jni::auto_set_device(env); + auto mr = reinterpret_cast(ptr); + mr->cpu_postalloc_success(reinterpret_cast(addr), amount, blocking, was_recursive); + } + CATCH_STD(env, ) +} + +JNIEXPORT jboolean JNICALL Java_com_nvidia_spark_rapids_jni_SparkResourceAdaptor_postCpuAllocFailed( + JNIEnv* env, jclass, jlong ptr, jboolean was_oom, jboolean blocking, jboolean was_recursive) +{ + JNI_NULL_CHECK(env, ptr, "resource_adaptor is null", 0); + try { + cudf::jni::auto_set_device(env); + auto mr = reinterpret_cast(ptr); + return mr->cpu_postalloc_failed(was_oom, blocking, was_recursive); + } + CATCH_STD(env, 0) +} + +JNIEXPORT void JNICALL Java_com_nvidia_spark_rapids_jni_SparkResourceAdaptor_cpuDeallocate( + JNIEnv* env, jclass, jlong ptr, jlong addr, jlong amount) +{ + JNI_NULL_CHECK(env, ptr, "resource_adaptor is null", ); + try { + cudf::jni::auto_set_device(env); + auto mr = reinterpret_cast(ptr); + mr->cpu_dealloc(reinterpret_cast(addr), amount); + } + CATCH_STD(env, ) +} } diff --git a/src/main/java/com/nvidia/spark/rapids/jni/CpuRetryOOM.java b/src/main/java/com/nvidia/spark/rapids/jni/CpuRetryOOM.java new file mode 100644 index 0000000000..a8fb42390a --- /dev/null +++ b/src/main/java/com/nvidia/spark/rapids/jni/CpuRetryOOM.java @@ -0,0 +1,32 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.nvidia.spark.rapids.jni; + +/** + * A special version of an out of memory error that indicates we ran out of off heap memory, but + * should roll back to a point when all memory for the task is spillable and then retry the + * operation. 
+ */ +public class CpuRetryOOM extends OffHeapOOM { + public CpuRetryOOM() { + super(); + } + + public CpuRetryOOM(String message) { + super(message); + } +} diff --git a/src/main/java/com/nvidia/spark/rapids/jni/CpuSplitAndRetryOOM.java b/src/main/java/com/nvidia/spark/rapids/jni/CpuSplitAndRetryOOM.java new file mode 100644 index 0000000000..16e6e7239f --- /dev/null +++ b/src/main/java/com/nvidia/spark/rapids/jni/CpuSplitAndRetryOOM.java @@ -0,0 +1,32 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.nvidia.spark.rapids.jni; + +/** + * A special version of an out of memory error that indicates we ran out of off heap memory, but + * should roll back to a point when all memory for the task is spillable and then retry the + * operation with the input data split to make it ideally use less off heap memory overall. + */ +public class CpuSplitAndRetryOOM extends OffHeapOOM { + public CpuSplitAndRetryOOM() { + super(); + } + + public CpuSplitAndRetryOOM(String message) { + super(message); + } +} diff --git a/src/main/java/com/nvidia/spark/rapids/jni/RetryOOM.java b/src/main/java/com/nvidia/spark/rapids/jni/GpuRetryOOM.java similarity index 85% rename from src/main/java/com/nvidia/spark/rapids/jni/RetryOOM.java rename to src/main/java/com/nvidia/spark/rapids/jni/GpuRetryOOM.java index 62d5e28cca..f7eec8be46 100644 --- a/src/main/java/com/nvidia/spark/rapids/jni/RetryOOM.java +++ b/src/main/java/com/nvidia/spark/rapids/jni/GpuRetryOOM.java @@ -17,15 +17,15 @@ package com.nvidia.spark.rapids.jni; /** - * A special version of an out of memory error that indicates we ran out of memory, but should + * A special version of an out of memory error that indicates we ran out of GPU memory, but should * roll back to a point when all memory for the task is spillable and then retry the operation. 
*/ -public class RetryOOM extends GpuOOM { - public RetryOOM() { +public class GpuRetryOOM extends GpuOOM { + public GpuRetryOOM() { super(); } - public RetryOOM(String message) { + public GpuRetryOOM(String message) { super(message); } } diff --git a/src/main/java/com/nvidia/spark/rapids/jni/SplitAndRetryOOM.java b/src/main/java/com/nvidia/spark/rapids/jni/GpuSplitAndRetryOOM.java similarity index 85% rename from src/main/java/com/nvidia/spark/rapids/jni/SplitAndRetryOOM.java rename to src/main/java/com/nvidia/spark/rapids/jni/GpuSplitAndRetryOOM.java index 022c6952a1..4c3b3baeba 100644 --- a/src/main/java/com/nvidia/spark/rapids/jni/SplitAndRetryOOM.java +++ b/src/main/java/com/nvidia/spark/rapids/jni/GpuSplitAndRetryOOM.java @@ -17,16 +17,16 @@ package com.nvidia.spark.rapids.jni; /** - * A special version of an out of memory error that indicates we ran out of memory, but should + * A special version of an out of memory error that indicates we ran out of GPU memory, but should * roll back to a point when all memory for the task is spillable and then retry the operation * with the input data split to make it ideally use less GPU memory overall. */ -public class SplitAndRetryOOM extends GpuOOM { - public SplitAndRetryOOM() { +public class GpuSplitAndRetryOOM extends GpuOOM { + public GpuSplitAndRetryOOM() { super(); } - public SplitAndRetryOOM(String message) { + public GpuSplitAndRetryOOM(String message) { super(message); } } diff --git a/src/main/java/com/nvidia/spark/rapids/jni/OffHeapOOM.java b/src/main/java/com/nvidia/spark/rapids/jni/OffHeapOOM.java new file mode 100644 index 0000000000..9379775072 --- /dev/null +++ b/src/main/java/com/nvidia/spark/rapids/jni/OffHeapOOM.java @@ -0,0 +1,32 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.nvidia.spark.rapids.jni; + +/** + * A special version of an out of memory error that indicates we ran out of off heap CPU memory. + * This is mostly to avoid a fatal error that would force the worker process to restart. This + * should be recoverable. + */ +public class OffHeapOOM extends RuntimeException { + public OffHeapOOM() { + super(); + } + + public OffHeapOOM(String message) { + super(message); + } +} diff --git a/src/main/java/com/nvidia/spark/rapids/jni/RmmSpark.java b/src/main/java/com/nvidia/spark/rapids/jni/RmmSpark.java index 3132dd9cd0..558124e2fe 100644 --- a/src/main/java/com/nvidia/spark/rapids/jni/RmmSpark.java +++ b/src/main/java/com/nvidia/spark/rapids/jni/RmmSpark.java @@ -114,47 +114,122 @@ public static long getCurrentThreadId() { } /** - * Associate a thread with a given task id. + * Indicate that a given thread is dedicated to a specific task. This thread can be part of a + * thread pool, but if it blocks it can never transitively block another active task. * @param threadId the thread ID to use - * @param taskId the task ID this thread is associated with. + * @param taskId the task ID this thread is working on. 
*/ - public static void associateThreadWithTask(long threadId, long taskId) { + public static void startDedicatedTaskThread(long threadId, long taskId, Thread thread) { synchronized (Rmm.class) { if (sra != null && sra.isOpen()) { - sra.associateThreadWithTask(threadId, taskId); + ThreadStateRegistry.addThread(threadId, thread); + sra.startDedicatedTaskThread(threadId, taskId); } } } /** - * Associate the current thread with a given task id. - * @param taskId the task ID this thread is associated with. + * Indicate that the current thread is dedicated to a specific task. This thread can be part of + * a thread pool, but if this blocks it can never transitively block another active task. + * @param taskId the task ID this thread is working on. */ - public static void associateCurrentThreadWithTask(long taskId) { - associateThreadWithTask(getCurrentThreadId(), taskId); + public static void currentThreadIsDedicatedToTask(long taskId) { + startDedicatedTaskThread(getCurrentThreadId(), taskId, Thread.currentThread()); } /** - * Associate a thread with shuffle. - * @param threadId the thread ID to associate (not java thread id). + * A shuffle thread has started to work on some tasks. + * @param threadId the thread ID (not java thread id). + * @param thread the java thread + * @param taskIds the IDs of tasks that this is starting work on. */ - public static void associateThreadWithShuffle(long threadId) { + public static void shuffleThreadWorkingTasks(long threadId, Thread thread, long[] taskIds) { synchronized (Rmm.class) { if (sra != null && sra.isOpen()) { - sra.associateThreadWithShuffle(threadId); + ThreadStateRegistry.addThread(threadId, thread); + sra.poolThreadWorkingOnTasks(true, threadId, taskIds); } } } /** - * Associate the current thread with shuffle. + * The current thread is a shuffle thread and has started to work on some tasks. + * @param taskIds the IDs of the tasks that this is starting work on. */ - public static void associateCurrentThreadWithShuffle() { - associateThreadWithShuffle(getCurrentThreadId()); + public static void shuffleThreadWorkingOnTasks(long[] taskIds) { + shuffleThreadWorkingTasks(getCurrentThreadId(), Thread.currentThread(), taskIds); } + /** + * The current thread which is in a thread pool that could transitively block other tasks has + * started to work on a task. + * @param taskId the ID of the task that this is starting work on. + */ + public static void poolThreadWorkingOnTask(long taskId) { + long threadId = getCurrentThreadId(); + Thread thread = Thread.currentThread(); + long[] taskIds = new long[]{taskId}; + synchronized (Rmm.class) { + if (sra != null && sra.isOpen()) { + ThreadStateRegistry.addThread(threadId, thread); + sra.poolThreadWorkingOnTasks(false, threadId, taskIds); + } + } + } + + /** + * A thread in a thread pool that could transitively block other tasks has finished work + * on some tasks. + * @param threadId the thread ID (not java thread id). + * @param taskIds the IDs of the tasks that are done. + */ + public static void poolThreadFinishedForTasks(long threadId, long[] taskIds) { + synchronized (Rmm.class) { + if (sra != null && sra.isOpen()) { + sra.poolThreadFinishedForTasks(threadId, taskIds); + } + } + } + + /** + * A shuffle thread has finished work on some tasks. + * @param threadId the thread ID (not java thread id). + * @param taskIds the IDs of the tasks that are done. 
+ */ + private static void shuffleThreadFinishedForTasks(long threadId, long[] taskIds) { + poolThreadFinishedForTasks(threadId, taskIds); + } + + /** + * The current thread which is in a thread pool that could transitively block other tasks + * has finished work on some tasks. + * @param taskIds the IDs of the tasks that are done. + */ + public static void poolThreadFinishedForTasks(long[] taskIds) { + poolThreadFinishedForTasks(getCurrentThreadId(), taskIds); + } + /** + * The current shuffle thread has finished work on some tasks. + * @param taskIds the IDs of the tasks that are done. + */ + public static void shuffleThreadFinishedForTasks(long[] taskIds) { + shuffleThreadFinishedForTasks(getCurrentThreadId(), taskIds); + } + /** + * The current thread which is in a thread pool that could transitively block other tasks + * has finished work on a task. + * @param taskId the ID of the task that is done. + */ + public static void poolThreadFinishedForTask(long taskId) { + poolThreadFinishedForTasks(getCurrentThreadId(), new long[]{taskId}); + } + + /** + * Indicate that a retry block has started for a given thread. + * @param threadId the id of the thread, not the java ID. + */ public static void startRetryBlock(long threadId) { synchronized (Rmm.class) { if (sra != null && sra.isOpen()) { @@ -163,10 +238,17 @@ public static void startRetryBlock(long threadId) { } } + /** + * Indicate that the current thread is entering a retry block. + */ public static void currentThreadStartRetryBlock() { startRetryBlock(getCurrentThreadId()); } + /** + * Indicate that a retry block has ended for a given thread. + * @param threadId the id of the thread, not the java ID. + */ public static void endRetryBlock(long threadId) { synchronized (Rmm.class) { if (sra != null && sra.isOpen()) { @@ -175,28 +257,62 @@ public static void endRetryBlock(long threadId) { } } + /** + * Indicate that the current thread is exiting a retry block. + */ public static void currentThreadEndRetryBlock() { - startRetryBlock(getCurrentThreadId()); + endRetryBlock(getCurrentThreadId()); + } + + private static void checkAndBreakDeadlocks() { + synchronized (Rmm.class) { + if (sra != null && sra.isOpen()) { + sra.checkAndBreakDeadlocks(); + } + } } /** - * Remove the given thread ID from any association. + * Remove the given thread ID from being associated with a given task * @param threadId the ID of the thread that is no longer a part of a task or shuffle * (not java thread id). */ - public static void removeThreadAssociation(long threadId) { + public static void removeDedicatedThreadAssociation(long threadId, long taskId) { + synchronized (Rmm.class) { + if (sra != null && sra.isOpen()) { + sra.removeThreadAssociation(threadId, taskId); + } + } + } + + /** + * Remove the current thread from being associated with the given task. + */ + public static void removeCurrentDedicatedThreadAssociation(long taskId) { + removeDedicatedThreadAssociation(getCurrentThreadId(), taskId); + } + + /** + * Remove all task associations for a given thread. This is intended to be used as a part + * of tests when a thread is shutting down, or for a pool thread when it is fully done. + * Dedicated task thread typically are cleaned when the task itself completes. 
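+   * For example (an illustrative sketch, not a required calling sequence):
+   * <pre>{@code
+   * long poolThreadId = RmmSpark.getCurrentThreadId();
+   * // ... the pool thread works on tasks until it is shut down ...
+   * RmmSpark.removeAllThreadAssociation(poolThreadId);
+   * }</pre>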
+ * @param threadId the id of the thread to clean up + */ + public static void removeAllThreadAssociation(long threadId) { synchronized (Rmm.class) { if (sra != null && sra.isOpen()) { - sra.removeThreadAssociation(threadId); + sra.removeThreadAssociation(threadId, -1); } } } /** - * Remove any association the current thread has. + * Remove all task associations for the current thread. This is intended to be used as a part + * of tests when a thread is shutting down, or for a pool thread when it is fully done. + * Dedicated task thread typically are cleaned when the task itself completes. */ - public static void removeCurrentThreadAssociation() { - removeThreadAssociation(getCurrentThreadId()); + public static void removeAllCurrentThreadAssociation() { + removeAllThreadAssociation(getCurrentThreadId()); } /** @@ -213,51 +329,75 @@ public static void taskDone(long taskId) { } /** - * Indicate that the given thread could block on shuffle. - * @param threadId the id of the thread that could block (not java thread id). + * A dedicated task thread is about to submit work to a pool that could transitively block it. + * @param threadId the ID of the thread that is about to submit the work. + */ + public static void submittingToPool(long threadId) { + synchronized (Rmm.class) { + if (sra != null && sra.isOpen()) { + sra.submittingToPool(threadId); + } + } + } + + /** + * The current thread is about to submit work to a thread pool that might transitively block + * this thread. This thread must be a dedicated task thread. + */ + public static void submittingToPool() { + submittingToPool(getCurrentThreadId()); + } + + /** + * A dedicated task thread is about to wait on work done on a pool that could transitively + * block it. + * @param threadId the ID of the thread that is about to wait. */ - public static void threadCouldBlockOnShuffle(long threadId) { + public static void waitingOnPool(long threadId) { synchronized (Rmm.class) { if (sra != null && sra.isOpen()) { - sra.threadCouldBlockOnShuffle(threadId); + sra.waitingOnPool(threadId); } } } /** - * Indicate that the current thread could block on shuffle. + * The current thread is about to wait on work done on a thread pool that might transitively block + * this thread. This thread must be a dedicated task thread. */ - public static void threadCouldBlockOnShuffle() { - threadCouldBlockOnShuffle(getCurrentThreadId()); + public static void waitingOnPool() { + waitingOnPool(getCurrentThreadId()); } /** - * Indicate that the given thread can no longer block on shuffle. - * @param threadId the ID of the thread that o longer can block on shuffle (not java thread id). + * A dedicated task thread is done waiting on a pool, either for a result or after submitting + * something to the pool. + * @param threadId the ID of the thread that is done. */ - public static void threadDoneWithShuffle(long threadId) { + public static void doneWaitingOnPool(long threadId) { synchronized (Rmm.class) { if (sra != null && sra.isOpen()) { - sra.threadDoneWithShuffle(threadId); + sra.doneWaitingOnPool(threadId); } } } /** - * Indicate that the current thread can no longer block on shuffle. + * The current thread is done waiting on a pool either for a result or after submitting something + * to the pool. This thread must be a dedicated task thread. 
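+   * For example, the expected pattern on a dedicated task thread (only a sketch; {@code pool}
+   * and {@code work} are assumed to exist):
+   * <pre>{@code
+   * RmmSpark.submittingToPool();
+   * Future<?> f = pool.submit(work);
+   * RmmSpark.doneWaitingOnPool();
+   * // ... later, when the result is needed ...
+   * RmmSpark.waitingOnPool();
+   * f.get();
+   * RmmSpark.doneWaitingOnPool();
+   * }</pre>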
   */
-  public static void threadDoneWithShuffle() {
-    threadDoneWithShuffle(getCurrentThreadId());
+  public static void doneWaitingOnPool() {
+    doneWaitingOnPool(getCurrentThreadId());
   }
 
   /**
-   * This should be called as a part of handling any RetryOOM or SplitAndRetryOOM exception.
+   * This should be called as a part of handling any GpuRetryOOM or GpuSplitAndRetryOOM exception.
    * The order should be something like.
    * <ol>
    *  <li>Catch Exception</li>
    *  <li>Mark any GPU input as spillable, (should have already had contig split called on it)</li>
    *  <li>call blockUntilReady</li>
-   *  <li>split the input data if SplitAndRetryOOM</li>
+   *  <li>split the input data if GpuSplitAndRetryOOM</li>
    *  <li>retry processing with the data</li>
    * </ol>
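+   * For example, a sketch of the retry idiom ({@code processBatch} and {@code splitInput} are
+   * assumed placeholder methods, not part of this API):
+   * <pre>{@code
+   * while (true) {
+   *   try {
+   *     processBatch(input); // the input should already be spillable here
+   *     break;
+   *   } catch (GpuRetryOOM oom) {
+   *     RmmSpark.blockThreadUntilReady();
+   *   } catch (GpuSplitAndRetryOOM oom) {
+   *     RmmSpark.blockThreadUntilReady();
+   *     input = splitInput(input); // retry with smaller pieces
+   *   }
+   * }
+   * }</pre>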
* This should be a NOOP if the thread is not in a state where it would need to block. Note @@ -279,7 +419,8 @@ public static void blockThreadUntilReady() { } /** - * Force the thread with the given ID to throw a RetryOOM on their next allocation attempt. + * Force the thread with the given ID to throw a GpuRetryOOM or CpuRetryOOM on their next + * allocation attempt, depending on the type of allocation being done. * @param threadId the ID of the thread to throw the exception (not java thread id). */ public static void forceRetryOOM(long threadId) { @@ -287,9 +428,10 @@ public static void forceRetryOOM(long threadId) { } /** - * Force the thread with the given ID to throw a RetryOOM on their next allocation attempt. + * Force the thread with the given ID to throw a GpuRetryOOM or CpuRetryOOM on their next + * allocation attempt, depending on the type of allocation being done. * @param threadId the ID of the thread to throw the exception (not java thread id). - * @param numOOMs the number of times the RetryOOM should be thrown + * @param numOOMs the number of times the *RetryOOM should be thrown */ public static void forceRetryOOM(long threadId, int numOOMs) { synchronized (Rmm.class) { @@ -302,7 +444,8 @@ public static void forceRetryOOM(long threadId, int numOOMs) { } /** - * Force the thread with the given ID to throw a SplitAndRetryOOM on their next allocation attempt. + * Force the thread with the given ID to throw a GpuSplitAndRetryOOM of CpuSplitAndRetryOOM + * on their next allocation attempt, depending on the allocation being done. * @param threadId the ID of the thread to throw the exception (not java thread id). */ public static void forceSplitAndRetryOOM(long threadId) { @@ -310,9 +453,10 @@ public static void forceSplitAndRetryOOM(long threadId) { } /** - * Force the thread with the given ID to throw a SplitAndRetryOOM on their next allocation attempt. + * Force the thread with the given ID to throw a GpuSplitAndRetryOOM or CpuSplitAndRetryOOm + * on their next allocation attempt, depending on the allocation being done. * @param threadId the ID of the thread to throw the exception (not java thread id). - * @param numOOMs the number of times the SplitAndRetryOOM should be thrown + * @param numOOMs the number of times the *SplitAndRetryOOM should be thrown */ public static void forceSplitAndRetryOOM(long threadId, int numOOMs) { synchronized (Rmm.class) { @@ -423,4 +567,79 @@ public static long getAndResetComputeTimeLostToRetryNs(long taskId) { } } } + + /** + * Called before doing an allocation on the CPU. This could throw an injected exception to help + * with testing. + * @param amount the amount of memory being requested + * @param blocking is this for a blocking allocate or a non-blocking one. + * @return a boolean that indicates if the allocation is recursive. Note that recursive + * allocations on the CPU are only allowed with non-blocking allocations. This must be passed + * back into the post allocations calls. + */ + public static boolean preCpuAlloc(long amount, boolean blocking) { + SparkResourceAdaptor local; + synchronized (Rmm.class) { + local = sra; + } + if (local != null && local.isOpen()) { + return local.preCpuAlloc(amount, blocking); + } else { + return false; + } + } + + /** + * The allocation that was going to be done succeeded. + * @param ptr a pointer to the memory that was allocated. + * @param amount the amount of memory that was allocated. + * @param blocking is this for a blocking allocate or a non-blocking one. 
+ * @param wasRecursive the boolean that was returned from `preCpuAlloc`. + */ + public static void postCpuAllocSuccess(long ptr, long amount, boolean blocking, + boolean wasRecursive) { + SparkResourceAdaptor local; + synchronized (Rmm.class) { + local = sra; + } + if (local != null && local.isOpen()) { + local.postCpuAllocSuccess(ptr, amount, blocking, wasRecursive); + } + } + + /** + * The allocation failed, and spilling didn't save it. + * @param wasOom was the failure caused by an OOM or something else. + * @param blocking is this for a blocking allocate or a non-blocking one. + * @param wasRecursive the boolean that was returned from `preCpuAlloc`. + * @return true if the allocation should be retried else false if the state machine + * thinks that a retry would not help. + */ + public static boolean postCpuAllocFailed(boolean wasOom, boolean blocking, boolean wasRecursive) { + SparkResourceAdaptor local; + synchronized (Rmm.class) { + local = sra; + } + if (local != null && local.isOpen()) { + return local.postCpuAllocFailed(wasOom, blocking, wasRecursive); + } else { + return false; + } + } + + /** + * Some CPU memory was freed. + * @param ptr a pointer to the memory being deallocated. + * @param amount the amount that was made available. + */ + public static void cpuDeallocate(long ptr, long amount) { + SparkResourceAdaptor local; + synchronized (Rmm.class) { + local = sra; + } + if (local != null && local.isOpen()) { + local.cpuDeallocate(ptr, amount); + } + } + } diff --git a/src/main/java/com/nvidia/spark/rapids/jni/RmmSparkThreadState.java b/src/main/java/com/nvidia/spark/rapids/jni/RmmSparkThreadState.java index 8cd35f1a40..1a6a61b783 100644 --- a/src/main/java/com/nvidia/spark/rapids/jni/RmmSparkThreadState.java +++ b/src/main/java/com/nvidia/spark/rapids/jni/RmmSparkThreadState.java @@ -22,24 +22,16 @@ */ public enum RmmSparkThreadState { UNKNOWN(-1), // thread is not associated with anything... 
-  TASK_RUNNING(0), // task thread running normally
-  TASK_WAIT_ON_SHUFFLE(1), // task thread waiting on shuffle
-  TASK_BUFN_WAIT_ON_SHUFFLE(2), // task thread waiting on shuffle, but marked as BUFN
-  TASK_ALLOC(3), // task thread in the middle of doing an allocation
-  TASK_ALLOC_FREE(4), // task thread in the middle of doing an allocation and a free happened
-  TASK_BLOCKED(5), // task thread that is temporarily blocked
-  TASK_BUFN_THROW(6), // task thread that should throw an exception to roll back before blocking
-  TASK_BUFN_WAIT(7), // task thread that threw an exception to roll back and now should
+  THREAD_RUNNING(0), // task thread running normally
+  THREAD_ALLOC(1), // task thread in the middle of doing an allocation
+  THREAD_ALLOC_FREE(2), // task thread in the middle of doing an allocation and a free happened
+  THREAD_BLOCKED(3), // task thread that is temporarily blocked
+  THREAD_BUFN_THROW(4), // task thread that should throw an exception to roll back before blocking
+  THREAD_BUFN_WAIT(5), // task thread that threw an exception to roll back and now should
   // block the next time alloc is called
-  TASK_BUFN(8), // task thread that is blocked until higher priority tasks start to succeed
-  TASK_SPLIT_THROW(9), // task thread that should throw an exception to split input and retry
-  TASK_REMOVE_THROW(10), // task thread that is being removed and needs to throw an exception
-  SHUFFLE_RUNNING(11), // shuffle thread that is running normally
-  SHUFFLE_ALLOC(12), // shuffle thread that is in the middle of doing an alloc
-  SHUFFLE_ALLOC_FREE(13), // shuffle thread that is doing an alloc and a free happened.
-  SHUFFLE_BLOCKED(14), // shuffle thread that is temporarily blocked
-  SHUFFLE_THROW(15), // shuffle thread that needs to throw an OOM
-  SHUFFLE_REMOVE_THROW(16); // shuffle thread that is being removed and needs to throw an exception
+  THREAD_BUFN(6), // task thread that is blocked until higher priority tasks start to succeed
+  THREAD_SPLIT_THROW(7), // task thread that should throw an exception to split input and retry
+  THREAD_REMOVE_THROW(8); // task thread that is being removed and needs to throw an exception
 
   private final int nativeId;
 
diff --git a/src/main/java/com/nvidia/spark/rapids/jni/SparkResourceAdaptor.java b/src/main/java/com/nvidia/spark/rapids/jni/SparkResourceAdaptor.java
index 8d98729dfc..74f1946748 100644
--- a/src/main/java/com/nvidia/spark/rapids/jni/SparkResourceAdaptor.java
+++ b/src/main/java/com/nvidia/spark/rapids/jni/SparkResourceAdaptor.java
@@ -26,7 +26,15 @@ public class SparkResourceAdaptor
     NativeDepsLoader.loadNativeDeps();
   }
 
+  /**
+   * How often (in milliseconds) the SparkResourceAdaptor polls thread states as a watchdog to
+   * break up potential deadlocks.
+   */
+  private static final long pollingPeriod = Long.getLong(
+      "ai.rapids.cudf.spark.rmmWatchdogPollingPeriod", 100);
+
   private long handle = 0;
+  private Thread watchDog;
 
   /**
    * Create a new tracking resource adaptor.
@@ -46,6 +54,17 @@ public SparkResourceAdaptor(RmmEventHandlerResourceAdaptor wrapped,
       String logLoc) {
     super(wrapped);
+    watchDog = new Thread(() -> {
+      try {
+        while (handle > 0) {
+          checkAndBreakDeadlocks();
+          Thread.sleep(pollingPeriod);
+        }
+      } catch (InterruptedException e) {
+        // We are going to exit, so ignore the exception
+        Thread.currentThread().interrupt();
+      }
+    }, "SparkResourceAdaptor WatchDog");
     // Do a little normalization before setting up logging...
if ("stderr".equalsIgnoreCase(logLoc)) { logLoc = "stderr"; @@ -53,6 +72,8 @@ public SparkResourceAdaptor(RmmEventHandlerResourceAdaptor 0) { + poolThreadWorkingOnTasks(getHandle(), isForShuffle, threadId, taskIds); + } + } + + public void poolThreadFinishedForTasks(long threadId, long[] taskIds) { + if (taskIds.length > 0) { + poolThreadFinishedForTasks(getHandle(), threadId, taskIds); + } } /** * Remove the given thread ID from any association. * @param threadId the ID of the thread that is no longer a part of a task or shuffle (not java thread id). + * @param taskId the task that is being removed. If the task id is -1, then any/all tasks are removed. */ - public void removeThreadAssociation(long threadId) { - removeThreadAssociation(getHandle(), threadId); + public void removeThreadAssociation(long threadId, long taskId) { + removeThreadAssociation(getHandle(), threadId, taskId); } /** @@ -117,41 +158,50 @@ public void taskDone(long taskId) { } /** - * Indicate that the given thread could block on shuffle. - * @param threadId the id of the thread that could block (not java thread id). + * A dedicated task thread is going to submit work to a pool. + * @param threadId the ID of the thread that will submit the work. */ - public void threadCouldBlockOnShuffle(long threadId) { - threadCouldBlockOnShuffle(getHandle(), threadId); + public void submittingToPool(long threadId) { + submittingToPool(getHandle(), threadId); } /** - * Indicate that the given thread can no longer block on shuffle. - * @param threadId the ID of the thread that o longer can block on shuffle (not java thread id). + * A dedicated task thread is going to wait on work in a pool to complete. + * @param threadId the ID of the thread that will submit the work. */ - public void threadDoneWithShuffle(long threadId) { - threadDoneWithShuffle(getHandle(), threadId); + public void waitingOnPool(long threadId) { + waitingOnPool(getHandle(), threadId); } /** - * Force the thread with the given ID to throw a RetryOOM on their next allocation attempt. + * A dedicated task thread is done waiting on a pool. This could be because of submitting + * something to the pool or waiting on a result from the pool. + * @param threadId the ID of the thread that is done. + */ + public void doneWaitingOnPool(long threadId) { + doneWaitingOnPool(getHandle(), threadId); + } + + /** + * Force the thread with the given ID to throw a GpuRetryOOM on their next allocation attempt. * @param threadId the ID of the thread to throw the exception (not java thread id). - * @param numOOMs the number of times the RetryOOM should be thrown + * @param numOOMs the number of times the GpuRetryOOM should be thrown */ public void forceRetryOOM(long threadId, int numOOMs) { forceRetryOOM(getHandle(), threadId, numOOMs); } /** - * Force the thread with the given ID to throw a SplitAndRetryOOM on their next allocation attempt. + * Force the thread with the given ID to throw a GpuSplitAndRetryOOM on their next allocation attempt. * @param threadId the ID of the thread to throw the exception (not java thread id). - * @param numOOMs the number of times the SplitAndRetryOOM should be thrown + * @param numOOMs the number of times the GpuSplitAndRetryOOM should be thrown */ public void forceSplitAndRetryOOM(long threadId, int numOOMs) { forceSplitAndRetryOOM(getHandle(), threadId, numOOMs); } /** - * Force the thread with the given ID to throw a SplitAndRetryOOM on their next allocation attempt. 
+ * Force the thread with the given ID to throw a GpuSplitAndRetryOOM on their next allocation attempt. * @param threadId the ID of the thread to throw the exception (not java thread id). * @param numTimes the number of times the CudfException should be thrown */ @@ -186,6 +236,49 @@ public long getAndResetComputeTimeLostToRetry(long taskId) { return getAndResetComputeTimeLostToRetry(getHandle(), taskId); } + + /** + * Called before doing an allocation on the CPU. This could throw an injected exception to help + * with testing. + * @param amount the amount of memory being requested + * @param blocking is this for a blocking allocate or a non-blocking one. + */ + public boolean preCpuAlloc(long amount, boolean blocking) { + return preCpuAlloc(getHandle(), amount, blocking); + } + + /** + * The allocation that was going to be done succeeded. + * @param ptr a pointer to the memory that was allocated. + * @param amount the amount of memory that was allocated. + * @param blocking is this for a blocking allocate or a non-blocking one. + * @param wasRecursive the result of calling preCpuAlloc. + */ + public void postCpuAllocSuccess(long ptr, long amount, boolean blocking, boolean wasRecursive) { + postCpuAllocSuccess(getHandle(), ptr, amount, blocking, wasRecursive); + } + + /** + * The allocation failed, and spilling didn't save it. + * @param wasOom was the failure caused by an OOM or something else. + * @param blocking is this for a blocking allocate or a non-blocking one. + * @param wasRecursive the result of calling preCpuAlloc + * @return true if the allocation should be retried else false if the state machine + * thinks that a retry would not help. + */ + public boolean postCpuAllocFailed(boolean wasOom, boolean blocking, boolean wasRecursive) { + return postCpuAllocFailed(getHandle(), wasOom, blocking, wasRecursive); + } + + /** + * Some CPU memory was freed. + * @param ptr a pointer to the memory being deallocated. + * @param amount the amount that was made available. + */ + public void cpuDeallocate(long ptr, long amount) { + cpuDeallocate(getHandle(), ptr, amount); + } + /** * Get the ID of the current thread that can be used with the other SparkResourceAdaptor APIs. * Don't use the java thread ID. They are not related. 
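+   * For example:
+   * <pre>{@code
+   * long nativeThreadId = RmmSpark.getCurrentThreadId(); // not Thread.currentThread().getId()
+   * }</pre>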
@@ -194,12 +287,14 @@ public long getAndResetComputeTimeLostToRetry(long taskId) { private native static long createNewAdaptor(long wrappedHandle, String logLoc); private native static void releaseAdaptor(long handle); - private static native void associateThreadWithTask(long handle, long threadId, long taskId); - private static native void associateThreadWithShuffle(long handle, long threadId); - private static native void removeThreadAssociation(long handle, long threadId); + private static native void startDedicatedTaskThread(long handle, long threadId, long taskId); + private static native void poolThreadWorkingOnTasks(long handle, boolean isForShuffle, long threadId, long[] taskIds); + private static native void poolThreadFinishedForTasks(long handle, long threadId, long[] taskIds); + private static native void removeThreadAssociation(long handle, long threadId, long taskId); private static native void taskDone(long handle, long taskId); - private static native void threadCouldBlockOnShuffle(long handle, long threadId); - private static native void threadDoneWithShuffle(long handle, long threadId); + private static native void submittingToPool(long handle, long threadId); + private static native void waitingOnPool(long handle, long threadId); + private static native void doneWaitingOnPool(long handle, long threadId); private static native void forceRetryOOM(long handle, long threadId, int numOOMs); private static native void forceSplitAndRetryOOM(long handle, long threadId, int numOOMs); private static native void forceCudfException(long handle, long threadId, int numTimes); @@ -211,4 +306,11 @@ public long getAndResetComputeTimeLostToRetry(long taskId) { private static native long getAndResetComputeTimeLostToRetry(long handle, long taskId); private static native void startRetryBlock(long handle, long threadId); private static native void endRetryBlock(long handle, long threadId); + private static native void checkAndBreakDeadlocks(long handle); + private static native boolean preCpuAlloc(long handle, long amount, boolean blocking); + private static native void postCpuAllocSuccess(long handle, long ptr, long amount, + boolean blocking, boolean wasRecursive); + private static native boolean postCpuAllocFailed(long handle, boolean wasOom, + boolean blocking, boolean wasRecursive); + private static native void cpuDeallocate(long handle, long ptr, long amount); } diff --git a/src/main/java/com/nvidia/spark/rapids/jni/ThreadStateRegistry.java b/src/main/java/com/nvidia/spark/rapids/jni/ThreadStateRegistry.java new file mode 100644 index 0000000000..4e7021e6ea --- /dev/null +++ b/src/main/java/com/nvidia/spark/rapids/jni/ThreadStateRegistry.java @@ -0,0 +1,67 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package com.nvidia.spark.rapids.jni; + +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.util.HashMap; +import java.util.HashSet; + +/** + * This is used to allow us to map a native thread id to a java thread so we can look at the + * state from a java perspective. + */ +class ThreadStateRegistry { + private static final Logger LOG = LoggerFactory.getLogger(ThreadStateRegistry.class); + + private static final HashMap knownThreads = new HashMap<>(); + + public static synchronized void addThread(long nativeId, Thread t) { + knownThreads.put(nativeId, t); + } + + // Typically called from JNI + public static synchronized void removeThread(long threadId) { + knownThreads.remove(threadId); + } + + // This is likely called from JNI + public static synchronized boolean isThreadBlocked(long nativeId) { + Thread t = knownThreads.get(nativeId); + if (t == null || !t.isAlive()) { + // Dead is as good as blocked. This is mostly for tests, not so much for + // production + return true; + } + Thread.State state = t.getState(); + switch (state) { + case BLOCKED: + // fall through + case WAITING: + // fall through + case TIMED_WAITING: + return true; + case TERMINATED: + // Technically there is a race with `!t.isAlive` check above, and dead is as good as + // blocked. + return true; + default: + return false; + } + } +} diff --git a/src/test/java/com/nvidia/spark/rapids/jni/LimitingOffHeapAllocForTests.java b/src/test/java/com/nvidia/spark/rapids/jni/LimitingOffHeapAllocForTests.java new file mode 100644 index 0000000000..eb32667dc7 --- /dev/null +++ b/src/test/java/com/nvidia/spark/rapids/jni/LimitingOffHeapAllocForTests.java @@ -0,0 +1,90 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.nvidia.spark.rapids.jni; + +import ai.rapids.cudf.HostMemoryBuffer; + +import java.util.Optional; + +/** + * This provides a way to allocate and deallocate off heap buffers using the RmmSpark APIs for + * retry on allocations. 
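+ * For example (illustrative only):
+ * <pre>{@code
+ * LimitingOffHeapAllocForTests.setLimit(1024 * 1024);
+ * try (HostMemoryBuffer buff = LimitingOffHeapAllocForTests.alloc(4 * 1024)) {
+ *   // use the buffer; the thread may block here, or see an injected CpuRetryOOM
+ * }
+ * }</pre>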
+ */ +public class LimitingOffHeapAllocForTests { + private static long limit; + private static long amountAllocated = 0; + public static synchronized void setLimit(long limit) { + LimitingOffHeapAllocForTests.limit = limit; + if (amountAllocated > 0) { + throw new IllegalStateException("PREVIOUS TEST LEAKED MEMORY!!!"); + } + } + + private static Optional allocInternal(long amount, boolean blocking) { + Optional ret = Optional.empty(); + boolean wasOom = true; + boolean isRecursive = RmmSpark.preCpuAlloc(amount, blocking); + try { + synchronized (LimitingOffHeapAllocForTests.class) { + if (amountAllocated + amount <= limit) { + amountAllocated += amount; + wasOom = false; + HostMemoryBuffer buff = HostMemoryBuffer.allocate(amount); + final long ptr = buff.getAddress(); + buff.setEventHandler(refCount -> { + if (refCount == 0) { + synchronized (LimitingOffHeapAllocForTests.class) { + amountAllocated -= amount; + } + RmmSpark.cpuDeallocate(ptr, amount); + } + }); + ret = Optional.of(buff); + } + } + } finally { + if (ret.isPresent()) { + RmmSpark.postCpuAllocSuccess(ret.get().getAddress(), amount, blocking, isRecursive); + } else { + RmmSpark.postCpuAllocFailed(wasOom, blocking, isRecursive); + } + } + return ret; + } + + /** + * Do a non-blocking allocation + * @param amount the amount to allocate + * @return the allocated buffer or not. + */ + public static Optional tryAlloc(long amount) { + return allocInternal(amount, false); + } + + /** + * Do a blocking allocation + * @param amount the amount to allocate + * @return the allocated buffer + */ + public static HostMemoryBuffer alloc(long amount) { + Optional ret = Optional.empty(); + while (!ret.isPresent()) { + ret = allocInternal(amount, true); + } + return ret.get(); + } +} \ No newline at end of file diff --git a/src/test/java/com/nvidia/spark/rapids/jni/RmmSparkMonteCarlo.java b/src/test/java/com/nvidia/spark/rapids/jni/RmmSparkMonteCarlo.java index e7d4c2a4da..1d1626935e 100644 --- a/src/test/java/com/nvidia/spark/rapids/jni/RmmSparkMonteCarlo.java +++ b/src/test/java/com/nvidia/spark/rapids/jni/RmmSparkMonteCarlo.java @@ -329,6 +329,7 @@ public void run() { if (runner.debugOoms) { System.err.println("OOM for task: " + t.taskId + " and thread: " + RmmSpark.getCurrentThreadId() + " " + oom); + oom.printStackTrace(System.err); } // ignored } @@ -360,7 +361,7 @@ public void run() { } } } catch (Throwable e) { - System.err.println("ERROR: " + e); + System.err.println("ERROR: TID: " + RmmSpark.getCurrentThreadId() + " " + e); e.printStackTrace(System.err); hadOtherFailures = true; } @@ -373,18 +374,10 @@ public boolean hadOtherFailures() { static class ShuffleThreadFactory implements ThreadFactory { static final AtomicLong idGen = new AtomicLong(0); - long id = idGen.getAndIncrement(); @Override public Thread newThread(Runnable runnable) { - Runnable wrapped = () -> { - RmmSpark.associateCurrentThreadWithShuffle(); - try { - runnable.run(); - } finally { - RmmSpark.removeCurrentThreadAssociation(); - } - }; - Thread t = new Thread(wrapped); + long id = idGen.getAndIncrement(); + Thread t = new Thread(runnable); t.setDaemon(true); t.setName("SHUFFLE-THREAD-" + id); return t; @@ -543,6 +536,37 @@ public synchronized void setSitFailed() { } interface MemoryOp { + default void doIt(DeviceMemoryBuffer[] buffers, long taskId) { + long threadId = RmmSpark.getCurrentThreadId(); + RmmSpark.shuffleThreadWorkingOnTasks(new long[]{taskId}); + RmmSpark.startRetryBlock(threadId); + try { + int tries = 0; + while (tries < 100 && tries >= 0) { + try { 
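+          // After the first failed attempt, wait until this thread is allowed to retry.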
+ if (tries > 0) { + RmmSpark.blockThreadUntilReady(); + } + tries++; + doIt(buffers); + tries = -1; + } catch (GpuRetryOOM oom) { + // Don't need to clear the buffers, because there is only one buffer. + numRetry.incrementAndGet(); + } catch (CpuRetryOOM oom) { + // Don't need to clear the buffers, because there is only one buffer. + numRetry.incrementAndGet(); + } + } + if (tries >= 100) { + throw new OutOfMemoryError("Could not make shuffle work after " + tries + " tries"); + } + } finally { + RmmSpark.endRetryBlock(threadId); + RmmSpark.poolThreadFinishedForTask(taskId); + } + } + void doIt(DeviceMemoryBuffer[] buffers); MemoryOp[] split(); @@ -748,7 +772,7 @@ private void cleanBuffers() { } } - public void run(ExecutorService shuffle) { + public void run(ExecutorService shuffle, long taskId) { buffers = new DeviceMemoryBuffer[numBuffers]; allocatedBeforeError = 0; boolean isForShuffle = shuffle != null; @@ -757,28 +781,31 @@ public void run(ExecutorService shuffle) { try { for (MemoryOp op: operations) { if (isForShuffle) { - // If shuffle is enabled the first allocation will happen on the shuffle thread... - RmmSpark.threadCouldBlockOnShuffle(); try { - Future f = shuffle.submit(() -> op.doIt(buffers)); + RmmSpark.submittingToPool(); + Future f = shuffle.submit(() -> op.doIt(buffers, taskId)); + RmmSpark.doneWaitingOnPool(); + RmmSpark.waitingOnPool(); f.get(1000, TimeUnit.SECONDS); } finally { isForShuffle = false; - RmmSpark.threadDoneWithShuffle(); + RmmSpark.doneWaitingOnPool(); } } else { op.doIt(buffers); } } done = true; - } catch (RetryOOM room) { + } catch (GpuRetryOOM room) { + numRetry.incrementAndGet(); + cleanBuffers(); + RmmSpark.blockThreadUntilReady(); + } catch (CpuRetryOOM room) { numRetry.incrementAndGet(); cleanBuffers(); RmmSpark.blockThreadUntilReady(); } catch (ExecutionException ee) { - // We are not able to do split and retry/etc from a shuffle - // so just bubble the exception on up - OutOfMemoryError oom = new OutOfMemoryError(""); + OutOfMemoryError oom = new OutOfMemoryError("Came From Shuffle"); oom.addSuppressed(ee); throw oom; } catch (InterruptedException | TimeoutException e) { @@ -844,14 +871,19 @@ public long getTimeLost() { public void run(ExecutorService shuffle) { Thread.currentThread().setName("TASK RUNNER FOR " + taskId); - RmmSpark.associateCurrentThreadWithTask(taskId); + RmmSpark.currentThreadIsDedicatedToTask(taskId); try { RmmSpark.currentThreadStartRetryBlock(); while (!toDo.isEmpty()) { TaskOpSet tos = toDo.pollFirst(); try { - tos.run(shuffle); - } catch (SplitAndRetryOOM soom) { + tos.run(shuffle, taskId); + } catch (GpuSplitAndRetryOOM soom) { + TaskOpSet[] split = tos.split(); + toDo.push(split[1]); + toDo.push(split[0]); + numSplitAndRetry.incrementAndGet(); + } catch (CpuSplitAndRetryOOM soom) { TaskOpSet[] split = tos.split(); toDo.push(split[1]); toDo.push(split[0]); diff --git a/src/test/java/com/nvidia/spark/rapids/jni/RmmSparkTest.java b/src/test/java/com/nvidia/spark/rapids/jni/RmmSparkTest.java index cd11da05ae..373deb9ca0 100644 --- a/src/test/java/com/nvidia/spark/rapids/jni/RmmSparkTest.java +++ b/src/test/java/com/nvidia/spark/rapids/jni/RmmSparkTest.java @@ -18,6 +18,8 @@ import ai.rapids.cudf.CudfException; import ai.rapids.cudf.DeviceMemoryBuffer; +import ai.rapids.cudf.HostMemoryBuffer; +import ai.rapids.cudf.MemoryBuffer; import ai.rapids.cudf.Rmm; import ai.rapids.cudf.RmmAllocationMode; import ai.rapids.cudf.RmmCudaMemoryResource; @@ -25,7 +27,6 @@ import ai.rapids.cudf.RmmEventHandler; import 
ai.rapids.cudf.RmmLimitingResourceAdaptor; import ai.rapids.cudf.RmmTrackingResourceAdaptor; -import ai.rapids.cudf.ColumnVector.EventHandler; import org.junit.jupiter.api.AfterEach; import org.junit.jupiter.api.BeforeEach; @@ -62,7 +63,7 @@ public interface TaskThreadOp { public static class TaskThread extends Thread { private final String name; - private final boolean isShuffle; + private final boolean isForPool; private long threadId = -1; private long taskId = 100; @@ -71,10 +72,10 @@ public TaskThread(String name, long taskId) { this.taskId = taskId; } - public TaskThread(String name, boolean isShuffle) { + public TaskThread(String name, boolean isForPool) { super(name); this.name = name; - this.isShuffle = isShuffle; + this.isForPool = isForPool; } public synchronized long getThreadId() { @@ -89,17 +90,15 @@ public void initialize() throws ExecutionException, InterruptedException, Timeou Future waitForStart = doIt(new TaskThreadOp() { @Override public Void doIt() { - if (isShuffle) { - RmmSpark.associateCurrentThreadWithShuffle(); - } else { - RmmSpark.associateCurrentThreadWithTask(taskId); + if (!isForPool) { + RmmSpark.currentThreadIsDedicatedToTask(taskId); } return null; } @Override public String toString() { - return "INIT TASK " + name + " " + (isShuffle ? "SHUFFLE" : ("TASK " + taskId)); + return "INIT TASK " + name + " " + (isForPool ? "POOL" : ("TASK " + taskId)); } }); System.err.println("WAITING FOR STARTUP (" + name + ")"); @@ -278,13 +277,16 @@ public void run() { } System.err.println("INSIDE THREAD RUNNING (" + name + ")"); while (true) { - TaskThreadOp op = queue.poll(1000, TimeUnit.MILLISECONDS); - System.err.println("GOT '" + op + "' ON " + name); - if (op instanceof TaskThreadDoneOp) { - return; - } - // null is returned from the queue on a timeout + // Because of how our deadlock detection code works we don't want to + // block this thread, so we do this in a busy loop. 
It is not ideal, + // but works, and is more accurate to what the Spark is likely to do + TaskThreadOp op = queue.poll(); + // null is returned from the queue if it is empty if (op != null) { + System.err.println("GOT '" + op + "' ON " + name); + if (op instanceof TaskThreadDoneOp) { + return; + } op.doIt(); System.err.println("'" + op + "' FINISHED ON " + name); } @@ -293,7 +295,6 @@ public void run() { System.err.println("THROWABLE CAUGHT IN " + name); t.printStackTrace(System.err); } finally { - RmmSpark.removeCurrentThreadAssociation(); System.err.println("THREAD EXITING " + name); } } @@ -306,22 +307,23 @@ public void testBasicInitAndTeardown() { } @Test - public void testInsertOOMs() { + public void testInsertOOMsGpu() { Rmm.initialize(RmmAllocationMode.CUDA_DEFAULT, null, 512 * 1024 * 1024); RmmSpark.setEventHandler(new BaseRmmEventHandler(), "stderr"); long threadId = RmmSpark.getCurrentThreadId(); long taskid = 0; // This is arbitrary + Thread t = Thread.currentThread(); assertEquals(RmmSparkThreadState.UNKNOWN, RmmSpark.getStateOf(threadId)); assertEquals(0, RmmSpark.getAndResetNumRetryThrow(taskid)); assertEquals(0, RmmSpark.getAndResetNumSplitRetryThrow(taskid)); assertEquals(0, RmmSpark.getAndResetComputeTimeLostToRetryNs(taskid)); - RmmSpark.associateThreadWithTask(threadId, taskid); - assertEquals(RmmSparkThreadState.TASK_RUNNING, RmmSpark.getStateOf(threadId)); + RmmSpark.startDedicatedTaskThread(threadId, taskid, t); + assertEquals(RmmSparkThreadState.THREAD_RUNNING, RmmSpark.getStateOf(threadId)); try { RmmSpark.startRetryBlock(threadId); // Allocate something small and verify that it works... Rmm.alloc(100).close(); - assertEquals(RmmSparkThreadState.TASK_RUNNING, RmmSpark.getStateOf(threadId)); + assertEquals(RmmSparkThreadState.THREAD_RUNNING, RmmSpark.getStateOf(threadId)); try { Thread.sleep(1); // Just in case we run on a really fast system in the future where @@ -332,36 +334,101 @@ public void testInsertOOMs() { // Force an exception RmmSpark.forceRetryOOM(threadId); // No change in the state after a force - assertEquals(RmmSparkThreadState.TASK_RUNNING, RmmSpark.getStateOf(threadId)); - assertThrows(RetryOOM.class, () -> Rmm.alloc(100).close()); + assertEquals(RmmSparkThreadState.THREAD_RUNNING, RmmSpark.getStateOf(threadId)); + assertThrows(GpuRetryOOM.class, () -> Rmm.alloc(100).close()); assert(RmmSpark.getAndResetComputeTimeLostToRetryNs(taskid) > 0); // Verify that injecting OOM does not cause the block to actually happen or // the state to change - assertEquals(RmmSparkThreadState.TASK_RUNNING, RmmSpark.getStateOf(threadId)); + assertEquals(RmmSparkThreadState.THREAD_RUNNING, RmmSpark.getStateOf(threadId)); assertEquals(1, RmmSpark.getAndResetNumRetryThrow(taskid)); assertEquals(0, RmmSpark.getAndResetNumSplitRetryThrow(taskid)); RmmSpark.blockThreadUntilReady(); // Allocate something small and verify that it works... 
Rmm.alloc(100).close(); - assertEquals(RmmSparkThreadState.TASK_RUNNING, RmmSpark.getStateOf(threadId)); + assertEquals(RmmSparkThreadState.THREAD_RUNNING, RmmSpark.getStateOf(threadId)); // Force another exception RmmSpark.forceSplitAndRetryOOM(threadId); // No change in state after force - assertEquals(RmmSparkThreadState.TASK_RUNNING, RmmSpark.getStateOf(threadId)); - assertThrows(SplitAndRetryOOM.class, () -> Rmm.alloc(100).close()); + assertEquals(RmmSparkThreadState.THREAD_RUNNING, RmmSpark.getStateOf(threadId)); + assertThrows(GpuSplitAndRetryOOM.class, () -> Rmm.alloc(100).close()); assertEquals(0, RmmSpark.getAndResetNumRetryThrow(taskid)); assertEquals(1, RmmSpark.getAndResetNumSplitRetryThrow(taskid)); // Verify that injecting OOM does not cause the block to actually happen - assertEquals(RmmSparkThreadState.TASK_RUNNING, RmmSpark.getStateOf(threadId)); + assertEquals(RmmSparkThreadState.THREAD_RUNNING, RmmSpark.getStateOf(threadId)); RmmSpark.blockThreadUntilReady(); // Allocate something small and verify that it works... Rmm.alloc(100).close(); - assertEquals(RmmSparkThreadState.TASK_RUNNING, RmmSpark.getStateOf(threadId)); + assertEquals(RmmSparkThreadState.THREAD_RUNNING, RmmSpark.getStateOf(threadId)); + } finally { + RmmSpark.taskDone(taskid); + } + assertEquals(RmmSparkThreadState.UNKNOWN, RmmSpark.getStateOf(threadId)); + } + + @Test + public void testInsertOOMsCpu() { + Rmm.initialize(RmmAllocationMode.CUDA_DEFAULT, null, 512 * 1024 * 1024); + RmmSpark.setEventHandler(new BaseRmmEventHandler(), "stderr"); + LimitingOffHeapAllocForTests.setLimit(512 * 1024 * 1024); + long threadId = RmmSpark.getCurrentThreadId(); + long taskid = 0; // This is arbitrary + Thread t = Thread.currentThread(); + assertEquals(RmmSparkThreadState.UNKNOWN, RmmSpark.getStateOf(threadId)); + assertEquals(0, RmmSpark.getAndResetNumRetryThrow(taskid)); + assertEquals(0, RmmSpark.getAndResetNumSplitRetryThrow(taskid)); + assertEquals(0, RmmSpark.getAndResetComputeTimeLostToRetryNs(taskid)); + RmmSpark.startDedicatedTaskThread(threadId, taskid, t); + assertEquals(RmmSparkThreadState.THREAD_RUNNING, RmmSpark.getStateOf(threadId)); + try { + RmmSpark.startRetryBlock(threadId); + // Allocate something small and verify that it works... + LimitingOffHeapAllocForTests.alloc(100).close(); + assertEquals(RmmSparkThreadState.THREAD_RUNNING, RmmSpark.getStateOf(threadId)); + + try { + Thread.sleep(1); // Just in case we run on a really fast system in the future where + // all of this is sub-nanosecond... + } catch (InterruptedException e) { + // Ignored + } + // Force an exception + RmmSpark.forceRetryOOM(threadId); + // No change in the state after a force + assertEquals(RmmSparkThreadState.THREAD_RUNNING, RmmSpark.getStateOf(threadId)); + assertThrows(CpuRetryOOM.class, () -> LimitingOffHeapAllocForTests.alloc(100).close()); + assert(RmmSpark.getAndResetComputeTimeLostToRetryNs(taskid) > 0); + + // Verify that injecting OOM does not cause the block to actually happen or + // the state to change + assertEquals(RmmSparkThreadState.THREAD_RUNNING, RmmSpark.getStateOf(threadId)); + assertEquals(1, RmmSpark.getAndResetNumRetryThrow(taskid)); + assertEquals(0, RmmSpark.getAndResetNumSplitRetryThrow(taskid)); + RmmSpark.blockThreadUntilReady(); + + // Allocate something small and verify that it works... 
+ LimitingOffHeapAllocForTests.alloc(100).close(); + assertEquals(RmmSparkThreadState.THREAD_RUNNING, RmmSpark.getStateOf(threadId)); + + // Force another exception + RmmSpark.forceSplitAndRetryOOM(threadId); + // No change in state after force + assertEquals(RmmSparkThreadState.THREAD_RUNNING, RmmSpark.getStateOf(threadId)); + assertThrows(CpuSplitAndRetryOOM.class, () -> LimitingOffHeapAllocForTests.alloc(100).close()); + assertEquals(0, RmmSpark.getAndResetNumRetryThrow(taskid)); + assertEquals(1, RmmSpark.getAndResetNumSplitRetryThrow(taskid)); + + // Verify that injecting OOM does not cause the block to actually happen + assertEquals(RmmSparkThreadState.THREAD_RUNNING, RmmSpark.getStateOf(threadId)); + RmmSpark.blockThreadUntilReady(); + + // Allocate something small and verify that it works... + LimitingOffHeapAllocForTests.alloc(100).close(); + assertEquals(RmmSparkThreadState.THREAD_RUNNING, RmmSpark.getStateOf(threadId)); } finally { RmmSpark.taskDone(taskid); } @@ -374,16 +441,18 @@ public void testReentrantAssociateThread() { RmmSpark.setEventHandler(new BaseRmmEventHandler(), "stderr"); long threadId = 100; long taskId = 1; + long[] taskIds = new long[] {taskId}; + Thread t = Thread.currentThread(); try { - RmmSpark.associateThreadWithTask(threadId, taskId); - RmmSpark.associateThreadWithTask(threadId, taskId); - RmmSpark.removeThreadAssociation(threadId); + RmmSpark.startDedicatedTaskThread(threadId, taskId, t); + RmmSpark.startDedicatedTaskThread(threadId, taskId, t); + RmmSpark.removeDedicatedThreadAssociation(threadId, taskId); // Not removing twice because we don't have to match up the counts so it fits with how // the GPU semaphore is used. - RmmSpark.associateThreadWithShuffle(threadId); - RmmSpark.associateThreadWithShuffle(threadId); - RmmSpark.removeThreadAssociation(threadId); - RmmSpark.removeThreadAssociation(threadId); + RmmSpark.shuffleThreadWorkingTasks(threadId, t, taskIds); + RmmSpark.shuffleThreadWorkingTasks(threadId, t, taskIds); + RmmSpark.removeDedicatedThreadAssociation(threadId, taskId); + RmmSpark.removeDedicatedThreadAssociation(threadId, taskId); } finally { RmmSpark.taskDone(taskId); } @@ -397,17 +466,21 @@ public void testAssociateThread() { long threadIdTwo = 300; long taskId = 2; long otherTaskId = 3; + long[] taskIds = new long[] {taskId, otherTaskId}; + Thread t = Thread.currentThread(); try { - RmmSpark.associateThreadWithTask(threadIdOne, taskId); - assertThrows(CudfException.class, () -> RmmSpark.associateThreadWithShuffle(threadIdOne)); - assertThrows(CudfException.class, () -> RmmSpark.associateThreadWithTask(threadIdOne, otherTaskId)); + RmmSpark.startDedicatedTaskThread(threadIdOne, taskId, t); + assertThrows(CudfException.class, () -> RmmSpark.shuffleThreadWorkingTasks(threadIdOne, t, taskIds)); + // There can be races when a thread goes from one task to another, so we just make it safe to do. 
+ RmmSpark.startDedicatedTaskThread(threadIdOne, otherTaskId, t); - RmmSpark.associateThreadWithShuffle(threadIdTwo); - assertThrows(CudfException.class, () -> RmmSpark.associateThreadWithTask(threadIdTwo, otherTaskId)); + RmmSpark.shuffleThreadWorkingTasks(threadIdTwo, t, taskIds); + assertThrows(CudfException.class, () -> RmmSpark.startDedicatedTaskThread(threadIdTwo, otherTaskId, t)); // Remove the association - RmmSpark.removeThreadAssociation(threadIdTwo); + RmmSpark.removeDedicatedThreadAssociation(threadIdTwo, taskId); + RmmSpark.removeDedicatedThreadAssociation(threadIdTwo, otherTaskId); // Add in a new association - RmmSpark.associateThreadWithTask(threadIdTwo, taskId); + RmmSpark.startDedicatedTaskThread(threadIdTwo, taskId, t); } finally { RmmSpark.taskDone(taskId); RmmSpark.taskDone(otherTaskId); @@ -415,19 +488,40 @@ public void testAssociateThread() { } - static class AllocOnAnotherThread implements AutoCloseable { + static abstract class AllocOnAnotherThread implements AutoCloseable { final TaskThread thread; final long size; - DeviceMemoryBuffer b = null; + final long taskId; + MemoryBuffer b = null; Future fb; Future fc = null; public AllocOnAnotherThread(TaskThread thread, long size) { this.thread = thread; this.size = size; + this.taskId = -1; + fb = thread.doIt(new TaskThreadOp() { + @Override + public Void doIt() { + doAlloc(); + return null; + } + + @Override + public String toString() { + return "ALLOC(" + size + ")"; + } + }); + } + + public AllocOnAnotherThread(TaskThread thread, long size, long taskId) { + this.thread = thread; + this.size = size; + this.taskId = taskId; fb = thread.doIt(new TaskThreadOp() { @Override public Void doIt() { + RmmSpark.shuffleThreadWorkingOnTasks(new long[]{taskId}); doAlloc(); return null; } @@ -443,7 +537,7 @@ public void waitForAlloc() throws ExecutionException, InterruptedException, Time fb.get(1000, TimeUnit.MILLISECONDS); } - public void freeOnThread() throws ExecutionException, InterruptedException, TimeoutException { + public void freeOnThread() { if (fc != null) { throw new IllegalStateException("free called multiple times"); } @@ -473,20 +567,60 @@ public void freeAndWait() throws ExecutionException, InterruptedException, Timeo waitForFree(); } - private Void doAlloc() { + abstract protected Void doAlloc(); + + @Override + public synchronized void close() { + if (b != null) { + try { + b.close(); + b = null; + } finally { + if (this.taskId > 0) { + RmmSpark.poolThreadFinishedForTasks(thread.threadId, new long[]{taskId}); + } + } + } + } + } + + public static class GpuAllocOnAnotherThread extends AllocOnAnotherThread { + + public GpuAllocOnAnotherThread(TaskThread thread, long size) { + super(thread, size); + } + + public GpuAllocOnAnotherThread(TaskThread thread, long size, long taskId) { + super(thread, size, taskId); + } + + @Override + protected Void doAlloc() { DeviceMemoryBuffer tmp = Rmm.alloc(size); synchronized (this) { b = tmp; } return null; } + } + + public static class CpuAllocOnAnotherThread extends AllocOnAnotherThread { + + public CpuAllocOnAnotherThread(TaskThread thread, long size) { + super(thread, size); + } + + public CpuAllocOnAnotherThread(TaskThread thread, long size, long taskId) { + super(thread, size, taskId); + } @Override - public synchronized void close() { - if (b != null) { - b.close(); - b = null; + protected Void doAlloc() { + HostMemoryBuffer tmp = LimitingOffHeapAllocForTests.alloc(size); + synchronized (this) { + b = tmp; } + return null; } } @@ -513,6 +647,51 @@ void 
setupRmmForTestingWithLimits(long maxAllocSize, RmmEventHandler eventHandle RmmSpark.setEventHandler(eventHandler, "stderr"); } + @Test + public void testNonBlockingCpuAlloc() { + // We are not going to use the GPU here, but this sets it all up for us. + setupRmmForTestingWithLimits(10 * 1024 * 1024); + // We are just going to pretend that we are doing an allocations + long taskId = 0; + long threadId = RmmSpark.getCurrentThreadId(); + RmmSpark.currentThreadIsDedicatedToTask(taskId); + assertEquals(RmmSparkThreadState.THREAD_RUNNING, RmmSpark.getStateOf(threadId)); + try { + boolean wasRecursive = RmmSpark.preCpuAlloc(100, false); + assertEquals(RmmSparkThreadState.THREAD_ALLOC, RmmSpark.getStateOf(threadId)); + long address; + try (HostMemoryBuffer buffer = HostMemoryBuffer.allocate(100)) { + address = buffer.getAddress(); + RmmSpark.postCpuAllocSuccess(address, 100, false, wasRecursive); + assertEquals(RmmSparkThreadState.THREAD_RUNNING, RmmSpark.getStateOf(threadId)); + } + RmmSpark.cpuDeallocate(address, 100); + assertEquals(RmmSparkThreadState.THREAD_RUNNING, RmmSpark.getStateOf(threadId)); + } finally { + RmmSpark.removeDedicatedThreadAssociation(threadId, taskId); + } + } + + @Test + public void testNonBlockingCpuAllocFailedOOM() { + // We are not going to use the GPU here, but this sets it all up for us. + setupRmmForTestingWithLimits(10 * 1024 * 1024); + // We are just going to pretend that we are doing an allocations + long taskId = 0; + long threadId = RmmSpark.getCurrentThreadId(); + RmmSpark.currentThreadIsDedicatedToTask(taskId); + assertEquals(RmmSparkThreadState.THREAD_RUNNING, RmmSpark.getStateOf(threadId)); + try { + boolean wasRecursive = RmmSpark.preCpuAlloc(100, false); + assertEquals(RmmSparkThreadState.THREAD_ALLOC, RmmSpark.getStateOf(threadId)); + // TODO put this on a background thread so we can time out if it blocks. + RmmSpark.postCpuAllocFailed(true, false, wasRecursive); + assertEquals(RmmSparkThreadState.THREAD_RUNNING, RmmSpark.getStateOf(threadId)); + } finally { + RmmSpark.removeDedicatedThreadAssociation(threadId, taskId); + } + } + @Test public void testBasicBlocking() throws ExecutionException, InterruptedException, TimeoutException { // 10 MiB @@ -523,16 +702,49 @@ public void testBasicBlocking() throws ExecutionException, InterruptedException, taskTwo.initialize(); try { long tOneId = taskOne.getThreadId(); - assertEquals(RmmSparkThreadState.TASK_RUNNING, RmmSpark.getStateOf(tOneId)); + assertEquals(RmmSparkThreadState.THREAD_RUNNING, RmmSpark.getStateOf(tOneId)); + + long tTwoId = taskTwo.getThreadId(); + assertEquals(RmmSparkThreadState.THREAD_RUNNING, RmmSpark.getStateOf(tTwoId)); + + try (AllocOnAnotherThread firstOne = new GpuAllocOnAnotherThread(taskOne, 5 * 1024 * 1024)) { + firstOne.waitForAlloc(); + // This one should block + try (AllocOnAnotherThread secondOne = new GpuAllocOnAnotherThread(taskTwo, 6 * 1024 * 1024)) { + taskTwo.pollForState(RmmSparkThreadState.THREAD_BLOCKED, 1000, TimeUnit.MILLISECONDS); + // Free the first allocation to wake up the second task... 
+ firstOne.freeAndWait(); + secondOne.waitForAlloc(); + secondOne.freeAndWait(); + } + } + } finally { + taskOne.done(); + taskTwo.done(); + } + } + + @Test + public void testBasicCpuBlocking() throws ExecutionException, InterruptedException, TimeoutException { + // 10 MiB + setupRmmForTestingWithLimits(10 * 1024 * 1024); + LimitingOffHeapAllocForTests.setLimit(10 * 1024 * 1024); + TaskThread taskOne = new TaskThread("TEST THREAD ONE", 1); + TaskThread taskTwo = new TaskThread("TEST THREAD TWO", 2); + taskOne.initialize(); + taskTwo.initialize(); + try { + long tOneId = taskOne.getThreadId(); + assertEquals(RmmSparkThreadState.THREAD_RUNNING, RmmSpark.getStateOf(tOneId)); long tTwoId = taskTwo.getThreadId(); - assertEquals(RmmSparkThreadState.TASK_RUNNING, RmmSpark.getStateOf(tTwoId)); + assertEquals(RmmSparkThreadState.THREAD_RUNNING, RmmSpark.getStateOf(tTwoId)); - try (AllocOnAnotherThread firstOne = new AllocOnAnotherThread(taskOne, 5 * 1024 * 1024)) { + try (AllocOnAnotherThread firstOne = new CpuAllocOnAnotherThread(taskOne, 5 * 1024 * 1024)) { firstOne.waitForAlloc(); // This one should block - try (AllocOnAnotherThread secondOne = new AllocOnAnotherThread(taskTwo, 6 * 1024 * 1024)) { - taskTwo.pollForState(RmmSparkThreadState.TASK_BLOCKED, 1000, TimeUnit.MILLISECONDS); + try (AllocOnAnotherThread secondOne = new CpuAllocOnAnotherThread(taskTwo, 6 * 1024 * 1024)) { + taskTwo.pollForState(RmmSparkThreadState.THREAD_BLOCKED, 1000, TimeUnit.MILLISECONDS); // Free the first allocation to wake up the second task... firstOne.freeAndWait(); secondOne.waitForAlloc(); @@ -546,6 +758,68 @@ public void testBasicBlocking() throws ExecutionException, InterruptedException, } } + @Test + public void testBasicMixedBlocking() throws ExecutionException, InterruptedException, TimeoutException { + // 10 MiB + setupRmmForTestingWithLimits(10 * 1024 * 1024); + LimitingOffHeapAllocForTests.setLimit(10 * 1024 * 1024); + TaskThread taskOne = new TaskThread("TEST THREAD ONE", 1); + TaskThread taskTwo = new TaskThread("TEST THREAD TWO", 2); + TaskThread taskThree = new TaskThread("TEST THREAD THREE", 3); + TaskThread taskFour = new TaskThread("TEST THREAD FOUR", 4); + taskOne.initialize(); + taskTwo.initialize(); + taskThree.initialize(); + taskFour.initialize(); + try { + long tOneId = taskOne.getThreadId(); + assertEquals(RmmSparkThreadState.THREAD_RUNNING, RmmSpark.getStateOf(tOneId)); + + long tTwoId = taskTwo.getThreadId(); + assertEquals(RmmSparkThreadState.THREAD_RUNNING, RmmSpark.getStateOf(tTwoId)); + + long tThreeId = taskThree.getThreadId(); + assertEquals(RmmSparkThreadState.THREAD_RUNNING, RmmSpark.getStateOf(tThreeId)); + + long tFourId = taskFour.getThreadId(); + assertEquals(RmmSparkThreadState.THREAD_RUNNING, RmmSpark.getStateOf(tFourId)); + + try (AllocOnAnotherThread firstGpuAlloc = new GpuAllocOnAnotherThread(taskOne, 5 * 1024 * 1024)) { + firstGpuAlloc.waitForAlloc(); + + try (AllocOnAnotherThread firstCpuAlloc = new CpuAllocOnAnotherThread(taskTwo, 5 * 1024 * 1024)) { + firstCpuAlloc.waitForAlloc(); + + // Blocking GPU Alloc + try (AllocOnAnotherThread secondGpuAlloc = new GpuAllocOnAnotherThread(taskThree, 6 * 1024 * 1024)) { + taskThree.pollForState(RmmSparkThreadState.THREAD_BLOCKED, 1000, TimeUnit.MILLISECONDS); + + // Blocking CPU Alloc + try (AllocOnAnotherThread secondCpuAlloc = new CpuAllocOnAnotherThread(taskFour, 6 * 1024 * 1024)) { + taskFour.pollForState(RmmSparkThreadState.THREAD_BLOCKED, 1000, TimeUnit.MILLISECONDS); + + // We want to make sure that the order of wakeup 
corresponds to the location of the data that was released + // Not necessarily the priority of the task/thread. + firstCpuAlloc.freeAndWait(); + secondCpuAlloc.waitForAlloc(); + secondCpuAlloc.freeAndWait(); + } + + // Now do the GPU frees + firstGpuAlloc.freeAndWait(); + secondGpuAlloc.waitForAlloc(); + secondGpuAlloc.freeAndWait(); + } + } + } + } finally { + taskOne.done(); + taskTwo.done(); + taskThree.done(); + taskFour.done(); + } + } + @Test public void testShuffleBlocking() throws ExecutionException, InterruptedException, TimeoutException { // 10 MiB @@ -559,30 +833,112 @@ public void testShuffleBlocking() throws ExecutionException, InterruptedExceptio taskTwo.initialize(); try { long sOneId = shuffleOne.getThreadId(); - assertEquals(RmmSparkThreadState.SHUFFLE_RUNNING, RmmSpark.getStateOf(sOneId)); + // It is not in a running state until it has something to do. long tOneId = taskOne.getThreadId(); - assertEquals(RmmSparkThreadState.TASK_RUNNING, RmmSpark.getStateOf(tOneId)); + assertEquals(RmmSparkThreadState.THREAD_RUNNING, RmmSpark.getStateOf(tOneId)); long tTwoId = taskTwo.getThreadId(); - assertEquals(RmmSparkThreadState.TASK_RUNNING, RmmSpark.getStateOf(tTwoId)); + assertEquals(RmmSparkThreadState.THREAD_RUNNING, RmmSpark.getStateOf(tTwoId)); - try (AllocOnAnotherThread firstOne = new AllocOnAnotherThread(taskOne, 5 * 1024 * 1024)) { + try (AllocOnAnotherThread firstOne = new GpuAllocOnAnotherThread(taskOne, 5 * 1024 * 1024)) { firstOne.waitForAlloc(); // This one should block - try (AllocOnAnotherThread secondOne = new AllocOnAnotherThread(taskTwo, 6 * 1024 * 1024)) { - taskTwo.pollForState(RmmSparkThreadState.TASK_BLOCKED, 1000, TimeUnit.MILLISECONDS); + try (AllocOnAnotherThread secondOne = new GpuAllocOnAnotherThread(taskTwo, 6 * 1024 * 1024)) { + taskTwo.pollForState(RmmSparkThreadState.THREAD_BLOCKED, 1000, TimeUnit.MILLISECONDS); + // Make sure that shuffle has higher priority than tasks... + try (AllocOnAnotherThread thirdOne = new GpuAllocOnAnotherThread(shuffleOne, 6 * 1024 * 1024, 2)) { + shuffleOne.pollForState(RmmSparkThreadState.THREAD_BLOCKED, 1000, TimeUnit.MILLISECONDS); + // But taskOne is not blocked, so there will be no retry until it is blocked, or else + // it is making progress + taskOne.doIt((TaskThreadOp) () -> { + try { + Thread.sleep(200); + } catch (InterruptedException e) { + throw new RuntimeException(e); + } + return null; + }); + + try { + secondOne.waitForAlloc(); + fail("SHOULD HAVE THROWN..."); + } catch (ExecutionException ee) { + assert (ee.getCause() instanceof GpuRetryOOM); + } + secondOne.freeAndWait(); - // Make sure that shuffle has higher priority than do tasks... - try (AllocOnAnotherThread thirdOne = new AllocOnAnotherThread(shuffleOne, 6 * 1024 * 1024)) { - shuffleOne.pollForState(RmmSparkThreadState.SHUFFLE_BLOCKED, 1000, TimeUnit.MILLISECONDS); // Free the first allocation to wake up the shuffle thread, but not the second task yet... 
firstOne.freeAndWait(); + + thirdOne.waitForAlloc(); + thirdOne.freeAndWait(); + } + } + } + } finally { + shuffleOne.done(); + taskOne.done(); + taskTwo.done(); + } + } + + + @Test + public void testShuffleBlockingCpu() throws ExecutionException, InterruptedException, TimeoutException { + // 10 MiB + setupRmmForTestingWithLimits(10 * 1024 * 1024); + LimitingOffHeapAllocForTests.setLimit(10 * 1024 * 1024); + TaskThread shuffleOne = new TaskThread("TEST THREAD SHUFFLE", true); + TaskThread taskOne = new TaskThread("TEST THREAD ONE", 1); + TaskThread taskTwo = new TaskThread("TEST THREAD TWO", 2); + + shuffleOne.initialize(); + taskOne.initialize(); + taskTwo.initialize(); + try { + long sOneId = shuffleOne.getThreadId(); + // It is not in a running state until it has something to do. + + long tOneId = taskOne.getThreadId(); + assertEquals(RmmSparkThreadState.THREAD_RUNNING, RmmSpark.getStateOf(tOneId)); + + long tTwoId = taskTwo.getThreadId(); + assertEquals(RmmSparkThreadState.THREAD_RUNNING, RmmSpark.getStateOf(tTwoId)); + + try (AllocOnAnotherThread firstOne = new CpuAllocOnAnotherThread(taskOne, 5 * 1024 * 1024)) { + firstOne.waitForAlloc(); + // This one should block + try (AllocOnAnotherThread secondOne = new CpuAllocOnAnotherThread(taskTwo, 6 * 1024 * 1024)) { + taskTwo.pollForState(RmmSparkThreadState.THREAD_BLOCKED, 1000, TimeUnit.MILLISECONDS); + // Make sure that shuffle has higher priority than tasks... + try (AllocOnAnotherThread thirdOne = new CpuAllocOnAnotherThread(shuffleOne, 6 * 1024 * 1024, 2)) { + shuffleOne.pollForState(RmmSparkThreadState.THREAD_BLOCKED, 1000, TimeUnit.MILLISECONDS); + // But taskOne is not blocked, so there will be no retry until it is blocked, or else + // it is making progress + taskOne.doIt((TaskThreadOp) () -> { + try { + Thread.sleep(200); + } catch (InterruptedException e) { + throw new RuntimeException(e); + } + return null; + }); + + try { + secondOne.waitForAlloc(); + fail("SHOULD HAVE THROWN..."); + } catch (ExecutionException ee) { + assert (ee.getCause() instanceof CpuRetryOOM); + } + secondOne.freeAndWait(); + + // Free the first allocation to wake up the shuffle thread, but not the second task yet... 
+ firstOne.freeAndWait(); + thirdOne.waitForAlloc(); thirdOne.freeAndWait(); } - secondOne.waitForAlloc(); - secondOne.freeAndWait(); } } } finally { @@ -604,47 +960,110 @@ public void testBasicBUFN() throws ExecutionException, InterruptedException, Tim taskTwo.initialize(); try { long tThreeId = taskThree.getThreadId(); - assertEquals(RmmSparkThreadState.TASK_RUNNING, RmmSpark.getStateOf(tThreeId)); + assertEquals(RmmSparkThreadState.THREAD_RUNNING, RmmSpark.getStateOf(tThreeId)); long tTwoId = taskTwo.getThreadId(); - assertEquals(RmmSparkThreadState.TASK_RUNNING, RmmSpark.getStateOf(tTwoId)); + assertEquals(RmmSparkThreadState.THREAD_RUNNING, RmmSpark.getStateOf(tTwoId)); - try (AllocOnAnotherThread allocThreeOne = new AllocOnAnotherThread(taskThree, 5 * 1024 * 1024)) { + try (AllocOnAnotherThread allocThreeOne = new GpuAllocOnAnotherThread(taskThree, 5 * 1024 * 1024)) { allocThreeOne.waitForAlloc(); - try (AllocOnAnotherThread allocTwoOne = new AllocOnAnotherThread(taskTwo, 3 * 1024 * 1024)) { + try (AllocOnAnotherThread allocTwoOne = new GpuAllocOnAnotherThread(taskTwo, 3 * 1024 * 1024)) { allocTwoOne.waitForAlloc(); - // This one should block - try (AllocOnAnotherThread allocTwoTwo = new AllocOnAnotherThread(taskTwo, 3 * 1024 * 1024)) { - taskTwo.pollForState(RmmSparkThreadState.TASK_BLOCKED, 1000, TimeUnit.MILLISECONDS); + try (AllocOnAnotherThread allocTwoTwo = new GpuAllocOnAnotherThread(taskTwo, 3 * 1024 * 1024)) { + taskTwo.pollForState(RmmSparkThreadState.THREAD_BLOCKED, 1000, TimeUnit.MILLISECONDS); - try (AllocOnAnotherThread allocThreeTwo = new AllocOnAnotherThread(taskThree, 4 * 1024 * 1024)) { + try (AllocOnAnotherThread allocThreeTwo = new GpuAllocOnAnotherThread(taskThree, 4 * 1024 * 1024)) { // This one should be able to allocate because there is not enough memory, but // now all the threads would be blocked, so the lowest priority thread is going to // become BUFN - taskThree.pollForState(RmmSparkThreadState.TASK_BUFN_WAIT, 1000, TimeUnit.MILLISECONDS); + taskThree.pollForState(RmmSparkThreadState.THREAD_BUFN_WAIT, 1000, TimeUnit.MILLISECONDS); try { allocThreeTwo.waitForAlloc(); fail("ALLOC AFTER BUFN SHOULD HAVE THROWN..."); } catch (ExecutionException ee) { - assert(ee.getCause() instanceof RetryOOM); + assert(ee.getCause() instanceof GpuRetryOOM); } // allocOneTwo cannot be freed, nothing was allocated because it threw an exception. allocThreeOne.freeAndWait(); Future f = taskThree.blockUntilReady(); - taskThree.pollForState(RmmSparkThreadState.TASK_BUFN, 1000, TimeUnit.MILLISECONDS); + taskThree.pollForState(RmmSparkThreadState.THREAD_BUFN, 1000, TimeUnit.MILLISECONDS); // taskOne should only wake up after we finish task 2 // Task two is now able to alloc allocTwoTwo.freeAndWait(); allocTwoOne.freeAndWait(); // Task two has freed things, but is still not done, so task one will stay blocked... - taskTwo.pollForState(RmmSparkThreadState.TASK_RUNNING, 1000, TimeUnit.MILLISECONDS); - taskThree.pollForState(RmmSparkThreadState.TASK_BUFN, 1000, TimeUnit.MILLISECONDS); + taskTwo.pollForState(RmmSparkThreadState.THREAD_RUNNING, 1000, TimeUnit.MILLISECONDS); + taskThree.pollForState(RmmSparkThreadState.THREAD_BUFN, 1000, TimeUnit.MILLISECONDS); taskTwo.done().get(1000, TimeUnit.MILLISECONDS); // Now that task two is done see if task one is running again... 
-          taskThree.pollForState(RmmSparkThreadState.TASK_RUNNING, 1000, TimeUnit.MILLISECONDS);
+          taskThree.pollForState(RmmSparkThreadState.THREAD_RUNNING, 1000, TimeUnit.MILLISECONDS);
+          // Now we could finish trying our allocations, but this is good enough...
+            }
+          }
+        }
+      }
+    } finally {
+      taskThree.done();
+      taskTwo.done();
+    }
+  }
+
+  @Test
+  public void testBasicBUFNCpu() throws ExecutionException, InterruptedException, TimeoutException {
+    // 10 MiB
+    setupRmmForTestingWithLimits(10 * 1024 * 1024);
+    LimitingOffHeapAllocForTests.setLimit(10 * 1024 * 1024);
+    // A task id of 3 is higher than a task id of 2, so it should be a lower
+    // priority and become BUFN ahead of taskTwo.
+    TaskThread taskThree = new TaskThread("TEST THREAD THREE", 3);
+    TaskThread taskTwo = new TaskThread("TEST THREAD TWO", 2);
+    taskThree.initialize();
+    taskTwo.initialize();
+    try {
+      long tThreeId = taskThree.getThreadId();
+      assertEquals(RmmSparkThreadState.THREAD_RUNNING, RmmSpark.getStateOf(tThreeId));
+
+      long tTwoId = taskTwo.getThreadId();
+      assertEquals(RmmSparkThreadState.THREAD_RUNNING, RmmSpark.getStateOf(tTwoId));
+
+      try (AllocOnAnotherThread allocThreeOne = new CpuAllocOnAnotherThread(taskThree, 5 * 1024 * 1024)) {
+        allocThreeOne.waitForAlloc();
+        try (AllocOnAnotherThread allocTwoOne = new CpuAllocOnAnotherThread(taskTwo, 3 * 1024 * 1024)) {
+          allocTwoOne.waitForAlloc();
+
+          try (AllocOnAnotherThread allocTwoTwo = new CpuAllocOnAnotherThread(taskTwo, 3 * 1024 * 1024)) {
+            taskTwo.pollForState(RmmSparkThreadState.THREAD_BLOCKED, 1000, TimeUnit.MILLISECONDS);
+
+            try (AllocOnAnotherThread allocThreeTwo = new CpuAllocOnAnotherThread(taskThree, 4 * 1024 * 1024)) {
+              // This one should not be able to allocate because there is not enough memory, but
+              // now all the threads would be blocked, so the lowest priority thread is going to
+              // become BUFN
+              taskThree.pollForState(RmmSparkThreadState.THREAD_BUFN_WAIT, 1000, TimeUnit.MILLISECONDS);
+              try {
+                allocThreeTwo.waitForAlloc();
+                fail("ALLOC AFTER BUFN SHOULD HAVE THROWN...");
+              } catch (ExecutionException ee) {
+                assert(ee.getCause() instanceof CpuRetryOOM);
+              }
+              // allocThreeTwo cannot be freed, nothing was allocated because it threw an exception.
+              allocThreeOne.freeAndWait();
+              Future f = taskThree.blockUntilReady();
+              taskThree.pollForState(RmmSparkThreadState.THREAD_BUFN, 1000, TimeUnit.MILLISECONDS);
+
+              // taskThree should only wake up after we finish taskTwo
+              // Task two is now able to alloc
+              allocTwoTwo.freeAndWait();
+              allocTwoOne.freeAndWait();
+              // Task two has freed things, but is still not done, so task three will stay blocked...
+              taskTwo.pollForState(RmmSparkThreadState.THREAD_RUNNING, 1000, TimeUnit.MILLISECONDS);
+              taskThree.pollForState(RmmSparkThreadState.THREAD_BUFN, 1000, TimeUnit.MILLISECONDS);
+
+              taskTwo.done().get(1000, TimeUnit.MILLISECONDS);
+              // Now that task two is done see if task three is running again...
+              taskThree.pollForState(RmmSparkThreadState.THREAD_RUNNING, 1000, TimeUnit.MILLISECONDS);
+              // Now we could finish trying our allocations, but this is good enough...
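+              // The sequence of states exercised by these BUFN tests is:
+              //   THREAD_RUNNING -> THREAD_BLOCKED -> THREAD_BUFN_WAIT -> THREAD_BUFN -> THREAD_RUNNING
+              // where the lowest priority thread (the one with the highest task id,
+              // taskThree here) is the one that is rolled back to BUFN.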
} } @@ -666,24 +1085,24 @@ public void testBUFNSplitAndRetrySingleThread() throws ExecutionException, Inter taskOne.initialize(); try { long threadId = taskOne.getThreadId(); - assertEquals(RmmSparkThreadState.TASK_RUNNING, RmmSpark.getStateOf(threadId)); - try (AllocOnAnotherThread one = new AllocOnAnotherThread(taskOne, 5 * 1024 * 1024)) { + assertEquals(RmmSparkThreadState.THREAD_RUNNING, RmmSpark.getStateOf(threadId)); + try (AllocOnAnotherThread one = new GpuAllocOnAnotherThread(taskOne, 5 * 1024 * 1024)) { one.waitForAlloc(); - try (AllocOnAnotherThread two = new AllocOnAnotherThread(taskOne, 6 * 1024 * 1024)) { + try (AllocOnAnotherThread two = new GpuAllocOnAnotherThread(taskOne, 6 * 1024 * 1024)) { two.waitForAlloc(); fail("Expect that allocating more memory than is allowed would fail"); } catch (ExecutionException oom) { - assert oom.getCause() instanceof RetryOOM : oom.toString(); + assert oom.getCause() instanceof GpuRetryOOM : oom.toString(); } try { taskOne.blockUntilReady().get(1000, TimeUnit.MILLISECONDS); fail("Expect split and retry after all tasks blocked."); } catch (ExecutionException oom) { - assert oom.getCause() instanceof SplitAndRetryOOM : oom.toString(); + assert oom.getCause() instanceof GpuSplitAndRetryOOM : oom.toString(); } - assertEquals(RmmSparkThreadState.TASK_RUNNING, RmmSpark.getStateOf(threadId)); + assertEquals(RmmSparkThreadState.THREAD_RUNNING, RmmSpark.getStateOf(threadId)); // Now we try to allocate with half the data. - try (AllocOnAnotherThread secondTry = new AllocOnAnotherThread(taskOne, 3 * 1024 * 1024)) { + try (AllocOnAnotherThread secondTry = new GpuAllocOnAnotherThread(taskOne, 3 * 1024 * 1024)) { secondTry.waitForAlloc(); } } @@ -697,8 +1116,9 @@ public void testInsertMultipleOOMs() { Rmm.initialize(RmmAllocationMode.CUDA_DEFAULT, null, 10 * 1024 * 1024); RmmSpark.setEventHandler(new BaseRmmEventHandler(), "stderr"); long threadId = RmmSpark.getCurrentThreadId(); - long taskid = 0; // This is arbitrary - RmmSpark.associateThreadWithTask(threadId, taskid); + long taskId = 0; // This is arbitrary + Thread t = Thread.currentThread(); + RmmSpark.startDedicatedTaskThread(threadId, taskId, t); try { // Allocate something small and verify that it works... Rmm.alloc(100).close(); @@ -707,7 +1127,7 @@ public void testInsertMultipleOOMs() { int numRetryOOMs = 3; RmmSpark.forceRetryOOM(threadId, numRetryOOMs); for (int i = 0; i < numRetryOOMs; i++) { - assertThrows(RetryOOM.class, () -> Rmm.alloc(100).close()); + assertThrows(GpuRetryOOM.class, () -> Rmm.alloc(100).close()); // Verify that injecting OOM does not cause the block to actually happen RmmSpark.blockThreadUntilReady(); } @@ -719,7 +1139,7 @@ public void testInsertMultipleOOMs() { int numSplitAndRetryOOMs = 5; RmmSpark.forceSplitAndRetryOOM(threadId, numSplitAndRetryOOMs); for (int i = 0; i < numSplitAndRetryOOMs; i++) { - assertThrows(SplitAndRetryOOM.class, () -> Rmm.alloc(100).close()); + assertThrows(GpuSplitAndRetryOOM.class, () -> Rmm.alloc(100).close()); // Verify that injecting OOM does not cause the block to actually happen RmmSpark.blockThreadUntilReady(); } @@ -727,7 +1147,7 @@ public void testInsertMultipleOOMs() { // Allocate something small and verify that it works... 
Rmm.alloc(100).close(); } finally { - RmmSpark.removeThreadAssociation(threadId); + RmmSpark.removeDedicatedThreadAssociation(threadId, taskId); } } @@ -736,8 +1156,9 @@ public void testCudfException() { Rmm.initialize(RmmAllocationMode.CUDA_DEFAULT, null, 10 * 1024 * 1024); RmmSpark.setEventHandler(new BaseRmmEventHandler(), "stderr"); long threadId = RmmSpark.getCurrentThreadId(); - long taskid = 0; // This is arbitrary - RmmSpark.associateThreadWithTask(threadId, taskid); + long taskId = 0; // This is arbitrary + Thread t = Thread.currentThread(); + RmmSpark.startDedicatedTaskThread(threadId, taskId, t); try { // Allocate something small and verify that it works... Rmm.alloc(100).close(); @@ -754,7 +1175,7 @@ public void testCudfException() { // Allocate something small and verify that it works... Rmm.alloc(100).close(); } finally { - RmmSpark.removeThreadAssociation(threadId); + RmmSpark.removeDedicatedThreadAssociation(threadId, taskId); } } @@ -763,23 +1184,24 @@ public void retryWatchdog() { // 10 MiB setupRmmForTestingWithLimits(10 * 1024 * 1024); long threadId = RmmSpark.getCurrentThreadId(); - long taskid = 0; // This is arbitrary + long taskId = 0; // This is arbitrary long numRetries = 0; - RmmSpark.associateThreadWithTask(threadId, taskid); + Thread t = Thread.currentThread(); + RmmSpark.startDedicatedTaskThread(threadId, taskId, t); long startTime = System.nanoTime(); try (DeviceMemoryBuffer filler = Rmm.alloc(9 * 1024 * 1024)) { while (numRetries < 10000) { try { Rmm.alloc(2 * 1024 * 1024).close(); fail("overallocation should have failed"); - } catch (RetryOOM room) { + } catch (GpuRetryOOM room) { numRetries++; try { RmmSpark.blockThreadUntilReady(); - } catch (SplitAndRetryOOM sroom) { + } catch (GpuSplitAndRetryOOM sroom) { numRetries++; } - } catch (SplitAndRetryOOM sroom) { + } catch (GpuSplitAndRetryOOM sroom) { fail("retry should be thrown before split and retry..."); } } @@ -788,7 +1210,7 @@ public void retryWatchdog() { // The 500 is hard coded in the code below assertEquals(500, numRetries); } finally { - RmmSpark.removeThreadAssociation(threadId); + RmmSpark.removeDedicatedThreadAssociation(threadId, taskId); } long endTime = System.nanoTime(); System.err.println("Took " + (endTime - startTime) + "ns to retry 500 times..."); @@ -812,14 +1234,15 @@ public void testAllocationDuringSpill() { // 10 MiB setupRmmForTestingWithLimits(10 * 1024 * 1024, rmmEventHandler); long threadId = RmmSpark.getCurrentThreadId(); - long taskid = 0; // This is arbitrary - RmmSpark.associateThreadWithTask(threadId, taskid); + long taskId = 0; // This is arbitrary + Thread t = Thread.currentThread(); + RmmSpark.startDedicatedTaskThread(threadId, taskId, t); assertThrows(GpuOOM.class, () -> { try (DeviceMemoryBuffer filler = Rmm.alloc(9 * 1024 * 1024)) { try (DeviceMemoryBuffer shouldFail = Rmm.alloc(2 * 1024 * 1024)) {} fail("overallocation should have failed"); } finally { - RmmSpark.removeThreadAssociation(threadId); + RmmSpark.removeDedicatedThreadAssociation(threadId, taskId); } }); assertEquals(11, rmmEventHandler.getAllocationCount()); @@ -832,14 +1255,15 @@ public void testAllocationFailedDuringSpill() { // 10 MiB setupRmmForTestingWithLimits(10 * 1024 * 1024, rmmEventHandler); long threadId = RmmSpark.getCurrentThreadId(); - long taskid = 0; // This is arbitrary - RmmSpark.associateThreadWithTask(threadId, taskid); + long taskId = 0; // This is arbitrary + Thread t = Thread.currentThread(); + RmmSpark.startDedicatedTaskThread(threadId, taskId, t); assertThrows(GpuOOM.class, () -> 
{ try (DeviceMemoryBuffer filler = Rmm.alloc(9 * 1024 * 1024)) { try (DeviceMemoryBuffer shouldFail = Rmm.alloc(2 * 1024 * 1024)) {} fail("overallocation should have failed"); } finally { - RmmSpark.removeThreadAssociation(threadId); + RmmSpark.removeDedicatedThreadAssociation(threadId, taskId); } }); assertEquals(0, rmmEventHandler.getAllocationCount()); From ab9ed762ace5d9a8aceb89871f7fdc3d86345a99 Mon Sep 17 00:00:00 2001 From: Jenkins Automation <70000568+nvauto@users.noreply.github.com> Date: Thu, 30 Nov 2023 06:04:07 +0800 Subject: [PATCH 025/127] Update submodule cudf to e15290a373ff0c84c85c2c0e940e69377a66cf96 (#1605) Signed-off-by: spark-rapids automation <70000568+nvauto@users.noreply.github.com> --- thirdparty/cudf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/thirdparty/cudf b/thirdparty/cudf index 8da62049ae..e15290a373 160000 --- a/thirdparty/cudf +++ b/thirdparty/cudf @@ -1 +1 @@ -Subproject commit 8da62049aee750b391ff6d8ca4937428f94fd10c +Subproject commit e15290a373ff0c84c85c2c0e940e69377a66cf96 From ddc2410324dccf9e50f5af0476748c5363014608 Mon Sep 17 00:00:00 2001 From: Jenkins Automation <70000568+nvauto@users.noreply.github.com> Date: Thu, 30 Nov 2023 22:02:01 +0800 Subject: [PATCH 026/127] Update submodule cudf to d528c95beb471d5e95a9b24b9d54351496fef11a (#1606) Signed-off-by: spark-rapids automation <70000568+nvauto@users.noreply.github.com> --- thirdparty/cudf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/thirdparty/cudf b/thirdparty/cudf index e15290a373..d528c95beb 160000 --- a/thirdparty/cudf +++ b/thirdparty/cudf @@ -1 +1 @@ -Subproject commit e15290a373ff0c84c85c2c0e940e69377a66cf96 +Subproject commit d528c95beb471d5e95a9b24b9d54351496fef11a From 9c3c7a6d05bcaeee6e502203a13b167ec1748869 Mon Sep 17 00:00:00 2001 From: Jason Lowe Date: Fri, 1 Dec 2023 13:29:30 -0600 Subject: [PATCH 027/127] Fix faultinj build error after spdlog/fmt upgrade (#1608) Signed-off-by: Jason Lowe --- src/main/cpp/faultinj/faultinj.cu | 7 +++++-- thirdparty/cudf | 2 +- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/src/main/cpp/faultinj/faultinj.cu b/src/main/cpp/faultinj/faultinj.cu index 19902783b8..6903ebf446 100644 --- a/src/main/cpp/faultinj/faultinj.cu +++ b/src/main/cpp/faultinj/faultinj.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022, NVIDIA CORPORATION. + * Copyright (c) 2022-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -31,6 +31,9 @@ #include #include +// Format enums for logging +auto format_as(CUpti_CallbackDomain domain) { return fmt::underlying(domain); } + namespace { #define CUPTI_CALL(call) \ @@ -392,7 +395,7 @@ void readFaultInjectorConfig(void) std::srand(seed); const spdlog::level::level_enum logLevelEnum = static_cast(logLevel); - spdlog::info("changed log level to {}", logLevelEnum); + spdlog::info("changed log level to {}", logLevel); spdlog::set_level(logLevelEnum); traceConfig(globalControl.configRoot); diff --git a/thirdparty/cudf b/thirdparty/cudf index d528c95beb..c8074b5176 160000 --- a/thirdparty/cudf +++ b/thirdparty/cudf @@ -1 +1 @@ -Subproject commit d528c95beb471d5e95a9b24b9d54351496fef11a +Subproject commit c8074b5176a74630101c78c43c24b66141352b24 From df72c289fedd0497ccbf0ea0e7a5b986fb94239e Mon Sep 17 00:00:00 2001 From: Mike Wilson Date: Mon, 4 Dec 2023 15:29:47 -0500 Subject: [PATCH 028/127] Updating `parse_uri` to properly validate the URI before returning results (#1554) * Radical rewrite to properly validate URI * Updated to add support for utf8 and escaped hex * Fixing IPv6 issue and code cleanup * Fixing incorrect port string generation. Thanks Haoyang! Signed-off-by: Mike Wilson --- src/main/cpp/src/parse_uri.cu | 920 +++++++++++++----- src/main/cpp/tests/parse_uri.cpp | 113 ++- .../nvidia/spark/rapids/jni/ParseURITest.java | 112 ++- 3 files changed, 902 insertions(+), 243 deletions(-) diff --git a/src/main/cpp/src/parse_uri.cu b/src/main/cpp/src/parse_uri.cu index 54e79ab022..d0629cb71f 100644 --- a/src/main/cpp/src/parse_uri.cu +++ b/src/main/cpp/src/parse_uri.cu @@ -18,11 +18,13 @@ #include #include +#include #include #include #include #include #include +#include #include #include @@ -34,251 +36,728 @@ namespace spark_rapids_jni { using namespace cudf; namespace detail { + +struct uri_parts { + string_view scheme; + string_view host; + string_view authority; + string_view path; + string_view fragment; + string_view query; + string_view userinfo; + string_view port; + string_view opaque; + bool valid{false}; +}; + +enum class URI_chunks : int8_t { PROTOCOL, HOST, AUTHORITY, PATH, QUERY, USERINFO }; + +enum class chunk_validity : int8_t { VALID, INVALID, FATAL }; + namespace { -// utility to validate a character is valid in a URI -constexpr bool is_valid_character(char ch, bool alphanum_only) +// Some parsing errors are fatal and some parsing errors simply mean this +// thing doesn't exist or is invalid. For example, just because 280.0.1.16 is +// not a valid IPv4 address simply means if asking for the host the host is null +// but the authority is still 280.0.1.16 and the uri is not considered invalid. +// By contrast, the URI https://[15:6:g:invalid] will not return https for the +// scheme and is considered completely invalid. + +constexpr bool is_alpha(char c) { return (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z'); } + +constexpr bool is_numeric(char c) { return c >= '0' && c <= '9'; } + +constexpr bool is_alphanum(char c) { return is_alpha(c) || is_numeric(c); } + +constexpr bool is_hex(char c) { - if (alphanum_only) { - if (ch >= '-' && ch <= '9' && ch != '/') return true; // 0-9 and .- - if (ch >= 'A' && ch <= 'Z') return true; // A-Z - if (ch >= 'a' && ch <= 'z') return true; // a-z - } else { - if (ch >= '!' 
&& ch <= ';' && ch != '"') return true; // 0-9 and !#%&'()*+,-./ - if (ch >= '=' && ch <= 'Z' && ch != '>') return true; // A-Z and =?@ - if (ch >= '_' && ch <= 'z' && ch != '`') return true; // a-z and _ + return is_numeric(c) || (c >= 'a' && c <= 'f') || (c >= 'A' && c <= 'F'); +} + +__device__ thrust::pair skip_and_validate_special( + string_view::const_iterator iter, + string_view::const_iterator end, + bool allow_invalid_escapes = false) +{ + while (iter != end) { + auto const c = *iter; + auto const num_bytes = cudf::strings::detail::bytes_in_char_utf8(*iter); + if (*iter == '%' && !allow_invalid_escapes) { + // verify following two characters are hexadecimal + for (int i = 0; i < 2; ++i) { + ++iter; + if (iter == end) { return {false, iter}; } + + if (!is_hex(*iter)) { return {false, iter}; } + } + } else if (num_bytes > 1) { + // UTF8 validation means it isn't whitespace and not a control character + // the normal validation will handle anything single byte, this checks for multiple byte + // whitespace + auto const c = *iter; + // There are multi-byte looking things like extended ASCII characters that are not valid UTF8. + // Check that here. + if ((c & 0xC0) != 0x80) { return {false, iter}; } + if (num_bytes > 2 && ((c & 0xC000) != 0x8000)) { return {false, iter}; } + if (num_bytes > 3 && ((c & 0xC00000) != 0x800000)) { return {false, iter}; } + + // Validate it isn't a whitespace or control unicode character. + if ((c >= 0xc280 && c <= 0xc2a0) || c == 0xe19a80 || (c >= 0xe28080 && c <= 0xe2808a) || + c == 0xe280af || c == 0xe280a8 || c == 0xe2819f || c == 0xe38080) { + return {false, iter}; + } + } else { + break; + } + ++iter; } - return false; + + return {true, iter}; } -/** - * @brief Count the number of characters of each string after parsing the protocol. - * - * @tparam num_warps_per_threadblock Number of warps in a threadblock. This template argument must - * match the launch configuration, i.e. the kernel must be launched with - * `num_warps_per_threadblock * cudf::detail::warp_size` threads per threadblock. - * @tparam char_block_size Number of characters which will be loaded into the shared memory at a - * time. - * - * @param in_strings Input string column - * @param out_counts Number of characters in each decode URL - * @param out_validity Bitmask of validity data, updated in funcion - */ -template -__global__ void parse_uri_protocol_char_counter(column_device_view const in_strings, - size_type* const out_counts, - bitmask_type* out_validity) +template +__device__ bool validate_chunk(string_view s, Predicate fn, bool allow_invalid_escapes = false) { - __shared__ char temporary_buffer[num_warps_per_threadblock][char_block_size]; - __shared__ typename cub::WarpScan::TempStorage cub_storage[num_warps_per_threadblock]; - __shared__ bool found_token[num_warps_per_threadblock]; - - auto const global_thread_id = cudf::detail::grid_1d::global_thread_id(); - auto const global_warp_id = static_cast(global_thread_id / cudf::detail::warp_size); - auto const local_warp_id = static_cast(threadIdx.x / cudf::detail::warp_size); - auto const warp_lane = static_cast(threadIdx.x % cudf::detail::warp_size); - auto const nwarps = static_cast(gridDim.x * blockDim.x / cudf::detail::warp_size); - char* in_chars_shared = temporary_buffer[local_warp_id]; - - // Loop through strings, and assign each string to a warp. 
- for (thread_index_type tidx = global_warp_id; tidx < in_strings.size(); tidx += nwarps) { - auto const row_idx = static_cast(tidx); - if (in_strings.is_null(row_idx)) { - if (warp_lane == 0) out_counts[row_idx] = 0; + auto iter = s.begin(); + { + auto [valid, iter_] = skip_and_validate_special(iter, s.end(), allow_invalid_escapes); + iter = std::move(iter_); + if (!valid) { return false; } + } + while (iter != s.end()) { + if (!fn(iter)) { return false; } + + iter++; + auto [valid, iter_] = skip_and_validate_special(iter, s.end(), allow_invalid_escapes); + iter = std::move(iter_); + if (!valid) { return false; } + } + return true; +} + +bool __device__ validate_scheme(string_view scheme) +{ + // A scheme simply needs to be an alpha character followed by alphanumeric + auto iter = scheme.begin(); + if (!is_alpha(*iter)) { return false; } + while (++iter != scheme.end()) { + auto const c = *iter; + if (!is_alphanum(c) && c != '+' && c != '-' && c != '.') { return false; } + } + return true; +} + +bool __device__ validate_ipv6(string_view s) +{ + constexpr auto max_colons{8}; + + if (s.size_bytes() < 2) { return false; } + + bool found_double_colon{false}; + int open_bracket_count{0}; + int close_bracket_count{0}; + int period_count{0}; + int colon_count{0}; + int percent_count{0}; + char previous_char{0}; + int address{0}; + int address_char_count{0}; + bool address_has_hex{false}; + + auto const leading_double_colon = [&]() { + auto iter = s.begin(); + if (*iter == '[') iter++; + return *iter++ == ':' && *iter == ':'; + }(); + + for (auto iter = s.begin(); iter < s.end(); ++iter) { + auto const c = *iter; + + switch (c) { + case '[': + open_bracket_count++; + if (open_bracket_count > 1) { return false; } + break; + case ']': + close_bracket_count++; + if (close_bracket_count > 1) { return false; } + if ((period_count > 0) && (address_has_hex || address > 255)) { return false; } + break; + case ':': + colon_count++; + if (previous_char == ':') { + if (found_double_colon) { return false; } + found_double_colon = true; + } + address = 0; + address_has_hex = false; + address_char_count = 0; + if (colon_count > max_colons || (colon_count == max_colons && !found_double_colon)) { + return false; + } + // Periods before a colon don't work, periods can be an IPv4 address after this IPv6 address + // like [1:2:3:4:5:6:d.d.d.d] + if (period_count > 0 || percent_count > 0) { return false; } + break; + case '.': + period_count++; + if (percent_count > 0) { return false; } + if (period_count > 3) { return false; } + if (address_has_hex) { return false; } + if (address > 255) { return false; } + if (colon_count != 6 && !found_double_colon) { return false; } + // Special case of ::1:2:3:4:5:d.d.d.d has 7 colons - but spark says this is invalid + // if (colon_count == max_colons && !leading_double_colon) { return false; } + if (colon_count >= max_colons) { return false; } + address = 0; + address_has_hex = false; + address_char_count = 0; + break; + case '%': + // IPv6 can define a device to use for the routing. This is expressed as '%eth0' at the end + // of the address. 
+        percent_count++;
+        if (percent_count > 1) { return false; }
+        if ((period_count > 0) && (address_has_hex || address > 255)) { return false; }
+        address = 0;
+        address_has_hex = false;
+        address_char_count = 0;
+        break;
+      default:
+        // after % all bets are off as the device name can be nearly anything
+        if (percent_count == 0) {
+          if (address_char_count > 3) { return false; }
+          address_char_count++;
+          address *= 10;
+          if (c >= 'a' && c <= 'f') {
+            address += 10;
+            address += c - 'a';
+            address_has_hex = true;
+          } else if (c >= 'A' && c <= 'Z') {
+            address += 10;
+            address += c - 'A';
+            address_has_hex = true;
+          } else if (c >= '0' && c <= '9') {
+            address += c - '0';
+          } else {
+            return false;
+          }
+        }
+        break;
+    }
+    previous_char = c;
+  }
+
+  return true;
+}
+
+bool __device__ validate_ipv4(string_view s)
+{
+  // dotted quad (0-255).(0-255).(0-255).(0-255)
+  int address = 0;
+  int address_char_count = 0;
+  int dot_count = 0;
+  for (auto iter = s.begin(); iter < s.end(); ++iter) {
+    auto const c = *iter;
+
+    // can't lead with a .
+    if ((c < '0' || c > '9') && (iter == s.begin() || c != '.')) { return false; }
+
+    if (c == '.') {
+      // verify we saw at least one character and reset values
+      if (address_char_count == 0) { return false; }
+      address = 0;
+      address_char_count = 0;
+      dot_count++;
+      continue;
+    }
+
+    address_char_count++;
+    address *= 10;
+    address += c - '0';
+
+    if (address > 255) { return false; }
+  }
+
+  // can't end with a .
+  if (address_char_count == 0) { return false; }
+
+  // must be 4 portions separated by 3 dots.
+  if (dot_count != 3) { return false; }
+
+  return true;
+}
+
+bool __device__ validate_domain_name(string_view name)
+{
+  // domain name can be alphanum or -.
+  // a '-' can not be the first or last character of the domain name or around a .
+  bool last_was_slash = false;
+  bool last_was_period = false;
+  bool numeric_start = false;
+  for (auto iter = name.begin(); iter < name.end(); ++iter) {
+    auto const c = *iter;
+    if (!is_alphanum(c) && c != '-' && c != '.') { return false; }
+
+    // the final section can't start with a digit
+    if (last_was_period && c >= '0' && c <= '9') {
+      numeric_start = true;
+    } else {
+      numeric_start = false;
+    }
+
+    if (c == '-') {
+      if (last_was_period || iter == name.begin() || iter == --name.end()) { return false; }
+      last_was_slash = true;
+      last_was_period = false;
+    } else if (c == '.') {
+      if (last_was_slash) { return false; }
+      last_was_period = true;
+      last_was_slash = false;
+    } else {
+      last_was_period = false;
+      last_was_slash = false;
+    }
+  }
+
+  // numeric start to last part of domain isn't allowed.
+  if (numeric_start) { return false; }
+
+  return true;
+}
+
+chunk_validity __device__ validate_host(string_view host)
+{
+  // This can be IPv4, IPv6, or a domain name.
+  if (*host.begin() == '[') {
+    // If last character is a ], this is IPv6 or invalid.
+    if (*(host.end() - 1) != ']') {
+      // invalid
+      return chunk_validity::FATAL;
+    }
+    if (!validate_ipv6(host)) { return chunk_validity::FATAL; }
+
+    return chunk_validity::VALID;
+  }
-
-    // valid until proven otherwise
-    bool valid{true};
+  // If there are more [ or ] characters this is invalid.
+  // Also need to find the last .
+ int last_open_bracket = -1; + int last_close_bracket = -1; + int last_period = -1; + + // The original plan on this loop was to get fancy and use a reverse iterator and exit when + // everything was found, but the expectation is there are no brackets in this string, so we have + // to traverse the entire thing anyway to verify that. The math is easier with a forward iterator, + // so we're back here. + for (auto iter = host.begin(); iter < host.end(); ++iter) { + auto const c = *iter; + if (c == '[') { + last_open_bracket = iter.position(); + } else if (c == ']') { + last_close_bracket = iter.position(); + } else if (c == '.') { + last_period = iter.position(); + } + } - // Use the last thread of the warp to initialize `found_token` to false. - if (warp_lane == cudf::detail::warp_size - 1) { found_token[local_warp_id] = false; } + if (last_open_bracket >= 0 || last_close_bracket >= 0) { return chunk_validity::FATAL; } - for (size_type block_idx = 0; block_idx < nblocks && valid; block_idx++) { - auto const string_length_block = - std::min(char_block_size, string_length - char_block_size * block_idx); + // If we didn't find a period or if the last character is a period or the character after the last + // period is non numeric + if (last_period < 0 || last_period == host.length() - 1 || host[last_period + 1] < '0' || + host[last_period + 1] > '9') { + // must be domain name or it is invalid + if (validate_domain_name(host)) { return chunk_validity::VALID; } - // Each warp collectively loads input characters of the current block to the shared memory. - for (auto char_idx = warp_lane; char_idx < string_length_block; - char_idx += cudf::detail::warp_size) { - auto const in_idx = block_idx * char_block_size + char_idx; - in_chars_shared[char_idx] = in_idx < string_length ? in_chars[in_idx] : 0; + // the only other option is that this is a IPv4 address + } else if (validate_ipv4(host)) { + return chunk_validity::VALID; + } + + return chunk_validity::INVALID; +} + +bool __device__ validate_query(string_view query) +{ + // query can be alphanum and _-!.~'()*,;:$&+=?/[]@" + return validate_chunk(query, [] __device__(string_view::const_iterator iter) { + auto const c = *iter; + if (c != '!' && c != '"' && c != '$' && !(c >= '&' && c <= ';') && c != '=' && + !(c >= '?' && c <= ']' && c != '\\') && !(c >= 'a' && c <= 'z') && c != '_' && c != '~') { + return false; + } + return true; + }); +} + +bool __device__ validate_authority(string_view authority, bool allow_invalid_escapes) +{ + // authority needs to be alphanum and @[]_-!.'()*,;:$&+= + return validate_chunk( + authority, + [allow_invalid_escapes] __device__(string_view::const_iterator iter) { + auto const c = *iter; + if (c != '!' && c != '$' && !(c >= '&' && c <= ';' && c != '/') && c != '=' && + !(c >= '@' && c <= '_' && c != '^' && c != '\\') && !(c >= 'a' && c <= 'z') && c != '~' && + (!allow_invalid_escapes || c != '%')) { + return false; } + return true; + }, + allow_invalid_escapes); +} - __syncwarp(); - - // `char_idx_start` represents the start character index of the current warp. - for (size_type char_idx_start = 0; char_idx_start < string_length_block; - char_idx_start += cudf::detail::warp_size) { - auto const char_idx = char_idx_start + warp_lane; - char const* const ch_ptr = in_chars_shared + char_idx; - - // need to know if the character we are validating is before or after the token - // as valid characters changes. Default to 1 to handle the case where we have - // alreayd found the token and do not search for it again. 
- int8_t out_tokens{1}; - if (!found_token[local_warp_id]) { - // Warp-wise prefix sum to establish tokens of string. - // All threads in the warp participate in the prefix sum, even if `char_idx` is beyond - // `string_length_block`. - int8_t const is_token = (char_idx < string_length_block && *ch_ptr == ':') ? 1 : 0; - cub::WarpScan(cub_storage[local_warp_id]).InclusiveSum(is_token, out_tokens); - } +bool __device__ validate_userinfo(string_view userinfo) +{ + // can't be ] or [ in here + return validate_chunk(userinfo, [] __device__(string_view::const_iterator iter) { + auto const c = *iter; + if (c == '[' || c == ']') { return false; } + return true; + }); +} - auto const before_token = out_tokens == 0; - valid = valid && __ballot_sync(0xffffffff, - (char_idx >= string_length_block || - is_valid_character(*ch_ptr, before_token)) - ? 0 - : 1) == 0; - if (!valid) { - // last thread in warp sets validity - if (warp_lane == cudf::detail::warp_size - 1) { - clear_bit(out_validity, row_idx); - out_counts[row_idx] = 0; - } +bool __device__ validate_port(string_view port) +{ + // port is positive numeric >=0 according to spark...shrug + return validate_chunk(port, [] __device__(string_view::const_iterator iter) { + auto const c = *iter; + if (c < '0' && c > '9') { return false; } + return true; + }); +} + +bool __device__ validate_path(string_view path) +{ + // path can be alphanum and @[]_-!.~'()*?/&,;:$+= + return validate_chunk(path, [] __device__(string_view::const_iterator iter) { + auto const c = *iter; + if (c != '!' && c != '$' && !(c >= '&' && c <= ';') && c != '=' && !(c >= '@' && c <= 'Z') && + c != '_' && !(c >= 'a' && c <= 'z') && c != '~') { + return false; + } + return true; + }); +} + +bool __device__ validate_opaque(string_view opaque) +{ + // opaque can be alphanum and @[]_-!.~'()*?/,;:$@+= + return validate_chunk(opaque, [] __device__(string_view::const_iterator iter) { + auto const c = *iter; + if (c != '!' && c != '$' && !(c >= '&' && c <= ';') && c != '=' && + !(c >= '?' && c <= ']' && c != '\\') && c != '_' && c != '~' && !(c >= 'a' && c <= 'z')) { + return false; + } + return true; + }); +} + +bool __device__ validate_fragment(string_view fragment) +{ + // fragment can be alphanum and @[]_-!.~'()*?/,;:$&+= + return validate_chunk(fragment, [] __device__(string_view::const_iterator iter) { + auto const c = *iter; + if (c != '!' && c != '$' && !(c >= '&' && c <= ';') && c != '=' && + !(c >= '?' && c <= ']' && c != '\\') && c != '_' && c != '~' && !(c >= 'a' && c <= 'z')) { + return false; + } + return true; + }); +} + +uri_parts __device__ validate_uri(const char* str, int len) +{ + uri_parts ret; + + // look for :/# characters. 
+  int col = -1;
+  int slash = -1;
+  int hash = -1;
+  int question = -1;
+  for (const char* c = str;
+       c - str < len && (col == -1 || slash == -1 || hash == -1 || question == -1);
+       ++c) {
+    switch (*c) {
+      case ':':
+        if (col == -1) col = c - str;
+        break;
+      case '/':
+        if (slash == -1) slash = c - str;
+        break;
+      case '#':
+        if (hash == -1) hash = c - str;
+        break;
+      case '?':
+        if (question == -1) question = c - str;
+        break;
+      default: break;
+    }
+  }
+
+  // anything after the hash is part of the fragment and ignored for this part
+  if (hash >= 0) {
+    ret.fragment = {str + hash + 1, len - hash - 1};
+    if (!validate_fragment(ret.fragment)) {
+      ret.valid = false;
+      return ret;
+    }
+
+    len = hash;
+
+    if (col > hash) col = -1;
+    if (slash > hash) slash = -1;
+    if (question > hash) question = -1;
+  }
+
+  // if the first ':' is after the other tokens, this doesn't have a scheme or it is invalid
+  if (col != -1 && (slash == -1 || col < slash) && (hash == -1 || col < hash)) {
+    // we have a scheme up to the :
+    ret.scheme = {str, col};
+    if (!validate_scheme(ret.scheme)) {
+      ret.valid = false;
+      return ret;
+    }
+
+    // skip over scheme
+    auto const skip = col + 1;
+    str += skip;
+    len -= skip;
+    question -= skip;
+    hash -= skip;
+    slash -= skip;
+  }
+
+  // no more string to parse is an error
+  if (len <= 0) {
+    ret.valid = false;
+    return ret;
+  }
+
+  // If we have a '/' as the next character, we have a hierarchical uri. If not it is opaque.
+  bool const hierarchical = str[0] == '/';
+  if (hierarchical) {
+    // a '?' will break this into query and path/authority
+    if (question >= 0) {
+      ret.query = {str + question + 1, len - question - 1};
+      if (!validate_query(ret.query)) {
+        ret.valid = false;
+        return ret;
+      }
+    }
+    auto const path_len = question >= 0 ? question : len;
+
+    if (str[0] == '/' && str[1] == '/') {
+      // If we find another '/', we have //authority/path; otherwise we have //authority with no path.
+      int next_slash = -1;
+      for (int i = 2; i < path_len; ++i) {
+        if (str[i] == '/') {
+          next_slash = i;
          break;
        }
      }
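+      // Everything between the leading "//" and the next '/' (or the end of the
+      // path portion when no '/' follows) is treated as the authority below.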
- int8_t out_offset; - cub::WarpScan(cub_storage[local_warp_id]).InclusiveSum(out_size, out_offset); - - // last thread of the warp updates offsets and token since it has the last offset and - // token value - if (warp_lane == cudf::detail::warp_size - 1) { - output_string_size += out_offset; - found_token[local_warp_id] = out_tokens > 0; + // Inspect the authority for userinfo, host, and port + const char* auth = ret.authority.data(); + auto auth_size = ret.authority.size_bytes(); + int amp = -1; + int closingbracket = -1; + int last_colon = -1; + for (int i = 0; i < auth_size; ++i) { + switch (auth[i]) { + case '@': + if (amp == -1) { + amp = i; + if (last_colon > 0) { last_colon = -1; } + if (closingbracket > 0) { closingbracket = -1; } + } + break; + case ':': last_colon = amp > 0 ? i - amp - 1 : i; break; + case ']': + if (closingbracket == -1) closingbracket = amp > 0 ? i - amp : i; + break; } } - __syncwarp(); - } - } + if (amp > 0) { + ret.userinfo = {auth, amp}; + if (!validate_userinfo(ret.userinfo)) { + ret.valid = false; + return ret; + } + // skip over the @ + amp++; - // last thread of the warp sets output size - if (warp_lane == cudf::detail::warp_size - 1) { - if (!found_token[local_warp_id]) { - clear_bit(out_validity, row_idx); - out_counts[row_idx] = 0; - } else if (valid) { - out_counts[row_idx] = output_string_size; + auth += amp; + auth_size -= amp; + } + if (last_colon > 0 && last_colon > closingbracket) { + // Found a port, attempt to parse it + ret.port = {auth + last_colon + 1, auth_size - last_colon - 1}; + if (!validate_port(ret.port)) { + ret.valid = false; + return ret; + } + ret.host = {auth, last_colon}; + } else { + ret.host = {auth, auth_size}; + } + auto host_ret = validate_host(ret.host); + switch (host_ret) { + case chunk_validity::FATAL: ret.valid = false; return ret; + case chunk_validity::INVALID: ret.host = {}; break; + } } + } else { + // path with no authority + ret.path = {str, len}; + } + if (!validate_path(ret.path)) { + ret.valid = false; + return ret; + } + } else { + ret.opaque = {str, len}; + if (!validate_opaque(ret.opaque)) { + ret.valid = false; + return ret; } } + + ret.valid = true; + return ret; } +// A URI is broken into parts or chunks. There are optional chunks and required chunks. A simple URI +// such as `https://www.nvidia.com` is easy to reason about, but it could also be written as +// `www.nvidia.com`, which is still valid. On top of that, there are characters which are allowed in +// certain chunks that are not allowed in others. There have been a multitude of methods attempted +// to get this correct, but at the end of the day, we have to validate the URI completely. This +// means even the simplest task of pulling off every character before the : still requires +// understanding how to validate an ipv6 address. This kernel was originally conceived as a two-pass +// kernel that ran the same code and either filled in offsets or filled in actual data. The problem +// is that to know what characters you need to copy, you need to have parsed the entire string as a +// 2 meg string could have `:/a` at the very end and everything up to that point is protocol or it +// could end in `.com` and now it is a hostname. To prevent the code from parsing it completely for +// length and then parsing it completely to copy the data, we will store off the offset of the +// string of question. 
The length is already stored in the offset column, so we then have a pointer +// and a number of bytes to copy and the second pass boils down to a series of memcpy calls. + /** - * @brief Parse protocol and copy from the input string column to the output char buffer. - * - * @tparam num_warps_per_threadblock Number of warps in a threadblock. This template argument must - * match the launch configuration, i.e. the kernel must be launched with - * `num_warps_per_threadblock * cudf::detail::warp_size` threads per threadblock. - * @tparam char_block_size Number of characters which will be loaded into the shared memory at a - * time. + * @brief Count the number of characters of each string after parsing the protocol. * * @param in_strings Input string column - * @param in_validity Validity vector of output column - * @param out_chars Character buffer for the output string column - * @param out_offsets Offset value of each string associated with `out_chars` + * @param chunk Chunk of URI to return + * @param out_lengths Number of characters in each decode URL + * @param out_offsets Offsets to the start of the chunks + * @param out_validity Bitmask of validity data, updated in function */ -template -__global__ void parse_uri_to_protocol(column_device_view const in_strings, - bitmask_type* in_validity, - char* const out_chars, - size_type const* const out_offsets) +__global__ void parse_uri_char_counter(column_device_view const in_strings, + URI_chunks chunk, + size_type* const out_lengths, + size_type* const out_offsets, + bitmask_type* out_validity) { - __shared__ char temporary_buffer[num_warps_per_threadblock][char_block_size]; - __shared__ typename cub::WarpScan::TempStorage cub_storage[num_warps_per_threadblock]; - __shared__ size_type out_idx[num_warps_per_threadblock]; - __shared__ bool found_token[num_warps_per_threadblock]; - - auto const global_thread_id = cudf::detail::grid_1d::global_thread_id(); - auto const global_warp_id = static_cast(global_thread_id / cudf::detail::warp_size); - auto const local_warp_id = static_cast(threadIdx.x / cudf::detail::warp_size); - auto const warp_lane = static_cast(threadIdx.x % cudf::detail::warp_size); - auto const nwarps = static_cast(gridDim.x * blockDim.x / cudf::detail::warp_size); - char* in_chars_shared = temporary_buffer[local_warp_id]; - - // Loop through strings, and assign each string to a warp - for (thread_index_type tidx = global_warp_id; tidx < in_strings.size(); tidx += nwarps) { + // thread per row + auto const tid = cudf::detail::grid_1d::global_thread_id(); + auto const base_ptr = in_strings.child(strings_column_view::chars_column_index).data(); + + for (thread_index_type tidx = tid; tidx < in_strings.size(); + tidx += cudf::detail::grid_1d::grid_stride()) { auto const row_idx = static_cast(tidx); - if (!bit_is_set(in_validity, row_idx)) { continue; } + if (in_strings.is_null(row_idx)) { + out_lengths[row_idx] = 0; + continue; + } auto const in_string = in_strings.element(row_idx); auto const in_chars = in_string.data(); auto const string_length = in_string.size_bytes(); - auto out_chars_string = out_chars + out_offsets[row_idx]; - auto const nblocks = cudf::util::div_rounding_up_unsafe(string_length, char_block_size); - - // Use the last thread of the warp to initialize `out_idx` to 0 and `found_token` to false. 
- if (warp_lane == cudf::detail::warp_size - 1) { - out_idx[local_warp_id] = 0; - found_token[local_warp_id] = false; - } - __syncwarp(); - - for (size_type block_idx = 0; block_idx < nblocks && !found_token[local_warp_id]; block_idx++) { - auto const string_length_block = - std::min(char_block_size, string_length - char_block_size * block_idx); + auto const uri = validate_uri(in_chars, string_length); + if (!uri.valid) { + out_lengths[row_idx] = 0; + clear_bit(out_validity, row_idx); + } else { + // stash output offsets and lengths for next kernel to do the copy + switch (chunk) { + case URI_chunks::PROTOCOL: + out_lengths[row_idx] = uri.scheme.size_bytes(); + out_offsets[row_idx] = uri.scheme.data() - base_ptr; + break; + case URI_chunks::HOST: + out_lengths[row_idx] = uri.host.size_bytes(); + out_offsets[row_idx] = uri.host.data() - base_ptr; + break; + case URI_chunks::AUTHORITY: + out_lengths[row_idx] = uri.authority.size_bytes(); + out_offsets[row_idx] = uri.authority.data() - base_ptr; + break; + case URI_chunks::PATH: + out_lengths[row_idx] = uri.path.size_bytes(); + out_offsets[row_idx] = uri.path.data() - base_ptr; + break; + case URI_chunks::QUERY: + out_lengths[row_idx] = uri.query.size_bytes(); + out_offsets[row_idx] = uri.query.data() - base_ptr; + break; + case URI_chunks::USERINFO: + out_lengths[row_idx] = uri.userinfo.size_bytes(); + out_offsets[row_idx] = uri.userinfo.data() - base_ptr; + break; + } - // Each warp collectively loads input characters of the current block to shared memory. - for (auto char_idx = warp_lane; char_idx < string_length_block; - char_idx += cudf::detail::warp_size) { - auto const in_idx = block_idx * char_block_size + char_idx; - in_chars_shared[char_idx] = in_idx >= 0 && in_idx < string_length ? in_chars[in_idx] : 0; + if (out_lengths[row_idx] == 0) { + // A URI can be valid, but still have no data for a specific chunk + clear_bit(out_validity, row_idx); } + } + } +} - __syncwarp(); - - // `char_idx_start` represents the start character index of the current warp. - for (size_type char_idx_start = 0; - char_idx_start < string_length_block && !found_token[local_warp_id]; - char_idx_start += cudf::detail::warp_size) { - auto const char_idx = char_idx_start + warp_lane; - char const* const ch_ptr = in_chars_shared + char_idx; - - // Warp-wise prefix sum to establish tokens of string. - // All threads in the warp participate in the prefix sum, even if `char_idx` is beyond - // `string_length_block`. - int8_t const is_token = (char_idx < string_length_block && *ch_ptr == ':') ? 1 : 0; - int8_t out_tokens; - cub::WarpScan(cub_storage[local_warp_id]).InclusiveSum(is_token, out_tokens); - - // If the current character is before the token we will output the character. - int8_t const out_size = (char_idx >= string_length_block || out_tokens > 0) ? 0 : 1; - - // Warp-wise prefix sum to establish output location of the current thread. - // All threads in the warp participate in the prefix sum, even if `char_idx` is beyond - // `string_length_block`. 
- int8_t out_offset; - cub::WarpScan(cub_storage[local_warp_id]).ExclusiveSum(out_size, out_offset); - - // out_size of 1 means this thread writes a byte - if (out_size == 1) { out_chars_string[out_idx[local_warp_id] + out_offset] = *ch_ptr; } - - // last thread of the warp updates the offset and the token - if (warp_lane == cudf::detail::warp_size - 1) { - out_idx[local_warp_id] += (out_offset + out_size); - found_token[local_warp_id] = out_tokens > 0; - } +/** + * @brief Parse protocol and copy from the input string column to the output char buffer. + * + * @param in_strings Input string column + * @param src_offsets Offset value of source strings in in_strings + * @param offsets Offset value of each string associated with `out_chars` + * @param out_chars Character buffer for the output string column + */ +__global__ void parse_uri(column_device_view const in_strings, + size_type const* const src_offsets, + size_type const* const offsets, + char* const out_chars) +{ + auto const tid = cudf::detail::grid_1d::global_thread_id(); + auto const base_ptr = in_strings.child(strings_column_view::chars_column_index).data(); + + for (thread_index_type tidx = tid; tidx < in_strings.size(); + tidx += cudf::detail::grid_1d::grid_stride()) { + auto const row_idx = static_cast(tidx); + auto const len = offsets[row_idx + 1] - offsets[row_idx]; - __syncwarp(); + if (len > 0) { + for (int i = 0; i < len; i++) { + out_chars[offsets[row_idx] + i] = base_ptr[src_offsets[row_idx] + i]; } } } @@ -286,16 +765,16 @@ __global__ void parse_uri_to_protocol(column_device_view const in_strings, } // namespace -std::unique_ptr parse_uri_to_protocol(strings_column_view const& input, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) +std::unique_ptr parse_uri(strings_column_view const& input, + URI_chunks chunk, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { size_type strings_count = input.size(); - if (strings_count == 0) return make_empty_column(type_id::STRING); + if (strings_count == 0) { return make_empty_column(type_id::STRING); } constexpr size_type num_warps_per_threadblock = 4; constexpr size_type threadblock_size = num_warps_per_threadblock * cudf::detail::warp_size; - constexpr size_type char_block_size = 256; auto const num_threadblocks = std::min(65536, cudf::util::div_rounding_up_unsafe(strings_count, num_warps_per_threadblock)); @@ -306,6 +785,9 @@ std::unique_ptr parse_uri_to_protocol(strings_column_view const& input, auto offsets_column = make_numeric_column( data_type{type_to_id()}, offset_count, mask_state::UNALLOCATED, stream, mr); + // build src offsets buffer + auto src_offsets = rmm::device_uvector(strings_count, stream); + // copy null mask rmm::device_buffer null_mask = input.parent().nullable() @@ -315,11 +797,12 @@ std::unique_ptr parse_uri_to_protocol(strings_column_view const& input, // count number of bytes in each string after parsing and store it in offsets_column auto offsets_view = offsets_column->view(); auto offsets_mutable_view = offsets_column->mutable_view(); - parse_uri_protocol_char_counter - <<>>( - *d_strings, - offsets_mutable_view.begin(), - reinterpret_cast(null_mask.data())); + parse_uri_char_counter<<>>( + *d_strings, + chunk, + offsets_mutable_view.begin(), + reinterpret_cast(src_offsets.data()), + reinterpret_cast(null_mask.data())); // use scan to transform number of bytes into offsets thrust::exclusive_scan(rmm::exec_policy(stream), @@ -335,13 +818,12 @@ std::unique_ptr parse_uri_to_protocol(strings_column_view const& 
input, auto chars_column = cudf::strings::detail::create_chars_child_column(out_chars_bytes, stream, mr); auto d_out_chars = chars_column->mutable_view().data(); - // parse and copy the characters from the input column to the output column - parse_uri_to_protocol - <<>>( - *d_strings, - reinterpret_cast(null_mask.data()), - d_out_chars, - offsets_column->view().begin()); + // copy the characters from the input column to the output column + parse_uri<<>>( + *d_strings, + reinterpret_cast(src_offsets.data()), + offsets_column->view().begin(), + d_out_chars); auto null_count = cudf::null_count(reinterpret_cast(null_mask.data()), 0, strings_count); @@ -362,7 +844,7 @@ std::unique_ptr parse_uri_to_protocol(strings_column_view const& input, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::parse_uri_to_protocol(input, stream, mr); + return detail::parse_uri(input, detail::URI_chunks::PROTOCOL, stream, mr); } } // namespace spark_rapids_jni \ No newline at end of file diff --git a/src/main/cpp/tests/parse_uri.cpp b/src/main/cpp/tests/parse_uri.cpp index 3ff14a6075..6f522829b6 100644 --- a/src/main/cpp/tests/parse_uri.cpp +++ b/src/main/cpp/tests/parse_uri.cpp @@ -71,7 +71,33 @@ TEST_F(ParseURIProtocolTests, SparkEdges) "/absolute/path", "http://%77%77%77.%4EV%49%44%49%41.com", "https:://broken.url", - "https://www.nvidia.com/q/This%20is%20a%20query"}); + "https://www.nvidia.com/q/This%20is%20a%20query", + "https://www.nvidia.com/\x93path/path/to/file", + "http://?", + "http://??", + "http://\?\?/", + "http://#", + "http://user:pass@host/file;param?query;p2", + "http://[1:2:3:4:5:6:7::]", + "http://[::2:3:4:5:6:7:8]", + "http://[fe80::7:8%eth0]", + "http://[fe80::7:8%1]", + "http://foo.bar/abc/\\\\\\http://foo.bar/abc.gif\\\\\\", + "www.nvidia.com:8100/servlet/" + "impc.DisplayCredits?primekey_in=2000041100:05:14115240636", + "https://nvidia.com/2Ru15Ss ", + "http://www.nvidia.com/plugins//##", + "www.nvidia.com:81/Free.fr/L7D9qw9X4S-aC0&D4X0/Panels&solutionId=0X54a/" + "cCdyncharset=UTF-8&t=01wx58Tab&ps=solution/" + "ccmd=_help&locale0X1&countrycode=MA/", + "http://www.nvidia.com/tags.php?%2F88\323\351\300ึณวน\331\315\370%2F", + "http://www.nvidia.com//wp-admin/includes/index.html#9389#123", + "http://www.nvidia.com/" + "object.php?object=ะก-\320%9Fะฑ-ะฟ-ะก\321%82\321%80ะตะป\321%8Cะฝะฐ-\321%83ะป-\320%" + "97ะฐะฒะพะด\321%81ะบะฐ\321%8F.html&sid=5", + "http://www.nvidia.com/picshow.asp?id=106&mnid=5080&classname=\271\253ืฐฦช", + "http://-.~_!$&'()*+,;=:%40:80%2f::::::@example.com:443", + "http://userid:password@example.com:8080/"}); auto result = spark_rapids_jni::parse_uri_to_protocol(cudf::strings_column_view{col}); @@ -88,8 +114,87 @@ TEST_F(ParseURIProtocolTests, SparkEdges) "", "http", "https", - "https"}, - {1, 1, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 1}); + "https", + "", + "http", + "http", + "http", + "http", + "http", + "http", + "http", + "http", + "http", + "", + "www.nvidia.com", + "", + "", + "www.nvidia.com", + "", + "", + "", + "", + "http", + "http"}, + {1, 1, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1}); CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(result->view(), expected); -} \ No newline at end of file +} + +TEST_F(ParseURIProtocolTests, IP6) +{ + cudf::test::strings_column_wrapper col({ + "https://[fe80::]", + "https://[2001:0db8:85a3:0000:0000:8a2e:0370:7334]", + "https://[2001:0DB8:85A3:0000:0000:8A2E:0370:7334]", + "https://[2001:db8::1:0]", + "http://[2001:db8::2:1]", + "https://[::1]", + 
"https://[2001:db8:85a3:8d3:1319:8a2e:370:7348]:443", + "https://[2001:db8:3333:4444:5555:6666:1.2.3.4]/path/to/file", + "https://[2001:db8:3333:4444:5555:6666:7777:8888:1.2.3.4]/path/to/file", + "https://[::db8:3333:4444:5555:6666:1.2.3.4]/path/to/file]", // this is valid, but spark + // doesn't think so + }); + auto result = spark_rapids_jni::parse_uri_to_protocol(cudf::strings_column_view{col}); + + cudf::test::strings_column_wrapper expected( + {"https", "https", "https", "https", "http", "https", "https", "https", "", ""}, + {1, 1, 1, 1, 1, 1, 1, 1, 0, 0}); + + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(result->view(), expected); +} + +TEST_F(ParseURIProtocolTests, IP4) +{ + cudf::test::strings_column_wrapper col({ + "https://192.168.1.100/", + "https://192.168.1.100:8443/", + "https://192.168.1.100.5/", + "https://192.168.1/", + "https://280.100.1.1/", + "https://182.168..100/path/to/file", + }); + auto result = spark_rapids_jni::parse_uri_to_protocol(cudf::strings_column_view{col}); + + cudf::test::strings_column_wrapper expected( + {"https", "https", "https", "https", "https", "https"}); + + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(result->view(), expected); +} + +TEST_F(ParseURIProtocolTests, UTF8) +{ + cudf::test::strings_column_wrapper col({ + "https://nvidia.com/%4EV%49%44%49%41", + "http://%77%77%77.%4EV%49%44%49%41.com", + "http://✪↩d⁚f„⁈.ws/123", + "https:// /path/to/file", + }); + auto result = spark_rapids_jni::parse_uri_to_protocol(cudf::strings_column_view{col}); + + cudf::test::strings_column_wrapper expected({"https", "http", "http", ""}, {1, 1, 1, 0}); + + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(result->view(), expected); +} diff --git a/src/test/java/com/nvidia/spark/rapids/jni/ParseURITest.java b/src/test/java/com/nvidia/spark/rapids/jni/ParseURITest.java index 7289d110b2..5e90111f21 100644 --- a/src/test/java/com/nvidia/spark/rapids/jni/ParseURITest.java +++ b/src/test/java/com/nvidia/spark/rapids/jni/ParseURITest.java @@ -25,10 +25,33 @@ import ai.rapids.cudf.ColumnVector; public class ParseURITest { + void buildExpectedAndRun(String[] testData) { + String[] expectedStrings = new String[testData.length]; + for (int i=0; i Date: Tue, 5 Dec 2023 05:20:59 +0800 Subject: [PATCH 029/127] Update submodule cudf to 39431db46e718c98f29bfaaf429c7cf40dc95a57 (#1610) Signed-off-by: spark-rapids automation <70000568+nvauto@users.noreply.github.com> --- thirdparty/cudf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/thirdparty/cudf b/thirdparty/cudf index c8074b5176..39431db46e 160000 --- a/thirdparty/cudf +++ b/thirdparty/cudf @@ -1 +1 @@ -Subproject commit c8074b5176a74630101c78c43c24b66141352b24 +Subproject commit 39431db46e718c98f29bfaaf429c7cf40dc95a57 From 5579dade1034d496e561c1739b010fd0d24a2705 Mon Sep 17 00:00:00 2001 From: spark-rapids automation <70000568+nvauto@users.noreply.github.com> Date: Mon, 4 Dec 2023 23:05:24 +0000 Subject: [PATCH 030/127] Auto-merge use submodule in BASE ref Signed-off-by: spark-rapids automation <70000568+nvauto@users.noreply.github.com> --- thirdparty/cudf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/thirdparty/cudf b/thirdparty/cudf index 0a56305696..39431db46e 160000 --- a/thirdparty/cudf +++ b/thirdparty/cudf @@ -1 +1 @@ -Subproject commit 0a56305696a37870495867cb76941699c3b53fe6 +Subproject commit 39431db46e718c98f29bfaaf429c7cf40dc95a57 From d7ef4219d40bf6a90b93cf2fe72400b5887115ea Mon Sep 17 00:00:00 2001 From: Jenkins Automation <70000568+nvauto@users.noreply.github.com> Date: Tue, 5 Dec 2023 11:23:40 +0800 
Subject: [PATCH 031/127] Update submodule cudf to 1c46d7d2b6eb9aea543be596495cda4972ec7887 (#1613)

Signed-off-by: spark-rapids automation <70000568+nvauto@users.noreply.github.com>
---
 thirdparty/cudf | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/thirdparty/cudf b/thirdparty/cudf
index 39431db46e..1c46d7d2b6 160000
--- a/thirdparty/cudf
+++ b/thirdparty/cudf
@@ -1 +1 @@
-Subproject commit 39431db46e718c98f29bfaaf429c7cf40dc95a57
+Subproject commit 1c46d7d2b6eb9aea543be596495cda4972ec7887

From 9b680d75836c3c831ebacca56fa4c43e9b0c980e Mon Sep 17 00:00:00 2001
From: Jenkins Automation <70000568+nvauto@users.noreply.github.com>
Date: Tue, 5 Dec 2023 21:23:11 +0800
Subject: [PATCH 032/127] Update submodule cudf to 8f7cbe69d4c2f670b97decc63e73b08e0eef7329 (#1615)

Signed-off-by: spark-rapids automation <70000568+nvauto@users.noreply.github.com>
---
 thirdparty/cudf | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/thirdparty/cudf b/thirdparty/cudf
index 1c46d7d2b6..8f7cbe69d4 160000
--- a/thirdparty/cudf
+++ b/thirdparty/cudf
@@ -1 +1 @@
-Subproject commit 1c46d7d2b6eb9aea543be596495cda4972ec7887
+Subproject commit 8f7cbe69d4c2f670b97decc63e73b08e0eef7329

From 554645df617d1fa36b32a0171a762a920390385a Mon Sep 17 00:00:00 2001
From: Mike Wilson
Date: Tue, 5 Dec 2023 12:46:37 -0500
Subject: [PATCH 033/127] Adding support for parse URI for hostnames (#1569)

* Adding host support for parse_uri

Signed-off-by: Mike Wilson
---
 src/main/cpp/src/ParseURIJni.cpp                |  14 +
 src/main/cpp/src/parse_uri.cu                   |  23 +-
 src/main/cpp/src/parse_uri.hpp                  |  13 +
 src/main/cpp/tests/parse_uri.cpp                | 402 ++++++++++++------
 .../com/nvidia/spark/rapids/jni/ParseURI.java   |  11 +
 .../nvidia/spark/rapids/jni/ParseURITest.java   |  34 +-
 6 files changed, 345 insertions(+), 152 deletions(-)

diff --git a/src/main/cpp/src/ParseURIJni.cpp b/src/main/cpp/src/ParseURIJni.cpp
index 0d2d245108..9079d99b9d 100644
--- a/src/main/cpp/src/ParseURIJni.cpp
+++ b/src/main/cpp/src/ParseURIJni.cpp
@@ -33,4 +33,18 @@ JNIEXPORT jlong JNICALL Java_com_nvidia_spark_rapids_jni_ParseURI_parseProtocol(
   }
   CATCH_STD(env, 0);
 }
+
+JNIEXPORT jlong JNICALL Java_com_nvidia_spark_rapids_jni_ParseURI_parseHost(JNIEnv* env,
+                                                                            jclass,
+                                                                            jlong input_column)
+{
+  JNI_NULL_CHECK(env, input_column, "input column is null", 0);
+
+  try {
+    cudf::jni::auto_set_device(env);
+    auto const input = reinterpret_cast<cudf::column_view const*>(input_column);
+    return cudf::jni::ptr_as_jlong(spark_rapids_jni::parse_uri_to_host(*input).release());
+  }
+  CATCH_STD(env, 0);
+}
 }
diff --git a/src/main/cpp/src/parse_uri.cu b/src/main/cpp/src/parse_uri.cu
index d0629cb71f..13a8effb37 100644
--- a/src/main/cpp/src/parse_uri.cu
+++ b/src/main/cpp/src/parse_uri.cu
@@ -292,9 +292,10 @@ bool __device__ validate_domain_name(string_view name)
 {
   // domain name can be alphanum or '-'.
   // a dash can not be the first or last character of the domain name or around a '.'
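  // e.g. "a-b.nvidia.com" is accepted, while "a-.b.com" (dash before a period),
  // "a..b.com" (empty label), and ".a.com" (leading period) are rejected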
-  bool last_was_slash       = false;
-  bool last_was_period      = false;
-  bool numeric_start        = false;
+  bool last_was_slash           = false;
+  bool last_was_period          = false;
+  bool numeric_start            = false;
+  int characters_before_period  = 0;
   for (auto iter = name.begin(); iter < name.end(); ++iter) {
     auto const c = *iter;
     if (!is_alphanum(c) && c != '-' && c != '.') { return false; }
@@ -311,12 +312,14 @@ bool __device__ validate_domain_name(string_view name)
       last_was_slash  = true;
       last_was_period = false;
     } else if (c == '.') {
-      if (last_was_slash) { return false; }
-      last_was_period = true;
-      last_was_slash  = false;
+      if (last_was_slash || last_was_period || characters_before_period == 0) { return false; }
+      last_was_period           = true;
+      last_was_slash            = false;
+      characters_before_period  = 0;
     } else {
       last_was_period = false;
       last_was_slash  = false;
+      characters_before_period++;
     }
   }
 
@@ -847,4 +850,12 @@ std::unique_ptr<column> parse_uri_to_protocol(strings_column_view const& input,
   return detail::parse_uri(input, detail::URI_chunks::PROTOCOL, stream, mr);
 }
 
+std::unique_ptr<column> parse_uri_to_host(strings_column_view const& input,
+                                          rmm::cuda_stream_view stream,
+                                          rmm::mr::device_memory_resource* mr)
+{
+  CUDF_FUNC_RANGE();
+  return detail::parse_uri(input, detail::URI_chunks::HOST, stream, mr);
+}
+
 }  // namespace spark_rapids_jni
\ No newline at end of file
diff --git a/src/main/cpp/src/parse_uri.hpp b/src/main/cpp/src/parse_uri.hpp
index c65d06d80a..0a76cec1b4 100644
--- a/src/main/cpp/src/parse_uri.hpp
+++ b/src/main/cpp/src/parse_uri.hpp
@@ -39,4 +39,17 @@ std::unique_ptr<cudf::column> parse_uri_to_protocol(
   rmm::cuda_stream_view stream        = cudf::get_default_stream(),
   rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
 
+/**
+ * @brief Parse host and copy from the input string column to the output char buffer.
+ *
+ * @param input Input string column of URIs to parse
+ * @param stream Stream on which to operate.
+ * @param mr Memory resource for returned column
+ * @return std::unique_ptr<cudf::column> String column of hosts parsed.
+ */ +std::unique_ptr parse_uri_to_host( + cudf::strings_column_view const& input, + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + } // namespace spark_rapids_jni diff --git a/src/main/cpp/tests/parse_uri.cpp b/src/main/cpp/tests/parse_uri.cpp index 6f522829b6..1112fea232 100644 --- a/src/main/cpp/tests/parse_uri.cpp +++ b/src/main/cpp/tests/parse_uri.cpp @@ -22,144 +22,178 @@ #include struct ParseURIProtocolTests : public cudf::test::BaseFixture {}; +struct ParseURIHostTests : public cudf::test::BaseFixture {}; -TEST_F(ParseURIProtocolTests, Simple) +enum class test_types { + SIMPLE, + SPARK_EDGES, + IPv6, + IPv4, + UTF8, +}; + +namespace { +cudf::test::strings_column_wrapper get_test_data(test_types t) { - cudf::test::strings_column_wrapper col({ - "https://www.nvidia.com/s/uri?param1=2", - "http://www.nvidia.com", - "file://path/to/a/cool/file", - "smb://network/path/to/file", - "http:/www.nvidia.com", - "file:path/to/a/cool/file", - }); - auto result = spark_rapids_jni::parse_uri_to_protocol(cudf::strings_column_view{col}); - - cudf::test::strings_column_wrapper expected({"https", "http", "file", "smb", "http", "file"}); + switch (t) { + case test_types::SIMPLE: + return cudf::test::strings_column_wrapper({ + "https://www.nvidia.com/s/uri?param1=2", + "http://www.nvidia.com", + "file://path/to/a/cool/file", + "smb://network/path/to/file", + "http:/www.nvidia.com", + "file:path/to/a/cool/file", + "/network/path/to/file", + "nvidia.com", + "www.nvidia.com/s/uri", + }); - CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(result->view(), expected); + case test_types::SPARK_EDGES: + return cudf::test::strings_column_wrapper( + {"https://nvidia.com/https&#://nvidia.com", + "https://http://www.nvidia.com", + "filesystemmagicthing://bob.yaml", + "nvidia.com:8080", + "http://thisisinvalid.data/due/to-the_character%s/inside*the#url`~", + "file:/absolute/path", + "//www.nvidia.com", + "#bob", + "#this%doesnt#make//sense://to/me", + "HTTP:&bob", + "/absolute/path", + "http://%77%77%77.%4EV%49%44%49%41.com", + "https:://broken.url", + "https://www.nvidia.com/q/This%20is%20a%20query", + "https://www.nvidia.com/\x93path/path/to/file", + "http://?", + "http://??", + "http://\?\?/", + "http://#", + "http://user:pass@host/file;param?query;p2", + "http://[1:2:3:4:5:6:7::]", + "http://[::2:3:4:5:6:7:8]", + "http://[fe80::7:8%eth0]", + "http://[fe80::7:8%1]", + "http://foo.bar/abc/\\\\\\http://foo.bar/abc.gif\\\\\\", + "www.nvidia.com:8100/servlet/" + "impc.DisplayCredits?primekey_in=2000041100:05:14115240636", + "https://nvidia.com/2Ru15Ss ", + "http://www.nvidia.com/plugins//##", + "www.nvidia.com:81/Free.fr/L7D9qw9X4S-aC0&D4X0/Panels&solutionId=0X54a/" + "cCdyncharset=UTF-8&t=01wx58Tab&ps=solution/" + "ccmd=_help&locale0X1&countrycode=MA/", + "http://www.nvidia.com/tags.php?%2F88\323\351\300ึณวน\331\315\370%2F", + "http://www.nvidia.com//wp-admin/includes/index.html#9389#123", + "http://www.nvidia.com/" + "object.php?object=ะก-\320%9Fะฑ-ะฟ-ะก\321%82\321%80ะตะป\321%8Cะฝะฐ-\321%83ะป-\320%" + "97ะฐะฒะพะด\321%81ะบะฐ\321%8F.html&sid=5", + "http://www.nvidia.com/picshow.asp?id=106&mnid=5080&classname=\271\253ืฐฦช", + "http://-.~_!$&'()*+,;=:%40:80%2f::::::@nvidia.com:443", + "http://userid:password@example.com:8080/", + "http://.www.nvidia.com./", + "http://www.nvidia..com/"}); + case test_types::IPv6: + return cudf::test::strings_column_wrapper({ + "https://[fe80::]", + "https://[2001:0db8:85a3:0000:0000:8a2e:0370:7334]", + 
"https://[2001:0DB8:85A3:0000:0000:8A2E:0370:7334]", + "https://[2001:db8::1:0]", + "http://[2001:db8::2:1]", + "https://[::1]", + "https://[2001:db8:85a3:8d3:1319:8a2e:370:7348]:443", + "https://[2001:db8:3333:4444:5555:6666:1.2.3.4]/path/to/file", + "https://[2001:db8:3333:4444:5555:6666:7777:8888:1.2.3.4]/path/to/file", + "https://[::db8:3333:4444:5555:6666:1.2.3.4]/path/to/file]", // this is valid, but spark + // doesn't think so + }); + case test_types::IPv4: + return cudf::test::strings_column_wrapper({ + "https://192.168.1.100/", + "https://192.168.1.100:8443/", + "https://192.168.1.100.5/", + "https://192.168.1/", + "https://280.100.1.1/", + "https://182.168..100/path/to/file", + }); + case test_types::UTF8: + return cudf::test::strings_column_wrapper({ + "https://nvidia.com/%4EV%49%44%49%41", + "http://%77%77%77.%4EV%49%44%49%41.com", + "http://✪↩d⁚f„⁈.ws/123", + "https:// /path/to/file", + }); + default: CUDF_FAIL("Test type unsupported!"); return cudf::test::strings_column_wrapper(); + } } +} // namespace -TEST_F(ParseURIProtocolTests, Negatives) +TEST_F(ParseURIProtocolTests, Simple) { - cudf::test::strings_column_wrapper col({ - "https//www.nvidia.com/s/uri?param1=2", - "/network/path/to/file", - "nvidia.com", - "www.nvidia.com/s/uri", - }); - auto result = spark_rapids_jni::parse_uri_to_protocol(cudf::strings_column_view{col}); + auto const col = get_test_data(test_types::SIMPLE); + auto const result = spark_rapids_jni::parse_uri_to_protocol(cudf::strings_column_view{col}); - cudf::test::strings_column_wrapper expected({"", "", "", ""}, {0, 0, 0, 0}); + cudf::test::strings_column_wrapper const expected( + {"https", "http", "file", "smb", "http", "file", "", "", ""}, {1, 1, 1, 1, 1, 1, 0, 0, 0}); CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(result->view(), expected); } TEST_F(ParseURIProtocolTests, SparkEdges) { - cudf::test::strings_column_wrapper col( - {"https://nvidia.com/https&#://nvidia.com", - "https://http://www.nvidia.com", - "filesystemmagicthing://bob.yaml", - "nvidia.com:8080", - "http://thisisinvalid.data/due/to-the_character%s/inside*the#url`~", - "file:/absolute/path", - "//www.nvidia.com", - "#bob", - "#this%doesnt#make//sense://to/me", - "HTTP:&bob", - "/absolute/path", - "http://%77%77%77.%4EV%49%44%49%41.com", - "https:://broken.url", - "https://www.nvidia.com/q/This%20is%20a%20query", - "https://www.nvidia.com/\x93path/path/to/file", - "http://?", - "http://??", - "http://\?\?/", - "http://#", - "http://user:pass@host/file;param?query;p2", - "http://[1:2:3:4:5:6:7::]", - "http://[::2:3:4:5:6:7:8]", - "http://[fe80::7:8%eth0]", - "http://[fe80::7:8%1]", - "http://foo.bar/abc/\\\\\\http://foo.bar/abc.gif\\\\\\", - "www.nvidia.com:8100/servlet/" - "impc.DisplayCredits?primekey_in=2000041100:05:14115240636", - "https://nvidia.com/2Ru15Ss ", - "http://www.nvidia.com/plugins//##", - "www.nvidia.com:81/Free.fr/L7D9qw9X4S-aC0&D4X0/Panels&solutionId=0X54a/" - "cCdyncharset=UTF-8&t=01wx58Tab&ps=solution/" - "ccmd=_help&locale0X1&countrycode=MA/", - "http://www.nvidia.com/tags.php?%2F88\323\351\300ึณวน\331\315\370%2F", - "http://www.nvidia.com//wp-admin/includes/index.html#9389#123", - "http://www.nvidia.com/" - "object.php?object=ะก-\320%9Fะฑ-ะฟ-ะก\321%82\321%80ะตะป\321%8Cะฝะฐ-\321%83ะป-\320%" - "97ะฐะฒะพะด\321%81ะบะฐ\321%8F.html&sid=5", - "http://www.nvidia.com/picshow.asp?id=106&mnid=5080&classname=\271\253ืฐฦช", - "http://-.~_!$&'()*+,;=:%40:80%2f::::::@example.com:443", - "http://userid:password@example.com:8080/"}); - - auto result = 
spark_rapids_jni::parse_uri_to_protocol(cudf::strings_column_view{col}); - - cudf::test::strings_column_wrapper expected({"https", - "https", - "filesystemmagicthing", - "nvidia.com", - "", - "file", - "", - "", - "", - "HTTP", - "", - "http", - "https", - "https", - "", - "http", - "http", - "http", - "http", - "http", - "http", - "http", - "http", - "http", - "", - "www.nvidia.com", - "", - "", - "www.nvidia.com", - "", - "", - "", - "", - "http", - "http"}, - {1, 1, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1}); + auto const col = get_test_data(test_types::SPARK_EDGES); + auto const result = spark_rapids_jni::parse_uri_to_protocol(cudf::strings_column_view{col}); + + cudf::test::strings_column_wrapper const expected( + {"https", + "https", + "filesystemmagicthing", + "nvidia.com", + "", + "file", + "", + "", + "", + "HTTP", + "", + "http", + "https", + "https", + "", + "http", + "http", + "http", + "http", + "http", + "http", + "http", + "http", + "http", + "", + "www.nvidia.com", + "", + "", + "www.nvidia.com", + "", + "", + "", + "", + "http", + "http", + "http", + "http"}, + {1, 1, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1}); CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(result->view(), expected); } TEST_F(ParseURIProtocolTests, IP6) { - cudf::test::strings_column_wrapper col({ - "https://[fe80::]", - "https://[2001:0db8:85a3:0000:0000:8a2e:0370:7334]", - "https://[2001:0DB8:85A3:0000:0000:8A2E:0370:7334]", - "https://[2001:db8::1:0]", - "http://[2001:db8::2:1]", - "https://[::1]", - "https://[2001:db8:85a3:8d3:1319:8a2e:370:7348]:443", - "https://[2001:db8:3333:4444:5555:6666:1.2.3.4]/path/to/file", - "https://[2001:db8:3333:4444:5555:6666:7777:8888:1.2.3.4]/path/to/file", - "https://[::db8:3333:4444:5555:6666:1.2.3.4]/path/to/file]", // this is valid, but spark - // doesn't think so - }); - auto result = spark_rapids_jni::parse_uri_to_protocol(cudf::strings_column_view{col}); - - cudf::test::strings_column_wrapper expected( + auto const col = get_test_data(test_types::IPv6); + auto const result = spark_rapids_jni::parse_uri_to_protocol(cudf::strings_column_view{col}); + + cudf::test::strings_column_wrapper const expected( {"https", "https", "https", "https", "http", "https", "https", "https", "", ""}, {1, 1, 1, 1, 1, 1, 1, 1, 0, 0}); @@ -168,17 +202,10 @@ TEST_F(ParseURIProtocolTests, IP6) TEST_F(ParseURIProtocolTests, IP4) { - cudf::test::strings_column_wrapper col({ - "https://192.168.1.100/", - "https://192.168.1.100:8443/", - "https://192.168.1.100.5/", - "https://192.168.1/", - "https://280.100.1.1/", - "https://182.168..100/path/to/file", - }); - auto result = spark_rapids_jni::parse_uri_to_protocol(cudf::strings_column_view{col}); - - cudf::test::strings_column_wrapper expected( + auto const col = get_test_data(test_types::IPv4); + auto const result = spark_rapids_jni::parse_uri_to_protocol(cudf::strings_column_view{col}); + + cudf::test::strings_column_wrapper const expected( {"https", "https", "https", "https", "https", "https"}); CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(result->view(), expected); @@ -186,15 +213,112 @@ TEST_F(ParseURIProtocolTests, IP4) TEST_F(ParseURIProtocolTests, UTF8) { - cudf::test::strings_column_wrapper col({ - "https://nvidia.com/%4EV%49%44%49%41", - "http://%77%77%77.%4EV%49%44%49%41.com", - "http://✪↩d⁚f„⁈.ws/123", - "https:// /path/to/file", - }); - auto result = spark_rapids_jni::parse_uri_to_protocol(cudf::strings_column_view{col}); - - 
cudf::test::strings_column_wrapper expected({"https", "http", "http", ""}, {1, 1, 1, 0}); + auto const col = get_test_data(test_types::UTF8); + auto const result = spark_rapids_jni::parse_uri_to_protocol(cudf::strings_column_view{col}); + + cudf::test::strings_column_wrapper const expected({"https", "http", "http", ""}, {1, 1, 1, 0}); + + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(result->view(), expected); +} + +TEST_F(ParseURIHostTests, Simple) +{ + auto const col = get_test_data(test_types::SIMPLE); + auto const result = spark_rapids_jni::parse_uri_to_host(cudf::strings_column_view{col}); + + cudf::test::strings_column_wrapper const expected( + {"www.nvidia.com", "www.nvidia.com", "path", "network", "", "", "", "", ""}, + {1, 1, 1, 1, 0, 0, 0, 0, 0}); + + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(result->view(), expected); +} + +TEST_F(ParseURIHostTests, SparkEdges) +{ + auto const col = get_test_data(test_types::SPARK_EDGES); + auto const result = spark_rapids_jni::parse_uri_to_host(cudf::strings_column_view{col}); + + cudf::test::strings_column_wrapper const expected( + {"nvidia.com", + "http", + "bob.yaml", + "", + "", + "", + "www.nvidia.com", + "", + "", + "", + "", + "", + "", + "www.nvidia.com", + "", + "", + "", + "", + "", + "host", + "[1:2:3:4:5:6:7::]", + "[::2:3:4:5:6:7:8]", + "[fe80::7:8%eth0]", + "[fe80::7:8%1]", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "nvidia.com", + "example.com", + "", + ""}, + {1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, + 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0}); + + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(result->view(), expected); +} + +TEST_F(ParseURIHostTests, IP6) +{ + auto const col = get_test_data(test_types::IPv6); + auto const result = spark_rapids_jni::parse_uri_to_host(cudf::strings_column_view{col}); + + cudf::test::strings_column_wrapper const expected({"[fe80::]", + "[2001:0db8:85a3:0000:0000:8a2e:0370:7334]", + "[2001:0DB8:85A3:0000:0000:8A2E:0370:7334]", + "[2001:db8::1:0]", + "[2001:db8::2:1]", + "[::1]", + "[2001:db8:85a3:8d3:1319:8a2e:370:7348]", + "[2001:db8:3333:4444:5555:6666:1.2.3.4]", + "", + ""}, + {1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0}); + + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(result->view(), expected); +} + +TEST_F(ParseURIHostTests, IP4) +{ + auto const col = get_test_data(test_types::IPv4); + auto const result = spark_rapids_jni::parse_uri_to_host(cudf::strings_column_view{col}); + + cudf::test::strings_column_wrapper const expected( + {"192.168.1.100", "192.168.1.100", "", "", "", ""}, {1, 1, 0, 0, 0, 0}); + + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(result->view(), expected); +} + +TEST_F(ParseURIHostTests, UTF8) +{ + auto const col = get_test_data(test_types::UTF8); + auto const result = spark_rapids_jni::parse_uri_to_host(cudf::strings_column_view{col}); + + cudf::test::strings_column_wrapper const expected({"nvidia.com", "", "", ""}, {1, 0, 0, 0}); CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(result->view(), expected); } diff --git a/src/main/java/com/nvidia/spark/rapids/jni/ParseURI.java b/src/main/java/com/nvidia/spark/rapids/jni/ParseURI.java index 0c0b046f15..0e14f388d4 100644 --- a/src/main/java/com/nvidia/spark/rapids/jni/ParseURI.java +++ b/src/main/java/com/nvidia/spark/rapids/jni/ParseURI.java @@ -38,7 +38,18 @@ public static ColumnVector parseURIProtocol(ColumnView uriColumn) { return new ColumnVector(parseProtocol(uriColumn.getNativeView())); } + /** + * Parse host for each URI from the incoming column. + * + * @param URIColumn The input strings column in which each row contains a URI. 
+ * @return A string column with host data extracted. + */ + public static ColumnVector parseURIHost(ColumnView uriColumn) { + assert uriColumn.getType().equals(DType.STRING) : "Input type must be String"; + return new ColumnVector(parseHost(uriColumn.getNativeView())); + } private static native long parseProtocol(long jsonColumnHandle); + private static native long parseHost(long jsonColumnHandle); } diff --git a/src/test/java/com/nvidia/spark/rapids/jni/ParseURITest.java b/src/test/java/com/nvidia/spark/rapids/jni/ParseURITest.java index 5e90111f21..c6e3b06ed1 100644 --- a/src/test/java/com/nvidia/spark/rapids/jni/ParseURITest.java +++ b/src/test/java/com/nvidia/spark/rapids/jni/ParseURITest.java @@ -26,7 +26,8 @@ public class ParseURITest { void buildExpectedAndRun(String[] testData) { - String[] expectedStrings = new String[testData.length]; + String[] expectedProtocolStrings = new String[testData.length]; + String[] expectedHostStrings = new String[testData.length]; for (int i=0; i Date: Wed, 6 Dec 2023 05:24:49 +0800 Subject: [PATCH 034/127] Update submodule cudf to a2d2ef4829a1616764a929d4c4d0d60a8debd4e5 (#1616) Signed-off-by: spark-rapids automation <70000568+nvauto@users.noreply.github.com> --- thirdparty/cudf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/thirdparty/cudf b/thirdparty/cudf index 8f7cbe69d4..a2d2ef4829 160000 --- a/thirdparty/cudf +++ b/thirdparty/cudf @@ -1 +1 @@ -Subproject commit 8f7cbe69d4c2f670b97decc63e73b08e0eef7329 +Subproject commit a2d2ef4829a1616764a929d4c4d0d60a8debd4e5 From bbf5732aecd4224a93b8e9dd948bdcf8e85bd3bd Mon Sep 17 00:00:00 2001 From: spark-rapids automation <70000568+nvauto@users.noreply.github.com> Date: Tue, 5 Dec 2023 23:49:05 +0000 Subject: [PATCH 035/127] Auto-merge use submodule in BASE ref Signed-off-by: spark-rapids automation <70000568+nvauto@users.noreply.github.com> --- thirdparty/cudf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/thirdparty/cudf b/thirdparty/cudf index 31aedf2ddc..a2d2ef4829 160000 --- a/thirdparty/cudf +++ b/thirdparty/cudf @@ -1 +1 @@ -Subproject commit 31aedf2ddcd99cb4b572f8685f7790b743500149 +Subproject commit a2d2ef4829a1616764a929d4c4d0d60a8debd4e5 From da8be0b20f6ebff279c50d9f4dab0f8d0a5f8d4b Mon Sep 17 00:00:00 2001 From: Jenkins Automation <70000568+nvauto@users.noreply.github.com> Date: Wed, 6 Dec 2023 11:23:26 +0800 Subject: [PATCH 036/127] Update submodule cudf to d97b3e091778987562508612d216a36207f5cd7c (#1619) Signed-off-by: spark-rapids automation <70000568+nvauto@users.noreply.github.com> --- thirdparty/cudf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/thirdparty/cudf b/thirdparty/cudf index a2d2ef4829..d97b3e0917 160000 --- a/thirdparty/cudf +++ b/thirdparty/cudf @@ -1 +1 @@ -Subproject commit a2d2ef4829a1616764a929d4c4d0d60a8debd4e5 +Subproject commit d97b3e091778987562508612d216a36207f5cd7c From 5ad7fe44fcd60a55bc5cb2c2e3fa1dde4afd3b11 Mon Sep 17 00:00:00 2001 From: spark-rapids automation <70000568+nvauto@users.noreply.github.com> Date: Wed, 6 Dec 2023 18:00:43 +0000 Subject: [PATCH 037/127] Auto-merge use submodule in BASE ref Signed-off-by: spark-rapids automation <70000568+nvauto@users.noreply.github.com> --- thirdparty/cudf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/thirdparty/cudf b/thirdparty/cudf index 8eacf8f2ec..d97b3e0917 160000 --- a/thirdparty/cudf +++ b/thirdparty/cudf @@ -1 +1 @@ -Subproject commit 8eacf8f2ecb70eedf917fec2dfca4403810399d1 +Subproject commit d97b3e091778987562508612d216a36207f5cd7c 
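The Java entry points added by PATCH 033 above are exercised roughly as follows. This is a
minimal sketch rather than code from the patch: the input strings and expected outputs are
lifted from the new unit tests, and it assumes ai.rapids.cudf and the spark-rapids-jni
classes are on the classpath.

    try (ColumnVector uris = ColumnVector.fromStrings(
             "https://www.nvidia.com/s/uri?param1=2",
             "http://[fe80::7:8%eth0]");
         ColumnVector protocols = ParseURI.parseURIProtocol(uris);
         ColumnVector hosts = ParseURI.parseURIHost(uris)) {
      // protocols holds ["https", "http"]; hosts holds ["www.nvidia.com", "[fe80::7:8%eth0]"]
    }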
From 9e64d262f0c47ed5852dbbe55f8bb9036e6059fc Mon Sep 17 00:00:00 2001 From: Jenkins Automation <70000568+nvauto@users.noreply.github.com> Date: Thu, 7 Dec 2023 06:02:17 +0800 Subject: [PATCH 038/127] Update submodule cudf to fe612b3eaa30cd0cc6f0f49f99dce8785e0258f6 (#1622) Signed-off-by: spark-rapids automation <70000568+nvauto@users.noreply.github.com> --- thirdparty/cudf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/thirdparty/cudf b/thirdparty/cudf index d97b3e0917..fe612b3eaa 160000 --- a/thirdparty/cudf +++ b/thirdparty/cudf @@ -1 +1 @@ -Subproject commit d97b3e091778987562508612d216a36207f5cd7c +Subproject commit fe612b3eaa30cd0cc6f0f49f99dce8785e0258f6 From cd55b989eac33435da1d9ceeef189fb5c0a7ed3c Mon Sep 17 00:00:00 2001 From: spark-rapids automation <70000568+nvauto@users.noreply.github.com> Date: Thu, 7 Dec 2023 01:48:56 +0000 Subject: [PATCH 039/127] Auto-merge use submodule in BASE ref Signed-off-by: spark-rapids automation <70000568+nvauto@users.noreply.github.com> --- thirdparty/cudf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/thirdparty/cudf b/thirdparty/cudf index c1d307396f..fe612b3eaa 160000 --- a/thirdparty/cudf +++ b/thirdparty/cudf @@ -1 +1 @@ -Subproject commit c1d307396fd357a9b6b68d626c2b17813d1181b8 +Subproject commit fe612b3eaa30cd0cc6f0f49f99dce8785e0258f6 From b3c23091740d3a493157bc6bf6064a967d71f345 Mon Sep 17 00:00:00 2001 From: Jenkins Automation <70000568+nvauto@users.noreply.github.com> Date: Thu, 7 Dec 2023 11:27:41 +0800 Subject: [PATCH 040/127] Update submodule cudf to d8f49750c76694c5093c22b415308ffc1ae1172f (#1626) Signed-off-by: spark-rapids automation <70000568+nvauto@users.noreply.github.com> --- thirdparty/cudf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/thirdparty/cudf b/thirdparty/cudf index fe612b3eaa..d8f49750c7 160000 --- a/thirdparty/cudf +++ b/thirdparty/cudf @@ -1 +1 @@ -Subproject commit fe612b3eaa30cd0cc6f0f49f99dce8785e0258f6 +Subproject commit d8f49750c76694c5093c22b415308ffc1ae1172f From 776990f2517ee817dda624b9a020ebb048582f1a Mon Sep 17 00:00:00 2001 From: Jenkins Automation <70000568+nvauto@users.noreply.github.com> Date: Thu, 7 Dec 2023 17:29:45 +0800 Subject: [PATCH 041/127] Update submodule cudf to f5dca59b0066427e3fa6e73570f4cd3b96fe3043 (#1627) Signed-off-by: spark-rapids automation <70000568+nvauto@users.noreply.github.com> --- thirdparty/cudf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/thirdparty/cudf b/thirdparty/cudf index d8f49750c7..f5dca59b00 160000 --- a/thirdparty/cudf +++ b/thirdparty/cudf @@ -1 +1 @@ -Subproject commit d8f49750c76694c5093c22b415308ffc1ae1172f +Subproject commit f5dca59b0066427e3fa6e73570f4cd3b96fe3043 From aeb0f5cb0075c0ad90395e242673a76610eb3bc7 Mon Sep 17 00:00:00 2001 From: Jenkins Automation <70000568+nvauto@users.noreply.github.com> Date: Thu, 7 Dec 2023 23:27:28 +0800 Subject: [PATCH 042/127] Update submodule cudf to a253826fbce0a81ee2b35f48174f002f66c228a6 (#1628) Signed-off-by: spark-rapids automation <70000568+nvauto@users.noreply.github.com> --- thirdparty/cudf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/thirdparty/cudf b/thirdparty/cudf index f5dca59b00..a253826fbc 160000 --- a/thirdparty/cudf +++ b/thirdparty/cudf @@ -1 +1 @@ -Subproject commit f5dca59b0066427e3fa6e73570f4cd3b96fe3043 +Subproject commit a253826fbce0a81ee2b35f48174f002f66c228a6 From 46472b8755a455e2aca92c41d9d599d6b883389f Mon Sep 17 00:00:00 2001 From: Jenkins Automation <70000568+nvauto@users.noreply.github.com> 
Date: Fri, 8 Dec 2023 05:25:08 +0800 Subject: [PATCH 043/127] Update submodule cudf to 6fc230ab8fd545ac1018664086af582fff2abd68 (#1629) Signed-off-by: spark-rapids automation <70000568+nvauto@users.noreply.github.com> --- thirdparty/cudf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/thirdparty/cudf b/thirdparty/cudf index a253826fbc..6fc230ab8f 160000 --- a/thirdparty/cudf +++ b/thirdparty/cudf @@ -1 +1 @@ -Subproject commit a253826fbce0a81ee2b35f48174f002f66c228a6 +Subproject commit 6fc230ab8fd545ac1018664086af582fff2abd68 From 844a336b31c17c9355f7c9c2f9639522c4bdca62 Mon Sep 17 00:00:00 2001 From: Jenkins Automation <70000568+nvauto@users.noreply.github.com> Date: Fri, 8 Dec 2023 11:24:36 +0800 Subject: [PATCH 044/127] Update submodule cudf to 248aa2c8873c12e41f1e6ea2660740a0a4ddaf68 (#1630) Signed-off-by: spark-rapids automation <70000568+nvauto@users.noreply.github.com> --- thirdparty/cudf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/thirdparty/cudf b/thirdparty/cudf index 6fc230ab8f..248aa2c887 160000 --- a/thirdparty/cudf +++ b/thirdparty/cudf @@ -1 +1 @@ -Subproject commit 6fc230ab8fd545ac1018664086af582fff2abd68 +Subproject commit 248aa2c8873c12e41f1e6ea2660740a0a4ddaf68 From 4c20e3ab78ce28e759ee57f6327049e9748fae6c Mon Sep 17 00:00:00 2001 From: Haoyang Li Date: Fri, 8 Dec 2023 13:23:08 +0800 Subject: [PATCH 045/127] Adding float to string kernel (#1508) * wip Signed-off-by: Haoyang Li * wip Signed-off-by: Haoyang Li * Add float to string kernel Signed-off-by: Haoyang Li * Update src/main/cpp/src/cast_float_to_string.cu Co-authored-by: Mike Wilson * Update src/main/cpp/src/cast_float_to_string.cu Co-authored-by: Mike Wilson * address comments and use different precision for float Signed-off-by: Haoyang Li * rewrite the solution with ryu Signed-off-by: Haoyang Li * update license Signed-off-by: Haoyang Li * clean up Signed-off-by: Haoyang Li * Split ftos_converter out Signed-off-by: Haoyang Li * clean up Signed-off-by: Haoyang Li * resolve cudf conflicts Signed-off-by: Haoyang Li * resolve cudf conflicts Signed-off-by: Haoyang Li * resolve cudf conflicts Signed-off-by: Haoyang Li * resolve cudf conflicts Signed-off-by: Haoyang Li * remove cudf changes Signed-off-by: Haoyang Li * remove cudf changes Signed-off-by: Haoyang Li * Add copyright and notice Signed-off-by: Haoyang Li * Fix copyrights and license Signed-off-by: Haoyang Li * cudf conflict resolve Signed-off-by: Haoyang Li * Add nv apache license to ftos_converter Signed-off-by: Haoyang Li * Update src/main/cpp/src/ftos_converter.cu Co-authored-by: Jason Lowe * address some comments Signed-off-by: Haoyang Li * cudf conflict Signed-off-by: Haoyang Li * Update src/main/cpp/src/cast_float_to_string.cu Co-authored-by: Nghia Truong <7416935+ttnghia@users.noreply.github.com> * addressed comments Signed-off-by: Haoyang Li * clang format Signed-off-by: Haoyang Li * Address comments Signed-off-by: Haoyang Li * Address comments Signed-off-by: Haoyang Li * sync Signed-off-by: Haoyang Li * address comments Signed-off-by: Haoyang Li --------- Signed-off-by: Haoyang Li Co-authored-by: Mike Wilson Co-authored-by: Jason Lowe Co-authored-by: Nghia Truong <7416935+ttnghia@users.noreply.github.com> --- NOTICE | 20 + src/main/cpp/CMakeLists.txt | 1 + src/main/cpp/src/CastStringJni.cpp | 20 +- src/main/cpp/src/cast_float_to_string.cu | 127 ++ src/main/cpp/src/cast_string.hpp | 7 +- src/main/cpp/src/ftos_converter.cuh | 1179 +++++++++++++++++ src/main/cpp/tests/CMakeLists.txt | 3 + 
src/main/cpp/tests/cast_float_to_string.cpp | 83 ++ .../nvidia/spark/rapids/jni/CastStrings.java | 13 +- 9 files changed, 1449 insertions(+), 4 deletions(-) create mode 100644 NOTICE create mode 100644 src/main/cpp/src/cast_float_to_string.cu create mode 100644 src/main/cpp/src/ftos_converter.cuh create mode 100644 src/main/cpp/tests/cast_float_to_string.cpp diff --git a/NOTICE b/NOTICE new file mode 100644 index 0000000000..a0975c00c8 --- /dev/null +++ b/NOTICE @@ -0,0 +1,20 @@ +RAPIDS Accelerator JNI For Apache Spark +Copyright (c) 2022-2023, NVIDIA CORPORATION + +-------------------------------------------------------------------------------- + +This project includes code from ryu (https://github.com/ulfjack/ryu). + +Copyright (2018) Ulf Adams and contributors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. \ No newline at end of file diff --git a/src/main/cpp/CMakeLists.txt b/src/main/cpp/CMakeLists.txt index 29b6a795a3..18c0cd12e8 100644 --- a/src/main/cpp/CMakeLists.txt +++ b/src/main/cpp/CMakeLists.txt @@ -164,6 +164,7 @@ add_library( src/ZOrderJni.cpp src/bloom_filter.cu src/cast_decimal_to_string.cu + src/cast_float_to_string.cu src/cast_string.cu src/cast_string_to_float.cu src/datetime_rebase.cu diff --git a/src/main/cpp/src/CastStringJni.cpp b/src/main/cpp/src/CastStringJni.cpp index d09bc33e4c..933fc15e34 100644 --- a/src/main/cpp/src/CastStringJni.cpp +++ b/src/main/cpp/src/CastStringJni.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022, NVIDIA CORPORATION. + * Copyright (c) 2022-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -109,6 +109,22 @@ JNIEXPORT jlong JNICALL Java_com_nvidia_spark_rapids_jni_CastStrings_toFloat( CATCH_CAST_EXCEPTION(env, 0); } +JNIEXPORT jlong JNICALL Java_com_nvidia_spark_rapids_jni_CastStrings_fromFloat(JNIEnv* env, + jclass, + jlong input_column) +{ + JNI_NULL_CHECK(env, input_column, "input column is null", 0); + + try { + cudf::jni::auto_set_device(env); + + auto const& cv = *reinterpret_cast(input_column); + return cudf::jni::release_as_jlong( + spark_rapids_jni::float_to_string(cv, cudf::get_default_stream())); + } + CATCH_CAST_EXCEPTION(env, 0); +} + JNIEXPORT jlong JNICALL Java_com_nvidia_spark_rapids_jni_CastStrings_fromDecimal(JNIEnv* env, jclass, jlong input_column) @@ -118,7 +134,7 @@ JNIEXPORT jlong JNICALL Java_com_nvidia_spark_rapids_jni_CastStrings_fromDecimal try { cudf::jni::auto_set_device(env); - cudf::column_view cv{*reinterpret_cast(input_column)}; + auto const& cv = *reinterpret_cast(input_column); return cudf::jni::release_as_jlong( spark_rapids_jni::decimal_to_non_ansi_string(cv, cudf::get_default_stream())); } diff --git a/src/main/cpp/src/cast_float_to_string.cu b/src/main/cpp/src/cast_float_to_string.cu new file mode 100644 index 0000000000..6fc4d20f79 --- /dev/null +++ b/src/main/cpp/src/cast_float_to_string.cu @@ -0,0 +1,127 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. 
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "cast_string.hpp"
+#include "ftos_converter.cuh"
+
+#include
+#include
+#include
+#include
+#include
+
+#include
+#include
+
+namespace spark_rapids_jni {
+
+namespace detail {
+namespace {
+
+template <typename FloatType>
+struct float_to_string_fn {
+  cudf::column_device_view d_floats;
+  cudf::size_type* d_offsets;
+  char* d_chars;
+
+  __device__ cudf::size_type compute_output_size(cudf::size_type idx) const
+  {
+    auto const value        = d_floats.element<FloatType>(idx);
+    bool constexpr is_float = std::is_same_v<FloatType, float>;
+    return static_cast<cudf::size_type>(
+      ftos_converter::compute_ftos_size(static_cast<double>(value), is_float));
+  }
+
+  __device__ void float_to_string(cudf::size_type idx) const
+  {
+    auto const value        = d_floats.element<FloatType>(idx);
+    bool constexpr is_float = std::is_same_v<FloatType, float>;
+    auto const output       = d_chars + d_offsets[idx];
+    ftos_converter::float_to_string(static_cast<double>(value), is_float, output);
+  }
+
+  __device__ void operator()(cudf::size_type idx) const
+  {
+    if (d_floats.is_null(idx)) {
+      if (d_chars == nullptr) { d_offsets[idx] = 0; }
+      return;
+    }
+    if (d_chars != nullptr) {
+      float_to_string(idx);
+    } else {
+      d_offsets[idx] = compute_output_size(idx);
+    }
+  }
+};
+
+/**
+ * @brief This dispatch method is for converting floats into strings.
+ *
+ * The template function declaration ensures only float types are allowed.
+ */
+struct dispatch_float_to_string_fn {
+  template <typename FloatType, CUDF_ENABLE_IF(std::is_floating_point_v<FloatType>)>
+  std::unique_ptr<cudf::column> operator()(cudf::column_view const& floats,
+                                           rmm::cuda_stream_view stream,
+                                           rmm::mr::device_memory_resource* mr)
+  {
+    auto const strings_count = floats.size();
+    if (strings_count == 0) { return cudf::make_empty_column(cudf::type_id::STRING); }
+
+    auto const input_ptr = cudf::column_device_view::create(floats, stream);
+
+    auto [offsets, chars] = cudf::strings::detail::make_strings_children(
+      float_to_string_fn<FloatType>{*input_ptr}, strings_count, stream, mr);
+
+    return make_strings_column(strings_count,
+                               std::move(offsets),
+                               std::move(chars),
+                               floats.null_count(),
+                               cudf::detail::copy_bitmask(floats, stream, mr));
+  }
+
+  // non-float types throw an exception
+  template <typename FloatType, CUDF_ENABLE_IF(not std::is_floating_point_v<FloatType>)>
+  std::unique_ptr<cudf::column> operator()(cudf::column_view const&,
+                                           rmm::cuda_stream_view,
+                                           rmm::mr::device_memory_resource*)
+  {
+    CUDF_FAIL("Values for float_to_string function must be a float type.");
+  }
+};
+
+}  // namespace
+
+// This will convert all float column types into a strings column.
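+// The formatting is intended to match Java's Float/Double.toString (e.g. 1.0f -> "1.0",
+// 1e7f -> "1.0E7"), which is what Spark produces when casting floating point values to
+// string; see the ryu attribution in NOTICE.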
+std::unique_ptr<cudf::column> float_to_string(cudf::column_view const& floats,
+                                              rmm::cuda_stream_view stream,
+                                              rmm::mr::device_memory_resource* mr)
+{
+  return type_dispatcher(floats.type(), dispatch_float_to_string_fn{}, floats, stream, mr);
+}
+
+}  // namespace detail
+
+// external API
+std::unique_ptr<cudf::column> float_to_string(cudf::column_view const& floats,
+                                              rmm::cuda_stream_view stream,
+                                              rmm::mr::device_memory_resource* mr)
+{
+  CUDF_FUNC_RANGE();
+  return detail::float_to_string(floats, stream, mr);
+}
+
+}  // namespace spark_rapids_jni
\ No newline at end of file
diff --git a/src/main/cpp/src/cast_string.hpp b/src/main/cpp/src/cast_string.hpp
index df74407355..c4f850b47f 100644
--- a/src/main/cpp/src/cast_string.hpp
+++ b/src/main/cpp/src/cast_string.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022, NVIDIA CORPORATION.
+ * Copyright (c) 2022-2023, NVIDIA CORPORATION.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -115,6 +115,11 @@ std::unique_ptr<cudf::column> string_to_float(
   rmm::cuda_stream_view stream,
   rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
 
+std::unique_ptr<cudf::column> float_to_string(
+  cudf::column_view const& input,
+  rmm::cuda_stream_view stream,
+  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+
 std::unique_ptr<cudf::column> decimal_to_non_ansi_string(
   cudf::column_view const& input,
   rmm::cuda_stream_view stream,
diff --git a/src/main/cpp/src/ftos_converter.cuh b/src/main/cpp/src/ftos_converter.cuh
new file mode 100644
index 0000000000..444f790d3c
--- /dev/null
+++ b/src/main/cpp/src/ftos_converter.cuh
@@ -0,0 +1,1179 @@
+/*
+ * Copyright 2018 Ulf Adams
+ * Copyright (c) 2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include
+#include
+#include
+#include
+#include
+
+namespace spark_rapids_jni::ftos_converter {
+
+namespace {
+
+// d2s.c from ryu
+// A floating decimal representing m * 10^e.
+typedef struct floating_decimal_64 {
+  uint64_t mantissa;
+  // Decimal exponent's range is -324 to 308
+  // inclusive, and can fit in a short if needed.
+  int32_t exponent;
+} floating_decimal_64;
+
+// f2s.c from ryu
+// A floating decimal representing m * 10^e.
+typedef struct floating_decimal_32 {
+  uint32_t mantissa;
+  // Decimal exponent's range is -45 to 38
+  // inclusive, and can fit in a short if needed.
+  int32_t exponent;
+} floating_decimal_32;
+
+//===== constants from ryu =====
+
+// These tables are generated by PrintDoubleLookupTable.
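+// (PrintDoubleLookupTable is a table generator in the upstream ryu project credited in
+// NOTICE; the constants below are taken from ryu's generated sources.)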
+constexpr unsigned int DOUBLE_POW5_INV_BITCOUNT = 125; +constexpr unsigned int DOUBLE_POW5_BITCOUNT = 125; +constexpr unsigned int FLOAT_POW5_INV_BITCOUNT = (DOUBLE_POW5_INV_BITCOUNT - 64); +constexpr unsigned int FLOAT_POW5_BITCOUNT = (DOUBLE_POW5_BITCOUNT - 64); +constexpr unsigned int DOUBLE_MANTISSA_BITS = 52; +constexpr unsigned int DOUBLE_EXPONENT_BITS = 11; +constexpr unsigned int DOUBLE_BIAS = 1023; +constexpr unsigned int FLOAT_MANTISSA_BITS = 23; +constexpr unsigned int FLOAT_EXPONENT_BITS = 8; +constexpr unsigned int FLOAT_BIAS = 127; + +__constant__ uint64_t const DOUBLE_POW5_INV_SPLIT2[15][2] = { + {1u, 2305843009213693952u}, + {5955668970331000884u, 1784059615882449851u}, + {8982663654677661702u, 1380349269358112757u}, + {7286864317269821294u, 2135987035920910082u}, + {7005857020398200553u, 1652639921975621497u}, + {17965325103354776697u, 1278668206209430417u}, + {8928596168509315048u, 1978643211784836272u}, + {10075671573058298858u, 1530901034580419511u}, + {597001226353042382u, 1184477304306571148u}, + {1527430471115325346u, 1832889850782397517u}, + {12533209867169019542u, 1418129833677084982u}, + {5577825024675947042u, 2194449627517475473u}, + {11006974540203867551u, 1697873161311732311u}, + {10313493231639821582u, 1313665730009899186u}, + {12701016819766672773u, 2032799256770390445u}}; + +__constant__ uint32_t const POW5_INV_OFFSETS[19] = {0x54544554, + 0x04055545, + 0x10041000, + 0x00400414, + 0x40010000, + 0x41155555, + 0x00000454, + 0x00010044, + 0x40000000, + 0x44000041, + 0x50454450, + 0x55550054, + 0x51655554, + 0x40004000, + 0x01000001, + 0x00010500, + 0x51515411, + 0x05555554, + 0x00000000}; + +__constant__ uint64_t const DOUBLE_POW5_SPLIT2[13][2] = { + {0u, 1152921504606846976u}, + {0u, 1490116119384765625u}, + {1032610780636961552u, 1925929944387235853u}, + {7910200175544436838u, 1244603055572228341u}, + {16941905809032713930u, 1608611746708759036u}, + {13024893955298202172u, 2079081953128979843u}, + {6607496772837067824u, 1343575221513417750u}, + {17332926989895652603u, 1736530273035216783u}, + {13037379183483547984u, 2244412773384604712u}, + {1605989338741628675u, 1450417759929778918u}, + {9630225068416591280u, 1874621017369538693u}, + {665883850346957067u, 1211445438634777304u}, + {14931890668723713708u, 1565756531257009982u}}; + +__constant__ uint32_t const POW5_OFFSETS[21] = { + 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x40000000, 0x59695995, 0x55545555, + 0x56555515, 0x41150504, 0x40555410, 0x44555145, 0x44504540, 0x45555550, 0x40004000, + 0x96440440, 0x55565565, 0x54454045, 0x40154151, 0x55559155, 0x51405555, 0x00000105}; + +constexpr uint32_t POW5_TABLE_SIZE = 26; + +__constant__ uint64_t const DOUBLE_POW5_TABLE[POW5_TABLE_SIZE] = { + 1ull, + 5ull, + 25ull, + 125ull, + 625ull, + 3125ull, + 15625ull, + 78125ull, + 390625ull, + 1953125ull, + 9765625ull, + 48828125ull, + 244140625ull, + 1220703125ull, + 6103515625ull, + 30517578125ull, + 152587890625ull, + 762939453125ull, + 3814697265625ull, + 19073486328125ull, + 95367431640625ull, + 476837158203125ull, + 2384185791015625ull, + 11920928955078125ull, + 59604644775390625ull, + 298023223876953125ull //, 1490116119384765625ull +}; + +//===== common.h from ryu ===== + +// Returns the number of decimal digits in v, which must not contain more than 9 digits. +__device__ inline uint32_t decimalLength9(uint32_t const v) +{ + // Function precondition: v is not a 10-digit number. + // (f2s: 9 digits are sufficient for round-tripping.) + // (d2fixed: We print 9-digit blocks.) 
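+  // e.g. decimalLength9(7) == 1, decimalLength9(42) == 2, decimalLength9(999999999) == 9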
+ assert(v < 1000000000); + if (v >= 100000000) { return 9; } + if (v >= 10000000) { return 8; } + if (v >= 1000000) { return 7; } + if (v >= 100000) { return 6; } + if (v >= 10000) { return 5; } + if (v >= 1000) { return 4; } + if (v >= 100) { return 3; } + if (v >= 10) { return 2; } + return 1; +} + +// Returns e == 0 ? 1 : ceil(log_2(5^e)); requires 0 <= e <= 3528. +__device__ inline int32_t pow5bits(int32_t const e) +{ + // This approximation works up to the point that the multiplication overflows at e = 3529. + // If the multiplication were done in 64 bits, it would fail at 5^4004 which is just greater + // than 2^9297. + assert(e >= 0); + assert(e <= 3528); + return (int32_t)(((((uint32_t)e) * 1217359) >> 19) + 1); +} + +// Returns floor(log_10(2^e)); requires 0 <= e <= 1650. +__device__ inline uint32_t log10Pow2(int32_t const e) +{ + // The first value this approximation fails for is 2^1651 which is just greater than 10^297. + assert(e >= 0); + assert(e <= 1650); + return (((uint32_t)e) * 78913) >> 18; +} + +// Returns floor(log_10(5^e)); requires 0 <= e <= 2620. +__device__ inline uint32_t log10Pow5(int32_t const e) +{ + // The first value this approximation fails for is 5^2621 which is just greater than 10^1832. + assert(e >= 0); + assert(e <= 2620); + return (((uint32_t)e) * 732923) >> 20; +} + +__device__ inline uint32_t pow5factor_32(uint32_t value) +{ + uint32_t count = 0; + for (;;) { + assert(value != 0); + uint32_t const q = value / 5; + uint32_t const r = value % 5; + if (r != 0) { break; } + value = q; + ++count; + } + return count; +} + +// Returns true if value is divisible by 5^p. +__device__ inline bool multipleOfPowerOf5_32(uint32_t const value, uint32_t const p) +{ + return pow5factor_32(value) >= p; +} + +// Returns true if value is divisible by 2^p. +__device__ inline bool multipleOfPowerOf2_32(uint32_t const value, uint32_t const p) +{ + // __builtin_ctz doesn't appear to be faster here. + return (value & ((1u << p) - 1)) == 0; +} + +// It seems to be slightly faster to avoid uint128_t here, although the +// generated code for uint128_t looks slightly nicer. +__device__ inline uint32_t mulShift32(uint32_t const m, uint64_t const factor, int32_t const shift) +{ + assert(shift > 32); + + // The casts here help MSVC to avoid calls to the __allmul library + // function. 
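+  // Computes (m * factor) >> shift with two 32x32 -> 64-bit partial products; dropping the
+  // low 32 bits of bits0 before the final shift is exact because shift > 32.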
+ uint32_t const factorLo = (uint32_t)(factor); + uint32_t const factorHi = (uint32_t)(factor >> 32); + uint64_t const bits0 = (uint64_t)m * factorLo; + uint64_t const bits1 = (uint64_t)m * factorHi; + + uint64_t const sum = (bits0 >> 32) + bits1; + uint64_t const shiftedSum = sum >> (shift - 32); + assert(shiftedSum <= UINT32_MAX); + return (uint32_t)shiftedSum; +} + +__device__ inline int copy_special_str(char* const result, + bool const sign, + bool const exponent, + bool const mantissa) +{ + if (mantissa) { + memcpy(result, "NaN", 3); + return 3; + } + if (sign) { result[0] = '-'; } + if (exponent) { + memcpy(result + sign, "Infinity", 8); + return sign + 8; + } + memcpy(result + sign, "0.0", 3); + return sign + 3; +} + +__device__ inline int special_str_size(bool const sign, bool const exponent, bool const mantissa) +{ + if (mantissa) { return 3; } + if (exponent) { return sign + 8; } + return sign + 3; +} + +__device__ inline uint32_t float_to_bits(float const f) +{ + uint32_t bits = 0; + memcpy(&bits, &f, sizeof(float)); + return bits; +} + +__device__ inline uint64_t double_to_bits(double const d) +{ + uint64_t bits = 0; + memcpy(&bits, &d, sizeof(double)); + return bits; +} + +//===== d2s_intrinsics.h from ryu ===== + +__device__ inline uint64_t umul128(uint64_t const a, uint64_t const b, uint64_t* const productHi) +{ + // The casts here help MSVC to avoid calls to the __allmul library function. + uint32_t const aLo = (uint32_t)a; + uint32_t const aHi = (uint32_t)(a >> 32); + uint32_t const bLo = (uint32_t)b; + uint32_t const bHi = (uint32_t)(b >> 32); + + uint64_t const b00 = (uint64_t)aLo * bLo; + uint64_t const b01 = (uint64_t)aLo * bHi; + uint64_t const b10 = (uint64_t)aHi * bLo; + uint64_t const b11 = (uint64_t)aHi * bHi; + + uint32_t const b00Lo = (uint32_t)b00; + uint32_t const b00Hi = (uint32_t)(b00 >> 32); + + uint64_t const mid1 = b10 + b00Hi; + uint32_t const mid1Lo = (uint32_t)(mid1); + uint32_t const mid1Hi = (uint32_t)(mid1 >> 32); + + uint64_t const mid2 = b01 + mid1Lo; + uint32_t const mid2Lo = (uint32_t)(mid2); + uint32_t const mid2Hi = (uint32_t)(mid2 >> 32); + + uint64_t const pHi = b11 + mid1Hi + mid2Hi; + uint64_t const pLo = ((uint64_t)mid2Lo << 32) | b00Lo; + + *productHi = pHi; + return pLo; +} + +__device__ inline uint64_t shiftright128(uint64_t const lo, uint64_t const hi, uint32_t const dist) +{ + // We don't need to handle the case dist >= 64 here (see above). + assert(dist < 64); + assert(dist > 0); + return (hi << (64 - dist)) | (lo >> dist); +} + +__device__ inline uint64_t div5(uint64_t const x) { return x / 5; } + +__device__ inline uint64_t div10(uint64_t const x) { return x / 10; } + +__device__ inline uint64_t div100(uint64_t const x) { return x / 100; } + +__device__ inline uint32_t pow5Factor(uint64_t value) +{ + uint64_t const m_inv_5 = 14757395258967641293u; // 5 * m_inv_5 = 1 (mod 2^64) + uint64_t const n_div_5 = 3689348814741910323u; // #{ n | n = 0 (mod 2^64) } = 2^64 / 5 + uint32_t count = 0; + for (;;) { + assert(value != 0); + value *= m_inv_5; + if (value > n_div_5) break; + ++count; + } + return count; +} + +// Returns true if value is divisible by 5^p. +__device__ inline bool multipleOfPowerOf5(uint64_t const value, uint32_t const p) +{ + // I tried a case distinction on p, but there was no performance difference. + return pow5Factor(value) >= p; +} + +// Returns true if value is divisible by 2^p. 
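+// e.g. multipleOfPowerOf2(40, 3) is true (40 == 5 * 2^3), multipleOfPowerOf2(40, 4) is false.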
+__device__ inline bool multipleOfPowerOf2(uint64_t const value, uint32_t const p) +{ + assert(value != 0); + assert(p < 64); + // __builtin_ctzll doesn't appear to be faster here. + return (value & ((1ull << p) - 1)) == 0; +} + +__device__ inline uint64_t mulShift64(uint64_t const m, uint64_t const* const mul, int32_t const j) +{ + // m is maximum 55 bits + uint64_t high1; // 128 + uint64_t const low1 = umul128(m, mul[1], &high1); // 64 + uint64_t high0; // 64 + umul128(m, mul[0], &high0); // 0 + uint64_t const sum = high0 + low1; + if (sum < high0) { + ++high1; // overflow into high1 + } + return shiftright128(sum, high1, j - 64); +} + +__device__ inline uint64_t mulShiftAll64(uint64_t const m, + uint64_t const* const mul, + int32_t const j, + uint64_t* const vp, + uint64_t* const vm, + uint32_t const mmShift) +{ + *vp = mulShift64(4 * m + 2, mul, j); + *vm = mulShift64(4 * m - 1 - mmShift, mul, j); + return mulShift64(4 * m, mul, j); +} + +//===== d2s_small_table.h from ryu ===== + +// Computes 5^i in the form required by Ryu, and stores it in the given pointer. +__device__ inline void double_computePow5(uint32_t const i, uint64_t* const result) +{ + uint32_t const base = i / POW5_TABLE_SIZE; + uint32_t const base2 = base * POW5_TABLE_SIZE; + uint32_t const offset = i - base2; + uint64_t const* const mul = DOUBLE_POW5_SPLIT2[base]; + if (offset == 0) { + result[0] = mul[0]; + result[1] = mul[1]; + return; + } + uint64_t const m = DOUBLE_POW5_TABLE[offset]; + uint64_t high1; + uint64_t const low1 = umul128(m, mul[1], &high1); + uint64_t high0; + uint64_t const low0 = umul128(m, mul[0], &high0); + uint64_t const sum = high0 + low1; + if (sum < high0) { + ++high1; // overflow into high1 + } + // high1 | sum | low0 + uint32_t const delta = pow5bits(i) - pow5bits(base2); + result[0] = shiftright128(low0, sum, delta) + ((POW5_OFFSETS[i / 16] >> ((i % 16) << 1)) & 3); + result[1] = shiftright128(sum, high1, delta); +} + +// Computes 5^-i in the form required by Ryu, and stores it in the given pointer. +__device__ inline void double_computeInvPow5(uint32_t const i, uint64_t* const result) +{ + uint32_t const base = (i + POW5_TABLE_SIZE - 1) / POW5_TABLE_SIZE; + uint32_t const base2 = base * POW5_TABLE_SIZE; + uint32_t const offset = base2 - i; + uint64_t const* const mul = DOUBLE_POW5_INV_SPLIT2[base]; // 1/5^base2 + if (offset == 0) { + result[0] = mul[0]; + result[1] = mul[1]; + return; + } + uint64_t const m = DOUBLE_POW5_TABLE[offset]; + uint64_t high1; + uint64_t const low1 = umul128(m, mul[1], &high1); + uint64_t high0; + uint64_t const low0 = umul128(m, mul[0] - 1, &high0); + uint64_t const sum = high0 + low1; + if (sum < high0) { + ++high1; // overflow into high1 + } + // high1 | sum | low0 + uint32_t const delta = pow5bits(base2) - pow5bits(i); + result[0] = + shiftright128(low0, sum, delta) + 1 + ((POW5_INV_OFFSETS[i / 16] >> ((i % 16) << 1)) & 3); + result[1] = shiftright128(sum, high1, delta); +} + +//===== f2s_intrinsics.h from ryu ===== + +__device__ inline uint32_t mulPow5InvDivPow2(uint32_t const m, uint32_t const q, int32_t const j) +{ + // The inverse multipliers are defined as [2^x / 5^y] + 1; the upper 64 bits from the double + // lookup table are the correct bits for [2^x / 5^y], so we have to add 1 here. Note that we rely + // on the fact that the added 1 that's already stored in the table never overflows into the upper + // 64 bits. 
+ uint64_t pow5[2]; + double_computeInvPow5(q, pow5); + return mulShift32(m, pow5[1] + 1, j); +} + +__device__ inline uint32_t mulPow5divPow2(uint32_t const m, uint32_t const i, int32_t const j) +{ + uint64_t pow5[2]; + double_computePow5(i, pow5); + return mulShift32(m, pow5[1], j); +} + +//===== d2s.c and f2s.c from ryu ===== + +__device__ inline uint32_t decimalLength17(uint64_t const v) +{ + // This is slightly faster than a loop. + // The average output length is 16.38 digits, so we check high-to-low. + // Function precondition: v is not an 18, 19, or 20-digit number. + // (17 digits are sufficient for round-tripping.) + assert(v < 100000000000000000L); + if (v >= 10000000000000000L) { return 17; } + if (v >= 1000000000000000L) { return 16; } + if (v >= 100000000000000L) { return 15; } + if (v >= 10000000000000L) { return 14; } + if (v >= 1000000000000L) { return 13; } + if (v >= 100000000000L) { return 12; } + if (v >= 10000000000L) { return 11; } + if (v >= 1000000000L) { return 10; } + if (v >= 100000000L) { return 9; } + if (v >= 10000000L) { return 8; } + if (v >= 1000000L) { return 7; } + if (v >= 100000L) { return 6; } + if (v >= 10000L) { return 5; } + if (v >= 1000L) { return 4; } + if (v >= 100L) { return 3; } + if (v >= 10L) { return 2; } + return 1; +} + +__device__ inline floating_decimal_64 d2d(uint64_t const ieeeMantissa, uint32_t const ieeeExponent) +{ + int32_t e2; + uint64_t m2; + if (ieeeExponent == 0) { + // We subtract 2 so that the bounds computation has 2 additional bits. + e2 = 1 - DOUBLE_BIAS - DOUBLE_MANTISSA_BITS - 2; + m2 = ieeeMantissa; + } else { + e2 = (int32_t)ieeeExponent - DOUBLE_BIAS - DOUBLE_MANTISSA_BITS - 2; + m2 = (1ull << DOUBLE_MANTISSA_BITS) | ieeeMantissa; + } + bool const even = (m2 & 1) == 0; + bool const acceptBounds = even; + + // Step 2: Determine the interval of valid decimal representations. + uint64_t const mv = 4 * m2; + // Implicit bool -> int conversion. True is 1, false is 0. + uint32_t const mmShift = ieeeMantissa != 0 || ieeeExponent <= 1; + // We would compute mp and mm like this: + // uint64_t mp = 4 * m2 + 2; + // uint64_t mm = mv - 1 - mmShift; + + // Step 3: Convert to a decimal power base using 128-bit arithmetic. + uint64_t vr, vp, vm; + int32_t e10; + bool vmIsTrailingZeros = false; + bool vrIsTrailingZeros = false; + if (e2 >= 0) { + // I tried special-casing q == 0, but there was no effect on performance. + // This expression is slightly faster than max(0, log10Pow2(e2) - 1). + uint32_t const q = log10Pow2(e2) - (e2 > 3); + e10 = (int32_t)q; + int32_t const k = DOUBLE_POW5_INV_BITCOUNT + pow5bits((int32_t)q) - 1; + int32_t const i = -e2 + (int32_t)q + k; + uint64_t pow5[2]; + double_computeInvPow5(q, pow5); + vr = mulShiftAll64(m2, pow5, i, &vp, &vm, mmShift); + + if (q <= 21) { + // This should use q <= 22, but I think 21 is also safe. Smaller values + // may still be safe, but it's more difficult to reason about them. + // Only one of mp, mv, and mm can be a multiple of 5, if any. + uint32_t const mvMod5 = ((uint32_t)mv) - 5 * ((uint32_t)div5(mv)); + if (mvMod5 == 0) { + vrIsTrailingZeros = multipleOfPowerOf5(mv, q); + } else if (acceptBounds) { + // Same as min(e2 + (~mm & 1), pow5Factor(mm)) >= q + // <=> e2 + (~mm & 1) >= q && pow5Factor(mm) >= q + // <=> true && pow5Factor(mm) >= q, since e2 >= q. + vmIsTrailingZeros = multipleOfPowerOf5(mv - 1 - mmShift, q); + } else { + // Same as min(e2 + 1, pow5Factor(mp)) >= q. 
+ vp -= multipleOfPowerOf5(mv + 2, q); + } + } + } else { + // This expression is slightly faster than max(0, log10Pow5(-e2) - 1). + uint32_t const q = log10Pow5(-e2) - (-e2 > 1); + e10 = (int32_t)q + e2; + int32_t const i = -e2 - (int32_t)q; + int32_t const k = pow5bits(i) - DOUBLE_POW5_BITCOUNT; + int32_t const j = (int32_t)q - k; + + uint64_t pow5[2]; + double_computePow5(i, pow5); + vr = mulShiftAll64(m2, pow5, j, &vp, &vm, mmShift); + + if (q <= 1) { + // {vr,vp,vm} is trailing zeros if {mv,mp,mm} has at least q trailing 0 bits. + // mv = 4 * m2, so it always has at least two trailing 0 bits. + vrIsTrailingZeros = true; + if (acceptBounds) { + // mm = mv - 1 - mmShift, so it has 1 trailing 0 bit iff mmShift == 1. + vmIsTrailingZeros = mmShift == 1; + } else { + // mp = mv + 2, so it always has at least one trailing 0 bit. + --vp; + } + } else if (q < 63) { // TODO(ulfjack): Use a tighter bound here. + // We want to know if the full product has at least q trailing zeros. + // We need to compute min(p2(mv), p5(mv) - e2) >= q + // <=> p2(mv) >= q && p5(mv) - e2 >= q + // <=> p2(mv) >= q (because -e2 >= q) + vrIsTrailingZeros = multipleOfPowerOf2(mv, q); + } + } + + // Step 4: Find the shortest decimal representation in the interval of valid representations. + int32_t removed = 0; + uint8_t lastRemovedDigit = 0; + uint64_t output; + // On average, we remove ~2 digits. + if (vmIsTrailingZeros || vrIsTrailingZeros) { + // General case, which happens rarely (~0.7%). + for (;;) { + uint64_t const vpDiv10 = div10(vp); + uint64_t const vmDiv10 = div10(vm); + if (vpDiv10 <= vmDiv10) { break; } + uint32_t const vmMod10 = ((uint32_t)vm) - 10 * ((uint32_t)vmDiv10); + uint64_t const vrDiv10 = div10(vr); + uint32_t const vrMod10 = ((uint32_t)vr) - 10 * ((uint32_t)vrDiv10); + vmIsTrailingZeros &= vmMod10 == 0; + vrIsTrailingZeros &= lastRemovedDigit == 0; + lastRemovedDigit = (uint8_t)vrMod10; + vr = vrDiv10; + vp = vpDiv10; + vm = vmDiv10; + ++removed; + } + + if (vmIsTrailingZeros) { + for (;;) { + uint64_t const vmDiv10 = div10(vm); + uint32_t const vmMod10 = ((uint32_t)vm) - 10 * ((uint32_t)vmDiv10); + if (vmMod10 != 0) { break; } + uint64_t const vpDiv10 = div10(vp); + uint64_t const vrDiv10 = div10(vr); + uint32_t const vrMod10 = ((uint32_t)vr) - 10 * ((uint32_t)vrDiv10); + vrIsTrailingZeros &= lastRemovedDigit == 0; + lastRemovedDigit = (uint8_t)vrMod10; + vr = vrDiv10; + vp = vpDiv10; + vm = vmDiv10; + ++removed; + } + } + + if (vrIsTrailingZeros && lastRemovedDigit == 5 && vr % 2 == 0) { + // Round even if the exact number is .....50..0. + lastRemovedDigit = 4; + } + // We need to take vr + 1 if vr is outside bounds or we need to round up. + output = vr + ((vr == vm && (!acceptBounds || !vmIsTrailingZeros)) || lastRemovedDigit >= 5); + } else { + // Specialized for the common case (~99.3%). Percentages below are relative to this. + bool roundUp = false; + uint64_t const vpDiv100 = div100(vp); + uint64_t const vmDiv100 = div100(vm); + if (vpDiv100 > vmDiv100) { // Optimization: remove two digits at a time (~86.2%). 
+ uint64_t const vrDiv100 = div100(vr); + uint32_t const vrMod100 = ((uint32_t)vr) - 100 * ((uint32_t)vrDiv100); + roundUp = vrMod100 >= 50; + vr = vrDiv100; + vp = vpDiv100; + vm = vmDiv100; + removed += 2; + } + // Loop iterations below (approximately), without optimization above: + // 0: 0.03%, 1: 13.8%, 2: 70.6%, 3: 14.0%, 4: 1.40%, 5: 0.14%, 6+: 0.02% + // Loop iterations below (approximately), with optimization above: + // 0: 70.6%, 1: 27.8%, 2: 1.40%, 3: 0.14%, 4+: 0.02% + for (;;) { + uint64_t const vpDiv10 = div10(vp); + uint64_t const vmDiv10 = div10(vm); + if (vpDiv10 <= vmDiv10) { break; } + uint64_t const vrDiv10 = div10(vr); + uint32_t const vrMod10 = ((uint32_t)vr) - 10 * ((uint32_t)vrDiv10); + roundUp = vrMod10 >= 5; + vr = vrDiv10; + vp = vpDiv10; + vm = vmDiv10; + ++removed; + } + + // We need to take vr + 1 if vr is outside bounds or we need to round up. + output = vr + (vr == vm || roundUp); + } + int32_t const exp = e10 + removed; + + floating_decimal_64 fd; + fd.exponent = exp; + fd.mantissa = output; + return fd; +} + +__device__ inline floating_decimal_32 f2d(uint32_t const ieeeMantissa, uint32_t const ieeeExponent) +{ + int32_t e2; + uint32_t m2; + if (ieeeExponent == 0) { + // We subtract 2 so that the bounds computation has 2 additional bits. + e2 = 1 - FLOAT_BIAS - FLOAT_MANTISSA_BITS - 2; + m2 = ieeeMantissa; + } else { + e2 = (int32_t)ieeeExponent - FLOAT_BIAS - FLOAT_MANTISSA_BITS - 2; + m2 = (1u << FLOAT_MANTISSA_BITS) | ieeeMantissa; + } + bool const even = (m2 & 1) == 0; + bool const acceptBounds = even; + + // Step 2: Determine the interval of valid decimal representations. + uint32_t const mv = 4 * m2; + uint32_t const mp = 4 * m2 + 2; + // Implicit bool -> int conversion. True is 1, false is 0. + uint32_t const mmShift = ieeeMantissa != 0 || ieeeExponent <= 1; + uint32_t const mm = 4 * m2 - 1 - mmShift; + + // Step 3: Convert to a decimal power base using 64-bit arithmetic. + uint32_t vr, vp, vm; + int32_t e10; + bool vmIsTrailingZeros = false; + bool vrIsTrailingZeros = false; + uint8_t lastRemovedDigit = 0; + if (e2 >= 0) { + uint32_t const q = log10Pow2(e2); + e10 = (int32_t)q; + int32_t const k = FLOAT_POW5_INV_BITCOUNT + pow5bits((int32_t)q) - 1; + int32_t const i = -e2 + (int32_t)q + k; + vr = mulPow5InvDivPow2(mv, q, i); + vp = mulPow5InvDivPow2(mp, q, i); + vm = mulPow5InvDivPow2(mm, q, i); + if (q != 0 && (vp - 1) / 10 <= vm / 10) { + // We need to know one removed digit even if we are not going to loop below. We could use + // q = X - 1 above, except that would require 33 bits for the result, and we've found that + // 32-bit arithmetic is faster even on 64-bit machines. + int32_t const l = FLOAT_POW5_INV_BITCOUNT + pow5bits((int32_t)(q - 1)) - 1; + lastRemovedDigit = (uint8_t)(mulPow5InvDivPow2(mv, q - 1, -e2 + (int32_t)q - 1 + l) % 10); + } + if (q <= 9) { + // The largest power of 5 that fits in 24 bits is 5^10, but q <= 9 seems to be safe as well. + // Only one of mp, mv, and mm can be a multiple of 5, if any. 
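+      // (mm, mv, and mp all fall within the five consecutive integers mv-2..mv+2, which
+      //  contain exactly one multiple of 5, so at most one of the three can match.)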
+ if (mv % 5 == 0) { + vrIsTrailingZeros = multipleOfPowerOf5_32(mv, q); + } else if (acceptBounds) { + vmIsTrailingZeros = multipleOfPowerOf5_32(mm, q); + } else { + vp -= multipleOfPowerOf5_32(mp, q); + } + } + } else { + uint32_t const q = log10Pow5(-e2); + e10 = (int32_t)q + e2; + int32_t const i = -e2 - (int32_t)q; + int32_t const k = pow5bits(i) - FLOAT_POW5_BITCOUNT; + int32_t j = (int32_t)q - k; + vr = mulPow5divPow2(mv, (uint32_t)i, j); + vp = mulPow5divPow2(mp, (uint32_t)i, j); + vm = mulPow5divPow2(mm, (uint32_t)i, j); + if (q != 0 && (vp - 1) / 10 <= vm / 10) { + j = (int32_t)q - 1 - (pow5bits(i + 1) - FLOAT_POW5_BITCOUNT); + lastRemovedDigit = (uint8_t)(mulPow5divPow2(mv, (uint32_t)(i + 1), j) % 10); + } + if (q <= 1) { + // {vr,vp,vm} is trailing zeros if {mv,mp,mm} has at least q trailing 0 bits. + // mv = 4 * m2, so it always has at least two trailing 0 bits. + vrIsTrailingZeros = true; + if (acceptBounds) { + // mm = mv - 1 - mmShift, so it has 1 trailing 0 bit iff mmShift == 1. + vmIsTrailingZeros = mmShift == 1; + } else { + // mp = mv + 2, so it always has at least one trailing 0 bit. + --vp; + } + } else if (q < 31) { // TODO(ulfjack): Use a tighter bound here. + vrIsTrailingZeros = multipleOfPowerOf2_32(mv, q - 1); + } + } + + // Step 4: Find the shortest decimal representation in the interval of valid representations. + int32_t removed = 0; + uint32_t output; + if (vmIsTrailingZeros || vrIsTrailingZeros) { + // General case, which happens rarely (~4.0%). + while (vp / 10 > vm / 10) { + vmIsTrailingZeros &= vm % 10 == 0; + vrIsTrailingZeros &= lastRemovedDigit == 0; + lastRemovedDigit = (uint8_t)(vr % 10); + vr /= 10; + vp /= 10; + vm /= 10; + ++removed; + } + if (vmIsTrailingZeros) { + while (vm % 10 == 0) { + vrIsTrailingZeros &= lastRemovedDigit == 0; + lastRemovedDigit = (uint8_t)(vr % 10); + vr /= 10; + vp /= 10; + vm /= 10; + ++removed; + } + } + if (vrIsTrailingZeros && lastRemovedDigit == 5 && vr % 2 == 0) { + // Round even if the exact number is .....50..0. + lastRemovedDigit = 4; + } + // We need to take vr + 1 if vr is outside bounds or we need to round up. + output = vr + ((vr == vm && (!acceptBounds || !vmIsTrailingZeros)) || lastRemovedDigit >= 5); + } else { + // Specialized for the common case (~96.0%). Percentages below are relative to this. + // Loop iterations below (approximately): + // 0: 13.6%, 1: 70.7%, 2: 14.1%, 3: 1.39%, 4: 0.14%, 5+: 0.01% + while (vp / 10 > vm / 10) { + lastRemovedDigit = (uint8_t)(vr % 10); + vr /= 10; + vp /= 10; + vm /= 10; + ++removed; + } + // We need to take vr + 1 if vr is outside bounds or we need to round up. + output = vr + (vr == vm || lastRemovedDigit >= 5); + } + int32_t const exp = e10 + removed; + + floating_decimal_32 fd; + fd.exponent = exp; + fd.mantissa = output; + return fd; +} + +__device__ inline int to_chars(floating_decimal_64 const v, bool const sign, char* const result) +{ + // Step 5: Print the decimal representation. + int index = 0; + if (sign) { result[index++] = '-'; } + + uint64_t output = v.mantissa; + uint32_t const olength = decimalLength17(output); + int32_t exp = v.exponent + (int32_t)olength - 1; + bool scientificNotation = (exp < -3) || (exp >= 7); + + // Values in the interval [1E-3, 1E7) are special. + if (scientificNotation) { + // Print in the format x.xxxxxE-yy. 
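+    // (For example, mantissa 12345 with exponent -8 prints as "1.2345E-4", since
+    //  exp = -8 + 5 - 1 = -4.)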
+ for (uint32_t i = 0; i < olength - 1; ++i) { + uint32_t const c = output % 10; + output /= 10; + result[index + olength - i] = (char)('0' + c); + } + result[index] = '0' + output % 10; + result[index + 1] = '.'; + index += olength + 1; + if (olength == 1) { result[index++] = '0'; } + // Print 'E', the exponent sign, and the exponent, which has at most three digits. + result[index++] = 'E'; + if (exp < 0) { + result[index++] = '-'; + exp = -exp; + } + if (exp >= 100) { + result[index++] = (char)('0' + exp / 100); + exp %= 100; + result[index++] = (char)('0' + exp / 10); + } else if (exp >= 10) { + result[index++] = (char)('0' + exp / 10); + } + result[index++] = (char)('0' + exp % 10); + } else { + // Otherwise follow the Java spec for values in the interval [1E-3, 1E7). + if (exp < 0) { + // Decimal dot is before any of the digits. + result[index++] = '0'; + result[index++] = '.'; + for (int i = -1; i > exp; i--) { + result[index++] = '0'; + } + int current = index; + for (int i = 0; i < olength; i++) { + result[current + olength - i - 1] = (char)('0' + output % 10); + output /= 10; + index++; + } + } else if (exp + 1 >= olength) { + // Decimal dot is after any of the digits. + for (int i = 0; i < olength; i++) { + result[index + olength - i - 1] = (char)('0' + output % 10); + output /= 10; + } + index += olength; + for (int i = olength; i < exp + 1; i++) { + result[index++] = '0'; + } + result[index++] = '.'; + result[index++] = '0'; + } else { + // Decimal dot is somewhere between the digits. + int current = index + 1; + for (int i = 0; i < olength; i++) { + if (olength - i - 1 == exp) { + result[current + olength - i - 1] = '.'; + current--; + } + result[current + olength - i - 1] = (char)('0' + output % 10); + output /= 10; + } + index += olength + 1; + } + } + return index; +} + +__device__ inline int d2s_size(floating_decimal_64 const v, bool const sign) +{ + int index = 0; + if (sign) { index++; } + + uint64_t output = v.mantissa; + uint32_t const olength = decimalLength17(output); + int32_t exp = v.exponent + (int32_t)olength - 1; + bool scientificNotation = (exp < -3) || (exp >= 7); + + if (scientificNotation) { + index += olength + 1; + if (olength == 1) { index++; } + // 'E' + index++; + if (exp < 0) { + exp = -exp; + index++; + } + if (exp >= 100) { + index += 3; + } else if (exp >= 10) { + index += 2; + } else { + index++; + } + } else { + // Otherwise follow the Java spec for values in the interval [1E-3, 1E7). + if (exp < 0) { + index += 1 - exp + olength; + } else if (exp + 1 >= olength) { + index += exp + 3; + } else { + index += olength + 1; + } + } + return index; +} + +__device__ inline int to_chars(floating_decimal_32 const v, bool const sign, char* const result) +{ + // Step 5: Print the decimal representation. + int index = 0; + if (sign) { result[index++] = '-'; } + + uint32_t output = v.mantissa; + uint32_t const olength = decimalLength9(output); + int32_t exp = v.exponent + olength - 1; + bool scientificNotation = (exp < -3) || (exp >= 7); + + if (scientificNotation) { + // Print in the format x.xxxxxE-yy. + for (int i = 0; i < olength - 1; i++) { + int c = output % 10; + output /= 10; + result[index + olength - i] = (char)('0' + c); + } + result[index] = (char)('0' + output % 10); + result[index + 1] = '.'; + index += olength + 1; + if (olength == 1) { result[index++] = '0'; } + + // Print 'E', the exponent sign, and the exponent, which has at most two digits. 
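+    // (Finite floats span roughly 1.4E-45 to 3.4E38, so two exponent digits always suffice.)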
+ result[index++] = 'E'; + if (exp < 0) { + result[index++] = '-'; + exp = -exp; + } + if (exp >= 10) { result[index++] = (char)('0' + exp / 10); } + result[index++] = (char)('0' + exp % 10); + } else { + // Otherwise follow the Java spec for values in the interval [1E-3, 1E7). + if (exp < 0) { + // Decimal dot is before any of the digits. + result[index++] = '0'; + result[index++] = '.'; + for (int i = -1; i > exp; i--) { + result[index++] = '0'; + } + int current = index; + for (int i = 0; i < olength; i++) { + result[current + olength - i - 1] = (char)('0' + output % 10); + output /= 10; + index++; + } + } else if (exp + 1 >= olength) { + // Decimal dot is after any of the digits. + for (int i = 0; i < olength; i++) { + result[index + olength - i - 1] = (char)('0' + output % 10); + output /= 10; + } + index += olength; + for (int i = olength; i < exp + 1; i++) { + result[index++] = '0'; + } + result[index++] = '.'; + result[index++] = '0'; + } else { + // Decimal dot is somewhere between the digits. + int current = index + 1; + for (int i = 0; i < olength; i++) { + if (olength - i - 1 == exp) { + result[current + olength - i - 1] = '.'; + current--; + } + result[current + olength - i - 1] = (char)('0' + output % 10); + output /= 10; + } + index += olength + 1; + } + } + return index; +} + +__device__ inline int f2s_size(floating_decimal_32 const v, bool const sign) +{ + // Step 5: Print the decimal representation. + int index = 0; + if (sign) { index++; } + + uint32_t output = v.mantissa; + uint32_t const olength = decimalLength9(output); + int32_t exp = v.exponent + olength - 1; + bool scientificNotation = (exp < -3) || (exp >= 7); + + if (scientificNotation) { + index += olength + 1; + if (olength == 1) { index++; } + // 'E' + index++; + if (exp < 0) { + index++; + exp = -exp; + } + if (exp >= 10) { index++; } + index++; + } else { + // Otherwise follow the Java spec for values in the interval [1E-3, 1E7). + if (exp < 0) { + // Decimal dot is before any of the digits. + index += 1 - exp + olength; + } else if (exp + 1 >= olength) { + // Decimal dot is after any of the digits. + index += exp + 3; + } else { + // Decimal dot is somewhere between the digits. + index += olength + 1; + } + } + return index; +} + +__device__ inline bool d2d_small_int(uint64_t const ieeeMantissa, + uint32_t const ieeeExponent, + floating_decimal_64* const v) +{ + uint64_t const m2 = (1ull << DOUBLE_MANTISSA_BITS) | ieeeMantissa; + int32_t const e2 = (int32_t)ieeeExponent - DOUBLE_BIAS - DOUBLE_MANTISSA_BITS; + + if (e2 > 0) { + // f = m2 * 2^e2 >= 2^53 is an integer. + // Ignore this case for now. + return false; + } + + if (e2 < -52) { + // f < 1. + return false; + } + + // Since 2^52 <= m2 < 2^53 and 0 <= -e2 <= 52: 1 <= f = m2 / 2^-e2 < 2^53. + // Test if the lower -e2 bits of the significand are 0, i.e. whether the fraction is 0. + uint64_t const mask = (1ull << -e2) - 1; + uint64_t const fraction = m2 & mask; + if (fraction != 0) { return false; } + + // f is an integer in the range [1, 2^53). + // Note: mantissa might contain trailing (decimal) 0's. + // Note: since 2^53 < 10^16, there is no need to adjust decimalLength17(). + v->mantissa = m2 >> -e2; + v->exponent = 0; + return true; +} + +__device__ floating_decimal_64 d2d(double f, bool& ieeeSign, bool& special) +{ + // Step 1: Decode the floating-point number, and unify normalized and subnormal cases. + uint64_t const bits = double_to_bits(f); + + // Decode bits into sign, mantissa, and exponent. 
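+  // (IEEE 754 binary64 layout, high bit to low: 1 sign bit, 11 exponent bits, 52 mantissa bits.)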
+ ieeeSign = ((bits >> (DOUBLE_MANTISSA_BITS + DOUBLE_EXPONENT_BITS)) & 1) != 0; + uint64_t const ieeeMantissa = bits & ((1ull << DOUBLE_MANTISSA_BITS) - 1); + uint32_t const ieeeExponent = + (uint32_t)((bits >> DOUBLE_MANTISSA_BITS) & ((1u << DOUBLE_EXPONENT_BITS) - 1)); + // Case distinction; exit early for the easy cases. + if (ieeeExponent == ((1u << DOUBLE_EXPONENT_BITS) - 1u) || + (ieeeExponent == 0 && ieeeMantissa == 0)) { + special = true; + return floating_decimal_64{ieeeMantissa, (int32_t)ieeeExponent}; + } + special = false; + floating_decimal_64 v; + bool const isSmallInt = d2d_small_int(ieeeMantissa, ieeeExponent, &v); + if (isSmallInt) { + // For small integers in the range [1, 2^53), v.mantissa might contain trailing (decimal) zeros. + // For scientific notation we need to move these zeros into the exponent. + // (This is not needed for fixed-point notation, so it might be beneficial to trim + // trailing zeros in to_chars only if needed - once fixed-point notation output is implemented.) + for (;;) { + uint64_t const q = div10(v.mantissa); + uint32_t const r = ((uint32_t)v.mantissa) - 10 * ((uint32_t)q); + if (r != 0) { break; } + v.mantissa = q; + ++v.exponent; + } + } else { + v = d2d(ieeeMantissa, ieeeExponent); + } + return v; +} + +__device__ int d2s_buffered_n(double f, char* result) +{ + bool sign = false, special = false; + floating_decimal_64 v = d2d(f, sign, special); + if (special) { return copy_special_str(result, sign, v.exponent, v.mantissa); } + return to_chars(v, sign, result); +} + +__device__ floating_decimal_32 f2d(float f, bool& ieeeSign, bool& special) +{ + // Step 1: Decode the floating-point number, and unify normalized and subnormal cases. + uint32_t const bits = float_to_bits(f); + + // Decode bits into sign, mantissa, and exponent. + ieeeSign = ((bits >> (FLOAT_MANTISSA_BITS + FLOAT_EXPONENT_BITS)) & 1) != 0; + uint32_t const ieeeMantissa = bits & ((1u << FLOAT_MANTISSA_BITS) - 1); + uint32_t const ieeeExponent = (bits >> FLOAT_MANTISSA_BITS) & ((1u << FLOAT_EXPONENT_BITS) - 1); + + // Case distinction; exit early for the easy cases. 
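+  // (An all-ones exponent encodes Inf/NaN; a zero exponent with a zero mantissa encodes +/-0.)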
+ if (ieeeExponent == ((1u << FLOAT_EXPONENT_BITS) - 1u) || + (ieeeExponent == 0 && ieeeMantissa == 0)) { + special = true; + return floating_decimal_32{ieeeMantissa, (int32_t)ieeeExponent}; + } + special = false; + return f2d(ieeeMantissa, ieeeExponent); +} + +__device__ int f2s_buffered_n(float f, char* result) +{ + bool sign = false, special = false; + floating_decimal_32 v = f2d(f, sign, special); + if (special) { return copy_special_str(result, sign, v.exponent, v.mantissa); } + return to_chars(v, sign, result); +} + +//===== compute float to string size ===== + +__device__ int compute_d2s_size(double value) +{ + bool sign = false, special = false; + floating_decimal_64 v = d2d(value, sign, special); + if (special) { return special_str_size(sign, v.exponent, v.mantissa); } + return d2s_size(v, sign); +} + +__device__ int compute_f2s_size(float value) +{ + bool sign = false, special = false; + floating_decimal_32 v = f2d(value, sign, special); + if (special) { return special_str_size(sign, v.exponent, v.mantissa); } + return f2s_size(v, sign); +} + +} // namespace + +//===== APIs ===== + +__device__ int compute_ftos_size(double value, bool is_float) +{ + if (is_float) { + return compute_f2s_size(value); + } else { + return compute_d2s_size(value); + } +} + +__device__ int float_to_string(double value, bool is_float, char* output) +{ + if (is_float) { + return f2s_buffered_n(value, output); + } else { + return d2s_buffered_n(value, output); + } +} + +} // namespace spark_rapids_jni::ftos_converter diff --git a/src/main/cpp/tests/CMakeLists.txt b/src/main/cpp/tests/CMakeLists.txt index 5e16398145..c9bb13046f 100644 --- a/src/main/cpp/tests/CMakeLists.txt +++ b/src/main/cpp/tests/CMakeLists.txt @@ -51,6 +51,9 @@ ConfigureTest(CAST_STRING ConfigureTest(CAST_DECIMAL_TO_STRING cast_decimal_to_string.cpp) +ConfigureTest(CAST_FLOAT_TO_STRING + cast_float_to_string.cpp) + ConfigureTest(DATETIME_REBASE datetime_rebase.cpp) diff --git a/src/main/cpp/tests/cast_float_to_string.cpp b/src/main/cpp/tests/cast_float_to_string.cpp new file mode 100644 index 0000000000..1ae066fe42 --- /dev/null +++ b/src/main/cpp/tests/cast_float_to_string.cpp @@ -0,0 +1,83 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+
+#include <cast_string.hpp>
+
+#include <cudf_test/base_fixture.hpp>
+#include <cudf_test/column_wrapper.hpp>
+
+#include <cudf_test/column_utilities.hpp>
+
+#include <limits>
+
+using namespace cudf;
+
+constexpr cudf::test::debug_output_level verbosity{cudf::test::debug_output_level::FIRST_ERROR};
+
+struct FloatToStringTests : public cudf::test::BaseFixture {};
+
+TEST_F(FloatToStringTests, FromFloats32)
+{
+  auto const floats =
+    cudf::test::fixed_width_column_wrapper<float>{100.0f,
+                                                  654321.25f,
+                                                  -12761.125f,
+                                                  0.f,
+                                                  5.0f,
+                                                  -4.0f,
+                                                  std::numeric_limits<float>::quiet_NaN(),
+                                                  123456789012.34f,
+                                                  -0.0f};
+
+  auto results = spark_rapids_jni::float_to_string(floats, cudf::get_default_stream());
+
+  auto const expected = cudf::test::strings_column_wrapper{
+    "100.0", "654321.25", "-12761.125", "0.0", "5.0", "-4.0", "NaN", "1.2345679E11", "-0.0"};
+
+  CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected, verbosity);
+}
+
+TEST_F(FloatToStringTests, FromFloats64)
+{
+  auto const floats =
+    cudf::test::fixed_width_column_wrapper<double>{100.0d,
+                                                   654321.25d,
+                                                   -12761.125d,
+                                                   1.123456789123456789d,
+                                                   0.000000000000000000123456789123456789d,
+                                                   0.0d,
+                                                   5.0d,
+                                                   -4.0d,
+                                                   std::numeric_limits<double>::quiet_NaN(),
+                                                   839542223232.794248339d,
+                                                   -0.0d};
+
+  auto results = spark_rapids_jni::float_to_string(floats, cudf::get_default_stream());
+
+  auto const expected = cudf::test::strings_column_wrapper{"100.0",
+                                                           "654321.25",
+                                                           "-12761.125",
+                                                           "1.1234567891234568",
+                                                           "1.234567891234568E-19",
+                                                           "0.0",
+                                                           "5.0",
+                                                           "-4.0",
+                                                           "NaN",
+                                                           "8.395422232327942E11",
+                                                           "-0.0"};
+
+  CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected, verbosity);
+}
\ No newline at end of file

diff --git a/src/main/java/com/nvidia/spark/rapids/jni/CastStrings.java b/src/main/java/com/nvidia/spark/rapids/jni/CastStrings.java
index eab42c41f6..022cb93085 100644
--- a/src/main/java/com/nvidia/spark/rapids/jni/CastStrings.java
+++ b/src/main/java/com/nvidia/spark/rapids/jni/CastStrings.java
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022, NVIDIA CORPORATION.
+ * Copyright (c) 2022-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -80,6 +80,16 @@ public static ColumnVector toDecimal(ColumnView cv, boolean ansiMode, boolean st
     return new ColumnVector(toDecimal(cv.getNativeView(), ansiMode, strip, precision, scale));
   }
 
+  /**
+   * Convert a float column to a string column.
+   *
+   * @param cv the column data to process
+   * @return the converted column
+   */
+  public static ColumnVector fromFloat(ColumnView cv) {
+    return new ColumnVector(fromFloat(cv.getNativeView()));
+  }
+
   /**
    * Convert a decimal column to a string column.
* @@ -137,6 +147,7 @@ private static native long toDecimal(long nativeColumnView, boolean ansi_enabled int precision, int scale); private static native long toFloat(long nativeColumnView, boolean ansi_enabled, int dtype); private static native long fromDecimal(long nativeColumnView); + private static native long fromFloat(long nativeColumnView); private static native long toIntegersWithBase(long nativeColumnView, int base, boolean ansiEnabled, int dtype); private static native long fromIntegersWithBase(long nativeColumnView, int base); From b51fde8fdf3e8e52199044a9e3b02f8a1f3a085b Mon Sep 17 00:00:00 2001 From: Jenkins Automation <70000568+nvauto@users.noreply.github.com> Date: Fri, 8 Dec 2023 23:27:08 +0800 Subject: [PATCH 046/127] Update submodule cudf to dee47c7b55b3adb6c4f7545699112ddd6240441f (#1631) Signed-off-by: spark-rapids automation <70000568+nvauto@users.noreply.github.com> --- thirdparty/cudf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/thirdparty/cudf b/thirdparty/cudf index 248aa2c887..dee47c7b55 160000 --- a/thirdparty/cudf +++ b/thirdparty/cudf @@ -1 +1 @@ -Subproject commit 248aa2c8873c12e41f1e6ea2660740a0a4ddaf68 +Subproject commit dee47c7b55b3adb6c4f7545699112ddd6240441f From 4e67dd9ddea7783f8c18d4648b447656c15e9a56 Mon Sep 17 00:00:00 2001 From: Jenkins Automation <70000568+nvauto@users.noreply.github.com> Date: Sat, 9 Dec 2023 11:24:00 +0800 Subject: [PATCH 047/127] Update submodule cudf to 899e3923b4a25078f1274ef1ba85cc5ef90552d6 (#1632) Signed-off-by: spark-rapids automation <70000568+nvauto@users.noreply.github.com> --- thirdparty/cudf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/thirdparty/cudf b/thirdparty/cudf index dee47c7b55..899e3923b4 160000 --- a/thirdparty/cudf +++ b/thirdparty/cudf @@ -1 +1 @@ -Subproject commit dee47c7b55b3adb6c4f7545699112ddd6240441f +Subproject commit 899e3923b4a25078f1274ef1ba85cc5ef90552d6 From 2b4bc70bb78cb3164fe7dfd5dfd157553e1fa21e Mon Sep 17 00:00:00 2001 From: spark-rapids automation <70000568+nvauto@users.noreply.github.com> Date: Mon, 11 Dec 2023 02:27:36 +0000 Subject: [PATCH 048/127] Auto-merge use submodule in BASE ref Signed-off-by: spark-rapids automation <70000568+nvauto@users.noreply.github.com> --- thirdparty/cudf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/thirdparty/cudf b/thirdparty/cudf index 2ce46216b5..899e3923b4 160000 --- a/thirdparty/cudf +++ b/thirdparty/cudf @@ -1 +1 @@ -Subproject commit 2ce46216b5bc9926aec438b64fc490c31c526a31 +Subproject commit 899e3923b4a25078f1274ef1ba85cc5ef90552d6 From ba7c3ad2015e7ebb874bfe5a67b74dbb0e1f170d Mon Sep 17 00:00:00 2001 From: Jenkins Automation <70000568+nvauto@users.noreply.github.com> Date: Mon, 11 Dec 2023 17:27:15 +0800 Subject: [PATCH 049/127] Update submodule cudf to 759a1c867fda8b207154f024b63de89701b2dad6 (#1636) Signed-off-by: spark-rapids automation <70000568+nvauto@users.noreply.github.com> --- thirdparty/cudf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/thirdparty/cudf b/thirdparty/cudf index 899e3923b4..759a1c867f 160000 --- a/thirdparty/cudf +++ b/thirdparty/cudf @@ -1 +1 @@ -Subproject commit 899e3923b4a25078f1274ef1ba85cc5ef90552d6 +Subproject commit 759a1c867fda8b207154f024b63de89701b2dad6 From 9e88a5711c80df203148d0bc99f303e11534fa7a Mon Sep 17 00:00:00 2001 From: Jenkins Automation <70000568+nvauto@users.noreply.github.com> Date: Tue, 12 Dec 2023 05:31:56 +0800 Subject: [PATCH 050/127] Update submodule cudf to fcaebeba50c0f6ea1c98491ab232a7cd9e018c71 (#1638) 
Signed-off-by: spark-rapids automation <70000568+nvauto@users.noreply.github.com> --- thirdparty/cudf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/thirdparty/cudf b/thirdparty/cudf index 759a1c867f..fcaebeba50 160000 --- a/thirdparty/cudf +++ b/thirdparty/cudf @@ -1 +1 @@ -Subproject commit 759a1c867fda8b207154f024b63de89701b2dad6 +Subproject commit fcaebeba50c0f6ea1c98491ab232a7cd9e018c71 From feca82aa2461b6704a5e669535962848c055f133 Mon Sep 17 00:00:00 2001 From: Jenkins Automation <70000568+nvauto@users.noreply.github.com> Date: Tue, 12 Dec 2023 11:26:31 +0800 Subject: [PATCH 051/127] Update submodule cudf to 1c6f80dc630d3a18e216812d4d6bd912995971d0 (#1640) Signed-off-by: spark-rapids automation <70000568+nvauto@users.noreply.github.com> --- thirdparty/cudf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/thirdparty/cudf b/thirdparty/cudf index fcaebeba50..1c6f80dc63 160000 --- a/thirdparty/cudf +++ b/thirdparty/cudf @@ -1 +1 @@ -Subproject commit fcaebeba50c0f6ea1c98491ab232a7cd9e018c71 +Subproject commit 1c6f80dc630d3a18e216812d4d6bd912995971d0 From abfc5415acf68827623c49c8836865cc962e80ea Mon Sep 17 00:00:00 2001 From: Jenkins Automation <70000568+nvauto@users.noreply.github.com> Date: Tue, 12 Dec 2023 17:28:34 +0800 Subject: [PATCH 052/127] Update submodule cudf to f8e891fc551ff691ac62c6d4067cb1867ea6213c (#1641) Signed-off-by: spark-rapids automation <70000568+nvauto@users.noreply.github.com> --- thirdparty/cudf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/thirdparty/cudf b/thirdparty/cudf index 1c6f80dc63..f8e891fc55 160000 --- a/thirdparty/cudf +++ b/thirdparty/cudf @@ -1 +1 @@ -Subproject commit 1c6f80dc630d3a18e216812d4d6bd912995971d0 +Subproject commit f8e891fc551ff691ac62c6d4067cb1867ea6213c From c95098c7a740d53c73b1f487ba5e318075cbae44 Mon Sep 17 00:00:00 2001 From: Jenkins Automation <70000568+nvauto@users.noreply.github.com> Date: Tue, 12 Dec 2023 23:30:11 +0800 Subject: [PATCH 053/127] Update submodule cudf to ef11061911aa9ef77cf615fea042a2bfa6f6cdea (#1642) Signed-off-by: spark-rapids automation <70000568+nvauto@users.noreply.github.com> --- thirdparty/cudf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/thirdparty/cudf b/thirdparty/cudf index f8e891fc55..ef11061911 160000 --- a/thirdparty/cudf +++ b/thirdparty/cudf @@ -1 +1 @@ -Subproject commit f8e891fc551ff691ac62c6d4067cb1867ea6213c +Subproject commit ef11061911aa9ef77cf615fea042a2bfa6f6cdea From f6bb2052af9d3bce60d087c758c76b6d77b9936f Mon Sep 17 00:00:00 2001 From: Jenkins Automation <70000568+nvauto@users.noreply.github.com> Date: Wed, 13 Dec 2023 06:07:54 +0800 Subject: [PATCH 054/127] Update submodule cudf to 21c90d6a264ee4334084f513b34425d5fdd032f8 (#1644) Signed-off-by: spark-rapids automation <70000568+nvauto@users.noreply.github.com> --- thirdparty/cudf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/thirdparty/cudf b/thirdparty/cudf index ef11061911..21c90d6a26 160000 --- a/thirdparty/cudf +++ b/thirdparty/cudf @@ -1 +1 @@ -Subproject commit ef11061911aa9ef77cf615fea042a2bfa6f6cdea +Subproject commit 21c90d6a264ee4334084f513b34425d5fdd032f8 From 6f049a7bf3a2ec990766840f0b723d5e7d750efb Mon Sep 17 00:00:00 2001 From: Jenkins Automation <70000568+nvauto@users.noreply.github.com> Date: Wed, 13 Dec 2023 12:04:11 +0800 Subject: [PATCH 055/127] Update submodule cudf to 06984380b724d30565c8da40c6512ea62ba4a64f (#1645) Signed-off-by: spark-rapids automation <70000568+nvauto@users.noreply.github.com> --- thirdparty/cudf | 
2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/thirdparty/cudf b/thirdparty/cudf
index 21c90d6a26..06984380b7 160000
--- a/thirdparty/cudf
+++ b/thirdparty/cudf
@@ -1 +1 @@
-Subproject commit 21c90d6a264ee4334084f513b34425d5fdd032f8
+Subproject commit 06984380b724d30565c8da40c6512ea62ba4a64f

From 7340b17448ca140a996de5b4160de793db6626e1 Mon Sep 17 00:00:00 2001
From: Jenkins Automation <70000568+nvauto@users.noreply.github.com>
Date: Wed, 13 Dec 2023 18:04:22 +0800
Subject: [PATCH 056/127] Update submodule cudf to
 420dc5d787d4571c00266364f1a253e5ccffb094 (#1646)

Signed-off-by: spark-rapids automation <70000568+nvauto@users.noreply.github.com>
---
 thirdparty/cudf | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/thirdparty/cudf b/thirdparty/cudf
index 06984380b7..420dc5d787 160000
--- a/thirdparty/cudf
+++ b/thirdparty/cudf
@@ -1 +1 @@
-Subproject commit 06984380b724d30565c8da40c6512ea62ba4a64f
+Subproject commit 420dc5d787d4571c00266364f1a253e5ccffb094

From 6fc60a7ef8f7f480b218390e6209d7c203d713fc Mon Sep 17 00:00:00 2001
From: Jenkins Automation <70000568+nvauto@users.noreply.github.com>
Date: Thu, 14 Dec 2023 00:03:43 +0800
Subject: [PATCH 057/127] Update submodule cudf to
 a894ca03b18bd0304180f97882ccaaffa18028a0 (#1647)

Signed-off-by: spark-rapids automation <70000568+nvauto@users.noreply.github.com>
---
 thirdparty/cudf | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/thirdparty/cudf b/thirdparty/cudf
index 420dc5d787..a894ca03b1 160000
--- a/thirdparty/cudf
+++ b/thirdparty/cudf
@@ -1 +1 @@
-Subproject commit 420dc5d787d4571c00266364f1a253e5ccffb094
+Subproject commit a894ca03b18bd0304180f97882ccaaffa18028a0

From 5466b8fc6c07280a531519ed7de3584176f130e4 Mon Sep 17 00:00:00 2001
From: Gera Shegalov
Date: Wed, 13 Dec 2023 10:02:32 -0800
Subject: [PATCH 058/127] Make OOM injections separable by host & device
 (#1637)

This PR makes it possible to inject:
- host OOM only
- device OOM only
- in addition to the current mixed OOM injections

It also enables deferring OOM to the N+1st allocation by skipping the first N.

---------

Signed-off-by: Gera Shegalov
---
 src/main/cpp/src/SparkResourceAdaptorJni.cpp  | 108 +++++++++++++-----
 .../com/nvidia/spark/rapids/jni/RmmSpark.java |  29 ++++-
 .../rapids/jni/SparkResourceAdaptor.java      |  27 ++++-
 3 files changed, 123 insertions(+), 41 deletions(-)

diff --git a/src/main/cpp/src/SparkResourceAdaptorJni.cpp b/src/main/cpp/src/SparkResourceAdaptorJni.cpp
index d3821fcc18..b8fb337bf2 100644
--- a/src/main/cpp/src/SparkResourceAdaptorJni.cpp
+++ b/src/main/cpp/src/SparkResourceAdaptorJni.cpp
@@ -226,6 +226,36 @@ struct task_metrics {
   }
 };
 
+enum class oom_type {
+  CPU_OR_GPU = 0,
+  CPU,
+  GPU,
+};
+
+struct oom_state_type {
+  int hit_count   = 0;
+  int skip_count  = 0;
+  oom_type filter = oom_type::CPU_OR_GPU;
+
+  void init(int const num_ooms, int const skip_count, int const oom_type_id)
+  {
+    if (num_ooms < 0) { throw std::invalid_argument("num_ooms cannot be negative"); }
+    if (skip_count < 0) { throw std::invalid_argument("skip_count cannot be negative"); }
+    if (oom_type_id < 0 || oom_type_id > 2) {
+      throw std::invalid_argument("oom_filter must be between 0 and 2");
+    }
+    this->hit_count  = num_ooms;
+    this->skip_count = skip_count;
+    this->filter     = static_cast<oom_type>(oom_type_id);
+  }
+
+  bool matches(bool is_for_cpu)
+  {
+    return filter == oom_type::CPU_OR_GPU || (is_for_cpu && filter == oom_type::CPU) ||
+           ((!is_for_cpu) && filter == oom_type::GPU);
+  }
+};
+
 /**
 * This is the full state of a thread. Some things like the thread_id and task_id
 * should not change after the state is set up. Everything else is up for change,
@@ -249,10 +279,12 @@ class full_thread_state {
   std::unordered_set<long> pool_task_ids;
   bool is_cpu_alloc = false;
   // Is the thread transitively blocked on a pool or not.
-  bool pool_blocked = false;
-  int retry_oom_injected = 0;
-  int split_and_retry_oom_injected = 0;
-  int cudf_exception_injected = 0;
+  bool pool_blocked = false;
+
+  oom_state_type retry_oom;
+  oom_state_type split_and_retry_oom;
+
+  int cudf_exception_injected = 0;
   // watchdog limit on maximum number of retries to avoid unexpected live lock situations
   int num_times_retried = 0;
   // When did the retry time for this thread start, or when did the block time end.
@@ -663,12 +695,15 @@ class spark_resource_adaptor final : public rmm::mr::device_memory_resource {
    * Force a specific thread to throw one or more RetryOOM exceptions when an
    * alloc is called. This is intended only for testing.
    */
-  void force_retry_oom(long const thread_id, int const num_ooms)
+  void force_retry_oom(long const thread_id,
+                       int const num_ooms,
+                       int const oom_filter,
+                       int const skip_count)
   {
     std::unique_lock<std::mutex> lock(state_mutex);
     auto const threads_at = threads.find(thread_id);
     if (threads_at != threads.end()) {
-      threads_at->second.retry_oom_injected = num_ooms;
+      threads_at->second.retry_oom.init(num_ooms, skip_count, oom_filter);
     } else {
       throw std::invalid_argument("the thread is not associated with any task/shuffle");
     }
@@ -678,12 +713,15 @@ class spark_resource_adaptor final : public rmm::mr::device_memory_resource {
    * Force a specific thread to throw one or more SplitAndRetryOOM exceptions
    * when an alloc is called. This is intended only for testing.
    */
-  void force_split_and_retry_oom(long const thread_id, int const num_ooms)
+  void force_split_and_retry_oom(long const thread_id,
+                                 int const num_ooms,
+                                 int const oom_filter,
+                                 int const skip_count)
   {
     std::unique_lock<std::mutex> lock(state_mutex);
     auto const threads_at = threads.find(thread_id);
     if (threads_at != threads.end()) {
-      threads_at->second.split_and_retry_oom_injected = num_ooms;
+      threads_at->second.split_and_retry_oom.init(num_ooms, skip_count, oom_filter);
     } else {
       throw std::invalid_argument("the thread is not associated with any task/shuffle");
     }
@@ -1228,15 +1266,18 @@ class spark_resource_adaptor final : public rmm::mr::device_memory_resource {
       default: break;
     }
 
-    if (thread->second.retry_oom_injected > 0) {
-      thread->second.retry_oom_injected--;
-      thread->second.metrics.num_times_retry_throw++;
-      log_status("INJECTED_RETRY_OOM", thread_id, thread->second.task_id, thread->second.state);
-      thread->second.record_failed_retry_time();
-      if (is_for_cpu) {
-        throw_java_exception(CPU_RETRY_OOM_CLASS, "injected RetryOOM");
-      } else {
-        throw_java_exception(GPU_RETRY_OOM_CLASS, "injected RetryOOM");
+    if (thread->second.retry_oom.matches(is_for_cpu)) {
+      if (thread->second.retry_oom.skip_count > 0) {
+        thread->second.retry_oom.skip_count--;
+      } else if (thread->second.retry_oom.hit_count > 0) {
+        thread->second.retry_oom.hit_count--;
+        thread->second.metrics.num_times_retry_throw++;
+        std::string const op_prefix = "INJECTED_RETRY_OOM_";
+        std::string const op        = op_prefix + (is_for_cpu ? "CPU" : "GPU");
+        log_status(op, thread_id, thread->second.task_id, thread->second.state);
+        thread->second.record_failed_retry_time();
+        throw_java_exception(is_for_cpu ? CPU_RETRY_OOM_CLASS : GPU_RETRY_OOM_CLASS,
+                             "injected RetryOOM");
+      }
     }
 
@@ -1248,16 +1289,21 @@ class spark_resource_adaptor final : public rmm::mr::device_memory_resource {
       throw_java_exception(cudf::jni::CUDF_ERROR_CLASS, "injected CudfException");
     }
 
-    if (thread->second.split_and_retry_oom_injected > 0) {
-      thread->second.split_and_retry_oom_injected--;
-      thread->second.metrics.num_times_split_retry_throw++;
-      log_status(
-        "INJECTED_SPLIT_AND_RETRY_OOM", thread_id, thread->second.task_id, thread->second.state);
-      thread->second.record_failed_retry_time();
-      if (is_for_cpu) {
-        throw_java_exception(CPU_SPLIT_AND_RETRY_OOM_CLASS, "injected SplitAndRetryOOM");
-      } else {
-        throw_java_exception(GPU_SPLIT_AND_RETRY_OOM_CLASS, "injected SplitAndRetryOOM");
+    if (thread->second.split_and_retry_oom.matches(is_for_cpu)) {
+      if (thread->second.split_and_retry_oom.skip_count > 0) {
+        thread->second.split_and_retry_oom.skip_count--;
+      } else if (thread->second.split_and_retry_oom.hit_count > 0) {
+        thread->second.split_and_retry_oom.hit_count--;
+        thread->second.metrics.num_times_split_retry_throw++;
+        std::string const op_prefix = "INJECTED_SPLIT_AND_RETRY_OOM_";
+        std::string const op        = op_prefix + (is_for_cpu ? "CPU" : "GPU");
+        log_status(op, thread_id, thread->second.task_id, thread->second.state);
+        thread->second.record_failed_retry_time();
+        if (is_for_cpu) {
+          throw_java_exception(CPU_SPLIT_AND_RETRY_OOM_CLASS, "injected SplitAndRetryOOM");
+        } else {
+          throw_java_exception(GPU_SPLIT_AND_RETRY_OOM_CLASS, "injected SplitAndRetryOOM");
+        }
       }
     }
 
@@ -1927,25 +1973,25 @@ JNIEXPORT void JNICALL Java_com_nvidia_spark_rapids_jni_SparkResourceAdaptor_don
 }
 
 JNIEXPORT void JNICALL Java_com_nvidia_spark_rapids_jni_SparkResourceAdaptor_forceRetryOOM(
-  JNIEnv* env, jclass, jlong ptr, jlong thread_id, jint num_ooms)
+  JNIEnv* env, jclass, jlong ptr, jlong thread_id, jint num_ooms, jint oom_filter, jint skip_count)
 {
   JNI_NULL_CHECK(env, ptr, "resource_adaptor is null", );
   try {
     cudf::jni::auto_set_device(env);
     auto mr = reinterpret_cast<spark_resource_adaptor*>(ptr);
-    mr->force_retry_oom(thread_id, num_ooms);
+    mr->force_retry_oom(thread_id, num_ooms, oom_filter, skip_count);
   }
   CATCH_STD(env, )
 }
 
JNIEXPORT void JNICALL Java_com_nvidia_spark_rapids_jni_SparkResourceAdaptor_forceSplitAndRetryOOM(
-  JNIEnv* env, jclass, jlong ptr, jlong thread_id, jint num_ooms)
+  JNIEnv* env, jclass, jlong ptr, jlong thread_id, jint num_ooms, jint oom_filter, jint skip_count)
 {
   JNI_NULL_CHECK(env, ptr, "resource_adaptor is null", );
   try {
     cudf::jni::auto_set_device(env);
     auto mr = reinterpret_cast<spark_resource_adaptor*>(ptr);
-    mr->force_split_and_retry_oom(thread_id, num_ooms);
+    mr->force_split_and_retry_oom(thread_id, num_ooms, oom_filter, skip_count);
   }
   CATCH_STD(env, )
 }

diff --git a/src/main/java/com/nvidia/spark/rapids/jni/RmmSpark.java b/src/main/java/com/nvidia/spark/rapids/jni/RmmSpark.java
index 558124e2fe..e171894601 100644
--- a/src/main/java/com/nvidia/spark/rapids/jni/RmmSpark.java
+++ b/src/main/java/com/nvidia/spark/rapids/jni/RmmSpark.java
@@ -23,11 +23,20 @@
 import ai.rapids.cudf.RmmException;
 import ai.rapids.cudf.RmmTrackingResourceAdaptor;
 
+import java.util.Arrays;
+import java.util.Map;
+
 /**
  * Initialize RMM in ways that are specific to Spark.
  */
 public class RmmSpark {
 
+  public enum OomInjectionType {
+    CPU_OR_GPU,
+    CPU,
+    GPU;
+  }
+
   private static volatile SparkResourceAdaptor sra = null;
 
   /**
@@ -432,17 +441,23 @@ public static void forceRetryOOM(long threadId) {
    * allocation attempt, depending on the type of allocation being done.
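+   * For example, {@code forceRetryOOM(threadId, 1, OomInjectionType.GPU.ordinal(), 2)} skips
+   * the first two GPU allocations on the thread and then injects a GpuRetryOOM on the third.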
   * @param threadId the ID of the thread to throw the exception (not java thread id).
   * @param numOOMs the number of times the *RetryOOM should be thrown
+   * @param oomMode the ordinal of the OomInjectionType used to filter which allocations match
+   * @param skipCount how many matching allocations to skip
   */
-  public static void forceRetryOOM(long threadId, int numOOMs) {
+  public static void forceRetryOOM(long threadId, int numOOMs, int oomMode, int skipCount) {
    synchronized (Rmm.class) {
      if (sra != null && sra.isOpen()) {
-        sra.forceRetryOOM(threadId, numOOMs);
+        sra.forceRetryOOM(threadId, numOOMs, oomMode, skipCount);
      } else {
        throw new IllegalStateException("RMM has not been configured for OOM injection");
      }
    }
  }
 
+  public static void forceRetryOOM(long threadId, int numOOMs) {
+    forceRetryOOM(threadId, numOOMs, OomInjectionType.CPU_OR_GPU.ordinal(), 0);
+  }
+
   /**
    * Force the thread with the given ID to throw a GpuSplitAndRetryOOM or CpuSplitAndRetryOOM
    * on their next allocation attempt, depending on the allocation being done.
    * @param threadId the ID of the thread to throw the exception (not java thread id).
@@ -457,17 +472,23 @@ public static void forceSplitAndRetryOOM(long threadId) {
    * on their next allocation attempt, depending on the allocation being done.
    * @param threadId the ID of the thread to throw the exception (not java thread id).
    * @param numOOMs the number of times the *SplitAndRetryOOM should be thrown
+   * @param oomMode the ordinal of the OomInjectionType used to filter which allocations match
+   * @param skipCount how many matching allocations to skip
   */
-  public static void forceSplitAndRetryOOM(long threadId, int numOOMs) {
+  public static void forceSplitAndRetryOOM(long threadId, int numOOMs, int oomMode, int skipCount) {
    synchronized (Rmm.class) {
      if (sra != null && sra.isOpen()) {
-        sra.forceSplitAndRetryOOM(threadId, numOOMs);
+        sra.forceSplitAndRetryOOM(threadId, numOOMs, oomMode, skipCount);
      } else {
        throw new IllegalStateException("RMM has not been configured for OOM injection");
      }
    }
  }
 
+  public static void forceSplitAndRetryOOM(long threadId, int numOOMs) {
+    forceSplitAndRetryOOM(threadId, numOOMs, OomInjectionType.CPU_OR_GPU.ordinal(), 0);
+  }
+
   /**
    * Force the thread with the given ID to throw a CudfException on their next allocation attempt.
    * This is to simulate a cuDF exception being thrown from a kernel and test retry handling code.

diff --git a/src/main/java/com/nvidia/spark/rapids/jni/SparkResourceAdaptor.java b/src/main/java/com/nvidia/spark/rapids/jni/SparkResourceAdaptor.java
index 74f1946748..d766c34230 100644
--- a/src/main/java/com/nvidia/spark/rapids/jni/SparkResourceAdaptor.java
+++ b/src/main/java/com/nvidia/spark/rapids/jni/SparkResourceAdaptor.java
@@ -15,6 +15,8 @@
  */
 package com.nvidia.spark.rapids.jni;
 
+import com.nvidia.spark.rapids.jni.RmmSpark.OomInjectionType;
+
 import ai.rapids.cudf.NativeDepsLoader;
 import ai.rapids.cudf.RmmDeviceMemoryResource;
 import ai.rapids.cudf.RmmEventHandlerResourceAdaptor;
@@ -186,18 +188,31 @@ public void doneWaitingOnPool(long threadId) {
    * Force the thread with the given ID to throw a GpuRetryOOM on their next allocation attempt.
    * @param threadId the ID of the thread to throw the exception (not java thread id).
* @param numOOMs the number of times the GpuRetryOOM should be thrown + * @param oomMode ordinal of the corresponding RmmSpark.OomInjectionType + * @param skipCount the number of times a matching allocation is skipped before injecting the first OOM */ - public void forceRetryOOM(long threadId, int numOOMs) { - forceRetryOOM(getHandle(), threadId, numOOMs); + public void forceRetryOOM(long threadId, int numOOMs, int oomMode, int skipCount) { + validateOOMInjectionParams(numOOMs, oomMode, skipCount); + forceRetryOOM(getHandle(), threadId, numOOMs, oomMode, skipCount); + } + + private void validateOOMInjectionParams(int numOOMs, int oomMode, int skipCount) { + assert numOOMs >= 0 : "non-negative numOoms expected: actual=" + numOOMs; + assert skipCount >= 0 : "non-negative skipCount expected: actual=" + skipCount; + assert oomMode >= 0 && oomMode < OomInjectionType.values().length: + "non-negative oomMode<" + OomInjectionType.values().length + " expected: actual=" + oomMode; } /** * Force the thread with the given ID to throw a GpuSplitAndRetryOOM on their next allocation attempt. * @param threadId the ID of the thread to throw the exception (not java thread id). * @param numOOMs the number of times the GpuSplitAndRetryOOM should be thrown + * @param oomMode ordinal of the corresponding RmmSpark.OomInjectionType + * @param skipCount the number of times a matching allocation is skipped before injecting the first OOM */ - public void forceSplitAndRetryOOM(long threadId, int numOOMs) { - forceSplitAndRetryOOM(getHandle(), threadId, numOOMs); + public void forceSplitAndRetryOOM(long threadId, int numOOMs, int oomMode, int skipCount) { + validateOOMInjectionParams(numOOMs, oomMode, skipCount); + forceSplitAndRetryOOM(getHandle(), threadId, numOOMs, oomMode, skipCount); } /** @@ -295,8 +310,8 @@ public void cpuDeallocate(long ptr, long amount) { private static native void submittingToPool(long handle, long threadId); private static native void waitingOnPool(long handle, long threadId); private static native void doneWaitingOnPool(long handle, long threadId); - private static native void forceRetryOOM(long handle, long threadId, int numOOMs); - private static native void forceSplitAndRetryOOM(long handle, long threadId, int numOOMs); + private static native void forceRetryOOM(long handle, long threadId, int numOOMs, int oomMode, int skipCount); + private static native void forceSplitAndRetryOOM(long handle, long threadId, int numOOMs, int oomMode, int skipCount); private static native void forceCudfException(long handle, long threadId, int numTimes); private static native void blockThreadUntilReady(long handle); private static native int getStateOf(long handle, long threadId); From 56081f7472ec498f8b54f80627b3bf1b8deb7b0d Mon Sep 17 00:00:00 2001 From: Jenkins Automation <70000568+nvauto@users.noreply.github.com> Date: Thu, 14 Dec 2023 05:25:53 +0800 Subject: [PATCH 059/127] Update submodule cudf to 8136a16701f970b512d2bd35a45606f00263fd89 (#1648) Signed-off-by: spark-rapids automation <70000568+nvauto@users.noreply.github.com> --- thirdparty/cudf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/thirdparty/cudf b/thirdparty/cudf index a894ca03b1..8136a16701 160000 --- a/thirdparty/cudf +++ b/thirdparty/cudf @@ -1 +1 @@ -Subproject commit a894ca03b18bd0304180f97882ccaaffa18028a0 +Subproject commit 8136a16701f970b512d2bd35a45606f00263fd89 From 4d2fc033a0b1636adb44edace5134cf462493f21 Mon Sep 17 00:00:00 2001 From: Jenkins Automation <70000568+nvauto@users.noreply.github.com> Date: Thu, 
14 Dec 2023 11:28:24 +0800
Subject: [PATCH 060/127] Update submodule cudf to
 cee642916cfc3b8df73e819bf3bc50f1b9fc684f (#1649)

Signed-off-by: spark-rapids automation <70000568+nvauto@users.noreply.github.com>
---
 thirdparty/cudf | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/thirdparty/cudf b/thirdparty/cudf
index 8136a16701..cee642916c 160000
--- a/thirdparty/cudf
+++ b/thirdparty/cudf
@@ -1 +1 @@
-Subproject commit 8136a16701f970b512d2bd35a45606f00263fd89
+Subproject commit cee642916cfc3b8df73e819bf3bc50f1b9fc684f

From 19dc0595dd1fa99005f9150ac315040dcd622df8 Mon Sep 17 00:00:00 2001
From: Jenkins Automation <70000568+nvauto@users.noreply.github.com>
Date: Fri, 15 Dec 2023 11:30:56 +0800
Subject: [PATCH 061/127] Update submodule cudf to
 2cb8f3da3a3dd539301f90dcfbccadaf06963fd2 (#1653)

Signed-off-by: spark-rapids automation <70000568+nvauto@users.noreply.github.com>
---
 thirdparty/cudf | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/thirdparty/cudf b/thirdparty/cudf
index cee642916c..2cb8f3da3a 160000
--- a/thirdparty/cudf
+++ b/thirdparty/cudf
@@ -1 +1 @@
-Subproject commit cee642916cfc3b8df73e819bf3bc50f1b9fc684f
+Subproject commit 2cb8f3da3a3dd539301f90dcfbccadaf06963fd2

From 6320bbe5eb4f19c8a4f7781abd97351f5e3db18a Mon Sep 17 00:00:00 2001
From: Raza Jafri
Date: Fri, 15 Dec 2023 18:47:50 +0100
Subject: [PATCH 062/127] Handle Decimal-128 Multiplication For Newer Spark
 Versions (#1623)

* Added another multiplication method for decimal 128

* Signing off

Signed-off-by: Raza Jafri

* addressed review comments

* fixed clang

* addressed review comments

* ran pre-commit

* removed pass-by reference

* possible reason for CI failure, as locally it still builds

* addressed review comments

* formatted Java code

---------

Signed-off-by: Raza Jafri
---
 src/main/cpp/src/DecimalUtilsJni.cpp          | 11 ++--
 src/main/cpp/src/decimal_utils.cu             | 51 +++++++++++--------
 src/main/cpp/src/decimal_utils.hpp            |  1 +
 .../nvidia/spark/rapids/jni/DecimalUtils.java | 35 +++++++++++--
 .../spark/rapids/jni/DecimalUtilsTest.java    | 12 +++++
 5 files changed, 82 insertions(+), 28 deletions(-)

diff --git a/src/main/cpp/src/DecimalUtilsJni.cpp b/src/main/cpp/src/DecimalUtilsJni.cpp
index f732276817..6c7c1cc781 100644
--- a/src/main/cpp/src/DecimalUtilsJni.cpp
+++ b/src/main/cpp/src/DecimalUtilsJni.cpp
@@ -19,8 +19,13 @@
 
 extern "C" {
 
-JNIEXPORT jlongArray JNICALL Java_com_nvidia_spark_rapids_jni_DecimalUtils_multiply128(
-  JNIEnv* env, jclass, jlong j_view_a, jlong j_view_b, jint j_product_scale)
+JNIEXPORT jlongArray JNICALL
+Java_com_nvidia_spark_rapids_jni_DecimalUtils_multiply128(JNIEnv* env,
+                                                          jclass,
+                                                          jlong j_view_a,
+                                                          jlong j_view_b,
+                                                          jint j_product_scale,
+                                                          bool cast_interim_result)
 {
   JNI_NULL_CHECK(env, j_view_a, "column is null", 0);
   JNI_NULL_CHECK(env, j_view_b, "column is null", 0);
@@ -30,7 +35,7 @@
     auto view_a = reinterpret_cast<cudf::column_view const*>(j_view_a);
     auto view_b = reinterpret_cast<cudf::column_view const*>(j_view_b);
     auto scale  = static_cast<int32_t>(j_product_scale);
     return cudf::jni::convert_table_for_return(
-      env, cudf::jni::multiply_decimal128(*view_a, *view_b, scale));
+      env, cudf::jni::multiply_decimal128(*view_a, *view_b, scale, cast_interim_result));
   }
   CATCH_STD(env, 0);
 }

diff --git a/src/main/cpp/src/decimal_utils.cu b/src/main/cpp/src/decimal_utils.cu
index 392fb495b4..92273ff545 100644
--- a/src/main/cpp/src/decimal_utils.cu
+++ b/src/main/cpp/src/decimal_utils.cu
@@ -657,14 +657,16 @@ struct dec128_multiplier {
   dec128_multiplier(bool* overflows,
                     cudf::mutable_column_view
const& product_view, cudf::column_view const& a_col, - cudf::column_view const& b_col) + cudf::column_view const& b_col, + bool const cast_interim_result) : overflows(overflows), a_data(a_col.data<__int128_t>()), b_data(b_col.data<__int128_t>()), product_data(product_view.data<__int128_t>()), a_scale(a_col.type().scale()), b_scale(b_col.type().scale()), - prod_scale(product_view.type().scale()) + prod_scale(product_view.type().scale()), + cast_interim_result(cast_interim_result) { } @@ -675,22 +677,24 @@ struct dec128_multiplier { chunked256 product = multiply(a, b); - // Spark does some really odd things that I personally think are a bug - // https://issues.apache.org/jira/browse/SPARK-40129 - // But to match Spark we need to first round the result to a precision of 38 - // and this is specific to the value in the result of the multiply. - // Then we need to round the result to the final scale that we care about. - int dec_precision = precision10(product); - int first_div_precision = dec_precision - 38; - - int mult_scale = a_scale + b_scale; - if (first_div_precision > 0) { - auto const first_div_scale_divisor = pow_ten(first_div_precision).as_128_bits(); - product = divide_and_round(product, first_div_scale_divisor); - - // a_scale and b_scale are negative. first_div_precision is not - mult_scale = a_scale + b_scale + first_div_precision; - } + int const mult_scale = [&]() { + // According to https://issues.apache.org/jira/browse/SPARK-40129 + // and https://issues.apache.org/jira/browse/SPARK-45786, Spark has a bug in + // versions 3.2.4, 3.3.3, 3.4.1, 3.5.0 and 4.0.0 The bug is fixed for later versions but to + // match the legacy behavior we need to first round the result to a precision of 38 then we + // need to round the result to the final scale that we care about. + if (cast_interim_result) { + auto const first_div_precision = precision10(product) - 38; + if (first_div_precision > 0) { + auto const first_div_scale_divisor = pow_ten(first_div_precision).as_128_bits(); + product = divide_and_round(product, first_div_scale_divisor); + + // a_scale and b_scale are negative. 
first_div_precision is not
+          return a_scale + b_scale + first_div_precision;
+        }
+      }
+      return a_scale + b_scale;
+    }();
 
     int exponent = prod_scale - mult_scale;
     if (exponent < 0) {
@@ -718,6 +722,7 @@ struct dec128_multiplier {
  private:
   // output column for overflow detected
   bool* const overflows;
+  bool const cast_interim_result;
 
   // input data for multiply
   __int128_t const* const a_data;
@@ -968,6 +973,7 @@ namespace cudf::jni {
 
 std::unique_ptr<cudf::table> multiply_decimal128(cudf::column_view const& a,
                                                  cudf::column_view const& b,
                                                  int32_t product_scale,
+                                                 bool const cast_interim_result,
                                                  rmm::cuda_stream_view stream)
 {
   CUDF_EXPECTS(a.type().id() == cudf::type_id::DECIMAL128, "not a DECIMAL128 column");
@@ -992,10 +998,11 @@ std::unique_ptr<cudf::table> multiply_decimal128(cudf::column_view const& a,
   auto overflows_view = columns[0]->mutable_view();
   auto product_view   = columns[1]->mutable_view();
   check_scale_divisor(a.type().scale() + b.type().scale(), product_scale);
-  thrust::for_each(rmm::exec_policy(stream),
-                   thrust::make_counting_iterator<cudf::size_type>(0),
-                   thrust::make_counting_iterator<cudf::size_type>(num_rows),
-                   dec128_multiplier(overflows_view.begin<bool>(), product_view, a, b));
+  thrust::for_each(
+    rmm::exec_policy(stream),
+    thrust::make_counting_iterator<cudf::size_type>(0),
+    thrust::make_counting_iterator<cudf::size_type>(num_rows),
+    dec128_multiplier(overflows_view.begin<bool>(), product_view, a, b, cast_interim_result));
   return std::make_unique<cudf::table>(std::move(columns));
 }

diff --git a/src/main/cpp/src/decimal_utils.hpp b/src/main/cpp/src/decimal_utils.hpp
index 95c6c56c3d..9793e63445 100644
--- a/src/main/cpp/src/decimal_utils.hpp
+++ b/src/main/cpp/src/decimal_utils.hpp
@@ -30,6 +30,7 @@ std::unique_ptr<cudf::table> multiply_decimal128(
   cudf::column_view const& a,
   cudf::column_view const& b,
   int32_t product_scale,
+  bool const cast_interim_result,
   rmm::cuda_stream_view stream = cudf::get_default_stream());
 
 std::unique_ptr<cudf::table> divide_decimal128(

diff --git a/src/main/java/com/nvidia/spark/rapids/jni/DecimalUtils.java b/src/main/java/com/nvidia/spark/rapids/jni/DecimalUtils.java
index 389679965a..17337691c5 100644
--- a/src/main/java/com/nvidia/spark/rapids/jni/DecimalUtils.java
+++ b/src/main/java/com/nvidia/spark/rapids/jni/DecimalUtils.java
@@ -25,21 +25,50 @@ public class DecimalUtils {
     NativeDepsLoader.loadNativeDeps();
   }
 
+  /**
+   * Multiply two DECIMAL128 columns together into a DECIMAL128 product rounded to the specified
+   * scale with overflow detection. This method considers a precision greater than 38 as overflow
+   * even if the number still fits in a 128-bit representation.
+   *
+   * WARNING: This method has a bug that we keep to match Spark versions before 3.4.2, 3.5.1,
+   * and 4.0.0. Consider the following example using Decimal with a precision of 38 and scale of 10:
+   * -8533444864753048107770677711.1312637916 * -12.0000000000 = 102401338377036577293248132533.575166
+   * while the actual answer based on Java BigDecimal is 102401338377036577293248132533.575165
+   *
+   * @param a factor input, must match row count of the other factor input
+   * @param b factor input, must match row count of the other factor input
+   * @param productScale scale to use for the product type
+   * @return table containing a boolean column and a DECIMAL128 product column of the specified
+   *         scale. The boolean value will be true if an overflow was detected for that row's
+   *         DECIMAL128 product value. A null input row will result in a corresponding null output
+   *         row.
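+   *
+   * This overload is equivalent to calling {@code multiply128(a, b, productScale, true)}.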
+ */ + public static Table multiply128(ColumnView a, ColumnView b, int productScale) { + return new Table(multiply128(a.getNativeView(), b.getNativeView(), productScale, true)); + } /** * Multiply two DECIMAL128 columns together into a DECIMAL128 product rounded to the specified * scale with overflow detection. This method considers a precision greater than 38 as overflow * even if the number still fits in a 128-bit representation. + * + * WARNING: With interimCast set to true, this method has a bug which we match with Spark versions before 3.4.2, + * 4.0.0, 3.5.1. Consider the following example using Decimal with a precision of 38 and scale of 10: + * -8533444864753048107770677711.1312637916 * -12.0000000000 = 102401338377036577293248132533.575166 + * while the actual answer based on Java BigDecimal is 102401338377036577293248132533.575165 + * * @param a factor input, must match row count of the other factor input * @param b factor input, must match row count of the other factor input * @param productScale scale to use for the product type + * @param interimCast whether to cast the result of the division to 38 precision before casting it again to the final + * precision * @return table containing a boolean column and a DECIMAL128 product column of the specified * scale. The boolean value will be true if an overflow was detected for that row's * DECIMAL128 product value. A null input row will result in a corresponding null output * row. */ - public static Table multiply128(ColumnView a, ColumnView b, int productScale) { - return new Table(multiply128(a.getNativeView(), b.getNativeView(), productScale)); + public static Table multiply128(ColumnView a, ColumnView b, int productScale, boolean interimCast) { + return new Table(multiply128(a.getNativeView(), b.getNativeView(), productScale, interimCast)); } /** @@ -148,7 +177,7 @@ public static Table add128(ColumnView a, ColumnView b, int targetScale) { return new Table(add128(a.getNativeView(), b.getNativeView(), targetScale)); } - private static native long[] multiply128(long viewA, long viewB, int productScale); + private static native long[] multiply128(long viewA, long viewB, int productScale, boolean interimCast); private static native long[] divide128(long viewA, long viewB, int quotientScale, boolean isIntegerDivide); diff --git a/src/test/java/com/nvidia/spark/rapids/jni/DecimalUtilsTest.java b/src/test/java/com/nvidia/spark/rapids/jni/DecimalUtilsTest.java index 4698855f31..7f3079e825 100644 --- a/src/test/java/com/nvidia/spark/rapids/jni/DecimalUtilsTest.java +++ b/src/test/java/com/nvidia/spark/rapids/jni/DecimalUtilsTest.java @@ -86,6 +86,18 @@ void simplePosMultiplyZeroByNegOne() { } } + @Test + void multiply128WithoutInterimCast() { + try (ColumnVector lhs = makeDec128Column("-8533444864753048107770677711.1312637916"); + ColumnVector rhs = makeDec128Column("-12.0000000000"); + ColumnVector expectedBasic = makeDec128Column("102401338377036577293248132533.575165"); + ColumnVector expectedValid = ColumnVector.fromBooleans(false); + Table found = DecimalUtils.multiply128(lhs, rhs, -6, false)) { + assertColumnsAreEqual(expectedValid, found.getColumn(0)); + assertColumnsAreEqual(expectedBasic, found.getColumn(1)); + } + } + @Test void largePosMultiplyTenByTen() { try (ColumnVector lhs = From c58aa0b4b9fa064bcb34772de81e70a599740bc4 Mon Sep 17 00:00:00 2001 From: Jenkins Automation <70000568+nvauto@users.noreply.github.com> Date: Sat, 16 Dec 2023 05:27:47 +0800 Subject: [PATCH 063/127] Update submodule cudf to 
0762fbea9100421e8a0a826fb3c5704c2a3f6a31 (#1657) Signed-off-by: spark-rapids automation <70000568+nvauto@users.noreply.github.com> --- thirdparty/cudf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/thirdparty/cudf b/thirdparty/cudf index 2cb8f3da3a..0762fbea91 160000 --- a/thirdparty/cudf +++ b/thirdparty/cudf @@ -1 +1 @@ -Subproject commit 2cb8f3da3a3dd539301f90dcfbccadaf06963fd2 +Subproject commit 0762fbea9100421e8a0a826fb3c5704c2a3f6a31 From a9842f9eda1943e6a907dfe37afea121974b9e5c Mon Sep 17 00:00:00 2001 From: Jenkins Automation <70000568+nvauto@users.noreply.github.com> Date: Sat, 16 Dec 2023 12:07:13 +0800 Subject: [PATCH 064/127] Update submodule cudf to 9c16d895f509e1d4e9710651e57e4cd29defbcce (#1658) Signed-off-by: spark-rapids automation <70000568+nvauto@users.noreply.github.com> --- thirdparty/cudf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/thirdparty/cudf b/thirdparty/cudf index 0762fbea91..9c16d895f5 160000 --- a/thirdparty/cudf +++ b/thirdparty/cudf @@ -1 +1 @@ -Subproject commit 0762fbea9100421e8a0a826fb3c5704c2a3f6a31 +Subproject commit 9c16d895f509e1d4e9710651e57e4cd29defbcce From 6bdc68b1881a0bbaa399dfc79b20857b16b5dbde Mon Sep 17 00:00:00 2001 From: Jenkins Automation <70000568+nvauto@users.noreply.github.com> Date: Mon, 18 Dec 2023 23:29:28 +0800 Subject: [PATCH 065/127] Update submodule cudf to 8dca25c782bbe239ed6e9b6317cc3a01b15a2b42 (#1659) Signed-off-by: spark-rapids automation <70000568+nvauto@users.noreply.github.com> --- thirdparty/cudf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/thirdparty/cudf b/thirdparty/cudf index 9c16d895f5..8dca25c782 160000 --- a/thirdparty/cudf +++ b/thirdparty/cudf @@ -1 +1 @@ -Subproject commit 9c16d895f509e1d4e9710651e57e4cd29defbcce +Subproject commit 8dca25c782bbe239ed6e9b6317cc3a01b15a2b42 From 48d27360029bedc9002e468a68f51512bbb640d5 Mon Sep 17 00:00:00 2001 From: Haoyang Li Date: Tue, 19 Dec 2023 03:48:49 +0800 Subject: [PATCH 066/127] Adding format_float kernel (#1572) * wip Signed-off-by: Haoyang Li * wip Signed-off-by: Haoyang Li * Add float to string kernel Signed-off-by: Haoyang Li * Update src/main/cpp/src/cast_float_to_string.cu Co-authored-by: Mike Wilson * Update src/main/cpp/src/cast_float_to_string.cu Co-authored-by: Mike Wilson * address comments and use different precision for float Signed-off-by: Haoyang Li * a runnable format_number demo Signed-off-by: Haoyang Li * rewrite the solution with ryu Signed-off-by: Haoyang Li * update license Signed-off-by: Haoyang Li * clean up Signed-off-by: Haoyang Li * Split ftos_converter out Signed-off-by: Haoyang Li * clean up Signed-off-by: Haoyang Li * resolve cudf conflicts Signed-off-by: Haoyang Li * resolve cudf conflicts Signed-off-by: Haoyang Li * resolve cudf conflicts Signed-off-by: Haoyang Li * resolve cudf conflicts Signed-off-by: Haoyang Li * remove cudf changes Signed-off-by: Haoyang Li * remove cudf changes Signed-off-by: Haoyang Li * add ryu Signed-off-by: Haoyang Li * Add copyright and notice Signed-off-by: Haoyang Li * Fix copyrights and license Signed-off-by: Haoyang Li * cudf conflict resolve Signed-off-by: Haoyang Li * Add format_float kernel Signed-off-by: Haoyang Li * clean up Signed-off-by: Haoyang Li * Fixed two bugs Signed-off-by: Haoyang Li * Added a failed case back Signed-off-by: Haoyang Li * Refactor Signed-off-by: Haoyang Li * Handle d=0 case Signed-off-by: Haoyang Li * Add nv apache license to ftos_converter Signed-off-by: Haoyang Li * Add nv apache license to ftos_converter 
Signed-off-by: Haoyang Li * Fix an rounding bug Signed-off-by: Haoyang Li * Update src/main/cpp/src/ftos_converter.cu Co-authored-by: Jason Lowe * address some comments Signed-off-by: Haoyang Li * cudf conflict Signed-off-by: Haoyang Li * Update src/main/cpp/src/cast_float_to_string.cu Co-authored-by: Nghia Truong <7416935+ttnghia@users.noreply.github.com> * Make it runable again Signed-off-by: Haoyang Li * address some comments Signed-off-by: Haoyang Li * addressed comments Signed-off-by: Haoyang Li * Address comments Signed-off-by: Haoyang Li * clang format Signed-off-by: Haoyang Li * Address comments Signed-off-by: Haoyang Li * Address comments Signed-off-by: Haoyang Li * address comments Signed-off-by: Haoyang Li * fix build after upmerge Signed-off-by: Haoyang Li * move inf/nan replacement to kernel Signed-off-by: Haoyang Li * Apply suggestions from code review Co-authored-by: Mike Wilson Co-authored-by: Nghia Truong <7416935+ttnghia@users.noreply.github.com> * address comments Signed-off-by: Haoyang Li * Apply suggestions from code review Co-authored-by: Nghia Truong <7416935+ttnghia@users.noreply.github.com> * address comments Signed-off-by: Haoyang Li * cudf Signed-off-by: Haoyang Li * cudf Signed-off-by: Haoyang Li * format Signed-off-by: Haoyang Li * cudf reset Signed-off-by: Haoyang Li * Apply suggestions from code review Co-authored-by: Mike Wilson --------- Signed-off-by: Haoyang Li Co-authored-by: Mike Wilson Co-authored-by: Jason Lowe Co-authored-by: Nghia Truong <7416935+ttnghia@users.noreply.github.com> --- src/main/cpp/CMakeLists.txt | 1 + src/main/cpp/src/CastStringJni.cpp | 15 + src/main/cpp/src/cast_string.hpp | 6 + src/main/cpp/src/format_float.cu | 131 +++++ src/main/cpp/src/ftos_converter.cuh | 453 ++++++++++++++++-- src/main/cpp/tests/CMakeLists.txt | 3 + src/main/cpp/tests/cast_decimal_to_string.cpp | 3 +- src/main/cpp/tests/cast_string.cpp | 3 +- src/main/cpp/tests/format_float.cpp | 88 ++++ .../nvidia/spark/rapids/jni/CastStrings.java | 12 + 10 files changed, 677 insertions(+), 38 deletions(-) create mode 100644 src/main/cpp/src/format_float.cu create mode 100644 src/main/cpp/tests/format_float.cpp diff --git a/src/main/cpp/CMakeLists.txt b/src/main/cpp/CMakeLists.txt index 18c0cd12e8..fee3e60b8e 100644 --- a/src/main/cpp/CMakeLists.txt +++ b/src/main/cpp/CMakeLists.txt @@ -164,6 +164,7 @@ add_library( src/ZOrderJni.cpp src/bloom_filter.cu src/cast_decimal_to_string.cu + src/format_float.cu src/cast_float_to_string.cu src/cast_string.cu src/cast_string_to_float.cu diff --git a/src/main/cpp/src/CastStringJni.cpp b/src/main/cpp/src/CastStringJni.cpp index 933fc15e34..b7d898a0c8 100644 --- a/src/main/cpp/src/CastStringJni.cpp +++ b/src/main/cpp/src/CastStringJni.cpp @@ -125,6 +125,21 @@ JNIEXPORT jlong JNICALL Java_com_nvidia_spark_rapids_jni_CastStrings_fromFloat(J CATCH_CAST_EXCEPTION(env, 0); } +JNIEXPORT jlong JNICALL Java_com_nvidia_spark_rapids_jni_CastStrings_fromFloatWithFormat( + JNIEnv* env, jclass, jlong input_column, jint digits) +{ + JNI_NULL_CHECK(env, input_column, "input column is null", 0); + + try { + cudf::jni::auto_set_device(env); + + auto const& cv = *reinterpret_cast(input_column); + return cudf::jni::release_as_jlong( + spark_rapids_jni::format_float(cv, digits, cudf::get_default_stream())); + } + CATCH_CAST_EXCEPTION(env, 0); +} + JNIEXPORT jlong JNICALL Java_com_nvidia_spark_rapids_jni_CastStrings_fromDecimal(JNIEnv* env, jclass, jlong input_column) diff --git a/src/main/cpp/src/cast_string.hpp b/src/main/cpp/src/cast_string.hpp index 
c4f850b47f..43ec36e576 100644 --- a/src/main/cpp/src/cast_string.hpp +++ b/src/main/cpp/src/cast_string.hpp @@ -115,6 +115,12 @@ std::unique_ptr string_to_float( rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +std::unique_ptr format_float( + cudf::column_view const& input, + int const digits, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + std::unique_ptr float_to_string( cudf::column_view const& input, rmm::cuda_stream_view stream, diff --git a/src/main/cpp/src/format_float.cu b/src/main/cpp/src/format_float.cu new file mode 100644 index 0000000000..d9ecbe8206 --- /dev/null +++ b/src/main/cpp/src/format_float.cu @@ -0,0 +1,131 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "cast_string.hpp" +#include "ftos_converter.cuh" + +#include +#include +#include +#include +#include + +#include +#include + +namespace spark_rapids_jni { + +namespace detail { +namespace { + +template +struct format_float_fn { + cudf::column_device_view d_floats; + int digits; + cudf::size_type* d_offsets; + char* d_chars; + + __device__ cudf::size_type compute_output_size(FloatType const value) const + { + bool constexpr is_float = std::is_same_v; + return static_cast( + ftos_converter::compute_format_float_size(static_cast(value), digits, is_float)); + } + + __device__ void format_float(cudf::size_type const idx) const + { + auto const value = d_floats.element(idx); + bool constexpr is_float = std::is_same_v; + auto const output = d_chars + d_offsets[idx]; + ftos_converter::format_float(static_cast(value), digits, is_float, output); + } + + __device__ void operator()(cudf::size_type const idx) const + { + if (d_floats.is_null(idx)) { + if (d_chars == nullptr) { d_offsets[idx] = 0; } + return; + } + if (d_chars != nullptr) { + format_float(idx); + } else { + d_offsets[idx] = compute_output_size(d_floats.element(idx)); + } + } +}; + +/** + * @brief This dispatch method is for converting floats into strings. + * + * The template function declaration ensures only float types are allowed. 
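+ * Non-float input types are routed to the second overload below, which rejects the call
+ * with CUDF_FAIL.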
+ */ +struct dispatch_format_float_fn { + template )> + std::unique_ptr operator()(cudf::column_view const& floats, + int const digits, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) const + { + auto const strings_count = floats.size(); + if (strings_count == 0) { return cudf::make_empty_column(cudf::type_id::STRING); } + + auto const input_ptr = cudf::column_device_view::create(floats, stream); + + auto [offsets, chars] = cudf::strings::detail::make_strings_children( + format_float_fn{*input_ptr, digits}, strings_count, stream, mr); + + return cudf::make_strings_column(strings_count, + std::move(offsets), + std::move(chars), + floats.null_count(), + cudf::detail::copy_bitmask(floats, stream, mr)); + } + + // non-float types throw an exception + template )> + std::unique_ptr operator()(cudf::column_view const&, + int const, + rmm::cuda_stream_view, + rmm::mr::device_memory_resource*) const + { + CUDF_FAIL("Values for format_float function must be a float type."); + } +}; + +} // namespace + +// This will convert all float column types into a strings column. +std::unique_ptr format_float(cudf::column_view const& floats, + int const digits, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + return type_dispatcher(floats.type(), dispatch_format_float_fn{}, floats, digits, stream, mr); +} + +} // namespace detail + +// external API +std::unique_ptr format_float(cudf::column_view const& floats, + int const digits, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + CUDF_FUNC_RANGE(); + return detail::format_float(floats, digits, stream, mr); +} + +} // namespace spark_rapids_jni \ No newline at end of file diff --git a/src/main/cpp/src/ftos_converter.cuh b/src/main/cpp/src/ftos_converter.cuh index 444f790d3c..e684f73921 100644 --- a/src/main/cpp/src/ftos_converter.cuh +++ b/src/main/cpp/src/ftos_converter.cuh @@ -15,6 +15,8 @@ * limitations under the License. */ +#pragma once + #include #include #include @@ -116,34 +118,32 @@ __constant__ uint32_t const POW5_OFFSETS[21] = { constexpr uint32_t POW5_TABLE_SIZE = 26; -__constant__ uint64_t const DOUBLE_POW5_TABLE[POW5_TABLE_SIZE] = { - 1ull, - 5ull, - 25ull, - 125ull, - 625ull, - 3125ull, - 15625ull, - 78125ull, - 390625ull, - 1953125ull, - 9765625ull, - 48828125ull, - 244140625ull, - 1220703125ull, - 6103515625ull, - 30517578125ull, - 152587890625ull, - 762939453125ull, - 3814697265625ull, - 19073486328125ull, - 95367431640625ull, - 476837158203125ull, - 2384185791015625ull, - 11920928955078125ull, - 59604644775390625ull, - 298023223876953125ull //, 1490116119384765625ull -}; +__constant__ uint64_t const DOUBLE_POW5_TABLE[POW5_TABLE_SIZE] = {1ull, + 5ull, + 25ull, + 125ull, + 625ull, + 3125ull, + 15625ull, + 78125ull, + 390625ull, + 1953125ull, + 9765625ull, + 48828125ull, + 244140625ull, + 1220703125ull, + 6103515625ull, + 30517578125ull, + 152587890625ull, + 762939453125ull, + 3814697265625ull, + 19073486328125ull, + 95367431640625ull, + 476837158203125ull, + 2384185791015625ull, + 11920928955078125ull, + 59604644775390625ull, + 298023223876953125ull}; //===== common.h from ryu ===== @@ -1063,7 +1063,7 @@ __device__ inline bool d2d_small_int(uint64_t const ieeeMantissa, return true; } -__device__ floating_decimal_64 d2d(double f, bool& ieeeSign, bool& special) +__device__ inline floating_decimal_64 d2d(double f, bool& ieeeSign, bool& special) { // Step 1: Decode the floating-point number, and unify normalized and subnormal cases. 
uint64_t const bits = double_to_bits(f); @@ -1100,7 +1100,7 @@ __device__ floating_decimal_64 d2d(double f, bool& ieeeSign, bool& special) return v; } -__device__ int d2s_buffered_n(double f, char* result) +__device__ inline int d2s_buffered_n(double f, char* result) { bool sign = false, special = false; floating_decimal_64 v = d2d(f, sign, special); @@ -1108,7 +1108,7 @@ __device__ int d2s_buffered_n(double f, char* result) return to_chars(v, sign, result); } -__device__ floating_decimal_32 f2d(float f, bool& ieeeSign, bool& special) +__device__ inline floating_decimal_32 f2d(float f, bool& ieeeSign, bool& special) { // Step 1: Decode the floating-point number, and unify normalized and subnormal cases. uint32_t const bits = float_to_bits(f); @@ -1128,7 +1128,7 @@ __device__ floating_decimal_32 f2d(float f, bool& ieeeSign, bool& special) return f2d(ieeeMantissa, ieeeExponent); } -__device__ int f2s_buffered_n(float f, char* result) +__device__ inline int f2s_buffered_n(float f, char* result) { bool sign = false, special = false; floating_decimal_32 v = f2d(f, sign, special); @@ -1138,7 +1138,7 @@ __device__ int f2s_buffered_n(float f, char* result) //===== compute float to string size ===== -__device__ int compute_d2s_size(double value) +__device__ inline int compute_d2s_size(double value) { bool sign = false, special = false; floating_decimal_64 v = d2d(value, sign, special); @@ -1146,7 +1146,7 @@ __device__ int compute_d2s_size(double value) return d2s_size(v, sign); } -__device__ int compute_f2s_size(float value) +__device__ inline int compute_f2s_size(float value) { bool sign = false, special = false; floating_decimal_32 v = f2d(value, sign, special); @@ -1158,7 +1158,7 @@ __device__ int compute_f2s_size(float value) //===== APIs ===== -__device__ int compute_ftos_size(double value, bool is_float) +__device__ inline int compute_ftos_size(double value, bool is_float) { if (is_float) { return compute_f2s_size(value); @@ -1167,7 +1167,7 @@ __device__ int compute_ftos_size(double value, bool is_float) } } -__device__ int float_to_string(double value, bool is_float, char* output) +__device__ inline int float_to_string(double value, bool is_float, char* output) { if (is_float) { return f2s_buffered_n(value, output); @@ -1176,4 +1176,385 @@ __device__ int float_to_string(double value, bool is_float, char* output) } } +//===== format float ===== + +__constant__ uint64_t const POW10_TABLE[19] = {1ull, + 10ull, + 100ull, + 1000ull, + 10000ull, + 100000ull, + 1000000ull, + 10000000ull, + 100000000ull, + 1000000000ull, + 10000000000ull, + 100000000000ull, + 1000000000000ull, + 10000000000000ull, + 100000000000000ull, + 1000000000000000ull, + 10000000000000000ull, + 100000000000000000ull}; + +template +__device__ inline T round_half_even(T const input, int const olength, int const digits) +{ + // "round" a integer to digits digits, with the half-even rounding mode. 
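+  // Example: rounding input = 12345 (olength = 5) to digits = 3 uses div = 10^2, giving
+  // num = 123 and mod = 45; since 45 < div / 2 the value stays 123. A tie (mod == div / 2)
+  // increments num only when num is odd, so ties always round toward an even result.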
+ if (digits > olength) { + T num = input; + for (int i = 0; i < digits - olength; i++) { + num *= 10; + } + return num; + } + T div = POW10_TABLE[olength - digits]; + T mod = input % div; + T num = input / div; + if (mod > (div / 2) || ((mod == (div / 2) && (num % 2 == 1) && mod != 0))) { num++; } + return num; +} + +__device__ inline int to_formated_chars(floating_decimal_64 const v, + bool const sign, + char* const result, + int digits) +{ + int index = 0; + if (sign) { result[index++] = '-'; } + uint64_t output = v.mantissa; + const uint32_t olength = decimalLength17(output); + int32_t exp = v.exponent + (int32_t)olength - 1; + if (exp < 0) { + // Decimal dot is before any of the digits. + int index_for_carrier = index; + result[index++] = '0'; + if (digits == 0) { return index; } + result[index++] = '.'; + int actural_round = digits; + for (int i = -1; i > exp; i--) { + index_for_carrier = index; + result[index++] = '0'; + actural_round--; + if (actural_round == 0) { + if (i != exp + 1) { return index; } // else, possible carry + break; + } + } + int actural_olength = fmin(int(olength), actural_round); + uint64_t rounded_output = round_half_even(output, olength, actural_round); + // check if carry + if (rounded_output >= POW10_TABLE[actural_olength]) { + result[index_for_carrier] = '1'; + rounded_output -= POW10_TABLE[actural_olength]; + } + int current = index; + for (int i = 0; i < actural_olength; i++) { + result[current + actural_olength - i - 1] = (char)('0' + rounded_output % 10); + rounded_output /= 10; + index++; + } + actural_round -= actural_olength; + if (actural_round > 0) { + for (int i = 0; i < actural_round; i++) { + result[index++] = '0'; + } + } + } else if (exp + 1 >= olength) { + // Decimal dot is after any of the digits. + int integer_len = index + exp + 1 + exp / 3; + int sep_cnt = 0; + int rev_index = 0; + for (int i = olength; i < exp + 1; i++) { + result[integer_len - (rev_index++) - 1] = '0'; + sep_cnt++; + if (sep_cnt == 3) { + result[integer_len - (rev_index++) - 1] = ','; + sep_cnt = 0; + } + } + for (int i = 0; i < olength; i++) { + if (sep_cnt == 3) { + result[integer_len - (rev_index++) - 1] = ','; + sep_cnt = 0; + } + result[integer_len - (rev_index++) - 1] = (char)('0' + output % 10); + sep_cnt++; + output /= 10; + } + index = integer_len; + if (digits == 0) { return index; } + result[index++] = '.'; + for (int i = 0; i < digits; i++) { + result[index++] = '0'; + } + } else { + uint32_t temp_d = digits, tailing_zero = 0; + if (exp + digits > olength) { + temp_d = olength - exp; + tailing_zero = digits - temp_d; + } + uint64_t rounded_output = round_half_even(output, olength, exp + temp_d + 1); + uint64_t pow10 = POW10_TABLE[temp_d]; + uint64_t integer = rounded_output / pow10; + uint64_t decimal = rounded_output % pow10; + // calculate integer length after format to cover carry case + uint32_t integer_len = decimalLength17(integer); + uint32_t formated_integer_len = index + integer_len + (integer_len - 1) / 3; + uint32_t sep_cnt = 0; + int rev_index = 0; + for (int i = 0; i < integer_len; i++) { + if (sep_cnt == 3) { + result[formated_integer_len - (rev_index++) - 1] = ','; + sep_cnt = 0; + } + result[formated_integer_len - (rev_index++) - 1] = (char)('0' + integer % 10); + sep_cnt++; + integer /= 10; + } + index = formated_integer_len; + if (digits == 0) { return index; } + result[index++] = '.'; + int current = index; + for (int i = 0; i < tailing_zero; i++) { + result[current + digits - i - 1] = '0'; + index++; + } + for (int i = tailing_zero; i < 
digits; i++) { + result[current + digits - i - 1] = (char)('0' + decimal % 10); + decimal /= 10; + index++; + } + } + return index; +} + +__device__ inline int format_float_size(floating_decimal_64 const v, bool const sign, int digits) +{ + int index = 0; + if (sign) { index++; } + uint64_t output = v.mantissa; + const uint32_t olength = decimalLength17(output); + int32_t exp = v.exponent + (int32_t)olength - 1; + if (exp < 0) { + index += 2 + digits; + } else if (exp + 1 >= olength) { + index += exp + 1 + exp / 3 + 1 + digits; + } else { + uint32_t temp_d = digits; + if (exp + digits > olength) { temp_d = olength - exp; } + uint64_t rounded_output = round_half_even(output, olength, exp + temp_d + 1); + uint64_t pow10 = POW10_TABLE[temp_d]; + uint64_t integer = rounded_output / pow10; + uint32_t integer_len = decimalLength17(integer); + index += integer_len + (integer_len - 1) / 3 + 1 + digits; + } + if (digits == 0) { index--; } + return index; +} + +__device__ inline int to_formated_chars(floating_decimal_32 const v, + bool const sign, + char* const result, + int digits) +{ + int index = 0; + if (sign) { result[index++] = '-'; } + uint32_t output = v.mantissa; + uint32_t const olength = decimalLength9(output); + int32_t exp = v.exponent + (int32_t)olength - 1; + if (exp < 0) { + // Decimal dot is before any of the digits. + int index_for_carrier = index; + result[index++] = '0'; + if (digits == 0) { return index; } + result[index++] = '.'; + int actural_round = digits; + for (int i = -1; i > exp; i--) { + index_for_carrier = index; + result[index++] = '0'; + actural_round--; + if (actural_round == 0) { + if (i != exp + 1) { return index; } // else, possible carry + break; + } + } + int actural_olength = fmin(int(olength), actural_round); + uint64_t rounded_output = round_half_even(output, olength, actural_round); + // check if carry + if (rounded_output >= POW10_TABLE[actural_olength]) { + result[index_for_carrier] = '1'; + rounded_output -= POW10_TABLE[actural_olength]; + } + int current = index; + for (int i = 0; i < actural_olength; i++) { + result[current + actural_olength - i - 1] = (char)('0' + rounded_output % 10); + rounded_output /= 10; + index++; + } + actural_round -= actural_olength; + if (actural_round > 0) { + for (int i = 0; i < actural_round; i++) { + result[index++] = '0'; + } + } + } else if (exp + 1 >= olength) { + // Decimal dot is after any of the digits. 
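+    // The integer part has exp + 1 digits; exp / 3 extra slots are reserved so a ','
+    // separator can be written after every third digit while the value is emitted in
+    // reverse below.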
+ int integer_len = index + exp + 1 + exp / 3; + int sep_cnt = 0; + int rev_index = 0; + for (int i = olength; i < exp + 1; i++) { + result[integer_len - (rev_index++) - 1] = '0'; + sep_cnt++; + if (sep_cnt == 3) { + result[integer_len - (rev_index++) - 1] = ','; + sep_cnt = 0; + } + } + for (int i = 0; i < olength; i++) { + if (sep_cnt == 3) { + result[integer_len - (rev_index++) - 1] = ','; + sep_cnt = 0; + } + result[integer_len - (rev_index++) - 1] = (char)('0' + output % 10); + sep_cnt++; + output /= 10; + } + index = integer_len; + if (digits == 0) { return index; } + result[index++] = '.'; + for (int i = 0; i < digits; i++) { + result[index++] = '0'; + } + } else { + uint32_t temp_d = digits, tailing_zero = 0; + if (exp + digits > olength) { + temp_d = olength - exp; + tailing_zero = digits - temp_d; + } + uint32_t rounded_output = round_half_even(output, olength, exp + temp_d + 1); + uint32_t pow10 = POW10_TABLE[temp_d]; + uint32_t integer = rounded_output / pow10; + uint32_t decimal = rounded_output % pow10; + // calculate integer length after format to cover carry case + uint32_t integer_len = decimalLength9(integer); + uint32_t formated_integer_len = index + integer_len + (integer_len - 1) / 3; + uint32_t sep_cnt = 0; + int rev_index = 0; + for (int i = 0; i < integer_len; i++) { + if (sep_cnt == 3) { + result[formated_integer_len - (rev_index++) - 1] = ','; + sep_cnt = 0; + } + result[formated_integer_len - (rev_index++) - 1] = (char)('0' + integer % 10); + sep_cnt++; + integer /= 10; + } + index = formated_integer_len; + if (digits == 0) { return index; } + result[index++] = '.'; + int current = index; + for (int i = 0; i < tailing_zero; i++) { + result[current + digits - i - 1] = '0'; + index++; + } + for (int i = tailing_zero; i < digits; i++) { + result[current + digits - i - 1] = (char)('0' + decimal % 10); + decimal /= 10; + index++; + } + } + return index; +} + +__device__ inline int format_float_size(floating_decimal_32 const v, bool const sign, int digits) +{ + int index = 0; + if (sign) { index++; } + uint64_t output = v.mantissa; + uint32_t const olength = decimalLength9(output); + int32_t exp = v.exponent + (int32_t)olength - 1; + if (exp < 0) { + index += 2 + digits; + } else if (exp + 1 >= olength) { + index += exp + 1 + exp / 3 + 1 + digits; + } else { + uint32_t temp_d = digits; + if (exp + digits > olength) { temp_d = olength - exp; } + uint64_t rounded_output = round_half_even(output, olength, exp + temp_d + 1); + uint64_t pow10 = POW10_TABLE[temp_d]; + uint64_t integer = rounded_output / pow10; + uint32_t integer_len = decimalLength9(integer); + index += integer_len + (integer_len - 1) / 3 + 1 + digits; + } + if (digits == 0) { index--; } + return index; +} + +__device__ inline int copy_format_special_str(char* const result, + bool const sign, + bool const exponent, + bool const mantissa, + int const digits = 1) +{ + if (mantissa) { + memcpy(result, "\xEF\xBF\xBD", 3); // U+FFFD, replacement character, NaN + return 3; + } + if (sign) { result[0] = '-'; } + if (exponent) { + memcpy(result + sign, "\xE2\x88\x9E", 3); // U+221E, infinity symbol + return sign + 3; + } + result[sign] = '0'; + if (digits == 0) { + return sign + 1; + } else { + result[sign + 1] = '.'; + } + for (int i = 0; i < digits; i++) { + result[sign + 2 + i] = '0'; + } + return sign + 2 + digits; +} + +__device__ inline int special_format_str_size(bool const sign, + bool const exponent, + bool const mantissa, + int const digits = 1) +{ + if (mantissa) { return 3; } + if (exponent) { return 
sign + 3; } + if (digits == 0) { return sign + 1; } + return sign + 2 + digits; +} + +__device__ inline int compute_format_float_size(double value, int digits, bool is_float) +{ + bool sign = false, special = false; + if (is_float) { + floating_decimal_32 v = f2d(value, sign, special); + if (special) { return special_format_str_size(sign, v.exponent, v.mantissa, digits); } + return format_float_size(v, sign, digits); + } else { + floating_decimal_64 v = d2d(value, sign, special); + if (special) { return special_format_str_size(sign, v.exponent, v.mantissa, digits); } + return format_float_size(v, sign, digits); + } +} + +__device__ inline int format_float(double value, int digits, bool is_float, char* output) +{ + bool sign = false, special = false; + if (is_float) { + floating_decimal_32 v = f2d(value, sign, special); + if (special) { return copy_format_special_str(output, sign, v.exponent, v.mantissa, digits); } + return to_formated_chars(v, sign, output, digits); + } else { + floating_decimal_64 v = d2d(value, sign, special); + if (special) { return copy_format_special_str(output, sign, v.exponent, v.mantissa, digits); } + return to_formated_chars(v, sign, output, digits); + } +} + } // namespace spark_rapids_jni::ftos_converter diff --git a/src/main/cpp/tests/CMakeLists.txt b/src/main/cpp/tests/CMakeLists.txt index c9bb13046f..b34b1b8b01 100644 --- a/src/main/cpp/tests/CMakeLists.txt +++ b/src/main/cpp/tests/CMakeLists.txt @@ -51,6 +51,9 @@ ConfigureTest(CAST_STRING ConfigureTest(CAST_DECIMAL_TO_STRING cast_decimal_to_string.cpp) +ConfigureTest(FORMAT_FLOAT + format_float.cpp) + ConfigureTest(CAST_FLOAT_TO_STRING cast_float_to_string.cpp) diff --git a/src/main/cpp/tests/cast_decimal_to_string.cpp b/src/main/cpp/tests/cast_decimal_to_string.cpp index 1a93354339..ba1aaf05c8 100644 --- a/src/main/cpp/tests/cast_decimal_to_string.cpp +++ b/src/main/cpp/tests/cast_decimal_to_string.cpp @@ -24,9 +24,10 @@ #include -#include #include +#include + using namespace cudf; template diff --git a/src/main/cpp/tests/cast_string.cpp b/src/main/cpp/tests/cast_string.cpp index c736d5971f..1f7aaaad21 100644 --- a/src/main/cpp/tests/cast_string.cpp +++ b/src/main/cpp/tests/cast_string.cpp @@ -24,9 +24,10 @@ #include -#include #include +#include + using namespace cudf; template diff --git a/src/main/cpp/tests/format_float.cpp b/src/main/cpp/tests/format_float.cpp new file mode 100644 index 0000000000..b9d77593db --- /dev/null +++ b/src/main/cpp/tests/format_float.cpp @@ -0,0 +1,88 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "cast_string.hpp" + +#include +#include + +#include + +using namespace cudf; + +constexpr cudf::test::debug_output_level verbosity{cudf::test::debug_output_level::FIRST_ERROR}; + +struct FormatFloatTests : public cudf::test::BaseFixture {}; + +TEST_F(FormatFloatTests, FormatFloats32) +{ + auto const floats = + cudf::test::fixed_width_column_wrapper{100.0f, + 654321.25f, + -12761.125f, + 0.0f, + 5.0f, + -4.0f, + std::numeric_limits::quiet_NaN(), + 123456789012.34f, + -0.0f}; + + auto const expected = cudf::test::strings_column_wrapper{"100.00000", + "654,321.25000", + "-12,761.12500", + "0.00000", + "5.00000", + "-4.00000", + "\xEF\xBF\xBD", + "123,456,790,000.00000", + "-0.00000"}; + + auto results = spark_rapids_jni::format_float(floats, 5, cudf::get_default_stream()); + + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected, verbosity); +} + +TEST_F(FormatFloatTests, FormatFloats64) +{ + auto const floats = + cudf::test::fixed_width_column_wrapper{100.0d, + 654321.25d, + -12761.125d, + 1.123456789123456789d, + 0.000000000000000000123456789123456789d, + 0.0d, + 5.0d, + -4.0d, + std::numeric_limits::quiet_NaN(), + 839542223232.794248339d, + -0.0d}; + + auto const expected = cudf::test::strings_column_wrapper{"100.00000", + "654,321.25000", + "-12,761.12500", + "1.12346", + "0.00000", + "0.00000", + "5.00000", + "-4.00000", + "\xEF\xBF\xBD", + "839,542,223,232.79420", + "-0.00000"}; + + auto results = spark_rapids_jni::format_float(floats, 5, cudf::get_default_stream()); + + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected, verbosity); +} \ No newline at end of file diff --git a/src/main/java/com/nvidia/spark/rapids/jni/CastStrings.java b/src/main/java/com/nvidia/spark/rapids/jni/CastStrings.java index 022cb93085..2b2267f034 100644 --- a/src/main/java/com/nvidia/spark/rapids/jni/CastStrings.java +++ b/src/main/java/com/nvidia/spark/rapids/jni/CastStrings.java @@ -80,6 +80,17 @@ public static ColumnVector toDecimal(ColumnView cv, boolean ansiMode, boolean st return new ColumnVector(toDecimal(cv.getNativeView(), ansiMode, strip, precision, scale)); } + /** + * Convert a float column to a formatted string column. + * + * @param cv the column data to process + * @param digits the number of digits to display after the decimal point + * @return the converted column + */ + public static ColumnVector fromFloatWithFormat(ColumnView cv, int digits) { + return new ColumnVector(fromFloatWithFormat(cv.getNativeView(), digits)); + } + /** * Convert a float column to a string column. 
* @@ -147,6 +158,7 @@ private static native long toDecimal(long nativeColumnView, boolean ansi_enabled int precision, int scale); private static native long toFloat(long nativeColumnView, boolean ansi_enabled, int dtype); private static native long fromDecimal(long nativeColumnView); + private static native long fromFloatWithFormat(long nativeColumnView, int digits); private static native long fromFloat(long nativeColumnView); private static native long toIntegersWithBase(long nativeColumnView, int base, boolean ansiEnabled, int dtype); From 9dffe324e52d10261a571be059edf14225e862bf Mon Sep 17 00:00:00 2001 From: Jenkins Automation <70000568+nvauto@users.noreply.github.com> Date: Tue, 19 Dec 2023 05:29:51 +0800 Subject: [PATCH 067/127] Update submodule cudf to 90cccef3e3070b0f03df75c49aca64517d5a4cfa (#1660) Signed-off-by: spark-rapids automation <70000568+nvauto@users.noreply.github.com> --- thirdparty/cudf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/thirdparty/cudf b/thirdparty/cudf index 8dca25c782..90cccef3e3 160000 --- a/thirdparty/cudf +++ b/thirdparty/cudf @@ -1 +1 @@ -Subproject commit 8dca25c782bbe239ed6e9b6317cc3a01b15a2b42 +Subproject commit 90cccef3e3070b0f03df75c49aca64517d5a4cfa From b15b8391acaa1d67c59259247026227d700d6a26 Mon Sep 17 00:00:00 2001 From: Jenkins Automation <70000568+nvauto@users.noreply.github.com> Date: Tue, 19 Dec 2023 11:25:17 +0800 Subject: [PATCH 068/127] Update submodule cudf to bb047a230a805476f3008abb031741f8995c6f1e (#1661) Signed-off-by: spark-rapids automation <70000568+nvauto@users.noreply.github.com> --- thirdparty/cudf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/thirdparty/cudf b/thirdparty/cudf index 90cccef3e3..bb047a230a 160000 --- a/thirdparty/cudf +++ b/thirdparty/cudf @@ -1 +1 @@ -Subproject commit 90cccef3e3070b0f03df75c49aca64517d5a4cfa +Subproject commit bb047a230a805476f3008abb031741f8995c6f1e From dadc7a091c1350a040aec25d565f6ba97e8e80ae Mon Sep 17 00:00:00 2001 From: Jenkins Automation <70000568+nvauto@users.noreply.github.com> Date: Tue, 19 Dec 2023 17:30:39 +0800 Subject: [PATCH 069/127] Update submodule cudf to 8b695e340355d43261800a1cff876369e916ae90 (#1663) Signed-off-by: spark-rapids automation <70000568+nvauto@users.noreply.github.com> --- thirdparty/cudf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/thirdparty/cudf b/thirdparty/cudf index bb047a230a..8b695e3403 160000 --- a/thirdparty/cudf +++ b/thirdparty/cudf @@ -1 +1 @@ -Subproject commit bb047a230a805476f3008abb031741f8995c6f1e +Subproject commit 8b695e340355d43261800a1cff876369e916ae90 From 98dc423dfbacb68e0d5d8d15069455aaffad618f Mon Sep 17 00:00:00 2001 From: Mike Wilson Date: Tue, 19 Dec 2023 13:38:25 -0500 Subject: [PATCH 070/127] Adding query support to parseURI (#1652) * Adding query to parseuri Signed-off-by: Mike Wilson --- src/main/cpp/src/ParseURIJni.cpp | 14 +++ src/main/cpp/src/parse_uri.cu | 87 +++++++++++++------ src/main/cpp/src/parse_uri.hpp | 15 +++- src/main/cpp/tests/parse_uri.cpp | 69 ++++++++++++--- .../com/nvidia/spark/rapids/jni/ParseURI.java | 13 ++- .../nvidia/spark/rapids/jni/ParseURITest.java | 79 +++++++++++++---- 6 files changed, 223 insertions(+), 54 deletions(-) diff --git a/src/main/cpp/src/ParseURIJni.cpp b/src/main/cpp/src/ParseURIJni.cpp index 9079d99b9d..3af72687b6 100644 --- a/src/main/cpp/src/ParseURIJni.cpp +++ b/src/main/cpp/src/ParseURIJni.cpp @@ -47,4 +47,18 @@ JNIEXPORT jlong JNICALL Java_com_nvidia_spark_rapids_jni_ParseURI_parseHost(JNIE } CATCH_STD(env, 0); } + 
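+
+// parseQuery below follows the same JNI pattern as parseHost above: validate the handle,
+// set the device, run the spark_rapids_jni parser, and release the resulting strings
+// column back to Java as a jlong.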
+JNIEXPORT jlong JNICALL Java_com_nvidia_spark_rapids_jni_ParseURI_parseQuery(JNIEnv* env, + jclass, + jlong input_column) +{ + JNI_NULL_CHECK(env, input_column, "input column is null", 0); + + try { + cudf::jni::auto_set_device(env); + auto const input = reinterpret_cast(input_column); + return cudf::jni::ptr_as_jlong(spark_rapids_jni::parse_uri_to_query(*input).release()); + } + CATCH_STD(env, 0); +} } diff --git a/src/main/cpp/src/parse_uri.cu b/src/main/cpp/src/parse_uri.cu index 13a8effb37..d75dfc18c1 100644 --- a/src/main/cpp/src/parse_uri.cu +++ b/src/main/cpp/src/parse_uri.cu @@ -22,6 +22,7 @@ #include #include #include +#include #include #include #include @@ -47,10 +48,20 @@ struct uri_parts { string_view userinfo; string_view port; string_view opaque; - bool valid{false}; + uint32_t valid{0}; }; -enum class URI_chunks : int8_t { PROTOCOL, HOST, AUTHORITY, PATH, QUERY, USERINFO }; +enum class URI_chunks : int8_t { + PROTOCOL, + HOST, + AUTHORITY, + PATH, + FRAGMENT, + QUERY, + USERINFO, + PORT, + OPAQUE +}; enum class chunk_validity : int8_t { VALID, INVALID, FATAL }; @@ -436,7 +447,7 @@ bool __device__ validate_path(string_view path) // path can be alphanum and @[]_-!.~'()*?/&,;:$+= return validate_chunk(path, [] __device__(string_view::const_iterator iter) { auto const c = *iter; - if (c != '!' && c != '$' && !(c >= '&' && c <= ';') && c != '=' && !(c >= '@' && c <= 'Z') && + if (c != '!' && c != '$' && !(c >= '&' && c <= ';') && c != '=' && !(c >= '?' && c <= 'Z') && c != '_' && !(c >= 'a' && c <= 'z') && c != '~') { return false; } @@ -474,6 +485,7 @@ uri_parts __device__ validate_uri(const char* str, int len) { uri_parts ret; + auto const original_str = str; // look for :/# characters. int col = -1; int slash = -1; @@ -503,9 +515,10 @@ uri_parts __device__ validate_uri(const char* str, int len) if (hash >= 0) { ret.fragment = {str + hash + 1, len - hash - 1}; if (!validate_fragment(ret.fragment)) { - ret.valid = false; + ret.valid = 0; return ret; } + ret.valid |= (1 << static_cast(URI_chunks::FRAGMENT)); len = hash; @@ -519,9 +532,10 @@ uri_parts __device__ validate_uri(const char* str, int len) // we have a scheme up to the : ret.scheme = {str, col}; if (!validate_scheme(ret.scheme)) { - ret.valid = false; + ret.valid = 0; return ret; } + ret.valid |= (1 << static_cast(URI_chunks::PROTOCOL)); // skip over scheme auto const skip = col + 1; @@ -534,20 +548,22 @@ uri_parts __device__ validate_uri(const char* str, int len) // no more string to parse is an error if (len <= 0) { - ret.valid = false; + ret.valid = 0; return ret; } - // If we have a '/' as the next character, we have a heirarchical uri. If not it is opaque. - bool const heirarchical = str[0] == '/'; + // If we have a '/' as the next character or this is still the start of the string, we have a + // heirarchical uri. If not it is opaque. + bool const heirarchical = str[0] == '/' || str == original_str; if (heirarchical) { // a '?' will break this into query and path/authority if (question >= 0) { ret.query = {str + question + 1, len - question - 1}; if (!validate_query(ret.query)) { - ret.valid = false; + ret.valid = 0; return ret; } + ret.valid |= (1 << static_cast(URI_chunks::QUERY)); } auto const path_len = question >= 0 ? question : len; @@ -567,17 +583,17 @@ uri_parts __device__ validate_uri(const char* str, int len) if (next_slash == -1 && ret.authority.size_bytes() == 0 && ret.query.size_bytes() == 0 && ret.fragment.size_bytes() == 0) { // invalid! 
- but spark like to return things as long as you don't have illegal characters - // ret.valid = false; - ret.valid = true; + // ret.valid = 0; return ret; } if (ret.authority.size_bytes() > 0) { auto ipv6_address = ret.authority.size_bytes() > 2 && *ret.authority.begin() == '['; if (!validate_authority(ret.authority, ipv6_address)) { - ret.valid = false; + ret.valid = 0; return ret; } + ret.valid |= (1 << static_cast(URI_chunks::AUTHORITY)); // Inspect the authority for userinfo, host, and port const char* auth = ret.authority.data(); @@ -604,9 +620,11 @@ uri_parts __device__ validate_uri(const char* str, int len) if (amp > 0) { ret.userinfo = {auth, amp}; if (!validate_userinfo(ret.userinfo)) { - ret.valid = false; + ret.valid = 0; return ret; } + ret.valid |= (1 << static_cast(URI_chunks::USERINFO)); + // skip over the @ amp++; @@ -617,36 +635,39 @@ uri_parts __device__ validate_uri(const char* str, int len) // Found a port, attempt to parse it ret.port = {auth + last_colon + 1, auth_size - last_colon - 1}; if (!validate_port(ret.port)) { - ret.valid = false; + ret.valid = 0; return ret; } + ret.valid |= (1 << static_cast(URI_chunks::PORT)); ret.host = {auth, last_colon}; } else { ret.host = {auth, auth_size}; } auto host_ret = validate_host(ret.host); switch (host_ret) { - case chunk_validity::FATAL: ret.valid = false; return ret; + case chunk_validity::FATAL: ret.valid = 0; return ret; case chunk_validity::INVALID: ret.host = {}; break; + case chunk_validity::VALID: ret.valid |= (1 << static_cast(URI_chunks::HOST)); break; } } } else { // path with no authority - ret.path = {str, len}; + ret.path = {str, path_len}; } if (!validate_path(ret.path)) { - ret.valid = false; + ret.valid = 0; return ret; } + ret.valid |= (1 << static_cast(URI_chunks::PATH)); } else { ret.opaque = {str, len}; if (!validate_opaque(ret.opaque)) { - ret.valid = false; + ret.valid = 0; return ret; } + ret.valid |= (1 << static_cast(URI_chunks::OPAQUE)); } - ret.valid = true; return ret; } @@ -697,7 +718,7 @@ __global__ void parse_uri_char_counter(column_device_view const in_strings, auto const string_length = in_string.size_bytes(); auto const uri = validate_uri(in_chars, string_length); - if (!uri.valid) { + if ((uri.valid & (1 << static_cast(chunk))) == 0) { out_lengths[row_idx] = 0; clear_bit(out_validity, row_idx); } else { @@ -727,11 +748,18 @@ __global__ void parse_uri_char_counter(column_device_view const in_strings, out_lengths[row_idx] = uri.userinfo.size_bytes(); out_offsets[row_idx] = uri.userinfo.data() - base_ptr; break; - } - - if (out_lengths[row_idx] == 0) { - // A URI can be valid, but still have no data for a specific chunk - clear_bit(out_validity, row_idx); + case URI_chunks::PORT: + out_lengths[row_idx] = uri.port.size_bytes(); + out_offsets[row_idx] = uri.port.data() - base_ptr; + break; + case URI_chunks::FRAGMENT: + out_lengths[row_idx] = uri.fragment.size_bytes(); + out_offsets[row_idx] = uri.fragment.data() - base_ptr; + break; + case URI_chunks::OPAQUE: + out_lengths[row_idx] = uri.opaque.size_bytes(); + out_offsets[row_idx] = uri.opaque.data() - base_ptr; + break; } } } @@ -858,4 +886,13 @@ std::unique_ptr parse_uri_to_host(strings_column_view const& input, return detail::parse_uri(input, detail::URI_chunks::HOST, stream, mr); } +std::unique_ptr parse_uri_to_query(strings_column_view const& input, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + CUDF_FUNC_RANGE(); + return detail::parse_uri( + input, detail::URI_chunks::QUERY, stream, 
rmm::mr::get_current_device_resource()); +} + } // namespace spark_rapids_jni \ No newline at end of file diff --git a/src/main/cpp/src/parse_uri.hpp b/src/main/cpp/src/parse_uri.hpp index 0a76cec1b4..07f6f9cd46 100644 --- a/src/main/cpp/src/parse_uri.hpp +++ b/src/main/cpp/src/parse_uri.hpp @@ -49,7 +49,20 @@ std::unique_ptr parse_uri_to_protocol( */ std::unique_ptr parse_uri_to_host( cudf::strings_column_view const& input, - rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + +/** + * @brief Parse query and copy from the input string column to the output char buffer. + * + * @param input Input string column of URIs to parse + * @param stream Stream on which to operate. + * @param mr Memory resource for returned column + * @return std::unique_ptr String column of queries parsed. + */ +std::unique_ptr parse_uri_to_query( + cudf::strings_column_view const& input, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); } // namespace spark_rapids_jni diff --git a/src/main/cpp/tests/parse_uri.cpp b/src/main/cpp/tests/parse_uri.cpp index 1112fea232..36ebbeacc0 100644 --- a/src/main/cpp/tests/parse_uri.cpp +++ b/src/main/cpp/tests/parse_uri.cpp @@ -19,10 +19,12 @@ #include #include #include +#include #include struct ParseURIProtocolTests : public cudf::test::BaseFixture {}; struct ParseURIHostTests : public cudf::test::BaseFixture {}; +struct ParseURIQueryTests : public cudf::test::BaseFixture {}; enum class test_types { SIMPLE, @@ -30,6 +32,7 @@ enum class test_types { IPv6, IPv4, UTF8, + QUERY, }; namespace { @@ -123,6 +126,15 @@ cudf::test::strings_column_wrapper get_test_data(test_types t) "http://✪↩d⁚f„⁈.ws/123", "https:// /path/to/file", }); + case test_types::QUERY: + return cudf::test::strings_column_wrapper({ + "https://www.nvidia.com/path?param0=1¶m2=3¶m4=5", + "https:// /?params=5&cloth=0&metal=1", + "https://[2001:db8::2:1]:443/parms/in/the/uri?a=b", + "https://[::1]/?invalid=param&f„⁈.=7", + "https://[::1]/?invalid=param&~.=!@&^", + "userinfo@www.nvidia.com/path?query=1#Ref", + }); default: CUDF_FAIL("Test type unsupported!"); return cudf::test::strings_column_wrapper(); } } @@ -136,7 +148,7 @@ TEST_F(ParseURIProtocolTests, Simple) cudf::test::strings_column_wrapper const expected( {"https", "http", "file", "smb", "http", "file", "", "", ""}, {1, 1, 1, 1, 1, 1, 0, 0, 0}); - CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(result->view(), expected); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected, result->view()); } TEST_F(ParseURIProtocolTests, SparkEdges) @@ -185,7 +197,7 @@ TEST_F(ParseURIProtocolTests, SparkEdges) {1, 1, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1}); - CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(result->view(), expected); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected, result->view()); } TEST_F(ParseURIProtocolTests, IP6) @@ -197,7 +209,7 @@ TEST_F(ParseURIProtocolTests, IP6) {"https", "https", "https", "https", "http", "https", "https", "https", "", ""}, {1, 1, 1, 1, 1, 1, 1, 1, 0, 0}); - CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(result->view(), expected); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected, result->view()); } TEST_F(ParseURIProtocolTests, IP4) @@ -208,7 +220,7 @@ TEST_F(ParseURIProtocolTests, IP4) cudf::test::strings_column_wrapper const expected( {"https", "https", "https", "https", 
"https", "https"}); - CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(result->view(), expected); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected, result->view()); } TEST_F(ParseURIProtocolTests, UTF8) @@ -218,7 +230,7 @@ TEST_F(ParseURIProtocolTests, UTF8) cudf::test::strings_column_wrapper const expected({"https", "http", "http", ""}, {1, 1, 1, 0}); - CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(result->view(), expected); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected, result->view()); } TEST_F(ParseURIHostTests, Simple) @@ -230,7 +242,7 @@ TEST_F(ParseURIHostTests, Simple) {"www.nvidia.com", "www.nvidia.com", "path", "network", "", "", "", "", ""}, {1, 1, 1, 1, 0, 0, 0, 0, 0}); - CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(result->view(), expected); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected, result->view()); } TEST_F(ParseURIHostTests, SparkEdges) @@ -279,7 +291,7 @@ TEST_F(ParseURIHostTests, SparkEdges) {1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0}); - CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(result->view(), expected); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected, result->view()); } TEST_F(ParseURIHostTests, IP6) @@ -299,7 +311,7 @@ TEST_F(ParseURIHostTests, IP6) ""}, {1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0}); - CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(result->view(), expected); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected, result->view()); } TEST_F(ParseURIHostTests, IP4) @@ -310,7 +322,7 @@ TEST_F(ParseURIHostTests, IP4) cudf::test::strings_column_wrapper const expected( {"192.168.1.100", "192.168.1.100", "", "", "", ""}, {1, 1, 0, 0, 0, 0}); - CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(result->view(), expected); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected, result->view()); } TEST_F(ParseURIHostTests, UTF8) @@ -320,5 +332,42 @@ TEST_F(ParseURIHostTests, UTF8) cudf::test::strings_column_wrapper const expected({"nvidia.com", "", "", ""}, {1, 0, 0, 0}); - CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(result->view(), expected); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected, result->view()); +} + +TEST_F(ParseURIQueryTests, Simple) +{ + auto const col = get_test_data(test_types::SIMPLE); + auto const result = spark_rapids_jni::parse_uri_to_query(cudf::strings_column_view{col}); + + cudf::test::strings_column_wrapper const expected({"param1=2", "", "", "", "", "", "", "", ""}, + {1, 0, 0, 0, 0, 0, 0, 0, 0}); + + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected, result->view()); +} + +TEST_F(ParseURIQueryTests, SparkEdges) +{ + auto const col = get_test_data(test_types::SPARK_EDGES); + auto const result = spark_rapids_jni::parse_uri_to_query(cudf::strings_column_view{col}); + + cudf::test::strings_column_wrapper const expected( + {"", "", "", "", "", "", "", "", "", "", "", "", "", "", "", + "", // empty + "?", "?/", "", "query;p2", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", ""}, + {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, + 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected, result->view()); +} + +TEST_F(ParseURIQueryTests, Queries) +{ + auto const col = get_test_data(test_types::QUERY); + auto const result = spark_rapids_jni::parse_uri_to_query(cudf::strings_column_view{col}); + + cudf::test::strings_column_wrapper const expected( + {"param0=1¶m2=3¶m4=5", "", "a=b", "invalid=param&f„⁈.=7", "", "query=1"}, + {1, 0, 1, 1, 0, 1}); + + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected, result->view()); } diff --git a/src/main/java/com/nvidia/spark/rapids/jni/ParseURI.java 
b/src/main/java/com/nvidia/spark/rapids/jni/ParseURI.java index 0e14f388d4..8f82bfc908 100644 --- a/src/main/java/com/nvidia/spark/rapids/jni/ParseURI.java +++ b/src/main/java/com/nvidia/spark/rapids/jni/ParseURI.java @@ -49,7 +49,18 @@ public static ColumnVector parseURIHost(ColumnView uriColumn) { return new ColumnVector(parseHost(uriColumn.getNativeView())); } + /** + * Parse query for each URI from the incoming column. + * + * @param URIColumn The input strings column in which each row contains a URI. + * @return A string column with query data extracted. + */ + public static ColumnVector parseURIQuery(ColumnView uriColumn) { + assert uriColumn.getType().equals(DType.STRING) : "Input type must be String"; + return new ColumnVector(parseQuery(uriColumn.getNativeView())); + } + private static native long parseProtocol(long jsonColumnHandle); private static native long parseHost(long jsonColumnHandle); - + private static native long parseQuery(long jsonColumnHandle); } diff --git a/src/test/java/com/nvidia/spark/rapids/jni/ParseURITest.java b/src/test/java/com/nvidia/spark/rapids/jni/ParseURITest.java index c6e3b06ed1..ca76df2bf3 100644 --- a/src/test/java/com/nvidia/spark/rapids/jni/ParseURITest.java +++ b/src/test/java/com/nvidia/spark/rapids/jni/ParseURITest.java @@ -25,9 +25,8 @@ import ai.rapids.cudf.ColumnVector; public class ParseURITest { - void buildExpectedAndRun(String[] testData) { + void testProtocol(String[] testData) { String[] expectedProtocolStrings = new String[testData.length]; - String[] expectedHostStrings = new String[testData.length]; for (int i=0; i Date: Thu, 21 Dec 2023 13:08:51 -0500 Subject: [PATCH 071/127] Use cuda::proclaim_return_type on device lambdas (#1662) * adding proclaim_return_type to device lambdas Signed-off-by: Mike Wilson * clang-format Signed-off-by: Mike Wilson * No cuda::proclaim_return_type on non-device lambda Signed-off-by: Mike Wilson * Adding Mithun's changes for CCCL 2 Signed-off-by: Mike Wilson * linting Signed-off-by: Mike Wilson * updating return type Signed-off-by: Mike Wilson * Update src/main/cpp/CMakeLists.txt Co-authored-by: Bradley Dice * Update jni Signed-off-by: Nghia Truong * Apply suggestions from code review * Fix styles Signed-off-by: Nghia Truong * linting Signed-off-by: Mike Wilson * Update submodule manually Signed-off-by: Nghia Truong * Fix header Signed-off-by: Nghia Truong --------- Signed-off-by: Mike Wilson Signed-off-by: Nghia Truong Co-authored-by: Bradley Dice Co-authored-by: Nghia Truong --- src/main/cpp/CMakeLists.txt | 8 +- src/main/cpp/benchmarks/row_conversion.cpp | 16 +- src/main/cpp/src/RowConversionJni.cpp | 15 +- src/main/cpp/src/bloom_filter.cu | 19 +- src/main/cpp/src/datetime_rebase.cu | 192 +- src/main/cpp/src/map_utils.cu | 245 +- src/main/cpp/src/murmur_hash.cu | 11 +- src/main/cpp/src/parse_uri.cu | 117 +- src/main/cpp/src/row_conversion.cu | 2595 -------------------- src/main/cpp/src/row_conversion.hpp | 53 - src/main/cpp/src/utilities.cu | 27 +- src/main/cpp/src/xxhash64.cu | 11 +- src/main/cpp/src/zorder.cu | 28 +- src/main/cpp/tests/CMakeLists.txt | 3 - src/main/cpp/tests/row_conversion.cpp | 1043 -------- thirdparty/cudf | 2 +- 16 files changed, 367 insertions(+), 4018 deletions(-) delete mode 100644 src/main/cpp/src/row_conversion.cu delete mode 100644 src/main/cpp/src/row_conversion.hpp delete mode 100644 src/main/cpp/tests/row_conversion.cpp diff --git a/src/main/cpp/CMakeLists.txt b/src/main/cpp/CMakeLists.txt index fee3e60b8e..1ad65687e2 100644 --- a/src/main/cpp/CMakeLists.txt +++ 
b/src/main/cpp/CMakeLists.txt @@ -94,11 +94,8 @@ include(cmake/Modules/ConfigureCUDA.cmake) # set other CUDA compilation flags # ################################################################################################## # * dependencies ---------------------------------------------------------------------------------- -# find libcu++ -include(${rapids-cmake-dir}/cpm/libcudacxx.cmake) - -# find thrust/cub -include(${CUDF_DIR}/cpp/cmake/thirdparty/get_thrust.cmake) +# find CCCL +include(${CUDF_DIR}/cpp/cmake/thirdparty/get_cccl.cmake) # JNI find_package(JNI REQUIRED) @@ -174,7 +171,6 @@ add_library( src/map_utils.cu src/murmur_hash.cu src/parse_uri.cu - src/row_conversion.cu src/timezones.cu src/utilities.cu src/xxhash64.cu diff --git a/src/main/cpp/benchmarks/row_conversion.cpp b/src/main/cpp/benchmarks/row_conversion.cpp index c625342867..b8694fbcdf 100644 --- a/src/main/cpp/benchmarks/row_conversion.cpp +++ b/src/main/cpp/benchmarks/row_conversion.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022, NVIDIA CORPORATION. + * Copyright (c) 2022-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -48,15 +48,15 @@ void fixed_width(nvbench::state& state) bytes_per_row += cudf::size_of(t); } - auto rows = spark_rapids_jni::convert_to_rows_fixed_width_optimized(table->view()); + auto rows = cudf::convert_to_rows_fixed_width_optimized(table->view()); state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { if (direction == "to row") { - auto _rows = spark_rapids_jni::convert_to_rows_fixed_width_optimized(table->view()); + auto _rows = cudf::convert_to_rows_fixed_width_optimized(table->view()); } else { for (auto const& r : rows) { cudf::lists_column_view const l(r->view()); - auto out = spark_rapids_jni::convert_from_rows_fixed_width_optimized(l, schema); + auto out = cudf::convert_from_rows_fixed_width_optimized(l, schema); } } }); @@ -117,16 +117,16 @@ static void variable_or_fixed_width(nvbench::state& state) } } - auto rows = spark_rapids_jni::convert_to_rows(table->view()); + auto rows = cudf::convert_to_rows(table->view()); state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { - auto new_rows = spark_rapids_jni::convert_to_rows(table->view()); + auto new_rows = cudf::convert_to_rows(table->view()); if (direction == "to row") { - auto _rows = spark_rapids_jni::convert_to_rows(table->view()); + auto _rows = cudf::convert_to_rows(table->view()); } else { for (auto const& r : rows) { cudf::lists_column_view const l(r->view()); - auto out = spark_rapids_jni::convert_from_rows(l, schema); + auto out = cudf::convert_from_rows(l, schema); } } }); diff --git a/src/main/cpp/src/RowConversionJni.cpp b/src/main/cpp/src/RowConversionJni.cpp index 1fdb8a86b5..8e900691f1 100644 --- a/src/main/cpp/src/RowConversionJni.cpp +++ b/src/main/cpp/src/RowConversionJni.cpp @@ -16,7 +16,8 @@ #include "cudf_jni_apis.hpp" #include "dtype_utils.hpp" -#include "row_conversion.hpp" + +#include extern "C" { @@ -31,7 +32,7 @@ Java_com_nvidia_spark_rapids_jni_RowConversion_convertToRowsFixedWidthOptimized( cudf::jni::auto_set_device(env); cudf::table_view const* n_input_table = reinterpret_cast(input_table); std::vector> cols = - spark_rapids_jni::convert_to_rows_fixed_width_optimized(*n_input_table); + cudf::convert_to_rows_fixed_width_optimized(*n_input_table); int const num_columns = cols.size(); cudf::jni::native_jlongArray outcol_handles(env, num_columns); std::transform(cols.begin(), 
cols.end(), outcol_handles.begin(), [](auto& col) { @@ -50,9 +51,8 @@ Java_com_nvidia_spark_rapids_jni_RowConversion_convertToRows(JNIEnv* env, jclass try { cudf::jni::auto_set_device(env); cudf::table_view const* n_input_table = reinterpret_cast(input_table); - std::vector> cols = - spark_rapids_jni::convert_to_rows(*n_input_table); - int const num_columns = cols.size(); + std::vector> cols = cudf::convert_to_rows(*n_input_table); + int const num_columns = cols.size(); cudf::jni::native_jlongArray outcol_handles(env, num_columns); std::transform(cols.begin(), cols.end(), outcol_handles.begin(), [](auto& col) { return cudf::jni::release_as_jlong(col); @@ -84,7 +84,7 @@ Java_com_nvidia_spark_rapids_jni_RowConversion_convertFromRowsFixedWidthOptimize std::back_inserter(types_vec), [](jint type, jint scale) { return cudf::jni::make_data_type(type, scale); }); std::unique_ptr result = - spark_rapids_jni::convert_from_rows_fixed_width_optimized(list_input, types_vec); + cudf::convert_from_rows_fixed_width_optimized(list_input, types_vec); return cudf::jni::convert_table_for_return(env, result); } CATCH_STD(env, 0); @@ -110,8 +110,7 @@ JNIEXPORT jlongArray JNICALL Java_com_nvidia_spark_rapids_jni_RowConversion_conv n_scale.begin(), std::back_inserter(types_vec), [](jint type, jint scale) { return cudf::jni::make_data_type(type, scale); }); - std::unique_ptr result = - spark_rapids_jni::convert_from_rows(list_input, types_vec); + std::unique_ptr result = cudf::convert_from_rows(list_input, types_vec); return cudf::jni::convert_table_for_return(env, result); } CATCH_STD(env, 0); diff --git a/src/main/cpp/src/bloom_filter.cu b/src/main/cpp/src/bloom_filter.cu index 7637c85f10..6270705178 100644 --- a/src/main/cpp/src/bloom_filter.cu +++ b/src/main/cpp/src/bloom_filter.cu @@ -34,6 +34,8 @@ #include +#include + #include namespace spark_rapids_jni { @@ -316,14 +318,15 @@ std::unique_ptr bloom_filter_merge(cudf::column_view const& b thrust::make_counting_iterator(0), thrust::make_counting_iterator(0) + num_words, dst, - [src, num_buffers = bloom_filters.size(), stride = buf_size] __device__( - cudf::size_type word_index) { - cudf::bitmask_type out = (reinterpret_cast(src))[word_index]; - for (auto idx = 1; idx < num_buffers; idx++) { - out |= (reinterpret_cast(src + idx * stride))[word_index]; - } - return out; - }); + cuda::proclaim_return_type( + [src, num_buffers = bloom_filters.size(), stride = buf_size] __device__( + cudf::size_type word_index) { + cudf::bitmask_type out = (reinterpret_cast(src))[word_index]; + for (auto idx = 1; idx < num_buffers; idx++) { + out |= (reinterpret_cast(src + idx * stride))[word_index]; + } + return out; + })); // create the 1-row list column and move it into a scalar. return std::make_unique( diff --git a/src/main/cpp/src/datetime_rebase.cu b/src/main/cpp/src/datetime_rebase.cu index 9548d09dad..8963acf491 100644 --- a/src/main/cpp/src/datetime_rebase.cu +++ b/src/main/cpp/src/datetime_rebase.cu @@ -30,6 +30,8 @@ #include #include +#include + namespace { // Convert a date in Julian calendar to the number of days since epoch. 
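Nearly every hunk in this patch applies the same mechanical change: a __device__ lambda handed to a Thrust algorithm or transform iterator gets wrapped in cuda::proclaim_return_type<T>(...). The return type of an extended device lambda cannot be deduced from host code (its operator() exists only on the device), so under CCCL it has to be stated explicitly. A minimal, hypothetical illustration of the pattern, not taken from the patch (build with nvcc --extended-lambda):

#include <cuda/functional>   // cuda::proclaim_return_type (CCCL / libcu++)
#include <thrust/device_vector.h>
#include <thrust/transform.h>

int main()
{
  thrust::device_vector<int> in(8, 3);
  thrust::device_vector<int> out(8);

  // The lambda's operator() is __device__-only, so host code cannot deduce
  // its result type; proclaim_return_type<int> states it explicitly.
  thrust::transform(in.begin(), in.end(), out.begin(),
                    cuda::proclaim_return_type<int>(
                      [] __device__(int x) { return x * x; }));
  return 0;
}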
@@ -73,28 +75,29 @@ std::unique_ptr gregorian_to_julian_days(cudf::column_view const& thrust::make_counting_iterator(0), thrust::make_counting_iterator(input.size()), output->mutable_view().begin(), - [d_input = input.begin()] __device__(auto const idx) { - auto constexpr julian_end = cuda::std::chrono::year_month_day{ - cuda::std::chrono::year{1582}, cuda::std::chrono::month{10}, cuda::std::chrono::day{4}}; - auto constexpr gregorian_start = cuda::std::chrono::year_month_day{ - cuda::std::chrono::year{1582}, cuda::std::chrono::month{10}, cuda::std::chrono::day{15}}; - - auto const days_ts = d_input[idx].time_since_epoch().count(); - auto const days_since_epoch = cuda::std::chrono::sys_days(cudf::duration_D{days_ts}); - - // Convert the input into local date in Proleptic Gregorian calendar. - auto const ymd = cuda::std::chrono::year_month_day(days_since_epoch); - if (ymd > julian_end && ymd < gregorian_start) { - // This is the same as rebasing from the local date given at `gregorian_start`. - return cudf::timestamp_D{cudf::duration_D{-141427}}; - } - - // No change since this time. - if (ymd >= gregorian_start) { return d_input[idx]; } - - // Reinterpret year/month/day as in Julian calendar then compute the days since epoch. - return cudf::timestamp_D{cudf::duration_D{days_from_julian(ymd)}}; - }); + cuda::proclaim_return_type( + [d_input = input.begin()] __device__(auto const idx) { + auto constexpr julian_end = cuda::std::chrono::year_month_day{ + cuda::std::chrono::year{1582}, cuda::std::chrono::month{10}, cuda::std::chrono::day{4}}; + auto constexpr gregorian_start = cuda::std::chrono::year_month_day{ + cuda::std::chrono::year{1582}, cuda::std::chrono::month{10}, cuda::std::chrono::day{15}}; + + auto const days_ts = d_input[idx].time_since_epoch().count(); + auto const days_since_epoch = cuda::std::chrono::sys_days(cudf::duration_D{days_ts}); + + // Convert the input into local date in Proleptic Gregorian calendar. + auto const ymd = cuda::std::chrono::year_month_day(days_since_epoch); + if (ymd > julian_end && ymd < gregorian_start) { + // This is the same as rebasing from the local date given at `gregorian_start`. + return cudf::timestamp_D{cudf::duration_D{-141427}}; + } + + // No change since this time. + if (ymd >= gregorian_start) { return d_input[idx]; } + + // Reinterpret year/month/day as in Julian calendar then compute the days since epoch. + return cudf::timestamp_D{cudf::duration_D{days_from_julian(ymd)}}; + })); return output; } @@ -142,19 +145,20 @@ std::unique_ptr julian_to_gregorian_days(cudf::column_view const& thrust::make_counting_iterator(0), thrust::make_counting_iterator(input.size()), output->mutable_view().begin(), - [d_input = input.begin()] __device__(auto const idx) { - auto const days_ts = d_input[idx].time_since_epoch().count(); - if (days_ts >= -141427) { // Gregorian start day - return d_input[idx]; - } - - // Reinterpret year/month/day as in Gregorian calendar then compute the days - // since epoch. - auto const ymd = julian_from_days(days_ts); - auto const result = - cuda::std::chrono::local_days{ymd}.time_since_epoch().count(); - return cudf::timestamp_D{cudf::duration_D{result}}; - }); + cuda::proclaim_return_type( + [d_input = input.begin()] __device__(auto const idx) { + auto const days_ts = d_input[idx].time_since_epoch().count(); + if (days_ts >= -141427) { // Gregorian start day + return d_input[idx]; + } + + // Reinterpret year/month/day as in Gregorian calendar then compute the days + // since epoch. 
+ auto const ymd = julian_from_days(days_ts); + auto const result = + cuda::std::chrono::local_days{ymd}.time_since_epoch().count(); + return cudf::timestamp_D{cudf::duration_D{result}}; + })); return output; } @@ -242,39 +246,40 @@ std::unique_ptr gregorian_to_julian_micros(cudf::column_view const thrust::make_counting_iterator(0), thrust::make_counting_iterator(input.size()), output->mutable_view().begin(), - [d_input = input.begin()] __device__(auto const idx) { - // This timestamp corresponds to October 15th, 1582 UTC. - // After this day, there is no difference in microsecond values between Gregorian - // and Julian calendars. - int64_t constexpr last_switch_gregorian_ts = -12219292800000000L; - - auto const micros_ts = d_input[idx].time_since_epoch().count(); - if (micros_ts >= last_switch_gregorian_ts) { return d_input[idx]; } - - // Convert the input into local date-time in Proleptic Gregorian calendar. - auto const days_since_epoch = cuda::std::chrono::sys_days(static_cast( - cuda::std::chrono::floor(cudf::duration_us(micros_ts)))); - auto const ymd = cuda::std::chrono::year_month_day(days_since_epoch); - auto const timeparts = get_time_components(micros_ts); - - auto constexpr julian_end = cuda::std::chrono::year_month_day{ - cuda::std::chrono::year{1582}, cuda::std::chrono::month{10}, cuda::std::chrono::day{4}}; - auto constexpr gregorian_start = cuda::std::chrono::year_month_day{ - cuda::std::chrono::year{1582}, cuda::std::chrono::month{10}, cuda::std::chrono::day{15}}; - - // Reinterpret the local date-time as in Julian calendar and compute microseconds since - // the epoch from that Julian local date-time. - // If the input date is outside of both calendars, consider it as it is a local date - // given at `gregorian_start` (-141427 Julian days since epoch). - auto const julian_days = - (ymd > julian_end && ymd < gregorian_start) ? -141427 : days_from_julian(ymd); - int64_t result = (julian_days * 24L * 3600L) + (timeparts.hour * 3600L) + - (timeparts.minute * 60L) + timeparts.second; - result *= MICROS_PER_SECOND; // to microseconds - result += timeparts.subsecond; - - return cudf::timestamp_us{cudf::duration_us{result}}; - }); + cuda::proclaim_return_type( + [d_input = input.begin()] __device__(auto const idx) { + // This timestamp corresponds to October 15th, 1582 UTC. + // After this day, there is no difference in microsecond values between Gregorian + // and Julian calendars. + int64_t constexpr last_switch_gregorian_ts = -12219292800000000L; + + auto const micros_ts = d_input[idx].time_since_epoch().count(); + if (micros_ts >= last_switch_gregorian_ts) { return d_input[idx]; } + + // Convert the input into local date-time in Proleptic Gregorian calendar. + auto const days_since_epoch = cuda::std::chrono::sys_days(static_cast( + cuda::std::chrono::floor(cudf::duration_us(micros_ts)))); + auto const ymd = cuda::std::chrono::year_month_day(days_since_epoch); + auto const timeparts = get_time_components(micros_ts); + + auto constexpr julian_end = cuda::std::chrono::year_month_day{ + cuda::std::chrono::year{1582}, cuda::std::chrono::month{10}, cuda::std::chrono::day{4}}; + auto constexpr gregorian_start = cuda::std::chrono::year_month_day{ + cuda::std::chrono::year{1582}, cuda::std::chrono::month{10}, cuda::std::chrono::day{15}}; + + // Reinterpret the local date-time as in Julian calendar and compute microseconds since + // the epoch from that Julian local date-time. 
+ // If the input date is outside of both calendars, consider it as it is a local date + // given at `gregorian_start` (-141427 Julian days since epoch). + auto const julian_days = + (ymd > julian_end && ymd < gregorian_start) ? -141427 : days_from_julian(ymd); + int64_t result = (julian_days * 24L * 3600L) + (timeparts.hour * 3600L) + + (timeparts.minute * 60L) + timeparts.second; + result *= MICROS_PER_SECOND; // to microseconds + result += timeparts.subsecond; + + return cudf::timestamp_us{cudf::duration_us{result}}; + })); return output; } @@ -304,31 +309,32 @@ std::unique_ptr julian_to_gregorian_micros(cudf::column_view const thrust::make_counting_iterator(0), thrust::make_counting_iterator(input.size()), output->mutable_view().begin(), - [d_input = input.begin()] __device__(auto const idx) { - // This timestamp corresponds to October 15th, 1582 UTC. - // After this day, there is no difference in microsecond values between Gregorian - // and Julian calendars. - int64_t constexpr last_switch_gregorian_ts = -12219292800000000L; - - auto const micros_ts = d_input[idx].time_since_epoch().count(); - if (micros_ts >= last_switch_gregorian_ts) { return d_input[idx]; } - - // Convert the input into local date-time in Julian calendar. - auto const days_since_epoch = cuda::std::chrono::sys_days(static_cast( - cuda::std::chrono::floor(cudf::duration_us(micros_ts)))); - auto const ymd = julian_from_days(days_since_epoch.time_since_epoch().count()); - auto const timeparts = get_time_components(micros_ts); - - // Reinterpret the local date-time as in Gregorian calendar and compute microseconds since - // the epoch from that Gregorian local date-time. - auto const gregorian_days = cuda::std::chrono::local_days(ymd).time_since_epoch().count(); - int64_t result = (gregorian_days * 24L * 3600L) + (timeparts.hour * 3600L) + - (timeparts.minute * 60L) + timeparts.second; - result *= MICROS_PER_SECOND; // to microseconds - result += timeparts.subsecond; - - return cudf::timestamp_us{cudf::duration_us{result}}; - }); + cuda::proclaim_return_type( + [d_input = input.begin()] __device__(auto const idx) { + // This timestamp corresponds to October 15th, 1582 UTC. + // After this day, there is no difference in microsecond values between Gregorian + // and Julian calendars. + int64_t constexpr last_switch_gregorian_ts = -12219292800000000L; + + auto const micros_ts = d_input[idx].time_since_epoch().count(); + if (micros_ts >= last_switch_gregorian_ts) { return d_input[idx]; } + + // Convert the input into local date-time in Julian calendar. + auto const days_since_epoch = cuda::std::chrono::sys_days(static_cast( + cuda::std::chrono::floor(cudf::duration_us(micros_ts)))); + auto const ymd = julian_from_days(days_since_epoch.time_since_epoch().count()); + auto const timeparts = get_time_components(micros_ts); + + // Reinterpret the local date-time as in Gregorian calendar and compute microseconds since + // the epoch from that Gregorian local date-time. 
+ auto const gregorian_days = cuda::std::chrono::local_days(ymd).time_since_epoch().count(); + int64_t result = (gregorian_days * 24L * 3600L) + (timeparts.hour * 3600L) + + (timeparts.minute * 60L) + timeparts.second; + result *= MICROS_PER_SECOND; // to microseconds + result += timeparts.subsecond; + + return cudf::timestamp_us{cudf::duration_us{result}}; + })); return output; } diff --git a/src/main/cpp/src/map_utils.cu b/src/main/cpp/src/map_utils.cu index f8ac369973..a51a7de57b 100644 --- a/src/main/cpp/src/map_utils.cu +++ b/src/main/cpp/src/map_utils.cu @@ -54,6 +54,8 @@ // #include +#include + namespace spark_rapids_jni { using namespace cudf::io::json; @@ -179,29 +181,33 @@ rmm::device_uvector compute_node_levels(int64_t num_nodes, auto token_levels = rmm::device_uvector(tokens.size(), stream); // Whether the token pops from the parent node stack. - auto const does_pop = [] __device__(PdaTokenT const token) -> bool { - switch (token) { - case token_t::StructMemberEnd: - case token_t::StructEnd: - case token_t::ListEnd: return true; - default: return false; - }; - }; + auto const does_pop = + cuda::proclaim_return_type([] __device__(PdaTokenT const token) -> bool { + switch (token) { + case token_t::StructMemberEnd: + case token_t::StructEnd: + case token_t::ListEnd: return true; + default: return false; + }; + }); // Whether the token pushes onto the parent node stack. - auto const does_push = [] __device__(PdaTokenT const token) -> bool { - switch (token) { - case token_t::FieldNameBegin: - case token_t::StructBegin: - case token_t::ListBegin: return true; - default: return false; - }; - }; + auto const does_push = + cuda::proclaim_return_type([] __device__(PdaTokenT const token) -> bool { + switch (token) { + case token_t::FieldNameBegin: + case token_t::StructBegin: + case token_t::ListBegin: return true; + default: return false; + }; + }); auto const push_pop_it = thrust::make_transform_iterator( - tokens.begin(), [does_push, does_pop] __device__(PdaTokenT const token) -> cudf::size_type { - return does_push(token) - does_pop(token); - }); + tokens.begin(), + cuda::proclaim_return_type( + [does_push, does_pop] __device__(PdaTokenT const token) -> cudf::size_type { + return does_push(token) - does_pop(token); + })); thrust::exclusive_scan( rmm::exec_policy(stream), push_pop_it, push_pop_it + tokens.size(), token_levels.begin()); @@ -302,20 +308,20 @@ rmm::device_uvector compute_parent_node_ids( rmm::device_uvector const& node_token_ids, rmm::cuda_stream_view stream) { - auto const first_childs_parent_token_id = [tokens = - tokens.begin()] __device__(auto i) -> NodeIndexT { - if (i <= 0) { return -1; } - if (tokens[i - 1] == token_t::StructBegin || tokens[i - 1] == token_t::ListBegin) { - return i - 1; - } else if (tokens[i - 1] == token_t::FieldNameEnd) { - return i - 2; - } else if (tokens[i - 1] == token_t::StructMemberBegin && - (tokens[i - 2] == token_t::StructBegin || tokens[i - 2] == token_t::ListBegin)) { - return i - 2; - } else { - return -1; - } - }; + auto const first_childs_parent_token_id = cuda::proclaim_return_type( + [tokens = tokens.begin()] __device__(auto i) -> NodeIndexT { + if (i <= 0) { return -1; } + if (tokens[i - 1] == token_t::StructBegin || tokens[i - 1] == token_t::ListBegin) { + return i - 1; + } else if (tokens[i - 1] == token_t::FieldNameEnd) { + return i - 2; + } else if (tokens[i - 1] == token_t::StructMemberBegin && + (tokens[i - 2] == token_t::StructBegin || tokens[i - 2] == token_t::ListBegin)) { + return i - 2; + } else { + return -1; + } + 
}); auto parent_node_ids = rmm::device_uvector(num_nodes, stream); thrust::transform( @@ -323,14 +329,15 @@ rmm::device_uvector compute_parent_node_ids( node_token_ids.begin(), node_token_ids.end(), parent_node_ids.begin(), - [node_ids_gpu = node_token_ids.begin(), num_nodes, first_childs_parent_token_id] __device__( - NodeIndexT const tid) -> NodeIndexT { - auto const pid = first_childs_parent_token_id(tid); - return pid < 0 - ? cudf::io::json::parent_node_sentinel - : thrust::lower_bound(thrust::seq, node_ids_gpu, node_ids_gpu + num_nodes, pid) - - node_ids_gpu; - }); + cuda::proclaim_return_type( + [node_ids_gpu = node_token_ids.begin(), num_nodes, first_childs_parent_token_id] __device__( + NodeIndexT const tid) -> NodeIndexT { + auto const pid = first_childs_parent_token_id(tid); + return pid < 0 + ? cudf::io::json::parent_node_sentinel + : thrust::lower_bound(thrust::seq, node_ids_gpu, node_ids_gpu + num_nodes, pid) - + node_ids_gpu; + })); // Propagate parent node to siblings from first sibling - inplace. auto const node_levels = compute_node_levels(num_nodes, tokens, stream); @@ -356,20 +363,21 @@ rmm::device_uvector check_key_or_value_nodes( transform_it, transform_it + parent_node_ids.size(), key_or_value.begin(), - [key_sentinel = key_sentinel, - value_sentinel = value_sentinel, - parent_ids = parent_node_ids.begin()] __device__(auto const node_id) -> int8_t { - if (parent_ids[node_id] > 0) { - auto const grand_parent = parent_ids[parent_ids[node_id]]; - if (grand_parent == 0) { - return key_sentinel; - } else if (parent_ids[grand_parent] == 0) { - return value_sentinel; + cuda::proclaim_return_type( + [key_sentinel = key_sentinel, + value_sentinel = value_sentinel, + parent_ids = parent_node_ids.begin()] __device__(auto const node_id) -> int8_t { + if (parent_ids[node_id] > 0) { + auto const grand_parent = parent_ids[parent_ids[node_id]]; + if (grand_parent == 0) { + return key_sentinel; + } else if (parent_ids[grand_parent] == 0) { + return value_sentinel; + } } - } - return 0; - }); + return 0; + })); #ifdef DEBUG_FROM_JSON print_debug(key_or_value, "Nodes are key/value (1==key, 2==value)", ", ", stream); @@ -390,53 +398,58 @@ struct node_ranges_fn { __device__ thrust::pair operator()(cudf::size_type node_id) const { - [[maybe_unused]] auto const is_begin_of_section = [] __device__(PdaTokenT const token) { - switch (token) { - case token_t::StructBegin: - case token_t::ListBegin: - case token_t::StringBegin: - case token_t::ValueBegin: - case token_t::FieldNameBegin: return true; - default: return false; - }; - }; + [[maybe_unused]] auto const is_begin_of_section = + cuda::proclaim_return_type([] __device__(PdaTokenT const token) { + switch (token) { + case token_t::StructBegin: + case token_t::ListBegin: + case token_t::StringBegin: + case token_t::ValueBegin: + case token_t::FieldNameBegin: return true; + default: return false; + }; + }); // The end-of-* partner token for a given beginning-of-* token - auto const end_of_partner = [] __device__(PdaTokenT const token) { - switch (token) { - case token_t::StructBegin: return token_t::StructEnd; - case token_t::ListBegin: return token_t::ListEnd; - case token_t::StringBegin: return token_t::StringEnd; - case token_t::ValueBegin: return token_t::ValueEnd; - case token_t::FieldNameBegin: return token_t::FieldNameEnd; - default: return token_t::ErrorBegin; - }; - }; + auto const end_of_partner = + cuda::proclaim_return_type([] __device__(PdaTokenT const token) { + switch (token) { + case token_t::StructBegin: return 
token_t::StructEnd; + case token_t::ListBegin: return token_t::ListEnd; + case token_t::StringBegin: return token_t::StringEnd; + case token_t::ValueBegin: return token_t::ValueEnd; + case token_t::FieldNameBegin: return token_t::FieldNameEnd; + default: return token_t::ErrorBegin; + }; + }); // Encode a fixed value for nested node types (list+struct). - auto const nested_node_to_value = [] __device__(PdaTokenT const token) -> int32_t { - switch (token) { - case token_t::StructBegin: return 1; - case token_t::StructEnd: return -1; - case token_t::ListBegin: return 1 << 8; - case token_t::ListEnd: return -(1 << 8); - default: return 0; - }; - }; - - auto const get_token_index = [include_quote_char = include_quote_char] __device__( - PdaTokenT const token, SymbolOffsetT const token_index) { - constexpr SymbolOffsetT quote_char_size = 1; - switch (token) { - // Strip off quote char included for StringBegin - case token_t::StringBegin: return token_index + (include_quote_char ? 0 : quote_char_size); - // Strip off or Include trailing quote char for string values for StringEnd - case token_t::StringEnd: return token_index + (include_quote_char ? quote_char_size : 0); - // Strip off quote char included for FieldNameBegin - case token_t::FieldNameBegin: return token_index + quote_char_size; - default: return token_index; - }; - }; + auto const nested_node_to_value = + cuda::proclaim_return_type([] __device__(PdaTokenT const token) -> int32_t { + switch (token) { + case token_t::StructBegin: return 1; + case token_t::StructEnd: return -1; + case token_t::ListBegin: return 1 << 8; + case token_t::ListEnd: return -(1 << 8); + default: return 0; + }; + }); + + auto const get_token_index = cuda::proclaim_return_type( + [include_quote_char = include_quote_char] __device__(PdaTokenT const token, + SymbolOffsetT const token_index) { + constexpr SymbolOffsetT quote_char_size = 1; + switch (token) { + // Strip off quote char included for StringBegin + case token_t::StringBegin: + return token_index + (include_quote_char ? 0 : quote_char_size); + // Strip off or Include trailing quote char for string values for StringEnd + case token_t::StringEnd: return token_index + (include_quote_char ? 
quote_char_size : 0); + // Strip off quote char included for FieldNameBegin + case token_t::FieldNameBegin: return token_index + quote_char_size; + default: return token_index; + }; + }); if (key_or_value[node_id] != key_sentinel && key_or_value[node_id] != value_sentinel) { return thrust::make_pair(0, 0); @@ -529,13 +542,15 @@ std::unique_ptr extract_keys_or_values( rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { - auto const is_key = [key_or_value = key_or_value.begin()] __device__(auto const node_id) { - return key_or_value[node_id] == key_sentinel; - }; + auto const is_key = cuda::proclaim_return_type( + [key_or_value = key_or_value.begin()] __device__(auto const node_id) { + return key_or_value[node_id] == key_sentinel; + }); - auto const is_value = [key_or_value = key_or_value.begin()] __device__(auto const node_id) { - return key_or_value[node_id] == value_sentinel; - }; + auto const is_value = cuda::proclaim_return_type( + [key_or_value = key_or_value.begin()] __device__(auto const node_id) { + return key_or_value[node_id] == value_sentinel; + }); auto extract_ranges = rmm::device_uvector>(num_nodes, stream, mr); @@ -574,17 +589,19 @@ rmm::device_uvector compute_list_offsets( // For the nodes having parent_id == 0 (they are json object given by one input row), set their // child counts to zero. Otherwise, set child counts to `-1` (a sentinel number). - thrust::transform(rmm::exec_policy(stream), - parent_node_ids.begin(), - parent_node_ids.end(), - node_child_counts.begin(), - [] __device__(auto const parent_id) -> NodeIndexT { - return parent_id == 0 ? 0 : std::numeric_limits::lowest(); - }); - - auto const is_key = [key_or_value = key_or_value.begin()] __device__(auto const node_id) { - return key_or_value[node_id] == key_sentinel; - }; + thrust::transform( + rmm::exec_policy(stream), + parent_node_ids.begin(), + parent_node_ids.end(), + node_child_counts.begin(), + cuda::proclaim_return_type([] __device__(auto const parent_id) -> NodeIndexT { + return parent_id == 0 ? 0 : std::numeric_limits::lowest(); + })); + + auto const is_key = cuda::proclaim_return_type( + [key_or_value = key_or_value.begin()] __device__(auto const node_id) { + return key_or_value[node_id] == key_sentinel; + }); // Count the number of keys for each json object using `atomicAdd`. 
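The same wrapping applies to bool-returning predicates, such as the is_key helpers above and the copy_if filter used for the list offsets just below. A hypothetical, self-contained sketch of that shape, using a plain thrust::copy_if in place of the cudf helper:

#include <cuda/functional>
#include <thrust/copy.h>
#include <thrust/device_vector.h>
#include <vector>

int main()
{
  // Mirrors the sentinel scheme above: negative values mark non-root nodes.
  std::vector<int> h{3, -1, 0, -1, 7};
  thrust::device_vector<int> counts(h.begin(), h.end());
  thrust::device_vector<int> kept(counts.size());

  auto const end = thrust::copy_if(
    counts.begin(), counts.end(), kept.begin(),
    cuda::proclaim_return_type<bool>([] __device__(int c) { return c >= 0; }));

  kept.resize(end - kept.begin());  // kept == {3, 0, 7}
  return 0;
}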
auto const transform_it = thrust::counting_iterator(0); @@ -608,7 +625,7 @@ rmm::device_uvector compute_list_offsets( node_child_counts.begin(), node_child_counts.end(), list_offsets.begin(), - [] __device__(auto const count) { return count >= 0; }, + cuda::proclaim_return_type([] __device__(auto const count) { return count >= 0; }), stream); CUDF_EXPECTS(thrust::distance(list_offsets.begin(), copy_end) == static_cast(n_lists), "Invalid list size computation."); diff --git a/src/main/cpp/src/murmur_hash.cu b/src/main/cpp/src/murmur_hash.cu index 679f521e77..17ec120b5a 100644 --- a/src/main/cpp/src/murmur_hash.cu +++ b/src/main/cpp/src/murmur_hash.cu @@ -27,6 +27,8 @@ #include #include +#include + namespace spark_rapids_jni { namespace { @@ -77,10 +79,11 @@ class murmur_device_row_hasher { _table.begin(), _table.end(), _seed, - [row_index, nulls = this->_check_nulls] __device__(auto hash, auto column) { - return cudf::type_dispatcher( - column.type(), element_hasher_adapter{nulls, hash}, column, row_index); - }); + cuda::proclaim_return_type( + [row_index, nulls = this->_check_nulls] __device__(auto hash, auto column) { + return cudf::type_dispatcher( + column.type(), element_hasher_adapter{nulls, hash}, column, row_index); + })); } private: diff --git a/src/main/cpp/src/parse_uri.cu b/src/main/cpp/src/parse_uri.cu index d75dfc18c1..897ebe0208 100644 --- a/src/main/cpp/src/parse_uri.cu +++ b/src/main/cpp/src/parse_uri.cu @@ -30,6 +30,8 @@ #include #include +#include + #include namespace spark_rapids_jni { @@ -395,90 +397,97 @@ chunk_validity __device__ validate_host(string_view host) bool __device__ validate_query(string_view query) { // query can be alphanum and _-!.~'()*,;:$&+=?/[]@" - return validate_chunk(query, [] __device__(string_view::const_iterator iter) { - auto const c = *iter; - if (c != '!' && c != '"' && c != '$' && !(c >= '&' && c <= ';') && c != '=' && - !(c >= '?' && c <= ']' && c != '\\') && !(c >= 'a' && c <= 'z') && c != '_' && c != '~') { - return false; - } - return true; - }); -} - -bool __device__ validate_authority(string_view authority, bool allow_invalid_escapes) -{ - // authority needs to be alphanum and @[]_-!.'()*,;:$&+= return validate_chunk( - authority, - [allow_invalid_escapes] __device__(string_view::const_iterator iter) { + query, cuda::proclaim_return_type([] __device__(string_view::const_iterator iter) { auto const c = *iter; - if (c != '!' && c != '$' && !(c >= '&' && c <= ';' && c != '/') && c != '=' && - !(c >= '@' && c <= '_' && c != '^' && c != '\\') && !(c >= 'a' && c <= 'z') && c != '~' && - (!allow_invalid_escapes || c != '%')) { + if (c != '!' && c != '"' && c != '$' && !(c >= '&' && c <= ';') && c != '=' && + !(c >= '?' && c <= ']' && c != '\\') && !(c >= 'a' && c <= 'z') && c != '_' && c != '~') { return false; } return true; - }, - allow_invalid_escapes); + })); +} + +bool __device__ validate_authority(string_view authority, bool allow_invalid_escapes) +{ + // authority needs to be alphanum and @[]_-!.'()*,;:$&+= + return validate_chunk(authority, + cuda::proclaim_return_type( + [allow_invalid_escapes] __device__(string_view::const_iterator iter) { + auto const c = *iter; + if (c != '!' 
&& c != '$' && !(c >= '&' && c <= ';' && c != '/') && + c != '=' && !(c >= '@' && c <= '_' && c != '^' && c != '\\') && + !(c >= 'a' && c <= 'z') && c != '~' && + (!allow_invalid_escapes || c != '%')) { + return false; + } + return true; + }), + allow_invalid_escapes); } bool __device__ validate_userinfo(string_view userinfo) { // can't be ] or [ in here - return validate_chunk(userinfo, [] __device__(string_view::const_iterator iter) { - auto const c = *iter; - if (c == '[' || c == ']') { return false; } - return true; - }); + return validate_chunk( + userinfo, cuda::proclaim_return_type([] __device__(string_view::const_iterator iter) { + auto const c = *iter; + if (c == '[' || c == ']') { return false; } + return true; + })); } bool __device__ validate_port(string_view port) { // port is positive numeric >=0 according to spark...shrug - return validate_chunk(port, [] __device__(string_view::const_iterator iter) { - auto const c = *iter; - if (c < '0' && c > '9') { return false; } - return true; - }); + return validate_chunk( + port, cuda::proclaim_return_type([] __device__(string_view::const_iterator iter) { + auto const c = *iter; + if (c < '0' && c > '9') { return false; } + return true; + })); } bool __device__ validate_path(string_view path) { // path can be alphanum and @[]_-!.~'()*?/&,;:$+= - return validate_chunk(path, [] __device__(string_view::const_iterator iter) { - auto const c = *iter; - if (c != '!' && c != '$' && !(c >= '&' && c <= ';') && c != '=' && !(c >= '?' && c <= 'Z') && - c != '_' && !(c >= 'a' && c <= 'z') && c != '~') { - return false; - } - return true; - }); + return validate_chunk( + path, cuda::proclaim_return_type([] __device__(string_view::const_iterator iter) { + auto const c = *iter; + if (c != '!' && c != '$' && !(c >= '&' && c <= ';') && c != '=' && !(c >= '@' && c <= 'Z') && + c != '_' && !(c >= 'a' && c <= 'z') && c != '~') { + return false; + } + return true; + })); } bool __device__ validate_opaque(string_view opaque) { // opaque can be alphanum and @[]_-!.~'()*?/,;:$@+= - return validate_chunk(opaque, [] __device__(string_view::const_iterator iter) { - auto const c = *iter; - if (c != '!' && c != '$' && !(c >= '&' && c <= ';') && c != '=' && - !(c >= '?' && c <= ']' && c != '\\') && c != '_' && c != '~' && !(c >= 'a' && c <= 'z')) { - return false; - } - return true; - }); + return validate_chunk( + opaque, cuda::proclaim_return_type([] __device__(string_view::const_iterator iter) { + auto const c = *iter; + if (c != '!' && c != '$' && !(c >= '&' && c <= ';') && c != '=' && + !(c >= '?' && c <= ']' && c != '\\') && c != '_' && c != '~' && !(c >= 'a' && c <= 'z')) { + return false; + } + return true; + })); } bool __device__ validate_fragment(string_view fragment) { // fragment can be alphanum and @[]_-!.~'()*?/,;:$&+= - return validate_chunk(fragment, [] __device__(string_view::const_iterator iter) { - auto const c = *iter; - if (c != '!' && c != '$' && !(c >= '&' && c <= ';') && c != '=' && - !(c >= '?' && c <= ']' && c != '\\') && c != '_' && c != '~' && !(c >= 'a' && c <= 'z')) { - return false; - } - return true; - }); + return validate_chunk( + fragment, cuda::proclaim_return_type([] __device__(string_view::const_iterator iter) { + auto const c = *iter; + if (c != '!' && c != '$' && !(c >= '&' && c <= ';') && c != '=' && + !(c >= '?' 
&& c <= ']' && c != '\\') && c != '_' && c != '~' && !(c >= 'a' && c <= 'z')) { + return false; + } + return true; + })); } uri_parts __device__ validate_uri(const char* str, int len) diff --git a/src/main/cpp/src/row_conversion.cu b/src/main/cpp/src/row_conversion.cu deleted file mode 100644 index f2416fb3ab..0000000000 --- a/src/main/cpp/src/row_conversion.cu +++ /dev/null @@ -1,2595 +0,0 @@ -/* - * Copyright (c) 2020-2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "row_conversion.hpp" - -#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700 -#define ASYNC_MEMCPY_SUPPORTED -#endif - -#if !defined(__CUDA_ARCH__) || defined(ASYNC_MEMCPY_SUPPORTED) -#include -#endif // #if !defined(__CUDA_ARCH__) || defined(ASYNC_MEMCPY_SUPPORTED) - -#include -#include -#include -#include -#include -#include -#include -#include - -namespace { - -constexpr auto JCUDF_ROW_ALIGNMENT = 8; - -constexpr auto MAX_BATCH_SIZE = std::numeric_limits::max(); - -// Number of rows each block processes in the two kernels. Tuned via nsight -constexpr auto NUM_STRING_ROWS_PER_BLOCK_TO_ROWS = 1024; -constexpr auto NUM_STRING_ROWS_PER_BLOCK_FROM_ROWS = 64; -constexpr auto MIN_STRING_BLOCKS = 32; -constexpr auto MAX_STRING_BLOCKS = MAX_BATCH_SIZE; - -constexpr auto NUM_WARPS_IN_BLOCK = 32; - -} // anonymous namespace - -// needed to suppress warning about cuda::barrier -#pragma nv_diag_suppress static_var_with_dynamic_init - -using namespace cudf; -using detail::make_device_uvector_async; -using detail::make_device_uvector_sync; -using rmm::device_uvector; - -#ifdef ASYNC_MEMCPY_SUPPORTED -using cuda::aligned_size_t; -#else -template -using aligned_size_t = size_t; // Local stub for cuda::aligned_size_t. -#endif // ASYNC_MEMCPY_SUPPORTED - -namespace spark_rapids_jni { -namespace detail { - -/* - * This module converts data from row-major to column-major and from column-major to row-major. It - * is a transpose of the data of sorts, but there are a few complicating factors. They are spelled - * out below: - * - * Row Batches: - * The row data has to fit inside a cuDF column, which limits it to 2 gigs currently. The calling - * code attempts to keep the data size under 2 gigs, but due to padding this isn't always the case, - * so being able to break this up into multiple columns is necessary. Internally, this is referred - * to as the row batch, which is a group of rows that will fit into this 2 gig space requirement. - * There are typically 1 of these batches, but there can be 2. - * - * Async Memcpy: - * The CUDA blocks are using memcpy_async, which allows for the device to schedule memcpy operations - * and then wait on them to complete at a later time with a barrier. 
On Ampere or later hardware - * there is dedicated hardware to do this copy and on pre-Ampere it should generate the same code - * that a hand-rolled loop would generate, so performance should be the same or better than a - * hand-rolled kernel. - * - * Tile Info: - * Each CUDA block will work on a single tile info before exiting. This single tile consumes all - * available shared memory. The kernel reads data into shared memory and then back out from shared - * memory to device memory via memcpy_async. This kernel is completely memory bound. - * - * Batch Data: - * This structure contains all the row batches and some book-keeping data necessary for the batches - * such as row numbers for the batches. - * - * Tiles: - * The tile info describes a tile of data to process. In a GPU with 48KB this equates to about 221 - * bytes in each direction of a table. The tiles are kept as square as possible to attempt to - * coalesce memory operations. The taller a tile is the better coalescing of columns, but row - * coalescing suffers. The wider a tile is the better the row coalescing, but columns coalescing - * suffers. The code attempts to produce a square tile to balance the coalescing. It starts by - * figuring out the optimal byte length and then adding columns to the data until the tile is too - * large. Since rows are different width with different alignment requirements, this isn't typically - * exact. Once a width is found the tiles are generated vertically with that width and height and - * then the process repeats. This means all the tiles will be the same height, but will have - * different widths based on what columns they encompass. Tiles in a vertical row will all have the - * same dimensions. - * - * -------------------------------- - * | 4 5.0f || True 8 3 1 | - * | 3 6.0f || False 3 1 1 | - * | 2 7.0f || True 7 4 1 | - * | 1 8.0f || False 2 5 1 | - * -------------------------------- - * | 0 9.0f || True 6 7 1 | - * ... - */ - -/** - * @brief The CUDA blocks work on one tile_info struct of data. - * This structure defines the workspaces for the blocks. - * - */ -struct tile_info { - int start_col; - int start_row; - int end_col; - int end_row; - int batch_number; - - __device__ inline size_type get_shared_row_size(size_type const* const col_offsets, - size_type const* const col_sizes) const - { - // this calculation is invalid if there are holes in the data such as a variable-width column. - // It is wrong in a safe way in that it will say this row size is larger than it should be, so - // we are not losing data we are just not as efficient as we could be with shared memory. This - // may be a problem if the tile is computed without regard to variable width offset/length sizes - // in that we overrun shared memory. - return util::round_up_unsafe(col_offsets[end_col] + col_sizes[end_col] - col_offsets[start_col], - JCUDF_ROW_ALIGNMENT); - } - - __device__ inline size_type num_cols() const { return end_col - start_col + 1; } - - __device__ inline size_type num_rows() const { return end_row - start_row + 1; } -}; - -/** - * @brief Returning rows is done in a byte cudf column. This is limited in size by - * `size_type` and so output is broken into batches of rows that fit inside - * this limit. 
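The 2 GB ceiling comes from cudf's 32-bit size_type offsets: a single output column cannot address more bytes than numeric_limits<int32_t>::max(). A hedged, host-side sketch of how batch boundaries can be cut greedily from per-row sizes (names hypothetical; the deleted implementation is considerably more involved):

#include <cstdint>
#include <limits>
#include <vector>

// Given per-row byte sizes, cut batches so no batch exceeds the 2^31 - 1
// byte limit imposed by 32-bit size_type offsets.
std::vector<std::size_t> batch_boundaries(std::vector<std::size_t> const& row_sizes)
{
  constexpr std::size_t limit = std::numeric_limits<std::int32_t>::max();
  std::vector<std::size_t> boundaries{0};  // row numbers where batches start
  std::size_t bytes_in_batch = 0;
  for (std::size_t row = 0; row < row_sizes.size(); ++row) {
    if (bytes_in_batch + row_sizes[row] > limit) {
      boundaries.push_back(row);  // start a new batch at this row
      bytes_in_batch = 0;
    }
    bytes_in_batch += row_sizes[row];
  }
  boundaries.push_back(row_sizes.size());  // one-past-the-end sentinel
  return boundaries;
}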
- * - */ -struct row_batch { - size_type num_bytes; // number of bytes in this batch - size_type row_count; // number of rows in the batch - device_uvector row_offsets; // offsets column of output cudf column -}; - -/** - * @brief Holds information about the batches of data to be processed - * - */ -struct batch_data { - device_uvector batch_row_offsets; // offsets to each row in incoming data - device_uvector d_batch_row_boundaries; // row numbers for the start of each batch - std::vector - batch_row_boundaries; // row numbers for the start of each batch: 0, 1500, 2700 - std::vector row_batches; // information about each batch such as byte count -}; - -/** - * @brief builds row size information for tables that contain strings - * - * @param tbl table from which to compute row size information - * @param fixed_width_and_validity_size size of fixed-width and validity data in this table - * @param stream cuda stream on which to operate - * @return pair of device vector of size_types of the row sizes of the table and a device vector of - * offsets into the string column - */ -std::pair, rmm::device_uvector> -build_string_row_offsets(table_view const& tbl, - size_type fixed_width_and_validity_size, - rmm::cuda_stream_view stream) -{ - auto const num_rows = tbl.num_rows(); - rmm::device_uvector d_row_sizes(num_rows, stream); - thrust::uninitialized_fill(rmm::exec_policy(stream), d_row_sizes.begin(), d_row_sizes.end(), 0); - - auto d_offsets_iterators = [&]() { - std::vector offsets_iterators; - auto offsets_iter = thrust::make_transform_iterator( - tbl.begin(), [](auto const& col) -> strings_column_view::offset_iterator { - if (!is_fixed_width(col.type())) { - CUDF_EXPECTS(col.type().id() == type_id::STRING, "only string columns are supported!"); - return strings_column_view(col).offsets_begin(); - } else { - return nullptr; - } - }); - std::copy_if(offsets_iter, - offsets_iter + tbl.num_columns(), - std::back_inserter(offsets_iterators), - [](auto const& offset_ptr) { return offset_ptr != nullptr; }); - return make_device_uvector_sync( - offsets_iterators, stream, rmm::mr::get_current_device_resource()); - }(); - - auto const num_columns = static_cast(d_offsets_iterators.size()); - - thrust::for_each(rmm::exec_policy(stream), - thrust::make_counting_iterator(0), - thrust::make_counting_iterator(num_columns * num_rows), - [d_offsets_iterators = d_offsets_iterators.data(), - num_columns, - num_rows, - d_row_sizes = d_row_sizes.data()] __device__(auto element_idx) { - auto const row = element_idx % num_rows; - auto const col = element_idx / num_rows; - auto const val = - d_offsets_iterators[col][row + 1] - d_offsets_iterators[col][row]; - atomicAdd(&d_row_sizes[row], val); - }); - - // transform the row sizes to include fixed width size and alignment - thrust::transform(rmm::exec_policy(stream), - d_row_sizes.begin(), - d_row_sizes.end(), - d_row_sizes.begin(), - [fixed_width_and_validity_size] __device__(auto row_size) { - return util::round_up_unsafe(fixed_width_and_validity_size + row_size, - JCUDF_ROW_ALIGNMENT); - }); - - return {std::move(d_row_sizes), std::move(d_offsets_iterators)}; -} - -/** - * @brief functor to return the offset of a row in a table with string columns - * - */ -struct string_row_offset_functor { - string_row_offset_functor(device_span d_row_offsets) - : d_row_offsets(d_row_offsets){}; - - __device__ inline size_type operator()(int row_number, int) const - { - return d_row_offsets[row_number]; - } - - device_span d_row_offsets; -}; - -/** - * @brief functor to return 
the offset of a row in a table with only fixed-width columns - * - */ -struct fixed_width_row_offset_functor { - fixed_width_row_offset_functor(size_type fixed_width_only_row_size) - : _fixed_width_only_row_size(fixed_width_only_row_size){}; - - __device__ inline size_type operator()(int row_number, int tile_row_start) const - { - return (row_number - tile_row_start) * _fixed_width_only_row_size; - } - - size_type _fixed_width_only_row_size; -}; - -/** - * @brief Copies data from row-based JCUDF format to column-based cudf format. - * - * This optimized version of the conversion is faster for fixed-width tables that do not have more - * than 100 columns. - * - * @param num_rows number of rows in the incoming table - * @param num_columns number of columns in the incoming table - * @param row_size length in bytes of each row - * @param input_offset_in_row offset to each row of data - * @param num_bytes total number of bytes in the incoming data - * @param output_data array of pointers to the output data - * @param output_nm array of pointers to the output null masks - * @param input_data pointing to the incoming row data - */ -__global__ void copy_from_rows_fixed_width_optimized(const size_type num_rows, - const size_type num_columns, - const size_type row_size, - const size_type* input_offset_in_row, - const size_type* num_bytes, - int8_t** output_data, - bitmask_type** output_nm, - const int8_t* input_data) -{ - // We are going to copy the data in two passes. - // The first pass copies a chunk of data into shared memory. - // The second pass copies that chunk from shared memory out to the final location. - - // Because shared memory is limited we copy a subset of the rows at a time. - // For simplicity we will refer to this as a row_group - - // In practice we have found writing more than 4 columns of data per thread - // results in performance loss. As such we are using a 2 dimensional - // kernel in terms of threads, but not in terms of blocks. Columns are - // controlled by the y dimension (there is no y dimension in blocks). Rows - // are controlled by the x dimension (there are multiple blocks in the x - // dimension). 
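In miniature, and under the assumption that row_size is 8-byte aligned (which JCUDF_ROW_ALIGNMENT guarantees) and the input pointer is 8-byte aligned, the first pass amounts to the following staging kernel (hypothetical, much simpler than the real one):

#include <cstdint>

// Stage whole rows through shared memory as 8-byte words so global reads
// coalesce regardless of individual column widths; the per-column scatter
// then runs out of fast shared memory. Assumes the grid is sized so that
// first_row < num_rows for every block.
__global__ void stage_rows(int8_t const* input, int row_size, int num_rows)
{
  extern __shared__ int8_t shared_data[];
  int const rows_per_block = blockDim.x;
  int const first_row      = blockIdx.x * rows_per_block;
  int rows_here            = num_rows - first_row;
  if (rows_here > rows_per_block) { rows_here = rows_per_block; }

  auto const* in64 = reinterpret_cast<int64_t const*>(input + first_row * row_size);
  auto* sh64       = reinterpret_cast<int64_t*>(shared_data);
  int const words  = (row_size * rows_here) / static_cast<int>(sizeof(int64_t));

  for (int w = threadIdx.x; w < words; w += blockDim.x) { sh64[w] = in64[w]; }
  __syncthreads();  // pass two (the per-column scatter) would start here
}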
- - size_type const rows_per_group = blockDim.x; - size_type const row_group_start = blockIdx.x; - size_type const row_group_stride = gridDim.x; - size_type const row_group_end = (num_rows + rows_per_group - 1) / rows_per_group + 1; - - extern __shared__ int8_t shared_data[]; - - // Because we are copying fixed width only data and we stride the rows - // this thread will always start copying from shared data in the same place - int8_t* row_tmp = &shared_data[row_size * threadIdx.x]; - int8_t* row_vld_tmp = &row_tmp[input_offset_in_row[num_columns - 1] + num_bytes[num_columns - 1]]; - - for (auto row_group_index = row_group_start; row_group_index < row_group_end; - row_group_index += row_group_stride) { - // Step 1: Copy the data into shared memory - // We know row_size is always aligned with and a multiple of int64_t; - int64_t* long_shared = reinterpret_cast(shared_data); - int64_t const* long_input = reinterpret_cast(input_data); - - auto const shared_output_index = threadIdx.x + (threadIdx.y * blockDim.x); - auto const shared_output_stride = blockDim.x * blockDim.y; - auto const row_index_end = std::min(num_rows, ((row_group_index + 1) * rows_per_group)); - auto const num_rows_in_group = row_index_end - (row_group_index * rows_per_group); - auto const shared_length = row_size * num_rows_in_group; - - size_type const shared_output_end = shared_length / sizeof(int64_t); - - auto const start_input_index = (row_size * row_group_index * rows_per_group) / sizeof(int64_t); - - for (size_type shared_index = shared_output_index; shared_index < shared_output_end; - shared_index += shared_output_stride) { - long_shared[shared_index] = long_input[start_input_index + shared_index]; - } - // Wait for all of the data to be in shared memory - __syncthreads(); - - // Step 2 copy the data back out - - // Within the row group there should be 1 thread for each row. This is a - // requirement for launching the kernel - auto const row_index = (row_group_index * rows_per_group) + threadIdx.x; - // But we might not use all of the threads if the number of rows does not go - // evenly into the thread count. We don't want those threads to exit yet - // because we may need them to copy data in for the next row group. - uint32_t active_mask = __ballot_sync(0xffffffff, row_index < num_rows); - if (row_index < num_rows) { - auto const col_index_start = threadIdx.y; - auto const col_index_stride = blockDim.y; - for (auto col_index = col_index_start; col_index < num_columns; - col_index += col_index_stride) { - auto const col_size = num_bytes[col_index]; - int8_t const* col_tmp = &(row_tmp[input_offset_in_row[col_index]]); - int8_t* col_output = output_data[col_index]; - switch (col_size) { - case 1: { - col_output[row_index] = *col_tmp; - break; - } - case 2: { - int16_t* short_col_output = reinterpret_cast(col_output); - short_col_output[row_index] = *reinterpret_cast(col_tmp); - break; - } - case 4: { - int32_t* int_col_output = reinterpret_cast(col_output); - int_col_output[row_index] = *reinterpret_cast(col_tmp); - break; - } - case 8: { - int64_t* long_col_output = reinterpret_cast(col_output); - long_col_output[row_index] = *reinterpret_cast(col_tmp); - break; - } - default: { - auto const output_offset = col_size * row_index; - // TODO this should just not be supported for fixed width columns, but just in case... 
- for (auto b = 0; b < col_size; b++) { - col_output[b + output_offset] = col_tmp[b]; - } - break; - } - } - - bitmask_type* nm = output_nm[col_index]; - int8_t* valid_byte = &row_vld_tmp[col_index / 8]; - size_type byte_bit_offset = col_index % 8; - int predicate = *valid_byte & (1 << byte_bit_offset); - uint32_t bitmask = __ballot_sync(active_mask, predicate); - if (row_index % 32 == 0) { nm[word_index(row_index)] = bitmask; } - } // end column loop - } // end row copy - // wait for the row_group to be totally copied before starting on the next row group - __syncthreads(); - } -} - -__global__ void copy_to_rows_fixed_width_optimized(const size_type start_row, - const size_type num_rows, - const size_type num_columns, - const size_type row_size, - const size_type* output_offset_in_row, - const size_type* num_bytes, - const int8_t** input_data, - const bitmask_type** input_nm, - int8_t* output_data) -{ - // We are going to copy the data in two passes. - // The first pass copies a chunk of data into shared memory. - // The second pass copies that chunk from shared memory out to the final location. - - // Because shared memory is limited we copy a subset of the rows at a time. - // We do not support copying a subset of the columns in a row yet, so we don't - // currently support a row that is wider than shared memory. - // For simplicity we will refer to this as a row_group - - // In practice we have found reading more than 4 columns of data per thread - // results in performance loss. As such we are using a 2 dimensional - // kernel in terms of threads, but not in terms of blocks. Columns are - // controlled by the y dimension (there is no y dimension in blocks). Rows - // are controlled by the x dimension (there are multiple blocks in the x - // dimension). - - size_type rows_per_group = blockDim.x; - size_type row_group_start = blockIdx.x; - size_type row_group_stride = gridDim.x; - size_type row_group_end = (num_rows + rows_per_group - 1) / rows_per_group + 1; - - extern __shared__ int8_t shared_data[]; - - // Because we are copying fixed width only data and we stride the rows - // this thread will always start copying to shared data in the same place - int8_t* row_tmp = &shared_data[row_size * threadIdx.x]; - int8_t* row_vld_tmp = - &row_tmp[output_offset_in_row[num_columns - 1] + num_bytes[num_columns - 1]]; - - for (size_type row_group_index = row_group_start; row_group_index < row_group_end; - row_group_index += row_group_stride) { - // Within the row group there should be 1 thread for each row. This is a - // requirement for launching the kernel - size_type row_index = start_row + (row_group_index * rows_per_group) + threadIdx.x; - // But we might not use all of the threads if the number of rows does not go - // evenly into the thread count. We don't want those threads to exit yet - // because we may need them to copy data back out. 
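The reason threads are not allowed to exit early is the __syncthreads() at the end of the loop: it must be reached by every thread of the block, so out-of-range threads stay alive and simply skip the work. The safe shape, isolated in a hypothetical kernel:

__global__ void guarded_copy(int const* in, int* out, int n)
{
  int const i = blockIdx.x * blockDim.x + threadIdx.x;
  // WRONG: `if (i >= n) return;` -- returning threads never reach the
  // __syncthreads() below, which is undefined behavior for the block.
  if (i < n) { out[i] = in[i]; }  // RIGHT: guard the work instead
  __syncthreads();                // every thread of the block arrives here
}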
- if (row_index < (start_row + num_rows)) { - size_type col_index_start = threadIdx.y; - size_type col_index_stride = blockDim.y; - for (size_type col_index = col_index_start; col_index < num_columns; - col_index += col_index_stride) { - size_type col_size = num_bytes[col_index]; - int8_t* col_tmp = &(row_tmp[output_offset_in_row[col_index]]); - const int8_t* col_input = input_data[col_index]; - switch (col_size) { - case 1: { - *col_tmp = col_input[row_index]; - break; - } - case 2: { - const int16_t* short_col_input = reinterpret_cast(col_input); - *reinterpret_cast(col_tmp) = short_col_input[row_index]; - break; - } - case 4: { - const int32_t* int_col_input = reinterpret_cast(col_input); - *reinterpret_cast(col_tmp) = int_col_input[row_index]; - break; - } - case 8: { - const int64_t* long_col_input = reinterpret_cast(col_input); - *reinterpret_cast(col_tmp) = long_col_input[row_index]; - break; - } - default: { - size_type input_offset = col_size * row_index; - // TODO this should just not be supported for fixed width columns, but just in case... - for (size_type b = 0; b < col_size; b++) { - col_tmp[b] = col_input[b + input_offset]; - } - break; - } - } - // atomicOr only works on 32 bit or 64 bit aligned values, and not byte aligned - // so we have to rewrite the addresses to make sure that it is 4 byte aligned - int8_t* valid_byte = &row_vld_tmp[col_index / 8]; - size_type byte_bit_offset = col_index % 8; - uint64_t fixup_bytes = reinterpret_cast(valid_byte) % 4; - int32_t* valid_int = reinterpret_cast(valid_byte - fixup_bytes); - size_type int_bit_offset = byte_bit_offset + (fixup_bytes * 8); - // Now copy validity for the column - if (input_nm[col_index]) { - if (bit_is_set(input_nm[col_index], row_index)) { - atomicOr_block(valid_int, 1 << int_bit_offset); - } else { - atomicAnd_block(valid_int, ~(1 << int_bit_offset)); - } - } else { - // It is valid so just set the bit - atomicOr_block(valid_int, 1 << int_bit_offset); - } - } // end column loop - } // end row copy - // wait for the row_group to be totally copied into shared memory - __syncthreads(); - - // Step 2: Copy the data back out - // We know row_size is always aligned with and a multiple of int64_t; - int64_t* long_shared = reinterpret_cast(shared_data); - int64_t* long_output = reinterpret_cast(output_data); - - size_type shared_input_index = threadIdx.x + (threadIdx.y * blockDim.x); - size_type shared_input_stride = blockDim.x * blockDim.y; - size_type row_index_end = ((row_group_index + 1) * rows_per_group); - if (row_index_end > num_rows) { row_index_end = num_rows; } - size_type num_rows_in_group = row_index_end - (row_group_index * rows_per_group); - size_type shared_length = row_size * num_rows_in_group; - - size_type shared_input_end = shared_length / sizeof(int64_t); - - size_type start_output_index = (row_size * row_group_index * rows_per_group) / sizeof(int64_t); - - for (size_type shared_index = shared_input_index; shared_index < shared_input_end; - shared_index += shared_input_stride) { - long_output[start_output_index + shared_index] = long_shared[shared_index]; - } - __syncthreads(); - // Go for the next round - } -} - -#ifdef ASYNC_MEMCPY_SUPPORTED -#define MEMCPY(dst, src, size, barrier) cuda::memcpy_async(dst, src, size, barrier) -#else -#define MEMCPY(dst, src, size, barrier) memcpy(dst, src, size) -#endif // ASYNC_MEMCPY_SUPPORTED - -/** - * @brief copy data from cudf columns into JCUDF format, which is row-based - * - * @tparam RowOffsetFunctor iterator that gives the size of a specific row of the 
table. - * @param num_rows total number of rows in the table - * @param num_columns total number of columns in the table - * @param shmem_used_per_tile shared memory amount each `tile_info` is using - * @param tile_infos span of `tile_info` structs the define the work - * @param input_data pointer to raw table data - * @param col_sizes array of sizes for each element in a column - one per column - * @param col_offsets offset into input data row for each column's start - * @param row_offsets offset to a specific row in the output data - * @param batch_row_boundaries row numbers for batch starts - * @param output_data pointer to output data - * - */ -template -__global__ void copy_to_rows(const size_type num_rows, - const size_type num_columns, - const size_type shmem_used_per_tile, - device_span tile_infos, - const int8_t** input_data, - const size_type* col_sizes, - const size_type* col_offsets, - RowOffsetFunctor row_offsets, - size_type const* batch_row_boundaries, - int8_t** output_data) -{ - // We are going to copy the data in two passes. - // The first pass copies a chunk of data into shared memory. - // The second pass copies that chunk from shared memory out to the final location. - - // Because shared memory is limited we copy a subset of the rows at a time. - // This has been broken up for us in the tile_info struct, so we don't have - // any calculation to do here, but it is important to note. - - auto const group = cooperative_groups::this_thread_block(); - auto const warp = cooperative_groups::tiled_partition(group); - extern __shared__ int8_t shared_data[]; - -#ifdef ASYNC_MEMCPY_SUPPORTED - __shared__ cuda::barrier tile_barrier; - if (group.thread_rank() == 0) { init(&tile_barrier, group.size()); } - group.sync(); -#endif // ASYNC_MEMCPY_SUPPORTED - - auto const tile = tile_infos[blockIdx.x]; - auto const num_tile_cols = tile.num_cols(); - auto const num_tile_rows = tile.num_rows(); - auto const tile_row_size = tile.get_shared_row_size(col_offsets, col_sizes); - auto const starting_column_offset = col_offsets[tile.start_col]; - - // to do the copy we need to do n column copies followed by m element copies OR we have to do m - // element copies followed by r row copies. When going from column to row it is much easier to - // copy by elements first otherwise we would need a running total of the column sizes for our - // tile, which isn't readily available. This makes it more appealing to copy element-wise from - // input data into shared matching the end layout and do row-based memcopies out. 
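The row-wise copies out of shared memory go through the MEMCPY macro defined above: cuda::memcpy_async with a block-scoped cuda::barrier where the architecture supports it, a plain strided loop otherwise. That pattern, reduced to a hypothetical self-contained kernel:

#include <cooperative_groups.h>
#include <cstdint>
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700
#define ASYNC_MEMCPY_SUPPORTED
#endif
#if !defined(__CUDA_ARCH__) || defined(ASYNC_MEMCPY_SUPPORTED)
#include <cuda/barrier>
#endif
#pragma nv_diag_suppress static_var_with_dynamic_init  // shared cuda::barrier

namespace cg = cooperative_groups;

__global__ void block_copy(int8_t* dst, int8_t const* src, std::size_t bytes)
{
  auto const block = cg::this_thread_block();
#ifdef ASYNC_MEMCPY_SUPPORTED
  __shared__ cuda::barrier<cuda::thread_scope_block> barrier;
  if (block.thread_rank() == 0) { init(&barrier, block.size()); }
  block.sync();
  // On Ampere and later this may run on dedicated copy hardware.
  cuda::memcpy_async(block, dst, src, bytes, barrier);
  barrier.arrive_and_wait();  // copy is complete and visible after this
#else
  for (std::size_t b = block.thread_rank(); b < bytes; b += block.size()) {
    dst[b] = src[b];
  }
  block.sync();
#endif
}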
- - // read each column across the tile - // each warp takes a column with each thread of a warp taking a row this is done with cooperative - // groups where each column is chosen by the tiled partition and each thread in that partition - // works on a row - for (int relative_col = warp.meta_group_rank(); relative_col < num_tile_cols; - relative_col += warp.meta_group_size()) { - auto const absolute_col = relative_col + tile.start_col; - auto const col_size = col_sizes[absolute_col]; - auto const col_offset = col_offsets[absolute_col]; - auto const relative_col_offset = col_offset - starting_column_offset; - auto const col_ptr = input_data[absolute_col]; - - if (col_ptr == nullptr) { - // variable-width data column - continue; - } - - for (int relative_row = warp.thread_rank(); relative_row < num_tile_rows; - relative_row += warp.size()) { - if (relative_row >= num_tile_rows) { - // out of bounds - continue; - } - auto const absolute_row = relative_row + tile.start_row; - - auto const shared_offset = relative_row * tile_row_size + relative_col_offset; - auto const input_src = col_ptr + col_size * absolute_row; - - // copy the element from global memory - switch (col_size) { - case 2: { - const int16_t* short_col_input = reinterpret_cast(input_src); - *reinterpret_cast(&shared_data[shared_offset]) = *short_col_input; - break; - } - case 4: { - const int32_t* int_col_input = reinterpret_cast(input_src); - *reinterpret_cast(&shared_data[shared_offset]) = *int_col_input; - break; - } - case 8: { - const int64_t* long_col_input = reinterpret_cast(input_src); - *reinterpret_cast(&shared_data[shared_offset]) = *long_col_input; - break; - } - case 1: shared_data[shared_offset] = *input_src; break; - default: { - for (int i = 0; i < col_size; ++i) { - shared_data[shared_offset] = *input_src; - } - break; - } - } - } - } - - auto const tile_output_buffer = output_data[tile.batch_number]; - auto const row_batch_start = tile.batch_number == 0 ? 0 : batch_row_boundaries[tile.batch_number]; - - // no async copies above waiting on the barrier, so we sync the group here to ensure all copies to - // shared memory are completed before copying data out - group.sync(); - - // each warp takes a row - for (int copy_row = warp.meta_group_rank(); copy_row < tile.num_rows(); - copy_row += warp.meta_group_size()) { - auto const src = &shared_data[tile_row_size * copy_row]; - auto const dst = tile_output_buffer + row_offsets(copy_row + tile.start_row, row_batch_start) + - starting_column_offset; -#ifdef ASYNC_MEMCPY_SUPPORTED - cuda::memcpy_async(warp, dst, src, tile_row_size, tile_barrier); -#else - for (int b = warp.thread_rank(); b < tile_row_size; b += warp.size()) { - dst[b] = src[b]; - } -#endif - } - -#ifdef ASYNC_MEMCPY_SUPPORTED - // wait on the last copies to complete - tile_barrier.arrive_and_wait(); -#else - group.sync(); -#endif // ASYNC_MEMCPY_SUPPORTED -} - -/** - * @brief copy data from row-based format to cudf columns - * - * @tparam RowOffsetFunctor iterator that gives the size of a specific row of the table. 
- * @param num_rows total number of rows in the table
- * @param num_columns total number of columns in the table
- * @param shmem_used_per_tile amount of shared memory that is used by a tile
- * @param row_offsets offset to a specific row in the output data
- * @param batch_row_boundaries row numbers for batch starts
- * @param output_data pointer to output data, partitioned by data size
- * @param validity_offset offset into the output row for validity data
- * @param tile_infos information about the tiles of work
- * @param input_nm pointers to the null masks of the input columns - one per column
- *
- */
-template <typename RowOffsetFunctor>
-__global__ void copy_validity_to_rows(const size_type num_rows,
-                                      const size_type num_columns,
-                                      const size_type shmem_used_per_tile,
-                                      RowOffsetFunctor row_offsets,
-                                      size_type const* batch_row_boundaries,
-                                      int8_t** output_data,
-                                      const size_type validity_offset,
-                                      device_span<const tile_info> tile_infos,
-                                      const bitmask_type** input_nm)
-{
-  extern __shared__ int8_t shared_data[];
-
-  // each thread of a warp reads a single int32 of validity, so we read 128 bytes, then ballot_sync
-  // the bits and write the result to shmem. After we fill shared mem we memcpy it out in a blob.
-  auto const group = cooperative_groups::this_thread_block();
-  auto const warp  = cooperative_groups::tiled_partition<cudf::detail::warp_size>(group);
-
-#ifdef ASYNC_MEMCPY_SUPPORTED
-  // Initialize cuda barriers for each tile.
-  __shared__ cuda::barrier<cuda::thread_scope_block> shared_tile_barrier;
-  if (group.thread_rank() == 0) { init(&shared_tile_barrier, group.size()); }
-  group.sync();
-#endif  // ASYNC_MEMCPY_SUPPORTED
-
-  auto tile                = tile_infos[blockIdx.x];
-  auto const num_tile_cols = tile.num_cols();
-  auto const num_tile_rows = tile.num_rows();
-
-  auto const threads_per_warp = warp.size();
-  auto const rows_per_read    = cudf::detail::size_in_bits<bitmask_type>();
-
-  auto const num_sections_x = util::div_rounding_up_unsafe(num_tile_cols, threads_per_warp);
-  auto const num_sections_y = util::div_rounding_up_unsafe(num_tile_rows, rows_per_read);
-  auto const validity_data_row_length = util::round_up_unsafe(
-    util::div_rounding_up_unsafe(num_tile_cols, CHAR_BIT), JCUDF_ROW_ALIGNMENT);
-  auto const total_sections = num_sections_x * num_sections_y;
-
-  // the tile is divided into sections. A warp operates on a section at a time.
-  for (int my_section_idx = warp.meta_group_rank(); my_section_idx < total_sections;
-       my_section_idx += warp.meta_group_size()) {
-    // convert to rows and cols
-    auto const section_x          = my_section_idx % num_sections_x;
-    auto const section_y          = my_section_idx / num_sections_x;
-    auto const relative_col       = section_x * threads_per_warp + warp.thread_rank();
-    auto const relative_row       = section_y * rows_per_read;
-    auto const absolute_col       = relative_col + tile.start_col;
-    auto const absolute_row       = relative_row + tile.start_row;
-    auto const participating      = absolute_col < num_columns && absolute_row < num_rows;
-    auto const participation_mask = __ballot_sync(0xFFFFFFFF, participating);
-
-    if (participating) {
-      auto my_data = input_nm[absolute_col] != nullptr
-                       ? input_nm[absolute_col][word_index(absolute_row)]
-                       : std::numeric_limits<uint32_t>::max();
-
-      // every thread that is participating in the warp has 4 bytes, but it's column-based data and
-      // we need it in row-based. So we shuffle the bits around with ballot_sync to make the bytes
-      // we actually write.
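-
-      // [Editorial note] Illustrative sketch, not original code: __ballot_sync(mask, pred)
-      // returns a 32-bit word whose bit i is lane i's predicate. Since lane i holds column i's
-      // validity word here,
-      //
-      //   uint32_t row_bits = __ballot_sync(participation_mask, my_data & dw_mask);
-      //
-      // produces one row's validity with one bit per column, which is the row-major layout the
-      // loop below writes to shared memory.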
- bitmask_type dw_mask = 0x1; - for (int i = 0; i < threads_per_warp && relative_row + i < num_rows; ++i, dw_mask <<= 1) { - auto validity_data = __ballot_sync(participation_mask, my_data & dw_mask); - // lead thread in each warp writes data - auto const validity_write_offset = - validity_data_row_length * (relative_row + i) + (relative_col / CHAR_BIT); - if (warp.thread_rank() == 0) { - *reinterpret_cast(&shared_data[validity_write_offset]) = validity_data; - } - } - } - } - - auto const output_data_base = - output_data[tile.batch_number] + validity_offset + tile.start_col / CHAR_BIT; - - // each warp copies a row at a time - auto const row_bytes = util::div_rounding_up_unsafe(num_tile_cols, CHAR_BIT); - auto const row_batch_start = tile.batch_number == 0 ? 0 : batch_row_boundaries[tile.batch_number]; - - // make sure entire tile has finished copy - // Note that this was copied from above just under the for loop due to nsight complaints about - // divergent threads - group.sync(); - - for (int relative_row = warp.meta_group_rank(); relative_row < num_tile_rows; - relative_row += warp.meta_group_size()) { - auto const src = &shared_data[validity_data_row_length * relative_row]; - auto const dst = output_data_base + row_offsets(relative_row + tile.start_row, row_batch_start); -#ifdef ASYNC_MEMCPY_SUPPORTED - cuda::memcpy_async(warp, dst, src, row_bytes, shared_tile_barrier); -#else - for (int b = warp.thread_rank(); b < row_bytes; b += warp.size()) { - dst[b] = src[b]; - } -#endif - } - -#ifdef ASYNC_MEMCPY_SUPPORTED - // wait for tile of data to arrive - shared_tile_barrier.arrive_and_wait(); -#else - group.sync(); -#endif // ASYNC_MEMCPY_SUPPORTED -} - -/** - * @brief kernel to copy string data to JCUDF row format - * - * @tparam RowOffsetFunctor iterator for row offsets into the destination data - * @param num_rows number of rows in this portion of the table - * @param num_variable_columns number of columns of variable-width data - * @param variable_input_data variable width data column pointers - * @param variable_col_output_offsets output offset information for variable-width columns - * @param variable_col_offsets input offset information for variable-width columns - * @param fixed_width_row_size offset to variable-width data in a row - * @param row_offsets offsets for each row in output data - * @param batch_row_offset row start for this batch - * @param output_data pointer to output data for this batch - * - */ -template -__global__ void copy_strings_to_rows(size_type const num_rows, - size_type const num_variable_columns, - int8_t const** variable_input_data, - size_type const* variable_col_output_offsets, - size_type const** variable_col_offsets, - size_type fixed_width_row_size, - RowOffsetFunctor row_offsets, - size_type const batch_row_offset, - int8_t* output_data) -{ - // Each block will take a group of rows controlled by NUM_STRING_ROWS_PER_BLOCK_TO_ROWS. Each warp - // will copy a row at a time. The base thread will first go through column data and fill out - // offset/length information for the column. Then all threads of the warp will participate in the - // memcpy of the string data. 
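-
-  // [Editorial note] Illustrative layout sketch (names assumed, not original code): each output
-  // row is laid out as
-  //
-  //   [ fixed-width values, with an (offset, length) uint32 pair in place of each string column |
-  //     validity bytes | string bytes appended in column order ]
-  //
-  // so `offset` starts at fixed_width_row_size and is advanced by each string's length as the
-  // warp memcpys the characters to row_start + offset.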
-  auto const my_block = cooperative_groups::this_thread_block();
-  auto const warp     = cooperative_groups::tiled_partition<cudf::detail::warp_size>(my_block);
-#ifdef ASYNC_MEMCPY_SUPPORTED
-  cuda::barrier<cuda::thread_scope_block> block_barrier;
-#endif
-
-  auto const start_row =
-    blockIdx.x * NUM_STRING_ROWS_PER_BLOCK_TO_ROWS + warp.meta_group_rank() + batch_row_offset;
-  auto const end_row =
-    std::min(num_rows, static_cast<size_type>(start_row + NUM_STRING_ROWS_PER_BLOCK_TO_ROWS));
-
-  for (int row = start_row; row < end_row; row += warp.meta_group_size()) {
-    auto offset                = fixed_width_row_size;  // initial offset to variable-width data
-    auto const base_row_offset = row_offsets(row, 0);
-    for (int col = 0; col < num_variable_columns; ++col) {
-      auto const string_start_offset = variable_col_offsets[col][row];
-      auto const string_length       = variable_col_offsets[col][row + 1] - string_start_offset;
-      if (warp.thread_rank() == 0) {
-        // write the offset/length to column
-        uint32_t* output_dest = reinterpret_cast<uint32_t*>(
-          &output_data[base_row_offset + variable_col_output_offsets[col]]);
-        output_dest[0] = offset;
-        output_dest[1] = string_length;
-      }
-      auto string_output_dest = &output_data[base_row_offset + offset];
-      auto string_output_src  = &variable_input_data[col][string_start_offset];
-      warp.sync();
-#ifdef ASYNC_MEMCPY_SUPPORTED
-      cuda::memcpy_async(warp, string_output_dest, string_output_src, string_length, block_barrier);
-#else
-      for (int c = warp.thread_rank(); c < string_length; c += warp.size()) {
-        string_output_dest[c] = string_output_src[c];
-      }
-#endif
-      offset += string_length;
-    }
-  }
-}
-
-/**
- * @brief copy data from row-based format to cudf columns
- *
- * @tparam RowOffsetFunctor iterator that gives the size of a specific row of the table.
- * @param num_rows total number of rows in the table
- * @param num_columns total number of columns in the table
- * @param shmem_used_per_tile amount of shared memory that is used by a tile
- * @param row_offsets offset to a specific row in the input data
- * @param batch_row_boundaries row numbers for batch starts
- * @param output_data pointers to column data
- * @param col_sizes array of sizes for each element in a column - one per column
- * @param col_offsets offset into input data row for each column's start
- * @param tile_infos information about the tiles of work
- * @param input_data pointer to input data
- *
- */
-template <typename RowOffsetFunctor>
-__global__ void copy_from_rows(const size_type num_rows,
-                               const size_type num_columns,
-                               const size_type shmem_used_per_tile,
-                               RowOffsetFunctor row_offsets,
-                               size_type const* batch_row_boundaries,
-                               int8_t** output_data,
-                               const size_type* col_sizes,
-                               const size_type* col_offsets,
-                               device_span<const tile_info> tile_infos,
-                               const int8_t* input_data)
-{
-  // We are going to copy the data in two passes.
-  // The first pass copies a chunk of data into shared memory.
-  // The second pass copies that chunk from shared memory out to the final location.
-
-  // Because shared memory is limited we copy a subset of the rows at a time. This has been broken
-  // up for us in the tile_info struct, so we don't have any calculation to do here, but it is
-  // important to note.
-
-  // To speed up some of the random access memory we do, we copy col_sizes and col_offsets to shared
-  // memory for each of the tiles that we work on
-
-  auto const group = cooperative_groups::this_thread_block();
-  auto const warp  = cooperative_groups::tiled_partition<cudf::detail::warp_size>(group);
-  extern __shared__ int8_t shared[];
-
-#ifdef ASYNC_MEMCPY_SUPPORTED
-  // Initialize cuda barriers for each tile.
- __shared__ cuda::barrier tile_barrier; - if (group.thread_rank() == 0) { init(&tile_barrier, group.size()); } - group.sync(); -#endif // ASYNC_MEMCPY_SUPPORTED - - { - auto const fetch_tile = tile_infos[blockIdx.x]; - auto const fetch_tile_start_row = fetch_tile.start_row; - auto const starting_col_offset = col_offsets[fetch_tile.start_col]; - auto const fetch_tile_row_size = fetch_tile.get_shared_row_size(col_offsets, col_sizes); - auto const row_batch_start = - fetch_tile.batch_number == 0 ? 0 : batch_row_boundaries[fetch_tile.batch_number]; - - for (int absolute_row = warp.meta_group_rank() + fetch_tile.start_row; - absolute_row <= fetch_tile.end_row; - absolute_row += warp.meta_group_size()) { - warp.sync(); - auto shared_offset = (absolute_row - fetch_tile_start_row) * fetch_tile_row_size; - auto dst = &shared[shared_offset]; - auto src = &input_data[row_offsets(absolute_row, row_batch_start) + starting_col_offset]; - // copy the data -#ifdef ASYNC_MEMCPY_SUPPORTED - cuda::memcpy_async(warp, dst, src, fetch_tile_row_size, tile_barrier); -#else - for (int b = warp.thread_rank(); b < fetch_tile_row_size; b += warp.size()) { - dst[b] = src[b]; - } -#endif - } - } - - { - auto const tile = tile_infos[blockIdx.x]; - auto const rows_in_tile = tile.num_rows(); - auto const cols_in_tile = tile.num_cols(); - auto const tile_row_size = tile.get_shared_row_size(col_offsets, col_sizes); - -#ifdef ASYNC_MEMCPY_SUPPORTED - // ensure our data is ready - tile_barrier.arrive_and_wait(); -#else - group.sync(); -#endif // ASYNC_MEMCPY_SUPPORTED - - // Now we copy from shared memory to final destination. The data is laid out in rows in shared - // memory, so the reads for a column will be "vertical". Because of this and the different sizes - // for each column, this portion is handled on row/column basis. to prevent each thread working - // on a single row and also to ensure that all threads can do work in the case of more threads - // than rows, we do a global index instead of a double for loop with col/row. - for (int relative_row = warp.thread_rank(); relative_row < rows_in_tile; - relative_row += warp.size()) { - auto const absolute_row = relative_row + tile.start_row; - auto const shared_memory_row_offset = tile_row_size * relative_row; - - for (int relative_col = warp.meta_group_rank(); relative_col < cols_in_tile; - relative_col += warp.meta_group_size()) { - auto const absolute_col = relative_col + tile.start_col; - - auto const shared_memory_offset = - col_offsets[absolute_col] - col_offsets[tile.start_col] + shared_memory_row_offset; - auto const column_size = col_sizes[absolute_col]; - - int8_t* shmem_src = &shared[shared_memory_offset]; - int8_t* dst = &output_data[absolute_col][absolute_row * column_size]; - - MEMCPY(dst, shmem_src, column_size, tile_barrier); - } - } - } - -#ifdef ASYNC_MEMCPY_SUPPORTED - // wait on the last copies to complete - tile_barrier.arrive_and_wait(); -#else - group.sync(); -#endif // ASYNC_MEMCPY_SUPPORTED -} - -/** - * @brief copy data from row-based format to cudf columns - * - * @tparam RowOffsetFunctor iterator that gives the size of a specific row of the table. 
- * @param num_rows total number of rows in the table - * @param num_columns total number of columns in the table - * @param shmem_used_per_tile amount of shared memory that is used by a tile - * @param row_offsets offset to the first column a specific row in the input data - * @param batch_row_boundaries row numbers for batch starts - * @param output_nm pointers to null masks for columns - * @param validity_offsets offset into input data row for validity data - * @param tile_infos information about the tiles of work - * @param input_data pointer to input data - * - */ -template -__global__ void copy_validity_from_rows(const size_type num_rows, - const size_type num_columns, - const size_type shmem_used_per_tile, - RowOffsetFunctor row_offsets, - size_type const* batch_row_boundaries, - bitmask_type** output_nm, - const size_type validity_offset, - device_span tile_infos, - const int8_t* input_data) -{ - extern __shared__ int8_t shared[]; - - using cudf::detail::warp_size; - - // each thread of warp reads a single byte of validity - so we read 32 bytes then ballot_sync the - // bits and write the result to shmem after we fill shared mem memcpy it out in a blob. Probably - // need knobs for number of rows vs columns to balance read/write - - // C0 C1 C2 C3 C4 C5 C6 C7 - // R0 1 0 1 0 0 1 1 0 <-- thread 0 reads byte r0 - // R1 1 1 1 1 1 1 1 0 <-- thread 1 reads byte r1 - // R2 0 0 1 0 0 1 1 0 <-- thread 2 reads byte r2 - // ... - // R31 1 1 1 1 1 1 1 1 <-- thread 31 reads byte r31 - // ^ - // | 1 bit of each input byte, by column, are swizzled into a single 32 bit word via - // __ballot_sync, representing 32 rows of that column. - - auto const group = cooperative_groups::this_thread_block(); - auto const warp = cooperative_groups::tiled_partition(group); - -#ifdef ASYNC_MEMCPY_SUPPORTED - // Initialize cuda barriers for each tile. - __shared__ cuda::barrier shared_tile_barrier; - if (group.thread_rank() == 0) { init(&shared_tile_barrier, group.size()); } - group.sync(); -#endif // ASYNC_MEMCPY_SUPPORTED - - auto const tile = tile_infos[blockIdx.x]; - auto const tile_start_col = tile.start_col; - auto const tile_start_row = tile.start_row; - auto const num_tile_cols = tile.num_cols(); - auto const num_tile_rows = tile.num_rows(); - - auto const threads_per_warp = warp.size(); - auto const cols_per_read = CHAR_BIT; - - auto const rows_per_read = static_cast(threads_per_warp); - auto const num_sections_x = util::div_rounding_up_safe(num_tile_cols, cols_per_read); - auto const num_sections_y = util::div_rounding_up_safe(num_tile_rows, rows_per_read); - auto const validity_data_col_length = num_sections_y * 4; // words to bytes - auto const total_sections = num_sections_x * num_sections_y; - - // the tile is divided into sections. A warp operates on a section at a time. - for (int my_section_idx = warp.meta_group_rank(); my_section_idx < total_sections; - my_section_idx += warp.meta_group_size()) { - // convert section to row and col - auto const section_x = my_section_idx % num_sections_x; - auto const section_y = my_section_idx / num_sections_x; - auto const relative_col = section_x * cols_per_read; - auto const relative_row = section_y * rows_per_read + warp.thread_rank(); - auto const absolute_col = relative_col + tile_start_col; - auto const absolute_row = relative_row + tile_start_row; - auto const row_batch_start = - tile.batch_number == 0 ? 
0 : batch_row_boundaries[tile.batch_number]; - - auto const participation_mask = __ballot_sync(0xFFFFFFFF, absolute_row < num_rows); - - if (absolute_row < num_rows) { - auto const my_byte = input_data[row_offsets(absolute_row, row_batch_start) + validity_offset + - (absolute_col / cols_per_read)]; - - // so every thread that is participating in the warp has a byte, but it's row-based data and - // we need it in column-based. So we shuffle the bits around to make the bytes we actually - // write. - for (int i = 0, byte_mask = 0x1; (i < cols_per_read) && ((relative_col + i) < num_columns); - ++i, byte_mask <<= 1) { - auto const validity_data = __ballot_sync(participation_mask, my_byte & byte_mask); - // lead thread in each warp writes data - if (warp.thread_rank() == 0) { - auto const validity_write_offset = - validity_data_col_length * (relative_col + i) + relative_row / cols_per_read; - *reinterpret_cast(&shared[validity_write_offset]) = validity_data; - } - } - } - } - - // now memcpy the shared memory out to the final destination - auto const col_words = util::div_rounding_up_unsafe(num_tile_rows, CHAR_BIT * 4); - - // make sure entire tile has finished copy - group.sync(); - - for (int relative_col = warp.meta_group_rank(); relative_col < num_tile_cols; - relative_col += warp.meta_group_size()) { - auto const absolute_col = relative_col + tile_start_col; - auto dst = output_nm[absolute_col] + word_index(tile_start_row); - auto const src = - reinterpret_cast(&shared[validity_data_col_length * relative_col]); - -#ifdef ASYNC_MEMCPY_SUPPORTED - cuda::memcpy_async( - warp, dst, src, aligned_size_t<4>(validity_data_col_length), shared_tile_barrier); -#else - for (int b = warp.thread_rank(); b < col_words; b += warp.size()) { - dst[b] = src[b]; - } -#endif - } - -#ifdef ASYNC_MEMCPY_SUPPORTED - // wait for tile of data to arrive - shared_tile_barrier.arrive_and_wait(); -#else - group.sync(); -#endif // ASYNC_MEMCPY_SUPPORTED -} - -/** - * @brief copies string data from jcudf row format to cudf columns - * - * @tparam RowOffsetFunctor iterator for row offsets into the destination data - * @param row_offsets offsets for each row in input data - * @param string_row_offsets offset data into jcudf row data for each string - * @param string_lengths length of each incoming string in each column - * @param string_column_offsets offset column data for cudf column - * @param string_col_data output cudf string column data - * @param row_data jcudf row data - * @param num_rows number of rows in data - * @param num_string_columns number of string columns in the table - */ -template -__global__ void copy_strings_from_rows(RowOffsetFunctor row_offsets, - int32_t** string_row_offsets, - int32_t** string_lengths, - size_type** string_column_offsets, - char** string_col_data, - int8_t const* row_data, - size_type const num_rows, - size_type const num_string_columns) -{ - // Each warp takes a tile, which is a single column and up to ROWS_PER_BLOCK rows. A tile will not - // wrap around the bottom of the table. The warp will copy the strings for each row in the tile. - // Traversing in row-major order to coalesce the offsets and size reads. 
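-
-  // [Editorial note] Illustrative sketch, not original code: tiles are numbered column-major
-  // over a (tiles_per_col x num_string_columns) grid, so a flat tile index decomposes as
-  //
-  //   col          = my_tile / tiles_per_col;
-  //   starting_row = (my_tile % tiles_per_col) * ROWS_PER_BLOCK;
-  //
-  // which is exactly the arithmetic used in the loop below.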
- auto my_block = cooperative_groups::this_thread_block(); - auto warp = cooperative_groups::tiled_partition(my_block); -#ifdef ASYNC_MEMCPY_SUPPORTED - cuda::barrier block_barrier; -#endif - - // workaround for not being able to take a reference to a constexpr host variable - auto const ROWS_PER_BLOCK = NUM_STRING_ROWS_PER_BLOCK_FROM_ROWS; - auto const tiles_per_col = util::div_rounding_up_unsafe(num_rows, ROWS_PER_BLOCK); - auto const starting_tile = blockIdx.x * warp.meta_group_size() + warp.meta_group_rank(); - auto const num_tiles = tiles_per_col * num_string_columns; - auto const tile_stride = warp.meta_group_size() * gridDim.x; - // Each warp will copy strings in its tile. This is handled by all the threads of a warp passing - // the same parameters to async_memcpy and all threads in the warp participating in the copy. - for (auto my_tile = starting_tile; my_tile < num_tiles; my_tile += tile_stride) { - auto const starting_row = (my_tile % tiles_per_col) * ROWS_PER_BLOCK; - auto const col = my_tile / tiles_per_col; - auto const str_len = string_lengths[col]; - auto const str_row_off = string_row_offsets[col]; - auto const str_col_off = string_column_offsets[col]; - auto str_col_data = string_col_data[col]; - for (int row = starting_row; row < starting_row + ROWS_PER_BLOCK && row < num_rows; ++row) { - auto const src = &row_data[row_offsets(row, 0) + str_row_off[row]]; - auto dst = &str_col_data[str_col_off[row]]; - -#ifdef ASYNC_MEMCPY_SUPPORTED - cuda::memcpy_async(warp, dst, src, str_len[row], block_barrier); -#else - for (int c = warp.thread_rank(); c < str_len[row]; c += warp.size()) { - dst[c] = src[c]; - } -#endif - } - } -} - -/** - * @brief Calculate the dimensions of the kernel for fixed width only columns. - * - * @param [in] num_columns the number of columns being copied. - * @param [in] num_rows the number of rows being copied. - * @param [in] size_per_row the size each row takes up when padded. - * @param [out] blocks the size of the blocks for the kernel - * @param [out] threads the size of the threads for the kernel - * @return the size in bytes of shared memory needed for each block. - */ -static int calc_fixed_width_kernel_dims(const size_type num_columns, - const size_type num_rows, - const size_type size_per_row, - dim3& blocks, - dim3& threads) -{ - // We have found speed degrades when a thread handles more than 4 columns. - // Each block is 2 dimensional. The y dimension indicates the columns. - // We limit this to 32 threads in the y dimension so we can still - // have at least 32 threads in the x dimension (1 warp) which should - // result in better coalescing of memory operations. We also - // want to guarantee that we are processing a multiple of 32 threads - // in the x dimension because we use atomic operations at the block - // level when writing validity data out to main memory, and that would - // need to change if we split a word of validity data between blocks. - int const y_block_size = min(util::div_rounding_up_safe(num_columns, 4), 32); - int const x_possible_block_size = 1024 / y_block_size; - // 48KB is the default setting for shared memory per block according to the cuda tutorials - // If someone configures the GPU to only have 16 KB this might not work. 
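-
-  // [Editorial note] Worked example with assumed values, not original code: for num_columns = 8
-  // and size_per_row = 64 bytes, y_block_size = min(ceil(8 / 4), 32) = 2 and
-  // x_possible_block_size = 1024 / 2 = 512; the shared-memory cap below then allows
-  // min(512, 48 * 1024 / 64) = 512 threads in x, already a multiple of 32.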
-  int const max_shared_size = 48 * 1024;
-  // If we don't have enough shared memory there is no point in having more threads
-  // per block that will just sit idle
-  auto const max_block_size = std::min(x_possible_block_size, max_shared_size / size_per_row);
-  // Make sure that the x dimension is a multiple of 32; this not only helps
-  // coalesce memory access, it also lets us do a ballot sync for validity to write
-  // the data back out at the warp level. If x is a multiple of 32 then each thread in the y
-  // dimension is associated with one or more warps, that should correspond to the validity
-  // words directly.
-  int const block_size = (max_block_size / 32) * 32;
-  CUDF_EXPECTS(block_size != 0, "Row size is too large to fit in shared memory");
-
-  // The maximum number of blocks supported in the x dimension is 2 ^ 31 - 1
-  // but in practice having too many can cause some overhead that I don't totally
-  // understand. Playing around with this, having as little as 600 blocks appears
-  // to be able to saturate memory on V100, so this is an order of magnitude higher
-  // to try and future proof this a bit.
-  int const num_blocks = std::clamp((num_rows + block_size - 1) / block_size, 1, 10240);
-
-  blocks.x  = num_blocks;
-  blocks.y  = 1;
-  blocks.z  = 1;
-  threads.x = block_size;
-  threads.y = y_block_size;
-  threads.z = 1;
-  return size_per_row * block_size;
-}
-
-/**
- * When converting to rows it is possible that the size of the table was too big to fit
- * in a single column. This creates an output column for a subset of the rows in a table
- * going from start row and containing the next num_rows. Most of the parameters passed
- * into this function are common between runs and should be calculated once.
- */
-static std::unique_ptr<column> fixed_width_convert_to_rows(
-  const size_type start_row,
-  const size_type num_rows,
-  const size_type num_columns,
-  const size_type size_per_row,
-  rmm::device_uvector<size_type>& column_start,
-  rmm::device_uvector<size_type>& column_size,
-  rmm::device_uvector<const int8_t*>& input_data,
-  rmm::device_uvector<const bitmask_type*>& input_nm,
-  const scalar& zero,
-  const scalar& scalar_size_per_row,
-  rmm::cuda_stream_view stream,
-  rmm::mr::device_memory_resource* mr)
-{
-  int64_t const total_allocation = size_per_row * num_rows;
-  // We made a mistake in the split somehow
-  CUDF_EXPECTS(total_allocation < std::numeric_limits<size_type>::max(),
-               "Table is too large to fit!");
-
-  // Allocate and set the offsets row for the byte array
-  std::unique_ptr<column> offsets =
-    cudf::detail::sequence(num_rows + 1, zero, scalar_size_per_row, stream, mr);
-
-  std::unique_ptr<column> data = make_numeric_column(data_type(type_id::INT8),
-                                                     static_cast<size_type>(total_allocation),
-                                                     mask_state::UNALLOCATED,
-                                                     stream,
-                                                     mr);
-
-  dim3 blocks;
-  dim3 threads;
-  int shared_size =
-    detail::calc_fixed_width_kernel_dims(num_columns, num_rows, size_per_row, blocks, threads);
-
-  copy_to_rows_fixed_width_optimized<<<blocks, threads, shared_size, stream.value()>>>(
-    start_row,
-    num_rows,
-    num_columns,
-    size_per_row,
-    column_start.data(),
-    column_size.data(),
-    input_data.data(),
-    input_nm.data(),
-    data->mutable_view().data<int8_t>());
-
-  return make_lists_column(num_rows,
-                           std::move(offsets),
-                           std::move(data),
-                           0,
-                           rmm::device_buffer{0, cudf::get_default_stream(), mr},
-                           stream,
-                           mr);
-}
-
-static inline bool are_all_fixed_width(std::vector<data_type> const& schema)
-{
-  return std::all_of(
-    schema.begin(), schema.end(), [](const data_type& t) { return is_fixed_width(t); });
-}
-
-/**
- * @brief Given a set of fixed width columns, calculate how the data will be laid out in memory.
- * - * @param [in] schema the types of columns that need to be laid out. - * @param [out] column_start the byte offset where each column starts in the row. - * @param [out] column_size the size in bytes of the data for each columns in the row. - * @return the size in bytes each row needs. - */ -static inline int32_t compute_fixed_width_layout(std::vector const& schema, - std::vector& column_start, - std::vector& column_size) -{ - // We guarantee that the start of each column is 64-bit aligned so anything can go - // there, but to make the code simple we will still do an alignment for it. - int32_t at_offset = 0; - for (auto col = schema.begin(); col < schema.end(); col++) { - size_type s = size_of(*col); - column_size.emplace_back(s); - std::size_t allocation_needed = s; - std::size_t alignment_needed = allocation_needed; // They are the same for fixed width types - at_offset = util::round_up_unsafe(at_offset, static_cast(alignment_needed)); - column_start.emplace_back(at_offset); - at_offset += allocation_needed; - } - - // Now we need to add in space for validity - // Eventually we can think about nullable vs not nullable, but for now we will just always add - // it in - int32_t const validity_bytes_needed = - util::div_rounding_up_safe(schema.size(), CHAR_BIT); - // validity comes at the end and is byte aligned so we can pack more in. - at_offset += validity_bytes_needed; - // Now we need to pad the end so all rows are 64 bit aligned - return util::round_up_unsafe(at_offset, JCUDF_ROW_ALIGNMENT); -} - -/** - * @brief column sizes and column start offsets for a table - */ -struct column_info_s { - size_type size_per_row; - std::vector column_starts; - std::vector column_sizes; - std::vector variable_width_column_starts; - - column_info_s& operator=(column_info_s const& other) = delete; - column_info_s& operator=(column_info_s&& other) = delete; -}; - -/** - * @brief Compute information about a table such as bytes per row and offsets. - * - * @tparam iterator iterator of column schema data - * @param begin starting iterator of column schema - * @param end ending iterator of column schema - * @param column_starts column start offsets - * @param column_sizes size in bytes of each column - * @return size of the fixed_width data portion of a row. - */ -template -column_info_s compute_column_information(iterator begin, iterator end) -{ - size_type size_per_row = 0; - std::vector column_starts; - std::vector column_sizes; - std::vector variable_width_column_starts; - - column_starts.reserve(std::distance(begin, end) + 1); - column_sizes.reserve(std::distance(begin, end)); - - for (auto col_type = begin; col_type != end; ++col_type) { - bool const compound_type = is_compound(*col_type); - - // a list or string column will write a single uint64 of data here for offset/length - auto const col_size = compound_type ? sizeof(uint32_t) + sizeof(uint32_t) : size_of(*col_type); - - // align size for this type - They are the same for fixed width types and 4 bytes for variable - // width length/offset combos - size_type const alignment_needed = compound_type ? 
__alignof(uint32_t) : col_size;
-    size_per_row = util::round_up_unsafe(size_per_row, alignment_needed);
-    if (compound_type) { variable_width_column_starts.push_back(size_per_row); }
-    column_starts.push_back(size_per_row);
-    column_sizes.push_back(col_size);
-    size_per_row += col_size;
-  }
-
-  // add validity offset to the end of fixed_width offsets
-  auto validity_offset = size_per_row;
-  column_starts.push_back(validity_offset);
-
-  // validity is byte-aligned in the JCUDF format
-  size_per_row +=
-    util::div_rounding_up_safe(static_cast<size_type>(std::distance(begin, end)), CHAR_BIT);
-
-  return {size_per_row,
-          std::move(column_starts),
-          std::move(column_sizes),
-          std::move(variable_width_column_starts)};
-}
-
-/**
- * @brief Build `tile_info` for the validity data to break up the work.
- *
- * @param num_columns number of columns in the table
- * @param num_rows number of rows in the table
- * @param shmem_limit_per_tile size of shared memory available to a single gpu tile
- * @param row_batches batched row information for multiple output locations
- * @return vector of `tile_info` structs for validity data
- */
-std::vector<detail::tile_info> build_validity_tile_infos(size_type const& num_columns,
-                                                         size_type const& num_rows,
-                                                         size_type const& shmem_limit_per_tile,
-                                                         std::vector<row_batch> const& row_batches)
-{
-  auto const desired_rows_and_columns = static_cast<int>(sqrt(shmem_limit_per_tile));
-  auto const column_stride            = util::round_up_unsafe(
-    [&]() {
-      if (desired_rows_and_columns > num_columns) {
-        // not many columns, build a single tile for table width and ship it off
-        return num_columns;
-      } else {
-        return util::round_down_safe(desired_rows_and_columns, CHAR_BIT);
-      }
-    }(),
-    JCUDF_ROW_ALIGNMENT);
-
-  // we fit as much as we can given the column stride. Note that an element in the table takes just
-  // 1 bit, but a row with a single element still takes 8 bytes!
-  auto const bytes_per_row =
-    util::round_up_safe(util::div_rounding_up_unsafe(column_stride, CHAR_BIT), JCUDF_ROW_ALIGNMENT);
-  auto const row_stride =
-    std::min(num_rows, util::round_down_safe(shmem_limit_per_tile / bytes_per_row, 64));
-  std::vector<detail::tile_info> validity_tile_infos;
-  validity_tile_infos.reserve(num_columns / column_stride * num_rows / row_stride);
-  for (int col = 0; col < num_columns; col += column_stride) {
-    int current_tile_row_batch = 0;
-    int rows_left_in_batch     = row_batches[current_tile_row_batch].row_count;
-    int row                    = 0;
-    while (row < num_rows) {
-      if (rows_left_in_batch == 0) {
-        current_tile_row_batch++;
-        rows_left_in_batch = row_batches[current_tile_row_batch].row_count;
-      }
-      int const tile_height = std::min(row_stride, rows_left_in_batch);
-      validity_tile_infos.emplace_back(
-        detail::tile_info{col,
-                          row,
-                          std::min(col + column_stride - 1, num_columns - 1),
-                          row + tile_height - 1,
-                          current_tile_row_batch});
-      row += tile_height;
-      rows_left_in_batch -= tile_height;
-    }
-  }
-
-  return validity_tile_infos;
-}
-
-/**
- * @brief functor that returns the size of a row or 0 if the row is greater than the number of rows
- * in the table
- *
- * @tparam RowSize iterator that returns the size of a specific row
- */
-template <typename RowSize>
-struct row_size_functor {
-  row_size_functor(size_type row_end, RowSize row_sizes, size_type last_row_end)
-    : _row_end(row_end), _row_sizes(row_sizes), _last_row_end(last_row_end)
-  {
-  }
-
-  __device__ inline uint64_t operator()(int i) const
-  {
-    return i >= _row_end ?
0 : _row_sizes[i + _last_row_end]; - } - - size_type _row_end; - RowSize _row_sizes; - size_type _last_row_end; -}; - -/** - * @brief Builds batches of rows that will fit in the size limit of a column. - * - * @tparam RowSize iterator that gives the size of a specific row of the table. - * @param num_rows Total number of rows in the table - * @param row_sizes iterator that gives the size of a specific row of the table. - * @param all_fixed_width bool indicating all data in this table is fixed width - * @param stream stream to operate on for this work - * @param mr memory resource used to allocate any returned data - * @returns vector of size_type's that indicate row numbers for batch boundaries and a - * device_uvector of row offsets - */ -template -batch_data build_batches(size_type num_rows, - RowSize row_sizes, - bool all_fixed_width, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) -{ - auto const total_size = thrust::reduce(rmm::exec_policy(stream), row_sizes, row_sizes + num_rows); - auto const num_batches = static_cast( - util::div_rounding_up_safe(total_size, static_cast(MAX_BATCH_SIZE))); - auto const num_offsets = num_batches + 1; - std::vector row_batches; - std::vector batch_row_boundaries; - device_uvector batch_row_offsets(all_fixed_width ? 0 : num_rows, stream); - - // at most max gpu memory / 2GB iterations. - batch_row_boundaries.reserve(num_offsets); - batch_row_boundaries.push_back(0); - size_type last_row_end = 0; - device_uvector cumulative_row_sizes(num_rows, stream); - - // Evaluate the row size values before calling `inclusive_scan` to workaround - // memory issue in https://github.com/NVIDIA/spark-rapids-jni/issues/1567. - thrust::copy( - rmm::exec_policy(stream), row_sizes, row_sizes + num_rows, cumulative_row_sizes.begin()); - thrust::inclusive_scan(rmm::exec_policy(stream), - cumulative_row_sizes.begin(), - cumulative_row_sizes.end(), - cumulative_row_sizes.begin()); - - // This needs to be split this into 2 gig batches. Care must be taken to avoid a batch larger than - // 2 gigs. Imagine a table with 900 meg rows. The batches should occur every 2 rows, but if a - // lower bound is run at 2 gigs, 4 gigs, 6 gigs. the batches will be 2 rows, 2 rows, 3 rows, which - // will be invalid. The previous batch size must be taken into account when building a new batch. - // One way is to pull the batch size back to the host and add it to MAX_BATCH_SIZE for the lower - // bound search. The other method involves keeping everything on device, but subtracting the - // previous batch from cumulative_row_sizes based on index. This involves no synchronization - // between GPU and CPU, but involves more work on the GPU. These further need to be broken on a - // 32-row boundary to match the fixed_width optimized versions. - - while (last_row_end < num_rows) { - auto offset_row_sizes = thrust::make_transform_iterator( - cumulative_row_sizes.begin(), - [last_row_end, cumulative_row_sizes = cumulative_row_sizes.data()] __device__(auto i) { - return i - cumulative_row_sizes[last_row_end]; - }); - auto search_start = offset_row_sizes + last_row_end; - auto search_end = offset_row_sizes + num_rows; - - // find the next MAX_BATCH_SIZE boundary - auto const lb = - thrust::lower_bound(rmm::exec_policy(stream), search_start, search_end, MAX_BATCH_SIZE); - size_type const batch_size = lb - search_start; - - size_type const row_end = lb == search_end - ? 
batch_size + last_row_end - : last_row_end + util::round_down_safe(batch_size, 32); - - // build offset list for each row in this batch - auto const num_rows_in_batch = row_end - last_row_end; - - // build offset list for each row in this batch - auto const num_entries = row_end - last_row_end + 1; - device_uvector output_batch_row_offsets(num_entries, stream, mr); - - auto row_size_iter_bounded = cudf::detail::make_counting_transform_iterator( - 0, row_size_functor(row_end, row_sizes, last_row_end)); - - thrust::exclusive_scan(rmm::exec_policy(stream), - row_size_iter_bounded, - row_size_iter_bounded + num_entries, - output_batch_row_offsets.begin()); - - auto const batch_bytes = output_batch_row_offsets.element(num_rows_in_batch, stream); - - // The output_batch_row_offsets vector is used as the offset column of the returned data. This - // needs to be individually allocated, but the kernel needs a contiguous array of offsets or - // more global lookups are necessary. - if (!all_fixed_width) { - cudaMemcpy(batch_row_offsets.data() + last_row_end, - output_batch_row_offsets.data(), - num_rows_in_batch * sizeof(size_type), - cudaMemcpyDeviceToDevice); - } - - batch_row_boundaries.push_back(row_end); - row_batches.push_back({batch_bytes, num_rows_in_batch, std::move(output_batch_row_offsets)}); - - last_row_end = row_end; - } - - return { - std::move(batch_row_offsets), - make_device_uvector_async(batch_row_boundaries, stream, rmm::mr::get_current_device_resource()), - std::move(batch_row_boundaries), - std::move(row_batches)}; -} - -/** - * @brief Computes the number of tiles necessary given a tile height and batch offsets - * - * @param batch_row_boundaries row boundaries for each batch - * @param desired_tile_height height of each tile in the table - * @param stream stream to use - * @return number of tiles necessary - */ -int compute_tile_counts(device_span const& batch_row_boundaries, - int desired_tile_height, - rmm::cuda_stream_view stream) -{ - size_type const num_batches = batch_row_boundaries.size() - 1; - device_uvector num_tiles(num_batches, stream); - auto iter = thrust::make_counting_iterator(0); - thrust::transform( - rmm::exec_policy(stream), - iter, - iter + num_batches, - num_tiles.begin(), - [desired_tile_height, - batch_row_boundaries = batch_row_boundaries.data()] __device__(auto batch_index) -> size_type { - return util::div_rounding_up_unsafe( - batch_row_boundaries[batch_index + 1] - batch_row_boundaries[batch_index], - desired_tile_height); - }); - return thrust::reduce(rmm::exec_policy(stream), num_tiles.begin(), num_tiles.end()); -} - -/** - * @brief Builds the `tile_info` structs for a given table. 
- * - * @param tiles span of tiles to populate - * @param batch_row_boundaries boundary to row batches - * @param column_start starting column of the tile - * @param column_end ending column of the tile - * @param desired_tile_height height of the tile - * @param total_number_of_rows total number of rows in the table - * @param stream stream to use - * @return number of tiles created - */ -size_type build_tiles( - device_span tiles, - device_uvector const& batch_row_boundaries, // comes from build_batches - int column_start, - int column_end, - int desired_tile_height, - int total_number_of_rows, - rmm::cuda_stream_view stream) -{ - size_type const num_batches = batch_row_boundaries.size() - 1; - device_uvector num_tiles(num_batches, stream); - auto iter = thrust::make_counting_iterator(0); - thrust::transform( - rmm::exec_policy(stream), - iter, - iter + num_batches, - num_tiles.begin(), - [desired_tile_height, - batch_row_boundaries = batch_row_boundaries.data()] __device__(auto batch_index) -> size_type { - return util::div_rounding_up_unsafe( - batch_row_boundaries[batch_index + 1] - batch_row_boundaries[batch_index], - desired_tile_height); - }); - - size_type const total_tiles = - thrust::reduce(rmm::exec_policy(stream), num_tiles.begin(), num_tiles.end()); - - device_uvector tile_starts(num_batches + 1, stream); - auto tile_iter = cudf::detail::make_counting_transform_iterator( - 0, [num_tiles = num_tiles.data(), num_batches] __device__(auto i) { - return (i < num_batches) ? num_tiles[i] : 0; - }); - thrust::exclusive_scan(rmm::exec_policy(stream), - tile_iter, - tile_iter + num_batches + 1, - tile_starts.begin()); // in tiles - - thrust::transform( - rmm::exec_policy(stream), - iter, - iter + total_tiles, - tiles.begin(), - [ =, - tile_starts = tile_starts.data(), - batch_row_boundaries = batch_row_boundaries.data()] __device__(size_type tile_index) { - // what batch this tile falls in - auto const batch_index_iter = - thrust::upper_bound(thrust::seq, tile_starts, tile_starts + num_batches, tile_index); - auto const batch_index = std::distance(tile_starts, batch_index_iter) - 1; - // local index within the tile - int const local_tile_index = tile_index - tile_starts[batch_index]; - // the start row for this batch. - int const batch_row_start = batch_row_boundaries[batch_index]; - // the start row for this tile - int const tile_row_start = batch_row_start + (local_tile_index * desired_tile_height); - // the end row for this tile - int const max_row = std::min(total_number_of_rows - 1, - batch_index + 1 > num_batches - ? std::numeric_limits::max() - : static_cast(batch_row_boundaries[batch_index + 1]) - 1); - int const tile_row_end = - std::min(batch_row_start + ((local_tile_index + 1) * desired_tile_height) - 1, max_row); - - // stuff the tile - return tile_info{ - column_start, tile_row_start, column_end, tile_row_end, static_cast(batch_index)}; - }); - - return total_tiles; -} - -/** - * @brief Determines what data should be operated on by each tile for the incoming table. 
- * - * @tparam TileCallback Callback that receives the start and end columns of tiles - * @param column_sizes vector of the size of each column - * @param column_starts vector of the offset of each column - * @param first_row_batch_size size of the first row batch to limit max tile size since a tile - * is unable to span batches - * @param total_number_of_rows total number of rows in the table - * @param shmem_limit_per_tile shared memory allowed per tile - * @param f callback function called when building a tile - */ -template -void determine_tiles(std::vector const& column_sizes, - std::vector const& column_starts, - size_type const first_row_batch_size, - size_type const total_number_of_rows, - size_type const& shmem_limit_per_tile, - TileCallback f) -{ - // tile infos are organized with the tile going "down" the columns this provides the most - // coalescing of memory access - int current_tile_width = 0; - int current_tile_start_col = 0; - - // the ideal tile height has lots of 8-byte reads and 8-byte writes. The optimal read/write would - // be memory cache line sized access, but since other tiles will read/write the edges this may not - // turn out to be overly important. For now, we will attempt to build a square tile as far as byte - // sizes. x * y = shared_mem_size. Which translates to x^2 = shared_mem_size since we want them - // equal, so height and width are sqrt(shared_mem_size). The trick is that it's in bytes, not rows - // or columns. - auto const square_bias = 32; // bias towards columns for performance reasons - auto const optimal_square_len = static_cast(sqrt(shmem_limit_per_tile)); - auto const desired_tile_height = util::round_up_safe( - std::min(optimal_square_len / square_bias, total_number_of_rows), cudf::detail::warp_size); - auto const tile_height = std::clamp(desired_tile_height, 1, first_row_batch_size); - - int row_size = 0; - - // march each column and build the tiles of appropriate sizes - for (uint col = 0; col < column_sizes.size(); ++col) { - auto const col_size = column_sizes[col]; - - // align size for this type - auto const alignment_needed = col_size; // They are the same for fixed width types - auto const row_size_aligned = util::round_up_unsafe(row_size, alignment_needed); - auto const row_size_with_this_col = row_size_aligned + col_size; - auto const row_size_with_end_pad = - util::round_up_unsafe(row_size_with_this_col, JCUDF_ROW_ALIGNMENT); - - if (row_size_with_end_pad * tile_height > shmem_limit_per_tile) { - // too large, close this tile, generate vertical tiles and restart - f(current_tile_start_col, col == 0 ? 
col : col - 1, tile_height); - - row_size = - util::round_up_unsafe((column_starts[col] + column_sizes[col]) & 7, alignment_needed); - row_size += col_size; // alignment required for shared memory tile boundary to match - // alignment of output row - current_tile_start_col = col; - current_tile_width = 0; - } else { - row_size = row_size_with_this_col; - current_tile_width++; - } - } - - // build last set of tiles - if (current_tile_width > 0) { - f(current_tile_start_col, static_cast(column_sizes.size()) - 1, tile_height); - } -} - -/** - * @brief convert cudf table into JCUDF row format - * - * @tparam offsetFunctor functor type for offset functor - * @param tbl table to convert to JCUDF row format - * @param batch_info information about the batches of data - * @param offset_functor functor that returns the starting offset of each row - * @param column_info information about incoming columns - * @param variable_width_offsets optional vector of offsets for variable-with columns - * @param stream stream used - * @param mr selected memory resource for returned data - * @return vector of list columns containing byte columns of the JCUDF row data - */ -template -std::vector> convert_to_rows( - table_view const& tbl, - batch_data& batch_info, - offsetFunctor offset_functor, - column_info_s const& column_info, - std::optional> variable_width_offsets, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) -{ - int device_id; - CUDF_CUDA_TRY(cudaGetDevice(&device_id)); - int total_shmem_in_bytes; - CUDF_CUDA_TRY( - cudaDeviceGetAttribute(&total_shmem_in_bytes, cudaDevAttrMaxSharedMemoryPerBlock, device_id)); - -#ifndef __CUDA_ARCH__ // __host__ code. - // Need to reduce total shmem available by the size of barriers in the kernel's shared memory - total_shmem_in_bytes -= - util::round_up_unsafe(sizeof(cuda::barrier), 16ul); -#endif // __CUDA_ARCH__ - - auto const shmem_limit_per_tile = total_shmem_in_bytes; - - auto const num_rows = tbl.num_rows(); - auto const fixed_width_only = !variable_width_offsets.has_value(); - - auto select_columns = [](auto const& tbl, auto column_predicate) { - std::vector cols; - std::copy_if(tbl.begin(), tbl.end(), std::back_inserter(cols), [&](auto c) { - return column_predicate(c); - }); - return table_view(cols); - }; - - auto dev_col_sizes = make_device_uvector_async( - column_info.column_sizes, stream, rmm::mr::get_current_device_resource()); - auto dev_col_starts = make_device_uvector_async( - column_info.column_starts, stream, rmm::mr::get_current_device_resource()); - - // Get the pointers to the input columnar data ready - auto const data_begin = thrust::make_transform_iterator(tbl.begin(), [](auto const& c) { - return is_compound(c.type()) ? 
nullptr : c.template data(); - }); - std::vector input_data(data_begin, data_begin + tbl.num_columns()); - - // validity code handles variable and fixed-width data, so give it everything - auto const nm_begin = - thrust::make_transform_iterator(tbl.begin(), [](auto const& c) { return c.null_mask(); }); - std::vector input_nm(nm_begin, nm_begin + tbl.num_columns()); - - auto dev_input_data = - make_device_uvector_async(input_data, stream, rmm::mr::get_current_device_resource()); - auto dev_input_nm = - make_device_uvector_async(input_nm, stream, rmm::mr::get_current_device_resource()); - - // the first batch always exists unless we were sent an empty table - auto const first_batch_size = batch_info.row_batches[0].row_count; - - std::vector output_buffers; - std::vector output_data; - output_data.reserve(batch_info.row_batches.size()); - output_buffers.reserve(batch_info.row_batches.size()); - std::transform( - batch_info.row_batches.begin(), - batch_info.row_batches.end(), - std::back_inserter(output_buffers), - [&](auto const& batch) { return rmm::device_buffer(batch.num_bytes, stream, mr); }); - std::transform( - output_buffers.begin(), output_buffers.end(), std::back_inserter(output_data), [](auto& buf) { - return static_cast(buf.data()); - }); - - auto dev_output_data = make_device_uvector_async(output_data, stream, mr); - - int info_count = 0; - detail::determine_tiles( - column_info.column_sizes, - column_info.column_starts, - first_batch_size, - num_rows, - shmem_limit_per_tile, - [&gpu_batch_row_boundaries = batch_info.d_batch_row_boundaries, &info_count, &stream]( - int const start_col, int const end_col, int const tile_height) { - int i = detail::compute_tile_counts(gpu_batch_row_boundaries, tile_height, stream); - info_count += i; - }); - - // allocate space for tiles - device_uvector gpu_tile_infos(info_count, stream); - int tile_offset = 0; - - detail::determine_tiles( - column_info.column_sizes, - column_info.column_starts, - first_batch_size, - num_rows, - shmem_limit_per_tile, - [&gpu_batch_row_boundaries = batch_info.d_batch_row_boundaries, - &gpu_tile_infos, - num_rows, - &tile_offset, - stream](int const start_col, int const end_col, int const tile_height) { - tile_offset += detail::build_tiles( - {gpu_tile_infos.data() + tile_offset, gpu_tile_infos.size() - tile_offset}, - gpu_batch_row_boundaries, - start_col, - end_col, - tile_height, - num_rows, - stream); - }); - - // build validity tiles for ALL columns, variable and fixed width. 
- auto validity_tile_infos = detail::build_validity_tile_infos( - tbl.num_columns(), num_rows, shmem_limit_per_tile, batch_info.row_batches); - - auto dev_validity_tile_infos = - make_device_uvector_async(validity_tile_infos, stream, rmm::mr::get_current_device_resource()); - - auto const validity_offset = column_info.column_starts.back(); - - // blast through the entire table and convert it - detail::copy_to_rows<<>>(num_rows, - tbl.num_columns(), - shmem_limit_per_tile, - gpu_tile_infos, - dev_input_data.data(), - dev_col_sizes.data(), - dev_col_starts.data(), - offset_functor, - batch_info.d_batch_row_boundaries.data(), - reinterpret_cast(dev_output_data.data())); - - // note that validity gets the entire table and not the fixed-width portion - detail::copy_validity_to_rows<<>>(num_rows, - tbl.num_columns(), - shmem_limit_per_tile, - offset_functor, - batch_info.d_batch_row_boundaries.data(), - dev_output_data.data(), - validity_offset, - dev_validity_tile_infos, - dev_input_nm.data()); - - if (!fixed_width_only) { - // build table view for variable-width data only - auto const variable_width_table = - select_columns(tbl, [](auto col) { return is_compound(col.type()); }); - - CUDF_EXPECTS(!variable_width_table.is_empty(), "No variable-width columns when expected!"); - CUDF_EXPECTS(variable_width_offsets.has_value(), "No variable width offset data!"); - - auto const variable_data_begin = - thrust::make_transform_iterator(variable_width_table.begin(), [](auto const& c) { - strings_column_view const scv{c}; - return is_compound(c.type()) ? scv.chars().template data() : nullptr; - }); - std::vector variable_width_input_data( - variable_data_begin, variable_data_begin + variable_width_table.num_columns()); - - auto dev_variable_input_data = make_device_uvector_async( - variable_width_input_data, stream, rmm::mr::get_current_device_resource()); - auto dev_variable_col_output_offsets = make_device_uvector_async( - column_info.variable_width_column_starts, stream, rmm::mr::get_current_device_resource()); - - for (uint i = 0; i < batch_info.row_batches.size(); i++) { - auto const batch_row_offset = batch_info.batch_row_boundaries[i]; - auto const batch_num_rows = batch_info.row_batches[i].row_count; - - dim3 const string_blocks( - std::min(MAX_STRING_BLOCKS, - util::div_rounding_up_unsafe(batch_num_rows, NUM_STRING_ROWS_PER_BLOCK_TO_ROWS))); - - detail::copy_strings_to_rows<<>>(batch_num_rows, - variable_width_table.num_columns(), - dev_variable_input_data.data(), - dev_variable_col_output_offsets.data(), - variable_width_offsets->data(), - column_info.size_per_row, - offset_functor, - batch_row_offset, - reinterpret_cast(output_data[i])); - } - } - - // split up the output buffer into multiple buffers based on row batch sizes and create list of - // byte columns - std::vector> ret; - ret.reserve(batch_info.row_batches.size()); - auto counting_iter = thrust::make_counting_iterator(0); - std::transform(counting_iter, - counting_iter + batch_info.row_batches.size(), - std::back_inserter(ret), - [&](auto batch) { - auto const offset_count = batch_info.row_batches[batch].row_offsets.size(); - auto offsets = - std::make_unique(data_type{type_id::INT32}, - (size_type)offset_count, - batch_info.row_batches[batch].row_offsets.release(), - rmm::device_buffer{}, - 0); - auto data = std::make_unique(data_type{type_id::INT8}, - batch_info.row_batches[batch].num_bytes, - std::move(output_buffers[batch]), - rmm::device_buffer{}, - 0); - - return make_lists_column(batch_info.row_batches[batch].row_count, - 
std::move(offsets),
-                                         std::move(data),
-                                         0,
-                                         rmm::device_buffer{0, cudf::get_default_stream(), mr},
-                                         stream,
-                                         mr);
-                 });
-
-  return ret;
-}
-
-}  // namespace detail
-
-/**
- * @brief convert a cudf table to JCUDF row format
- *
- * @param tbl incoming table to convert
- * @param stream stream to use for operations
- * @param mr memory resource used for returned data
- * @return vector of list columns containing byte columns of the JCUDF row data
- */
-std::vector<std::unique_ptr<column>> convert_to_rows(table_view const& tbl,
-                                                     rmm::cuda_stream_view stream,
-                                                     rmm::mr::device_memory_resource* mr)
-{
-  auto const num_columns = tbl.num_columns();
-  auto const num_rows    = tbl.num_rows();
-
-  auto const fixed_width_only = std::all_of(
-    tbl.begin(), tbl.end(), [](column_view const& c) { return is_fixed_width(c.type()); });
-
-  // Break up the work into tiles, which are a starting and ending row/col #. This tile size is
-  // calculated based on the shared memory size available; we want a single tile to fill up the
-  // entire shared memory space available for the transpose-like conversion.
-
-  // There are two different processes going on here. The GPU conversion of the data and the writing
-  // of the data into the list of byte columns that are a maximum of 2 gigs each due to offset
-  // maximum size. The GPU conversion portion has to understand this limitation because the column
-  // must own the data inside and as a result it must be a distinct allocation for that column.
-  // Copying the data into these final buffers would be prohibitively expensive, so care is taken to
-  // ensure the GPU writes to the proper buffer. The tiles are broken at the boundaries of specific
-  // rows based on the row sizes up to that point. These are row batches and they are decided first
-  // before building the tiles so the tiles can be properly cut around them.
-
-  auto schema_column_iter =
-    thrust::make_transform_iterator(tbl.begin(), [](auto const& i) { return i.type(); });
-
-  auto column_info =
-    detail::compute_column_information(schema_column_iter, schema_column_iter + num_columns);
-  auto const size_per_row = column_info.size_per_row;
-  if (fixed_width_only) {
-    // total encoded row size. This includes fixed-width data and validity only. It does not include
-    // variable-width data since it isn't copied with the fixed-width and validity kernel.
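-
-    // [Editorial note] Worked example, not original code: for one INT32 and one INT64 column,
-    // compute_column_information places the INT32 at offset 0 and aligns the INT64 to offset 8,
-    // giving a 16-byte fixed section plus ceil(2 / 8) = 1 validity byte = 17 bytes, which the
-    // round-up below pads to a multiple of JCUDF_ROW_ALIGNMENT (8), i.e. 24 bytes per row.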
- auto row_size_iter = thrust::make_constant_iterator( - util::round_up_unsafe(size_per_row, JCUDF_ROW_ALIGNMENT)); - - auto batch_info = detail::build_batches(num_rows, row_size_iter, fixed_width_only, stream, mr); - - detail::fixed_width_row_offset_functor offset_functor( - util::round_up_unsafe(size_per_row, JCUDF_ROW_ALIGNMENT)); - - return detail::convert_to_rows( - tbl, batch_info, offset_functor, std::move(column_info), std::nullopt, stream, mr); - } else { - auto offset_data = detail::build_string_row_offsets(tbl, size_per_row, stream); - auto& row_sizes = std::get<0>(offset_data); - - auto row_size_iter = cudf::detail::make_counting_transform_iterator( - 0, detail::row_size_functor(num_rows, row_sizes.data(), 0)); - - auto batch_info = detail::build_batches(num_rows, row_size_iter, fixed_width_only, stream, mr); - - detail::string_row_offset_functor offset_functor(batch_info.batch_row_offsets); - - return detail::convert_to_rows(tbl, - batch_info, - offset_functor, - std::move(column_info), - std::make_optional(std::move(std::get<1>(offset_data))), - stream, - mr); - } -} - -std::vector> convert_to_rows_fixed_width_optimized( - table_view const& tbl, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) -{ - auto const num_columns = tbl.num_columns(); - - std::vector schema; - schema.resize(num_columns); - std::transform( - tbl.begin(), tbl.end(), schema.begin(), [](auto i) -> data_type { return i.type(); }); - - if (detail::are_all_fixed_width(schema)) { - std::vector column_start; - std::vector column_size; - - int32_t const size_per_row = - detail::compute_fixed_width_layout(schema, column_start, column_size); - auto dev_column_start = make_device_uvector_async(column_start, stream, mr); - auto dev_column_size = make_device_uvector_async(column_size, stream, mr); - - // Make the number of rows per batch a multiple of 32 so we don't have to worry about splitting - // validity at a specific row offset. This might change in the future. - auto const max_rows_per_batch = - util::round_down_safe(std::numeric_limits::max() / size_per_row, 32); - - auto const num_rows = tbl.num_rows(); - - // Get the pointers to the input columnar data ready - std::vector input_data; - std::vector input_nm; - for (size_type column_number = 0; column_number < num_columns; column_number++) { - column_view cv = tbl.column(column_number); - input_data.emplace_back(cv.data()); - input_nm.emplace_back(cv.null_mask()); - } - auto dev_input_data = make_device_uvector_async(input_data, stream, mr); - auto dev_input_nm = make_device_uvector_async(input_nm, stream, mr); - - using ScalarType = scalar_type_t; - auto zero = make_numeric_scalar(data_type(type_id::INT32), stream.value()); - zero->set_valid_async(true, stream); - static_cast(zero.get())->set_value(0, stream); - - auto step = make_numeric_scalar(data_type(type_id::INT32), stream.value()); - step->set_valid_async(true, stream); - static_cast(step.get())->set_value(static_cast(size_per_row), stream); - - std::vector> ret; - for (size_type row_start = 0; row_start < num_rows; row_start += max_rows_per_batch) { - size_type row_count = num_rows - row_start; - row_count = row_count > max_rows_per_batch ? 
max_rows_per_batch : row_count; - ret.emplace_back(detail::fixed_width_convert_to_rows(row_start, - row_count, - num_columns, - size_per_row, - dev_column_start, - dev_column_size, - dev_input_data, - dev_input_nm, - *zero, - *step, - stream, - mr)); - } - - return ret; - } else { - CUDF_FAIL("Only fixed width types are currently supported"); - } -} - -namespace { - -/// @brief Calculates and sets null counts for specified columns -void fixup_null_counts(std::vector>& output_columns, - rmm::cuda_stream_view stream) -{ - for (auto& col : output_columns) { - col->set_null_count(cudf::detail::null_count(col->view().null_mask(), 0, col->size(), stream)); - } -} - -} // namespace - -/** - * @brief convert from JCUDF row format to cudf columns - * - * @param input vector of list columns containing byte columns of the JCUDF row data - * @param schema incoming schema of the data - * @param stream stream to use for compute - * @param mr memory resource for returned data - * @return cudf table of the data - */ -std::unique_ptr convert_from_rows(lists_column_view const& input, - std::vector const& schema, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) -{ - // verify that the types are what we expect - column_view child = input.child(); - auto const list_type = child.type().id(); - CUDF_EXPECTS(list_type == type_id::INT8 || list_type == type_id::UINT8, - "Only a list of bytes is supported as input"); - - // convert any strings in the schema to two int32 columns - // This allows us to leverage the fixed-width copy code to fill in our offset and string length - // data. - std::vector string_schema; - string_schema.reserve(schema.size()); - for (auto i : schema) { - if (i.id() == type_id::STRING) { - string_schema.push_back(data_type(type_id::INT32)); - string_schema.push_back(data_type(type_id::INT32)); - } else { - string_schema.push_back(i); - } - } - - auto const num_columns = string_schema.size(); - auto const num_rows = input.parent().size(); - - int device_id; - CUDF_CUDA_TRY(cudaGetDevice(&device_id)); - int total_shmem_in_bytes; - CUDF_CUDA_TRY( - cudaDeviceGetAttribute(&total_shmem_in_bytes, cudaDevAttrMaxSharedMemoryPerBlock, device_id)); - -#ifndef __CUDA_ARCH__ // __host__ code. - // Need to reduce total shmem available by the size of barriers in the kernel's shared memory - total_shmem_in_bytes -= - util::round_up_unsafe(sizeof(cuda::barrier), 16ul); -#endif // __CUDA_ARCH__ - - auto const shmem_limit_per_tile = total_shmem_in_bytes; - - auto column_info = detail::compute_column_information(string_schema.begin(), string_schema.end()); - auto const size_per_row = util::round_up_unsafe(column_info.size_per_row, JCUDF_ROW_ALIGNMENT); - - // Ideally we would check that the offsets are all the same, etc. 
but for now this is probably - // fine - CUDF_EXPECTS(size_per_row * num_rows <= child.size(), "The layout of the data appears to be off"); - auto dev_col_starts = make_device_uvector_async( - column_info.column_starts, stream, rmm::mr::get_current_device_resource()); - auto dev_col_sizes = make_device_uvector_async( - column_info.column_sizes, stream, rmm::mr::get_current_device_resource()); - - // Allocate the columns we are going to write into - std::vector> output_columns; - std::vector> string_row_offset_columns; - std::vector> string_length_columns; - std::vector output_data; - std::vector output_nm; - std::vector string_row_offsets; - std::vector string_lengths; - for (auto i : schema) { - auto make_col = [&output_data, &output_nm](data_type type, - size_type num_rows, - bool include_nm, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) { - auto column = - make_fixed_width_column(type, - num_rows, - include_nm ? mask_state::UNINITIALIZED : mask_state::UNALLOCATED, - stream, - mr); - auto mut = column->mutable_view(); - output_data.emplace_back(mut.data()); - if (include_nm) { output_nm.emplace_back(mut.null_mask()); } - return column; - }; - if (i.id() == type_id::STRING) { - auto const int32type = data_type(type_id::INT32); - auto offset_col = - make_col(int32type, num_rows, true, stream, rmm::mr::get_current_device_resource()); - string_row_offsets.push_back(offset_col->mutable_view().data()); - string_row_offset_columns.emplace_back(std::move(offset_col)); - auto length_col = - make_col(int32type, num_rows, false, stream, rmm::mr::get_current_device_resource()); - string_lengths.push_back(length_col->mutable_view().data()); - string_length_columns.emplace_back(std::move(length_col)); - // placeholder - output_columns.emplace_back(make_empty_column(type_id::STRING)); - } else { - output_columns.emplace_back(make_col(i, num_rows, true, stream, mr)); - } - } - - auto dev_string_row_offsets = - make_device_uvector_async(string_row_offsets, stream, rmm::mr::get_current_device_resource()); - auto dev_string_lengths = - make_device_uvector_async(string_lengths, stream, rmm::mr::get_current_device_resource()); - - // build the row_batches from the passed in list column - std::vector row_batches; - row_batches.push_back( - {detail::row_batch{child.size(), num_rows, device_uvector(0, stream)}}); - - auto dev_output_data = - make_device_uvector_async(output_data, stream, rmm::mr::get_current_device_resource()); - auto dev_output_nm = - make_device_uvector_async(output_nm, stream, rmm::mr::get_current_device_resource()); - - // only ever get a single batch when going from rows, so boundaries are 0, num_rows - constexpr auto num_batches = 2; - device_uvector gpu_batch_row_boundaries(num_batches, stream); - - thrust::transform(rmm::exec_policy(stream), - thrust::make_counting_iterator(0), - thrust::make_counting_iterator(num_batches), - gpu_batch_row_boundaries.begin(), - [num_rows] __device__(auto i) { return i == 0 ? 
0 : num_rows; }); - - int info_count = 0; - detail::determine_tiles(column_info.column_sizes, - column_info.column_starts, - num_rows, - num_rows, - shmem_limit_per_tile, - [&gpu_batch_row_boundaries, &info_count, &stream]( - int const start_col, int const end_col, int const tile_height) { - info_count += detail::compute_tile_counts( - gpu_batch_row_boundaries, tile_height, stream); - }); - - // allocate space for tiles - device_uvector gpu_tile_infos(info_count, stream); - - int tile_offset = 0; - detail::determine_tiles( - column_info.column_sizes, - column_info.column_starts, - num_rows, - num_rows, - shmem_limit_per_tile, - [&gpu_batch_row_boundaries, &gpu_tile_infos, num_rows, &tile_offset, stream]( - int const start_col, int const end_col, int const tile_height) { - tile_offset += detail::build_tiles( - {gpu_tile_infos.data() + tile_offset, gpu_tile_infos.size() - tile_offset}, - gpu_batch_row_boundaries, - start_col, - end_col, - tile_height, - num_rows, - stream); - }); - - dim3 const blocks(gpu_tile_infos.size()); - - // validity needs to be calculated based on the actual number of final table columns - auto validity_tile_infos = - detail::build_validity_tile_infos(schema.size(), num_rows, shmem_limit_per_tile, row_batches); - - auto dev_validity_tile_infos = - make_device_uvector_async(validity_tile_infos, stream, rmm::mr::get_current_device_resource()); - - dim3 const validity_blocks(validity_tile_infos.size()); - - if (dev_string_row_offsets.size() == 0) { - detail::fixed_width_row_offset_functor offset_functor(size_per_row); - - detail::copy_from_rows<<>>(num_rows, - num_columns, - shmem_limit_per_tile, - offset_functor, - gpu_batch_row_boundaries.data(), - dev_output_data.data(), - dev_col_sizes.data(), - dev_col_starts.data(), - gpu_tile_infos, - child.data()); - - detail::copy_validity_from_rows<<>>(num_rows, - num_columns, - shmem_limit_per_tile, - offset_functor, - gpu_batch_row_boundaries.data(), - dev_output_nm.data(), - column_info.column_starts.back(), - dev_validity_tile_infos, - child.data()); - - } else { - detail::string_row_offset_functor offset_functor(device_span{input.offsets()}); - detail::copy_from_rows<<>>(num_rows, - num_columns, - shmem_limit_per_tile, - offset_functor, - gpu_batch_row_boundaries.data(), - dev_output_data.data(), - dev_col_sizes.data(), - dev_col_starts.data(), - gpu_tile_infos, - child.data()); - - detail::copy_validity_from_rows<<>>(num_rows, - num_columns, - shmem_limit_per_tile, - offset_functor, - gpu_batch_row_boundaries.data(), - dev_output_nm.data(), - column_info.column_starts.back(), - dev_validity_tile_infos, - child.data()); - - std::vector> string_col_offsets; - std::vector> string_data_cols; - std::vector string_col_offset_ptrs; - std::vector string_data_col_ptrs; - for (auto& col_string_lengths : string_lengths) { - device_uvector output_string_offsets(num_rows + 1, stream, mr); - auto tmp = [num_rows, col_string_lengths] __device__(auto const& i) { - return i < num_rows ? 
col_string_lengths[i] : 0; - }; - auto bounded_iter = cudf::detail::make_counting_transform_iterator(0, tmp); - thrust::exclusive_scan(rmm::exec_policy(stream), - bounded_iter, - bounded_iter + num_rows + 1, - output_string_offsets.begin()); - - // allocate destination string column - rmm::device_uvector string_data( - output_string_offsets.element(num_rows, stream), stream, mr); - - string_col_offset_ptrs.push_back(output_string_offsets.data()); - string_data_col_ptrs.push_back(string_data.data()); - string_col_offsets.push_back(std::move(output_string_offsets)); - string_data_cols.push_back(std::move(string_data)); - } - auto dev_string_col_offsets = make_device_uvector_async( - string_col_offset_ptrs, stream, rmm::mr::get_current_device_resource()); - auto dev_string_data_cols = make_device_uvector_async( - string_data_col_ptrs, stream, rmm::mr::get_current_device_resource()); - - dim3 const string_blocks( - std::min(std::max(MIN_STRING_BLOCKS, num_rows / NUM_STRING_ROWS_PER_BLOCK_FROM_ROWS), - MAX_STRING_BLOCKS)); - - detail::copy_strings_from_rows<<>>( - offset_functor, - dev_string_row_offsets.data(), - dev_string_lengths.data(), - dev_string_col_offsets.data(), - dev_string_data_cols.data(), - child.data(), - num_rows, - static_cast(string_col_offsets.size())); - - // merge strings back into output_columns - int string_idx = 0; - for (int i = 0; i < static_cast(schema.size()); ++i) { - if (schema[i].id() == type_id::STRING) { - // stuff real string column - auto string_data = string_row_offset_columns[string_idx].release()->release(); - output_columns[i] = make_strings_column(num_rows, - std::move(string_col_offsets[string_idx]), - std::move(string_data_cols[string_idx]), - std::move(*string_data.null_mask.release()), - 0); - // Null count set to 0, temporarily. Will be fixed up before return. - string_idx++; - } - } - } - - // Set null counts, because output_columns are modified via mutable-view, - // in the kernel above. - // TODO(future): Consider setting null count in the kernel itself. - fixup_null_counts(output_columns, stream); - - return std::make_unique
<table>(std::move(output_columns)); -} - -std::unique_ptr<table>
convert_from_rows_fixed_width_optimized(lists_column_view const& input, - std::vector const& schema, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) -{ - // verify that the types are what we expect - column_view child = input.child(); - auto const list_type = child.type().id(); - CUDF_EXPECTS(list_type == type_id::INT8 || list_type == type_id::UINT8, - "Only a list of bytes is supported as input"); - - auto const num_columns = schema.size(); - - if (detail::are_all_fixed_width(schema)) { - std::vector column_start; - std::vector column_size; - - auto const num_rows = input.parent().size(); - auto const size_per_row = detail::compute_fixed_width_layout(schema, column_start, column_size); - - // Ideally we would check that the offsets are all the same, etc. but for now this is probably - // fine - CUDF_EXPECTS(size_per_row * num_rows == child.size(), - "The layout of the data appears to be off"); - auto dev_column_start = - make_device_uvector_async(column_start, stream, rmm::mr::get_current_device_resource()); - auto dev_column_size = - make_device_uvector_async(column_size, stream, rmm::mr::get_current_device_resource()); - - // Allocate the columns we are going to write into - std::vector> output_columns; - std::vector output_data; - std::vector output_nm; - for (int i = 0; i < static_cast(num_columns); i++) { - auto column = - make_fixed_width_column(schema[i], num_rows, mask_state::UNINITIALIZED, stream, mr); - auto mut = column->mutable_view(); - output_data.emplace_back(mut.data()); - output_nm.emplace_back(mut.null_mask()); - output_columns.emplace_back(std::move(column)); - } - - auto dev_output_data = make_device_uvector_async(output_data, stream, mr); - auto dev_output_nm = make_device_uvector_async(output_nm, stream, mr); - - dim3 blocks; - dim3 threads; - int shared_size = - detail::calc_fixed_width_kernel_dims(num_columns, num_rows, size_per_row, blocks, threads); - - detail::copy_from_rows_fixed_width_optimized<<>>( - num_rows, - num_columns, - size_per_row, - dev_column_start.data(), - dev_column_size.data(), - dev_output_data.data(), - dev_output_nm.data(), - child.data()); - - // Set null counts, because output_columns are modified via mutable-view, - // in the kernel above. - // TODO(future): Consider setting null count in the kernel itself. - fixup_null_counts(output_columns, stream); - - return std::make_unique
(std::move(output_columns)); - } else { - CUDF_FAIL("Only fixed width types are currently supported"); - } -} - -} // namespace spark_rapids_jni diff --git a/src/main/cpp/src/row_conversion.hpp b/src/main/cpp/src/row_conversion.hpp deleted file mode 100644 index 6e9835e3d2..0000000000 --- a/src/main/cpp/src/row_conversion.hpp +++ /dev/null @@ -1,53 +0,0 @@ -/* - * Copyright (c) 2022-2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include -#include -#include - -#include - -#include - -namespace spark_rapids_jni { - -std::vector> convert_to_rows_fixed_width_optimized( - cudf::table_view const& tbl, - // TODO need something for validity - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); - -std::vector> convert_to_rows( - cudf::table_view const& tbl, - // TODO need something for validity - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); - -std::unique_ptr convert_from_rows_fixed_width_optimized( - cudf::lists_column_view const& input, - std::vector const& schema, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); - -std::unique_ptr convert_from_rows( - cudf::lists_column_view const& input, - std::vector const& schema, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); - -} // namespace spark_rapids_jni diff --git a/src/main/cpp/src/utilities.cu b/src/main/cpp/src/utilities.cu index c66ee5cbcb..7c202a1bec 100644 --- a/src/main/cpp/src/utilities.cu +++ b/src/main/cpp/src/utilities.cu @@ -25,6 +25,8 @@ #include #include +#include + namespace spark_rapids_jni { std::unique_ptr bitmask_bitwise_or( @@ -51,18 +53,19 @@ std::unique_ptr bitmask_bitwise_or( std::unique_ptr out = std::make_unique(mask_size * sizeof(cudf::bitmask_type), stream, mr); - thrust::transform( - rmm::exec_policy(stream), - thrust::make_counting_iterator(0), - thrust::make_counting_iterator(0) + mask_size, - static_cast(out->data()), - [buffers = d_input.data(), num_buffers = input.size()] __device__(cudf::size_type word_index) { - cudf::bitmask_type out = buffers[0][word_index]; - for (auto idx = 1; idx < num_buffers; idx++) { - out |= buffers[idx][word_index]; - } - return out; - }); + thrust::transform(rmm::exec_policy(stream), + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(0) + mask_size, + static_cast(out->data()), + cuda::proclaim_return_type( + [buffers = d_input.data(), + num_buffers = input.size()] __device__(cudf::size_type word_index) { + cudf::bitmask_type out = buffers[0][word_index]; + for (auto idx = 1; idx < num_buffers; idx++) { + out |= buffers[idx][word_index]; + } + return out; + })); return out; } diff --git a/src/main/cpp/src/xxhash64.cu b/src/main/cpp/src/xxhash64.cu 
index 561aa49862..8c0b1b8766 100644 --- a/src/main/cpp/src/xxhash64.cu +++ b/src/main/cpp/src/xxhash64.cu @@ -25,6 +25,8 @@ #include +#include + namespace spark_rapids_jni { namespace { @@ -286,10 +288,11 @@ class device_row_hasher { _table.begin(), _table.end(), _seed, - [row_index, nulls = _check_nulls] __device__(auto hash, auto column) { - return cudf::type_dispatcher( - column.type(), element_hasher_adapter{}, column, row_index, nulls, hash); - }); + cuda::proclaim_return_type( + [row_index, nulls = _check_nulls] __device__(auto hash, auto column) { + return cudf::type_dispatcher( + column.type(), element_hasher_adapter{}, column, row_index, nulls, hash); + })); } /** diff --git a/src/main/cpp/src/zorder.cu b/src/main/cpp/src/zorder.cu index c0f21b9b3a..f9c2d4da07 100644 --- a/src/main/cpp/src/zorder.cu +++ b/src/main/cpp/src/zorder.cu @@ -28,6 +28,8 @@ #include #include +#include + namespace { // pretends to be an array of uint32_t, but really only stores @@ -253,18 +255,20 @@ std::unique_ptr hilbert_index(int32_t const num_bits_per_entry, thrust::make_counting_iterator(0), thrust::make_counting_iterator(0) + num_rows, output_dv_ptr->begin(), - [num_bits_per_entry, num_columns, input = *input_dv] __device__(cudf::size_type row_index) { - uint_backed_array row(num_bits_per_entry); - for (cudf::size_type column_index = 0; column_index < num_columns; column_index++) { - auto const column = input.column(column_index); - uint32_t const data = column.is_valid(row_index) ? column.data()[row_index] : 0; - row.set(column_index, data); - } - - auto const transposed_index = hilbert_transposed_index(row, num_bits_per_entry, num_columns); - return static_cast( - to_hilbert_index(transposed_index, num_bits_per_entry, num_columns)); - }); + cuda::proclaim_return_type( + [num_bits_per_entry, num_columns, input = *input_dv] __device__(cudf::size_type row_index) { + uint_backed_array row(num_bits_per_entry); + for (cudf::size_type column_index = 0; column_index < num_columns; column_index++) { + auto const column = input.column(column_index); + uint32_t const data = column.is_valid(row_index) ? column.data()[row_index] : 0; + row.set(column_index, data); + } + + auto const transposed_index = + hilbert_transposed_index(row, num_bits_per_entry, num_columns); + return static_cast( + to_hilbert_index(transposed_index, num_bits_per_entry, num_columns)); + })); return output_data_col; } diff --git a/src/main/cpp/tests/CMakeLists.txt b/src/main/cpp/tests/CMakeLists.txt index b34b1b8b01..617df6dfde 100644 --- a/src/main/cpp/tests/CMakeLists.txt +++ b/src/main/cpp/tests/CMakeLists.txt @@ -60,9 +60,6 @@ ConfigureTest(CAST_FLOAT_TO_STRING ConfigureTest(DATETIME_REBASE datetime_rebase.cpp) -ConfigureTest(ROW_CONVERSION - row_conversion.cpp) - ConfigureTest(HASH hash.cpp) diff --git a/src/main/cpp/tests/row_conversion.cpp b/src/main/cpp/tests/row_conversion.cpp deleted file mode 100644 index 7e104c3871..0000000000 --- a/src/main/cpp/tests/row_conversion.cpp +++ /dev/null @@ -1,1043 +0,0 @@ -/* - * Copyright (c) 2022, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include - -#include -#include -#include - -#include - -struct ColumnToRowTests : public cudf::test::BaseFixture {}; -struct RowToColumnTests : public cudf::test::BaseFixture {}; - -TEST_F(ColumnToRowTests, Single) -{ - cudf::test::fixed_width_column_wrapper a({-1}); - cudf::table_view in(std::vector{a}); - std::vector schema = {cudf::data_type{cudf::type_id::INT32}}; - - auto old_rows = spark_rapids_jni::convert_to_rows_fixed_width_optimized(in); - auto new_rows = spark_rapids_jni::convert_to_rows(in); - - EXPECT_EQ(old_rows.size(), new_rows.size()); - for (uint i = 0; i < old_rows.size(); ++i) { - auto new_tbl = - spark_rapids_jni::convert_from_rows(cudf::lists_column_view(*new_rows[i]), schema); - auto old_tbl = spark_rapids_jni::convert_from_rows_fixed_width_optimized( - cudf::lists_column_view(*old_rows[i]), schema); - - CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*old_tbl, *new_tbl); - } -} - -TEST_F(ColumnToRowTests, SimpleString) -{ - cudf::test::fixed_width_column_wrapper a({-1, 0, 1, 0, -1}); - cudf::test::strings_column_wrapper b( - {"hello", "world", "this is a really long string to generate a longer row", "dlrow", "olleh"}); - cudf::table_view in(std::vector{a, b}); - std::vector schema = {cudf::data_type{cudf::type_id::INT32}}; - - auto new_rows = spark_rapids_jni::convert_to_rows(in); - - EXPECT_EQ(new_rows[0]->size(), 5); -} - -TEST_F(ColumnToRowTests, DoubleString) -{ - cudf::test::strings_column_wrapper a( - {"hello", "world", "this is a really long string to generate a longer row", "dlrow", "olleh"}); - cudf::test::fixed_width_column_wrapper b({0, 1, 2, 3, 4}); - cudf::test::strings_column_wrapper c({"world", - "hello", - "this string isn't as long", - "this one isn't so short though when you think about it", - "dlrow"}); - cudf::table_view in(std::vector{a, b, c}); - - auto new_rows = spark_rapids_jni::convert_to_rows(in); - - EXPECT_EQ(new_rows[0]->size(), 5); -} - -TEST_F(ColumnToRowTests, BigStrings) -{ - char const* TEST_STRINGS[] = { - "These", - "are", - "the", - "test", - "strings", - "that", - "we", - "have", - "some are really long", - "and some are kinda short", - "They are all over on purpose with different sizes for the strings in order to test the code " - "on all different lengths of strings", - "a", - "good test", - "is required to produce reasonable confidence that this is working"}; - auto num_generator = - cudf::detail::make_counting_transform_iterator(0, [](auto i) -> int32_t { return rand(); }); - auto string_generator = - cudf::detail::make_counting_transform_iterator(0, [&](auto i) -> char const* { - return TEST_STRINGS[rand() % (sizeof(TEST_STRINGS) / sizeof(TEST_STRINGS[0]))]; - }); - - auto const num_rows = 50; - auto const num_cols = 50; - std::vector schema; - - std::vector cols; - std::vector views; - - for (auto col = 0; col < num_cols; ++col) { - if (rand() % 2) { - cols.emplace_back( - cudf::test::fixed_width_column_wrapper(num_generator, num_generator + num_rows)); - views.push_back(cols.back()); - schema.emplace_back(cudf::data_type{cudf::type_id::INT32}); - } else { - cols.emplace_back( - cudf::test::strings_column_wrapper(string_generator, string_generator + num_rows)); - views.push_back(cols.back()); - schema.emplace_back(cudf::type_id::STRING); - } - } - - cudf::table_view in(views); - auto new_rows = 
spark_rapids_jni::convert_to_rows(in); - - EXPECT_EQ(new_rows[0]->size(), num_rows); -} - -TEST_F(ColumnToRowTests, ManyStrings) -{ - char const* TEST_STRINGS[] = { - "These", - "are", - "the", - "test", - "strings", - "that", - "we", - "have", - "some are really long", - "and some are kinda short", - "They are all over on purpose with different sizes for the strings in order to test the code " - "on all different lengths of strings", - "a", - "good test", - "is required to produce reasonable confidence that this is working", - "some strings", - "are split into multiple strings", - "some strings have all their data", - "lots of choices of strings and sizes is sure to test the offset calculation code to ensure " - "that even a really long string ends up in the correct spot for the final destination allowing " - "for even crazy run-on sentences to be inserted into the data"}; - auto num_generator = - cudf::detail::make_counting_transform_iterator(0, [](auto i) -> int32_t { return rand(); }); - auto string_generator = - cudf::detail::make_counting_transform_iterator(0, [&](auto i) -> char const* { - return TEST_STRINGS[rand() % (sizeof(TEST_STRINGS) / sizeof(TEST_STRINGS[0]))]; - }); - - auto const num_rows = 1000000; - auto const num_cols = 50; - std::vector schema; - - std::vector cols; - std::vector views; - - for (auto col = 0; col < num_cols; ++col) { - if (rand() % 2) { - cols.emplace_back( - cudf::test::fixed_width_column_wrapper(num_generator, num_generator + num_rows)); - views.push_back(cols.back()); - schema.emplace_back(cudf::data_type{cudf::type_id::INT32}); - } else { - cols.emplace_back( - cudf::test::strings_column_wrapper(string_generator, string_generator + num_rows)); - views.push_back(cols.back()); - schema.emplace_back(cudf::type_id::STRING); - } - } - - cudf::table_view in(views); - auto new_rows = spark_rapids_jni::convert_to_rows(in); - - EXPECT_EQ(new_rows[0]->size(), num_rows); -} - -TEST_F(ColumnToRowTests, Simple) -{ - cudf::test::fixed_width_column_wrapper a({-1, 0, 1}); - cudf::table_view in(std::vector{a}); - std::vector schema = {cudf::data_type{cudf::type_id::INT32}}; - - auto old_rows = spark_rapids_jni::convert_to_rows_fixed_width_optimized(in); - auto new_rows = spark_rapids_jni::convert_to_rows(in); - - EXPECT_EQ(old_rows.size(), new_rows.size()); - for (uint i = 0; i < old_rows.size(); ++i) { - auto old_tbl = spark_rapids_jni::convert_from_rows_fixed_width_optimized( - cudf::lists_column_view(*old_rows[i]), schema); - auto new_tbl = - spark_rapids_jni::convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); - - CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*old_tbl, *new_tbl); - } -} - -TEST_F(ColumnToRowTests, Tall) -{ - auto r = - cudf::detail::make_counting_transform_iterator(0, [](auto i) -> int32_t { return rand(); }); - cudf::test::fixed_width_column_wrapper a(r, r + (size_t)4096); - cudf::table_view in(std::vector{a}); - std::vector schema = {cudf::data_type{cudf::type_id::INT32}}; - - auto old_rows = spark_rapids_jni::convert_to_rows_fixed_width_optimized(in); - auto new_rows = spark_rapids_jni::convert_to_rows(in); - - EXPECT_EQ(old_rows.size(), new_rows.size()); - - for (uint i = 0; i < old_rows.size(); ++i) { - auto old_tbl = spark_rapids_jni::convert_from_rows_fixed_width_optimized( - cudf::lists_column_view(*old_rows[i]), schema); - auto new_tbl = - spark_rapids_jni::convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); - - CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*old_tbl, *new_tbl); - } -} - -TEST_F(ColumnToRowTests, Wide) -{ - 
std::vector> cols; - std::vector views; - std::vector schema; - - for (int i = 0; i < 256; ++i) { - cols.push_back(cudf::test::fixed_width_column_wrapper({rand()})); - views.push_back(cols.back()); - schema.push_back(cudf::data_type{cudf::type_id::INT32}); - } - cudf::table_view in(views); - - auto old_rows = spark_rapids_jni::convert_to_rows_fixed_width_optimized(in); - auto new_rows = spark_rapids_jni::convert_to_rows(in); - - EXPECT_EQ(old_rows.size(), new_rows.size()); - for (uint i = 0; i < old_rows.size(); ++i) { - auto old_tbl = spark_rapids_jni::convert_from_rows_fixed_width_optimized( - cudf::lists_column_view(*old_rows[i]), schema); - auto new_tbl = - spark_rapids_jni::convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); - - CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*old_tbl, *new_tbl); - } -} - -TEST_F(ColumnToRowTests, SingleByteWide) -{ - std::vector> cols; - std::vector views; - std::vector schema; - - for (int i = 0; i < 256; ++i) { - cols.push_back(cudf::test::fixed_width_column_wrapper({rand()})); - views.push_back(cols.back()); - - schema.push_back(cudf::data_type{cudf::type_id::INT8}); - } - cudf::table_view in(views); - - auto old_rows = spark_rapids_jni::convert_to_rows_fixed_width_optimized(in); - auto new_rows = spark_rapids_jni::convert_to_rows(in); - - EXPECT_EQ(old_rows.size(), new_rows.size()); - - for (uint i = 0; i < old_rows.size(); ++i) { - auto old_tbl = spark_rapids_jni::convert_from_rows_fixed_width_optimized( - cudf::lists_column_view(*old_rows[i]), schema); - auto new_tbl = - spark_rapids_jni::convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); - - CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*old_tbl, *new_tbl); - } -} - -TEST_F(ColumnToRowTests, Non2Power) -{ - auto r = - cudf::detail::make_counting_transform_iterator(0, [](auto i) -> int32_t { return rand(); }); - std::vector> cols; - std::vector views; - std::vector schema; - - constexpr auto num_rows = 6 * 1024 + 557; - for (int i = 0; i < 131; ++i) { - cols.push_back(cudf::test::fixed_width_column_wrapper(r + num_rows * i, - r + num_rows * i + num_rows)); - views.push_back(cols.back()); - schema.push_back(cudf::data_type{cudf::type_id::INT32}); - } - cudf::table_view in(views); - - auto old_rows = spark_rapids_jni::convert_to_rows_fixed_width_optimized(in); - auto new_rows = spark_rapids_jni::convert_to_rows(in); - - EXPECT_EQ(old_rows.size(), new_rows.size()); - - for (uint i = 0; i < old_rows.size(); ++i) { - auto old_tbl = spark_rapids_jni::convert_from_rows_fixed_width_optimized( - cudf::lists_column_view(*old_rows[i]), schema); - auto new_tbl = - spark_rapids_jni::convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); - - for (int j = 0; j < old_tbl->num_columns(); ++j) { - CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(old_tbl->get_column(j), new_tbl->get_column(j)); - } - - CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*old_tbl, *new_tbl); - } -} - -TEST_F(ColumnToRowTests, Big) -{ - auto r = - cudf::detail::make_counting_transform_iterator(0, [](auto i) -> int32_t { return rand(); }); - std::vector> cols; - std::vector views; - std::vector schema; - - // 28 columns of 1 million rows - constexpr auto num_rows = 1024 * 1024; - for (int i = 0; i < 28; ++i) { - cols.push_back(cudf::test::fixed_width_column_wrapper(r + num_rows * i, - r + num_rows * i + num_rows)); - views.push_back(cols.back()); - schema.push_back(cudf::data_type{cudf::type_id::INT32}); - } - cudf::table_view in(views); - - auto old_rows = spark_rapids_jni::convert_to_rows_fixed_width_optimized(in); - auto new_rows = 
spark_rapids_jni::convert_to_rows(in); - - EXPECT_EQ(old_rows.size(), new_rows.size()); - - for (uint i = 0; i < old_rows.size(); ++i) { - auto old_tbl = spark_rapids_jni::convert_from_rows_fixed_width_optimized( - cudf::lists_column_view(*old_rows[i]), schema); - auto new_tbl = - spark_rapids_jni::convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); - - for (int j = 0; j < old_tbl->num_columns(); ++j) { - CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(old_tbl->get_column(j), new_tbl->get_column(j)); - } - - CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*old_tbl, *new_tbl); - } -} - -TEST_F(ColumnToRowTests, Bigger) -{ - auto r = - cudf::detail::make_counting_transform_iterator(0, [](auto i) -> int32_t { return rand(); }); - std::vector> cols; - std::vector views; - std::vector schema; - - // 128 columns of 1 million rows - constexpr auto num_rows = 1024 * 1024; - for (int i = 0; i < 128; ++i) { - cols.push_back(cudf::test::fixed_width_column_wrapper(r + num_rows * i, - r + num_rows * i + num_rows)); - views.push_back(cols.back()); - schema.push_back(cudf::data_type{cudf::type_id::INT32}); - } - cudf::table_view in(views); - - auto old_rows = spark_rapids_jni::convert_to_rows_fixed_width_optimized(in); - auto new_rows = spark_rapids_jni::convert_to_rows(in); - - EXPECT_EQ(old_rows.size(), new_rows.size()); - for (uint i = 0; i < old_rows.size(); ++i) { - auto old_tbl = spark_rapids_jni::convert_from_rows_fixed_width_optimized( - cudf::lists_column_view(*old_rows[i]), schema); - auto new_tbl = - spark_rapids_jni::convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); - - for (int j = 0; j < old_tbl->num_columns(); ++j) { - CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(old_tbl->get_column(j), new_tbl->get_column(j)); - } - - CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*old_tbl, *new_tbl); - } -} - -TEST_F(ColumnToRowTests, Biggest) -{ - auto r = - cudf::detail::make_counting_transform_iterator(0, [](auto i) -> int32_t { return rand(); }); - std::vector> cols; - std::vector views; - std::vector schema; - - // 128 columns of 2 million rows - constexpr auto num_rows = 2 * 1024 * 1024; - for (int i = 0; i < 128; ++i) { - cols.push_back(cudf::test::fixed_width_column_wrapper(r + num_rows * i, - r + num_rows * i + num_rows)); - views.push_back(cols.back()); - schema.push_back(cudf::data_type{cudf::type_id::INT32}); - } - cudf::table_view in(views); - - auto old_rows = spark_rapids_jni::convert_to_rows_fixed_width_optimized(in); - auto new_rows = spark_rapids_jni::convert_to_rows(in); - - EXPECT_EQ(old_rows.size(), new_rows.size()); - - for (uint i = 0; i < old_rows.size(); ++i) { - auto old_tbl = spark_rapids_jni::convert_from_rows_fixed_width_optimized( - cudf::lists_column_view(*old_rows[i]), schema); - auto new_tbl = - spark_rapids_jni::convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); - - for (int j = 0; j < old_tbl->num_columns(); ++j) { - CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(old_tbl->get_column(j), new_tbl->get_column(j)); - } - - CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*old_tbl, *new_tbl); - } -} - -TEST_F(RowToColumnTests, Single) -{ - cudf::test::fixed_width_column_wrapper a({-1}); - cudf::table_view in(std::vector{a}); - - auto old_rows = spark_rapids_jni::convert_to_rows(in); - std::vector schema{cudf::data_type{cudf::type_id::INT32}}; - for (uint i = 0; i < old_rows.size(); ++i) { - auto old_tbl = spark_rapids_jni::convert_from_rows_fixed_width_optimized( - cudf::lists_column_view(*old_rows[i]), schema); - auto new_tbl = - 
spark_rapids_jni::convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); - - CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*old_tbl, *new_tbl); - } -} - -TEST_F(RowToColumnTests, Simple) -{ - cudf::test::fixed_width_column_wrapper a({-1, 0, 1}); - cudf::table_view in(std::vector{a}); - - auto old_rows = spark_rapids_jni::convert_to_rows_fixed_width_optimized(in); - std::vector schema{cudf::data_type{cudf::type_id::INT32}}; - for (uint i = 0; i < old_rows.size(); ++i) { - auto old_tbl = spark_rapids_jni::convert_from_rows_fixed_width_optimized( - cudf::lists_column_view(*old_rows[i]), schema); - auto new_tbl = - spark_rapids_jni::convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); - - CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*old_tbl, *new_tbl); - } -} - -TEST_F(RowToColumnTests, Tall) -{ - auto r = - cudf::detail::make_counting_transform_iterator(0, [](auto i) -> int32_t { return rand(); }); - cudf::test::fixed_width_column_wrapper a(r, r + (size_t)4096); - cudf::table_view in(std::vector{a}); - - auto old_rows = spark_rapids_jni::convert_to_rows_fixed_width_optimized(in); - std::vector schema; - schema.reserve(in.num_columns()); - for (auto col = in.begin(); col < in.end(); ++col) { - schema.push_back(col->type()); - } - for (uint i = 0; i < old_rows.size(); ++i) { - auto old_tbl = spark_rapids_jni::convert_from_rows_fixed_width_optimized( - cudf::lists_column_view(*old_rows[i]), schema); - auto new_tbl = - spark_rapids_jni::convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); - - CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*old_tbl, *new_tbl); - } -} - -TEST_F(RowToColumnTests, Wide) -{ - std::vector> cols; - std::vector views; - - for (int i = 0; i < 256; ++i) { - cols.push_back(cudf::test::fixed_width_column_wrapper({i})); // rand()})); - views.push_back(cols.back()); - } - cudf::table_view in(views); - - auto old_rows = spark_rapids_jni::convert_to_rows_fixed_width_optimized(in); - std::vector schema; - schema.reserve(in.num_columns()); - for (auto col = in.begin(); col < in.end(); ++col) { - schema.push_back(col->type()); - } - - for (uint i = 0; i < old_rows.size(); ++i) { - auto old_tbl = spark_rapids_jni::convert_from_rows_fixed_width_optimized( - cudf::lists_column_view(*old_rows[i]), schema); - auto new_tbl = - spark_rapids_jni::convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); - - CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*old_tbl, *new_tbl); - } -} - -TEST_F(RowToColumnTests, SingleByteWide) -{ - std::vector> cols; - std::vector views; - - for (int i = 0; i < 256; ++i) { - cols.push_back(cudf::test::fixed_width_column_wrapper({rand()})); - views.push_back(cols.back()); - } - cudf::table_view in(views); - - auto old_rows = spark_rapids_jni::convert_to_rows_fixed_width_optimized(in); - std::vector schema; - schema.reserve(in.num_columns()); - for (auto col = in.begin(); col < in.end(); ++col) { - schema.push_back(col->type()); - } - for (uint i = 0; i < old_rows.size(); ++i) { - auto old_tbl = spark_rapids_jni::convert_from_rows_fixed_width_optimized( - cudf::lists_column_view(*old_rows[i]), schema); - auto new_tbl = - spark_rapids_jni::convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); - - CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*old_tbl, *new_tbl); - } -} - -TEST_F(RowToColumnTests, AllTypes) -{ - std::vector> cols; - std::vector views; - std::vector schema{cudf::data_type{cudf::type_id::INT64}, - cudf::data_type{cudf::type_id::FLOAT64}, - cudf::data_type{cudf::type_id::INT8}, - cudf::data_type{cudf::type_id::BOOL8}, - 
cudf::data_type{cudf::type_id::FLOAT32}, - cudf::data_type{cudf::type_id::INT8}, - cudf::data_type{cudf::type_id::INT32}, - cudf::data_type{cudf::type_id::INT64}}; - - cudf::test::fixed_width_column_wrapper c0({3, 9, 4, 2, 20, 0}, {1, 1, 1, 1, 1, 0}); - cudf::test::fixed_width_column_wrapper c1({5.0, 9.5, 0.9, 7.23, 2.8, 0.0}, - {1, 1, 1, 1, 1, 0}); - cudf::test::fixed_width_column_wrapper c2({5, 1, 0, 2, 7, 0}, {1, 1, 1, 1, 1, 0}); - cudf::test::fixed_width_column_wrapper c3({true, false, false, true, false, false}, - {1, 1, 1, 1, 1, 0}); - cudf::test::fixed_width_column_wrapper c4({1.0f, 3.5f, 5.9f, 7.1f, 9.8f, 0.0f}, - {1, 1, 1, 1, 1, 0}); - cudf::test::fixed_width_column_wrapper c5({2, 3, 4, 5, 9, 0}, {1, 1, 1, 1, 1, 0}); - cudf::test::fixed_point_column_wrapper c6( - {-300, 500, 950, 90, 723, 0}, {1, 1, 1, 1, 1, 1, 1, 0}, numeric::scale_type{-2}); - cudf::test::fixed_point_column_wrapper c7( - {-80, 30, 90, 20, 200, 0}, {1, 1, 1, 1, 1, 1, 0}, numeric::scale_type{-1}); - - cudf::table_view in({c0, c1, c2, c3, c4, c5, c6, c7}); - - auto old_rows = spark_rapids_jni::convert_to_rows_fixed_width_optimized(in); - auto new_rows = spark_rapids_jni::convert_to_rows(in); - - for (uint i = 0; i < old_rows.size(); ++i) { - auto old_tbl = spark_rapids_jni::convert_from_rows_fixed_width_optimized( - cudf::lists_column_view(*old_rows[i]), schema); - auto new_tbl = - spark_rapids_jni::convert_from_rows(cudf::lists_column_view(*new_rows[i]), schema); - - CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*old_tbl, *new_tbl); - } -} - -TEST_F(RowToColumnTests, AllTypesLarge) -{ - std::vector cols; - std::vector schema{}; - - // 15 columns of each type with 1 million entries - constexpr int num_rows{1024 * 1024 * 1}; - - std::default_random_engine re; - std::uniform_real_distribution rand_double(std::numeric_limits::min(), - std::numeric_limits::max()); - std::uniform_int_distribution rand_int64(std::numeric_limits::min(), - std::numeric_limits::max()); - auto r = cudf::detail::make_counting_transform_iterator( - 0, [&](auto i) -> int64_t { return rand_int64(re); }); - auto d = cudf::detail::make_counting_transform_iterator( - 0, [&](auto i) -> double { return rand_double(re); }); - - auto all_valid = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return 1; }); - auto none_valid = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return 0; }); - auto most_valid = cudf::detail::make_counting_transform_iterator( - 0, [](auto i) { return rand() % 2 == 0 ? 0 : 1; }); - auto few_valid = cudf::detail::make_counting_transform_iterator( - 0, [](auto i) { return rand() % 13 == 0 ? 
1 : 0; }); - - for (int i = 0; i < 15; ++i) { - cols.push_back(*cudf::test::fixed_width_column_wrapper(r, r + num_rows, all_valid) - .release() - .release()); - schema.push_back(cudf::data_type{cudf::type_id::INT8}); - } - - for (int i = 0; i < 15; ++i) { - cols.push_back(*cudf::test::fixed_width_column_wrapper(r, r + num_rows, few_valid) - .release() - .release()); - schema.push_back(cudf::data_type{cudf::type_id::INT16}); - } - - for (int i = 0; i < 15; ++i) { - if (i < 5) { - cols.push_back(*cudf::test::fixed_width_column_wrapper(r, r + num_rows, few_valid) - .release() - .release()); - } else { - cols.push_back(*cudf::test::fixed_width_column_wrapper(r, r + num_rows, none_valid) - .release() - .release()); - } - schema.push_back(cudf::data_type{cudf::type_id::INT32}); - } - - for (int i = 0; i < 15; ++i) { - cols.push_back(*cudf::test::fixed_width_column_wrapper(d, d + num_rows, most_valid) - .release() - .release()); - schema.push_back(cudf::data_type{cudf::type_id::FLOAT32}); - } - - for (int i = 0; i < 15; ++i) { - cols.push_back(*cudf::test::fixed_width_column_wrapper(d, d + num_rows, most_valid) - .release() - .release()); - schema.push_back(cudf::data_type{cudf::type_id::FLOAT64}); - } - - for (int i = 0; i < 15; ++i) { - cols.push_back(*cudf::test::fixed_width_column_wrapper(r, r + num_rows, few_valid) - .release() - .release()); - schema.push_back(cudf::data_type{cudf::type_id::BOOL8}); - } - - for (int i = 0; i < 15; ++i) { - cols.push_back( - *cudf::test::fixed_width_column_wrapper( - r, r + num_rows, all_valid) - .release() - .release()); - schema.push_back(cudf::data_type{cudf::type_id::TIMESTAMP_MILLISECONDS}); - } - - for (int i = 0; i < 15; ++i) { - cols.push_back( - *cudf::test::fixed_width_column_wrapper( - r, r + num_rows, most_valid) - .release() - .release()); - schema.push_back(cudf::data_type{cudf::type_id::TIMESTAMP_DAYS}); - } - - for (int i = 0; i < 15; ++i) { - cols.push_back(*cudf::test::fixed_point_column_wrapper( - r, r + num_rows, all_valid, numeric::scale_type{-2}) - .release() - .release()); - schema.push_back(cudf::data_type{cudf::type_id::DECIMAL32}); - } - - for (int i = 0; i < 15; ++i) { - cols.push_back(*cudf::test::fixed_point_column_wrapper( - r, r + num_rows, most_valid, numeric::scale_type{-1}) - .release() - .release()); - schema.push_back(cudf::data_type{cudf::type_id::DECIMAL64}); - } - - std::vector views(cols.begin(), cols.end()); - cudf::table_view in(views); - - auto old_rows = spark_rapids_jni::convert_to_rows_fixed_width_optimized(in); - auto new_rows = spark_rapids_jni::convert_to_rows(in); - - for (uint i = 0; i < old_rows.size(); ++i) { - auto old_tbl = spark_rapids_jni::convert_from_rows_fixed_width_optimized( - cudf::lists_column_view(*old_rows[i]), schema); - auto new_tbl = - spark_rapids_jni::convert_from_rows(cudf::lists_column_view(*new_rows[i]), schema); - - CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*old_tbl, *new_tbl); - } -} - -TEST_F(RowToColumnTests, Non2Power) -{ - auto r = - cudf::detail::make_counting_transform_iterator(0, [](auto i) -> int32_t { return rand(); }); - std::vector> cols; - std::vector views; - std::vector schema; - - constexpr auto num_rows = 6 * 1024 + 557; - for (int i = 0; i < 131; ++i) { - cols.push_back(cudf::test::fixed_width_column_wrapper(r + num_rows * i, - r + num_rows * i + num_rows)); - views.push_back(cols.back()); - schema.push_back(cudf::data_type{cudf::type_id::INT32}); - } - cudf::table_view in(views); - - auto old_rows = spark_rapids_jni::convert_to_rows_fixed_width_optimized(in); - - for (uint 
i = 0; i < old_rows.size(); ++i) { - auto old_tbl = spark_rapids_jni::convert_from_rows_fixed_width_optimized( - cudf::lists_column_view(*old_rows[i]), schema); - auto new_tbl = - spark_rapids_jni::convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); - - CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*old_tbl, *new_tbl); - } -} - -TEST_F(RowToColumnTests, Big) -{ - auto r = - cudf::detail::make_counting_transform_iterator(0, [](auto i) -> int32_t { return rand(); }); - std::vector<cudf::test::fixed_width_column_wrapper<int32_t>> cols; - std::vector<cudf::column_view> views; - std::vector<cudf::data_type> schema; - - // 28 columns of 1 million rows - constexpr auto num_rows = 1024 * 1024; - for (int i = 0; i < 28; ++i) { - cols.push_back(cudf::test::fixed_width_column_wrapper<int32_t>(r + num_rows * i, - r + num_rows * i + num_rows)); - views.push_back(cols.back()); - schema.push_back(cudf::data_type{cudf::type_id::INT32}); - } - cudf::table_view in(views); - - auto old_rows = spark_rapids_jni::convert_to_rows_fixed_width_optimized(in); - - for (uint i = 0; i < old_rows.size(); ++i) { - auto old_tbl = spark_rapids_jni::convert_from_rows_fixed_width_optimized( - cudf::lists_column_view(*old_rows[i]), schema); - auto new_tbl = - spark_rapids_jni::convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); - - CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*old_tbl, *new_tbl); - } -} - -TEST_F(RowToColumnTests, Bigger) -{ - auto r = - cudf::detail::make_counting_transform_iterator(0, [](auto i) -> int32_t { return rand(); }); - std::vector<cudf::test::fixed_width_column_wrapper<int32_t>> cols; - std::vector<cudf::column_view> views; - std::vector<cudf::data_type> schema; - - // 128 columns of 1 million rows - constexpr auto num_rows = 1024 * 1024; - for (int i = 0; i < 128; ++i) { - cols.push_back(cudf::test::fixed_width_column_wrapper<int32_t>(r + num_rows * i, - r + num_rows * i + num_rows)); - views.push_back(cols.back()); - schema.push_back(cudf::data_type{cudf::type_id::INT32}); - } - cudf::table_view in(views); - - auto old_rows = spark_rapids_jni::convert_to_rows_fixed_width_optimized(in); - - for (uint i = 0; i < old_rows.size(); ++i) { - auto old_tbl = spark_rapids_jni::convert_from_rows_fixed_width_optimized( - cudf::lists_column_view(*old_rows[i]), schema); - auto new_tbl = - spark_rapids_jni::convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); - - CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*old_tbl, *new_tbl); - } -} - -TEST_F(RowToColumnTests, Biggest) -{ - auto r = - cudf::detail::make_counting_transform_iterator(0, [](auto i) -> int32_t { return rand(); }); - std::vector<cudf::test::fixed_width_column_wrapper<int32_t>> cols; - std::vector<cudf::column_view> views; - std::vector<cudf::data_type> schema; - - // 128 columns of 5 million rows - constexpr auto num_rows = 5 * 1024 * 1024; - for (int i = 0; i < 128; ++i) { - cols.push_back(cudf::test::fixed_width_column_wrapper<int32_t>(r + num_rows * i, - r + num_rows * i + num_rows)); - views.push_back(cols.back()); - schema.push_back(cudf::data_type{cudf::type_id::INT32}); - } - cudf::table_view in(views); - - auto old_rows = spark_rapids_jni::convert_to_rows_fixed_width_optimized(in); - auto new_rows = spark_rapids_jni::convert_to_rows(in); - - for (uint i = 0; i < old_rows.size(); ++i) { - auto old_tbl = spark_rapids_jni::convert_from_rows_fixed_width_optimized( - cudf::lists_column_view(*old_rows[i]), schema); - auto new_tbl = - spark_rapids_jni::convert_from_rows(cudf::lists_column_view(*new_rows[i]), schema); - - CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*old_tbl, *new_tbl); - } -} - -TEST_F(RowToColumnTests, SimpleString) -{ - cudf::test::fixed_width_column_wrapper<int32_t> a({-1, 0, 1, 0, -1}); - cudf::test::strings_column_wrapper b( - {"hello", "world", "this is a really long string to generate a longer row", 
"dlrow", "olleh"}); - cudf::table_view in(std::vector{a, b}); - std::vector schema = {cudf::data_type{cudf::type_id::INT32}, - cudf::data_type{cudf::type_id::STRING}}; - - auto new_rows = spark_rapids_jni::convert_to_rows(in); - EXPECT_EQ(new_rows.size(), 1); - for (auto& row : new_rows) { - auto new_cols = spark_rapids_jni::convert_from_rows(cudf::lists_column_view(*row), schema); - - EXPECT_EQ(row->size(), 5); - auto const num_columns = new_cols->num_columns(); - - cudf::strings_column_view str_col = new_cols->get_column(1).view(); - std::vector> col_data; - std::vector> offset_data; - for (int i = 0; i < num_columns; ++i) { - offset_data.emplace_back( - std::get<0>(cudf::test::to_host(str_col.offsets()))); - col_data.emplace_back(std::get<0>(cudf::test::to_host(str_col.chars()))); - } - - CUDF_TEST_EXPECT_TABLES_EQUIVALENT(in, *new_cols); - } -} - -TEST_F(RowToColumnTests, DoubleString) -{ - cudf::test::strings_column_wrapper a( - {"hello", "world", "this is a really long string to generate a longer row", "dlrow", "olleh"}); - cudf::test::fixed_width_column_wrapper b({0, 1, 2, 3, 4}); - cudf::test::strings_column_wrapper c({"world", - "hello", - "this string isn't as long", - "this one isn't so short though when you think about it", - "dlrow"}); - cudf::table_view in(std::vector{a, b, c}); - std::vector schema = {cudf::data_type{cudf::type_id::STRING}, - cudf::data_type{cudf::type_id::INT32}, - cudf::data_type{cudf::type_id::STRING}}; - - auto new_rows = spark_rapids_jni::convert_to_rows(in); - - for (uint i = 0; i < new_rows.size(); ++i) { - auto new_cols = - spark_rapids_jni::convert_from_rows(cudf::lists_column_view(*new_rows[i]), schema); - - EXPECT_EQ(new_rows[0]->size(), 5); - CUDF_TEST_EXPECT_TABLES_EQUIVALENT(in, *new_cols); - } -} - -TEST_F(RowToColumnTests, BigStrings) -{ - char const* TEST_STRINGS[] = { - "These", - "are", - "the", - "test", - "strings", - "that", - "we", - "have", - "some are really long", - "and some are kinda short", - "They are all over on purpose with different sizes for the strings in order to test the code " - "on all different lengths of strings", - "a", - "good test", - "is required to produce reasonable confidence that this is working"}; - auto num_generator = - cudf::detail::make_counting_transform_iterator(0, [](auto i) -> int32_t { return rand(); }); - auto string_generator = - cudf::detail::make_counting_transform_iterator(0, [&](auto i) -> char const* { - return TEST_STRINGS[rand() % (sizeof(TEST_STRINGS) / sizeof(TEST_STRINGS[0]))]; - }); - - auto const num_rows = 50; - auto const num_cols = 50; - std::vector schema; - - std::vector cols; - std::vector views; - - for (auto col = 0; col < num_cols; ++col) { - if (rand() % 2) { - cols.emplace_back( - cudf::test::fixed_width_column_wrapper(num_generator, num_generator + num_rows)); - views.push_back(cols.back()); - schema.emplace_back(cudf::data_type{cudf::type_id::INT32}); - } else { - cols.emplace_back( - cudf::test::strings_column_wrapper(string_generator, string_generator + num_rows)); - views.push_back(cols.back()); - schema.emplace_back(cudf::type_id::STRING); - } - } - - cudf::table_view in(views); - auto new_rows = spark_rapids_jni::convert_to_rows(in); - - for (auto& i : new_rows) { - auto new_cols = spark_rapids_jni::convert_from_rows(cudf::lists_column_view(*i), schema); - - auto in_view = cudf::slice(in, {0, new_cols->num_rows()}); - CUDF_TEST_EXPECT_TABLES_EQUIVALENT(in_view[0], *new_cols); - } -} - -TEST_F(RowToColumnTests, ManyStrings) -{ - char const* TEST_STRINGS[] = { - "These", 
- "are", - "the", - "test", - "strings", - "that", - "we", - "have", - "some are really long", - "and some are kinda short", - "They are all over on purpose with different sizes for the strings in order to test the code " - "on all different lengths of strings", - "this string is the longest string because it is duplicated more than you can imagine " - "this string is the longest string because it is duplicated more than you can imagine " - "this string is the longest string because it is duplicated more than you can imagine " - "this string is the longest string because it is duplicated more than you can imagine " - "this string is the longest string because it is duplicated more than you can imagine " - "this string is the longest string because it is duplicated more than you can imagine " - "this string is the longest string because it is duplicated more than you can imagine " - "this string is the longest string because it is duplicated more than you can imagine " - "this string is the longest string because it is duplicated more than you can imagine " - "this string is the longest string because it is duplicated more than you can imagine " - "this string is the longest string because it is duplicated more than you can imagine " - "this string is the longest string because it is duplicated more than you can imagine " - "this string is the longest string because it is duplicated more than you can imagine " - "this string is the longest string because it is duplicated more than you can imagine " - "this string is the longest string because it is duplicated more than you can imagine " - "this string is the longest string because it is duplicated more than you can imagine " - "this string is the longest string because it is duplicated more than you can imagine " - "this string is the longest string because it is duplicated more than you can imagine " - "this string is the longest string because it is duplicated more than you can imagine " - "this string is the longest string because it is duplicated more than you can imagine " - "this string is the longest string because it is duplicated more than you can imagine " - "this string is the longest string because it is duplicated more than you can imagine " - "this string is the longest string because it is duplicated more than you can imagine " - "this string is the longest string because it is duplicated more than you can imagine " - "this string is the longest string because it is duplicated more than you can imagine " - "this string is the longest string because it is duplicated more than you can imagine " - "a", - "good test", - "is required to produce reasonable confidence that this is working", - "some strings", - "are split into multiple strings", - "some strings have all their data", - "lots of choices of strings and sizes is sure to test the offset calculation code to ensure " - "that even a really long string ends up in the correct spot for the final destination allowing " - "for even crazy run-on sentences to be inserted into the data"}; - auto num_generator = - cudf::detail::make_counting_transform_iterator(0, [](auto i) -> int32_t { return rand(); }); - auto string_generator = - cudf::detail::make_counting_transform_iterator(0, [&](auto i) -> char const* { - return TEST_STRINGS[rand() % (sizeof(TEST_STRINGS) / sizeof(TEST_STRINGS[0]))]; - }); - - auto const num_rows = 500000; - auto const num_cols = 50; - std::vector schema; - - std::vector cols; - std::vector views; - - for (auto col = 0; col < num_cols; ++col) { - if (rand() % 
2) { - cols.emplace_back( - cudf::test::fixed_width_column_wrapper(num_generator, num_generator + num_rows)); - views.push_back(cols.back()); - schema.emplace_back(cudf::data_type{cudf::type_id::INT32}); - } else { - cols.emplace_back( - cudf::test::strings_column_wrapper(string_generator, string_generator + num_rows)); - views.push_back(cols.back()); - schema.emplace_back(cudf::type_id::STRING); - } - } - - cudf::table_view in(views); - auto new_rows = spark_rapids_jni::convert_to_rows(in); - - for (auto& i : new_rows) { - auto new_cols = spark_rapids_jni::convert_from_rows(cudf::lists_column_view(*i), schema); - - auto in_view = cudf::slice(in, {0, new_cols->num_rows()}); - CUDF_TEST_EXPECT_TABLES_EQUIVALENT(in_view[0], *new_cols); - } -} diff --git a/thirdparty/cudf b/thirdparty/cudf index 8b695e3403..36f56c97b9 160000 --- a/thirdparty/cudf +++ b/thirdparty/cudf @@ -1 +1 @@ -Subproject commit 8b695e340355d43261800a1cff876369e916ae90 +Subproject commit 36f56c97b94446f29fef5d2ddd8818275a28e406 From 38e503c5df1d9d8d6f84f3578abc2252e3b767f7 Mon Sep 17 00:00:00 2001 From: Jenkins Automation <70000568+nvauto@users.noreply.github.com> Date: Thu, 28 Dec 2023 11:30:23 +0800 Subject: [PATCH 072/127] Update submodule cudf to 72e6f9b08d3c52ca96ed64d963305ab9005ebff6 (#1669) Signed-off-by: spark-rapids automation <70000568+nvauto@users.noreply.github.com> --- thirdparty/cudf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/thirdparty/cudf b/thirdparty/cudf index 36f56c97b9..72e6f9b08d 160000 --- a/thirdparty/cudf +++ b/thirdparty/cudf @@ -1 +1 @@ -Subproject commit 36f56c97b94446f29fef5d2ddd8818275a28e406 +Subproject commit 72e6f9b08d3c52ca96ed64d963305ab9005ebff6 From 4358580f1f54e409786149049981b8bec16d2817 Mon Sep 17 00:00:00 2001 From: Navin Kumar <97137715+NVnavkumar@users.noreply.github.com> Date: Tue, 2 Jan 2024 07:49:12 -0800 Subject: [PATCH 073/127] Make the GpuTimeZoneDB class idempotent, such that when it is shutdown, it can be recovered and useable again (#1670) --- src/main/java/com/nvidia/spark/rapids/jni/GpuTimeZoneDB.java | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/main/java/com/nvidia/spark/rapids/jni/GpuTimeZoneDB.java b/src/main/java/com/nvidia/spark/rapids/jni/GpuTimeZoneDB.java index 0eb56100e4..b63a9dc282 100644 --- a/src/main/java/com/nvidia/spark/rapids/jni/GpuTimeZoneDB.java +++ b/src/main/java/com/nvidia/spark/rapids/jni/GpuTimeZoneDB.java @@ -90,6 +90,8 @@ public Thread newThread(Runnable r) { public static void shutdown() { if (instance.isLoaded()) { instance.close(); + // Recreate a new instance to reload the database if necessary + instance = new GpuTimeZoneDB(); } } From 2b50b5415cc5482fe121e23b217a5fe60f3944e8 Mon Sep 17 00:00:00 2001 From: Jason Lowe Date: Tue, 2 Jan 2024 14:19:53 -0600 Subject: [PATCH 074/127] Update copyright date in NOTICE file (#1673) Signed-off-by: Jason Lowe --- NOTICE | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/NOTICE b/NOTICE index a0975c00c8..5e01c7e14c 100644 --- a/NOTICE +++ b/NOTICE @@ -1,5 +1,5 @@ RAPIDS Accelerator JNI For Apache Spark -Copyright (c) 2022-2023, NVIDIA CORPORATION +Copyright (c) 2022-2024, NVIDIA CORPORATION -------------------------------------------------------------------------------- From cbbf553b4d8fe43b6c326bfbf1157f98b15cb4e1 Mon Sep 17 00:00:00 2001 From: Jenkins Automation <70000568+nvauto@users.noreply.github.com> Date: Wed, 3 Jan 2024 11:26:05 +0800 Subject: [PATCH 075/127] Update submodule cudf to af65d52c7d4ca41606482926bdcc001644b7d108 (#1674) 
Signed-off-by: spark-rapids automation <70000568+nvauto@users.noreply.github.com> --- thirdparty/cudf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/thirdparty/cudf b/thirdparty/cudf index 72e6f9b08d..af65d52c7d 160000 --- a/thirdparty/cudf +++ b/thirdparty/cudf @@ -1 +1 @@ -Subproject commit 72e6f9b08d3c52ca96ed64d963305ab9005ebff6 +Subproject commit af65d52c7d4ca41606482926bdcc001644b7d108 From fd95e5c2032daab28a812c37f70bc63fcbd2bb48 Mon Sep 17 00:00:00 2001 From: Jenkins Automation <70000568+nvauto@users.noreply.github.com> Date: Thu, 4 Jan 2024 12:05:34 +0800 Subject: [PATCH 076/127] Update submodule cudf to 4c01e9513c28ef590184d34c0c54292743562c8f (#1675) Signed-off-by: spark-rapids automation <70000568+nvauto@users.noreply.github.com> --- thirdparty/cudf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/thirdparty/cudf b/thirdparty/cudf index af65d52c7d..4c01e9513c 160000 --- a/thirdparty/cudf +++ b/thirdparty/cudf @@ -1 +1 @@ -Subproject commit af65d52c7d4ca41606482926bdcc001644b7d108 +Subproject commit 4c01e9513c28ef590184d34c0c54292743562c8f From 1c34077a5afee0fe707d6b45bccf255e6967218a Mon Sep 17 00:00:00 2001 From: Jenkins Automation <70000568+nvauto@users.noreply.github.com> Date: Fri, 5 Jan 2024 05:31:14 +0800 Subject: [PATCH 077/127] Update submodule cudf to fab5af24afd36a2a58fc18492bc79b4212762b96 (#1677) Signed-off-by: spark-rapids automation <70000568+nvauto@users.noreply.github.com> --- thirdparty/cudf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/thirdparty/cudf b/thirdparty/cudf index 4c01e9513c..fab5af24af 160000 --- a/thirdparty/cudf +++ b/thirdparty/cudf @@ -1 +1 @@ -Subproject commit 4c01e9513c28ef590184d34c0c54292743562c8f +Subproject commit fab5af24afd36a2a58fc18492bc79b4212762b96 From e3fe4158690ec584ac826fbc1d0a0a77387369cd Mon Sep 17 00:00:00 2001 From: Haoyang Li Date: Fri, 5 Jan 2024 09:10:08 +0800 Subject: [PATCH 078/127] Fix a bug in format_float kernel (#1676) Signed-off-by: Haoyang Li --- src/main/cpp/src/ftos_converter.cuh | 44 ++++++++++++++--------------- src/main/cpp/tests/format_float.cpp | 6 +++- 2 files changed, 26 insertions(+), 24 deletions(-) diff --git a/src/main/cpp/src/ftos_converter.cuh b/src/main/cpp/src/ftos_converter.cuh index e684f73921..cbbf28e749 100644 --- a/src/main/cpp/src/ftos_converter.cuh +++ b/src/main/cpp/src/ftos_converter.cuh @@ -1202,11 +1202,8 @@ __device__ inline T round_half_even(T const input, int const olength, int const { // "round" a integer to digits digits, with the half-even rounding mode. 
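// NOTE (illustrative sketch, not part of the upstream patch): `digits` here
// is the count of leading decimal digits to keep. Assuming d2d yields
// mantissa 3232794248339 (olength = 13) for the 3232.794248339 case in the
// test added below, formatting with 5 fractional digits reaches this helper
// as round_half_even(3232794248339, 13, 9):
//   div = POW10_TABLE[13 - 9] = 10000
//   mod = 3232794248339 % 10000 = 8339   // > div / 2, so round up
//   num = 3232794248339 / 10000 + 1 = 323279425
// which to_formated_double_chars then splits as "3,232.79425".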
if (digits > olength) { - T num = input; - for (int i = 0; i < digits - olength; i++) { - num *= 10; - } - return num; + // trailing zeros will be handled later + return input; } T div = POW10_TABLE[olength - digits]; T mod = input % div; @@ -1215,10 +1212,10 @@ __device__ inline T round_half_even(T const input, int const olength, int const return num; } -__device__ inline int to_formated_chars(floating_decimal_64 const v, - bool const sign, - char* const result, - int digits) +__device__ inline int to_formated_double_chars(floating_decimal_64 const v, + bool const sign, + char* const result, + int digits) { int index = 0; if (sign) { result[index++] = '-'; } @@ -1289,9 +1286,10 @@ __device__ inline int to_formated_chars(floating_decimal_64 const v, result[index++] = '0'; } } else { + // 0 <= exp < olength - 1 uint32_t temp_d = digits, tailing_zero = 0; - if (exp + digits > olength) { - temp_d = olength - exp; + if (exp + digits + 1 > olength) { + temp_d = olength - exp - 1; tailing_zero = digits - temp_d; } uint64_t rounded_output = round_half_even(output, olength, exp + temp_d + 1); @@ -1329,7 +1327,7 @@ __device__ inline int to_formated_chars(floating_decimal_64 const v, return index; } -__device__ inline int format_float_size(floating_decimal_64 const v, bool const sign, int digits) +__device__ inline int format_double_size(floating_decimal_64 const v, bool const sign, int digits) { int index = 0; if (sign) { index++; } @@ -1342,7 +1340,7 @@ __device__ inline int format_float_size(floating_decimal_64 const v, bool const index += exp + 1 + exp / 3 + 1 + digits; } else { uint32_t temp_d = digits; - if (exp + digits > olength) { temp_d = olength - exp; } + if (exp + digits + 1 > olength) { temp_d = olength - exp - 1; } uint64_t rounded_output = round_half_even(output, olength, exp + temp_d + 1); uint64_t pow10 = POW10_TABLE[temp_d]; uint64_t integer = rounded_output / pow10; @@ -1353,10 +1351,10 @@ __device__ inline int format_float_size(floating_decimal_64 const v, bool const return index; } -__device__ inline int to_formated_chars(floating_decimal_32 const v, - bool const sign, - char* const result, - int digits) +__device__ inline int to_formated_float_chars(floating_decimal_32 const v, + bool const sign, + char* const result, + int digits) { int index = 0; if (sign) { result[index++] = '-'; } @@ -1428,8 +1426,8 @@ __device__ inline int to_formated_chars(floating_decimal_32 const v, } } else { uint32_t temp_d = digits, tailing_zero = 0; - if (exp + digits > olength) { - temp_d = olength - exp; + if (exp + digits + 1 > olength) { + temp_d = olength - exp - 1; tailing_zero = digits - temp_d; } uint32_t rounded_output = round_half_even(output, olength, exp + temp_d + 1); @@ -1480,7 +1478,7 @@ __device__ inline int format_float_size(floating_decimal_32 const v, bool const index += exp + 1 + exp / 3 + 1 + digits; } else { uint32_t temp_d = digits; - if (exp + digits > olength) { temp_d = olength - exp; } + if (exp + digits + 1 > olength) { temp_d = olength - exp - 1; } uint64_t rounded_output = round_half_even(output, olength, exp + temp_d + 1); uint64_t pow10 = POW10_TABLE[temp_d]; uint64_t integer = rounded_output / pow10; @@ -1539,7 +1537,7 @@ __device__ inline int compute_format_float_size(double value, int digits, bool i } else { floating_decimal_64 v = d2d(value, sign, special); if (special) { return special_format_str_size(sign, v.exponent, v.mantissa, digits); } - return format_float_size(v, sign, digits); + return format_double_size(v, sign, digits); } } @@ -1549,11 +1547,11 @@ 
__device__ inline int format_float(double value, int digits, bool is_float, char if (is_float) { floating_decimal_32 v = f2d(value, sign, special); if (special) { return copy_format_special_str(output, sign, v.exponent, v.mantissa, digits); } - return to_formated_chars(v, sign, output, digits); + return to_formated_float_chars(v, sign, output, digits); } else { floating_decimal_64 v = d2d(value, sign, special); if (special) { return copy_format_special_str(output, sign, v.exponent, v.mantissa, digits); } - return to_formated_chars(v, sign, output, digits); + return to_formated_double_chars(v, sign, output, digits); } } diff --git a/src/main/cpp/tests/format_float.cpp b/src/main/cpp/tests/format_float.cpp index b9d77593db..20989a8c20 100644 --- a/src/main/cpp/tests/format_float.cpp +++ b/src/main/cpp/tests/format_float.cpp @@ -68,6 +68,8 @@ TEST_F(FormatFloatTests, FormatFloats64) -4.0d, std::numeric_limits::quiet_NaN(), 839542223232.794248339d, + 3232.794248339d, + 11234000000.0d, -0.0d}; auto const expected = cudf::test::strings_column_wrapper{"100.00000", @@ -80,9 +82,11 @@ TEST_F(FormatFloatTests, FormatFloats64) "-4.00000", "\xEF\xBF\xBD", "839,542,223,232.79420", + "3,232.79425", + "11,234,000,000.00000", "-0.00000"}; auto results = spark_rapids_jni::format_float(floats, 5, cudf::get_default_stream()); CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected, verbosity); -} \ No newline at end of file +} From ce9a6cc07f86df278c568313dbbce49c22189ce2 Mon Sep 17 00:00:00 2001 From: Jenkins Automation <70000568+nvauto@users.noreply.github.com> Date: Fri, 5 Jan 2024 17:30:56 +0800 Subject: [PATCH 079/127] Update submodule cudf to b83ab433fb1eb6eb832cf72cb3574909e270edaf (#1680) Signed-off-by: spark-rapids automation <70000568+nvauto@users.noreply.github.com> --- thirdparty/cudf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/thirdparty/cudf b/thirdparty/cudf index fab5af24af..b83ab433fb 160000 --- a/thirdparty/cudf +++ b/thirdparty/cudf @@ -1 +1 @@ -Subproject commit fab5af24afd36a2a58fc18492bc79b4212762b96 +Subproject commit b83ab433fb1eb6eb832cf72cb3574909e270edaf From b32430372cd63b0fc3449b000a93241e03b5e8ea Mon Sep 17 00:00:00 2001 From: Jenkins Automation <70000568+nvauto@users.noreply.github.com> Date: Fri, 5 Jan 2024 23:29:55 +0800 Subject: [PATCH 080/127] Update submodule cudf to 9c7b05b75de5ebdb438643ad1bdf4fa0b821a480 (#1681) Signed-off-by: spark-rapids automation <70000568+nvauto@users.noreply.github.com> --- thirdparty/cudf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/thirdparty/cudf b/thirdparty/cudf index b83ab433fb..9c7b05b75d 160000 --- a/thirdparty/cudf +++ b/thirdparty/cudf @@ -1 +1 @@ -Subproject commit b83ab433fb1eb6eb832cf72cb3574909e270edaf +Subproject commit 9c7b05b75de5ebdb438643ad1bdf4fa0b821a480 From 82de17bd73d28fe82f1918a5bef5518d69d4340f Mon Sep 17 00:00:00 2001 From: Jenkins Automation <70000568+nvauto@users.noreply.github.com> Date: Sat, 6 Jan 2024 11:25:36 +0800 Subject: [PATCH 081/127] Update submodule cudf to 6083efa73b3282a457c963f68a8cab94d41cdd70 (#1682) Signed-off-by: spark-rapids automation <70000568+nvauto@users.noreply.github.com> --- thirdparty/cudf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/thirdparty/cudf b/thirdparty/cudf index 9c7b05b75d..6083efa73b 160000 --- a/thirdparty/cudf +++ b/thirdparty/cudf @@ -1 +1 @@ -Subproject commit 9c7b05b75de5ebdb438643ad1bdf4fa0b821a480 +Subproject commit 6083efa73b3282a457c963f68a8cab94d41cdd70 From 473270bbfdfca5b0b8ceb914f1d06060003483ab Mon Sep 17 
00:00:00 2001 From: Jenkins Automation <70000568+nvauto@users.noreply.github.com> Date: Tue, 9 Jan 2024 05:29:53 +0800 Subject: [PATCH 082/127] Update submodule cudf to ba7550a17f57d17d6d6decec3b2f8a0a5f687aa8 (#1684) Signed-off-by: spark-rapids automation <70000568+nvauto@users.noreply.github.com> --- thirdparty/cudf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/thirdparty/cudf b/thirdparty/cudf index 6083efa73b..ba7550a17f 160000 --- a/thirdparty/cudf +++ b/thirdparty/cudf @@ -1 +1 @@ -Subproject commit 6083efa73b3282a457c963f68a8cab94d41cdd70 +Subproject commit ba7550a17f57d17d6d6decec3b2f8a0a5f687aa8 From 4c6de74cbb71accec464b6be97963fb3df85f680 Mon Sep 17 00:00:00 2001 From: Haoyang Li Date: Tue, 9 Jan 2024 09:24:56 +0800 Subject: [PATCH 083/127] Fix compile warnings and refactor ftos_converter.cuh (#1679) * Fix compile warnings and refactor ftos_converter.cuh Signed-off-by: Haoyang Li * Apply suggestions from code review Co-authored-by: Nghia Truong <7416935+ttnghia@users.noreply.github.com> * address comments Signed-off-by: Haoyang Li --------- Signed-off-by: Haoyang Li Co-authored-by: Nghia Truong <7416935+ttnghia@users.noreply.github.com> --- src/main/cpp/src/ftos_converter.cuh | 423 ++++++++++------------------ 1 file changed, 146 insertions(+), 277 deletions(-) diff --git a/src/main/cpp/src/ftos_converter.cuh b/src/main/cpp/src/ftos_converter.cuh index cbbf28e749..c2fa07377c 100644 --- a/src/main/cpp/src/ftos_converter.cuh +++ b/src/main/cpp/src/ftos_converter.cuh @@ -147,9 +147,25 @@ __constant__ uint64_t const DOUBLE_POW5_TABLE[POW5_TABLE_SIZE] = {1ull, //===== common.h from ryu ===== -// Returns the number of decimal digits in v, which must not contain more than 9 digits. -__device__ inline uint32_t decimalLength9(uint32_t const v) +// Returns the number of decimal digits in v, which must not contain more than 9/17 digits. +template +__device__ inline uint32_t decimal_length(T const v) { + static_assert(std::is_integral_v && !std::is_signed_v); + if constexpr (sizeof(T) == sizeof(int64_t)) { + // The average output length is 16.38 digits, so we check high-to-low. + // Function precondition: v is not an 18, 19, or 20-digit number. + // (17 digits are sufficient for round-tripping.) + assert(v < 100000000000000000L); + if (v >= 10000000000000000L) { return 17; } + if (v >= 1000000000000000L) { return 16; } + if (v >= 100000000000000L) { return 15; } + if (v >= 10000000000000L) { return 14; } + if (v >= 1000000000000L) { return 13; } + if (v >= 100000000000L) { return 12; } + if (v >= 10000000000L) { return 11; } + if (v >= 1000000000L) { return 10; } + } // Function precondition: v is not a 10-digit number. // (f2s: 9 digits are sufficient for round-tripping.) // (d2fixed: We print 9-digit blocks.) @@ -173,7 +189,7 @@ __device__ inline int32_t pow5bits(int32_t const e) // than 2^9297. assert(e >= 0); assert(e <= 3528); - return (int32_t)(((((uint32_t)e) * 1217359) >> 19) + 1); + return static_cast((((static_cast(e)) * 1217359) >> 19) + 1); } // Returns floor(log_10(2^e)); requires 0 <= e <= 1650. @@ -182,7 +198,7 @@ __device__ inline uint32_t log10Pow2(int32_t const e) // The first value this approximation fails for is 2^1651 which is just greater than 10^297. assert(e >= 0); assert(e <= 1650); - return (((uint32_t)e) * 78913) >> 18; + return ((static_cast(e)) * 78913) >> 18; } // Returns floor(log_10(5^e)); requires 0 <= e <= 2620. 
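The merged `decimal_length` template above replaces the separate `decimalLength9`/`decimalLength17` helpers. As a rough host-side restatement (a sketch only — the real routine is a `__device__` function, and the 17-digit precondition comes from ryu's round-trip guarantee), it behaves like:

```cpp
#include <cassert>
#include <cstdint>
#include <type_traits>

// Host-side sketch of the merged digit-count helper. 32-bit mantissas carry
// at most 9 digits (f2s), 64-bit mantissas at most 17 (d2s); the 64-bit
// ladder compiles away for uint32_t inputs via if constexpr.
template <typename T>
uint32_t decimal_length(T const v)
{
  static_assert(std::is_integral_v<T> && !std::is_signed_v<T>);
  if constexpr (sizeof(T) == sizeof(uint64_t)) {
    assert(v < 100000000000000000ULL);  // 17 digits round-trip a double
    if (v >= 10000000000000000ULL) { return 17; }
    if (v >= 1000000000000000ULL) { return 16; }
    if (v >= 100000000000000ULL) { return 15; }
    if (v >= 10000000000000ULL) { return 14; }
    if (v >= 1000000000000ULL) { return 13; }
    if (v >= 100000000000ULL) { return 12; }
    if (v >= 10000000000ULL) { return 11; }
    if (v >= 1000000000ULL) { return 10; }
  }
  // Shared tail: at this point v has at most 9 digits.
  if (v >= 100000000u) { return 9; }
  if (v >= 10000000u) { return 8; }
  if (v >= 1000000u) { return 7; }
  if (v >= 100000u) { return 6; }
  if (v >= 10000u) { return 5; }
  if (v >= 1000u) { return 4; }
  if (v >= 100u) { return 3; }
  if (v >= 10u) { return 2; }
  return 1;
}

int main()
{
  assert(decimal_length(uint32_t{794248339u}) == 9);
  assert(decimal_length(uint64_t{3232794248339ULL}) == 13);
  return 0;
}
```

Folding both ladders into one template lets the 9-digit tail be shared, which is what allows the duplicated `decimalLength17` to be deleted further down in this diff.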
@@ -191,7 +207,7 @@ __device__ inline uint32_t log10Pow5(int32_t const e) // The first value this approximation fails for is 5^2621 which is just greater than 10^1832. assert(e >= 0); assert(e <= 2620); - return (((uint32_t)e) * 732923) >> 20; + return (static_cast(e) * 732923) >> 20; } __device__ inline uint32_t pow5factor_32(uint32_t value) @@ -229,15 +245,15 @@ __device__ inline uint32_t mulShift32(uint32_t const m, uint64_t const factor, i // The casts here help MSVC to avoid calls to the __allmul library // function. - uint32_t const factorLo = (uint32_t)(factor); - uint32_t const factorHi = (uint32_t)(factor >> 32); - uint64_t const bits0 = (uint64_t)m * factorLo; - uint64_t const bits1 = (uint64_t)m * factorHi; + uint32_t const factorLo = static_cast(factor); + uint32_t const factorHi = static_cast(factor >> 32); + uint64_t const bits0 = static_cast(m) * factorLo; + uint64_t const bits1 = static_cast(m) * factorHi; uint64_t const sum = (bits0 >> 32) + bits1; uint64_t const shiftedSum = sum >> (shift - 32); assert(shiftedSum <= UINT32_MAX); - return (uint32_t)shiftedSum; + return static_cast(shiftedSum); } __device__ inline int copy_special_str(char* const result, @@ -284,29 +300,29 @@ __device__ inline uint64_t double_to_bits(double const d) __device__ inline uint64_t umul128(uint64_t const a, uint64_t const b, uint64_t* const productHi) { // The casts here help MSVC to avoid calls to the __allmul library function. - uint32_t const aLo = (uint32_t)a; - uint32_t const aHi = (uint32_t)(a >> 32); - uint32_t const bLo = (uint32_t)b; - uint32_t const bHi = (uint32_t)(b >> 32); + uint32_t const aLo = static_cast(a); + uint32_t const aHi = static_cast(a >> 32); + uint32_t const bLo = static_cast(b); + uint32_t const bHi = static_cast(b >> 32); - uint64_t const b00 = (uint64_t)aLo * bLo; - uint64_t const b01 = (uint64_t)aLo * bHi; - uint64_t const b10 = (uint64_t)aHi * bLo; - uint64_t const b11 = (uint64_t)aHi * bHi; + uint64_t const b00 = static_cast(aLo) * bLo; + uint64_t const b01 = static_cast(aLo) * bHi; + uint64_t const b10 = static_cast(aHi) * bLo; + uint64_t const b11 = static_cast(aHi) * bHi; - uint32_t const b00Lo = (uint32_t)b00; - uint32_t const b00Hi = (uint32_t)(b00 >> 32); + uint32_t const b00Lo = static_cast(b00); + uint32_t const b00Hi = static_cast(b00 >> 32); uint64_t const mid1 = b10 + b00Hi; - uint32_t const mid1Lo = (uint32_t)(mid1); - uint32_t const mid1Hi = (uint32_t)(mid1 >> 32); + uint32_t const mid1Lo = static_cast(mid1); + uint32_t const mid1Hi = static_cast(mid1 >> 32); uint64_t const mid2 = b01 + mid1Lo; - uint32_t const mid2Lo = (uint32_t)(mid2); - uint32_t const mid2Hi = (uint32_t)(mid2 >> 32); + uint32_t const mid2Lo = static_cast(mid2); + uint32_t const mid2Hi = static_cast(mid2 >> 32); uint64_t const pHi = b11 + mid1Hi + mid2Hi; - uint64_t const pLo = ((uint64_t)mid2Lo << 32) | b00Lo; + uint64_t const pLo = (static_cast(mid2Lo) << 32) | b00Lo; *productHi = pHi; return pLo; @@ -461,42 +477,16 @@ __device__ inline uint32_t mulPow5divPow2(uint32_t const m, uint32_t const i, in //===== d2s.c and f2s.c from ryu ===== -__device__ inline uint32_t decimalLength17(uint64_t const v) -{ - // This is slightly faster than a loop. - // The average output length is 16.38 digits, so we check high-to-low. - // Function precondition: v is not an 18, 19, or 20-digit number. - // (17 digits are sufficient for round-tripping.) 
- assert(v < 100000000000000000L); - if (v >= 10000000000000000L) { return 17; } - if (v >= 1000000000000000L) { return 16; } - if (v >= 100000000000000L) { return 15; } - if (v >= 10000000000000L) { return 14; } - if (v >= 1000000000000L) { return 13; } - if (v >= 100000000000L) { return 12; } - if (v >= 10000000000L) { return 11; } - if (v >= 1000000000L) { return 10; } - if (v >= 100000000L) { return 9; } - if (v >= 10000000L) { return 8; } - if (v >= 1000000L) { return 7; } - if (v >= 100000L) { return 6; } - if (v >= 10000L) { return 5; } - if (v >= 1000L) { return 4; } - if (v >= 100L) { return 3; } - if (v >= 10L) { return 2; } - return 1; -} - __device__ inline floating_decimal_64 d2d(uint64_t const ieeeMantissa, uint32_t const ieeeExponent) { int32_t e2; uint64_t m2; if (ieeeExponent == 0) { // We subtract 2 so that the bounds computation has 2 additional bits. - e2 = 1 - DOUBLE_BIAS - DOUBLE_MANTISSA_BITS - 2; + e2 = static_cast(1 - DOUBLE_BIAS - DOUBLE_MANTISSA_BITS - 2); m2 = ieeeMantissa; } else { - e2 = (int32_t)ieeeExponent - DOUBLE_BIAS - DOUBLE_MANTISSA_BITS - 2; + e2 = static_cast(ieeeExponent) - DOUBLE_BIAS - DOUBLE_MANTISSA_BITS - 2; m2 = (1ull << DOUBLE_MANTISSA_BITS) | ieeeMantissa; } bool const even = (m2 & 1) == 0; @@ -519,9 +509,9 @@ __device__ inline floating_decimal_64 d2d(uint64_t const ieeeMantissa, uint32_t // I tried special-casing q == 0, but there was no effect on performance. // This expression is slightly faster than max(0, log10Pow2(e2) - 1). uint32_t const q = log10Pow2(e2) - (e2 > 3); - e10 = (int32_t)q; - int32_t const k = DOUBLE_POW5_INV_BITCOUNT + pow5bits((int32_t)q) - 1; - int32_t const i = -e2 + (int32_t)q + k; + e10 = static_cast(q); + int32_t const k = DOUBLE_POW5_INV_BITCOUNT + pow5bits(static_cast(q)) - 1; + int32_t const i = -e2 + static_cast(q) + k; uint64_t pow5[2]; double_computeInvPow5(q, pow5); vr = mulShiftAll64(m2, pow5, i, &vp, &vm, mmShift); @@ -530,7 +520,7 @@ __device__ inline floating_decimal_64 d2d(uint64_t const ieeeMantissa, uint32_t // This should use q <= 22, but I think 21 is also safe. Smaller values // may still be safe, but it's more difficult to reason about them. // Only one of mp, mv, and mm can be a multiple of 5, if any. - uint32_t const mvMod5 = ((uint32_t)mv) - 5 * ((uint32_t)div5(mv)); + uint32_t const mvMod5 = (static_cast(mv)) - 5 * (static_cast(div5(mv))); if (mvMod5 == 0) { vrIsTrailingZeros = multipleOfPowerOf5(mv, q); } else if (acceptBounds) { @@ -546,10 +536,10 @@ __device__ inline floating_decimal_64 d2d(uint64_t const ieeeMantissa, uint32_t } else { // This expression is slightly faster than max(0, log10Pow5(-e2) - 1). 
uint32_t const q = log10Pow5(-e2) - (-e2 > 1); - e10 = (int32_t)q + e2; - int32_t const i = -e2 - (int32_t)q; + e10 = static_cast(q) + e2; + int32_t const i = -e2 - static_cast(q); int32_t const k = pow5bits(i) - DOUBLE_POW5_BITCOUNT; - int32_t const j = (int32_t)q - k; + int32_t const j = static_cast(q) - k; uint64_t pow5[2]; double_computePow5(i, pow5); @@ -586,12 +576,12 @@ __device__ inline floating_decimal_64 d2d(uint64_t const ieeeMantissa, uint32_t uint64_t const vpDiv10 = div10(vp); uint64_t const vmDiv10 = div10(vm); if (vpDiv10 <= vmDiv10) { break; } - uint32_t const vmMod10 = ((uint32_t)vm) - 10 * ((uint32_t)vmDiv10); + uint32_t const vmMod10 = (static_cast(vm)) - 10 * (static_cast(vmDiv10)); uint64_t const vrDiv10 = div10(vr); - uint32_t const vrMod10 = ((uint32_t)vr) - 10 * ((uint32_t)vrDiv10); + uint32_t const vrMod10 = (static_cast(vr)) - 10 * (static_cast(vrDiv10)); vmIsTrailingZeros &= vmMod10 == 0; vrIsTrailingZeros &= lastRemovedDigit == 0; - lastRemovedDigit = (uint8_t)vrMod10; + lastRemovedDigit = static_cast(vrMod10); vr = vrDiv10; vp = vpDiv10; vm = vmDiv10; @@ -601,13 +591,15 @@ __device__ inline floating_decimal_64 d2d(uint64_t const ieeeMantissa, uint32_t if (vmIsTrailingZeros) { for (;;) { uint64_t const vmDiv10 = div10(vm); - uint32_t const vmMod10 = ((uint32_t)vm) - 10 * ((uint32_t)vmDiv10); + uint32_t const vmMod10 = + (static_cast(vm)) - 10 * (static_cast(vmDiv10)); if (vmMod10 != 0) { break; } uint64_t const vpDiv10 = div10(vp); uint64_t const vrDiv10 = div10(vr); - uint32_t const vrMod10 = ((uint32_t)vr) - 10 * ((uint32_t)vrDiv10); + uint32_t const vrMod10 = + (static_cast(vr)) - 10 * (static_cast(vrDiv10)); vrIsTrailingZeros &= lastRemovedDigit == 0; - lastRemovedDigit = (uint8_t)vrMod10; + lastRemovedDigit = static_cast(vrMod10); vr = vrDiv10; vp = vpDiv10; vm = vmDiv10; @@ -628,11 +620,12 @@ __device__ inline floating_decimal_64 d2d(uint64_t const ieeeMantissa, uint32_t uint64_t const vmDiv100 = div100(vm); if (vpDiv100 > vmDiv100) { // Optimization: remove two digits at a time (~86.2%). uint64_t const vrDiv100 = div100(vr); - uint32_t const vrMod100 = ((uint32_t)vr) - 100 * ((uint32_t)vrDiv100); - roundUp = vrMod100 >= 50; - vr = vrDiv100; - vp = vpDiv100; - vm = vmDiv100; + uint32_t const vrMod100 = + (static_cast(vr)) - 100 * (static_cast(vrDiv100)); + roundUp = vrMod100 >= 50; + vr = vrDiv100; + vp = vpDiv100; + vm = vmDiv100; removed += 2; } // Loop iterations below (approximately), without optimization above: @@ -644,7 +637,7 @@ __device__ inline floating_decimal_64 d2d(uint64_t const ieeeMantissa, uint32_t uint64_t const vmDiv10 = div10(vm); if (vpDiv10 <= vmDiv10) { break; } uint64_t const vrDiv10 = div10(vr); - uint32_t const vrMod10 = ((uint32_t)vr) - 10 * ((uint32_t)vrDiv10); + uint32_t const vrMod10 = (static_cast(vr)) - 10 * (static_cast(vrDiv10)); roundUp = vrMod10 >= 5; vr = vrDiv10; vp = vpDiv10; @@ -669,10 +662,10 @@ __device__ inline floating_decimal_32 f2d(uint32_t const ieeeMantissa, uint32_t uint32_t m2; if (ieeeExponent == 0) { // We subtract 2 so that the bounds computation has 2 additional bits. 
- e2 = 1 - FLOAT_BIAS - FLOAT_MANTISSA_BITS - 2; + e2 = static_cast(1 - FLOAT_BIAS - FLOAT_MANTISSA_BITS - 2); m2 = ieeeMantissa; } else { - e2 = (int32_t)ieeeExponent - FLOAT_BIAS - FLOAT_MANTISSA_BITS - 2; + e2 = static_cast(ieeeExponent) - FLOAT_BIAS - FLOAT_MANTISSA_BITS - 2; m2 = (1u << FLOAT_MANTISSA_BITS) | ieeeMantissa; } bool const even = (m2 & 1) == 0; @@ -693,9 +686,9 @@ __device__ inline floating_decimal_32 f2d(uint32_t const ieeeMantissa, uint32_t uint8_t lastRemovedDigit = 0; if (e2 >= 0) { uint32_t const q = log10Pow2(e2); - e10 = (int32_t)q; - int32_t const k = FLOAT_POW5_INV_BITCOUNT + pow5bits((int32_t)q) - 1; - int32_t const i = -e2 + (int32_t)q + k; + e10 = static_cast(q); + int32_t const k = FLOAT_POW5_INV_BITCOUNT + pow5bits(static_cast(q)) - 1; + int32_t const i = -e2 + static_cast(q) + k; vr = mulPow5InvDivPow2(mv, q, i); vp = mulPow5InvDivPow2(mp, q, i); vm = mulPow5InvDivPow2(mm, q, i); @@ -703,8 +696,9 @@ __device__ inline floating_decimal_32 f2d(uint32_t const ieeeMantissa, uint32_t // We need to know one removed digit even if we are not going to loop below. We could use // q = X - 1 above, except that would require 33 bits for the result, and we've found that // 32-bit arithmetic is faster even on 64-bit machines. - int32_t const l = FLOAT_POW5_INV_BITCOUNT + pow5bits((int32_t)(q - 1)) - 1; - lastRemovedDigit = (uint8_t)(mulPow5InvDivPow2(mv, q - 1, -e2 + (int32_t)q - 1 + l) % 10); + int32_t const l = FLOAT_POW5_INV_BITCOUNT + pow5bits(static_cast(q - 1)) - 1; + lastRemovedDigit = static_cast( + mulPow5InvDivPow2(mv, q - 1, -e2 + static_cast(q) - 1 + l) % 10); } if (q <= 9) { // The largest power of 5 that fits in 24 bits is 5^10, but q <= 9 seems to be safe as well. @@ -719,16 +713,17 @@ __device__ inline floating_decimal_32 f2d(uint32_t const ieeeMantissa, uint32_t } } else { uint32_t const q = log10Pow5(-e2); - e10 = (int32_t)q + e2; - int32_t const i = -e2 - (int32_t)q; + e10 = static_cast(q) + e2; + int32_t const i = -e2 - static_cast(q); int32_t const k = pow5bits(i) - FLOAT_POW5_BITCOUNT; - int32_t j = (int32_t)q - k; - vr = mulPow5divPow2(mv, (uint32_t)i, j); - vp = mulPow5divPow2(mp, (uint32_t)i, j); - vm = mulPow5divPow2(mm, (uint32_t)i, j); + int32_t j = static_cast(q) - k; + vr = mulPow5divPow2(mv, static_cast(i), j); + vp = mulPow5divPow2(mp, static_cast(i), j); + vm = mulPow5divPow2(mm, static_cast(i), j); if (q != 0 && (vp - 1) / 10 <= vm / 10) { - j = (int32_t)q - 1 - (pow5bits(i + 1) - FLOAT_POW5_BITCOUNT); - lastRemovedDigit = (uint8_t)(mulPow5divPow2(mv, (uint32_t)(i + 1), j) % 10); + j = static_cast(q) - 1 - (pow5bits(i + 1) - FLOAT_POW5_BITCOUNT); + lastRemovedDigit = + static_cast(mulPow5divPow2(mv, static_cast(i + 1), j) % 10); } if (q <= 1) { // {vr,vp,vm} is trailing zeros if {mv,mp,mm} has at least q trailing 0 bits. 
@@ -754,7 +749,7 @@ __device__ inline floating_decimal_32 f2d(uint32_t const ieeeMantissa, uint32_t while (vp / 10 > vm / 10) { vmIsTrailingZeros &= vm % 10 == 0; vrIsTrailingZeros &= lastRemovedDigit == 0; - lastRemovedDigit = (uint8_t)(vr % 10); + lastRemovedDigit = static_cast(vr % 10); vr /= 10; vp /= 10; vm /= 10; @@ -763,7 +758,7 @@ __device__ inline floating_decimal_32 f2d(uint32_t const ieeeMantissa, uint32_t if (vmIsTrailingZeros) { while (vm % 10 == 0) { vrIsTrailingZeros &= lastRemovedDigit == 0; - lastRemovedDigit = (uint8_t)(vr % 10); + lastRemovedDigit = static_cast(vr % 10); vr /= 10; vp /= 10; vm /= 10; @@ -781,7 +776,7 @@ __device__ inline floating_decimal_32 f2d(uint32_t const ieeeMantissa, uint32_t // Loop iterations below (approximately): // 0: 13.6%, 1: 70.7%, 2: 14.1%, 3: 1.39%, 4: 0.14%, 5+: 0.01% while (vp / 10 > vm / 10) { - lastRemovedDigit = (uint8_t)(vr % 10); + lastRemovedDigit = static_cast(vr % 10); vr /= 10; vp /= 10; vm /= 10; @@ -805,8 +800,8 @@ __device__ inline int to_chars(floating_decimal_64 const v, bool const sign, cha if (sign) { result[index++] = '-'; } uint64_t output = v.mantissa; - uint32_t const olength = decimalLength17(output); - int32_t exp = v.exponent + (int32_t)olength - 1; + uint32_t const olength = decimal_length(output); + int32_t exp = v.exponent + static_cast(olength) - 1; bool scientificNotation = (exp < -3) || (exp >= 7); // Values in the interval [1E-3, 1E7) are special. @@ -885,8 +880,8 @@ __device__ inline int d2s_size(floating_decimal_64 const v, bool const sign) if (sign) { index++; } uint64_t output = v.mantissa; - uint32_t const olength = decimalLength17(output); - int32_t exp = v.exponent + (int32_t)olength - 1; + uint32_t const olength = decimal_length(output); + int32_t exp = v.exponent + static_cast(olength) - 1; bool scientificNotation = (exp < -3) || (exp >= 7); if (scientificNotation) { @@ -925,7 +920,7 @@ __device__ inline int to_chars(floating_decimal_32 const v, bool const sign, cha if (sign) { result[index++] = '-'; } uint32_t output = v.mantissa; - uint32_t const olength = decimalLength9(output); + uint32_t const olength = decimal_length(output); int32_t exp = v.exponent + olength - 1; bool scientificNotation = (exp < -3) || (exp >= 7); @@ -1000,7 +995,7 @@ __device__ inline int f2s_size(floating_decimal_32 const v, bool const sign) if (sign) { index++; } uint32_t output = v.mantissa; - uint32_t const olength = decimalLength9(output); + uint32_t const olength = decimal_length(output); int32_t exp = v.exponent + olength - 1; bool scientificNotation = (exp < -3) || (exp >= 7); @@ -1036,7 +1031,7 @@ __device__ inline bool d2d_small_int(uint64_t const ieeeMantissa, floating_decimal_64* const v) { uint64_t const m2 = (1ull << DOUBLE_MANTISSA_BITS) | ieeeMantissa; - int32_t const e2 = (int32_t)ieeeExponent - DOUBLE_BIAS - DOUBLE_MANTISSA_BITS; + int32_t const e2 = static_cast(ieeeExponent) - DOUBLE_BIAS - DOUBLE_MANTISSA_BITS; if (e2 > 0) { // f = m2 * 2^e2 >= 2^53 is an integer. @@ -1057,7 +1052,7 @@ __device__ inline bool d2d_small_int(uint64_t const ieeeMantissa, // f is an integer in the range [1, 2^53). // Note: mantissa might contain trailing (decimal) 0's. - // Note: since 2^53 < 10^16, there is no need to adjust decimalLength17(). + // Note: since 2^53 < 10^16, there is no need to adjust decimal_length(). 
v->mantissa = m2 >> -e2; v->exponent = 0; return true; @@ -1072,12 +1067,12 @@ __device__ inline floating_decimal_64 d2d(double f, bool& ieeeSign, bool& specia ieeeSign = ((bits >> (DOUBLE_MANTISSA_BITS + DOUBLE_EXPONENT_BITS)) & 1) != 0; uint64_t const ieeeMantissa = bits & ((1ull << DOUBLE_MANTISSA_BITS) - 1); uint32_t const ieeeExponent = - (uint32_t)((bits >> DOUBLE_MANTISSA_BITS) & ((1u << DOUBLE_EXPONENT_BITS) - 1)); + static_cast((bits >> DOUBLE_MANTISSA_BITS) & ((1u << DOUBLE_EXPONENT_BITS) - 1)); // Case distinction; exit early for the easy cases. if (ieeeExponent == ((1u << DOUBLE_EXPONENT_BITS) - 1u) || (ieeeExponent == 0 && ieeeMantissa == 0)) { special = true; - return floating_decimal_64{ieeeMantissa, (int32_t)ieeeExponent}; + return floating_decimal_64{ieeeMantissa, static_cast(ieeeExponent)}; } special = false; floating_decimal_64 v; @@ -1089,7 +1084,7 @@ __device__ inline floating_decimal_64 d2d(double f, bool& ieeeSign, bool& specia // trailing zeros in to_chars only if needed - once fixed-point notation output is implemented.) for (;;) { uint64_t const q = div10(v.mantissa); - uint32_t const r = ((uint32_t)v.mantissa) - 10 * ((uint32_t)q); + uint32_t const r = (static_cast(v.mantissa)) - 10 * (static_cast(q)); if (r != 0) { break; } v.mantissa = q; ++v.exponent; @@ -1122,7 +1117,7 @@ __device__ inline floating_decimal_32 f2d(float f, bool& ieeeSign, bool& special if (ieeeExponent == ((1u << FLOAT_EXPONENT_BITS) - 1u) || (ieeeExponent == 0 && ieeeMantissa == 0)) { special = true; - return floating_decimal_32{ieeeMantissa, (int32_t)ieeeExponent}; + return floating_decimal_32{ieeeMantissa, static_cast(ieeeExponent)}; } special = false; return f2d(ieeeMantissa, ieeeExponent); @@ -1212,48 +1207,57 @@ __device__ inline T round_half_even(T const input, int const olength, int const return num; } -__device__ inline int to_formated_double_chars(floating_decimal_64 const v, - bool const sign, - char* const result, - int digits) +/* + * Convert a floating_decimal_32/64 to a formatted string as the default format (#,###,###.##) + * of format_number in Spark. + * + * @param v The input floating_decimal_32/64 value + * @param sign Sign of the number + * @param result Output string + * @param digits Number of digits after decimal point + */ +template +__device__ inline int to_formatted_chars(T const v, bool const sign, char* const result, int digits) { + static_assert(std::is_same_v || std::is_same_v); + using U = std::conditional_t, uint32_t, uint64_t>; int index = 0; if (sign) { result[index++] = '-'; } - uint64_t output = v.mantissa; - const uint32_t olength = decimalLength17(output); - int32_t exp = v.exponent + (int32_t)olength - 1; + U output = v.mantissa; + uint32_t const olength = decimal_length(output); + int32_t exp = v.exponent + static_cast(olength) - 1; if (exp < 0) { // Decimal dot is before any of the digits. 
int index_for_carrier = index; result[index++] = '0'; if (digits == 0) { return index; } - result[index++] = '.'; - int actural_round = digits; + result[index++] = '.'; + int actual_round = digits; for (int i = -1; i > exp; i--) { index_for_carrier = index; result[index++] = '0'; - actural_round--; - if (actural_round == 0) { + actual_round--; + if (actual_round == 0) { if (i != exp + 1) { return index; } // else, possible carry break; } } - int actural_olength = fmin(int(olength), actural_round); - uint64_t rounded_output = round_half_even(output, olength, actural_round); + int actual_olength = fmin(int(olength), actual_round); + U rounded_output = round_half_even(output, olength, actual_round); // check if carry - if (rounded_output >= POW10_TABLE[actural_olength]) { + if (rounded_output >= POW10_TABLE[actual_olength]) { result[index_for_carrier] = '1'; - rounded_output -= POW10_TABLE[actural_olength]; + rounded_output -= POW10_TABLE[actual_olength]; } int current = index; - for (int i = 0; i < actural_olength; i++) { - result[current + actural_olength - i - 1] = (char)('0' + rounded_output % 10); + for (int i = 0; i < actual_olength; i++) { + result[current + actual_olength - i - 1] = (char)('0' + rounded_output % 10); rounded_output /= 10; index++; } - actural_round -= actural_olength; - if (actural_round > 0) { - for (int i = 0; i < actural_round; i++) { + actual_round -= actual_olength; + if (actual_round > 0) { + for (int i = 0; i < actual_round; i++) { result[index++] = '0'; } } @@ -1292,12 +1296,12 @@ __device__ inline int to_formated_double_chars(floating_decimal_64 const v, temp_d = olength - exp - 1; tailing_zero = digits - temp_d; } - uint64_t rounded_output = round_half_even(output, olength, exp + temp_d + 1); - uint64_t pow10 = POW10_TABLE[temp_d]; - uint64_t integer = rounded_output / pow10; - uint64_t decimal = rounded_output % pow10; + U rounded_output = round_half_even(output, olength, exp + temp_d + 1); + U pow10 = POW10_TABLE[temp_d]; + U integer = rounded_output / pow10; + U decimal = rounded_output % pow10; // calculate integer length after format to cover carry case - uint32_t integer_len = decimalLength17(integer); + uint32_t integer_len = decimal_length(integer); uint32_t formated_integer_len = index + integer_len + (integer_len - 1) / 3; uint32_t sep_cnt = 0; int rev_index = 0; @@ -1327,151 +1331,16 @@ __device__ inline int to_formated_double_chars(floating_decimal_64 const v, return index; } -__device__ inline int format_double_size(floating_decimal_64 const v, bool const sign, int digits) -{ - int index = 0; - if (sign) { index++; } - uint64_t output = v.mantissa; - const uint32_t olength = decimalLength17(output); - int32_t exp = v.exponent + (int32_t)olength - 1; - if (exp < 0) { - index += 2 + digits; - } else if (exp + 1 >= olength) { - index += exp + 1 + exp / 3 + 1 + digits; - } else { - uint32_t temp_d = digits; - if (exp + digits + 1 > olength) { temp_d = olength - exp - 1; } - uint64_t rounded_output = round_half_even(output, olength, exp + temp_d + 1); - uint64_t pow10 = POW10_TABLE[temp_d]; - uint64_t integer = rounded_output / pow10; - uint32_t integer_len = decimalLength17(integer); - index += integer_len + (integer_len - 1) / 3 + 1 + digits; - } - if (digits == 0) { index--; } - return index; -} - -__device__ inline int to_formated_float_chars(floating_decimal_32 const v, - bool const sign, - char* const result, - int digits) -{ - int index = 0; - if (sign) { result[index++] = '-'; } - uint32_t output = v.mantissa; - uint32_t const olength = 
decimalLength9(output); - int32_t exp = v.exponent + (int32_t)olength - 1; - if (exp < 0) { - // Decimal dot is before any of the digits. - int index_for_carrier = index; - result[index++] = '0'; - if (digits == 0) { return index; } - result[index++] = '.'; - int actural_round = digits; - for (int i = -1; i > exp; i--) { - index_for_carrier = index; - result[index++] = '0'; - actural_round--; - if (actural_round == 0) { - if (i != exp + 1) { return index; } // else, possible carry - break; - } - } - int actural_olength = fmin(int(olength), actural_round); - uint64_t rounded_output = round_half_even(output, olength, actural_round); - // check if carry - if (rounded_output >= POW10_TABLE[actural_olength]) { - result[index_for_carrier] = '1'; - rounded_output -= POW10_TABLE[actural_olength]; - } - int current = index; - for (int i = 0; i < actural_olength; i++) { - result[current + actural_olength - i - 1] = (char)('0' + rounded_output % 10); - rounded_output /= 10; - index++; - } - actural_round -= actural_olength; - if (actural_round > 0) { - for (int i = 0; i < actural_round; i++) { - result[index++] = '0'; - } - } - } else if (exp + 1 >= olength) { - // Decimal dot is after any of the digits. - int integer_len = index + exp + 1 + exp / 3; - int sep_cnt = 0; - int rev_index = 0; - for (int i = olength; i < exp + 1; i++) { - result[integer_len - (rev_index++) - 1] = '0'; - sep_cnt++; - if (sep_cnt == 3) { - result[integer_len - (rev_index++) - 1] = ','; - sep_cnt = 0; - } - } - for (int i = 0; i < olength; i++) { - if (sep_cnt == 3) { - result[integer_len - (rev_index++) - 1] = ','; - sep_cnt = 0; - } - result[integer_len - (rev_index++) - 1] = (char)('0' + output % 10); - sep_cnt++; - output /= 10; - } - index = integer_len; - if (digits == 0) { return index; } - result[index++] = '.'; - for (int i = 0; i < digits; i++) { - result[index++] = '0'; - } - } else { - uint32_t temp_d = digits, tailing_zero = 0; - if (exp + digits + 1 > olength) { - temp_d = olength - exp - 1; - tailing_zero = digits - temp_d; - } - uint32_t rounded_output = round_half_even(output, olength, exp + temp_d + 1); - uint32_t pow10 = POW10_TABLE[temp_d]; - uint32_t integer = rounded_output / pow10; - uint32_t decimal = rounded_output % pow10; - // calculate integer length after format to cover carry case - uint32_t integer_len = decimalLength9(integer); - uint32_t formated_integer_len = index + integer_len + (integer_len - 1) / 3; - uint32_t sep_cnt = 0; - int rev_index = 0; - for (int i = 0; i < integer_len; i++) { - if (sep_cnt == 3) { - result[formated_integer_len - (rev_index++) - 1] = ','; - sep_cnt = 0; - } - result[formated_integer_len - (rev_index++) - 1] = (char)('0' + integer % 10); - sep_cnt++; - integer /= 10; - } - index = formated_integer_len; - if (digits == 0) { return index; } - result[index++] = '.'; - int current = index; - for (int i = 0; i < tailing_zero; i++) { - result[current + digits - i - 1] = '0'; - index++; - } - for (int i = tailing_zero; i < digits; i++) { - result[current + digits - i - 1] = (char)('0' + decimal % 10); - decimal /= 10; - index++; - } - } - return index; -} - -__device__ inline int format_float_size(floating_decimal_32 const v, bool const sign, int digits) +template +__device__ inline int format_size(T const v, bool const sign, int digits) { + static_assert(std::is_same_v || std::is_same_v); + using U = std::conditional_t, uint32_t, uint64_t>; int index = 0; if (sign) { index++; } - uint64_t output = v.mantissa; - uint32_t const olength = decimalLength9(output); - int32_t 
exp = v.exponent + (int32_t)olength - 1; + U output = v.mantissa; + uint32_t const olength = decimal_length(output); + int32_t exp = v.exponent + static_cast(olength) - 1; if (exp < 0) { index += 2 + digits; } else if (exp + 1 >= olength) { @@ -1479,10 +1348,10 @@ __device__ inline int format_float_size(floating_decimal_32 const v, bool const } else { uint32_t temp_d = digits; if (exp + digits + 1 > olength) { temp_d = olength - exp - 1; } - uint64_t rounded_output = round_half_even(output, olength, exp + temp_d + 1); - uint64_t pow10 = POW10_TABLE[temp_d]; - uint64_t integer = rounded_output / pow10; - uint32_t integer_len = decimalLength9(integer); + U rounded_output = round_half_even(output, olength, exp + temp_d + 1); + U pow10 = POW10_TABLE[temp_d]; + U integer = rounded_output / pow10; + uint32_t integer_len = decimal_length(integer); index += integer_len + (integer_len - 1) / 3 + 1 + digits; } if (digits == 0) { index--; } @@ -1533,11 +1402,11 @@ __device__ inline int compute_format_float_size(double value, int digits, bool i if (is_float) { floating_decimal_32 v = f2d(value, sign, special); if (special) { return special_format_str_size(sign, v.exponent, v.mantissa, digits); } - return format_float_size(v, sign, digits); + return format_size(v, sign, digits); } else { floating_decimal_64 v = d2d(value, sign, special); if (special) { return special_format_str_size(sign, v.exponent, v.mantissa, digits); } - return format_double_size(v, sign, digits); + return format_size(v, sign, digits); } } @@ -1547,11 +1416,11 @@ __device__ inline int format_float(double value, int digits, bool is_float, char if (is_float) { floating_decimal_32 v = f2d(value, sign, special); if (special) { return copy_format_special_str(output, sign, v.exponent, v.mantissa, digits); } - return to_formated_float_chars(v, sign, output, digits); + return to_formatted_chars(v, sign, output, digits); } else { floating_decimal_64 v = d2d(value, sign, special); if (special) { return copy_format_special_str(output, sign, v.exponent, v.mantissa, digits); } - return to_formated_double_chars(v, sign, output, digits); + return to_formatted_chars(v, sign, output, digits); } } From f42872a4ad3443605030f7bc8f07f7aa9f7451da Mon Sep 17 00:00:00 2001 From: Tim Liu Date: Tue, 9 Jan 2024 10:06:09 +0800 Subject: [PATCH 084/127] Download boost package for CI jobs (#1672) * Download boost package for CI jobs To fix: https://github.com/NVIDIA/spark-rapids-jni/issues/1671 The old Boost package is not available to build CI Docker container. [boost_1_79_0.tar.gz] (https://github.com/NVIDIA/spark-rapids-jni/blob/branch-24.02/ci/Dockerfile#L64) not available. Update the correct Boost linkage for CI scripts, to PASS the spark-rapids-jni nightly build/test CI jobs. Signed-off-by: Tim Liu * Update copyright Signed-off-by: Tim Liu --------- Signed-off-by: Tim Liu --- ci/Dockerfile | 4 ++-- ci/Dockerfile.multi | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/ci/Dockerfile b/ci/Dockerfile index 7d59fef5f5..e3b703a11e 100755 --- a/ci/Dockerfile +++ b/ci/Dockerfile @@ -1,5 +1,5 @@ # -# Copyright (c) 2022-2023, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2022-2024, NVIDIA CORPORATION. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
@@ -61,7 +61,7 @@ RUN cd /tmp && wget --quiet https://github.com/ccache/ccache/releases/download/v rm -rf ccache-${CCACHE_VERSION} ## install a version of boost that is needed for arrow/parquet to work -RUN cd /usr/local && wget --quiet https://boostorg.jfrog.io/artifactory/main/release/1.79.0/source/boost_1_79_0.tar.gz && \ +RUN cd /usr/local && wget --quiet https://archives.boost.io/release/1.79.0/source/boost_1_79_0.tar.gz && \ tar -xzf boost_1_79_0.tar.gz && \ rm boost_1_79_0.tar.gz && \ cd boost_1_79_0 && \ diff --git a/ci/Dockerfile.multi b/ci/Dockerfile.multi index 720c9bc4df..d3b198530b 100644 --- a/ci/Dockerfile.multi +++ b/ci/Dockerfile.multi @@ -1,5 +1,5 @@ # -# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2023-2024, NVIDIA CORPORATION. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -63,7 +63,7 @@ RUN cd /tmp && wget --quiet https://github.com/ccache/ccache/releases/download/v rm -rf ccache-${CCACHE_VERSION} ## install a version of boost that is needed for arrow/parquet to work -RUN cd /usr/local && wget --quiet https://boostorg.jfrog.io/artifactory/main/release/1.79.0/source/boost_1_79_0.tar.gz && \ +RUN cd /usr/local && wget --quiet https://archives.boost.io/release/1.79.0/source/boost_1_79_0.tar.gz && \ tar -xzf boost_1_79_0.tar.gz && \ rm boost_1_79_0.tar.gz && \ cd boost_1_79_0 && \ From 4a27fdb7e57a405d5f48a023eb529d0e8dad5594 Mon Sep 17 00:00:00 2001 From: Jenkins Automation <70000568+nvauto@users.noreply.github.com> Date: Tue, 9 Jan 2024 11:31:18 +0800 Subject: [PATCH 085/127] Update submodule cudf to 3a1601d61437b339c47a015dab7a830998b182f9 (#1685) Signed-off-by: spark-rapids automation <70000568+nvauto@users.noreply.github.com> --- thirdparty/cudf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/thirdparty/cudf b/thirdparty/cudf index ba7550a17f..3a1601d614 160000 --- a/thirdparty/cudf +++ b/thirdparty/cudf @@ -1 +1 @@ -Subproject commit ba7550a17f57d17d6d6decec3b2f8a0a5f687aa8 +Subproject commit 3a1601d61437b339c47a015dab7a830998b182f9 From f6972922986eb5f0f23b3456fb3e8a39349709c3 Mon Sep 17 00:00:00 2001 From: Jenkins Automation <70000568+nvauto@users.noreply.github.com> Date: Wed, 10 Jan 2024 11:30:59 +0800 Subject: [PATCH 086/127] Update submodule cudf to 6a23775db29dc4b38820994297c94201c9287aaf (#1688) Signed-off-by: spark-rapids automation <70000568+nvauto@users.noreply.github.com> --- thirdparty/cudf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/thirdparty/cudf b/thirdparty/cudf index 3a1601d614..6a23775db2 160000 --- a/thirdparty/cudf +++ b/thirdparty/cudf @@ -1 +1 @@ -Subproject commit 3a1601d61437b339c47a015dab7a830998b182f9 +Subproject commit 6a23775db29dc4b38820994297c94201c9287aaf From 035dbd669d428a19c2b57b30e2ba3cb8064e909a Mon Sep 17 00:00:00 2001 From: Jenkins Automation <70000568+nvauto@users.noreply.github.com> Date: Thu, 11 Jan 2024 06:02:54 +0800 Subject: [PATCH 087/127] Update submodule cudf to fa37e13db360e0b685bc6af020aa7510f1fbbdbd (#1691) Signed-off-by: spark-rapids automation <70000568+nvauto@users.noreply.github.com> --- thirdparty/cudf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/thirdparty/cudf b/thirdparty/cudf index 6a23775db2..fa37e13db3 160000 --- a/thirdparty/cudf +++ b/thirdparty/cudf @@ -1 +1 @@ -Subproject commit 6a23775db29dc4b38820994297c94201c9287aaf +Subproject commit fa37e13db360e0b685bc6af020aa7510f1fbbdbd From 
681391435d43ffb82dfafed4bd433e4d38aa1c5b Mon Sep 17 00:00:00 2001 From: Jenkins Automation <70000568+nvauto@users.noreply.github.com> Date: Thu, 11 Jan 2024 11:25:10 +0800 Subject: [PATCH 088/127] Update submodule cudf to 1078326535c9989a2e904d78ceb708a097be989b (#1693) Signed-off-by: spark-rapids automation <70000568+nvauto@users.noreply.github.com> --- thirdparty/cudf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/thirdparty/cudf b/thirdparty/cudf index fa37e13db3..1078326535 160000 --- a/thirdparty/cudf +++ b/thirdparty/cudf @@ -1 +1 @@ -Subproject commit fa37e13db360e0b685bc6af020aa7510f1fbbdbd +Subproject commit 1078326535c9989a2e904d78ceb708a097be989b From ed7c814f14bd1f2dba6351c66da0e220cde2ea79 Mon Sep 17 00:00:00 2001 From: Jason Lowe Date: Thu, 11 Jan 2024 10:29:38 -0600 Subject: [PATCH 089/127] Add explicit mention of git submodule use at top of CONTRIBUTING.md (#1690) Signed-off-by: Jason Lowe --- CONTRIBUTING.md | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index e8f64a9ae9..5fb76d548e 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -28,6 +28,17 @@ There are two types of branches in this repository: is held here. `main` will change with new releases, but otherwise it should not change with every pull request merged, making it a more stable branch. +## Git Submodules + +This repository uses Git submodules. After cloning this repository or moving to a new commit +in this repository you will need to ensure the submodules are initialized and updated to the +expected submodule commits. This can be done by executing the following command at the top of +the repository: + +```commandline +git submodule update --init --recursive +``` + ## Building From Source [Maven](https://maven.apache.org) is used for most aspects of the build. 
For example, the From dd00ca2d9304388e668dba8429f05b076dbbf7b0 Mon Sep 17 00:00:00 2001 From: Jenkins Automation <70000568+nvauto@users.noreply.github.com> Date: Fri, 12 Jan 2024 05:26:18 +0800 Subject: [PATCH 090/127] Update submodule cudf to 85acdc640701940e47b3969b14a811f33e7faf5b (#1694) Signed-off-by: spark-rapids automation <70000568+nvauto@users.noreply.github.com> --- thirdparty/cudf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/thirdparty/cudf b/thirdparty/cudf index 1078326535..85acdc6407 160000 --- a/thirdparty/cudf +++ b/thirdparty/cudf @@ -1 +1 @@ -Subproject commit 1078326535c9989a2e904d78ceb708a097be989b +Subproject commit 85acdc640701940e47b3969b14a811f33e7faf5b From a525bafbdac7bbcc4d9e49bcd6b2aa4a7c70a3dc Mon Sep 17 00:00:00 2001 From: Jenkins Automation <70000568+nvauto@users.noreply.github.com> Date: Fri, 12 Jan 2024 12:06:39 +0800 Subject: [PATCH 091/127] Update submodule cudf to 7a42b8b57923b9515391cfe2c4668380b15ed118 (#1695) Signed-off-by: spark-rapids automation <70000568+nvauto@users.noreply.github.com> --- thirdparty/cudf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/thirdparty/cudf b/thirdparty/cudf index 85acdc6407..7a42b8b579 160000 --- a/thirdparty/cudf +++ b/thirdparty/cudf @@ -1 +1 @@ -Subproject commit 85acdc640701940e47b3969b14a811f33e7faf5b +Subproject commit 7a42b8b57923b9515391cfe2c4668380b15ed118 From 3187eebd684a6efa5924f5667791a96d6d1f4012 Mon Sep 17 00:00:00 2001 From: Jenkins Automation <70000568+nvauto@users.noreply.github.com> Date: Fri, 12 Jan 2024 17:25:48 +0800 Subject: [PATCH 092/127] Update submodule cudf to 27b106f832999afa5b3353aaa2adcdb695fb4a47 (#1696) Signed-off-by: spark-rapids automation <70000568+nvauto@users.noreply.github.com> --- thirdparty/cudf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/thirdparty/cudf b/thirdparty/cudf index 7a42b8b579..27b106f832 160000 --- a/thirdparty/cudf +++ b/thirdparty/cudf @@ -1 +1 @@ -Subproject commit 7a42b8b57923b9515391cfe2c4668380b15ed118 +Subproject commit 27b106f832999afa5b3353aaa2adcdb695fb4a47 From 922bca294d8ebd52560197d3c4d6fc8c6ae5e305 Mon Sep 17 00:00:00 2001 From: Jenkins Automation <70000568+nvauto@users.noreply.github.com> Date: Fri, 12 Jan 2024 21:30:37 +0800 Subject: [PATCH 093/127] Update submodule cudf to 5c78b7ea6b75f503d5df4abc828d80a0b470a284 (#1697) Signed-off-by: spark-rapids automation <70000568+nvauto@users.noreply.github.com> --- thirdparty/cudf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/thirdparty/cudf b/thirdparty/cudf index 27b106f832..5c78b7ea6b 160000 --- a/thirdparty/cudf +++ b/thirdparty/cudf @@ -1 +1 @@ -Subproject commit 27b106f832999afa5b3353aaa2adcdb695fb4a47 +Subproject commit 5c78b7ea6b75f503d5df4abc828d80a0b470a284 From d9d87a3464d343ef72751c8fec9d49d3cdeef9d4 Mon Sep 17 00:00:00 2001 From: Jenkins Automation <70000568+nvauto@users.noreply.github.com> Date: Sat, 13 Jan 2024 05:25:08 +0800 Subject: [PATCH 094/127] Update submodule cudf to 7ca988f207730a3ae936e90d0104c4e6a14749ff (#1698) Signed-off-by: spark-rapids automation <70000568+nvauto@users.noreply.github.com> --- thirdparty/cudf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/thirdparty/cudf b/thirdparty/cudf index 5c78b7ea6b..7ca988f207 160000 --- a/thirdparty/cudf +++ b/thirdparty/cudf @@ -1 +1 @@ -Subproject commit 5c78b7ea6b75f503d5df4abc828d80a0b470a284 +Subproject commit 7ca988f207730a3ae936e90d0104c4e6a14749ff From e264d32b3acfb5005b7e0240d704321b879f71a1 Mon Sep 17 00:00:00 2001 From: Jenkins 
Automation <70000568+nvauto@users.noreply.github.com> Date: Mon, 15 Jan 2024 21:29:37 +0800 Subject: [PATCH 095/127] Update submodule cudf to 07103355fea0fb3fd0e1115019bbac7d65bb132f (#1699) Signed-off-by: spark-rapids automation <70000568+nvauto@users.noreply.github.com> --- thirdparty/cudf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/thirdparty/cudf b/thirdparty/cudf index 7ca988f207..07103355fe 160000 --- a/thirdparty/cudf +++ b/thirdparty/cudf @@ -1 +1 @@ -Subproject commit 7ca988f207730a3ae936e90d0104c4e6a14749ff +Subproject commit 07103355fea0fb3fd0e1115019bbac7d65bb132f From e5c9657b2e216ff0d63b297c27710a5439544b7f Mon Sep 17 00:00:00 2001 From: Jenkins Automation <70000568+nvauto@users.noreply.github.com> Date: Tue, 16 Jan 2024 23:29:56 +0800 Subject: [PATCH 096/127] Update submodule cudf to 726a7f30757d1a06d74d86bb82cf311cb159f7fd (#1701) Signed-off-by: spark-rapids automation <70000568+nvauto@users.noreply.github.com> --- thirdparty/cudf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/thirdparty/cudf b/thirdparty/cudf index 07103355fe..726a7f3075 160000 --- a/thirdparty/cudf +++ b/thirdparty/cudf @@ -1 +1 @@ -Subproject commit 07103355fea0fb3fd0e1115019bbac7d65bb132f +Subproject commit 726a7f30757d1a06d74d86bb82cf311cb159f7fd From c3be4f471623c6e77bf5ec50936c8a069d344c44 Mon Sep 17 00:00:00 2001 From: Chong Gao Date: Wed, 17 Jan 2024 10:28:28 +0800 Subject: [PATCH 097/127] Fix memory leak in time zone DB (#1689) * Fix memory leak in time zone DB Signed-off-by: Chong Gao * Fix bug * Address comments * Refine the sync * Fix compile error --------- Signed-off-by: Chong Gao Co-authored-by: Chong Gao --- .../spark/rapids/jni/GpuTimeZoneDB.java | 417 ++++++++++-------- 1 file changed, 236 insertions(+), 181 deletions(-) diff --git a/src/main/java/com/nvidia/spark/rapids/jni/GpuTimeZoneDB.java b/src/main/java/com/nvidia/spark/rapids/jni/GpuTimeZoneDB.java index b63a9dc282..643db278df 100644 --- a/src/main/java/com/nvidia/spark/rapids/jni/GpuTimeZoneDB.java +++ b/src/main/java/com/nvidia/spark/rapids/jni/GpuTimeZoneDB.java @@ -1,5 +1,5 @@ /* -* Copyright (c) 2023, NVIDIA CORPORATION. +* Copyright (c) 2023-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -16,6 +16,13 @@ package com.nvidia.spark.rapids.jni; +import ai.rapids.cudf.ColumnVector; +import ai.rapids.cudf.DType; +import ai.rapids.cudf.HostColumnVector; +import ai.rapids.cudf.Table; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + import java.time.Instant; import java.time.ZoneId; import java.time.zone.ZoneOffsetTransition; @@ -26,72 +33,171 @@ import java.util.List; import java.util.Map; import java.util.TimeZone; -import java.util.concurrent.*; - -import ai.rapids.cudf.ColumnVector; -import ai.rapids.cudf.DType; -import ai.rapids.cudf.HostColumnVector; -import ai.rapids.cudf.Table; +import java.util.concurrent.Executors; +/** + * Gpu time zone utility. + * Provides two kinds of APIs + * - Time zone transitions cache APIs + * `cacheDatabaseAsync`, `cacheDatabase` and `shutdown` are synchronized. + * When cacheDatabaseAsync is running, the `shutdown` and `cacheDatabase` will wait; + * These APIs guarantee only one thread is loading transitions cache, + * And guarantee loading cache only occurs one time. + * - Rebase time zone APIs + * fromTimestampToUtcTimestamp, fromUtcTimestampToTimestamp ... 
+ */ public class GpuTimeZoneDB { - - public static final int TIMEOUT_SECS = 300; - + private static final Logger log = LoggerFactory.getLogger(GpuTimeZoneDB.class); // For the timezone database, we store the transitions in a ColumnVector that is a list of // structs. The type of this column vector is: // LIST> - private CompletableFuture> zoneIdToTableFuture; - private CompletableFuture fixedTransitionsFuture; + private Map zoneIdToTable; - private boolean closed = false; + // use this reference to indicate if time zone cache is initialized. + private HostColumnVector fixedTransitions; - GpuTimeZoneDB() { - zoneIdToTableFuture = new CompletableFuture<>(); - fixedTransitionsFuture = new CompletableFuture<>(); + // Guarantee singleton instance + private GpuTimeZoneDB() { } - - private static GpuTimeZoneDB instance = new GpuTimeZoneDB(); - // This method is default visibility for testing purposes only. The instance will be never be exposed publicly - // for this class. + + // singleton instance + private static final GpuTimeZoneDB instance = new GpuTimeZoneDB(); + + // This method is default visibility for testing purposes only. + // The instance will be never be exposed publicly for this class. static GpuTimeZoneDB getInstance() { return instance; } - + + static class LoadingLock { + Boolean isLoading = false; + + // record whether a shutdown is called ever. + // if `isCloseCalledEver` is true, then the following loading should be skipped. + Boolean isShutdownCalledEver = false; + } + + private static final LoadingLock lock = new LoadingLock(); + /** - * Start to cache the database. This should be called on startup of an executor. It should start - * to cache the data on the CPU in a background thread. It should return immediately and allow the - * other APIs to be called. Depending on what we want to do we can have the other APIs block - * until this is done caching, or we can have private APIs that would let us load and use specific - * parts of the database. I prefer the former solution at least until we see a performance hit - * where we are waiting on the database to finish loading. + * This should be called on startup of an executor. + * Runs in a thread asynchronously. + * If `shutdown` was called ever, then will not load the cache */ - public static void cacheDatabase() { - synchronized (instance) { - if (!instance.isLoaded()) { - Executor executor = Executors.newSingleThreadExecutor( - new ThreadFactory() { - private ThreadFactory defaultFactory = Executors.defaultThreadFactory(); - - @Override - public Thread newThread(Runnable r) { - Thread thread = defaultFactory.newThread(r); - thread.setName("gpu-timezone-database-0"); - thread.setDaemon(true); - return thread; - } - }); - instance.loadData(executor); + public static void cacheDatabaseAsync() { + synchronized (lock) { + if (lock.isShutdownCalledEver) { + // shutdown was called ever, will never load cache again. + return; + } + + if (lock.isLoading) { + // another thread is loading(), return + return; + } else { + lock.isLoading = true; } } + + // start a new thread to load + Runnable runnable = () -> { + try { + instance.cacheDatabaseImpl(); + } catch (Exception e) { + log.error("cache time zone transitions cache failed", e); + } finally { + synchronized (lock) { + // now loading is done + lock.isLoading = false; + // `cacheDatabase` and `shutdown` may wait loading is done. 
+          lock.notify();
+        }
+      }
+    };
+    Thread thread = Executors.defaultThreadFactory().newThread(runnable);
+    thread.setName("gpu-timezone-database-0");
+    thread.setDaemon(true);
+    thread.start();
   }
+  /**
+   * Cache the database. This can take several seconds.
+   * If one `cacheDatabase` call is running, other `cacheDatabase` calls will wait until caching is done.
+   * If the cache already exists, it is not loaded again.
+   */
+  public static void cacheDatabase() {
+    synchronized (lock) {
+      if (lock.isLoading) {
+        // another thread is loading; wait until loading is done
+        while (lock.isLoading) {
+          try {
+            lock.wait();
+          } catch (InterruptedException e) {
+            throw new IllegalStateException("caching time zone transitions failed", e);
+          }
+        }
+        return;
+      } else {
+        lock.isLoading = true;
+      }
+    }
+    try {
+      instance.cacheDatabaseImpl();
+    } finally {
+      // loading is done.
+      synchronized (lock) {
+        lock.isLoading = false;
+        // `cacheDatabase` and/or `shutdown` may be waiting until loading is done.
+        lock.notify();
+      }
+    }
+  }
+
+  /**
+   * Close the cache; used when the plugin is closing.
+   */
   public static void shutdown() {
-    if (instance.isLoaded()) {
-      instance.close();
-      // Recreate a new instance to reload the database if necessary
-      instance = new GpuTimeZoneDB();
+    synchronized (lock) {
+      lock.isShutdownCalledEver = true;
+      while (lock.isLoading) {
+        // wait until loading is done
+        try {
+          lock.wait();
+        } catch (InterruptedException e) {
+          throw new IllegalStateException("shutdown of the time zone transitions cache failed", e);
+        }
+      }
+      instance.shutdownImpl();
+      // `cacheDatabase` and/or `shutdown` may be waiting until loading is done.
+      lock.notify();
+    }
+  }
+
+  private void cacheDatabaseImpl() {
+    if (fixedTransitions == null) {
+      try {
+        loadData();
+      } catch (Exception e) {
+        closeResources();
+        throw e;
+      }
+    }
+  }
+
+  private void shutdownImpl() {
+    closeResources();
+  }
+
+  private void closeResources() {
+    if (zoneIdToTable != null) {
+      zoneIdToTable.clear();
+      zoneIdToTable = null;
+    }
+    if (fixedTransitions != null) {
+      fixedTransitions.close();
+      fixedTransitions = null;
     }
   }

@@ -102,15 +208,12 @@ public static ColumnVector fromTimestampToUtcTimestamp(ColumnVector input, ZoneI
       throw new IllegalArgumentException(String.format("Unsupported timezone: %s",
         currentTimeZone.toString()));
     }
-    if (!instance.isLoaded()) {
-      cacheDatabase(); // lazy load the database
-    }
+    cacheDatabase();
    Integer tzIndex = instance.getZoneIDMap().get(currentTimeZone.normalized().toString());
-    Table transitions = instance.getTransitions();
-    ColumnVector result = new ColumnVector(convertTimestampColumnToUTC(input.getNativeView(),
-      transitions.getNativeView(), tzIndex));
-    transitions.close();
-    return result;
+    try (Table transitions = instance.getTransitions()) {
+      return new ColumnVector(convertTimestampColumnToUTC(input.getNativeView(),
+        transitions.getNativeView(), tzIndex));
+    }
   }

   public static ColumnVector fromUtcTimestampToTimestamp(ColumnVector input, ZoneId desiredTimeZone) {
@@ -120,15 +223,12 @@ public static ColumnVector fromUtcTimestampToTimestamp(ColumnVector input, ZoneI
       throw new IllegalArgumentException(String.format("Unsupported timezone: %s",
        desiredTimeZone.toString()));
     }
-    if (!instance.isLoaded()) {
-      cacheDatabase(); // lazy load the database
-    }
+    cacheDatabase();
    Integer tzIndex = instance.getZoneIDMap().get(desiredTimeZone.normalized().toString());
-    Table transitions = instance.getTransitions();
-    ColumnVector result = new ColumnVector(convertUTCTimestampColumnToTimeZone(input.getNativeView(),
-
transitions.getNativeView(), tzIndex)); - transitions.close(); - return result; + try (Table transitions = instance.getTransitions()) { + return new ColumnVector(convertUTCTimestampColumnToTimeZone(input.getNativeView(), + transitions.getNativeView(), tzIndex)); + } } // TODO: Deprecate this API when we support all timezones @@ -157,128 +257,85 @@ public static ZoneId getZoneId(String timeZoneId) { return ZoneId.of(formattedZoneId, ZoneId.SHORT_IDS); } - private boolean isLoaded() { - return zoneIdToTableFuture.isDone(); - } - - private void loadData(Executor executor) throws IllegalStateException { - // Start loading the data in separate thread and return - try { - executor.execute(this::doLoadData); - } catch (RejectedExecutionException e) { - throw new IllegalStateException(e); - } - } - @SuppressWarnings("unchecked") - private void doLoadData() { - synchronized (this) { - try { - Map zoneIdToTable = new HashMap<>(); - List> masterTransitions = new ArrayList<>(); - for (String tzId : TimeZone.getAvailableIDs()) { - ZoneId zoneId; - try { - zoneId = ZoneId.of(tzId).normalized(); // we use the normalized form to dedupe - } catch (ZoneRulesException e) { - // Sometimes the list of getAvailableIDs() is one of the 3-letter abbreviations, however, - // this use is deprecated due to ambiguity reasons (same abbrevation can be used for - // multiple time zones). These are not supported by ZoneId.of(...) directly here. - continue; - } - ZoneRules zoneRules = zoneId.getRules(); - // Filter by non-repeating rules - if (!zoneRules.isFixedOffset() && !zoneRules.getTransitionRules().isEmpty()) { - continue; - } - if (!zoneIdToTable.containsKey(zoneId.getId())) { - List transitions = zoneRules.getTransitions(); - int idx = masterTransitions.size(); - List data = new ArrayList<>(); - if (zoneRules.isFixedOffset()) { - data.add( - new HostColumnVector.StructData(Long.MIN_VALUE, Long.MIN_VALUE, - zoneRules.getOffset(Instant.now()).getTotalSeconds()) - ); - } else { - // Capture the first official offset (before any transition) using Long min - ZoneOffsetTransition first = transitions.get(0); - data.add( - new HostColumnVector.StructData(Long.MIN_VALUE, Long.MIN_VALUE, - first.getOffsetBefore().getTotalSeconds()) - ); - transitions.forEach(t -> { - // Whether transition is an overlap vs gap. 
- // In Spark: - // if it's a gap, then we use the offset after *on* the instant - // If it's an overlap, then there are 2 sets of valid timestamps in that are overlapping - // So, for the transition to UTC, you need to compare to instant + {offset before} - // The time math still uses {offset after} - if (t.isGap()) { - data.add( - new HostColumnVector.StructData( - t.getInstant().getEpochSecond(), - t.getInstant().getEpochSecond() + t.getOffsetAfter().getTotalSeconds(), - t.getOffsetAfter().getTotalSeconds()) - ); - } else { - data.add( - new HostColumnVector.StructData( - t.getInstant().getEpochSecond(), - t.getInstant().getEpochSecond() + t.getOffsetBefore().getTotalSeconds(), - t.getOffsetAfter().getTotalSeconds()) - ); - } - }); - } - masterTransitions.add(data); - zoneIdToTable.put(zoneId.getId(), idx); + private void loadData() { + try { + List> masterTransitions = new ArrayList<>(); + zoneIdToTable = new HashMap<>(); + for (String tzId : TimeZone.getAvailableIDs()) { + ZoneId zoneId; + try { + zoneId = ZoneId.of(tzId).normalized(); // we use the normalized form to dedupe + } catch (ZoneRulesException e) { + // Sometimes the list of getAvailableIDs() is one of the 3-letter abbreviations, however, + // this use is deprecated due to ambiguity reasons (same abbrevation can be used for + // multiple time zones). These are not supported by ZoneId.of(...) directly here. + continue; + } + ZoneRules zoneRules = zoneId.getRules(); + // Filter by non-repeating rules + if (!zoneRules.isFixedOffset() && !zoneRules.getTransitionRules().isEmpty()) { + continue; + } + if (!zoneIdToTable.containsKey(zoneId.getId())) { + List transitions = zoneRules.getTransitions(); + int idx = masterTransitions.size(); + List data = new ArrayList<>(); + if (zoneRules.isFixedOffset()) { + data.add( + new HostColumnVector.StructData(Long.MIN_VALUE, Long.MIN_VALUE, + zoneRules.getOffset(Instant.now()).getTotalSeconds()) + ); + } else { + // Capture the first official offset (before any transition) using Long min + ZoneOffsetTransition first = transitions.get(0); + data.add( + new HostColumnVector.StructData(Long.MIN_VALUE, Long.MIN_VALUE, + first.getOffsetBefore().getTotalSeconds()) + ); + transitions.forEach(t -> { + // Whether transition is an overlap vs gap. 
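+          // (A gap means the clocks jumped forward, so some local times never exist;
+          // an overlap means the clocks fell back, so some local times occur twice.)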
+ // In Spark: + // if it's a gap, then we use the offset after *on* the instant + // If it's an overlap, then there are 2 sets of valid timestamps in that are overlapping + // So, for the transition to UTC, you need to compare to instant + {offset before} + // The time math still uses {offset after} + if (t.isGap()) { + data.add( + new HostColumnVector.StructData( + t.getInstant().getEpochSecond(), + t.getInstant().getEpochSecond() + t.getOffsetAfter().getTotalSeconds(), + t.getOffsetAfter().getTotalSeconds()) + ); + } else { + data.add( + new HostColumnVector.StructData( + t.getInstant().getEpochSecond(), + t.getInstant().getEpochSecond() + t.getOffsetBefore().getTotalSeconds(), + t.getOffsetAfter().getTotalSeconds()) + ); + } + }); } + masterTransitions.add(data); + zoneIdToTable.put(zoneId.getId(), idx); } - HostColumnVector.DataType childType = new HostColumnVector.StructType(false, - new HostColumnVector.BasicType(false, DType.INT64), - new HostColumnVector.BasicType(false, DType.INT64), - new HostColumnVector.BasicType(false, DType.INT32)); - HostColumnVector.DataType resultType = - new HostColumnVector.ListType(false, childType); - HostColumnVector fixedTransitions = HostColumnVector.fromLists(resultType, - masterTransitions.toArray(new List[0])); - fixedTransitionsFuture.complete(fixedTransitions); - zoneIdToTableFuture.complete(zoneIdToTable); - } catch (Exception e) { - fixedTransitionsFuture.completeExceptionally(e); - zoneIdToTableFuture.completeExceptionally(e); - throw e; } - } - } - - private void close() { - synchronized (this) { - if (closed) { - return; - } - try (HostColumnVector hcv = getHostFixedTransitions()) { - // automatically closed - closed = true; - } - } - } - - private HostColumnVector getHostFixedTransitions() { - try { - return fixedTransitionsFuture.get(TIMEOUT_SECS, TimeUnit.SECONDS); - } catch (InterruptedException | ExecutionException | TimeoutException e) { - throw new RuntimeException(e); + HostColumnVector.DataType childType = new HostColumnVector.StructType(false, + new HostColumnVector.BasicType(false, DType.INT64), + new HostColumnVector.BasicType(false, DType.INT64), + new HostColumnVector.BasicType(false, DType.INT32)); + HostColumnVector.DataType resultType = + new HostColumnVector.ListType(false, childType); + fixedTransitions = HostColumnVector.fromLists(resultType, + masterTransitions.toArray(new List[0])); + } catch (Exception e) { + throw new IllegalStateException("load time zone DB cache failed!", e); } } private Map getZoneIDMap() { - try { - return zoneIdToTableFuture.get(TIMEOUT_SECS, TimeUnit.SECONDS); - } catch (InterruptedException | ExecutionException | TimeoutException e) { - throw new RuntimeException(e); - } + return zoneIdToTable; } private Table getTransitions() { @@ -288,8 +345,7 @@ private Table getTransitions() { } private ColumnVector getFixedTransitions() { - HostColumnVector hostTransitions = getHostFixedTransitions(); - return hostTransitions.copyToDevice(); + return fixedTransitions.copyToDevice(); } /** @@ -308,8 +364,7 @@ List getHostFixedTransitions(String zoneId) { if (idx == null) { return null; } - HostColumnVector transitions = getHostFixedTransitions(); - return transitions.getList(idx); + return fixedTransitions.getList(idx); } From 92adf79c297034d484b75c207947699a03264286 Mon Sep 17 00:00:00 2001 From: Jenkins Automation <70000568+nvauto@users.noreply.github.com> Date: Wed, 17 Jan 2024 11:31:15 +0800 Subject: [PATCH 098/127] Update submodule cudf to 2bead955ce5f43887a6ccc9d9834ca57ce58029d (#1702) 
Signed-off-by: spark-rapids automation <70000568+nvauto@users.noreply.github.com> --- thirdparty/cudf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/thirdparty/cudf b/thirdparty/cudf index 726a7f3075..2bead955ce 160000 --- a/thirdparty/cudf +++ b/thirdparty/cudf @@ -1 +1 @@ -Subproject commit 726a7f30757d1a06d74d86bb82cf311cb159f7fd +Subproject commit 2bead955ce5f43887a6ccc9d9834ca57ce58029d From 277c032ed8393ba20fa18fe2766f8c2e0ea10390 Mon Sep 17 00:00:00 2001 From: Jenkins Automation <70000568+nvauto@users.noreply.github.com> Date: Wed, 17 Jan 2024 17:31:01 +0800 Subject: [PATCH 099/127] Update submodule cudf to 8f5e64ddcba788ddcc715fda7f2bf852166b7ee6 (#1705) Signed-off-by: spark-rapids automation <70000568+nvauto@users.noreply.github.com> --- thirdparty/cudf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/thirdparty/cudf b/thirdparty/cudf index 2bead955ce..8f5e64ddcb 160000 --- a/thirdparty/cudf +++ b/thirdparty/cudf @@ -1 +1 @@ -Subproject commit 2bead955ce5f43887a6ccc9d9834ca57ce58029d +Subproject commit 8f5e64ddcba788ddcc715fda7f2bf852166b7ee6 From ad5514ae8abd632232cc65f9f92009e8e3fe32f7 Mon Sep 17 00:00:00 2001 From: Jason Lowe Date: Wed, 17 Jan 2024 16:15:49 -0600 Subject: [PATCH 100/127] Update to new cudf strings where character data is no longer a child column (#1708) * Update to new cudf strings where character data is no longer a child column Signed-off-by: Jason Lowe * clang style fixes --------- Signed-off-by: Jason Lowe --- src/main/cpp/src/cast_string_to_float.cu | 8 ++++---- src/main/cpp/src/map_utils.cu | 18 +++++++++++------- src/main/cpp/src/parse_uri.cu | 23 +++++++++++++---------- thirdparty/cudf | 2 +- 4 files changed, 29 insertions(+), 22 deletions(-) diff --git a/src/main/cpp/src/cast_string_to_float.cu b/src/main/cpp/src/cast_string_to_float.cu index fe8a7f64db..75523cd360 100644 --- a/src/main/cpp/src/cast_string_to_float.cu +++ b/src/main/cpp/src/cast_string_to_float.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * Copyright (c) 2022-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -679,7 +679,7 @@ std::unique_ptr string_to_float(data_type dtype, out->mutable_view().null_mask(), ansi_mode ? static_cast(ansi_count.get())->data() : nullptr, static_cast(valid_count.get())->data(), - string_col.chars().begin(), + string_col.chars_begin(stream), string_col.offsets().begin(), string_col.null_mask(), num_rows); @@ -690,7 +690,7 @@ std::unique_ptr string_to_float(data_type dtype, out->mutable_view().null_mask(), ansi_mode ? static_cast(ansi_count.get())->data() : nullptr, static_cast(valid_count.get())->data(), - string_col.chars().begin(), + string_col.chars_begin(stream), string_col.offsets().begin(), string_col.null_mask(), num_rows); @@ -714,7 +714,7 @@ std::unique_ptr string_to_float(data_type dtype, dest.resize(string_bounds[1] - string_bounds[0]); cudaMemcpyAsync(dest.data(), - &string_col.chars().data()[string_bounds[0]], + &string_col.chars_begin(stream)[string_bounds[0]], string_bounds[1] - string_bounds[0], cudaMemcpyDeviceToHost, stream.value()); diff --git a/src/main/cpp/src/map_utils.cu b/src/main/cpp/src/map_utils.cu index a51a7de57b..002dadb0e3 100644 --- a/src/main/cpp/src/map_utils.cu +++ b/src/main/cpp/src/map_utils.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2023, NVIDIA CORPORATION. + * Copyright (c) 2023-2024, NVIDIA CORPORATION. 
* * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -75,24 +75,28 @@ rmm::device_uvector unify_json_strings(cudf::column_view const& input, } auto const d_strings = cudf::column_device_view::create(input, stream); - auto const chars_size = input.child(cudf::strings_column_view::chars_column_index).size(); + auto const input_scv = cudf::strings_column_view{input}; + auto const chars_size = input_scv.chars_size(stream); auto const output_size = 2l + // two extra bracket characters '[' and ']' static_cast(chars_size) + static_cast(input.size() - 1) + // append `,` character between input rows static_cast(input.null_count()) * 2l; // replace null with "{}" + // TODO: This assertion eventually needs to be removed. + // See https://github.com/NVIDIA/spark-rapids-jni/issues/1707 CUDF_EXPECTS(output_size <= static_cast(std::numeric_limits::max()), "The input json column is too large and causes overflow."); auto const joined_input = cudf::strings::detail::join_strings( - cudf::strings_column_view{input}, + input_scv, cudf::string_scalar(","), // append `,` character between the input rows cudf::string_scalar("{}"), // replacement for null rows stream, rmm::mr::get_current_device_resource()); - auto const joined_input_child = - joined_input->child(cudf::strings_column_view::chars_column_index); - auto const joined_input_size_bytes = joined_input_child.size(); + auto const joined_input_scv = cudf::strings_column_view{*joined_input}; + auto const joined_input_size_bytes = joined_input_scv.chars_size(stream); + // TODO: This assertion requires a stream synchronization, may want to remove at some point. + // See https://github.com/NVIDIA/spark-rapids-jni/issues/1707 CUDF_EXPECTS(joined_input_size_bytes + 2 == output_size, "Incorrect output size computation."); // We want to concatenate 3 strings: "[" + joined_input + "]". @@ -100,7 +104,7 @@ rmm::device_uvector unify_json_strings(cudf::column_view const& input, auto output = rmm::device_uvector(joined_input_size_bytes + 2, stream); CUDF_CUDA_TRY(cudaMemsetAsync(output.data(), static_cast('['), 1, stream.value())); CUDF_CUDA_TRY(cudaMemcpyAsync(output.data() + 1, - joined_input_child.view().data(), + joined_input_scv.chars_begin(stream), joined_input_size_bytes, cudaMemcpyDefault, stream.value())); diff --git a/src/main/cpp/src/parse_uri.cu b/src/main/cpp/src/parse_uri.cu index 897ebe0208..83b14ced9e 100644 --- a/src/main/cpp/src/parse_uri.cu +++ b/src/main/cpp/src/parse_uri.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2023, NVIDIA CORPORATION. + * Copyright (c) 2023-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -700,19 +700,20 @@ uri_parts __device__ validate_uri(const char* str, int len) * * @param in_strings Input string column * @param chunk Chunk of URI to return + * @param base_ptr Pointer to the start of the character data in the strings column * @param out_lengths Number of characters in each decode URL * @param out_offsets Offsets to the start of the chunks * @param out_validity Bitmask of validity data, updated in function */ __global__ void parse_uri_char_counter(column_device_view const in_strings, URI_chunks chunk, + char const* const base_ptr, size_type* const out_lengths, size_type* const out_offsets, bitmask_type* out_validity) { // thread per row - auto const tid = cudf::detail::grid_1d::global_thread_id(); - auto const base_ptr = in_strings.child(strings_column_view::chars_column_index).data(); + auto const tid = cudf::detail::grid_1d::global_thread_id(); for (thread_index_type tidx = tid; tidx < in_strings.size(); tidx += cudf::detail::grid_1d::grid_stride()) { @@ -778,17 +779,18 @@ __global__ void parse_uri_char_counter(column_device_view const in_strings, * @brief Parse protocol and copy from the input string column to the output char buffer. * * @param in_strings Input string column + * @param base_ptr Pointer to the start of the character data in the strings column * @param src_offsets Offset value of source strings in in_strings * @param offsets Offset value of each string associated with `out_chars` * @param out_chars Character buffer for the output string column */ __global__ void parse_uri(column_device_view const in_strings, + char const* const base_ptr, size_type const* const src_offsets, size_type const* const offsets, char* const out_chars) { - auto const tid = cudf::detail::grid_1d::global_thread_id(); - auto const base_ptr = in_strings.child(strings_column_view::chars_column_index).data(); + auto const tid = cudf::detail::grid_1d::global_thread_id(); for (thread_index_type tidx = tid; tidx < in_strings.size(); tidx += cudf::detail::grid_1d::grid_stride()) { @@ -840,6 +842,7 @@ std::unique_ptr parse_uri(strings_column_view const& input, parse_uri_char_counter<<>>( *d_strings, chunk, + input.chars_begin(stream), offsets_mutable_view.begin(), reinterpret_cast(src_offsets.data()), reinterpret_cast(null_mask.data())); @@ -854,23 +857,23 @@ std::unique_ptr parse_uri(strings_column_view const& input, // to the host memory auto out_chars_bytes = cudf::detail::get_value(offsets_view, offset_count - 1, stream); - // create the chars column - auto chars_column = cudf::strings::detail::create_chars_child_column(out_chars_bytes, stream, mr); - auto d_out_chars = chars_column->mutable_view().data(); + // create the chars buffer + auto d_out_chars = rmm::device_buffer(out_chars_bytes, stream, mr); // copy the characters from the input column to the output column parse_uri<<>>( *d_strings, + input.chars_begin(stream), reinterpret_cast(src_offsets.data()), offsets_column->view().begin(), - d_out_chars); + static_cast(d_out_chars.data())); auto null_count = cudf::null_count(reinterpret_cast(null_mask.data()), 0, strings_count); return make_strings_column(strings_count, std::move(offsets_column), - std::move(chars_column), + std::move(d_out_chars), null_count, std::move(null_mask)); } diff --git a/thirdparty/cudf b/thirdparty/cudf index 8f5e64ddcb..6abef4a474 160000 --- a/thirdparty/cudf +++ b/thirdparty/cudf @@ -1 +1 @@ -Subproject commit 8f5e64ddcba788ddcc715fda7f2bf852166b7ee6 +Subproject commit 6abef4a4746f1f9917711f372726023efdc21e85 From 
5ffd328274605392f12c1157b770067c6f22033f Mon Sep 17 00:00:00 2001 From: Jenkins Automation <70000568+nvauto@users.noreply.github.com> Date: Thu, 18 Jan 2024 12:05:48 +0800 Subject: [PATCH 101/127] [submodule-sync] bot-submodule-sync-branch-24.02 to branch-24.02 [skip ci] [bot] (#1706) * Update submodule cudf to c7acdaa231fb0ffe7751611590f9b85ba7508d4d Signed-off-by: spark-rapids automation <70000568+nvauto@users.noreply.github.com> * Update submodule cudf to c81198789be183e7e1eb288eb98dd16f65b57e44 Signed-off-by: spark-rapids automation <70000568+nvauto@users.noreply.github.com> * Update submodule cudf to 9acddc08cc209e8d6b94891be6131edd63ff5b43 Signed-off-by: spark-rapids automation <70000568+nvauto@users.noreply.github.com> --------- Signed-off-by: spark-rapids automation <70000568+nvauto@users.noreply.github.com> --- thirdparty/cudf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/thirdparty/cudf b/thirdparty/cudf index 6abef4a474..9acddc08cc 160000 --- a/thirdparty/cudf +++ b/thirdparty/cudf @@ -1 +1 @@ -Subproject commit 6abef4a4746f1f9917711f372726023efdc21e85 +Subproject commit 9acddc08cc209e8d6b94891be6131edd63ff5b43 From 8331658383d088c95759c6e3bf77bdc310978a0e Mon Sep 17 00:00:00 2001 From: Jenkins Automation <70000568+nvauto@users.noreply.github.com> Date: Fri, 19 Jan 2024 06:05:10 +0800 Subject: [PATCH 102/127] Update submodule cudf to 66c3e8e92f9c37dd909b78936addb463f1bd6011 (#1709) Signed-off-by: spark-rapids automation <70000568+nvauto@users.noreply.github.com> --- thirdparty/cudf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/thirdparty/cudf b/thirdparty/cudf index 9acddc08cc..66c3e8e92f 160000 --- a/thirdparty/cudf +++ b/thirdparty/cudf @@ -1 +1 @@ -Subproject commit 9acddc08cc209e8d6b94891be6131edd63ff5b43 +Subproject commit 66c3e8e92f9c37dd909b78936addb463f1bd6011 From fea7d22a6a9578f8cf960c909df34e204ed1873e Mon Sep 17 00:00:00 2001 From: Jenkins Automation <70000568+nvauto@users.noreply.github.com> Date: Fri, 19 Jan 2024 12:05:14 +0800 Subject: [PATCH 103/127] Update submodule cudf to eeee795c232e2811adeb5a3942f7a149d8b16d49 (#1710) Signed-off-by: spark-rapids automation <70000568+nvauto@users.noreply.github.com> --- thirdparty/cudf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/thirdparty/cudf b/thirdparty/cudf index 66c3e8e92f..eeee795c23 160000 --- a/thirdparty/cudf +++ b/thirdparty/cudf @@ -1 +1 @@ -Subproject commit 66c3e8e92f9c37dd909b78936addb463f1bd6011 +Subproject commit eeee795c232e2811adeb5a3942f7a149d8b16d49 From 91407929bea933bf53ab6b20a23843572b4f687f Mon Sep 17 00:00:00 2001 From: Jenkins Automation <70000568+nvauto@users.noreply.github.com> Date: Fri, 19 Jan 2024 17:27:17 +0800 Subject: [PATCH 104/127] Update submodule cudf to f785ed3ddebf8b225b9d7c07aab9d5f32eb39b05 (#1711) Signed-off-by: spark-rapids automation <70000568+nvauto@users.noreply.github.com> --- thirdparty/cudf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/thirdparty/cudf b/thirdparty/cudf index eeee795c23..f785ed3dde 160000 --- a/thirdparty/cudf +++ b/thirdparty/cudf @@ -1 +1 @@ -Subproject commit eeee795c232e2811adeb5a3942f7a149d8b16d49 +Subproject commit f785ed3ddebf8b225b9d7c07aab9d5f32eb39b05 From d45dca03ffa9eaa7aff8494fb98188787e654795 Mon Sep 17 00:00:00 2001 From: Jenkins Automation <70000568+nvauto@users.noreply.github.com> Date: Sat, 20 Jan 2024 06:05:25 +0800 Subject: [PATCH 105/127] Update submodule cudf to a38fc01a6b8cb8506753b6a7fd77c7444e25d52c (#1712) Signed-off-by: spark-rapids automation 
<70000568+nvauto@users.noreply.github.com> --- thirdparty/cudf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/thirdparty/cudf b/thirdparty/cudf index f785ed3dde..a38fc01a6b 160000 --- a/thirdparty/cudf +++ b/thirdparty/cudf @@ -1 +1 @@ -Subproject commit f785ed3ddebf8b225b9d7c07aab9d5f32eb39b05 +Subproject commit a38fc01a6b8cb8506753b6a7fd77c7444e25d52c From 93f4a38d33c7f40f326f32ea1a03636f4e244eb6 Mon Sep 17 00:00:00 2001 From: Jenkins Automation <70000568+nvauto@users.noreply.github.com> Date: Sat, 20 Jan 2024 11:24:51 +0800 Subject: [PATCH 106/127] Update submodule cudf to 1c37c780ced37d6084c90b815b274b598665d60e (#1713) Signed-off-by: spark-rapids automation <70000568+nvauto@users.noreply.github.com> --- thirdparty/cudf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/thirdparty/cudf b/thirdparty/cudf index a38fc01a6b..1c37c780ce 160000 --- a/thirdparty/cudf +++ b/thirdparty/cudf @@ -1 +1 @@ -Subproject commit a38fc01a6b8cb8506753b6a7fd77c7444e25d52c +Subproject commit 1c37c780ced37d6084c90b815b274b598665d60e From 13e4652011ce5dab8fed19bd2873931238515169 Mon Sep 17 00:00:00 2001 From: Jenkins Automation <70000568+nvauto@users.noreply.github.com> Date: Sat, 20 Jan 2024 17:23:39 +0800 Subject: [PATCH 107/127] Update submodule cudf to 19942809679e4675c296a38f90bfdbaa8574eee2 (#1714) Signed-off-by: spark-rapids automation <70000568+nvauto@users.noreply.github.com> --- thirdparty/cudf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/thirdparty/cudf b/thirdparty/cudf index 1c37c780ce..1994280967 160000 --- a/thirdparty/cudf +++ b/thirdparty/cudf @@ -1 +1 @@ -Subproject commit 1c37c780ced37d6084c90b815b274b598665d60e +Subproject commit 19942809679e4675c296a38f90bfdbaa8574eee2 From 5de3c3a9356005a8dbe4611e5045557072505d7c Mon Sep 17 00:00:00 2001 From: Jenkins Automation <70000568+nvauto@users.noreply.github.com> Date: Tue, 23 Jan 2024 06:03:16 +0800 Subject: [PATCH 108/127] Update submodule cudf to f24f0b528b16454a2b79182f77bb46a663ab2c25 (#1715) Signed-off-by: spark-rapids automation <70000568+nvauto@users.noreply.github.com> --- thirdparty/cudf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/thirdparty/cudf b/thirdparty/cudf index 1994280967..f24f0b528b 160000 --- a/thirdparty/cudf +++ b/thirdparty/cudf @@ -1 +1 @@ -Subproject commit 19942809679e4675c296a38f90bfdbaa8574eee2 +Subproject commit f24f0b528b16454a2b79182f77bb46a663ab2c25 From 96c4ebc23f333a3ca41b46b4f10d9845f092be6e Mon Sep 17 00:00:00 2001 From: Jenkins Automation <70000568+nvauto@users.noreply.github.com> Date: Tue, 23 Jan 2024 12:06:31 +0800 Subject: [PATCH 109/127] Update submodule cudf to ef3ce4bc8db008f58249241c16c80f7e6e600fa9 (#1716) Signed-off-by: spark-rapids automation <70000568+nvauto@users.noreply.github.com> --- thirdparty/cudf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/thirdparty/cudf b/thirdparty/cudf index f24f0b528b..ef3ce4bc8d 160000 --- a/thirdparty/cudf +++ b/thirdparty/cudf @@ -1 +1 @@ -Subproject commit f24f0b528b16454a2b79182f77bb46a663ab2c25 +Subproject commit ef3ce4bc8db008f58249241c16c80f7e6e600fa9 From e667df4c888e7fcda9f7e7cac0d8bc027e0b4eac Mon Sep 17 00:00:00 2001 From: Jenkins Automation <70000568+nvauto@users.noreply.github.com> Date: Tue, 23 Jan 2024 18:09:19 +0800 Subject: [PATCH 110/127] Update submodule cudf to a39897c108d44a4d5e027ca741be5462863eeefc (#1717) Signed-off-by: spark-rapids automation <70000568+nvauto@users.noreply.github.com> --- thirdparty/cudf | 2 +- 1 file changed, 1 insertion(+), 1 
deletion(-) diff --git a/thirdparty/cudf b/thirdparty/cudf index ef3ce4bc8d..a39897c108 160000 --- a/thirdparty/cudf +++ b/thirdparty/cudf @@ -1 +1 @@ -Subproject commit ef3ce4bc8db008f58249241c16c80f7e6e600fa9 +Subproject commit a39897c108d44a4d5e027ca741be5462863eeefc From 516c48c29334edea210c89f0498022e0fa708ecf Mon Sep 17 00:00:00 2001 From: Jenkins Automation <70000568+nvauto@users.noreply.github.com> Date: Wed, 24 Jan 2024 05:32:27 +0800 Subject: [PATCH 111/127] Update submodule cudf to 67a36a9104097cd6a8ae6efee1018e249f2fe441 (#1720) Signed-off-by: spark-rapids automation <70000568+nvauto@users.noreply.github.com> --- thirdparty/cudf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/thirdparty/cudf b/thirdparty/cudf index a39897c108..67a36a9104 160000 --- a/thirdparty/cudf +++ b/thirdparty/cudf @@ -1 +1 @@ -Subproject commit a39897c108d44a4d5e027ca741be5462863eeefc +Subproject commit 67a36a9104097cd6a8ae6efee1018e249f2fe441 From 7b6b25e470b6aa5594182fd5ccd654b14c043414 Mon Sep 17 00:00:00 2001 From: Tim Liu Date: Wed, 24 Jan 2024 17:18:56 +0800 Subject: [PATCH 112/127] Enable auto-merge from branch-24.02 to branch-24.04 (#1722) Signed-off-by: Tim Liu --- .github/workflows/auto-merge.yml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/workflows/auto-merge.yml b/.github/workflows/auto-merge.yml index 77a7701008..08ee4f34d8 100755 --- a/.github/workflows/auto-merge.yml +++ b/.github/workflows/auto-merge.yml @@ -1,4 +1,4 @@ -# Copyright (c) 2022-2023, NVIDIA CORPORATION. +# Copyright (c) 2022-2024, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -18,12 +18,12 @@ name: auto-merge HEAD to BASE on: pull_request_target: branches: - - branch-23.12 + - branch-24.02 types: [closed] env: - HEAD: branch-23.12 - BASE: branch-24.02 + HEAD: branch-24.02 + BASE: branch-24.04 jobs: auto-merge: From 9065404aa70232d5c7141228eaa49cd9c5c800ec Mon Sep 17 00:00:00 2001 From: Mike Wilson Date: Wed, 24 Jan 2024 16:25:14 -0500 Subject: [PATCH 113/127] Adding literal support for parse_uri for query (#1704) --- src/main/cpp/src/ParseURIJni.cpp | 16 +++ src/main/cpp/src/parse_uri.cu | 104 ++++++++++++++++-- src/main/cpp/src/parse_uri.hpp | 15 +++ src/main/cpp/tests/parse_uri.cpp | 53 ++++++--- .../com/nvidia/spark/rapids/jni/ParseURI.java | 13 +++ .../nvidia/spark/rapids/jni/ParseURITest.java | 38 +++++++ 6 files changed, 216 insertions(+), 23 deletions(-) diff --git a/src/main/cpp/src/ParseURIJni.cpp b/src/main/cpp/src/ParseURIJni.cpp index 3af72687b6..c688d10736 100644 --- a/src/main/cpp/src/ParseURIJni.cpp +++ b/src/main/cpp/src/ParseURIJni.cpp @@ -61,4 +61,20 @@ JNIEXPORT jlong JNICALL Java_com_nvidia_spark_rapids_jni_ParseURI_parseQuery(JNI } CATCH_STD(env, 0); } + +JNIEXPORT jlong JNICALL Java_com_nvidia_spark_rapids_jni_ParseURI_parseQueryWithLiteral( + JNIEnv* env, jclass, jlong input_column, jstring query) +{ + JNI_NULL_CHECK(env, input_column, "input column is null", 0); + JNI_NULL_CHECK(env, query, "query is null", 0); + + try { + cudf::jni::auto_set_device(env); + auto const input = reinterpret_cast(input_column); + cudf::jni::native_jstring native_query(env, query); + return cudf::jni::ptr_as_jlong( + spark_rapids_jni::parse_uri_to_query(*input, native_query.get()).release()); + } + CATCH_STD(env, 0); +} } diff --git a/src/main/cpp/src/parse_uri.cu b/src/main/cpp/src/parse_uri.cu index 83b14ced9e..4d21617fd7 100644 --- 
a/src/main/cpp/src/parse_uri.cu
+++ b/src/main/cpp/src/parse_uri.cu
@@ -19,6 +19,7 @@
 #include
 #include
 #include
+#include
 #include
 #include
 #include
@@ -33,6 +34,7 @@
 #include
 #include
+#include

 namespace spark_rapids_jni {

@@ -490,7 +492,55 @@ bool __device__ validate_fragment(string_view fragment)
   }));
 }

-uri_parts __device__ validate_uri(const char* str, int len)
+__device__ std::pair<string_view, bool> find_query_part(string_view haystack, string_view needle)
+{
+  auto const n_bytes     = needle.size_bytes();
+  auto const find_length = haystack.size_bytes() - n_bytes + 1;
+
+  auto h           = haystack.data();
+  auto const end_h = haystack.data() + find_length;
+  auto n           = needle.data();
+  while (h < end_h) {
+    bool match = true;
+    for (size_type jdx = 0; match && (jdx < n_bytes); ++jdx) {
+      match = (h[jdx] == n[jdx]);
+    }
+    if (match) { match = n_bytes < haystack.size_bytes() && h[n_bytes] == '='; }
+    if (match) {
+      // we don't care about the matched part; we want the string data after it.
+      h += n_bytes;
+      break;
+    } else {
+      // skip to the next param, which is after a &.
+      while (h < end_h && *h != '&') {
+        h++;
+      }
+    }
+    h++;
+  }
+
+  // if h is past the end of the haystack, no match.
+  if (haystack.data() + haystack.size_bytes() <= h || *h != '=') { return {{}, false}; }
+
+  // skip over the =
+  h++;
+
+  // the rest of the string, until the end or until '&', is the query match
+  auto const bytes_left = haystack.size_bytes() - (h - haystack.data());
+  int match_len         = 0;
+  auto start            = h;
+  while (*h != '&' && match_len < bytes_left) {
+    ++match_len;
+    ++h;
+  }
+
+  return {{start, match_len}, true};
+}
+
+uri_parts __device__ validate_uri(const char* str,
+                                  int len,
+                                  thrust::optional<column_device_view const> query_match,
+                                  size_type row_idx)
 {
   uri_parts ret;

@@ -572,6 +622,23 @@ uri_parts __device__ validate_uri(const char* str, int len)
       ret.valid = 0;
       return ret;
     }
+
+    // Maybe limit the query data if a literal or a column is passed as a filter. This alters the
+    // return from the entire query to just a specific parameter. For example, query for the URI
+    // http://www.nvidia.com/page?param0=5&param1=2 is param0=5&param1=2, but if the literal is
+    // passed as param0, the return would simply be 5.
+    if (query_match && query_match->size() > 0) {
+      auto const match_idx = row_idx % query_match->size();
+      auto in_match        = query_match->element<string_view>(match_idx);
+
+      auto const [query, valid] = find_query_part(ret.query, in_match);
+      if (!valid) {
+        ret.valid = 0;
+        return ret;
+      }
+      ret.query = query;
+    }
+    ret.valid |= (1 << static_cast<int>(URI_chunks::QUERY));
   }

   auto const path_len = question >= 0 ?
question : len; @@ -710,7 +777,8 @@ __global__ void parse_uri_char_counter(column_device_view const in_strings, char const* const base_ptr, size_type* const out_lengths, size_type* const out_offsets, - bitmask_type* out_validity) + bitmask_type* out_validity, + thrust::optional query_match) { // thread per row auto const tid = cudf::detail::grid_1d::global_thread_id(); @@ -727,7 +795,7 @@ __global__ void parse_uri_char_counter(column_device_view const in_strings, auto const in_chars = in_string.data(); auto const string_length = in_string.size_bytes(); - auto const uri = validate_uri(in_chars, string_length); + auto const uri = validate_uri(in_chars, string_length, query_match, row_idx); if ((uri.valid & (1 << static_cast(chunk))) == 0) { out_lengths[row_idx] = 0; clear_bit(out_validity, row_idx); @@ -809,6 +877,7 @@ __global__ void parse_uri(column_device_view const in_strings, std::unique_ptr parse_uri(strings_column_view const& input, URI_chunks chunk, + std::optional query_match, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { @@ -822,6 +891,9 @@ std::unique_ptr parse_uri(strings_column_view const& input, auto offset_count = strings_count + 1; auto const d_strings = column_device_view::create(input.parent(), stream); + auto const d_matches = + query_match ? column_device_view::create(query_match->parent(), stream) + : std::unique_ptr>{}; // build offsets column auto offsets_column = make_numeric_column( @@ -845,7 +917,8 @@ std::unique_ptr parse_uri(strings_column_view const& input, input.chars_begin(stream), offsets_mutable_view.begin(), reinterpret_cast(src_offsets.data()), - reinterpret_cast(null_mask.data())); + reinterpret_cast(null_mask.data()), + d_matches ? thrust::optional{*d_matches} : thrust::nullopt); // use scan to transform number of bytes into offsets thrust::exclusive_scan(rmm::exec_policy(stream), @@ -887,7 +960,7 @@ std::unique_ptr parse_uri_to_protocol(strings_column_view const& input, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::parse_uri(input, detail::URI_chunks::PROTOCOL, stream, mr); + return detail::parse_uri(input, detail::URI_chunks::PROTOCOL, std::nullopt, stream, mr); } std::unique_ptr parse_uri_to_host(strings_column_view const& input, @@ -895,7 +968,7 @@ std::unique_ptr parse_uri_to_host(strings_column_view const& input, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::parse_uri(input, detail::URI_chunks::HOST, stream, mr); + return detail::parse_uri(input, detail::URI_chunks::HOST, std::nullopt, stream, mr); } std::unique_ptr parse_uri_to_query(strings_column_view const& input, @@ -903,8 +976,21 @@ std::unique_ptr parse_uri_to_query(strings_column_view const& input, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::parse_uri( - input, detail::URI_chunks::QUERY, stream, rmm::mr::get_current_device_resource()); + return detail::parse_uri(input, detail::URI_chunks::QUERY, std::nullopt, stream, mr); +} + +std::unique_ptr parse_uri_to_query(cudf::strings_column_view const& input, + std::string const& query_match, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + CUDF_FUNC_RANGE(); + + // build string_column_view from incoming query_match string + auto d_scalar = make_string_scalar(query_match, stream); + auto col = make_column_from_scalar(*d_scalar, 1); + + return detail::parse_uri(input, detail::URI_chunks::QUERY, strings_column_view(*col), stream, mr); } -} // namespace spark_rapids_jni \ No newline at end of file +} // namespace 
spark_rapids_jni
diff --git a/src/main/cpp/src/parse_uri.hpp b/src/main/cpp/src/parse_uri.hpp
index 07f6f9cd46..bb001e3167 100644
--- a/src/main/cpp/src/parse_uri.hpp
+++ b/src/main/cpp/src/parse_uri.hpp
@@ -65,4 +65,19 @@ std::unique_ptr<cudf::column> parse_uri_to_query(
   rmm::cuda_stream_view stream        = cudf::get_default_stream(),
   rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

+/**
+ * @brief Parse query and copy from the input string column to the output string column.
+ *
+ * @param input Input string column of URIs to parse.
+ * @param query_match String to match in query.
+ * @param stream Stream on which to operate.
+ * @param mr Memory resource for returned column.
+ * @return std::unique_ptr<cudf::column> String column of queries parsed.
+ */
+std::unique_ptr<cudf::column> parse_uri_to_query(
+  cudf::strings_column_view const& input,
+  std::string const& query_match,
+  rmm::cuda_stream_view stream        = cudf::get_default_stream(),
+  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+
 } // namespace spark_rapids_jni
diff --git a/src/main/cpp/tests/parse_uri.cpp b/src/main/cpp/tests/parse_uri.cpp
index 36ebbeacc0..234ad380c7 100644
--- a/src/main/cpp/tests/parse_uri.cpp
+++ b/src/main/cpp/tests/parse_uri.cpp
@@ -127,14 +127,15 @@ cudf::test::strings_column_wrapper get_test_data(test_types t)
       "https:// /path/to/file",
     });
     case test_types::QUERY:
-      return cudf::test::strings_column_wrapper({
-        "https://www.nvidia.com/path?param0=1&param2=3&param4=5",
-        "https:// /?params=5&cloth=0&metal=1",
-        "https://[2001:db8::2:1]:443/parms/in/the/uri?a=b",
-        "https://[::1]/?invalid=param&f„⁈.=7",
-        "https://[::1]/?invalid=param&~.=!@&^",
-        "userinfo@www.nvidia.com/path?query=1#Ref",
-      });
+      return cudf::test::strings_column_wrapper(
+        {"https://www.nvidia.com/path?param0=1&param2=3&param4=5",
+         "https:// /?params=5&cloth=0&metal=1&param0=param3",
+         "https://[2001:db8::2:1]:443/parms/in/the/uri?a=b&param0=true",
+         "https://[::1]/?invalid=param&f„⁈.=7&param0=3",
+         "https://[::1]/?invalid=param&param0=f„⁈&~.=!@&^",
+         "userinfo@www.nvidia.com/path?query=1&param0=5#Ref",
+         "https://www.nvidia.com/path?brokenparam0=1&fakeparam0=5&param0=true",
+         "http://nvidia.com?CBA=CBA&C=C"});
     default: CUDF_FAIL("Test type unsupported!"); return cudf::test::strings_column_wrapper();
   }
 }
@@ -362,12 +363,36 @@ TEST_F(ParseURIQueryTests, SparkEdges)

 TEST_F(ParseURIQueryTests, Queries)
 {
-  auto const col = get_test_data(test_types::QUERY);
-  auto const result = spark_rapids_jni::parse_uri_to_query(cudf::strings_column_view{col});
+  auto const col = get_test_data(test_types::QUERY);
+
+  {
+    auto const result = spark_rapids_jni::parse_uri_to_query(cudf::strings_column_view{col});
+
+    cudf::test::strings_column_wrapper const expected({"param0=1&param2=3&param4=5",
+                                                       "",
+                                                       "a=b&param0=true",
+                                                       "invalid=param&f„⁈.=7&param0=3",
+                                                       "",
+                                                       "query=1&param0=5",
+                                                       "brokenparam0=1&fakeparam0=5&param0=true",
+                                                       "CBA=CBA&C=C"},
+                                                      {1, 0, 1, 1, 0, 1, 1, 1});
+
+    CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected, result->view());
+  }
+  {
+    auto const result =
+      spark_rapids_jni::parse_uri_to_query(cudf::strings_column_view{col}, "param0");
+    cudf::test::strings_column_wrapper const expected({"1", "", "true", "3", "", "5", "true", ""},
+                                                      {1, 0, 1, 1, 0, 1, 1, 0});
-  cudf::test::strings_column_wrapper const expected(
-    {"param0=1&param2=3&param4=5", "", "a=b", "invalid=param&f„⁈.=7", "", "query=1"},
-    {1, 0, 1, 1, 0, 1});
+    CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected, result->view());
+  }
+  {
+    auto const result = spark_rapids_jni::parse_uri_to_query(cudf::strings_column_view{col}, "C");
cudf::test::strings_column_wrapper const expected({"", "", "", "", "", "", "", "C"}, + {0, 0, 0, 0, 0, 0, 0, 1}); - CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected, result->view()); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected, result->view()); + } } diff --git a/src/main/java/com/nvidia/spark/rapids/jni/ParseURI.java b/src/main/java/com/nvidia/spark/rapids/jni/ParseURI.java index 8f82bfc908..e9908f9ea5 100644 --- a/src/main/java/com/nvidia/spark/rapids/jni/ParseURI.java +++ b/src/main/java/com/nvidia/spark/rapids/jni/ParseURI.java @@ -60,7 +60,20 @@ public static ColumnVector parseURIQuery(ColumnView uriColumn) { return new ColumnVector(parseQuery(uriColumn.getNativeView())); } + /** + * Parse query and return a specific parameter for each URI from the incoming column. + * + * @param URIColumn The input strings column in which each row contains a URI. + * @param String The parameter to extract from the query + * @return A string column with query data extracted. + */ + public static ColumnVector parseURIQueryWithLiteral(ColumnView uriColumn, String query) { + assert uriColumn.getType().equals(DType.STRING) : "Input type must be String"; + return new ColumnVector(parseQueryWithLiteral(uriColumn.getNativeView(), query)); + } + private static native long parseProtocol(long jsonColumnHandle); private static native long parseHost(long jsonColumnHandle); private static native long parseQuery(long jsonColumnHandle); + private static native long parseQueryWithLiteral(long jsonColumnHandle, String query); } diff --git a/src/test/java/com/nvidia/spark/rapids/jni/ParseURITest.java b/src/test/java/com/nvidia/spark/rapids/jni/ParseURITest.java index ca76df2bf3..c79633008c 100644 --- a/src/test/java/com/nvidia/spark/rapids/jni/ParseURITest.java +++ b/src/test/java/com/nvidia/spark/rapids/jni/ParseURITest.java @@ -90,6 +90,40 @@ void testQuery(String[] testData) { } } + void testQuery(String[] testData, String param) { + String[] expectedQueryStrings = new String[testData.length]; + for (int i=0; i 0 && pair.substring(0, idx).equals(param)) { + subquery = pair.substring(idx + 1); + break; + } + } + } + expectedQueryStrings[i] = subquery; + } + try (ColumnVector v0 = ColumnVector.fromStrings(testData); + ColumnVector expectedQuery = ColumnVector.fromStrings(expectedQueryStrings); + ColumnVector queryResult = ParseURI.parseURIQueryWithLiteral(v0, param)) { + AssertUtils.assertColumnsAreEqual(expectedQuery, queryResult); + } + } + @Test void parseURISparkTest() { String[] testData = { @@ -150,6 +184,7 @@ void parseURISparkTest() { testProtocol(testData); testHost(testData); testQuery(testData); + testQuery(testData, "query"); } @Test @@ -163,6 +198,7 @@ void parseURIUTF8Test() { testProtocol(testData); testHost(testData); testQuery(testData); + testQuery(testData, "query"); } @Test @@ -178,6 +214,7 @@ void parseURIIP4Test() { testProtocol(testData); testHost(testData); testQuery(testData); + testQuery(testData, "query"); } @Test @@ -206,5 +243,6 @@ void parseURIIP6Test() { testProtocol(testData); testHost(testData); testQuery(testData); + testQuery(testData, "query"); } } From 34536f8c10c18aa92d0f792e32218ed8cbd30083 Mon Sep 17 00:00:00 2001 From: Jenkins Automation <70000568+nvauto@users.noreply.github.com> Date: Thu, 25 Jan 2024 05:33:25 +0800 Subject: [PATCH 114/127] Update submodule cudf to f800f5a2fa9a961699345e6febe740b4b8f4760e (#1729) Signed-off-by: spark-rapids automation <70000568+nvauto@users.noreply.github.com> --- thirdparty/cudf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/thirdparty/cudf b/thirdparty/cudf index 67a36a9104..f800f5a2fa 160000 --- a/thirdparty/cudf +++ b/thirdparty/cudf @@ -1 +1 @@ -Subproject commit 67a36a9104097cd6a8ae6efee1018e249f2fe441 +Subproject commit f800f5a2fa9a961699345e6febe740b4b8f4760e From 1d27b332b1252031ff1efd412e4f98a556c37daf Mon Sep 17 00:00:00 2001 From: Mike Wilson Date: Wed, 24 Jan 2024 18:41:35 -0500 Subject: [PATCH 115/127] Add support for `parse_uri` to limit query with a column (#1719) * Adding support for parse uri for query with a column for keys Signed-off-by: Mike Wilson Co-authored-by: nvdbaranec <56695930+nvdbaranec@users.noreply.github.com> --- src/main/cpp/src/ParseURIJni.cpp | 15 +++ src/main/cpp/src/parse_uri.cu | 17 +++- src/main/cpp/src/parse_uri.hpp | 15 +++ src/main/cpp/tests/parse_uri.cpp | 11 +++ .../com/nvidia/spark/rapids/jni/ParseURI.java | 22 ++++- .../nvidia/spark/rapids/jni/ParseURITest.java | 92 +++++++++++++++++++ 6 files changed, 167 insertions(+), 5 deletions(-) diff --git a/src/main/cpp/src/ParseURIJni.cpp b/src/main/cpp/src/ParseURIJni.cpp index c688d10736..354d47c424 100644 --- a/src/main/cpp/src/ParseURIJni.cpp +++ b/src/main/cpp/src/ParseURIJni.cpp @@ -77,4 +77,19 @@ JNIEXPORT jlong JNICALL Java_com_nvidia_spark_rapids_jni_ParseURI_parseQueryWith } CATCH_STD(env, 0); } + +JNIEXPORT jlong JNICALL Java_com_nvidia_spark_rapids_jni_ParseURI_parseQueryWithColumn( + JNIEnv* env, jclass, jlong input_column, jlong query_column) +{ + JNI_NULL_CHECK(env, input_column, "input column is null", 0); + JNI_NULL_CHECK(env, query_column, "query column is null", 0); + + try { + cudf::jni::auto_set_device(env); + auto const input = reinterpret_cast(input_column); + auto const query = reinterpret_cast(query_column); + return cudf::jni::ptr_as_jlong(spark_rapids_jni::parse_uri_to_query(*input, *query).release()); + } + CATCH_STD(env, 0); +} } diff --git a/src/main/cpp/src/parse_uri.cu b/src/main/cpp/src/parse_uri.cu index 4d21617fd7..cd64c539ef 100644 --- a/src/main/cpp/src/parse_uri.cu +++ b/src/main/cpp/src/parse_uri.cu @@ -629,7 +629,11 @@ uri_parts __device__ validate_uri(const char* str, // passed as param0, the return would simply be 5. 
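The parameter-matching rule restated in the comment above can be mirrored host-side. A minimal Java sketch (illustrative only; `queryParam` is a hypothetical helper, not an API in this repository — it assumes a key matches only a whole parameter name followed by '=', and that a value runs to the next '&'):

    class QueryParamSketch {
      // Returns the value for `key` within `query` (e.g. "param0=5&param1=2"),
      // or null when the key is absent or empty; mirrors the device-side rule.
      static String queryParam(String query, String key) {
        for (String pair : query.split("&", -1)) {
          int eq = pair.indexOf('=');
          if (eq > 0 && pair.substring(0, eq).equals(key)) {
            return pair.substring(eq + 1);
          }
        }
        return null;
      }

      public static void main(String[] args) {
        System.out.println(queryParam("param0=5&param1=2", "param0")); // prints 5
        System.out.println(queryParam("param0=5&param1=2", "param"));  // prints null
      }
    }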
if (query_match && query_match->size() > 0) { auto const match_idx = row_idx % query_match->size(); - auto in_match = query_match->element(match_idx); + if (query_match->is_null(match_idx)) { + ret.valid = 0; + return ret; + } + auto in_match = query_match->element(match_idx); auto const [query, valid] = find_query_part(ret.query, in_match); if (!valid) { @@ -993,4 +997,15 @@ std::unique_ptr parse_uri_to_query(cudf::strings_column_view const return detail::parse_uri(input, detail::URI_chunks::QUERY, strings_column_view(*col), stream, mr); } +std::unique_ptr parse_uri_to_query(cudf::strings_column_view const& input, + cudf::strings_column_view const& query_match, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + CUDF_FUNC_RANGE(); + CUDF_EXPECTS(input.size() == query_match.size(), "Query column must be the same size as input!"); + + return detail::parse_uri(input, detail::URI_chunks::QUERY, query_match, stream, mr); +} + } // namespace spark_rapids_jni diff --git a/src/main/cpp/src/parse_uri.hpp b/src/main/cpp/src/parse_uri.hpp index bb001e3167..004d800ddb 100644 --- a/src/main/cpp/src/parse_uri.hpp +++ b/src/main/cpp/src/parse_uri.hpp @@ -80,4 +80,19 @@ std::unique_ptr parse_uri_to_query( rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +/** + * @brief Parse query and copy from the input string column to the output string column. + * + * @param input Input string column of URIs to parse. + * @param query_match string column to match in query. + * @param stream Stream on which to operate. + * @param mr Memory resource for returned column. + * @return std::unique_ptr String column of queries parsed. + */ +std::unique_ptr parse_uri_to_query( + cudf::strings_column_view const& input, + cudf::strings_column_view const& query_match, + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + } // namespace spark_rapids_jni diff --git a/src/main/cpp/tests/parse_uri.cpp b/src/main/cpp/tests/parse_uri.cpp index 234ad380c7..09f238e18c 100644 --- a/src/main/cpp/tests/parse_uri.cpp +++ b/src/main/cpp/tests/parse_uri.cpp @@ -395,4 +395,15 @@ TEST_F(ParseURIQueryTests, Queries) CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected, result->view()); } + { + cudf::test::strings_column_wrapper const query( + {"param0", "q", "a", "invalid", "test", "query", "fakeparam0", "C"}); + cudf::test::strings_column_wrapper const expected({"1", "", "b", "param", "", "1", "5", "C"}, + {1, 0, 1, 1, 0, 1, 1, 1}); + + auto const result = spark_rapids_jni::parse_uri_to_query(cudf::strings_column_view{col}, + cudf::strings_column_view{query}); + + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected, result->view()); + } } diff --git a/src/main/java/com/nvidia/spark/rapids/jni/ParseURI.java b/src/main/java/com/nvidia/spark/rapids/jni/ParseURI.java index e9908f9ea5..6de84ea519 100644 --- a/src/main/java/com/nvidia/spark/rapids/jni/ParseURI.java +++ b/src/main/java/com/nvidia/spark/rapids/jni/ParseURI.java @@ -72,8 +72,22 @@ public static ColumnVector parseURIQueryWithLiteral(ColumnView uriColumn, String return new ColumnVector(parseQueryWithLiteral(uriColumn.getNativeView(), query)); } - private static native long parseProtocol(long jsonColumnHandle); - private static native long parseHost(long jsonColumnHandle); - private static native long parseQuery(long jsonColumnHandle); - private static native long parseQueryWithLiteral(long 
jsonColumnHandle, String query); + /** + * Parse query and return a specific parameter for each URI from the incoming column. + * + * @param URIColumn The input strings column in which each row contains a URI. + * @param String The parameter to extract from the query + * @return A string column with query data extracted. + */ + public static ColumnVector parseURIQueryWithColumn(ColumnView uriColumn, ColumnView queryColumn) { + assert uriColumn.getType().equals(DType.STRING) : "Input type must be String"; + assert queryColumn.getType().equals(DType.STRING) : "Query type must be String"; + return new ColumnVector(parseQueryWithColumn(uriColumn.getNativeView(), queryColumn.getNativeView())); + } + + private static native long parseProtocol(long inputColumnHandle); + private static native long parseHost(long inputColumnHandle); + private static native long parseQuery(long inputColumnHandle); + private static native long parseQueryWithLiteral(long inputColumnHandle, String query); + private static native long parseQueryWithColumn(long inputColumnHandle, long queryColumnHandle); } diff --git a/src/test/java/com/nvidia/spark/rapids/jni/ParseURITest.java b/src/test/java/com/nvidia/spark/rapids/jni/ParseURITest.java index c79633008c..f8ed45c704 100644 --- a/src/test/java/com/nvidia/spark/rapids/jni/ParseURITest.java +++ b/src/test/java/com/nvidia/spark/rapids/jni/ParseURITest.java @@ -124,6 +124,41 @@ void testQuery(String[] testData, String param) { } } + void testQuery(String[] testData, String[] params) { + String[] expectedQueryStrings = new String[testData.length]; + for (int i=0; i 0 && pair.substring(0, idx).equals(params[i])) { + subquery = pair.substring(idx + 1); + break; + } + } + } + expectedQueryStrings[i] = subquery; + } + try (ColumnVector v0 = ColumnVector.fromStrings(testData); + ColumnVector p0 = ColumnVector.fromStrings(params); + ColumnVector expectedQuery = ColumnVector.fromStrings(expectedQueryStrings); + ColumnVector queryResult = ParseURI.parseURIQueryWithColumn(v0, p0)) { + AssertUtils.assertColumnsAreEqual(expectedQuery, queryResult); + } + } + @Test void parseURISparkTest() { String[] testData = { @@ -180,11 +215,68 @@ void parseURISparkTest() { "userinfo@www.nvidia.com/path?query=1#Ref", "", null}; + + + String[] queries = { + "a", + "h", + // commented out until https://github.com/NVIDIA/spark-rapids/issues/10036 is fixed + //"object", + "object", + "a", + "h", + "a", + "f", + "g", + "a", + "a", + "f", + "g", + "a", + "a", + "b", + "a", + "", + "a", + "a", + "a", + "a", + "b", + "a", + "q", + "b", + "a", + "query", + "a", + "primekey_in", + "a", + "q", + "ExpertId", + "query", + "solutionId", + "f", + "param", + "", + "q", + "a", + "f", + "mnid=5080", + "f", + "a", + "param4", + "cloth", + "a", + "invalid", + "invalid", + "query", + "a", + "f"}; testProtocol(testData); testHost(testData); testQuery(testData); testQuery(testData, "query"); + testQuery(testData, queries); } @Test From 8cb6913b5c672c82b97da0908f8af3d977fca821 Mon Sep 17 00:00:00 2001 From: Jenkins Automation <70000568+nvauto@users.noreply.github.com> Date: Thu, 25 Jan 2024 11:28:15 +0800 Subject: [PATCH 116/127] Update submodule cudf to 5b1eef31ed4c5935285ef780dc74d35cea086b49 (#1732) Signed-off-by: spark-rapids automation <70000568+nvauto@users.noreply.github.com> --- thirdparty/cudf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/thirdparty/cudf b/thirdparty/cudf index f800f5a2fa..5b1eef31ed 160000 --- a/thirdparty/cudf +++ b/thirdparty/cudf @@ -1 +1 @@ -Subproject commit 
f800f5a2fa9a961699345e6febe740b4b8f4760e +Subproject commit 5b1eef31ed4c5935285ef780dc74d35cea086b49 From 95fcad879802613c21d0ed89d8656970b8869cad Mon Sep 17 00:00:00 2001 From: Jenkins Automation <70000568+nvauto@users.noreply.github.com> Date: Fri, 26 Jan 2024 06:06:39 +0800 Subject: [PATCH 117/127] Update submodule cudf to 0cd58fbec63d5e461b487e7e37aa9942ebe0f116 (#1736) Signed-off-by: spark-rapids automation <70000568+nvauto@users.noreply.github.com> --- thirdparty/cudf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/thirdparty/cudf b/thirdparty/cudf index 5b1eef31ed..0cd58fbec6 160000 --- a/thirdparty/cudf +++ b/thirdparty/cudf @@ -1 +1 @@ -Subproject commit 5b1eef31ed4c5935285ef780dc74d35cea086b49 +Subproject commit 0cd58fbec63d5e461b487e7e37aa9942ebe0f116 From cd0b85761f2a0db353798a542d4b4589856fdfa0 Mon Sep 17 00:00:00 2001 From: Jenkins Automation <70000568+nvauto@users.noreply.github.com> Date: Fri, 26 Jan 2024 11:02:49 +0800 Subject: [PATCH 118/127] Update submodule cudf to 821f4dea107db6a51fcbffff997fa6844ab5565f (#1738) Signed-off-by: spark-rapids automation <70000568+nvauto@users.noreply.github.com> --- thirdparty/cudf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/thirdparty/cudf b/thirdparty/cudf index 0cd58fbec6..821f4dea10 160000 --- a/thirdparty/cudf +++ b/thirdparty/cudf @@ -1 +1 @@ -Subproject commit 0cd58fbec63d5e461b487e7e37aa9942ebe0f116 +Subproject commit 821f4dea107db6a51fcbffff997fa6844ab5565f From 9c34fae6e919d5d4ce76d7384b086b9ba69648ea Mon Sep 17 00:00:00 2001 From: Haoyang Li Date: Fri, 26 Jan 2024 20:57:28 +0800 Subject: [PATCH 119/127] Fix a parse_uri query bug (#1740) * Fix a parse_uri query bug Signed-off-by: Haoyang Li * Add comments Signed-off-by: Haoyang Li --------- Signed-off-by: Haoyang Li --- src/main/cpp/src/parse_uri.cu | 9 +++++---- .../com/nvidia/spark/rapids/jni/ParseURITest.java | 13 ++++++++++--- 2 files changed, 15 insertions(+), 7 deletions(-) diff --git a/src/main/cpp/src/parse_uri.cu b/src/main/cpp/src/parse_uri.cu index cd64c539ef..13c4050404 100644 --- a/src/main/cpp/src/parse_uri.cu +++ b/src/main/cpp/src/parse_uri.cu @@ -500,9 +500,10 @@ __device__ std::pair find_query_part(string_view haystack, st auto h = haystack.data(); auto const end_h = haystack.data() + find_length; auto n = needle.data(); + bool match = false; while (h < end_h) { - bool match = true; - for (size_type jdx = 0; match && (jdx < n_bytes); ++jdx) { + match = false; // initialize to false to prevent empty query key + for (size_type jdx = 0; (jdx == 0 || match) && (jdx < n_bytes); ++jdx) { match = (h[jdx] == n[jdx]); } if (match) { match = n_bytes < haystack.size_bytes() && h[n_bytes] == '='; } @@ -519,8 +520,8 @@ __device__ std::pair find_query_part(string_view haystack, st h++; } - // if h is past the end of the haystack, no match. 
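The initialization change above closes an edge case: with the old `bool match = true;`, an empty match key (`n_bytes == 0`) skipped the comparison loop entirely and fell through as a successful match, so a URI like `vote.php?=50` returned `50` for an empty key. Starting `match` at `false` and forcing the first loop iteration with `jdx == 0` keeps nonempty keys working while rejecting empty ones. A distilled Java sketch of the corrected predicate (illustrative only, not code from this repository):

    class EmptyKeyFix {
      // A key matches only when it is nonempty, equals the whole parameter name,
      // and is immediately followed by '='.
      static boolean matches(String pair, String key) {
        return !key.isEmpty() && pair.length() > key.length()
            && pair.startsWith(key) && pair.charAt(key.length()) == '=';
      }

      public static void main(String[] args) {
        System.out.println(matches("=50", ""));            // false (effectively matched before the fix)
        System.out.println(matches("param0=5", "param0")); // true
      }
    }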
From 9d50ce52e827fde86bae03ba1ab7b584e3a717dc Mon Sep 17 00:00:00 2001
From: Haoyang Li <haoyangl@nvidia.com>
Date: Mon, 29 Jan 2024 15:13:47 +0800
Subject: [PATCH 120/127] Fix build warnings of chars and make_strings_column
 (#1725)

Signed-off-by: Haoyang Li <haoyangl@nvidia.com>
---
 src/main/cpp/benchmarks/common/generate_input.cu | 2 +-
 src/main/cpp/src/cast_decimal_to_string.cu       | 2 +-
 src/main/cpp/src/cast_float_to_string.cu         | 2 +-
 src/main/cpp/src/cast_string.cu                  | 6 +++---
 src/main/cpp/src/format_float.cu                 | 2 +-
 src/main/cpp/src/map_utils.cu                    | 7 +++++--
 6 files changed, 12 insertions(+), 9 deletions(-)

diff --git a/src/main/cpp/benchmarks/common/generate_input.cu b/src/main/cpp/benchmarks/common/generate_input.cu
index 3b1376c89a..75f0a8fca0 100644
--- a/src/main/cpp/benchmarks/common/generate_input.cu
+++ b/src/main/cpp/benchmarks/common/generate_input.cu
@@ -520,7 +520,7 @@ std::unique_ptr<cudf::column> create_random_utf8_string_column(data_profile cons
   return cudf::make_strings_column(
     num_rows,
     std::move(offsets),
-    std::move(chars),
+    std::move(chars->release().data.release()[0]),
     profile.get_null_frequency().has_value() ? std::move(result_bitmask) : rmm::device_buffer{},
     null_count);
 }
diff --git a/src/main/cpp/src/cast_decimal_to_string.cu b/src/main/cpp/src/cast_decimal_to_string.cu
index 0cd2713a2f..01fc5c5a92 100644
--- a/src/main/cpp/src/cast_decimal_to_string.cu
+++ b/src/main/cpp/src/cast_decimal_to_string.cu
@@ -191,7 +191,7 @@ struct dispatch_decimal_to_non_ansi_string_fn {
 
     return make_strings_column(input.size(),
                                std::move(offsets),
-                               std::move(chars),
+                               std::move(chars->release().data.release()[0]),
                                input.null_count(),
                                cudf::detail::copy_bitmask(input, stream, mr));
   }
diff --git a/src/main/cpp/src/cast_float_to_string.cu b/src/main/cpp/src/cast_float_to_string.cu
index 6fc4d20f79..c0e0875914 100644
--- a/src/main/cpp/src/cast_float_to_string.cu
+++ b/src/main/cpp/src/cast_float_to_string.cu
@@ -88,7 +88,7 @@ struct dispatch_float_to_string_fn {
 
     return make_strings_column(strings_count,
                                std::move(offsets),
-                               std::move(chars),
+                               std::move(chars->release().data.release()[0]),
                                floats.null_count(),
                                cudf::detail::copy_bitmask(floats, stream, mr));
   }
diff --git a/src/main/cpp/src/cast_string.cu b/src/main/cpp/src/cast_string.cu
index 59a27a59b7..c2a8190062 100644
--- a/src/main/cpp/src/cast_string.cu
+++ b/src/main/cpp/src/cast_string.cu
@@ -624,7 +624,7 @@ void validate_ansi_column(column_view const& col,
     dest.resize(string_bounds[1] - string_bounds[0]);
 
     cudaMemcpyAsync(dest.data(),
-                    &source_col.chars().data()[string_bounds[0]],
+                    &source_col.chars_begin(stream)[string_bounds[0]],
                     string_bounds[1] - string_bounds[0],
                     cudaMemcpyDeviceToHost,
                     stream.value());
@@ -667,7 +667,7 @@ struct string_to_integer_impl {
     detail::string_to_integer_kernel<<>>(
       data.data(),
       null_mask.data(),
-      string_col.chars().data(),
+      string_col.chars_begin(stream),
       string_col.offsets().data(),
       string_col.null_mask(),
       string_col.size(),
@@ -736,7 +736,7 @@ struct string_to_decimal_impl {
     detail::string_to_decimal_kernel<<>>(
       data.data(),
       null_mask.data(),
-      string_col.chars().data(),
+      string_col.chars_begin(stream),
       string_col.offsets().data(),
       string_col.null_mask(),
       string_col.size(),
diff --git a/src/main/cpp/src/format_float.cu b/src/main/cpp/src/format_float.cu
index d9ecbe8206..3052d334aa 100644
--- a/src/main/cpp/src/format_float.cu
+++ b/src/main/cpp/src/format_float.cu
@@ -89,7 +89,7 @@ struct dispatch_format_float_fn {
 
     return cudf::make_strings_column(strings_count,
                                      std::move(offsets),
-                                     std::move(chars),
+                                     std::move(chars->release().data.release()[0]),
                                      floats.null_count(),
                                      cudf::detail::copy_bitmask(floats, stream, mr));
   }
diff --git a/src/main/cpp/src/map_utils.cu b/src/main/cpp/src/map_utils.cu
index 002dadb0e3..9b420e201f 100644
--- a/src/main/cpp/src/map_utils.cu
+++ b/src/main/cpp/src/map_utils.cu
@@ -575,8 +575,11 @@ std::unique_ptr<cudf::column> extract_keys_or_values(
   auto children = cudf::strings::detail::make_strings_children(
     substring_fn{unified_json_buff, extract_ranges}, num_extract, stream, mr);
 
-  return cudf::make_strings_column(
-    num_extract, std::move(children.first), std::move(children.second), 0, rmm::device_buffer{});
+  return cudf::make_strings_column(num_extract,
+                                   std::move(children.first),
+                                   std::move(children.second->release().data.release()[0]),
+                                   0,
+                                   rmm::device_buffer{});
 }
 
 // Compute the offsets for the final lists of Struct.
From c776bd954e53fea65a513bdda8456b39c84400c4 Mon Sep 17 00:00:00 2001
From: Haoyang Li <haoyangl@nvidia.com>
Date: Tue, 30 Jan 2024 03:37:28 +0800
Subject: [PATCH 121/127] Fix an out-of-bounds bug in parse_url query (#1746)

---
 src/main/cpp/src/parse_uri.cu                      | 64 +++++++++----------
 .../nvidia/spark/rapids/jni/ParseURITest.java      |  4 +-
 2 files changed, 32 insertions(+), 36 deletions(-)

diff --git a/src/main/cpp/src/parse_uri.cu b/src/main/cpp/src/parse_uri.cu
index 13c4050404..0e6ea2690d 100644
--- a/src/main/cpp/src/parse_uri.cu
+++ b/src/main/cpp/src/parse_uri.cu
@@ -494,48 +494,42 @@ bool __device__ validate_fragment(string_view fragment)
 
 __device__ std::pair<string_view, bool> find_query_part(string_view haystack, string_view needle)
 {
-  auto const n_bytes     = needle.size_bytes();
-  auto const find_length = haystack.size_bytes() - n_bytes + 1;
-
-  auto h           = haystack.data();
-  auto const end_h = haystack.data() + find_length;
-  auto n           = needle.data();
-  bool match       = false;
-  while (h < end_h) {
-    match = false;  // initialize to false to prevent empty query key
-    for (size_type jdx = 0; (jdx == 0 || match) && (jdx < n_bytes); ++jdx) {
-      match = (h[jdx] == n[jdx]);
+  auto const n_bytes = needle.size_bytes();
+  auto h             = haystack.data();
+  auto const h_end   = h + haystack.size_bytes();
+  auto n             = needle.data();
+
+  // stop matching early after it can no longer contain the string we are searching for
+  while (h + n_bytes < h_end) {
+    bool match_needle = true;
+    for (size_type jdx = 0; jdx < n_bytes; ++jdx) {
+      match_needle = (h[jdx] == n[jdx]);
+      if (!match_needle) { break; }
     }
-    if (match) { match = n_bytes < haystack.size_bytes() && h[n_bytes] == '='; }
-    if (match) {
-      // we don't care about the matched part, we want the string data after that.
-      h += n_bytes;
-      break;
-    } else {
-      // skip to the next param, which is after a &.
-      while (h < end_h && *h != '&') {
+
+    if (match_needle && h[n_bytes] == '=') {
+      // we don't care about the matched part, we want the string data after '='
+      h += n_bytes + 1;
+
+      // rest of string until end or until '&' is query match
+      int match_len = 0;
+      auto start = h;
+      while (h < h_end && *h != '&') {
+        match_len++;
         h++;
       }
-    }
-    h++;
-  }
-
-  // if not match or no value, return nothing
-  if (!match || *h != '=') { return {{}, false}; }
 
-  // skip over the =
-  h++;
+      return {{start, match_len}, true};
+    }
 
-  // rest of string until end or until '&' is query match
-  auto const bytes_left = haystack.size_bytes() - (h - haystack.data());
-  int match_len = 0;
-  auto start = h;
-  while (*h != '&' && match_len < bytes_left) {
-    ++match_len;
-    ++h;
+    // not match, skip to the next param if possible, which is after a &.
+    while (h + n_bytes < h_end && *h != '&') {
+      h++;
+    }
+    h++;  // skip over the & if has, or point to h_end +1
   }
 
-  return {{start, match_len}, true};
+  return {{}, false};
 }
 
 uri_parts __device__ validate_uri(const char* str,
diff --git a/src/test/java/com/nvidia/spark/rapids/jni/ParseURITest.java b/src/test/java/com/nvidia/spark/rapids/jni/ParseURITest.java
index 8f9fcfd903..ffe7e9e946 100644
--- a/src/test/java/com/nvidia/spark/rapids/jni/ParseURITest.java
+++ b/src/test/java/com/nvidia/spark/rapids/jni/ParseURITest.java
@@ -143,7 +143,7 @@ void testQuery(String[] testData, String[] params) {
         String[] pairs = query.split("&");
         for (String pair : pairs) {
           int idx = pair.indexOf("=");
-          if (idx > 0 && pair.substring(0, idx).equals(params[i])) {
+          if (idx >= 0 && pair.substring(0, idx).equals(params[i])) {
            subquery = pair.substring(idx + 1);
             break;
           }
@@ -218,6 +218,7 @@ void parseURISparkTest() {
       "https://www.nvidia.com/?cat=12",
       "www.nvidia.com/vote.php?pid=50",
       "https://www.nvidia.com/vote.php?=50",
+      "https://www.nvidia.com/vote.php?query=50"
       };
 
     String[] queries = {
@@ -276,6 +277,7 @@ void parseURISparkTest() {
       "f",
       "query",
       "query",
+      "",
       ""
       };
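Note: the rewrite above requires h + n_bytes < h_end before comparing, so the
look-ahead at h[n_bytes] can no longer read past the end of the string when
only a short tail of the query remains. A sketch of the expected positive case
(class name and values are illustrative only):

    import ai.rapids.cudf.ColumnVector;
    import com.nvidia.spark.rapids.jni.ParseURI;

    public class QueryBoundsExample {
      public static void main(String[] args) {
        try (ColumnVector uris = ColumnVector.fromStrings("https://www.nvidia.com/vote.php?query=50");
             ColumnVector params = ColumnVector.fromStrings("query");
             ColumnVector result = ParseURI.parseURIQueryWithColumn(uris, params)) {
          // result row 0 is "50": the key "query" matches, and the value scan
          // stops at h_end without touching memory past the string
        }
      }
    }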
From f0c63a84cdb8ba0f3b0e633cd804dd049fb04309 Mon Sep 17 00:00:00 2001
From: Jenkins Automation <70000568+nvauto@users.noreply.github.com>
Date: Tue, 30 Jan 2024 07:02:31 +0800
Subject: [PATCH 122/127] Update submodule cudf to
 fc2b9771f17644243817a339e218360aa97a1a79 (#1751)

Signed-off-by: spark-rapids automation <70000568+nvauto@users.noreply.github.com>
---
 thirdparty/cudf | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/thirdparty/cudf b/thirdparty/cudf
index 821f4dea10..fc2b9771f1 160000
--- a/thirdparty/cudf
+++ b/thirdparty/cudf
@@ -1 +1 @@
-Subproject commit 821f4dea107db6a51fcbffff997fa6844ab5565f
+Subproject commit fc2b9771f17644243817a339e218360aa97a1a79

From d44c477e0276f2032e5c9ec341cc4f452f2cc28f Mon Sep 17 00:00:00 2001
From: Tim Liu <timl@nvidia.com>
Date: Fri, 2 Feb 2024 16:32:51 +0800
Subject: [PATCH 123/127] Change version to 24.02.0

Signed-off-by: Tim Liu <timl@nvidia.com>
---
 pom.xml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pom.xml b/pom.xml
index 4f2d19e45c..5bedec16b6 100644
--- a/pom.xml
+++ b/pom.xml
@@ -21,7 +21,7 @@
 
   <groupId>com.nvidia</groupId>
   <artifactId>spark-rapids-jni</artifactId>
-  <version>24.02.0-SNAPSHOT</version>
+  <version>24.02.0</version>
   <packaging>jar</packaging>
   <name>RAPIDS Accelerator JNI for Apache Spark</name>

From f53ea02cd3f44544d57da2314feeb7f74529ef24 Mon Sep 17 00:00:00 2001
From: Tim Liu <timl@nvidia.com>
Date: Fri, 2 Feb 2024 17:10:06 +0800
Subject: [PATCH 124/127] Update copyright

Signed-off-by: Tim Liu <timl@nvidia.com>
---
 pom.xml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pom.xml b/pom.xml
index 5bedec16b6..f9b48e6c40 100644
--- a/pom.xml
+++ b/pom.xml
@@ -1,6 +1,6 @@