From 4b0a634e51f64c68f107683d82ebfea87290efaf Mon Sep 17 00:00:00 2001
From: Matthew Murray <41342305+Matt711@users.noreply.github.com>
Date: Tue, 29 Oct 2024 10:42:07 -0400
Subject: [PATCH 01/41] Auto assign PR to author (#16969)

Most PRs remain unassigned, so this PR adds a workflow that auto-assigns each PR to its author. This should help keep our project boards up-to-date.

Authors:
  - Matthew Murray (https://github.com/Matt711)

Approvers:
  - Vyas Ramasubramani (https://github.com/vyasr)

URL: https://github.com/rapidsai/cudf/pull/16969
---
 .github/workflows/auto-assign.yml | 17 +++++++++++++++++
 .github/workflows/labeler.yml     |  1 +
 2 files changed, 18 insertions(+)
 create mode 100644 .github/workflows/auto-assign.yml

diff --git a/.github/workflows/auto-assign.yml b/.github/workflows/auto-assign.yml
new file mode 100644
index 00000000000..673bebd4ecc
--- /dev/null
+++ b/.github/workflows/auto-assign.yml
@@ -0,0 +1,17 @@
+name: "Auto Assign PR"
+
+on:
+  pull_request_target:
+    types:
+      - opened
+      - reopened
+      - synchronize
+
+jobs:
+  add_assignees:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions-ecosystem/action-add-assignees@v1
+        with:
+          repo_token: "${{ secrets.GITHUB_TOKEN }}"
+          assignees: ${{ github.actor }}
diff --git a/.github/workflows/labeler.yml b/.github/workflows/labeler.yml
index 31e78f82a62..f5cb71bfc14 100644
--- a/.github/workflows/labeler.yml
+++ b/.github/workflows/labeler.yml
@@ -1,4 +1,5 @@
 name: "Pull Request Labeler"
+
 on:
 - pull_request_target
 

From 3775f7b9f6509bd0f2f75c46edb60abf2522de86 Mon Sep 17 00:00:00 2001
From: Basit Ayantunde <rlamarrr@gmail.com>
Date: Tue, 29 Oct 2024 14:49:52 +0000
Subject: [PATCH 02/41] Fixed unused attribute compilation error for GCC 13
 (#17188)

With `decltype(&pclose)` as the deleter type of the `unique_ptr`, GCC makes the signature inherit the attributes of `pclose`. The compiler then ignores these attributes, because they do not apply in that context, and emits a warning; since we have `-Werror` on for ignored attributes, the build fails.
This happens on gcc 13.2.0.

Authors:
  - Basit Ayantunde (https://github.com/lamarrr)

Approvers:
  - David Wendt (https://github.com/davidwendt)
  - Paul Mattione (https://github.com/pmattione-nvidia)
  - Shruti Shivakumar (https://github.com/shrshi)

URL: https://github.com/rapidsai/cudf/pull/17188
---
 cpp/benchmarks/io/cuio_common.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cpp/benchmarks/io/cuio_common.cpp b/cpp/benchmarks/io/cuio_common.cpp
index fe24fb58728..45b46005c47 100644
--- a/cpp/benchmarks/io/cuio_common.cpp
+++ b/cpp/benchmarks/io/cuio_common.cpp
@@ -186,7 +186,7 @@ std::string exec_cmd(std::string_view cmd)
   std::fflush(nullptr);
   // Switch stderr and stdout to only capture stderr
   auto const redirected_cmd = std::string{"( "}.append(cmd).append(" 3>&2 2>&1 1>&3) 2>/dev/null");
-  std::unique_ptr<FILE, decltype(&pclose)> pipe(popen(redirected_cmd.c_str(), "r"), pclose);
+  std::unique_ptr<FILE, int (*)(FILE*)> pipe(popen(redirected_cmd.c_str(), "r"), pclose);
   CUDF_EXPECTS(pipe != nullptr, "popen() failed");
 
   std::array<char, 128> buffer;

From ddfb2848d6b7bb3cd03b8377f349f401030f558c Mon Sep 17 00:00:00 2001
From: Nghia Truong <7416935+ttnghia@users.noreply.github.com>
Date: Tue, 29 Oct 2024 09:51:19 -0700
Subject: [PATCH 03/41] Support storing `precision` of decimal types in
 `Schema` class (#17176)

In Spark, the `DecimalType` has a specific number of digits (its precision) to represent the numbers. However, when creating a data Schema, only the type and name of each column are stored, so that precision information is lost. As a result, it is difficult to reconstruct the original decimal types from cudf's `Schema` instance.

This PR adds a `precision` member variable to the `Schema` class in cudf Java, allowing it to store the precision number of the original decimal column.

Partially contributes to https://github.com/NVIDIA/spark-rapids/issues/11560.

Authors:
  - Nghia Truong (https://github.com/ttnghia)

Approvers:
  - Robert (Bobby) Evans (https://github.com/revans2)

URL: https://github.com/rapidsai/cudf/pull/17176
---
 java/src/main/java/ai/rapids/cudf/Schema.java | 77 +++++++++++++++++--
 1 file changed, 70 insertions(+), 7 deletions(-)

diff --git a/java/src/main/java/ai/rapids/cudf/Schema.java b/java/src/main/java/ai/rapids/cudf/Schema.java
index 76b2799aad6..6da591d659f 100644
--- a/java/src/main/java/ai/rapids/cudf/Schema.java
+++ b/java/src/main/java/ai/rapids/cudf/Schema.java
@@ -29,26 +29,52 @@ public class Schema {
   public static final Schema INFERRED = new Schema();
 
   private final DType topLevelType;
+
+  /**
+   * Default value for precision value, when it is not specified or the column type is not decimal.
+   */
+  private static final int UNKNOWN_PRECISION = -1;
+
+  /**
+  * Store precision for the top level column, only applicable if the column is a decimal type.
+  * <p/>
+  * This variable is not designed to be used by any libcudf's APIs since libcudf does not support
+  * precisions for fixed point numbers.
+  * Instead, it is used only to pass down the precision values from Spark's DecimalType to the
+  * JNI level, where some JNI functions require these values to perform their operations.
+  */
+  private final int topLevelPrecision;
+
   private final List<String> childNames;
   private final List<Schema> childSchemas;
   private boolean flattened = false;
   private String[] flattenedNames;
   private DType[] flattenedTypes;
+  private int[] flattenedPrecisions;
   private int[] flattenedCounts;
 
   private Schema(DType topLevelType,
+                 int topLevelPrecision,
                  List<String> childNames,
                  List<Schema> childSchemas) {
     this.topLevelType = topLevelType;
+    this.topLevelPrecision = topLevelPrecision;
     this.childNames = childNames;
     this.childSchemas = childSchemas;
   }
 
+  private Schema(DType topLevelType,
+                 List<String> childNames,
+                 List<Schema> childSchemas) {
+    this(topLevelType, UNKNOWN_PRECISION, childNames, childSchemas);
+  }
+
   /**
    * Inferred schema.
    */
   private Schema() {
     topLevelType = null;
+    topLevelPrecision = UNKNOWN_PRECISION;
     childNames = null;
     childSchemas = null;
   }
@@ -104,14 +130,17 @@ private void flattenIfNeeded() {
       if (flatLen == 0) {
         flattenedNames = null;
         flattenedTypes = null;
+        flattenedPrecisions = null;
         flattenedCounts = null;
       } else {
         String[] names = new String[flatLen];
         DType[] types = new DType[flatLen];
+        int[] precisions = new int[flatLen];
         int[] counts = new int[flatLen];
-        collectFlattened(names, types, counts, 0);
+        collectFlattened(names, types, precisions, counts, 0);
         flattenedNames = names;
         flattenedTypes = types;
+        flattenedPrecisions = precisions;
         flattenedCounts = counts;
       }
       flattened = true;
@@ -128,19 +157,20 @@ private int flattenedLength(int startingLength) {
     return startingLength;
   }
 
-  private int collectFlattened(String[] names, DType[] types, int[] counts, int offset) {
+  private int collectFlattened(String[] names, DType[] types, int[] precisions, int[] counts, int offset) {
     if (childSchemas != null) {
       for (int i = 0; i < childSchemas.size(); i++) {
         Schema child = childSchemas.get(i);
         names[offset] = childNames.get(i);
         types[offset] = child.topLevelType;
+        precisions[offset] = child.topLevelPrecision;
         if (child.childNames != null) {
           counts[offset] = child.childNames.size();
         } else {
           counts[offset] = 0;
         }
         offset++;
-        offset = this.childSchemas.get(i).collectFlattened(names, types, counts, offset);
+        offset = this.childSchemas.get(i).collectFlattened(names, types, precisions, counts, offset);
       }
     }
     return offset;
@@ -226,6 +256,22 @@ public int[] getFlattenedTypeScales() {
     return ret;
   }
 
+  /**
+   * Get decimal precisions of the columns' types flattened from all levels in schema by
+   * depth-first traversal.
+   * <p/>
+   * This is used to pass down the decimal precisions from Spark to only the JNI layer, where
+   * some JNI functions require precision values to perform their operations.
+   * Decimal precisions should not be consumed by any libcudf's APIs since libcudf does not
+   * support precisions for fixed point numbers.
+   *
+   * @return An array containing decimal precision of all columns in schema.
+   */
+  public int[] getFlattenedDecimalPrecisions() {
+    flattenIfNeeded();
+    return flattenedPrecisions;
+  }
+
   /**
    * Get the types of the columns in schema flattened from all levels by depth-first traversal.
    * @return An array containing types of all columns in schema.
@@ -307,11 +353,13 @@ public HostColumnVector.DataType asHostDataType() {
 
   public static class Builder {
     private final DType topLevelType;
+    private final int topLevelPrecision;
     private final List<String> names;
     private final List<Builder> types;
 
-    private Builder(DType topLevelType) {
+    private Builder(DType topLevelType, int topLevelPrecision) {
       this.topLevelType = topLevelType;
+      this.topLevelPrecision = topLevelPrecision;
       if (topLevelType == DType.STRUCT || topLevelType == DType.LIST) {
         // There can be children
         names = new ArrayList<>();
@@ -322,14 +370,19 @@ private Builder(DType topLevelType) {
       }
     }
 
+    private Builder(DType topLevelType) {
+      this(topLevelType, UNKNOWN_PRECISION);
+    }
+
     /**
      * Add a new column
      * @param type the type of column to add
      * @param name the name of the column to add (Ignored for list types)
+     * @param precision the decimal precision, only applicable for decimal types
      * @return the builder for the new column. This should really only be used when the type
      * passed in is a LIST or a STRUCT.
      */
-    public Builder addColumn(DType type, String name) {
+    public Builder addColumn(DType type, String name, int precision) {
       if (names == null) {
         throw new IllegalStateException("A column of type " + topLevelType +
             " cannot have children");
@@ -340,21 +393,31 @@ public Builder addColumn(DType type, String name) {
       if (names.contains(name)) {
         throw new IllegalStateException("Cannot add duplicate names to a schema");
       }
-      Builder ret = new Builder(type);
+      Builder ret = new Builder(type, precision);
       types.add(ret);
       names.add(name);
       return ret;
     }
 
+    public Builder addColumn(DType type, String name) {
+      return addColumn(type, name, UNKNOWN_PRECISION);
+    }
+
     /**
      * Adds a single column to the current schema. addColumn is preferred as it can be used
      * to support nested types.
      * @param type the type of the column.
      * @param name the name of the column.
+     * @param precision the decimal precision, only applicable for decimal types.
      * @return this for chaining.
      */
+    public Builder column(DType type, String name, int precision) {
+      addColumn(type, name, precision);
+      return this;
+    }
+
     public Builder column(DType type, String name) {
-      addColumn(type, name);
+      addColumn(type, name, UNKNOWN_PRECISION);
       return this;
     }
 

From 63b773e73a9f582e2cfa75ae04bcad8608e8f03a Mon Sep 17 00:00:00 2001
From: "Robert (Bobby) Evans" <bobby@apache.org>
Date: Tue, 29 Oct 2024 13:25:55 -0500
Subject: [PATCH 04/41] Add in new java API for raw host memory allocation
 (#17197)

This is the first in a series of patches intended to route all Java host memory allocations through the DefaultHostMemoryAllocator unless another allocator is explicitly provided.

This is to make it simpler to track/control host memory usage.

Authors:
  - Robert (Bobby) Evans (https://github.com/revans2)

Approvers:
  - Jason Lowe (https://github.com/jlowe)
  - Alessandro Bellina (https://github.com/abellina)

URL: https://github.com/rapidsai/cudf/pull/17197
---
 .../main/java/ai/rapids/cudf/HostMemoryBuffer.java   | 12 +++++++++++-
 1 file changed, 11 insertions(+), 1 deletion(-)

diff --git a/java/src/main/java/ai/rapids/cudf/HostMemoryBuffer.java b/java/src/main/java/ai/rapids/cudf/HostMemoryBuffer.java
index e4106574a19..d792459901c 100644
--- a/java/src/main/java/ai/rapids/cudf/HostMemoryBuffer.java
+++ b/java/src/main/java/ai/rapids/cudf/HostMemoryBuffer.java
@@ -1,6 +1,6 @@
 /*
  *
- *  Copyright (c) 2019-2020, NVIDIA CORPORATION.
+ *  Copyright (c) 2019-2024, NVIDIA CORPORATION.
  *
  *  Licensed under the Apache License, Version 2.0 (the "License");
  *  you may not use this file except in compliance with the License.
@@ -155,6 +155,16 @@ public static HostMemoryBuffer allocate(long bytes) {
     return allocate(bytes, defaultPreferPinned);
   }
 
+  /**
+   * Allocate host memory bypassing the default allocator. This is intended to only be used by other allocators.
+   * Pinned memory will not be used for these allocations.
+   * @param bytes size in bytes to allocate
+   * @return the newly created buffer
+   */
+  public static HostMemoryBuffer allocateRaw(long bytes) {
+    return new HostMemoryBuffer(UnsafeMemoryAccessor.allocate(bytes), bytes);
+  }
+
   /**
    * Create a host buffer that is memory-mapped to a file.
    * @param path path to the file to map into host memory

From 52d7e638af366a2384868c41a7ece889d7ada30e Mon Sep 17 00:00:00 2001
From: Basit Ayantunde <rlamarrr@gmail.com>
Date: Tue, 29 Oct 2024 19:59:13 +0000
Subject: [PATCH 05/41] Unified binary_ops and ast benchmarks parameter names
 (#17200)

This merge request unifies the parameter names of the AST and BINARYOP benchmark suites and makes it easier to perform parameter sweeps and compare the outputs of both benchmarks.

Authors:
  - Basit Ayantunde (https://github.com/lamarrr)

Approvers:
  - Nghia Truong (https://github.com/ttnghia)
  - David Wendt (https://github.com/davidwendt)

URL: https://github.com/rapidsai/cudf/pull/17200
---
 cpp/benchmarks/ast/transform.cpp              | 26 +++++++++----------
 cpp/benchmarks/binaryop/binaryop.cpp          | 26 +++++++++----------
 cpp/benchmarks/binaryop/compiled_binaryop.cpp | 12 ++++-----
 3 files changed, 32 insertions(+), 32 deletions(-)

diff --git a/cpp/benchmarks/ast/transform.cpp b/cpp/benchmarks/ast/transform.cpp
index 7fe61054a26..2533ea9611c 100644
--- a/cpp/benchmarks/ast/transform.cpp
+++ b/cpp/benchmarks/ast/transform.cpp
@@ -52,14 +52,14 @@ enum class TreeType {
 template <typename key_type, TreeType tree_type, bool reuse_columns, bool Nullable>
 static void BM_ast_transform(nvbench::state& state)
 {
-  auto const table_size  = static_cast<cudf::size_type>(state.get_int64("table_size"));
+  auto const num_rows    = static_cast<cudf::size_type>(state.get_int64("num_rows"));
   auto const tree_levels = static_cast<cudf::size_type>(state.get_int64("tree_levels"));
 
   // Create table data
   auto const n_cols = reuse_columns ? 1 : tree_levels + 1;
   auto const source_table =
     create_sequence_table(cycle_dtypes({cudf::type_to_id<key_type>()}, n_cols),
-                          row_count{table_size},
+                          row_count{num_rows},
                           Nullable ? std::optional<double>{0.5} : std::nullopt);
   auto table = source_table->view();
 
@@ -99,8 +99,8 @@ static void BM_ast_transform(nvbench::state& state)
   auto const& expression_tree_root = expressions.back();
 
   // Use the number of bytes read from global memory
-  state.add_global_memory_reads<key_type>(static_cast<size_t>(table_size) * (tree_levels + 1));
-  state.add_global_memory_writes<key_type>(table_size);
+  state.add_global_memory_reads<key_type>(static_cast<size_t>(num_rows) * (tree_levels + 1));
+  state.add_global_memory_writes<key_type>(num_rows);
 
   state.exec(nvbench::exec_tag::sync,
              [&](nvbench::launch&) { cudf::compute_column(table, expression_tree_root); });
@@ -109,15 +109,15 @@ static void BM_ast_transform(nvbench::state& state)
 template <cudf::ast::ast_operator cmp_op, cudf::ast::ast_operator reduce_op>
 static void BM_string_compare_ast_transform(nvbench::state& state)
 {
-  auto const string_width    = static_cast<cudf::size_type>(state.get_int64("string_width"));
-  auto const num_rows        = static_cast<cudf::size_type>(state.get_int64("num_rows"));
-  auto const num_comparisons = static_cast<cudf::size_type>(state.get_int64("num_comparisons"));
-  auto const hit_rate        = static_cast<cudf::size_type>(state.get_int64("hit_rate"));
+  auto const string_width = static_cast<cudf::size_type>(state.get_int64("string_width"));
+  auto const num_rows     = static_cast<cudf::size_type>(state.get_int64("num_rows"));
+  auto const tree_levels  = static_cast<cudf::size_type>(state.get_int64("tree_levels"));
+  auto const hit_rate     = static_cast<cudf::size_type>(state.get_int64("hit_rate"));
 
-  CUDF_EXPECTS(num_comparisons > 0, "benchmarks require 1 or more comparisons");
+  CUDF_EXPECTS(tree_levels > 0, "benchmarks require 1 or more comparisons");
 
   // Create table data
-  auto const num_cols = num_comparisons * 2;
+  auto const num_cols = tree_levels * 2;
   std::vector<std::unique_ptr<cudf::column>> columns;
   std::for_each(
     thrust::make_counting_iterator(0), thrust::make_counting_iterator(num_cols), [&](size_t) {
@@ -150,7 +150,7 @@ static void BM_string_compare_ast_transform(nvbench::state& state)
   expressions.emplace_back(cudf::ast::operation(cmp_op, column_refs[0], column_refs[1]));
 
   std::for_each(thrust::make_counting_iterator(1),
-                thrust::make_counting_iterator(num_comparisons),
+                thrust::make_counting_iterator(tree_levels),
                 [&](size_t idx) {
                   auto const& lhs = expressions.back();
                   auto const& rhs = expressions.emplace_back(
@@ -177,7 +177,7 @@ static void BM_string_compare_ast_transform(nvbench::state& state)
   NVBENCH_BENCH(name)                                                                      \
     .set_name(#name)                                                                       \
     .add_int64_axis("tree_levels", {1, 5, 10})                                             \
-    .add_int64_axis("table_size", {100'000, 1'000'000, 10'000'000, 100'000'000})
+    .add_int64_axis("num_rows", {100'000, 1'000'000, 10'000'000, 100'000'000})
 
 AST_TRANSFORM_BENCHMARK_DEFINE(
   ast_int32_imbalanced_unique, int32_t, TreeType::IMBALANCED_LEFT, false, false);
@@ -202,7 +202,7 @@ AST_TRANSFORM_BENCHMARK_DEFINE(
     .set_name(#name)                                                           \
     .add_int64_axis("string_width", {32, 64, 128, 256})                        \
     .add_int64_axis("num_rows", {32768, 262144, 2097152})                      \
-    .add_int64_axis("num_comparisons", {1, 2, 3, 4})                           \
+    .add_int64_axis("tree_levels", {1, 2, 3, 4})                               \
     .add_int64_axis("hit_rate", {50, 100})
 
 AST_STRING_COMPARE_TRANSFORM_BENCHMARK_DEFINE(ast_string_equal_logical_and,
diff --git a/cpp/benchmarks/binaryop/binaryop.cpp b/cpp/benchmarks/binaryop/binaryop.cpp
index 35e41c6c2a4..75c91d270a7 100644
--- a/cpp/benchmarks/binaryop/binaryop.cpp
+++ b/cpp/benchmarks/binaryop/binaryop.cpp
@@ -40,18 +40,18 @@ enum class TreeType {
 template <typename key_type, TreeType tree_type, bool reuse_columns>
 static void BM_binaryop_transform(nvbench::state& state)
 {
-  auto const table_size{static_cast<cudf::size_type>(state.get_int64("table_size"))};
+  auto const num_rows{static_cast<cudf::size_type>(state.get_int64("num_rows"))};
   auto const tree_levels{static_cast<cudf::size_type>(state.get_int64("tree_levels"))};
 
   // Create table data
   auto const n_cols       = reuse_columns ? 1 : tree_levels + 1;
   auto const source_table = create_sequence_table(
-    cycle_dtypes({cudf::type_to_id<key_type>()}, n_cols), row_count{table_size});
+    cycle_dtypes({cudf::type_to_id<key_type>()}, n_cols), row_count{num_rows});
   cudf::table_view table{*source_table};
 
   // Use the number of bytes read from global memory
-  state.add_global_memory_reads<key_type>(static_cast<size_t>(table_size) * (tree_levels + 1));
-  state.add_global_memory_writes<key_type>(table_size);
+  state.add_global_memory_reads<key_type>(static_cast<size_t>(num_rows) * (tree_levels + 1));
+  state.add_global_memory_writes<key_type>(num_rows);
 
   state.exec(nvbench::exec_tag::sync, [&](nvbench::launch&) {
     // Execute tree that chains additions like (((a + b) + c) + d)
@@ -74,15 +74,15 @@ static void BM_binaryop_transform(nvbench::state& state)
 template <cudf::binary_operator cmp_op, cudf::binary_operator reduce_op>
 static void BM_string_compare_binaryop_transform(nvbench::state& state)
 {
-  auto const string_width    = static_cast<cudf::size_type>(state.get_int64("string_width"));
-  auto const num_rows        = static_cast<cudf::size_type>(state.get_int64("num_rows"));
-  auto const num_comparisons = static_cast<cudf::size_type>(state.get_int64("num_comparisons"));
-  auto const hit_rate        = static_cast<cudf::size_type>(state.get_int64("hit_rate"));
+  auto const string_width = static_cast<cudf::size_type>(state.get_int64("string_width"));
+  auto const num_rows     = static_cast<cudf::size_type>(state.get_int64("num_rows"));
+  auto const tree_levels  = static_cast<cudf::size_type>(state.get_int64("tree_levels"));
+  auto const hit_rate     = static_cast<cudf::size_type>(state.get_int64("hit_rate"));
 
-  CUDF_EXPECTS(num_comparisons > 0, "benchmarks require 1 or more comparisons");
+  CUDF_EXPECTS(tree_levels > 0, "benchmarks require 1 or more comparisons");
 
   // Create table data
-  auto const num_cols = num_comparisons * 2;
+  auto const num_cols = tree_levels * 2;
   std::vector<std::unique_ptr<cudf::column>> columns;
   std::for_each(
     thrust::make_counting_iterator(0), thrust::make_counting_iterator(num_cols), [&](size_t) {
@@ -113,7 +113,7 @@ static void BM_string_compare_binaryop_transform(nvbench::state& state)
       cudf::binary_operation(table.get_column(0), table.get_column(1), cmp_op, bool_type, stream);
     std::for_each(
       thrust::make_counting_iterator(1),
-      thrust::make_counting_iterator(num_comparisons),
+      thrust::make_counting_iterator(tree_levels),
       [&](size_t idx) {
         std::unique_ptr<cudf::column> comparison = cudf::binary_operation(
           table.get_column(idx * 2), table.get_column(idx * 2 + 1), cmp_op, bool_type, stream);
@@ -133,7 +133,7 @@ static void BM_string_compare_binaryop_transform(nvbench::state& state)
   }                                                                                   \
   NVBENCH_BENCH(name)                                                                 \
     .add_int64_axis("tree_levels", {1, 2, 5, 10})                                     \
-    .add_int64_axis("table_size", {100'000, 1'000'000, 10'000'000, 100'000'000})
+    .add_int64_axis("num_rows", {100'000, 1'000'000, 10'000'000, 100'000'000})
 
 BINARYOP_TRANSFORM_BENCHMARK_DEFINE(binaryop_int32_imbalanced_unique,
                                     int32_t,
@@ -158,7 +158,7 @@ BINARYOP_TRANSFORM_BENCHMARK_DEFINE(binaryop_double_imbalanced_unique,
     .set_name(#name)                                                                \
     .add_int64_axis("string_width", {32, 64, 128, 256})                             \
     .add_int64_axis("num_rows", {32768, 262144, 2097152})                           \
-    .add_int64_axis("num_comparisons", {1, 2, 3, 4})                                \
+    .add_int64_axis("tree_levels", {1, 2, 3, 4})                                    \
     .add_int64_axis("hit_rate", {50, 100})
 
 STRING_COMPARE_BINARYOP_TRANSFORM_BENCHMARK_DEFINE(string_compare_binaryop_transform,
diff --git a/cpp/benchmarks/binaryop/compiled_binaryop.cpp b/cpp/benchmarks/binaryop/compiled_binaryop.cpp
index cd3c3871a2e..426f44a4fa1 100644
--- a/cpp/benchmarks/binaryop/compiled_binaryop.cpp
+++ b/cpp/benchmarks/binaryop/compiled_binaryop.cpp
@@ -23,10 +23,10 @@
 template <typename TypeLhs, typename TypeRhs, typename TypeOut>
 void BM_compiled_binaryop(nvbench::state& state, cudf::binary_operator binop)
 {
-  auto const table_size = static_cast<cudf::size_type>(state.get_int64("table_size"));
+  auto const num_rows = static_cast<cudf::size_type>(state.get_int64("num_rows"));
 
   auto const source_table = create_random_table(
-    {cudf::type_to_id<TypeLhs>(), cudf::type_to_id<TypeRhs>()}, row_count{table_size});
+    {cudf::type_to_id<TypeLhs>(), cudf::type_to_id<TypeRhs>()}, row_count{num_rows});
 
   auto lhs = cudf::column_view(source_table->get_column(0));
   auto rhs = cudf::column_view(source_table->get_column(1));
@@ -37,9 +37,9 @@ void BM_compiled_binaryop(nvbench::state& state, cudf::binary_operator binop)
   cudf::binary_operation(lhs, rhs, binop, output_dtype);
 
   // use number of bytes read and written to global memory
-  state.add_global_memory_reads<TypeLhs>(table_size);
-  state.add_global_memory_reads<TypeRhs>(table_size);
-  state.add_global_memory_writes<TypeOut>(table_size);
+  state.add_global_memory_reads<TypeLhs>(num_rows);
+  state.add_global_memory_reads<TypeRhs>(num_rows);
+  state.add_global_memory_writes<TypeOut>(num_rows);
 
   state.exec(nvbench::exec_tag::sync,
              [&](nvbench::launch&) { cudf::binary_operation(lhs, rhs, binop, output_dtype); });
@@ -55,7 +55,7 @@ void BM_compiled_binaryop(nvbench::state& state, cudf::binary_operator binop)
   }                                                                           \
   NVBENCH_BENCH(name)                                                         \
     .set_name("compiled_binary_op_" BM_STRINGIFY(name))                       \
-    .add_int64_axis("table_size", {10'000, 100'000, 1'000'000, 10'000'000, 100'000'000})
+    .add_int64_axis("num_rows", {10'000, 100'000, 1'000'000, 10'000'000, 100'000'000})
 
 #define build_name(a, b, c, d) a##_##b##_##c##_##d
 

From 8d7b0d8bf0aebebde0a5036d2e51f5991ecbe63b Mon Sep 17 00:00:00 2001
From: Matthew Murray <41342305+Matt711@users.noreply.github.com>
Date: Tue, 29 Oct 2024 16:31:27 -0400
Subject: [PATCH 06/41] [BUG] Replace `repo_token` with `github_token` in Auto
 Assign PR GHA (#17203)

The Auto Assign GHA workflow fails with this [error](https://github.com/rapidsai/cudf/actions/runs/11580081781). This PR fixes this error.
xref #16969

Authors:
  - Matthew Murray (https://github.com/Matt711)

Approvers:
  - Vyas Ramasubramani (https://github.com/vyasr)

URL: https://github.com/rapidsai/cudf/pull/17203
---
 .github/workflows/auto-assign.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/auto-assign.yml b/.github/workflows/auto-assign.yml
index 673bebd4ecc..1bf4ac08b69 100644
--- a/.github/workflows/auto-assign.yml
+++ b/.github/workflows/auto-assign.yml
@@ -13,5 +13,5 @@ jobs:
     steps:
       - uses: actions-ecosystem/action-add-assignees@v1
         with:
-          repo_token: "${{ secrets.GITHUB_TOKEN }}"
+          github_token: "${{ secrets.GITHUB_TOKEN }}"
           assignees: ${{ github.actor }}

From eeb4d2780163794f4b705062e49dbdc3283ebce0 Mon Sep 17 00:00:00 2001
From: Paul Mattione <156858817+pmattione-nvidia@users.noreply.github.com>
Date: Tue, 29 Oct 2024 17:12:43 -0400
Subject: [PATCH 07/41] Parquet reader list microkernel (#16538)

This PR refactors fixed-width parquet list reader decoding into its own set of microkernels, templatizing the existing fixed-width microkernels. When skipping rows for lists, this also skips ahead in the decoding of the definition, repetition, and dictionary rle_streams. The list kernel uses 128 threads per block and 71 registers per thread, so I've changed the launch_bounds to enforce a minimum of 8 blocks per SM. This causes a small register spill but the benchmarks are still faster, as seen below:

DEVICE_BUFFER list benchmarks (decompress + decode, not bound by IO):
run_length 1,   cardinality 0,             no byte_limit: 24.7% faster
run_length 32, cardinality 1000,       no byte_limit: 18.3% faster
run_length 1,   cardinality 0,       500kb byte_limit: 57% faster
run_length 32, cardinality 1000, 500kb byte_limit: 53% faster

Compressed list of ints on hard drive: 5.5% faster
Sample real data on hard drive (many columns not lists): 0.5% faster

Authors:
  - Paul Mattione (https://github.com/pmattione-nvidia)

Approvers:
  - Vukasin Milovanovic (https://github.com/vuule)
  - https://github.com/nvdbaranec
  - Nghia Truong (https://github.com/ttnghia)

URL: https://github.com/rapidsai/cudf/pull/16538
---
 cpp/src/io/parquet/decode_fixed.cu | 585 ++++++++++++++++++++++++-----
 cpp/src/io/parquet/page_hdr.cu     |  17 +-
 cpp/src/io/parquet/parquet_gpu.hpp |  10 +
 cpp/src/io/parquet/reader_impl.cpp |  45 +++
 cpp/src/io/parquet/rle_stream.cuh  |  81 ++--
 5 files changed, 615 insertions(+), 123 deletions(-)

diff --git a/cpp/src/io/parquet/decode_fixed.cu b/cpp/src/io/parquet/decode_fixed.cu
index 4522ea7fe56..45380e6ea20 100644
--- a/cpp/src/io/parquet/decode_fixed.cu
+++ b/cpp/src/io/parquet/decode_fixed.cu
@@ -37,7 +37,14 @@ struct block_scan_results {
 };
 
 template <int decode_block_size>
-static __device__ void scan_block_exclusive_sum(int thread_bit, block_scan_results& results)
+using block_scan_temp_storage = int[decode_block_size / cudf::detail::warp_size];
+
+// Similar to CUB, must __syncthreads() after calling if reusing temp_storage
+template <int decode_block_size>
+__device__ inline static void scan_block_exclusive_sum(
+  int thread_bit,
+  block_scan_results& results,
+  block_scan_temp_storage<decode_block_size>& temp_storage)
 {
   int const t              = threadIdx.x;
   int const warp_index     = t / cudf::detail::warp_size;
@@ -45,15 +52,19 @@ static __device__ void scan_block_exclusive_sum(int thread_bit, block_scan_resul
   uint32_t const lane_mask = (uint32_t(1) << warp_lane) - 1;
 
   uint32_t warp_bits = ballot(thread_bit);
-  scan_block_exclusive_sum<decode_block_size>(warp_bits, warp_lane, warp_index, lane_mask, results);
+  scan_block_exclusive_sum<decode_block_size>(
+    warp_bits, warp_lane, warp_index, lane_mask, results, temp_storage);
 }
 
+// Similar to CUB, must __syncthreads() after calling if reusing temp_storage
 template <int decode_block_size>
-__device__ static void scan_block_exclusive_sum(uint32_t warp_bits,
-                                                int warp_lane,
-                                                int warp_index,
-                                                uint32_t lane_mask,
-                                                block_scan_results& results)
+__device__ static void scan_block_exclusive_sum(
+  uint32_t warp_bits,
+  int warp_lane,
+  int warp_index,
+  uint32_t lane_mask,
+  block_scan_results& results,
+  block_scan_temp_storage<decode_block_size>& temp_storage)
 {
   // Compute # warps
   constexpr int num_warps = decode_block_size / cudf::detail::warp_size;
@@ -64,49 +75,64 @@ __device__ static void scan_block_exclusive_sum(uint32_t warp_bits,
   results.thread_count_within_warp = __popc(results.warp_bits & lane_mask);
 
   // Share the warp counts amongst the block threads
-  __shared__ int warp_counts[num_warps];
-  if (warp_lane == 0) { warp_counts[warp_index] = results.warp_count; }
-  __syncthreads();
+  if (warp_lane == 0) { temp_storage[warp_index] = results.warp_count; }
+  __syncthreads();  // Sync to share counts between threads/warps
 
   // Compute block-wide results
   results.block_count               = 0;
   results.thread_count_within_block = results.thread_count_within_warp;
   for (int warp_idx = 0; warp_idx < num_warps; ++warp_idx) {
-    results.block_count += warp_counts[warp_idx];
-    if (warp_idx < warp_index) { results.thread_count_within_block += warp_counts[warp_idx]; }
+    results.block_count += temp_storage[warp_idx];
+    if (warp_idx < warp_index) { results.thread_count_within_block += temp_storage[warp_idx]; }
   }
 }
 
-template <int block_size, typename state_buf>
-__device__ inline void gpuDecodeFixedWidthValues(
+template <int block_size, bool has_lists_t, typename state_buf>
+__device__ void gpuDecodeFixedWidthValues(
   page_state_s* s, state_buf* const sb, int start, int end, int t)
 {
   constexpr int num_warps      = block_size / cudf::detail::warp_size;
   constexpr int max_batch_size = num_warps * cudf::detail::warp_size;
 
-  PageNestingDecodeInfo* nesting_info_base = s->nesting_info;
-  int const dtype                          = s->col.physical_type;
+  // nesting level that is storing actual leaf values
+  int const leaf_level_index = s->col.max_nesting_depth - 1;
+  auto const data_out        = s->nesting_info[leaf_level_index].data_out;
+
+  int const dtype          = s->col.physical_type;
+  uint32_t const dtype_len = s->dtype_len;
+
+  int const skipped_leaf_values = s->page.skipped_leaf_values;
 
   // decode values
   int pos = start;
   while (pos < end) {
     int const batch_size = min(max_batch_size, end - pos);
-
     int const target_pos = pos + batch_size;
-    int const src_pos    = pos + t;
+    int const thread_pos = pos + t;
 
-    // the position in the output column/buffer
-    int dst_pos = sb->nz_idx[rolling_index<state_buf::nz_buf_size>(src_pos)] - s->first_row;
+    // Index from value buffer (doesn't include nulls) to final array (has gaps for nulls)
+    int const dst_pos = [&]() {
+      int dst_pos = sb->nz_idx[rolling_index<state_buf::nz_buf_size>(thread_pos)];
+      if constexpr (!has_lists_t) { dst_pos -= s->first_row; }
+      return dst_pos;
+    }();
 
     // target_pos will always be properly bounded by num_rows, but dst_pos may be negative (values
     // before first_row) in the flat hierarchy case.
-    if (src_pos < target_pos && dst_pos >= 0) {
+    if (thread_pos < target_pos && dst_pos >= 0) {
       // nesting level that is storing actual leaf values
-      int const leaf_level_index = s->col.max_nesting_depth - 1;
 
-      uint32_t dtype_len = s->dtype_len;
-      void* dst =
-        nesting_info_base[leaf_level_index].data_out + static_cast<size_t>(dst_pos) * dtype_len;
+      // src_pos represents the logical row position we want to read from. But in the case of
+      // nested hierarchies (lists), there is no 1:1 mapping of rows to values. So src_pos
+      // has to take into account the # of values we have to skip in the page to get to the
+      // desired logical row.  For flat hierarchies, skipped_leaf_values will always be 0.
+      int const src_pos = [&]() {
+        if constexpr (has_lists_t) { return thread_pos + skipped_leaf_values; }
+        return thread_pos;
+      }();
+
+      void* const dst = data_out + (static_cast<size_t>(dst_pos) * dtype_len);
+
       if (s->col.logical_type.has_value() && s->col.logical_type->type == LogicalType::DECIMAL) {
         switch (dtype) {
           case INT32: gpuOutputFast(s, sb, src_pos, static_cast<uint32_t*>(dst)); break;
@@ -145,15 +171,15 @@ __device__ inline void gpuDecodeFixedWidthValues(
   }
 }
 
-template <int block_size, typename state_buf>
+template <int block_size, bool has_lists_t, typename state_buf>
 struct decode_fixed_width_values_func {
   __device__ inline void operator()(page_state_s* s, state_buf* const sb, int start, int end, int t)
   {
-    gpuDecodeFixedWidthValues<block_size, state_buf>(s, sb, start, end, t);
+    gpuDecodeFixedWidthValues<block_size, has_lists_t, state_buf>(s, sb, start, end, t);
   }
 };
 
-template <int block_size, typename state_buf>
+template <int block_size, bool has_lists_t, typename state_buf>
 __device__ inline void gpuDecodeFixedWidthSplitValues(
   page_state_s* s, state_buf* const sb, int start, int end, int t)
 {
@@ -161,10 +187,15 @@ __device__ inline void gpuDecodeFixedWidthSplitValues(
   constexpr int num_warps      = block_size / warp_size;
   constexpr int max_batch_size = num_warps * warp_size;
 
-  PageNestingDecodeInfo* nesting_info_base = s->nesting_info;
-  int const dtype                          = s->col.physical_type;
-  auto const data_len                      = thrust::distance(s->data_start, s->data_end);
-  auto const num_values                    = data_len / s->dtype_len_in;
+  // nesting level that is storing actual leaf values
+  int const leaf_level_index = s->col.max_nesting_depth - 1;
+  auto const data_out        = s->nesting_info[leaf_level_index].data_out;
+
+  int const dtype       = s->col.physical_type;
+  auto const data_len   = thrust::distance(s->data_start, s->data_end);
+  auto const num_values = data_len / s->dtype_len_in;
+
+  int const skipped_leaf_values = s->page.skipped_leaf_values;
 
   // decode values
   int pos = start;
@@ -172,21 +203,34 @@ __device__ inline void gpuDecodeFixedWidthSplitValues(
     int const batch_size = min(max_batch_size, end - pos);
 
     int const target_pos = pos + batch_size;
-    int const src_pos    = pos + t;
+    int const thread_pos = pos + t;
 
     // the position in the output column/buffer
-    int dst_pos = sb->nz_idx[rolling_index<state_buf::nz_buf_size>(src_pos)] - s->first_row;
+    // Index from value buffer (doesn't include nulls) to final array (has gaps for nulls)
+    int const dst_pos = [&]() {
+      int dst_pos = sb->nz_idx[rolling_index<state_buf::nz_buf_size>(thread_pos)];
+      if constexpr (!has_lists_t) { dst_pos -= s->first_row; }
+      return dst_pos;
+    }();
 
     // target_pos will always be properly bounded by num_rows, but dst_pos may be negative (values
     // before first_row) in the flat hierarchy case.
-    if (src_pos < target_pos && dst_pos >= 0) {
-      // nesting level that is storing actual leaf values
-      int const leaf_level_index = s->col.max_nesting_depth - 1;
+    if (thread_pos < target_pos && dst_pos >= 0) {
+      // src_pos represents the logical row position we want to read from. But in the case of
+      // nested hierarchies (lists), there is no 1:1 mapping of rows to values. So src_pos
+      // has to take into account the # of values we have to skip in the page to get to the
+      // desired logical row.  For flat hierarchies, skipped_leaf_values will always be 0.
+      int const src_pos = [&]() {
+        if constexpr (has_lists_t) {
+          return thread_pos + skipped_leaf_values;
+        } else {
+          return thread_pos;
+        }
+      }();
 
-      uint32_t dtype_len = s->dtype_len;
-      uint8_t const* src = s->data_start + src_pos;
-      uint8_t* dst =
-        nesting_info_base[leaf_level_index].data_out + static_cast<size_t>(dst_pos) * dtype_len;
+      uint32_t const dtype_len = s->dtype_len;
+      uint8_t const* const src = s->data_start + src_pos;
+      uint8_t* const dst       = data_out + static_cast<size_t>(dst_pos) * dtype_len;
       auto const is_decimal =
         s->col.logical_type.has_value() and s->col.logical_type->type == LogicalType::DECIMAL;
 
@@ -239,11 +283,11 @@ __device__ inline void gpuDecodeFixedWidthSplitValues(
   }
 }
 
-template <int block_size, typename state_buf>
+template <int block_size, bool has_lists_t, typename state_buf>
 struct decode_fixed_width_split_values_func {
   __device__ inline void operator()(page_state_s* s, state_buf* const sb, int start, int end, int t)
   {
-    gpuDecodeFixedWidthSplitValues<block_size, state_buf>(s, sb, start, end, t);
+    gpuDecodeFixedWidthSplitValues<block_size, has_lists_t, state_buf>(s, sb, start, end, t);
   }
 };
 
@@ -274,12 +318,14 @@ static __device__ int gpuUpdateValidityAndRowIndicesNested(
     int const batch_size = min(max_batch_size, capped_target_value_count - value_count);
 
     // definition level
-    int d = 1;
-    if (t >= batch_size) {
-      d = -1;
-    } else if (def) {
-      d = static_cast<int>(def[rolling_index<state_buf::nz_buf_size>(value_count + t)]);
-    }
+    int const d = [&]() {
+      if (t >= batch_size) {
+        return -1;
+      } else if (def) {
+        return static_cast<int>(def[rolling_index<state_buf::nz_buf_size>(value_count + t)]);
+      }
+      return 1;
+    }();
 
     int const thread_value_count = t;
     int const block_value_count  = batch_size;
@@ -340,6 +386,7 @@ static __device__ int gpuUpdateValidityAndRowIndicesNested(
         if (is_valid) {
           int const dst_pos = value_count + thread_value_count;
           int const src_pos = max_depth_valid_count + thread_valid_count;
+
           sb->nz_idx[rolling_index<state_buf::nz_buf_size>(src_pos)] = dst_pos;
         }
         // update stuff
@@ -396,16 +443,16 @@ static __device__ int gpuUpdateValidityAndRowIndicesFlat(
     int const in_row_bounds = (row_index >= row_index_lower_bound) && (row_index < last_row);
 
     // use definition level & row bounds to determine if is valid
-    int is_valid;
-    if (t >= batch_size) {
-      is_valid = 0;
-    } else if (def) {
-      int const def_level =
-        static_cast<int>(def[rolling_index<state_buf::nz_buf_size>(value_count + t)]);
-      is_valid = ((def_level > 0) && in_row_bounds) ? 1 : 0;
-    } else {
-      is_valid = in_row_bounds;
-    }
+    int const is_valid = [&]() {
+      if (t >= batch_size) {
+        return 0;
+      } else if (def) {
+        int const def_level =
+          static_cast<int>(def[rolling_index<state_buf::nz_buf_size>(value_count + t)]);
+        return ((def_level > 0) && in_row_bounds) ? 1 : 0;
+      }
+      return in_row_bounds;
+    }();
 
     // thread and block validity count
     using block_scan = cub::BlockScan<int, decode_block_size>;
@@ -447,8 +494,9 @@ static __device__ int gpuUpdateValidityAndRowIndicesFlat(
 
     // output offset
     if (is_valid) {
-      int const dst_pos                                          = value_count + thread_value_count;
-      int const src_pos                                          = valid_count + thread_valid_count;
+      int const dst_pos = value_count + thread_value_count;
+      int const src_pos = valid_count + thread_valid_count;
+
       sb->nz_idx[rolling_index<state_buf::nz_buf_size>(src_pos)] = dst_pos;
     }
 
@@ -460,7 +508,7 @@ static __device__ int gpuUpdateValidityAndRowIndicesFlat(
   if (t == 0) {
     // update valid value count for decoding and total # of values we've processed
     ni.valid_count       = valid_count;
-    ni.value_count       = value_count;  // TODO: remove? this is unused in the non-list path
+    ni.value_count       = value_count;
     s->nz_count          = valid_count;
     s->input_value_count = value_count;
     s->input_row_count   = value_count;
@@ -533,6 +581,239 @@ static __device__ int gpuUpdateValidityAndRowIndicesNonNullable(int32_t target_v
   return valid_count;
 }
 
+template <int decode_block_size, bool nullable, typename level_t, typename state_buf>
+static __device__ int gpuUpdateValidityAndRowIndicesLists(int32_t target_value_count,
+                                                          page_state_s* s,
+                                                          state_buf* sb,
+                                                          level_t const* const def,
+                                                          level_t const* const rep,
+                                                          int t)
+{
+  constexpr int num_warps      = decode_block_size / cudf::detail::warp_size;
+  constexpr int max_batch_size = num_warps * cudf::detail::warp_size;
+
+  // how many (input) values we've processed in the page so far, prior to this loop iteration
+  int value_count = s->input_value_count;
+
+  // how many rows we've processed in the page so far
+  int input_row_count = s->input_row_count;
+
+  // cap by last row so that we don't process any rows past what we want to output.
+  int const first_row = s->first_row;
+  int const last_row  = first_row + s->num_rows;
+
+  int const row_index_lower_bound = s->row_index_lower_bound;
+  int const max_depth             = s->col.max_nesting_depth - 1;
+  int max_depth_valid_count       = s->nesting_info[max_depth].valid_count;
+
+  int const warp_index     = t / cudf::detail::warp_size;
+  int const warp_lane      = t % cudf::detail::warp_size;
+  bool const is_first_lane = (warp_lane == 0);
+
+  __syncthreads();
+  __shared__ block_scan_temp_storage<decode_block_size> temp_storage;
+
+  while (value_count < target_value_count) {
+    bool const within_batch = value_count + t < target_value_count;
+
+    // get definition level, use repetition level to get start/end depth
+    // different for each thread, as each thread has a different r/d
+    auto const [def_level, start_depth, end_depth] = [&]() {
+      if (!within_batch) { return cuda::std::make_tuple(-1, -1, -1); }
+
+      int const level_index = rolling_index<state_buf::nz_buf_size>(value_count + t);
+      int const rep_level   = static_cast<int>(rep[level_index]);
+      int const start_depth = s->nesting_info[rep_level].start_depth;
+
+      if constexpr (!nullable) {
+        return cuda::std::make_tuple(-1, start_depth, max_depth);
+      } else {
+        if (def != nullptr) {
+          int const def_level = static_cast<int>(def[level_index]);
+          return cuda::std::make_tuple(
+            def_level, start_depth, s->nesting_info[def_level].end_depth);
+        } else {
+          return cuda::std::make_tuple(1, start_depth, max_depth);
+        }
+      }
+    }();
+
+    // Determine value count & row index
+    //  track (page-relative) row index for the thread so we can compare against input bounds
+    //  keep track of overall # of rows we've read.
+    int const is_new_row = start_depth == 0 ? 1 : 0;
+    int num_prior_new_rows, total_num_new_rows;
+    {
+      block_scan_results new_row_scan_results;
+      scan_block_exclusive_sum<decode_block_size>(is_new_row, new_row_scan_results, temp_storage);
+      __syncthreads();
+      num_prior_new_rows = new_row_scan_results.thread_count_within_block;
+      total_num_new_rows = new_row_scan_results.block_count;
+    }
+
+    int const row_index = input_row_count + ((num_prior_new_rows + is_new_row) - 1);
+    input_row_count += total_num_new_rows;
+    int const in_row_bounds = (row_index >= row_index_lower_bound) && (row_index < last_row);
+
+    // VALUE COUNT:
+    // in_nesting_bounds: if at a nesting level where we need to add value indices
+    // the bounds: from current rep to the rep AT the def depth
+    int in_nesting_bounds = ((0 >= start_depth && 0 <= end_depth) && in_row_bounds) ? 1 : 0;
+    int thread_value_count_within_warp, warp_value_count, thread_value_count, block_value_count;
+    {
+      block_scan_results value_count_scan_results;
+      scan_block_exclusive_sum<decode_block_size>(
+        in_nesting_bounds, value_count_scan_results, temp_storage);
+      __syncthreads();
+
+      thread_value_count_within_warp = value_count_scan_results.thread_count_within_warp;
+      warp_value_count               = value_count_scan_results.warp_count;
+      thread_value_count             = value_count_scan_results.thread_count_within_block;
+      block_value_count              = value_count_scan_results.block_count;
+    }
+
+    // iterate by depth
+    for (int d_idx = 0; d_idx <= max_depth; d_idx++) {
+      auto& ni = s->nesting_info[d_idx];
+
+      // everything up to the max_def_level is a non-null value
+      int const is_valid = [&](int input_def_level) {
+        if constexpr (nullable) {
+          return ((input_def_level >= ni.max_def_level) && in_nesting_bounds) ? 1 : 0;
+        } else {
+          return in_nesting_bounds;
+        }
+      }(def_level);
+
+      // VALID COUNT:
+      // Not all values visited by this block will represent a value at this nesting level.
+      // the validity bit for thread t might actually represent output value t-6.
+      // the correct position for thread t's bit is thread_value_count.
+      uint32_t const warp_valid_mask =
+        WarpReduceOr32((uint32_t)is_valid << thread_value_count_within_warp);
+      int thread_valid_count, block_valid_count;
+      {
+        auto thread_mask = (uint32_t(1) << thread_value_count_within_warp) - 1;
+
+        block_scan_results valid_count_scan_results;
+        scan_block_exclusive_sum<decode_block_size>(warp_valid_mask,
+                                                    warp_lane,
+                                                    warp_index,
+                                                    thread_mask,
+                                                    valid_count_scan_results,
+                                                    temp_storage);
+        __syncthreads();
+        thread_valid_count = valid_count_scan_results.thread_count_within_block;
+        block_valid_count  = valid_count_scan_results.block_count;
+      }
+
+      // compute warp and thread value counts for the -next- nesting level. we need to
+      // do this for lists so that we can emit an offset for the -current- nesting level.
+      // the offset for the current nesting level == current length of the next nesting level
+      int next_thread_value_count_within_warp = 0, next_warp_value_count = 0;
+      int next_thread_value_count = 0, next_block_value_count = 0;
+      int next_in_nesting_bounds = 0;
+      if (d_idx < max_depth) {
+        // NEXT DEPTH VALUE COUNT:
+        next_in_nesting_bounds =
+          ((d_idx + 1 >= start_depth) && (d_idx + 1 <= end_depth) && in_row_bounds) ? 1 : 0;
+        {
+          block_scan_results next_value_count_scan_results;
+          scan_block_exclusive_sum<decode_block_size>(
+            next_in_nesting_bounds, next_value_count_scan_results, temp_storage);
+          __syncthreads();
+
+          next_thread_value_count_within_warp =
+            next_value_count_scan_results.thread_count_within_warp;
+          next_warp_value_count   = next_value_count_scan_results.warp_count;
+          next_thread_value_count = next_value_count_scan_results.thread_count_within_block;
+          next_block_value_count  = next_value_count_scan_results.block_count;
+        }
+
+        // STORE OFFSET TO THE LIST LOCATION
+        // if we're -not- at a leaf column and we're within nesting/row bounds
+        // and we have a valid data_out pointer, it implies this is a list column, so
+        // emit an offset.
+        if (in_nesting_bounds && ni.data_out != nullptr) {
+          const auto& next_ni = s->nesting_info[d_idx + 1];
+          int const idx       = ni.value_count + thread_value_count;
+          cudf::size_type const ofs =
+            next_ni.value_count + next_thread_value_count + next_ni.page_start_value;
+
+          (reinterpret_cast<cudf::size_type*>(ni.data_out))[idx] = ofs;
+        }
+      }
+
+      // validity is processed per-warp (on lane 0's)
+      // this is because when atomic writes are needed, they are 32-bit operations
+      //
+      // lists always read and write to the same bounds
+      // (that is, read and write positions are already pre-bounded by first_row/num_rows).
+      // since we are about to write the validity vector
+      // here we need to adjust our computed mask to take into account the write row bounds.
+      if constexpr (nullable) {
+        if (is_first_lane && (ni.valid_map != nullptr) && (warp_value_count > 0)) {
+          // absolute bit offset into the output validity map
+          // is cumulative sum of warp_value_count at the given nesting depth
+          // DON'T subtract by first_row: since it's lists it's not 1-row-per-value
+          int const bit_offset = ni.valid_map_offset + thread_value_count;
+
+          store_validity(bit_offset, ni.valid_map, warp_valid_mask, warp_value_count);
+        }
+
+        if (t == 0) { ni.null_count += block_value_count - block_valid_count; }
+      }
+
+      // if this is valid and we're at the leaf, output dst_pos
+      // Read value_count before the sync, so that when thread 0 modifies it we've already read its
+      // value
+      int const current_value_count = ni.value_count;
+      __syncthreads();  // guard against modification of ni.value_count below
+      if (d_idx == max_depth) {
+        if (is_valid) {
+          int const dst_pos      = current_value_count + thread_value_count;
+          int const src_pos      = max_depth_valid_count + thread_valid_count;
+          int const output_index = rolling_index<state_buf::nz_buf_size>(src_pos);
+
+          // Index from rolling buffer of values (which doesn't include nulls) to final array (which
+          // includes gaps for nulls)
+          sb->nz_idx[output_index] = dst_pos;
+        }
+        max_depth_valid_count += block_valid_count;
+      }
+
+      // update stuff
+      if (t == 0) {
+        ni.value_count += block_value_count;
+        ni.valid_map_offset += block_value_count;
+      }
+      __syncthreads();  // sync modification of ni.value_count
+
+      // propagate value counts for the next depth level
+      block_value_count              = next_block_value_count;
+      thread_value_count             = next_thread_value_count;
+      in_nesting_bounds              = next_in_nesting_bounds;
+      warp_value_count               = next_warp_value_count;
+      thread_value_count_within_warp = next_thread_value_count_within_warp;
+    }  // END OF DEPTH LOOP
+
+    int const batch_size = min(max_batch_size, target_value_count - value_count);
+    value_count += batch_size;
+  }
+
+  if (t == 0) {
+    // update valid value count for decoding and total # of values we've processed
+    s->nesting_info[max_depth].valid_count = max_depth_valid_count;
+    s->nz_count                            = max_depth_valid_count;
+    s->input_value_count                   = value_count;
+
+    // If we have lists # rows != # values
+    s->input_row_count = input_row_count;
+  }
+
+  return max_depth_valid_count;
+}
+
 // is the page marked nullable or not
 __device__ inline bool is_nullable(page_state_s* s)
 {
@@ -560,6 +841,23 @@ __device__ inline bool maybe_has_nulls(page_state_s* s)
   return run_val != s->col.max_level[lvl];
 }
 
+template <int rolling_buf_size, typename stream_type>
+__device__ int skip_decode(stream_type& parquet_stream, int num_to_skip, int t)
+{
+  // It could be that (e.g.) we skip 5000 but starting at row 4000 we have a run of length 2000:
+  // in that case skip_decode() only skips 4000, and we have to decode the remaining 1000 up front,
+  // in chunks of at most rolling_buf_size, since that's as many as we process at once.
+  int num_skipped = parquet_stream.skip_decode(t, num_to_skip);
+  while (num_skipped < num_to_skip) {
+    // TODO: Instead of decoding, skip within the run to the appropriate location
+    auto const to_decode = min(rolling_buf_size, num_to_skip - num_skipped);
+    num_skipped += parquet_stream.decode_next(t, to_decode);
+    __syncthreads();
+  }
+
+  return num_skipped;
+}
+
 /**
  * @brief Kernel for computing fixed width non dictionary column data stored in the pages
  *
@@ -579,9 +877,10 @@ template <typename level_t,
           decode_kernel_mask kernel_mask_t,
           bool has_dict_t,
           bool has_nesting_t,
-          template <int block_size, typename state_buf>
+          bool has_lists_t,
+          template <int block_size, bool decode_has_lists_t, typename state_buf>
           typename DecodeValuesFunc>
-CUDF_KERNEL void __launch_bounds__(decode_block_size_t)
+CUDF_KERNEL void __launch_bounds__(decode_block_size_t, 8)
   gpuDecodePageDataGeneric(PageInfo* pages,
                            device_span<ColumnChunkDesc const> chunks,
                            size_t min_row,
@@ -621,31 +920,29 @@ CUDF_KERNEL void __launch_bounds__(decode_block_size_t)
   // if we have no work to do (eg, in a skip_rows/num_rows case) in this page.
   if (s->num_rows == 0) { return; }
 
-  DecodeValuesFunc<decode_block_size_t, state_buf_t> decode_values;
+  DecodeValuesFunc<decode_block_size_t, has_lists_t, state_buf_t> decode_values;
 
-  bool const nullable             = is_nullable(s);
-  bool const should_process_nulls = nullable && maybe_has_nulls(s);
+  bool const should_process_nulls = is_nullable(s) && maybe_has_nulls(s);
 
   // shared buffer. all shared memory is suballocated out of here
-  // constexpr int shared_rep_size = has_lists_t ? cudf::util::round_up_unsafe(rle_run_buffer_size *
-  // sizeof(rle_run<level_t>), size_t{16}) : 0;
+  constexpr int shared_rep_size =
+    has_lists_t
+      ? cudf::util::round_up_unsafe(rle_run_buffer_size * sizeof(rle_run<level_t>), size_t{16})
+      : 0;
   constexpr int shared_dict_size =
     has_dict_t
       ? cudf::util::round_up_unsafe(rle_run_buffer_size * sizeof(rle_run<uint32_t>), size_t{16})
       : 0;
   constexpr int shared_def_size =
     cudf::util::round_up_unsafe(rle_run_buffer_size * sizeof(rle_run<level_t>), size_t{16});
-  constexpr int shared_buf_size = /*shared_rep_size +*/ shared_dict_size + shared_def_size;
+  constexpr int shared_buf_size = shared_rep_size + shared_dict_size + shared_def_size;
   __shared__ __align__(16) uint8_t shared_buf[shared_buf_size];
 
   // setup all shared memory buffers
-  int shared_offset = 0;
-  /*
-  rle_run<level_t> *rep_runs = reinterpret_cast<rle_run<level_t>*>(shared_buf + shared_offset);
-  if constexpr (has_lists_t){
-    shared_offset += shared_rep_size;
-  }
-  */
+  int shared_offset          = 0;
+  rle_run<level_t>* rep_runs = reinterpret_cast<rle_run<level_t>*>(shared_buf + shared_offset);
+  if constexpr (has_lists_t) { shared_offset += shared_rep_size; }
+
   rle_run<uint32_t>* dict_runs = reinterpret_cast<rle_run<uint32_t>*>(shared_buf + shared_offset);
   if constexpr (has_dict_t) { shared_offset += shared_dict_size; }
   rle_run<level_t>* def_runs = reinterpret_cast<rle_run<level_t>*>(shared_buf + shared_offset);
@@ -660,38 +957,51 @@ CUDF_KERNEL void __launch_bounds__(decode_block_size_t)
                      def,
                      s->page.num_input_values);
   }
-  /*
+
   rle_stream<level_t, decode_block_size_t, rolling_buf_size> rep_decoder{rep_runs};
   level_t* const rep = reinterpret_cast<level_t*>(pp->lvl_decode_buf[level_type::REPETITION]);
-  if constexpr(has_lists_t){
+  if constexpr (has_lists_t) {
     rep_decoder.init(s->col.level_bits[level_type::REPETITION],
                      s->abs_lvl_start[level_type::REPETITION],
                      s->abs_lvl_end[level_type::REPETITION],
                      rep,
                      s->page.num_input_values);
   }
-  */
 
   rle_stream<uint32_t, decode_block_size_t, rolling_buf_size> dict_stream{dict_runs};
   if constexpr (has_dict_t) {
     dict_stream.init(
       s->dict_bits, s->data_start, s->data_end, sb->dict_idx, s->page.num_input_values);
   }
-  __syncthreads();
 
   // We use two counters in the loop below: processed_count and valid_count.
-  // - processed_count: number of rows out of num_input_values that we have decoded so far.
+  // - processed_count: number of values out of num_input_values that we have decoded so far.
   //   the definition stream returns the number of total rows it has processed in each call
   //   to decode_next and we accumulate in process_count.
-  // - valid_count: number of non-null rows we have decoded so far. In each iteration of the
+  // - valid_count: number of non-null values we have decoded so far. In each iteration of the
   //   loop below, we look at the number of valid items (which could be all for non-nullable),
   //   and valid_count is that running count.
   int processed_count = 0;
   int valid_count     = 0;
+
+  // Skip ahead in the decoding so that we don't repeat work (skipped_leaf_values = 0 for non-lists)
+  if constexpr (has_lists_t) {
+    auto const skipped_leaf_values = s->page.skipped_leaf_values;
+    if (skipped_leaf_values > 0) {
+      if (should_process_nulls) {
+        skip_decode<rolling_buf_size>(def_decoder, skipped_leaf_values, t);
+      }
+      processed_count = skip_decode<rolling_buf_size>(rep_decoder, skipped_leaf_values, t);
+      if constexpr (has_dict_t) {
+        skip_decode<rolling_buf_size>(dict_stream, skipped_leaf_values, t);
+      }
+    }
+  }
+
   // the core loop. decode batches of level stream data using rle_stream objects
   // and pass the results to gpuDecodeValues
   // For chunked reads we may not process all of the rows on the page; if not stop early
-  int last_row = s->first_row + s->num_rows;
+  int const last_row = s->first_row + s->num_rows;
   while ((s->error == 0) && (processed_count < s->page.num_input_values) &&
          (s->input_row_count <= last_row)) {
     int next_valid_count;
@@ -701,7 +1011,12 @@ CUDF_KERNEL void __launch_bounds__(decode_block_size_t)
       processed_count += def_decoder.decode_next(t);
       __syncthreads();
 
-      if constexpr (has_nesting_t) {
+      if constexpr (has_lists_t) {
+        rep_decoder.decode_next(t);
+        __syncthreads();
+        next_valid_count = gpuUpdateValidityAndRowIndicesLists<decode_block_size_t, true, level_t>(
+          processed_count, s, sb, def, rep, t);
+      } else if constexpr (has_nesting_t) {
         next_valid_count = gpuUpdateValidityAndRowIndicesNested<decode_block_size_t, level_t>(
           processed_count, s, sb, def, t);
       } else {
@@ -713,9 +1028,16 @@ CUDF_KERNEL void __launch_bounds__(decode_block_size_t)
     // this function call entirely since all it will ever generate is a mapping of (i -> i) for
     // nz_idx.  gpuDecodeFixedWidthValues would be the only work that happens.
     else {
-      processed_count += min(rolling_buf_size, s->page.num_input_values - processed_count);
-      next_valid_count =
-        gpuUpdateValidityAndRowIndicesNonNullable<decode_block_size_t>(processed_count, s, sb, t);
+      if constexpr (has_lists_t) {
+        processed_count += rep_decoder.decode_next(t);
+        __syncthreads();
+        next_valid_count = gpuUpdateValidityAndRowIndicesLists<decode_block_size_t, false, level_t>(
+          processed_count, s, sb, nullptr, rep, t);
+      } else {
+        processed_count += min(rolling_buf_size, s->page.num_input_values - processed_count);
+        next_valid_count =
+          gpuUpdateValidityAndRowIndicesNonNullable<decode_block_size_t>(processed_count, s, sb, t);
+      }
     }
     __syncthreads();
 
@@ -745,6 +1067,7 @@ void __host__ DecodePageDataFixed(cudf::detail::hostdevice_span<PageInfo> pages,
                                   size_t min_row,
                                   int level_type_size,
                                   bool has_nesting,
+                                  bool is_list,
                                   kernel_error::pointer error_code,
                                   rmm::cuda_stream_view stream)
 {
@@ -754,12 +1077,23 @@ void __host__ DecodePageDataFixed(cudf::detail::hostdevice_span<PageInfo> pages,
   dim3 dim_grid(pages.size(), 1);  // 1 threadblock per page
 
   if (level_type_size == 1) {
-    if (has_nesting) {
+    if (is_list) {
+      gpuDecodePageDataGeneric<uint8_t,
+                               decode_block_size,
+                               decode_kernel_mask::FIXED_WIDTH_NO_DICT_LIST,
+                               false,
+                               true,
+                               true,
+                               decode_fixed_width_values_func>
+        <<<dim_grid, dim_block, 0, stream.value()>>>(
+          pages.device_ptr(), chunks, min_row, num_rows, error_code);
+    } else if (has_nesting) {
       gpuDecodePageDataGeneric<uint8_t,
                                decode_block_size,
                                decode_kernel_mask::FIXED_WIDTH_NO_DICT_NESTED,
                                false,
                                true,
+                               false,
                                decode_fixed_width_values_func>
         <<<dim_grid, dim_block, 0, stream.value()>>>(
           pages.device_ptr(), chunks, min_row, num_rows, error_code);
@@ -769,17 +1103,29 @@ void __host__ DecodePageDataFixed(cudf::detail::hostdevice_span<PageInfo> pages,
                                decode_kernel_mask::FIXED_WIDTH_NO_DICT,
                                false,
                                false,
+                               false,
                                decode_fixed_width_values_func>
         <<<dim_grid, dim_block, 0, stream.value()>>>(
           pages.device_ptr(), chunks, min_row, num_rows, error_code);
     }
   } else {
-    if (has_nesting) {
+    if (is_list) {
+      gpuDecodePageDataGeneric<uint16_t,
+                               decode_block_size,
+                               decode_kernel_mask::FIXED_WIDTH_NO_DICT_LIST,
+                               false,
+                               true,
+                               true,
+                               decode_fixed_width_values_func>
+        <<<dim_grid, dim_block, 0, stream.value()>>>(
+          pages.device_ptr(), chunks, min_row, num_rows, error_code);
+    } else if (has_nesting) {
       gpuDecodePageDataGeneric<uint16_t,
                                decode_block_size,
                                decode_kernel_mask::FIXED_WIDTH_NO_DICT_NESTED,
                                false,
                                true,
+                               false,
                                decode_fixed_width_values_func>
         <<<dim_grid, dim_block, 0, stream.value()>>>(
           pages.device_ptr(), chunks, min_row, num_rows, error_code);
@@ -789,6 +1135,7 @@ void __host__ DecodePageDataFixed(cudf::detail::hostdevice_span<PageInfo> pages,
                                decode_kernel_mask::FIXED_WIDTH_NO_DICT,
                                false,
                                false,
+                               false,
                                decode_fixed_width_values_func>
         <<<dim_grid, dim_block, 0, stream.value()>>>(
           pages.device_ptr(), chunks, min_row, num_rows, error_code);
@@ -802,6 +1149,7 @@ void __host__ DecodePageDataFixedDict(cudf::detail::hostdevice_span<PageInfo> pa
                                       size_t min_row,
                                       int level_type_size,
                                       bool has_nesting,
+                                      bool is_list,
                                       kernel_error::pointer error_code,
                                       rmm::cuda_stream_view stream)
 {
@@ -811,12 +1159,23 @@ void __host__ DecodePageDataFixedDict(cudf::detail::hostdevice_span<PageInfo> pa
   dim3 dim_grid(pages.size(), 1);        // 1 thread block per page => # blocks
 
   if (level_type_size == 1) {
-    if (has_nesting) {
+    if (is_list) {
+      gpuDecodePageDataGeneric<uint8_t,
+                               decode_block_size,
+                               decode_kernel_mask::FIXED_WIDTH_DICT_LIST,
+                               true,
+                               true,
+                               true,
+                               decode_fixed_width_values_func>
+        <<<dim_grid, dim_block, 0, stream.value()>>>(
+          pages.device_ptr(), chunks, min_row, num_rows, error_code);
+    } else if (has_nesting) {
       gpuDecodePageDataGeneric<uint8_t,
                                decode_block_size,
                                decode_kernel_mask::FIXED_WIDTH_DICT_NESTED,
                                true,
                                true,
+                               false,
                                decode_fixed_width_values_func>
         <<<dim_grid, dim_block, 0, stream.value()>>>(
           pages.device_ptr(), chunks, min_row, num_rows, error_code);
@@ -826,17 +1185,29 @@ void __host__ DecodePageDataFixedDict(cudf::detail::hostdevice_span<PageInfo> pa
                                decode_kernel_mask::FIXED_WIDTH_DICT,
                                true,
                                false,
+                               false,
                                decode_fixed_width_values_func>
         <<<dim_grid, dim_block, 0, stream.value()>>>(
           pages.device_ptr(), chunks, min_row, num_rows, error_code);
     }
   } else {
-    if (has_nesting) {
+    if (is_list) {
+      gpuDecodePageDataGeneric<uint16_t,
+                               decode_block_size,
+                               decode_kernel_mask::FIXED_WIDTH_DICT_LIST,
+                               true,
+                               true,
+                               true,
+                               decode_fixed_width_values_func>
+        <<<dim_grid, dim_block, 0, stream.value()>>>(
+          pages.device_ptr(), chunks, min_row, num_rows, error_code);
+    } else if (has_nesting) {
       gpuDecodePageDataGeneric<uint16_t,
                                decode_block_size,
                                decode_kernel_mask::FIXED_WIDTH_DICT_NESTED,
                                true,
                                true,
+                               false,
                                decode_fixed_width_values_func>
         <<<dim_grid, dim_block, 0, stream.value()>>>(
           pages.device_ptr(), chunks, min_row, num_rows, error_code);
@@ -846,6 +1217,7 @@ void __host__ DecodePageDataFixedDict(cudf::detail::hostdevice_span<PageInfo> pa
                                decode_kernel_mask::FIXED_WIDTH_DICT,
                                true,
                                false,
+                               false,
                                decode_fixed_width_values_func>
         <<<dim_grid, dim_block, 0, stream.value()>>>(
           pages.device_ptr(), chunks, min_row, num_rows, error_code);
@@ -860,6 +1232,7 @@ DecodeSplitPageFixedWidthData(cudf::detail::hostdevice_span<PageInfo> pages,
                               size_t min_row,
                               int level_type_size,
                               bool has_nesting,
+                              bool is_list,
                               kernel_error::pointer error_code,
                               rmm::cuda_stream_view stream)
 {
@@ -869,12 +1242,23 @@ DecodeSplitPageFixedWidthData(cudf::detail::hostdevice_span<PageInfo> pages,
   dim3 dim_grid(pages.size(), 1);        // 1 thread block per page => # blocks
 
   if (level_type_size == 1) {
-    if (has_nesting) {
+    if (is_list) {
+      gpuDecodePageDataGeneric<uint8_t,
+                               decode_block_size,
+                               decode_kernel_mask::BYTE_STREAM_SPLIT_FIXED_WIDTH_LIST,
+                               true,
+                               true,
+                               true,
+                               decode_fixed_width_split_values_func>
+        <<<dim_grid, dim_block, 0, stream.value()>>>(
+          pages.device_ptr(), chunks, min_row, num_rows, error_code);
+    } else if (has_nesting) {
       gpuDecodePageDataGeneric<uint8_t,
                                decode_block_size,
                                decode_kernel_mask::BYTE_STREAM_SPLIT_FIXED_WIDTH_NESTED,
                                false,
                                true,
+                               false,
                                decode_fixed_width_split_values_func>
         <<<dim_grid, dim_block, 0, stream.value()>>>(
           pages.device_ptr(), chunks, min_row, num_rows, error_code);
@@ -884,17 +1268,29 @@ DecodeSplitPageFixedWidthData(cudf::detail::hostdevice_span<PageInfo> pages,
                                decode_kernel_mask::BYTE_STREAM_SPLIT_FIXED_WIDTH_FLAT,
                                false,
                                false,
+                               false,
                                decode_fixed_width_split_values_func>
         <<<dim_grid, dim_block, 0, stream.value()>>>(
           pages.device_ptr(), chunks, min_row, num_rows, error_code);
     }
   } else {
-    if (has_nesting) {
+    if (is_list) {
+      gpuDecodePageDataGeneric<uint16_t,
+                               decode_block_size,
+                               decode_kernel_mask::BYTE_STREAM_SPLIT_FIXED_WIDTH_LIST,
+                               true,
+                               true,
+                               true,
+                               decode_fixed_width_split_values_func>
+        <<<dim_grid, dim_block, 0, stream.value()>>>(
+          pages.device_ptr(), chunks, min_row, num_rows, error_code);
+    } else if (has_nesting) {
       gpuDecodePageDataGeneric<uint16_t,
                                decode_block_size,
                                decode_kernel_mask::BYTE_STREAM_SPLIT_FIXED_WIDTH_NESTED,
                                false,
                                true,
+                               false,
                                decode_fixed_width_split_values_func>
         <<<dim_grid, dim_block, 0, stream.value()>>>(
           pages.device_ptr(), chunks, min_row, num_rows, error_code);
@@ -904,6 +1300,7 @@ DecodeSplitPageFixedWidthData(cudf::detail::hostdevice_span<PageInfo> pages,
                                decode_kernel_mask::BYTE_STREAM_SPLIT_FIXED_WIDTH_FLAT,
                                false,
                                false,
+                               false,
                                decode_fixed_width_split_values_func>
         <<<dim_grid, dim_block, 0, stream.value()>>>(
           pages.device_ptr(), chunks, min_row, num_rows, error_code);
diff --git a/cpp/src/io/parquet/page_hdr.cu b/cpp/src/io/parquet/page_hdr.cu
index d604642be54..52d53cb8225 100644
--- a/cpp/src/io/parquet/page_hdr.cu
+++ b/cpp/src/io/parquet/page_hdr.cu
@@ -183,17 +183,20 @@ __device__ decode_kernel_mask kernel_mask_for_page(PageInfo const& page,
     return decode_kernel_mask::STRING;
   }
 
-  if (!is_list(chunk) && !is_byte_array(chunk) && !is_boolean(chunk)) {
+  if (!is_byte_array(chunk) && !is_boolean(chunk)) {
     if (page.encoding == Encoding::PLAIN) {
-      return is_nested(chunk) ? decode_kernel_mask::FIXED_WIDTH_NO_DICT_NESTED
-                              : decode_kernel_mask::FIXED_WIDTH_NO_DICT;
+      return is_list(chunk)     ? decode_kernel_mask::FIXED_WIDTH_NO_DICT_LIST
+             : is_nested(chunk) ? decode_kernel_mask::FIXED_WIDTH_NO_DICT_NESTED
+                                : decode_kernel_mask::FIXED_WIDTH_NO_DICT;
     } else if (page.encoding == Encoding::PLAIN_DICTIONARY ||
                page.encoding == Encoding::RLE_DICTIONARY) {
-      return is_nested(chunk) ? decode_kernel_mask::FIXED_WIDTH_DICT_NESTED
-                              : decode_kernel_mask::FIXED_WIDTH_DICT;
+      return is_list(chunk)     ? decode_kernel_mask::FIXED_WIDTH_DICT_LIST
+             : is_nested(chunk) ? decode_kernel_mask::FIXED_WIDTH_DICT_NESTED
+                                : decode_kernel_mask::FIXED_WIDTH_DICT;
     } else if (page.encoding == Encoding::BYTE_STREAM_SPLIT) {
-      return is_nested(chunk) ? decode_kernel_mask::BYTE_STREAM_SPLIT_FIXED_WIDTH_NESTED
-                              : decode_kernel_mask::BYTE_STREAM_SPLIT_FIXED_WIDTH_FLAT;
+      return is_list(chunk)     ? decode_kernel_mask::BYTE_STREAM_SPLIT_FIXED_WIDTH_LIST
+             : is_nested(chunk) ? decode_kernel_mask::BYTE_STREAM_SPLIT_FIXED_WIDTH_NESTED
+                                : decode_kernel_mask::BYTE_STREAM_SPLIT_FIXED_WIDTH_FLAT;
     }
   }
 
diff --git a/cpp/src/io/parquet/parquet_gpu.hpp b/cpp/src/io/parquet/parquet_gpu.hpp
index be502b581af..dba24b553e6 100644
--- a/cpp/src/io/parquet/parquet_gpu.hpp
+++ b/cpp/src/io/parquet/parquet_gpu.hpp
@@ -220,6 +220,10 @@ enum class decode_kernel_mask {
     (1 << 9),                              // Same as above but for nested, fixed-width data
   FIXED_WIDTH_NO_DICT_NESTED = (1 << 10),  // Run decode kernel for fixed width non-dictionary pages
   FIXED_WIDTH_DICT_NESTED    = (1 << 11),  // Run decode kernel for fixed width dictionary pages
+  FIXED_WIDTH_DICT_LIST      = (1 << 12),  // Same as FIXED_WIDTH_DICT but for list data
+  FIXED_WIDTH_NO_DICT_LIST   = (1 << 13),  // Same as FIXED_WIDTH_NO_DICT but for list data
+  BYTE_STREAM_SPLIT_FIXED_WIDTH_LIST =
+    (1 << 14),  // Run decode kernel for BYTE_STREAM_SPLIT encoded data for fixed width lists
 };
 
 // mask representing all the ways in which a string can be encoded
@@ -908,6 +912,7 @@ void DecodeDeltaLengthByteArray(cudf::detail::hostdevice_span<PageInfo> pages,
  * @param[in] min_row Minimum number of rows to read
  * @param[in] level_type_size Size in bytes of the type for level decoding
  * @param[in] has_nesting Whether or not the data contains nested (but not list) data.
+ * @param[in] is_list Whether or not the data contains list data.
  * @param[out] error_code Error code for kernel failures
  * @param[in] stream CUDA stream to use
  */
@@ -917,6 +922,7 @@ void DecodePageDataFixed(cudf::detail::hostdevice_span<PageInfo> pages,
                          size_t min_row,
                          int level_type_size,
                          bool has_nesting,
+                         bool is_list,
                          kernel_error::pointer error_code,
                          rmm::cuda_stream_view stream);
 
@@ -932,6 +938,7 @@ void DecodePageDataFixed(cudf::detail::hostdevice_span<PageInfo> pages,
  * @param[in] min_row Minimum number of rows to read
  * @param[in] level_type_size Size in bytes of the type for level decoding
  * @param[in] has_nesting Whether or not the data contains nested (but not list) data.
+ * @param[in] is_list Whether or not the data contains list data.
  * @param[out] error_code Error code for kernel failures
  * @param[in] stream CUDA stream to use
  */
@@ -941,6 +948,7 @@ void DecodePageDataFixedDict(cudf::detail::hostdevice_span<PageInfo> pages,
                              size_t min_row,
                              int level_type_size,
                              bool has_nesting,
+                             bool is_list,
                              kernel_error::pointer error_code,
                              rmm::cuda_stream_view stream);
 
@@ -956,6 +964,7 @@ void DecodePageDataFixedDict(cudf::detail::hostdevice_span<PageInfo> pages,
  * @param[in] min_row Minimum number of rows to read
  * @param[in] level_type_size Size in bytes of the type for level decoding
  * @param[in] has_nesting Whether or not the data contains nested (but not list) data.
+ * @param[in] is_list Whether or not the data contains list data.
  * @param[out] error_code Error code for kernel failures
  * @param[in] stream CUDA stream to use
  */
@@ -965,6 +974,7 @@ void DecodeSplitPageFixedWidthData(cudf::detail::hostdevice_span<PageInfo> pages
                                    size_t min_row,
                                    int level_type_size,
                                    bool has_nesting,
+                                   bool is_list,
                                    kernel_error::pointer error_code,
                                    rmm::cuda_stream_view stream);
 
diff --git a/cpp/src/io/parquet/reader_impl.cpp b/cpp/src/io/parquet/reader_impl.cpp
index fed1a309064..689386b8957 100644
--- a/cpp/src/io/parquet/reader_impl.cpp
+++ b/cpp/src/io/parquet/reader_impl.cpp
@@ -272,6 +272,7 @@ void reader::impl::decode_page_data(read_mode mode, size_t skip_rows, size_t num
                                   skip_rows,
                                   level_type_size,
                                   false,
+                                  false,
                                   error_code.data(),
                                   streams[s_idx++]);
   }
@@ -284,6 +285,20 @@ void reader::impl::decode_page_data(read_mode mode, size_t skip_rows, size_t num
                                   skip_rows,
                                   level_type_size,
                                   true,
+                                  false,
+                                  error_code.data(),
+                                  streams[s_idx++]);
+  }
+
+  // launch byte stream split decoder, for list columns
+  if (BitAnd(kernel_mask, decode_kernel_mask::BYTE_STREAM_SPLIT_FIXED_WIDTH_LIST) != 0) {
+    DecodeSplitPageFixedWidthData(subpass.pages,
+                                  pass.chunks,
+                                  num_rows,
+                                  skip_rows,
+                                  level_type_size,
+                                  true,
+                                  true,
                                   error_code.data(),
                                   streams[s_idx++]);
   }
@@ -307,6 +322,20 @@ void reader::impl::decode_page_data(read_mode mode, size_t skip_rows, size_t num
                         skip_rows,
                         level_type_size,
                         false,
+                        false,
+                        error_code.data(),
+                        streams[s_idx++]);
+  }
+
+  // launch fixed width type decoder for lists
+  if (BitAnd(kernel_mask, decode_kernel_mask::FIXED_WIDTH_NO_DICT_LIST) != 0) {
+    DecodePageDataFixed(subpass.pages,
+                        pass.chunks,
+                        num_rows,
+                        skip_rows,
+                        level_type_size,
+                        true,
+                        true,
                         error_code.data(),
                         streams[s_idx++]);
   }
@@ -319,6 +348,7 @@ void reader::impl::decode_page_data(read_mode mode, size_t skip_rows, size_t num
                         skip_rows,
                         level_type_size,
                         true,
+                        false,
                         error_code.data(),
                         streams[s_idx++]);
   }
@@ -331,6 +361,20 @@ void reader::impl::decode_page_data(read_mode mode, size_t skip_rows, size_t num
                             skip_rows,
                             level_type_size,
                             false,
+                            false,
+                            error_code.data(),
+                            streams[s_idx++]);
+  }
+
+  // launch fixed width type decoder with dictionaries for lists
+  if (BitAnd(kernel_mask, decode_kernel_mask::FIXED_WIDTH_DICT_LIST) != 0) {
+    DecodePageDataFixedDict(subpass.pages,
+                            pass.chunks,
+                            num_rows,
+                            skip_rows,
+                            level_type_size,
+                            true,
+                            true,
                             error_code.data(),
                             streams[s_idx++]);
   }
@@ -343,6 +387,7 @@ void reader::impl::decode_page_data(read_mode mode, size_t skip_rows, size_t num
                             skip_rows,
                             level_type_size,
                             true,
+                            false,
                             error_code.data(),
                             streams[s_idx++]);
   }
diff --git a/cpp/src/io/parquet/rle_stream.cuh b/cpp/src/io/parquet/rle_stream.cuh
index 4a0791d5c54..69e783a89d0 100644
--- a/cpp/src/io/parquet/rle_stream.cuh
+++ b/cpp/src/io/parquet/rle_stream.cuh
@@ -19,6 +19,7 @@
 #include "parquet_gpu.hpp"
 
 #include <cudf/detail/utilities/cuda.cuh>
+#include <cudf/detail/utilities/integer_utils.hpp>
 
 namespace cudf::io::parquet::detail {
 
@@ -216,6 +217,26 @@ struct rle_stream {
     decode_index = -1;  // signals the first iteration. Nothing to decode.
   }
 
+  __device__ inline int get_rle_run_info(rle_run<level_t>& run)
+  {
+    run.start     = cur;
+    run.level_run = get_vlq32(run.start, end);
+
+    // run_bytes starts as the varint header size: get_vlq32 advanced run.start past the header
+    int run_bytes = run.start - cur;
+    if (is_literal_run(run.level_run)) {
+      // literal run: per the parquet spec these come in multiples of 8 values of level_bits bits
+      run.size = (run.level_run >> 1) * 8;
+      run_bytes += util::div_rounding_up_unsafe(run.size * level_bits, 8);
+    } else {
+      // repeated value run: one value, stored in ceil(level_bits / 8) bytes
+      run.size = (run.level_run >> 1);
+      run_bytes += util::div_rounding_up_unsafe(level_bits, 8);
+    }
+
+    return run_bytes;
+  }
+
   __device__ inline void fill_run_batch()
   {
     // decode_index == -1 means we are on the very first decode iteration for this stream.
@@ -226,31 +247,14 @@ struct rle_stream {
     while (((decode_index == -1 && fill_index < num_rle_stream_decode_warps) ||
             fill_index < decode_index + run_buffer_size) &&
            cur < end) {
-      auto& run = runs[rolling_index<run_buffer_size>(fill_index)];
-
       // Encoding::RLE
+      // Pass by reference to fill the runs shared memory with the run data
+      auto& run           = runs[rolling_index<run_buffer_size>(fill_index)];
+      int const run_bytes = get_rle_run_info(run);
 
-      // bytes for the varint header
-      uint8_t const* _cur = cur;
-      int const level_run = get_vlq32(_cur, end);
-      // run_bytes includes the header size
-      int run_bytes = _cur - cur;
-
-      // literal run
-      if (is_literal_run(level_run)) {
-        // from the parquet spec: literal runs always come in multiples of 8 values.
-        run.size = (level_run >> 1) * 8;
-        run_bytes += ((run.size * level_bits) + 7) >> 3;
-      }
-      // repeated value run
-      else {
-        run.size = (level_run >> 1);
-        run_bytes += ((level_bits) + 7) >> 3;
-      }
-      run.output_pos = output_pos;
-      run.start      = _cur;
-      run.level_run  = level_run;
       run.remaining  = run.size;
+      run.output_pos = output_pos;
+
       cur += run_bytes;
       output_pos += run.size;
       fill_index++;
@@ -372,6 +376,39 @@ struct rle_stream {
     return values_processed_shared;
   }
 
+  __device__ inline int skip_runs(int target_count)
+  {
+    // Advance past every run that lies entirely within the first target_count values, stopping
+    // at (and NOT consuming) the run that straddles target_count. Only cur and output_pos are
+    // updated; no run info is written to the runs buffer. The stream is left as if
+    // fill_run_batch() were about to be called for the first time, starting at that run.
+    // Returns the updated output_pos, which can be less than target_count.
+    while (cur < end) {
+      rle_run<level_t> run;
+      int run_bytes = get_rle_run_info(run);
+
+      if ((output_pos + run.size) > target_count) {
+        return output_pos;  // bail! we've reached the starting run
+      }
+
+      // skip this run
+      output_pos += run.size;
+      cur += run_bytes;
+    }
+
+    return output_pos;  // we skipped everything
+  }
+
+  __device__ inline int skip_decode(int t, int count)
+  {
+    int const output_count = min(count, total_values - cur_values);
+
+    // if level_bits == 0 there is no run data to walk, so report the clamped count directly.
+    // a very common case: columns with no nulls, especially if they are non-nested. t is unused.
+    cur_values = (level_bits == 0) ? output_count : skip_runs(output_count);
+    return cur_values;
+  }
+
   __device__ inline int decode_next(int t) { return decode_next(t, max_output_values); }
 };
 

From 6328ad679947eb5cbc352c345a28f079aa6b8005 Mon Sep 17 00:00:00 2001
From: Renjie Liu <liurenjie2008@gmail.com>
Date: Wed, 30 Oct 2024 17:17:47 +0800
Subject: [PATCH 08/41] Make ai.rapids.cudf.HostMemoryBuffer#copyFromStream
 public. (#17179)

This is the first PR of [a larger one](https://github.com/NVIDIA/spark-rapids-jni/pull/2532) to introduce a new serialization format. It makes `ai.rapids.cudf.HostMemoryBuffer#copyFromStream` public. For more background, see https://github.com/NVIDIA/spark-rapids-jni/issues/2496

Authors:
  - Renjie Liu (https://github.com/liurenjie1024)
  - Jason Lowe (https://github.com/jlowe)

Approvers:
  - Jason Lowe (https://github.com/jlowe)
  - Alessandro Bellina (https://github.com/abellina)

URL: https://github.com/rapidsai/cudf/pull/17179
---
 java/src/main/java/ai/rapids/cudf/HostMemoryBuffer.java | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/java/src/main/java/ai/rapids/cudf/HostMemoryBuffer.java b/java/src/main/java/ai/rapids/cudf/HostMemoryBuffer.java
index d792459901c..bfb959b12c1 100644
--- a/java/src/main/java/ai/rapids/cudf/HostMemoryBuffer.java
+++ b/java/src/main/java/ai/rapids/cudf/HostMemoryBuffer.java
@@ -255,8 +255,10 @@ public final void copyFromHostBuffer(long destOffset, HostMemoryBuffer srcData,
    * @param destOffset  offset in bytes in this buffer to start copying to
    * @param in input stream to copy bytes from
    * @param byteLength number of bytes to copy
+   * @throws EOFException If there are not enough bytes in the stream to copy.
+   * @throws IOException If there is an error reading from the stream.
    */
-  final void copyFromStream(long destOffset, InputStream in, long byteLength) throws IOException {
+  public final void copyFromStream(long destOffset, InputStream in, long byteLength) throws IOException {
     addressOutOfBoundsCheck(address + destOffset, byteLength, "copy from stream");
     byte[] arrayBuffer = new byte[(int) Math.min(1024 * 128, byteLength)];
     long left = byteLength;
@@ -264,7 +266,7 @@ final void copyFromStream(long destOffset, InputStream in, long byteLength) thro
       int amountToCopy = (int) Math.min(arrayBuffer.length, left);
       int amountRead = in.read(arrayBuffer, 0, amountToCopy);
       if (amountRead < 0) {
-        throw new EOFException();
+        throw new EOFException("Unexpected end of stream, expected " + left + " more bytes");
       }
       setBytes(destOffset, arrayBuffer, 0, amountRead);
       destOffset += amountRead;

From 5ee7d7caaf459e7d30597e6ea2dd1904c50d12fc Mon Sep 17 00:00:00 2001
From: David Wendt <45795991+davidwendt@users.noreply.github.com>
Date: Wed, 30 Oct 2024 10:13:28 -0400
Subject: [PATCH 09/41] [no ci] Add empty-columns section to the libcudf
 developer guide (#17183)

Adds a section on `Empty Columns` to the libcudf DEVELOPER_GUIDE

Authors:
  - David Wendt (https://github.com/davidwendt)

Approvers:
  - Basit Ayantunde (https://github.com/lamarrr)
  - Vukasin Milovanovic (https://github.com/vuule)

URL: https://github.com/rapidsai/cudf/pull/17183
---
 cpp/doxygen/developer_guide/DEVELOPER_GUIDE.md | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/cpp/doxygen/developer_guide/DEVELOPER_GUIDE.md b/cpp/doxygen/developer_guide/DEVELOPER_GUIDE.md
index 311539efbfc..1c1052487f2 100644
--- a/cpp/doxygen/developer_guide/DEVELOPER_GUIDE.md
+++ b/cpp/doxygen/developer_guide/DEVELOPER_GUIDE.md
@@ -1483,6 +1483,17 @@ struct, and therefore `cudf::struct_view` is the data type of a `cudf::column` o
 
 `cudf::type_dispatcher` dispatches to the `struct_view` data type when invoked on a `STRUCT` column.
 
+# Empty Columns
+
+The libcudf columns support empty, typed content. These columns have no data and no validity mask.
+Empty strings or lists columns may or may not contain a child offsets column.
+It is undefined behavior (UB) to access the offsets child of an empty strings or lists column.
+Nested columns like lists and structs may require other child columns to provide the
+nested structure of the empty types.
+
+Use `cudf::make_empty_column()` to create fixed-width and strings columns.
+Use `cudf::empty_like()` to create an empty column from an existing `cudf::column_view`.
+
 # cuIO: file reading and writing
 
 cuIO is a component of libcudf that provides GPU-accelerated reading and writing of data file

From 6c2eb4ef03c56413000b3d28574868b68c86181f Mon Sep 17 00:00:00 2001
From: Bradley Dice <bdice@bradleydice.com>
Date: Wed, 30 Oct 2024 10:38:38 -0500
Subject: [PATCH 10/41] Upgrade nvcomp to 4.1.0.6 (#17201)

This updates cudf to use nvcomp 4.1.0.6.

The version is updated in rapids-cmake in https://github.com/rapidsai/rapids-cmake/pull/709.

Authors:
  - Bradley Dice (https://github.com/bdice)

Approvers:
  - James Lamb (https://github.com/jameslamb)
  - Jake Awe (https://github.com/AyodeAwe)

URL: https://github.com/rapidsai/cudf/pull/17201
---
 conda/environments/all_cuda-118_arch-x86_64.yaml | 2 +-
 conda/environments/all_cuda-125_arch-x86_64.yaml | 2 +-
 conda/recipes/libcudf/conda_build_config.yaml    | 2 +-
 dependencies.yaml                                | 8 ++++----
 python/libcudf/pyproject.toml                    | 2 +-
 5 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/conda/environments/all_cuda-118_arch-x86_64.yaml b/conda/environments/all_cuda-118_arch-x86_64.yaml
index c3716c4759a..f3bbaaa8779 100644
--- a/conda/environments/all_cuda-118_arch-x86_64.yaml
+++ b/conda/environments/all_cuda-118_arch-x86_64.yaml
@@ -58,7 +58,7 @@ dependencies:
 - numpy>=1.23,<3.0a0
 - numpydoc
 - nvcc_linux-64=11.8
-- nvcomp==4.0.1
+- nvcomp==4.1.0.6
 - nvtx>=0.2.1
 - openpyxl
 - packaging
diff --git a/conda/environments/all_cuda-125_arch-x86_64.yaml b/conda/environments/all_cuda-125_arch-x86_64.yaml
index 38e131e79cb..38c5b361f70 100644
--- a/conda/environments/all_cuda-125_arch-x86_64.yaml
+++ b/conda/environments/all_cuda-125_arch-x86_64.yaml
@@ -56,7 +56,7 @@ dependencies:
 - numba-cuda>=0.0.13
 - numpy>=1.23,<3.0a0
 - numpydoc
-- nvcomp==4.0.1
+- nvcomp==4.1.0.6
 - nvtx>=0.2.1
 - openpyxl
 - packaging
diff --git a/conda/recipes/libcudf/conda_build_config.yaml b/conda/recipes/libcudf/conda_build_config.yaml
index dc75eb4b252..c78ca326005 100644
--- a/conda/recipes/libcudf/conda_build_config.yaml
+++ b/conda/recipes/libcudf/conda_build_config.yaml
@@ -35,7 +35,7 @@ spdlog_version:
   - ">=1.14.1,<1.15"
 
 nvcomp_version:
-  - "=4.0.1"
+  - "=4.1.0.6"
 
 zlib_version:
   - ">=1.2.13"
diff --git a/dependencies.yaml b/dependencies.yaml
index 7c7aa43fa41..bd1a5deb878 100644
--- a/dependencies.yaml
+++ b/dependencies.yaml
@@ -399,21 +399,21 @@ dependencies:
       - output_types: conda
         packages:
           # Align nvcomp version with rapids-cmake
-          - nvcomp==4.0.1
+          - nvcomp==4.1.0.6
     specific:
       - output_types: [requirements, pyproject]
         matrices:
           - matrix:
               cuda: "12.*"
             packages:
-              - nvidia-nvcomp-cu12==4.0.1
+              - nvidia-nvcomp-cu12==4.1.0.6
           - matrix:
               cuda: "11.*"
             packages:
-              - nvidia-nvcomp-cu11==4.0.1
+              - nvidia-nvcomp-cu11==4.1.0.6
           - matrix:
             packages:
-              - nvidia-nvcomp==4.0.1
+              - nvidia-nvcomp==4.1.0.6
   rapids_build_skbuild:
     common:
       - output_types: [conda, requirements, pyproject]
diff --git a/python/libcudf/pyproject.toml b/python/libcudf/pyproject.toml
index 84660cbc276..c6d9ae56467 100644
--- a/python/libcudf/pyproject.toml
+++ b/python/libcudf/pyproject.toml
@@ -38,7 +38,7 @@ classifiers = [
     "Environment :: GPU :: NVIDIA CUDA",
 ]
 dependencies = [
-    "nvidia-nvcomp==4.0.1",
+    "nvidia-nvcomp==4.1.0.6",
 ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`.
 
 [project.urls]

From 0b9277b3abe014b9ab1cf7f849c36b21c2422bbe Mon Sep 17 00:00:00 2001
From: Shruti Shivakumar <shruti.shivakumar@gmail.com>
Date: Wed, 30 Oct 2024 13:52:56 -0400
Subject: [PATCH 11/41] Fix bug in recovering invalid lines in JSONL inputs
 (#17098)

Addresses #16999

Authors:
  - Shruti Shivakumar (https://github.com/shrshi)
  - Karthikeyan (https://github.com/karthikeyann)
  - Nghia Truong (https://github.com/ttnghia)

Approvers:
  - Basit Ayantunde (https://github.com/lamarrr)
  - Nghia Truong (https://github.com/ttnghia)

URL: https://github.com/rapidsai/cudf/pull/17098
---
 cpp/src/io/json/read_json.cu     | 44 +++++++++++++++++++++++---------
 cpp/src/io/json/read_json.hpp    |  1 +
 cpp/tests/io/json/json_test.cpp  | 18 +++++++++++++
 cpp/tests/io/json/json_utils.cuh |  1 +
 4 files changed, 52 insertions(+), 12 deletions(-)

diff --git a/cpp/src/io/json/read_json.cu b/cpp/src/io/json/read_json.cu
index 8a740ae17ef..2bc15ea19cb 100644
--- a/cpp/src/io/json/read_json.cu
+++ b/cpp/src/io/json/read_json.cu
@@ -20,6 +20,7 @@
 
 #include <cudf/concatenate.hpp>
 #include <cudf/detail/nvtx/ranges.hpp>
+#include <cudf/detail/utilities/cuda_memcpy.hpp>
 #include <cudf/detail/utilities/integer_utils.hpp>
 #include <cudf/detail/utilities/stream_pool.hpp>
 #include <cudf/detail/utilities/vector_factories.hpp>
@@ -127,7 +128,8 @@ datasource::owning_buffer<rmm::device_buffer> get_record_range_raw_input(
 
   std::size_t const total_source_size       = sources_size(sources, 0, 0);
   auto constexpr num_delimiter_chars        = 1;
-  auto const num_extra_delimiters           = num_delimiter_chars * (sources.size() - 1);
+  auto const delimiter                      = reader_opts.get_delimiter();
+  auto const num_extra_delimiters           = num_delimiter_chars * sources.size();
   compression_type const reader_compression = reader_opts.get_compression();
   std::size_t const chunk_offset            = reader_opts.get_byte_range_offset();
   std::size_t chunk_size                    = reader_opts.get_byte_range_size();
@@ -135,10 +137,10 @@ datasource::owning_buffer<rmm::device_buffer> get_record_range_raw_input(
   CUDF_EXPECTS(total_source_size ? chunk_offset < total_source_size : !chunk_offset,
                "Invalid offsetting",
                std::invalid_argument);
-  auto should_load_all_sources = !chunk_size || chunk_size >= total_source_size - chunk_offset;
-  chunk_size = should_load_all_sources ? total_source_size - chunk_offset : chunk_size;
+  auto should_load_till_last_source = !chunk_size || chunk_size >= total_source_size - chunk_offset;
+  chunk_size = should_load_till_last_source ? total_source_size - chunk_offset : chunk_size;
 
-  int num_subchunks_prealloced        = should_load_all_sources ? 0 : max_subchunks_prealloced;
+  int num_subchunks_prealloced        = should_load_till_last_source ? 0 : max_subchunks_prealloced;
   std::size_t const size_per_subchunk = estimate_size_per_subchunk(chunk_size);
 
   // The allocation for single source compressed input is estimated by assuming a ~4:1
@@ -155,17 +157,17 @@ datasource::owning_buffer<rmm::device_buffer> get_record_range_raw_input(
 
   // Offset within buffer indicating first read position
   std::int64_t buffer_offset = 0;
-  auto readbufspan =
-    ingest_raw_input(bufspan, sources, reader_compression, chunk_offset, chunk_size, stream);
+  auto readbufspan           = ingest_raw_input(
+    bufspan, sources, reader_compression, chunk_offset, chunk_size, delimiter, stream);
 
   auto const shift_for_nonzero_offset = std::min<std::int64_t>(chunk_offset, 1);
   auto const first_delim_pos =
-    chunk_offset == 0 ? 0 : find_first_delimiter(readbufspan, '\n', stream);
+    chunk_offset == 0 ? 0 : find_first_delimiter(readbufspan, delimiter, stream);
   if (first_delim_pos == -1) {
     // return empty owning datasource buffer
     auto empty_buf = rmm::device_buffer(0, stream);
     return datasource::owning_buffer<rmm::device_buffer>(std::move(empty_buf));
-  } else if (!should_load_all_sources) {
+  } else if (!should_load_till_last_source) {
     // Find next delimiter
     std::int64_t next_delim_pos     = -1;
     std::size_t next_subchunk_start = chunk_offset + chunk_size;
@@ -180,14 +182,15 @@ datasource::owning_buffer<rmm::device_buffer> get_record_range_raw_input(
                                        reader_compression,
                                        next_subchunk_start,
                                        size_per_subchunk,
+                                       delimiter,
                                        stream);
-        next_delim_pos = find_first_delimiter(readbufspan, '\n', stream) + buffer_offset;
+        next_delim_pos = find_first_delimiter(readbufspan, delimiter, stream) + buffer_offset;
         next_subchunk_start += size_per_subchunk;
       }
       if (next_delim_pos < buffer_offset) {
         if (next_subchunk_start >= total_source_size) {
           // If we have reached the end of source list but the source does not terminate with a
-          // newline character
+          // delimiter character
           next_delim_pos = buffer_offset + readbufspan.size();
         } else {
           // Our buffer_size estimate is insufficient to read until the end of the line! We need to
@@ -209,10 +212,26 @@ datasource::owning_buffer<rmm::device_buffer> get_record_range_raw_input(
       reinterpret_cast<uint8_t*>(buffer.data()) + first_delim_pos + shift_for_nonzero_offset,
       next_delim_pos - first_delim_pos - shift_for_nonzero_offset);
   }
+
+  // Add delimiter to end of buffer - possibly adding an empty line to the input buffer - iff we are
+  // reading till the end of the last source i.e. should_load_till_last_source is true Note that the
+  // table generated from the JSONL input remains unchanged since empty lines are ignored by the
+  // parser.
+  size_t num_chars = readbufspan.size() - first_delim_pos - shift_for_nonzero_offset;
+  if (num_chars) {
+    auto last_char = delimiter;
+    cudf::detail::cuda_memcpy_async<char>(
+      device_span<char>(reinterpret_cast<char*>(buffer.data()), buffer.size())
+        .subspan(readbufspan.size(), 1),
+      host_span<char const>(&last_char, 1, false),
+      stream);
+    num_chars++;
+  }
+
   return datasource::owning_buffer<rmm::device_buffer>(
     std::move(buffer),
     reinterpret_cast<uint8_t*>(buffer.data()) + first_delim_pos + shift_for_nonzero_offset,
-    readbufspan.size() - first_delim_pos - shift_for_nonzero_offset);
+    num_chars);
 }
 
 // Helper function to read the current batch using byte range offsets and size
@@ -245,6 +264,7 @@ device_span<char> ingest_raw_input(device_span<char> buffer,
                                    compression_type compression,
                                    std::size_t range_offset,
                                    std::size_t range_size,
+                                   char delimiter,
                                    rmm::cuda_stream_view stream)
 {
   CUDF_FUNC_RANGE();
@@ -296,7 +316,7 @@ device_span<char> ingest_raw_input(device_span<char> buffer,
     if (sources.size() > 1) {
       static_assert(num_delimiter_chars == 1,
                     "Currently only single-character delimiters are supported");
-      auto const delimiter_source = thrust::make_constant_iterator('\n');
+      auto const delimiter_source = thrust::make_constant_iterator(delimiter);
       auto const d_delimiter_map  = cudf::detail::make_device_uvector_async(
         delimiter_map, stream, cudf::get_current_device_resource_ref());
       thrust::scatter(rmm::exec_policy_nosync(stream),
diff --git a/cpp/src/io/json/read_json.hpp b/cpp/src/io/json/read_json.hpp
index 982190eecb5..4def69cc629 100644
--- a/cpp/src/io/json/read_json.hpp
+++ b/cpp/src/io/json/read_json.hpp
@@ -56,6 +56,7 @@ device_span<char> ingest_raw_input(device_span<char> buffer,
                                    compression_type compression,
                                    size_t range_offset,
                                    size_t range_size,
+                                   char delimiter,
                                    rmm::cuda_stream_view stream);
 
 /**
diff --git a/cpp/tests/io/json/json_test.cpp b/cpp/tests/io/json/json_test.cpp
index 5f070bd53b9..b58ca56e066 100644
--- a/cpp/tests/io/json/json_test.cpp
+++ b/cpp/tests/io/json/json_test.cpp
@@ -2973,4 +2973,22 @@ TEST_F(JsonReaderTest, JsonDtypeSchema)
                                  cudf::test::debug_output_level::ALL_ERRORS);
 }
 
+TEST_F(JsonReaderTest, LastRecordInvalid)
+{
+  std::string data = R"({"key": "1"}
+    {"key": "})";
+  std::map<std::string, cudf::io::schema_element> schema{{"key", {dtype<cudf::string_view>()}}};
+  auto opts =
+    cudf::io::json_reader_options::builder(cudf::io::source_info{data.data(), data.size()})
+      .dtypes(schema)
+      .lines(true)
+      .recovery_mode(cudf::io::json_recovery_mode_t::RECOVER_WITH_NULL)
+      .build();
+  auto const result = cudf::io::read_json(opts);
+
+  EXPECT_EQ(result.metadata.schema_info[0].name, "key");
+  cudf::test::strings_column_wrapper expected{{"1", ""}, cudf::test::iterators::nulls_at({1})};
+  CUDF_TEST_EXPECT_TABLES_EQUAL(result.tbl->view(), cudf::table_view{{expected}});
+}
+
 CUDF_TEST_PROGRAM_MAIN()
diff --git a/cpp/tests/io/json/json_utils.cuh b/cpp/tests/io/json/json_utils.cuh
index 9383797d91b..c31bb2d24e0 100644
--- a/cpp/tests/io/json/json_utils.cuh
+++ b/cpp/tests/io/json/json_utils.cuh
@@ -52,6 +52,7 @@ std::vector<cudf::io::table_with_metadata> split_byte_range_reading(
                                                                 reader_opts.get_compression(),
                                                                 reader_opts.get_byte_range_offset(),
                                                                 reader_opts.get_byte_range_size(),
+                                                                reader_opts.get_delimiter(),
                                                                 stream);
     // Note: we cannot reuse cudf::io::json::detail::find_first_delimiter since the
     // return type of that function is size_type. However, when the chunk_size is

From 7157de71f25a1b4f6da0374a4f268c249a80ae1b Mon Sep 17 00:00:00 2001
From: Lawrence Mitchell <lmitchell@nvidia.com>
Date: Wed, 30 Oct 2024 19:15:54 +0000
Subject: [PATCH 12/41] Add conversion from cudf-polars expressions to libcudf
 ast for parquet filters (#17141)

Previously, we always applied parquet filters by post-filtering. This negates much of the potential gain from having filters available at read time, namely discarding row groups. To fix this, implement, with the new visitor system of #17016, conversion to pylibcudf expressions.

We must distinguish two types of expressions: ones that we can evaluate via `cudf::compute_column`, and the more restricted set of expressions that the parquet reader understands. This is handled by having a state that tracks the usage. The former style will be useful when we implement inequality joins.

While here, extend the support in pylibcudf expressions to handle all supported literal types and expose `compute_column` so we can test the correctness of the broader (non-parquet) implementation.

Authors:
  - Lawrence Mitchell (https://github.com/wence-)

Approvers:
  - Vyas Ramasubramani (https://github.com/vyasr)

URL: https://github.com/rapidsai/cudf/pull/17141
---
 python/cudf/cudf/_lib/transform.pyx           |  24 +-
 python/cudf_polars/cudf_polars/dsl/ir.py      |   9 +
 python/cudf_polars/cudf_polars/dsl/to_ast.py  | 265 ++++++++++++++++++
 .../cudf_polars/testing/asserts.py            |   2 +-
 .../cudf_polars/cudf_polars/testing/plugin.py |   6 +-
 python/cudf_polars/tests/dsl/test_to_ast.py   |  78 ++++++
 .../cudf_polars/tests/test_parquet_filters.py |  60 ++++
 python/pylibcudf/pylibcudf/expressions.pyx    |  50 +++-
 .../pylibcudf/pylibcudf/libcudf/transform.pxd |   5 +
 python/pylibcudf/pylibcudf/libcudf/types.pxd  |  11 +-
 .../pylibcudf/libcudf/wrappers/durations.pxd  |   5 +-
 .../pylibcudf/libcudf/wrappers/timestamps.pxd |   5 +-
 .../pylibcudf/tests/test_expressions.py       |  39 ++-
 python/pylibcudf/pylibcudf/transform.pxd      |   3 +
 python/pylibcudf/pylibcudf/transform.pyx      |  27 ++
 15 files changed, 552 insertions(+), 37 deletions(-)
 create mode 100644 python/cudf_polars/cudf_polars/dsl/to_ast.py
 create mode 100644 python/cudf_polars/tests/dsl/test_to_ast.py
 create mode 100644 python/cudf_polars/tests/test_parquet_filters.py

diff --git a/python/cudf/cudf/_lib/transform.pyx b/python/cudf/cudf/_lib/transform.pyx
index 40d0c9eac3a..1589e23f716 100644
--- a/python/cudf/cudf/_lib/transform.pyx
+++ b/python/cudf/cudf/_lib/transform.pyx
@@ -7,20 +7,11 @@ from cudf.core._internals.expressions import parse_expression
 from cudf.core.buffer import acquire_spill_lock, as_buffer
 from cudf.utils import cudautils
 
-from cython.operator cimport dereference
-from libcpp.memory cimport unique_ptr
-from libcpp.utility cimport move
-
-cimport pylibcudf.libcudf.transform as libcudf_transform
 from pylibcudf cimport transform as plc_transform
 from pylibcudf.expressions cimport Expression
-from pylibcudf.libcudf.column.column cimport column
-from pylibcudf.libcudf.expressions cimport expression
-from pylibcudf.libcudf.table.table_view cimport table_view
 from pylibcudf.libcudf.types cimport size_type
 
 from cudf._lib.column cimport Column
-from cudf._lib.utils cimport table_view_from_columns
 
 import pylibcudf as plc
 
@@ -121,13 +112,8 @@ def compute_column(list columns, tuple column_names, expr: str):
 
     # At the end, all the stack contains is the expression to evaluate.
     cdef Expression cudf_expr = visitor.expression
-    cdef table_view tbl = table_view_from_columns(columns)
-    cdef unique_ptr[column] col
-    with nogil:
-        col = move(
-            libcudf_transform.compute_column(
-                tbl,
-                <expression &> dereference(cudf_expr.c_obj.get())
-            )
-        )
-    return Column.from_unique_ptr(move(col))
+    result = plc_transform.compute_column(
+        plc.Table([col.to_pylibcudf(mode="read") for col in columns]),
+        cudf_expr,
+    )
+    return Column.from_pylibcudf(result)
diff --git a/python/cudf_polars/cudf_polars/dsl/ir.py b/python/cudf_polars/cudf_polars/dsl/ir.py
index f79e229d3f3..1aa6741d417 100644
--- a/python/cudf_polars/cudf_polars/dsl/ir.py
+++ b/python/cudf_polars/cudf_polars/dsl/ir.py
@@ -28,6 +28,7 @@
 import cudf_polars.dsl.expr as expr
 from cudf_polars.containers import Column, DataFrame
 from cudf_polars.dsl.nodebase import Node
+from cudf_polars.dsl.to_ast import to_parquet_filter
 from cudf_polars.utils import dtypes
 
 if TYPE_CHECKING:
@@ -418,9 +419,14 @@ def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame:
                 colnames[0],
             )
         elif self.typ == "parquet":
+            filters = None
+            if self.predicate is not None and self.row_index is None:
+                # Can't apply filters during read if we have a row index.
+                filters = to_parquet_filter(self.predicate.value)
             tbl_w_meta = plc.io.parquet.read_parquet(
                 plc.io.SourceInfo(self.paths),
                 columns=with_columns,
+                filters=filters,
                 nrows=n_rows,
                 skip_rows=self.skip_rows,
             )
@@ -429,6 +435,9 @@ def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame:
                 # TODO: consider nested column names?
                 tbl_w_meta.column_names(include_children=False),
             )
+            if filters is not None:
+                # Mask must have been applied.
+                return df
         elif self.typ == "ndjson":
             json_schema: list[tuple[str, str, list]] = [
                 (name, typ, []) for name, typ in self.schema.items()
diff --git a/python/cudf_polars/cudf_polars/dsl/to_ast.py b/python/cudf_polars/cudf_polars/dsl/to_ast.py
new file mode 100644
index 00000000000..ffdae81de55
--- /dev/null
+++ b/python/cudf_polars/cudf_polars/dsl/to_ast.py
@@ -0,0 +1,265 @@
+# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
+# SPDX-License-Identifier: Apache-2.0
+
+"""Conversion of expression nodes to libcudf AST nodes."""
+
+from __future__ import annotations
+
+from functools import partial, reduce, singledispatch
+from typing import TYPE_CHECKING, TypeAlias
+
+import pylibcudf as plc
+from pylibcudf import expressions as plc_expr
+
+from polars.polars import _expr_nodes as pl_expr
+
+from cudf_polars.dsl import expr
+from cudf_polars.dsl.traversal import CachingVisitor
+from cudf_polars.typing import GenericTransformer
+
+if TYPE_CHECKING:
+    from collections.abc import Mapping
+
+# Can't merge these op-mapping dictionaries because scoped enum values
+# are exposed by cython with equality/hash based on their underlying
+# representation type. So in a dict they are just treated as integers.
+BINOP_TO_ASTOP = {
+    plc.binaryop.BinaryOperator.EQUAL: plc_expr.ASTOperator.EQUAL,
+    plc.binaryop.BinaryOperator.NULL_EQUALS: plc_expr.ASTOperator.NULL_EQUAL,
+    plc.binaryop.BinaryOperator.NOT_EQUAL: plc_expr.ASTOperator.NOT_EQUAL,
+    plc.binaryop.BinaryOperator.LESS: plc_expr.ASTOperator.LESS,
+    plc.binaryop.BinaryOperator.LESS_EQUAL: plc_expr.ASTOperator.LESS_EQUAL,
+    plc.binaryop.BinaryOperator.GREATER: plc_expr.ASTOperator.GREATER,
+    plc.binaryop.BinaryOperator.GREATER_EQUAL: plc_expr.ASTOperator.GREATER_EQUAL,
+    plc.binaryop.BinaryOperator.ADD: plc_expr.ASTOperator.ADD,
+    plc.binaryop.BinaryOperator.SUB: plc_expr.ASTOperator.SUB,
+    plc.binaryop.BinaryOperator.MUL: plc_expr.ASTOperator.MUL,
+    plc.binaryop.BinaryOperator.DIV: plc_expr.ASTOperator.DIV,
+    plc.binaryop.BinaryOperator.TRUE_DIV: plc_expr.ASTOperator.TRUE_DIV,
+    plc.binaryop.BinaryOperator.FLOOR_DIV: plc_expr.ASTOperator.FLOOR_DIV,
+    plc.binaryop.BinaryOperator.PYMOD: plc_expr.ASTOperator.PYMOD,
+    plc.binaryop.BinaryOperator.BITWISE_AND: plc_expr.ASTOperator.BITWISE_AND,
+    plc.binaryop.BinaryOperator.BITWISE_OR: plc_expr.ASTOperator.BITWISE_OR,
+    plc.binaryop.BinaryOperator.BITWISE_XOR: plc_expr.ASTOperator.BITWISE_XOR,
+    plc.binaryop.BinaryOperator.LOGICAL_AND: plc_expr.ASTOperator.LOGICAL_AND,
+    plc.binaryop.BinaryOperator.LOGICAL_OR: plc_expr.ASTOperator.LOGICAL_OR,
+    plc.binaryop.BinaryOperator.NULL_LOGICAL_AND: plc_expr.ASTOperator.NULL_LOGICAL_AND,
+    plc.binaryop.BinaryOperator.NULL_LOGICAL_OR: plc_expr.ASTOperator.NULL_LOGICAL_OR,
+}
+
+UOP_TO_ASTOP = {
+    plc.unary.UnaryOperator.SIN: plc_expr.ASTOperator.SIN,
+    plc.unary.UnaryOperator.COS: plc_expr.ASTOperator.COS,
+    plc.unary.UnaryOperator.TAN: plc_expr.ASTOperator.TAN,
+    plc.unary.UnaryOperator.ARCSIN: plc_expr.ASTOperator.ARCSIN,
+    plc.unary.UnaryOperator.ARCCOS: plc_expr.ASTOperator.ARCCOS,
+    plc.unary.UnaryOperator.ARCTAN: plc_expr.ASTOperator.ARCTAN,
+    plc.unary.UnaryOperator.SINH: plc_expr.ASTOperator.SINH,
+    plc.unary.UnaryOperator.COSH: plc_expr.ASTOperator.COSH,
+    plc.unary.UnaryOperator.TANH: plc_expr.ASTOperator.TANH,
+    plc.unary.UnaryOperator.ARCSINH: plc_expr.ASTOperator.ARCSINH,
+    plc.unary.UnaryOperator.ARCCOSH: plc_expr.ASTOperator.ARCCOSH,
+    plc.unary.UnaryOperator.ARCTANH: plc_expr.ASTOperator.ARCTANH,
+    plc.unary.UnaryOperator.EXP: plc_expr.ASTOperator.EXP,
+    plc.unary.UnaryOperator.LOG: plc_expr.ASTOperator.LOG,
+    plc.unary.UnaryOperator.SQRT: plc_expr.ASTOperator.SQRT,
+    plc.unary.UnaryOperator.CBRT: plc_expr.ASTOperator.CBRT,
+    plc.unary.UnaryOperator.CEIL: plc_expr.ASTOperator.CEIL,
+    plc.unary.UnaryOperator.FLOOR: plc_expr.ASTOperator.FLOOR,
+    plc.unary.UnaryOperator.ABS: plc_expr.ASTOperator.ABS,
+    plc.unary.UnaryOperator.RINT: plc_expr.ASTOperator.RINT,
+    plc.unary.UnaryOperator.BIT_INVERT: plc_expr.ASTOperator.BIT_INVERT,
+    plc.unary.UnaryOperator.NOT: plc_expr.ASTOperator.NOT,
+}
+
+SUPPORTED_STATISTICS_BINOPS = {
+    plc.binaryop.BinaryOperator.EQUAL,
+    plc.binaryop.BinaryOperator.NOT_EQUAL,
+    plc.binaryop.BinaryOperator.LESS,
+    plc.binaryop.BinaryOperator.LESS_EQUAL,
+    plc.binaryop.BinaryOperator.GREATER,
+    plc.binaryop.BinaryOperator.GREATER_EQUAL,
+}
+
+REVERSED_COMPARISON = {
+    plc.binaryop.BinaryOperator.EQUAL: plc.binaryop.BinaryOperator.EQUAL,
+    plc.binaryop.BinaryOperator.NOT_EQUAL: plc.binaryop.BinaryOperator.NOT_EQUAL,
+    plc.binaryop.BinaryOperator.LESS: plc.binaryop.BinaryOperator.GREATER,
+    plc.binaryop.BinaryOperator.LESS_EQUAL: plc.binaryop.BinaryOperator.GREATER_EQUAL,
+    plc.binaryop.BinaryOperator.GREATER: plc.binaryop.BinaryOperator.LESS,
+    plc.binaryop.BinaryOperator.GREATER_EQUAL: plc.binaryop.BinaryOperator.LESS_EQUAL,
+}
+
+
+Transformer: TypeAlias = GenericTransformer[expr.Expr, plc_expr.Expression]
+
+
+@singledispatch
+def _to_ast(node: expr.Expr, self: Transformer) -> plc_expr.Expression:
+    """
+    Translate an expression to a pylibcudf Expression.
+
+    Parameters
+    ----------
+    node
+        Expression to translate.
+    self
+        Recursive transformer. The state dictionary should contain a
+        `for_parquet` key indicating if this transformation should
+        provide an expression suitable for use in parquet filters.
+
+        If `for_parquet` is `False`, the dictionary should contain a
+        `name_to_index` mapping that maps column names to their
+        integer index in the table that will be used for evaluation of
+        the expression.
+
+    Returns
+    -------
+    pylibcudf Expression.
+
+    Raises
+    ------
+    NotImplementedError or KeyError if the expression cannot be translated.
+    """
+    raise NotImplementedError(f"Unhandled expression type {type(node)}")
+
+
+@_to_ast.register
+def _(node: expr.Col, self: Transformer) -> plc_expr.Expression:
+    if self.state["for_parquet"]:
+        return plc_expr.ColumnNameReference(node.name)
+    return plc_expr.ColumnReference(self.state["name_to_index"][node.name])
+
+
+@_to_ast.register
+def _(node: expr.Literal, self: Transformer) -> plc_expr.Expression:
+    return plc_expr.Literal(plc.interop.from_arrow(node.value))
+
+
+@_to_ast.register
+def _(node: expr.BinOp, self: Transformer) -> plc_expr.Expression:
+    if node.op == plc.binaryop.BinaryOperator.NULL_NOT_EQUALS:
+        return plc_expr.Operation(
+            plc_expr.ASTOperator.NOT,
+            self(
+                # Reconstruct and apply, rather than directly
+                # constructing the right expression so we get the
+                # handling of parquet special cases for free.
+                expr.BinOp(
+                    node.dtype, plc.binaryop.BinaryOperator.NULL_EQUALS, *node.children
+                )
+            ),
+        )
+    if self.state["for_parquet"]:
+        op1_col, op2_col = (isinstance(op, expr.Col) for op in node.children)
+        if op1_col ^ op2_col:
+            op = node.op
+            if op not in SUPPORTED_STATISTICS_BINOPS:
+                raise NotImplementedError(
+                    f"Parquet filter binop with column doesn't support {node.op!r}"
+                )
+            op1, op2 = node.children
+            if op2_col:
+                (op1, op2) = (op2, op1)
+                op = REVERSED_COMPARISON[op]
+            if not isinstance(op2, expr.Literal):
+                raise NotImplementedError(
+                    "Parquet filter binops must have form 'col binop literal'"
+                )
+            return plc_expr.Operation(BINOP_TO_ASTOP[op], self(op1), self(op2))
+        elif op1_col and op2_col:
+            raise NotImplementedError(
+                "Parquet filter binops must have one column reference not two"
+            )
+    return plc_expr.Operation(BINOP_TO_ASTOP[node.op], *map(self, node.children))
+
+
+@_to_ast.register
+def _(node: expr.BooleanFunction, self: Transformer) -> plc_expr.Expression:
+    if node.name == pl_expr.BooleanFunction.IsIn:
+        needles, haystack = node.children
+        if isinstance(haystack, expr.LiteralColumn) and 0 < len(haystack.value) < 16:
+            # 16 is an arbitrary limit; reduce() needs at least one value
+            needle_ref = self(needles)
+            values = [
+                plc_expr.Literal(plc.interop.from_arrow(v)) for v in haystack.value
+            ]
+            return reduce(
+                partial(plc_expr.Operation, plc_expr.ASTOperator.LOGICAL_OR),
+                (
+                    plc_expr.Operation(plc_expr.ASTOperator.EQUAL, needle_ref, value)
+                    for value in values
+                ),
+            )
+    if self.state["for_parquet"] and isinstance(node.children[0], expr.Col):
+        raise NotImplementedError(
+            f"Parquet filters don't support {node.name} on columns"
+        )
+    if node.name == pl_expr.BooleanFunction.IsNull:
+        return plc_expr.Operation(plc_expr.ASTOperator.IS_NULL, self(node.children[0]))
+    elif node.name == pl_expr.BooleanFunction.IsNotNull:
+        return plc_expr.Operation(
+            plc_expr.ASTOperator.NOT,
+            plc_expr.Operation(plc_expr.ASTOperator.IS_NULL, self(node.children[0])),
+        )
+    elif node.name == pl_expr.BooleanFunction.Not:
+        return plc_expr.Operation(plc_expr.ASTOperator.NOT, self(node.children[0]))
+    raise NotImplementedError(f"AST conversion does not support {node.name}")
+
+
+@_to_ast.register
+def _(node: expr.UnaryFunction, self: Transformer) -> plc_expr.Expression:
+    if isinstance(node.children[0], expr.Col) and self.state["for_parquet"]:
+        raise NotImplementedError(
+            f"Parquet filters don't support {node.name} on columns"
+        )
+    return plc_expr.Operation(
+        UOP_TO_ASTOP[node._OP_MAPPING[node.name]], self(node.children[0])
+    )
+
+
+def to_parquet_filter(node: expr.Expr) -> plc_expr.Expression | None:
+    """
+    Convert an expression to libcudf AST nodes suitable for parquet filtering.
+
+    Parameters
+    ----------
+    node
+        Expression to convert.
+
+    Returns
+    -------
+    pylibcudf Expression if conversion is possible, otherwise None.
+    """
+    mapper = CachingVisitor(_to_ast, state={"for_parquet": True})
+    try:
+        return mapper(node)
+    except (KeyError, NotImplementedError):
+        return None
+
+
+def to_ast(
+    node: expr.Expr, *, name_to_index: Mapping[str, int]
+) -> plc_expr.Expression | None:
+    """
+    Convert an expression to libcudf AST nodes suitable for compute_column.
+
+    Parameters
+    ----------
+    node
+        Expression to convert.
+    name_to_index
+        Mapping from column names to their index in the table that
+        will be used for expression evaluation.
+
+    Returns
+    -------
+    pylibcudf Expression if conversion is possible, otherwise None.
+    """
+    mapper = CachingVisitor(
+        _to_ast, state={"for_parquet": False, "name_to_index": name_to_index}
+    )
+    try:
+        return mapper(node)
+    except (KeyError, NotImplementedError):
+        return None
diff --git a/python/cudf_polars/cudf_polars/testing/asserts.py b/python/cudf_polars/cudf_polars/testing/asserts.py
index 7b6f3848fc4..7b45c1eaa06 100644
--- a/python/cudf_polars/cudf_polars/testing/asserts.py
+++ b/python/cudf_polars/cudf_polars/testing/asserts.py
@@ -151,7 +151,7 @@ def assert_collect_raises(
     collect_kwargs: dict[OptimizationArgs, bool] | None = None,
     polars_collect_kwargs: dict[OptimizationArgs, bool] | None = None,
     cudf_collect_kwargs: dict[OptimizationArgs, bool] | None = None,
-):
+) -> None:
     """
     Assert that collecting the result of a query raises the expected exceptions.
 
diff --git a/python/cudf_polars/cudf_polars/testing/plugin.py b/python/cudf_polars/cudf_polars/testing/plugin.py
index a3607159e01..e01ccd05527 100644
--- a/python/cudf_polars/cudf_polars/testing/plugin.py
+++ b/python/cudf_polars/cudf_polars/testing/plugin.py
@@ -16,7 +16,7 @@
     from collections.abc import Mapping
 
 
-def pytest_addoption(parser: pytest.Parser):
+def pytest_addoption(parser: pytest.Parser) -> None:
     """Add plugin-specific options."""
     group = parser.getgroup(
         "cudf-polars", "Plugin to set GPU as default engine for polars tests"
@@ -28,7 +28,7 @@ def pytest_addoption(parser: pytest.Parser):
     )
 
 
-def pytest_configure(config: pytest.Config):
+def pytest_configure(config: pytest.Config) -> None:
     """Enable use of this module as a pytest plugin to enable GPU collection."""
     no_fallback = config.getoption("--cudf-polars-no-fallback")
     collect = polars.LazyFrame.collect
@@ -172,7 +172,7 @@ def pytest_configure(config: pytest.Config):
 
 def pytest_collection_modifyitems(
     session: pytest.Session, config: pytest.Config, items: list[pytest.Item]
-):
+) -> None:
     """Mark known failing tests."""
     if config.getoption("--cudf-polars-no-fallback"):
         # Don't xfail tests if running without fallback
diff --git a/python/cudf_polars/tests/dsl/test_to_ast.py b/python/cudf_polars/tests/dsl/test_to_ast.py
new file mode 100644
index 00000000000..a7b779a6ec9
--- /dev/null
+++ b/python/cudf_polars/tests/dsl/test_to_ast.py
@@ -0,0 +1,78 @@
+# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
+# SPDX-License-Identifier: Apache-2.0
+
+from __future__ import annotations
+
+import pylibcudf as plc
+import pytest
+
+import polars as pl
+from polars.testing import assert_frame_equal
+
+import cudf_polars.dsl.ir as ir_nodes
+from cudf_polars import translate_ir
+from cudf_polars.containers.dataframe import DataFrame, NamedColumn
+from cudf_polars.dsl.to_ast import to_ast
+
+
+@pytest.fixture(scope="module")
+def df():
+    return pl.LazyFrame(
+        {
+            "c": ["a", "b", "c", "d", "e", "f"],
+            "a": [1, 2, 3, None, 4, 5],
+            "b": pl.Series([None, None, 3, float("inf"), 4, 0], dtype=pl.Float64),
+            "d": [False, True, True, None, False, False],
+        }
+    )
+
+
+@pytest.mark.parametrize(
+    "expr",
+    [
+        pl.col("a").is_in([0, 1]),
+        pl.col("a").is_between(0, 2),
+        (pl.col("a") < pl.col("b")).not_(),
+        pl.lit(2) > pl.col("a"),
+        pl.lit(2) >= pl.col("a"),
+        pl.lit(2) < pl.col("a"),
+        pl.lit(2) <= pl.col("a"),
+        pl.lit(0) == pl.col("a"),
+        pl.lit(1) != pl.col("a"),
+        (pl.col("b") < pl.lit(2, dtype=pl.Float64).sqrt()),
+        (pl.col("a") >= pl.lit(2)) & (pl.col("b") > 0),
+        pl.col("a").is_null(),
+        pl.col("a").is_not_null(),
+        pl.col("b").is_finite(),
+        pytest.param(
+            pl.col("a").sin(),
+            marks=pytest.mark.xfail(reason="Need to insert explicit casts"),
+        ),
+        pl.col("b").cos(),
+        pl.col("a").abs().is_between(0, 2),
+        pl.col("a").ne_missing(pl.lit(None, dtype=pl.Int64)),
+        [pl.col("a") * 2, pl.col("b") + pl.col("a")],
+        pl.col("d").not_(),
+    ],
+)
+def test_compute_column(expr, df):
+    q = df.select(expr)
+    ir = translate_ir(q._ldf.visit())
+
+    assert isinstance(ir, ir_nodes.Select)
+    table = ir.children[0].evaluate(cache={})
+    name_to_index = {c.name: i for i, c in enumerate(table.columns)}
+
+    def compute_column(e):
+        ast = to_ast(e.value, name_to_index=name_to_index)
+        if ast is not None:
+            return NamedColumn(
+                plc.transform.compute_column(table.table, ast), name=e.name
+            )
+        return e.evaluate(table)
+
+    got = DataFrame(map(compute_column, ir.exprs)).to_polars()
+
+    expect = q.collect()
+
+    assert_frame_equal(expect, got)
diff --git a/python/cudf_polars/tests/test_parquet_filters.py b/python/cudf_polars/tests/test_parquet_filters.py
new file mode 100644
index 00000000000..545a89250fc
--- /dev/null
+++ b/python/cudf_polars/tests/test_parquet_filters.py
@@ -0,0 +1,60 @@
+# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
+# SPDX-License-Identifier: Apache-2.0
+from __future__ import annotations
+
+import pytest
+
+import polars as pl
+from polars.testing import assert_frame_equal
+
+
+@pytest.fixture(scope="module")
+def df():
+    return pl.DataFrame(
+        {
+            "c": ["a", "b", "c", "d", "e", "f"],
+            "a": [1, 2, 3, None, 4, 5],
+            "b": pl.Series([None, None, 3, float("inf"), 4, 0], dtype=pl.Float64),
+            "d": [-1, 2, -3, None, 4, -5],
+        }
+    )
+
+
+@pytest.fixture(scope="module")
+def pq_file(tmp_path_factory, df):
+    tmp_path = tmp_path_factory.mktemp("parquet_filter")
+    df.write_parquet(tmp_path / "tmp.pq", row_group_size=3)
+    return pl.scan_parquet(tmp_path / "tmp.pq")
+
+
+@pytest.mark.parametrize(
+    "expr",
+    [
+        pl.col("a").is_in([0, 1]),
+        pl.col("a").is_between(0, 2),
+        (pl.col("a") < 2).not_(),
+        pl.lit(2) > pl.col("a"),
+        pl.lit(2) >= pl.col("a"),
+        pl.lit(2) < pl.col("a"),
+        pl.lit(2) <= pl.col("a"),
+        pl.lit(0) == pl.col("a"),
+        pl.lit(1) != pl.col("a"),
+        pl.col("a") == pl.col("d"),
+        (pl.col("b") < pl.lit(2, dtype=pl.Float64).sqrt()),
+        (pl.col("a") >= pl.lit(2)) & (pl.col("b") > 0),
+        pl.col("b").is_finite(),
+        pl.col("a").is_null(),
+        pl.col("a").is_not_null(),
+        pl.col("a").abs().is_between(0, 2),
+        pl.col("a").ne_missing(pl.lit(None, dtype=pl.Int64)),
+    ],
+)
+@pytest.mark.parametrize("selection", [["c", "b"], ["a"], ["a", "c"], ["b"], "c"])
+def test_scan_by_hand(expr, selection, pq_file):
+    df = pq_file.collect()
+    q = pq_file.filter(expr).select(*selection)
+    # Not using assert_gpu_result_equal because
+    # https://github.com/pola-rs/polars/issues/19238
+    got = q.collect(engine=pl.GPUEngine(raise_on_fail=True))
+    expect = df.filter(expr).select(*selection)
+    assert_frame_equal(got, expect)
diff --git a/python/pylibcudf/pylibcudf/expressions.pyx b/python/pylibcudf/pylibcudf/expressions.pyx
index a44c9e25987..1535f68366b 100644
--- a/python/pylibcudf/pylibcudf/expressions.pyx
+++ b/python/pylibcudf/pylibcudf/expressions.pyx
@@ -5,7 +5,17 @@ from pylibcudf.libcudf.expressions import \
     table_reference as TableReference  # no-cython-lint
 
 from cython.operator cimport dereference
-from libc.stdint cimport int32_t, int64_t
+from libc.stdint cimport (
+    int8_t,
+    int16_t,
+    int32_t,
+    int64_t,
+    uint8_t,
+    uint16_t,
+    uint32_t,
+    uint64_t,
+)
+from libcpp cimport bool
 from libcpp.memory cimport make_unique, unique_ptr
 from libcpp.string cimport string
 from libcpp.utility cimport move
@@ -18,12 +28,14 @@ from pylibcudf.libcudf.scalar.scalar cimport (
 )
 from pylibcudf.libcudf.types cimport size_type, type_id
 from pylibcudf.libcudf.wrappers.durations cimport (
+    duration_D,
     duration_ms,
     duration_ns,
     duration_s,
     duration_us,
 )
 from pylibcudf.libcudf.wrappers.timestamps cimport (
+    timestamp_D,
     timestamp_ms,
     timestamp_ns,
     timestamp_s,
@@ -78,6 +90,34 @@ cdef class Literal(Expression):
             self.c_obj = <expression_ptr> move(make_unique[libcudf_exp.literal](
                 <numeric_scalar[int32_t] &>dereference(self.scalar.c_obj)
             ))
+        elif tid == type_id.INT16:
+            self.c_obj = <expression_ptr> move(make_unique[libcudf_exp.literal](
+                <numeric_scalar[int16_t] &>dereference(self.scalar.c_obj)
+            ))
+        elif tid == type_id.INT8:
+            self.c_obj = <expression_ptr> move(make_unique[libcudf_exp.literal](
+                <numeric_scalar[int8_t] &>dereference(self.scalar.c_obj)
+            ))
+        elif tid == type_id.UINT64:
+            self.c_obj = <expression_ptr> move(make_unique[libcudf_exp.literal](
+                <numeric_scalar[uint64_t] &>dereference(self.scalar.c_obj)
+            ))
+        elif tid == type_id.UINT32:
+            self.c_obj = <expression_ptr> move(make_unique[libcudf_exp.literal](
+                <numeric_scalar[uint32_t] &>dereference(self.scalar.c_obj)
+            ))
+        elif tid == type_id.UINT16:
+            self.c_obj = <expression_ptr> move(make_unique[libcudf_exp.literal](
+                <numeric_scalar[uint16_t] &>dereference(self.scalar.c_obj)
+            ))
+        elif tid == type_id.UINT8:
+            self.c_obj = <expression_ptr> move(make_unique[libcudf_exp.literal](
+                <numeric_scalar[uint8_t] &>dereference(self.scalar.c_obj)
+            ))
+        elif tid == type_id.BOOL8:
+            self.c_obj = <expression_ptr> move(make_unique[libcudf_exp.literal](
+                <numeric_scalar[bool] &>dereference(self.scalar.c_obj)
+            ))
         elif tid == type_id.FLOAT64:
             self.c_obj = <expression_ptr> move(make_unique[libcudf_exp.literal](
                 <numeric_scalar[double] &>dereference(self.scalar.c_obj)
@@ -110,6 +150,10 @@ cdef class Literal(Expression):
             self.c_obj = <expression_ptr> move(make_unique[libcudf_exp.literal](
                 <timestamp_scalar[timestamp_s] &>dereference(self.scalar.c_obj)
             ))
+        elif tid == type_id.TIMESTAMP_DAYS:
+            self.c_obj = <expression_ptr> move(make_unique[libcudf_exp.literal](
+                <timestamp_scalar[timestamp_D] &>dereference(self.scalar.c_obj)
+            ))
         elif tid == type_id.DURATION_NANOSECONDS:
             self.c_obj = <expression_ptr> move(make_unique[libcudf_exp.literal](
                 <duration_scalar[duration_ns] &>dereference(self.scalar.c_obj)
@@ -130,6 +174,10 @@ cdef class Literal(Expression):
             self.c_obj = <expression_ptr> move(make_unique[libcudf_exp.literal](
                 <duration_scalar[duration_s] &>dereference(self.scalar.c_obj)
             ))
+        elif tid == type_id.DURATION_DAYS:
+            self.c_obj = <expression_ptr> move(make_unique[libcudf_exp.literal](
+                <duration_scalar[duration_D] &>dereference(self.scalar.c_obj)
+            ))
         else:
             raise NotImplementedError(
                 f"Don't know how to make literal with type id {tid}"
diff --git a/python/pylibcudf/pylibcudf/libcudf/transform.pxd b/python/pylibcudf/pylibcudf/libcudf/transform.pxd
index d21510bd731..47d79083b66 100644
--- a/python/pylibcudf/pylibcudf/libcudf/transform.pxd
+++ b/python/pylibcudf/pylibcudf/libcudf/transform.pxd
@@ -27,6 +27,11 @@ cdef extern from "cudf/transform.hpp" namespace "cudf" nogil:
         column_view input
     ) except +
 
+    cdef unique_ptr[column] compute_column(
+        table_view table,
+        expression expr
+    ) except +
+
     cdef unique_ptr[column] transform(
         column_view input,
         string unary_udf,
diff --git a/python/pylibcudf/pylibcudf/libcudf/types.pxd b/python/pylibcudf/pylibcudf/libcudf/types.pxd
index eabae68bc90..60e293e5cdb 100644
--- a/python/pylibcudf/pylibcudf/libcudf/types.pxd
+++ b/python/pylibcudf/pylibcudf/libcudf/types.pxd
@@ -70,18 +70,19 @@ cdef extern from "cudf/types.hpp" namespace "cudf" nogil:
         TIMESTAMP_MILLISECONDS
         TIMESTAMP_MICROSECONDS
         TIMESTAMP_NANOSECONDS
-        DICTIONARY32
-        STRING
-        LIST
-        STRUCT
-        NUM_TYPE_IDS
+        DURATION_DAYS
         DURATION_SECONDS
         DURATION_MILLISECONDS
         DURATION_MICROSECONDS
         DURATION_NANOSECONDS
+        DICTIONARY32
+        STRING
+        LIST
         DECIMAL32
         DECIMAL64
         DECIMAL128
+        STRUCT
+        NUM_TYPE_IDS
 
     cdef cppclass data_type:
         data_type() except +
diff --git a/python/pylibcudf/pylibcudf/libcudf/wrappers/durations.pxd b/python/pylibcudf/pylibcudf/libcudf/wrappers/durations.pxd
index 7c648425eb5..c9c960d0a79 100644
--- a/python/pylibcudf/pylibcudf/libcudf/wrappers/durations.pxd
+++ b/python/pylibcudf/pylibcudf/libcudf/wrappers/durations.pxd
@@ -1,9 +1,10 @@
-# Copyright (c) 2020, NVIDIA CORPORATION.
+# Copyright (c) 2020-2024, NVIDIA CORPORATION.
 
-from libc.stdint cimport int64_t
+from libc.stdint cimport int32_t, int64_t
 
 
 cdef extern from "cudf/wrappers/durations.hpp" namespace "cudf" nogil:
+    ctypedef int32_t duration_D
     ctypedef int64_t duration_s
     ctypedef int64_t duration_ms
     ctypedef int64_t duration_us
diff --git a/python/pylibcudf/pylibcudf/libcudf/wrappers/timestamps.pxd b/python/pylibcudf/pylibcudf/libcudf/wrappers/timestamps.pxd
index 50d37fd0a68..5dcd144529d 100644
--- a/python/pylibcudf/pylibcudf/libcudf/wrappers/timestamps.pxd
+++ b/python/pylibcudf/pylibcudf/libcudf/wrappers/timestamps.pxd
@@ -1,9 +1,10 @@
-# Copyright (c) 2020, NVIDIA CORPORATION.
+# Copyright (c) 2020-2024, NVIDIA CORPORATION.
 
-from libc.stdint cimport int64_t
+from libc.stdint cimport int32_t, int64_t
 
 
 cdef extern from "cudf/wrappers/timestamps.hpp" namespace "cudf" nogil:
+    ctypedef int32_t timestamp_D
     ctypedef int64_t timestamp_s
     ctypedef int64_t timestamp_ms
     ctypedef int64_t timestamp_us
diff --git a/python/pylibcudf/pylibcudf/tests/test_expressions.py b/python/pylibcudf/pylibcudf/tests/test_expressions.py
index 6eabd6db617..52c81c49b9d 100644
--- a/python/pylibcudf/pylibcudf/tests/test_expressions.py
+++ b/python/pylibcudf/pylibcudf/tests/test_expressions.py
@@ -1,12 +1,11 @@
 # Copyright (c) 2024, NVIDIA CORPORATION.
 import pyarrow as pa
+import pyarrow.compute as pc
 import pytest
+from utils import assert_column_eq
 
 import pylibcudf as plc
 
-# We can't really evaluate these expressions, so just make sure
-# construction works properly
-
 
 def test_literal_construction_invalid():
     with pytest.raises(ValueError):
@@ -23,7 +22,7 @@ def test_literal_construction_invalid():
     ],
 )
 def test_columnref_construction(tableref):
-    plc.expressions.ColumnReference(1.0, tableref)
+    plc.expressions.ColumnReference(1, tableref)
 
 
 def test_columnnameref_construction():
@@ -48,3 +47,35 @@ def test_columnnameref_construction():
 )
 def test_astoperation_construction(kwargs):
     plc.expressions.Operation(**kwargs)
+
+
+def test_evaluation():
+    table_h = pa.table({"a": [1, 2, 3], "b": [4, 5, 6], "c": [7, 8, 9]})
+    lit = pa.scalar(42, type=pa.int64())
+    table = plc.interop.from_arrow(table_h)
+    # expr = abs(b * c - (a + 42))
+    expr = plc.expressions.Operation(
+        plc.expressions.ASTOperator.ABS,
+        plc.expressions.Operation(
+            plc.expressions.ASTOperator.SUB,
+            plc.expressions.Operation(
+                plc.expressions.ASTOperator.MUL,
+                plc.expressions.ColumnReference(1),
+                plc.expressions.ColumnReference(2),
+            ),
+            plc.expressions.Operation(
+                plc.expressions.ASTOperator.ADD,
+                plc.expressions.ColumnReference(0),
+                plc.expressions.Literal(plc.interop.from_arrow(lit)),
+            ),
+        ),
+    )
+
+    expect = pc.abs(
+        pc.subtract(
+            pc.multiply(table_h["b"], table_h["c"]), pc.add(table_h["a"], lit)
+        )
+    )
+    got = plc.transform.compute_column(table, expr)
+
+    assert_column_eq(expect, got)
diff --git a/python/pylibcudf/pylibcudf/transform.pxd b/python/pylibcudf/pylibcudf/transform.pxd
index b530f433c97..4fb623158f0 100644
--- a/python/pylibcudf/pylibcudf/transform.pxd
+++ b/python/pylibcudf/pylibcudf/transform.pxd
@@ -3,6 +3,7 @@ from libcpp cimport bool
 from pylibcudf.libcudf.types cimport bitmask_type, data_type
 
 from .column cimport Column
+from .expressions cimport Expression
 from .gpumemoryview cimport gpumemoryview
 from .table cimport Table
 from .types cimport DataType
@@ -10,6 +11,8 @@ from .types cimport DataType
 
 cpdef tuple[gpumemoryview, int] nans_to_nulls(Column input)
 
+cpdef Column compute_column(Table input, Expression expr)
+
 cpdef tuple[gpumemoryview, int] bools_to_mask(Column input)
 
 cpdef Column mask_to_bools(Py_ssize_t bitmask, int begin_bit, int end_bit)
diff --git a/python/pylibcudf/pylibcudf/transform.pyx b/python/pylibcudf/pylibcudf/transform.pyx
index bce9702752a..e8d95cadb0c 100644
--- a/python/pylibcudf/pylibcudf/transform.pyx
+++ b/python/pylibcudf/pylibcudf/transform.pyx
@@ -1,5 +1,6 @@
 # Copyright (c) 2024, NVIDIA CORPORATION.
 
+from cython.operator cimport dereference
 from libcpp.memory cimport unique_ptr
 from libcpp.string cimport string
 from libcpp.utility cimport move, pair
@@ -43,6 +44,32 @@ cpdef tuple[gpumemoryview, int] nans_to_nulls(Column input):
     )
 
 
+cpdef Column compute_column(Table input, Expression expr):
+    """Create a column by evaluating an expression on a table.
+
+    For details see :cpp:func:`compute_column`.
+
+    Parameters
+    ----------
+    input : Table
+        Table used for expression evaluation
+    expr : Expression
+        Expression to evaluate
+
+    Returns
+    -------
+    Column of the evaluated expression
+    """
+    cdef unique_ptr[column] c_result
+
+    with nogil:
+        c_result = cpp_transform.compute_column(
+            input.view(), dereference(expr.c_obj.get())
+        )
+
+    return Column.from_libcudf(move(c_result))
+
+
 cpdef tuple[gpumemoryview, int] bools_to_mask(Column input):
     """Create a bitmask from a column of boolean elements
 

From 5a6d177b259bcda647a393bc1df63f06bb26b56f Mon Sep 17 00:00:00 2001
From: "Richard (Rick) Zamora" <rzamora217@gmail.com>
Date: Wed, 30 Oct 2024 14:49:50 -0500
Subject: [PATCH 13/41] Fix ``to_parquet`` append behavior with global metadata
 file (#17198)

Closes https://github.com/rapidsai/cudf/issues/17177

When appending to a parquet dataset with Dask cuDF, the original metadata must be converted from `pq.FileMetaData` to `bytes` before it can be passed down to `cudf.io.merge_parquet_filemetadata`.

Authors:
  - Richard (Rick) Zamora (https://github.com/rjzamora)

Approvers:
  - Mads R. B. Kristensen (https://github.com/madsbk)

URL: https://github.com/rapidsai/cudf/pull/17198
---
 python/dask_cudf/dask_cudf/io/parquet.py      |  6 ++++++
 .../dask_cudf/io/tests/test_parquet.py        | 20 +++++++++++++++++++
 2 files changed, 26 insertions(+)

diff --git a/python/dask_cudf/dask_cudf/io/parquet.py b/python/dask_cudf/dask_cudf/io/parquet.py
index a781b8242fe..39ac6474958 100644
--- a/python/dask_cudf/dask_cudf/io/parquet.py
+++ b/python/dask_cudf/dask_cudf/io/parquet.py
@@ -383,6 +383,12 @@ def write_metadata(parts, fmd, fs, path, append=False, **kwargs):
             metadata_path = fs.sep.join([path, "_metadata"])
             _meta = []
             if append and fmd is not None:
+                # Convert to bytes: <https://github.com/rapidsai/cudf/issues/17177>
+                if isinstance(fmd, pq.FileMetaData):
+                    with BytesIO() as myio:
+                        fmd.write_metadata_file(myio)
+                        myio.seek(0)
+                        fmd = np.frombuffer(myio.read(), dtype="uint8")
                 _meta = [fmd]
             _meta.extend([parts[i][0]["meta"] for i in range(len(parts))])
             _meta = (
diff --git a/python/dask_cudf/dask_cudf/io/tests/test_parquet.py b/python/dask_cudf/dask_cudf/io/tests/test_parquet.py
index ae5ca480e31..a29cf9a342a 100644
--- a/python/dask_cudf/dask_cudf/io/tests/test_parquet.py
+++ b/python/dask_cudf/dask_cudf/io/tests/test_parquet.py
@@ -644,3 +644,23 @@ def test_read_parquet_arrow_filesystem(tmpdir, min_part_size):
         dd.assert_eq(df, ddf, check_index=False)
         assert isinstance(ddf._meta, cudf.DataFrame)
         assert isinstance(ddf.compute(), cudf.DataFrame)
+
+
+@pytest.mark.parametrize("write_metadata_file", [True, False])
+def test_to_parquet_append(tmpdir, write_metadata_file):
+    df = cudf.DataFrame({"a": [1, 2, 3]})
+    ddf = dask_cudf.from_cudf(df, npartitions=1)
+    ddf.to_parquet(
+        tmpdir,
+        append=True,
+        write_metadata_file=write_metadata_file,
+        write_index=False,
+    )
+    ddf.to_parquet(
+        tmpdir,
+        append=True,
+        write_metadata_file=write_metadata_file,
+        write_index=False,
+    )
+    ddf2 = dask_cudf.read_parquet(tmpdir)
+    dd.assert_eq(cudf.concat([df, df]), ddf2)

From 3cf186c5c31b658f0cb7b5f180de0e72f533d413 Mon Sep 17 00:00:00 2001
From: Matthew Murray <41342305+Matt711@users.noreply.github.com>
Date: Wed, 30 Oct 2024 20:14:44 -0400
Subject: [PATCH 14/41] Add remaining datetime APIs to pylibcudf (#17143)

Part of #15162

Authors:
  - Matthew Murray (https://github.com/Matt711)

Approvers:
  - https://github.com/brandon-b-miller

URL: https://github.com/rapidsai/cudf/pull/17143
---
 python/pylibcudf/pylibcudf/datetime.pxd       |  51 ++-
 python/pylibcudf/pylibcudf/datetime.pyx       | 319 +++++++++++++++++-
 .../pylibcudf/pylibcudf/libcudf/datetime.pxd  |  22 +-
 .../pylibcudf/tests/test_datetime.py          | 152 +++++++++
 4 files changed, 519 insertions(+), 25 deletions(-)

diff --git a/python/pylibcudf/pylibcudf/datetime.pxd b/python/pylibcudf/pylibcudf/datetime.pxd
index 72ce680ba7a..335ef435f9b 100644
--- a/python/pylibcudf/pylibcudf/datetime.pxd
+++ b/python/pylibcudf/pylibcudf/datetime.pxd
@@ -1,15 +1,56 @@
 # Copyright (c) 2024, NVIDIA CORPORATION.
 
-from pylibcudf.libcudf.datetime cimport datetime_component
+from pylibcudf.column cimport Column
+from pylibcudf.libcudf.datetime cimport datetime_component, rounding_frequency
+from pylibcudf.scalar cimport Scalar
 
-from .column cimport Column
+ctypedef fused ColumnOrScalar:
+    Column
+    Scalar
 
+cpdef Column extract_millisecond_fraction(
+    Column input
+)
+
+cpdef Column extract_microsecond_fraction(
+    Column input
+)
 
-cpdef Column extract_year(
-    Column col
+cpdef Column extract_nanosecond_fraction(
+    Column input
 )
 
 cpdef Column extract_datetime_component(
-    Column col,
+    Column input,
     datetime_component component
 )
+
+cpdef Column ceil_datetimes(
+    Column input,
+    rounding_frequency freq
+)
+
+cpdef Column floor_datetimes(
+    Column input,
+    rounding_frequency freq
+)
+
+cpdef Column round_datetimes(
+    Column input,
+    rounding_frequency freq
+)
+
+cpdef Column add_calendrical_months(
+    Column input,
+    ColumnOrScalar months,
+)
+
+cpdef Column day_of_year(Column input)
+
+cpdef Column is_leap_year(Column input)
+
+cpdef Column last_day_of_month(Column input)
+
+cpdef Column extract_quarter(Column input)
+
+cpdef Column days_in_month(Column input)
diff --git a/python/pylibcudf/pylibcudf/datetime.pyx b/python/pylibcudf/pylibcudf/datetime.pyx
index ac4335cca56..9e5e709d81d 100644
--- a/python/pylibcudf/pylibcudf/datetime.pyx
+++ b/python/pylibcudf/pylibcudf/datetime.pyx
@@ -3,41 +3,106 @@ from libcpp.memory cimport unique_ptr
 from libcpp.utility cimport move
 from pylibcudf.libcudf.column.column cimport column
 from pylibcudf.libcudf.datetime cimport (
+    add_calendrical_months as cpp_add_calendrical_months,
+    ceil_datetimes as cpp_ceil_datetimes,
     datetime_component,
+    day_of_year as cpp_day_of_year,
+    days_in_month as cpp_days_in_month,
     extract_datetime_component as cpp_extract_datetime_component,
-    extract_year as cpp_extract_year,
+    extract_microsecond_fraction as cpp_extract_microsecond_fraction,
+    extract_millisecond_fraction as cpp_extract_millisecond_fraction,
+    extract_nanosecond_fraction as cpp_extract_nanosecond_fraction,
+    extract_quarter as cpp_extract_quarter,
+    floor_datetimes as cpp_floor_datetimes,
+    is_leap_year as cpp_is_leap_year,
+    last_day_of_month as cpp_last_day_of_month,
+    round_datetimes as cpp_round_datetimes,
+    rounding_frequency,
 )
 
 from pylibcudf.libcudf.datetime import \
     datetime_component as DatetimeComponent  # no-cython-lint
+from pylibcudf.libcudf.datetime import \
+    rounding_frequency as RoundingFrequency  # no-cython-lint
+
+from cython.operator cimport dereference
 
 from .column cimport Column
 
+cpdef Column extract_millisecond_fraction(
+    Column input
+):
+    """
+    Extract the millisecond fraction from a datetime column.
+
+    For details, see :cpp:func:`extract_millisecond_fraction`.
+
+    Parameters
+    ----------
+    input : Column
+        The column to extract the millisecond fraction from.
+
+    Returns
+    -------
+    Column
+        Column with the extracted millisecond fractions.
+    """
+    cdef unique_ptr[column] result
+
+    with nogil:
+        result = cpp_extract_millisecond_fraction(input.view())
+    return Column.from_libcudf(move(result))
+
+cpdef Column extract_microsecond_fraction(
+    Column input
+):
+    """
+    Extract the microsecond fraction from a datetime column.
+
+    For details, see :cpp:func:`extract_microsecond_fraction`.
+
+    Parameters
+    ----------
+    input : Column
+        The column to extract the microsecond fraction from.
+
+    Returns
+    -------
+    Column
+        Column with the extracted microsecond fractions.
+    """
+    cdef unique_ptr[column] result
+
+    with nogil:
+        result = cpp_extract_microsecond_fraction(input.view())
+    return Column.from_libcudf(move(result))
 
-cpdef Column extract_year(
-    Column values
+cpdef Column extract_nanosecond_fraction(
+    Column input
 ):
     """
-    Extract the year from a datetime column.
+    Extract the nanosecond fraction from a datetime column.
+
+    For details, see :cpp:func:`extract_nanosecond_fraction`.
 
     Parameters
     ----------
-    values : Column
-        The column to extract the year from.
+    input : Column
+        The column to extract the nanosecond fraction from.
 
     Returns
     -------
     Column
-        Column with the extracted years.
+        Column with the extracted nanosecond fractions.
     """
     cdef unique_ptr[column] result
 
     with nogil:
-        result = cpp_extract_year(values.view())
+        result = cpp_extract_nanosecond_fraction(input.view())
     return Column.from_libcudf(move(result))
 
 cpdef Column extract_datetime_component(
-    Column values,
+    Column input,
     datetime_component component
 ):
     """
@@ -47,7 +112,7 @@ cpdef Column extract_datetime_component(
 
     Parameters
     ----------
-    values : Column
+    input : Column
         The column to extract the component from.
     component : DatetimeComponent
         The datetime component to extract.
@@ -60,5 +125,237 @@ cpdef Column extract_datetime_component(
     cdef unique_ptr[column] result
 
     with nogil:
-        result = cpp_extract_datetime_component(values.view(), component)
+        result = cpp_extract_datetime_component(input.view(), component)
+    return Column.from_libcudf(move(result))
+
+cpdef Column ceil_datetimes(
+    Column input,
+    rounding_frequency freq
+):
+    """
+    Round datetimes up to the nearest multiple of the given frequency.
+
+    For details, see :cpp:func:`ceil_datetimes`.
+
+    Parameters
+    ----------
+    input : Column
+        The column of input datetime values.
+    freq : RoundingFrequency
+        The frequency to round up to.
+
+    Returns
+    -------
+    Column
+        Column of the same datetime resolution as the input column.
+    """
+    cdef unique_ptr[column] result
+
+    with nogil:
+        result = cpp_ceil_datetimes(input.view(), freq)
+    return Column.from_libcudf(move(result))
+
+cpdef Column floor_datetimes(
+    Column input,
+    rounding_frequency freq
+):
+    """
+    Round datetimes down to the nearest multiple of the given frequency.
+
+    For details, see :cpp:func:`floor_datetimes`.
+
+    Parameters
+    ----------
+    input : Column
+        The column of input datetime values.
+    freq : RoundingFrequency
+        The frequency to round down to.
+
+    Returns
+    -------
+    Column
+        Column of the same datetime resolution as the input column.
+    """
+    cdef unique_ptr[column] result
+
+    with nogil:
+        result = cpp_floor_datetimes(input.view(), freq)
+    return Column.from_libcudf(move(result))
+
+cpdef Column round_datetimes(
+    Column input,
+    rounding_frequency freq
+):
+    """
+    Round datetimes to the nearest multiple of the given frequency.
+
+    For details, see :cpp:func:`round_datetimes`.
+
+    Parameters
+    ----------
+    input : Column
+        The column of input datetime values.
+    freq : RoundingFrequency
+        The frequency to round to.
+
+    Returns
+    -------
+    Column
+        Column of the same datetime resolution as the input column.
+    """
+    cdef unique_ptr[column] result
+
+    with nogil:
+        result = cpp_round_datetimes(input.view(), freq)
+    return Column.from_libcudf(move(result))
+
+cpdef Column add_calendrical_months(
+    Column input,
+    ColumnOrScalar months,
+):
+    """
+    Adds or subtracts a number of months from the datetime
+    type and returns a timestamp column that is of the same
+    type as the input timestamps column.
+
+    For details, see :cpp:func:`add_calendrical_months`.
+
+    Parameters
+    ----------
+    input : Column
+        The column of input timestamp values.
+    months : ColumnOrScalar
+        The number of months to add.
+
+    Returns
+    -------
+    Column
+        Column of computed timestamps.
+    """
+    if not isinstance(months, (Column, Scalar)):
+        raise TypeError("Must pass a Column or Scalar")
+
+    cdef unique_ptr[column] result
+
+    with nogil:
+        result = cpp_add_calendrical_months(
+            input.view(),
+            months.view() if ColumnOrScalar is Column else
+            dereference(months.get())
+        )
+    return Column.from_libcudf(move(result))
+
+cpdef Column day_of_year(Column input):
+    """
+    Computes the day number since the start of
+    the year from the datetime. The value is in the
+    range [1, 365], or [1, 366] in a leap year.
+
+    For details, see :cpp:func:`day_of_year`.
+
+    Parameters
+    ----------
+    input : Column
+        The column of input datetime values.
+
+    Returns
+    -------
+    Column
+        Column of day numbers.
+    """
+    cdef unique_ptr[column] result
+
+    with nogil:
+        result = cpp_day_of_year(input.view())
+    return Column.from_libcudf(move(result))
+
+cpdef Column is_leap_year(Column input):
+    """
+    Check if the year of the given date is a leap year.
+
+    For details, see :cpp:func:`is_leap_year`.
+
+    Parameters
+    ----------
+    input : Column
+        The column of input datetime values.
+
+    Returns
+    -------
+    Column
+        Column of bools indicating whether the given year
+        is a leap year.
+    """
+    cdef unique_ptr[column] result
+
+    with nogil:
+        result = cpp_is_leap_year(input.view())
+    return Column.from_libcudf(move(result))
+
+cpdef Column last_day_of_month(Column input):
+    """
+    Computes the last day of the month.
+
+    For details, see :cpp:func:`last_day_of_month`.
+
+    Parameters
+    ----------
+    input : Column
+        The column of input datetime values.
+
+    Returns
+    -------
+    Column
+        Column of ``TIMESTAMP_DAYS`` representing the last day
+        of the month.
+    """
+    cdef unique_ptr[column] result
+
+    with nogil:
+        result = cpp_last_day_of_month(input.view())
+    return Column.from_libcudf(move(result))
+
+cpdef Column extract_quarter(Column input):
+    """
+    Returns the quarter (i.e. a value from {1, 2, 3, 4})
+    that the date is in.
+
+    For details, see :cpp:func:`extract_quarter`.
+
+    Parameters
+    ----------
+    input : Column
+        The column of input datetime values.
+
+    Returns
+    -------
+    Column
+        Column indicating which quarter the date is in.
+    """
+    cdef unique_ptr[column] result
+
+    with nogil:
+        result = cpp_extract_quarter(input.view())
+    return Column.from_libcudf(move(result))
+
+cpdef Column days_in_month(Column input):
+    """
+    Extract the number of days in the month.
+
+    For details, see :cpp:func:`days_in_month`.
+
+    Parameters
+    ----------
+    input : Column
+        The column of input datetime values.
+
+    Returns
+    -------
+    Column
+        Column of the number of days in the given month.
+    """
+    cdef unique_ptr[column] result
+
+    with nogil:
+        result = cpp_days_in_month(input.view())
     return Column.from_libcudf(move(result))
diff --git a/python/pylibcudf/pylibcudf/libcudf/datetime.pxd b/python/pylibcudf/pylibcudf/libcudf/datetime.pxd
index 73cdfb96af5..8bbc120cff8 100644
--- a/python/pylibcudf/pylibcudf/libcudf/datetime.pxd
+++ b/python/pylibcudf/pylibcudf/libcudf/datetime.pxd
@@ -1,6 +1,6 @@
 # Copyright (c) 2020-2024, NVIDIA CORPORATION.
 
-from libc.stdint cimport uint8_t
+from libc.stdint cimport int32_t, uint8_t
 from libcpp.memory cimport unique_ptr
 from pylibcudf.libcudf.column.column cimport column
 from pylibcudf.libcudf.column.column_view cimport column_view
@@ -41,14 +41,14 @@ cdef extern from "cudf/datetime.hpp" namespace "cudf::datetime" nogil:
         datetime_component component
     ) except +
 
-    ctypedef enum rounding_frequency "cudf::datetime::rounding_frequency":
-        DAY "cudf::datetime::rounding_frequency::DAY"
-        HOUR "cudf::datetime::rounding_frequency::HOUR"
-        MINUTE "cudf::datetime::rounding_frequency::MINUTE"
-        SECOND "cudf::datetime::rounding_frequency::SECOND"
-        MILLISECOND "cudf::datetime::rounding_frequency::MILLISECOND"
-        MICROSECOND "cudf::datetime::rounding_frequency::MICROSECOND"
-        NANOSECOND "cudf::datetime::rounding_frequency::NANOSECOND"
+    cpdef enum class rounding_frequency(int32_t):
+        DAY
+        HOUR
+        MINUTE
+        SECOND
+        MILLISECOND
+        MICROSECOND
+        NANOSECOND
 
     cdef unique_ptr[column] ceil_datetimes(
         const column_view& column, rounding_frequency freq
@@ -64,6 +64,10 @@ cdef extern from "cudf/datetime.hpp" namespace "cudf::datetime" nogil:
         const column_view& timestamps,
         const column_view& months
     ) except +
+    cdef unique_ptr[column] add_calendrical_months(
+        const column_view& timestamps,
+        const scalar& months
+    ) except +
     cdef unique_ptr[column] day_of_year(const column_view& column) except +
     cdef unique_ptr[column] is_leap_year(const column_view& column) except +
     cdef unique_ptr[column] last_day_of_month(
diff --git a/python/pylibcudf/pylibcudf/tests/test_datetime.py b/python/pylibcudf/pylibcudf/tests/test_datetime.py
index a80ab8d9f65..f5f24ef28e2 100644
--- a/python/pylibcudf/pylibcudf/tests/test_datetime.py
+++ b/python/pylibcudf/pylibcudf/tests/test_datetime.py
@@ -1,5 +1,6 @@
 # Copyright (c) 2024, NVIDIA CORPORATION.
 
+import calendar
 import datetime
 
 import pyarrow as pa
@@ -46,6 +47,21 @@ def component(request):
     return request.param
 
 
+@pytest.fixture(
+    params=[
+        ("day", plc.datetime.RoundingFrequency.DAY),
+        ("hour", plc.datetime.RoundingFrequency.HOUR),
+        ("minute", plc.datetime.RoundingFrequency.MINUTE),
+        ("second", plc.datetime.RoundingFrequency.SECOND),
+        ("millisecond", plc.datetime.RoundingFrequency.MILLISECOND),
+        ("microsecond", plc.datetime.RoundingFrequency.MICROSECOND),
+        ("nanosecond", plc.datetime.RoundingFrequency.NANOSECOND),
+    ]
+)
+def rounding_frequency(request):
+    return request.param
+
+
 def test_extract_datetime_component(datetime_column, component):
     attr, component = component
     kwargs = {}
@@ -59,3 +75,139 @@ def test_extract_datetime_component(datetime_column, component):
     ).cast(pa.int16())
 
     assert_column_eq(expect, got)
+
+
+@pytest.mark.parametrize(
+    "datetime_func",
+    [
+        "extract_millisecond_fraction",
+        "extract_microsecond_fraction",
+        "extract_nanosecond_fraction",
+    ],
+)
+def test_datetime_extracting_functions(datetime_column, datetime_func):
+    pa_col = plc.interop.to_arrow(datetime_column)
+    got = getattr(plc.datetime, datetime_func)(datetime_column)
+    kwargs = {}
+    attr = datetime_func.split("_")[1]
+    if attr == "weekday":
+        kwargs = {"count_from_zero": False}
+        attr = "day_of_week"
+    expect = getattr(pc, attr)(pa_col, **kwargs).cast(pa.int16())
+    assert_column_eq(expect, got)
+
+
+@pytest.mark.parametrize(
+    "op",
+    [
+        ("ceil_temporal", "ceil_datetimes"),
+        ("floor_temporal", "floor_datetimes"),
+        ("round_temporal", "round_datetimes"),
+    ],
+)
+def test_rounding_operations(datetime_column, op, rounding_frequency):
+    got = getattr(plc.datetime, op[1])(datetime_column, rounding_frequency[1])
+    pa_col = plc.interop.to_arrow(datetime_column)
+    pa_got = plc.interop.to_arrow(got)
+    expect = getattr(pc, op[0])(
+        pa_col,
+        unit=rounding_frequency[0],
+    ).cast(pa_got.type)
+    assert_column_eq(expect, got)
+
+
+@pytest.mark.parametrize(
+    "months",
+    [
+        pa.scalar(-3, pa.int32()),
+        pa.scalar(1, pa.int16()),
+        pa.array([1, -3, 2, 4, -1, 5], pa.int32()),
+    ],
+)
+def test_calendrical_months(datetime_column, months):
+    def add_calendrical_months(timestamps, months):
+        result = []
+        if isinstance(months, pa.Array):
+            months_list = months.to_pylist()
+        else:
+            months_list = [months.as_py()] * len(timestamps)
+        for i, dt in enumerate(timestamps):
+            if dt.as_py() is not None:
+                year, month = dt.as_py().year, dt.as_py().month
+                new_month = month + months_list[i]
+                new_year = year + (new_month - 1) // 12
+                result.append(
+                    dt.as_py().replace(
+                        year=new_year, month=(new_month - 1) % 12 + 1
+                    )
+                )
+            else:
+                result.append(None)
+        return pa.array(result)
+
+    pa_col = plc.interop.to_arrow(datetime_column)
+    got = plc.datetime.add_calendrical_months(
+        datetime_column, plc.interop.from_arrow(months)
+    )
+    pa_got = plc.interop.to_arrow(got)
+    expect = add_calendrical_months(pa_col, months).cast(pa_got.type)
+    assert_column_eq(expect, got)
+
+
+def test_day_of_year(datetime_column):
+    got = plc.datetime.day_of_year(datetime_column)
+    pa_got = plc.interop.to_arrow(got)
+    pa_col = plc.interop.to_arrow(datetime_column)
+    expect = pa.array(
+        [
+            d.as_py().timetuple().tm_yday if d.as_py() is not None else None
+            for d in pa_col
+        ],
+        type=pa_got.type,
+    )
+    assert_column_eq(expect, got)
+
+
+def test_is_leap_year(datetime_column):
+    got = plc.datetime.is_leap_year(datetime_column)
+    pa_col = plc.interop.to_arrow(datetime_column)
+    expect = pc.is_leap_year(pa_col)
+    assert_column_eq(expect, got)
+
+
+def test_last_day_of_month(datetime_column):
+    def last_day_of_month(dates):
+        return [
+            d.replace(day=calendar.monthrange(d.year, d.month)[1])
+            if d is not None
+            else d
+            for d in dates.to_pylist()
+        ]
+
+    got = plc.datetime.last_day_of_month(datetime_column)
+    pa_got = plc.interop.to_arrow(got)
+    pa_col = plc.interop.to_arrow(datetime_column)
+    expect = pa.array(last_day_of_month(pa_col), type=pa_got.type)
+    assert_column_eq(expect, got)
+
+
+def test_extract_quarter(datetime_column):
+    got = plc.datetime.extract_quarter(datetime_column)
+    pa_col = plc.interop.to_arrow(datetime_column)
+    pa_got = plc.interop.to_arrow(got)
+    expect = pc.quarter(pa_col).cast(pa_got.type)
+    assert_column_eq(expect, got)
+
+
+def test_days_in_month(datetime_column):
+    def days_in_month(dates):
+        return [
+            calendar.monthrange(d.year, d.month)[1] if d is not None else None
+            for d in dates.to_pylist()
+        ]
+
+    got = plc.datetime.days_in_month(datetime_column)
+    pa_col = plc.interop.to_arrow(datetime_column)
+    pa_got = plc.interop.to_arrow(got)
+    expect = pa.array(days_in_month(pa_col), type=pa_got.type)
+    assert_column_eq(expect, got)

From 0e294b1580612d4106afb5044b33ed33dd6fe0ec Mon Sep 17 00:00:00 2001
From: Yunsong Wang <yunsongw@nvidia.com>
Date: Wed, 30 Oct 2024 17:32:55 -0700
Subject: [PATCH 15/41] Add compute_shared_memory_aggs used by shared memory
 groupby (#17162)

This work is part of splitting the original bulk shared memory groupby PR https://github.com/rapidsai/cudf/pull/16619.

This PR introduces the `compute_shared_memory_aggs` API, which is utilized by the shared memory groupby. The shared memory groupby process consists of two main steps. The first step was introduced in #17147, and this PR implements the second step, where the actual aggregations are performed based on the offsets from the first step. Each thread block is designed to handle up to 128 unique keys. If this limit is exceeded, there won't be enough space to store temporary aggregation results in shared memory, so a flag is set to indicate that follow-up global memory aggregations are needed to complete the remaining aggregation requests.

Authors:
  - Yunsong Wang (https://github.com/PointKernel)

Approvers:
  - David Wendt (https://github.com/davidwendt)
  - Nghia Truong (https://github.com/ttnghia)
  - Bradley Dice (https://github.com/bdice)

URL: https://github.com/rapidsai/cudf/pull/17162
---
 cpp/CMakeLists.txt                            |   1 +
 .../hash/compute_shared_memory_aggs.cu        | 323 ++++++++++++++++++
 .../hash/compute_shared_memory_aggs.hpp       |  41 +++
 cpp/src/groupby/hash/helpers.cuh              |   9 -
 cpp/src/groupby/hash/single_pass_functors.cuh |  81 +++++
 5 files changed, 446 insertions(+), 9 deletions(-)
 create mode 100644 cpp/src/groupby/hash/compute_shared_memory_aggs.cu
 create mode 100644 cpp/src/groupby/hash/compute_shared_memory_aggs.hpp

diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
index 60132f651d2..bfa4bf80724 100644
--- a/cpp/CMakeLists.txt
+++ b/cpp/CMakeLists.txt
@@ -371,6 +371,7 @@ add_library(
   src/groupby/hash/compute_groupby.cu
   src/groupby/hash/compute_mapping_indices.cu
   src/groupby/hash/compute_mapping_indices_null.cu
+  src/groupby/hash/compute_shared_memory_aggs.cu
   src/groupby/hash/compute_single_pass_aggs.cu
   src/groupby/hash/create_sparse_results_table.cu
   src/groupby/hash/flatten_single_pass_aggs.cpp
diff --git a/cpp/src/groupby/hash/compute_shared_memory_aggs.cu b/cpp/src/groupby/hash/compute_shared_memory_aggs.cu
new file mode 100644
index 00000000000..12c02a1865e
--- /dev/null
+++ b/cpp/src/groupby/hash/compute_shared_memory_aggs.cu
@@ -0,0 +1,323 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "compute_shared_memory_aggs.hpp"
+#include "global_memory_aggregator.cuh"
+#include "helpers.cuh"
+#include "shared_memory_aggregator.cuh"
+#include "single_pass_functors.cuh"
+
+#include <cudf/aggregation.hpp>
+#include <cudf/detail/utilities/cuda.cuh>
+#include <cudf/detail/utilities/cuda.hpp>
+#include <cudf/detail/utilities/integer_utils.hpp>
+#include <cudf/table/table_device_view.cuh>
+#include <cudf/types.hpp>
+#include <cudf/utilities/bit.hpp>
+
+#include <rmm/cuda_stream_view.hpp>
+
+#include <cooperative_groups.h>
+#include <cuda/std/cstddef>
+
+namespace cudf::groupby::detail::hash {
+namespace {
+/// Functor used by type dispatcher returning the size of the underlying C++ type
+struct size_of_functor {
+  template <typename T>
+  __device__ constexpr cudf::size_type operator()()
+  {
+    return sizeof(T);
+  }
+};
+
+/// Shared memory data alignment
+CUDF_HOST_DEVICE cudf::size_type constexpr ALIGNMENT = 8;
+
+// Prepares shared memory data required by each output column, exits if
+// there is not enough memory space to perform the shared memory aggregation
+// for the current output column
+__device__ void calculate_columns_to_aggregate(cudf::size_type& col_start,
+                                               cudf::size_type& col_end,
+                                               cudf::mutable_table_device_view output_values,
+                                               cudf::size_type output_size,
+                                               cudf::size_type* shmem_agg_res_offsets,
+                                               cudf::size_type* shmem_agg_mask_offsets,
+                                               cudf::size_type cardinality,
+                                               cudf::size_type total_agg_size)
+{
+  col_start                       = col_end;
+  cudf::size_type bytes_allocated = 0;
+
+  auto const valid_col_size =
+    cudf::util::round_up_safe(static_cast<cudf::size_type>(sizeof(bool) * cardinality), ALIGNMENT);
+
+  while (bytes_allocated < total_agg_size && col_end < output_size) {
+    auto const col_idx = col_end;
+    auto const next_col_size =
+      cudf::util::round_up_safe(cudf::type_dispatcher<cudf::dispatch_storage_type>(
+                                  output_values.column(col_idx).type(), size_of_functor{}) *
+                                  cardinality,
+                                ALIGNMENT);
+    auto const next_col_total_size = next_col_size + valid_col_size;
+
+    if (bytes_allocated + next_col_total_size > total_agg_size) {
+      CUDF_UNREACHABLE("Not enough memory for shared memory aggregations");
+    }
+
+    shmem_agg_res_offsets[col_end]  = bytes_allocated;
+    shmem_agg_mask_offsets[col_end] = bytes_allocated + next_col_size;
+
+    bytes_allocated += next_col_total_size;
+    ++col_end;
+  }
+}
+
+// Each block initializes its own shared memory aggregation results
+__device__ void initialize_shmem_aggregations(cooperative_groups::thread_block const& block,
+                                              cudf::size_type col_start,
+                                              cudf::size_type col_end,
+                                              cudf::mutable_table_device_view output_values,
+                                              cuda::std::byte* shmem_agg_storage,
+                                              cudf::size_type* shmem_agg_res_offsets,
+                                              cudf::size_type* shmem_agg_mask_offsets,
+                                              cudf::size_type cardinality,
+                                              cudf::aggregation::Kind const* d_agg_kinds)
+{
+  for (auto col_idx = col_start; col_idx < col_end; col_idx++) {
+    for (auto idx = block.thread_rank(); idx < cardinality; idx += block.num_threads()) {
+      auto target =
+        reinterpret_cast<cuda::std::byte*>(shmem_agg_storage + shmem_agg_res_offsets[col_idx]);
+      auto target_mask =
+        reinterpret_cast<bool*>(shmem_agg_storage + shmem_agg_mask_offsets[col_idx]);
+      cudf::detail::dispatch_type_and_aggregation(output_values.column(col_idx).type(),
+                                                  d_agg_kinds[col_idx],
+                                                  initialize_shmem{},
+                                                  target,
+                                                  target_mask,
+                                                  idx);
+    }
+  }
+  block.sync();
+}
+
+__device__ void compute_pre_aggregrations(cudf::size_type col_start,
+                                          cudf::size_type col_end,
+                                          bitmask_type const* row_bitmask,
+                                          bool skip_rows_with_nulls,
+                                          cudf::table_device_view source,
+                                          cudf::size_type num_input_rows,
+                                          cudf::size_type* local_mapping_index,
+                                          cuda::std::byte* shmem_agg_storage,
+                                          cudf::size_type* shmem_agg_res_offsets,
+                                          cudf::size_type* shmem_agg_mask_offsets,
+                                          cudf::aggregation::Kind const* d_agg_kinds)
+{
+  // Aggregates global memory sources to shared memory targets
+  for (auto source_idx = cudf::detail::grid_1d::global_thread_id(); source_idx < num_input_rows;
+       source_idx += cudf::detail::grid_1d::grid_stride()) {
+    if (not skip_rows_with_nulls or cudf::bit_is_set(row_bitmask, source_idx)) {
+      auto const target_idx = local_mapping_index[source_idx];
+      for (auto col_idx = col_start; col_idx < col_end; col_idx++) {
+        auto const source_col = source.column(col_idx);
+
+        cuda::std::byte* target =
+          reinterpret_cast<cuda::std::byte*>(shmem_agg_storage + shmem_agg_res_offsets[col_idx]);
+        bool* target_mask =
+          reinterpret_cast<bool*>(shmem_agg_storage + shmem_agg_mask_offsets[col_idx]);
+
+        cudf::detail::dispatch_type_and_aggregation(source_col.type(),
+                                                    d_agg_kinds[col_idx],
+                                                    shmem_element_aggregator{},
+                                                    target,
+                                                    target_mask,
+                                                    target_idx,
+                                                    source_col,
+                                                    source_idx);
+      }
+    }
+  }
+}
+
+__device__ void compute_final_aggregations(cooperative_groups::thread_block const& block,
+                                           cudf::size_type col_start,
+                                           cudf::size_type col_end,
+                                           cudf::table_device_view input_values,
+                                           cudf::mutable_table_device_view target,
+                                           cudf::size_type cardinality,
+                                           cudf::size_type* global_mapping_index,
+                                           cuda::std::byte* shmem_agg_storage,
+                                           cudf::size_type* agg_res_offsets,
+                                           cudf::size_type* agg_mask_offsets,
+                                           cudf::aggregation::Kind const* d_agg_kinds)
+{
+  // Aggregates shared memory sources to global memory targets
+  for (auto idx = block.thread_rank(); idx < cardinality; idx += block.num_threads()) {
+    auto const target_idx =
+      global_mapping_index[block.group_index().x * GROUPBY_SHM_MAX_ELEMENTS + idx];
+    for (auto col_idx = col_start; col_idx < col_end; col_idx++) {
+      auto target_col = target.column(col_idx);
+
+      cuda::std::byte* source =
+        reinterpret_cast<cuda::std::byte*>(shmem_agg_storage + agg_res_offsets[col_idx]);
+      bool* source_mask = reinterpret_cast<bool*>(shmem_agg_storage + agg_mask_offsets[col_idx]);
+
+      cudf::detail::dispatch_type_and_aggregation(input_values.column(col_idx).type(),
+                                                  d_agg_kinds[col_idx],
+                                                  gmem_element_aggregator{},
+                                                  target_col,
+                                                  target_idx,
+                                                  input_values.column(col_idx),
+                                                  source,
+                                                  source_mask,
+                                                  idx);
+    }
+  }
+  block.sync();
+}
+
+/* Takes the local_mapping_index and global_mapping_index to compute
+ * pre (shared) and final (global) aggregates*/
+CUDF_KERNEL void single_pass_shmem_aggs_kernel(cudf::size_type num_rows,
+                                               bitmask_type const* row_bitmask,
+                                               bool skip_rows_with_nulls,
+                                               cudf::size_type* local_mapping_index,
+                                               cudf::size_type* global_mapping_index,
+                                               cudf::size_type* block_cardinality,
+                                               cudf::table_device_view input_values,
+                                               cudf::mutable_table_device_view output_values,
+                                               cudf::aggregation::Kind const* d_agg_kinds,
+                                               cudf::size_type total_agg_size,
+                                               cudf::size_type offsets_size)
+{
+  auto const block       = cooperative_groups::this_thread_block();
+  auto const cardinality = block_cardinality[block.group_index().x];
+  if (cardinality >= GROUPBY_CARDINALITY_THRESHOLD) { return; }
+
+  auto const num_cols = output_values.num_columns();
+
+  __shared__ cudf::size_type col_start;
+  __shared__ cudf::size_type col_end;
+  extern __shared__ cuda::std::byte shmem_agg_storage[];
+
+  cudf::size_type* shmem_agg_res_offsets =
+    reinterpret_cast<cudf::size_type*>(shmem_agg_storage + total_agg_size);
+  cudf::size_type* shmem_agg_mask_offsets =
+    reinterpret_cast<cudf::size_type*>(shmem_agg_storage + total_agg_size + offsets_size);
+
+  if (block.thread_rank() == 0) {
+    col_start = 0;
+    col_end   = 0;
+  }
+  block.sync();
+
+  while (col_end < num_cols) {
+    if (block.thread_rank() == 0) {
+      calculate_columns_to_aggregate(col_start,
+                                     col_end,
+                                     output_values,
+                                     num_cols,
+                                     shmem_agg_res_offsets,
+                                     shmem_agg_mask_offsets,
+                                     cardinality,
+                                     total_agg_size);
+    }
+    block.sync();
+
+    initialize_shmem_aggregations(block,
+                                  col_start,
+                                  col_end,
+                                  output_values,
+                                  shmem_agg_storage,
+                                  shmem_agg_res_offsets,
+                                  shmem_agg_mask_offsets,
+                                  cardinality,
+                                  d_agg_kinds);
+
+    compute_pre_aggregrations(col_start,
+                              col_end,
+                              row_bitmask,
+                              skip_rows_with_nulls,
+                              input_values,
+                              num_rows,
+                              local_mapping_index,
+                              shmem_agg_storage,
+                              shmem_agg_res_offsets,
+                              shmem_agg_mask_offsets,
+                              d_agg_kinds);
+    block.sync();
+
+    compute_final_aggregations(block,
+                               col_start,
+                               col_end,
+                               input_values,
+                               output_values,
+                               cardinality,
+                               global_mapping_index,
+                               shmem_agg_storage,
+                               shmem_agg_res_offsets,
+                               shmem_agg_mask_offsets,
+                               d_agg_kinds);
+  }
+}
+}  // namespace
+
+std::size_t available_shared_memory_size(cudf::size_type grid_size)
+{
+  auto const active_blocks_per_sm =
+    cudf::util::div_rounding_up_safe(grid_size, cudf::detail::num_multiprocessors());
+
+  size_t dynamic_shmem_size = 0;
+  CUDF_CUDA_TRY(cudaOccupancyAvailableDynamicSMemPerBlock(
+    &dynamic_shmem_size, single_pass_shmem_aggs_kernel, active_blocks_per_sm, GROUPBY_BLOCK_SIZE));
+  return cudf::util::round_down_safe(static_cast<cudf::size_type>(0.5 * dynamic_shmem_size),
+                                     ALIGNMENT);
+}
+
+void compute_shared_memory_aggs(cudf::size_type grid_size,
+                                std::size_t available_shmem_size,
+                                cudf::size_type num_input_rows,
+                                bitmask_type const* row_bitmask,
+                                bool skip_rows_with_nulls,
+                                cudf::size_type* local_mapping_index,
+                                cudf::size_type* global_mapping_index,
+                                cudf::size_type* block_cardinality,
+                                cudf::table_device_view input_values,
+                                cudf::mutable_table_device_view output_values,
+                                cudf::aggregation::Kind const* d_agg_kinds,
+                                rmm::cuda_stream_view stream)
+{
+  // For each aggregation, need one offset determining where the aggregation is
+  // performed, another indicating the validity of the aggregation
+  auto const shmem_offsets_size = output_values.num_columns() * sizeof(cudf::size_type);
+  // The rest of shmem is utilized for the actual arrays in shmem
+  CUDF_EXPECTS(available_shmem_size > shmem_offsets_size * 2,
+               "No enough space for shared memory aggregations");
+  auto const shmem_agg_size = available_shmem_size - shmem_offsets_size * 2;
+  single_pass_shmem_aggs_kernel<<<grid_size, GROUPBY_BLOCK_SIZE, available_shmem_size, stream>>>(
+    num_input_rows,
+    row_bitmask,
+    skip_rows_with_nulls,
+    local_mapping_index,
+    global_mapping_index,
+    block_cardinality,
+    input_values,
+    output_values,
+    d_agg_kinds,
+    shmem_agg_size,
+    shmem_offsets_size);
+}
+}  // namespace cudf::groupby::detail::hash
diff --git a/cpp/src/groupby/hash/compute_shared_memory_aggs.hpp b/cpp/src/groupby/hash/compute_shared_memory_aggs.hpp
new file mode 100644
index 00000000000..653821fd53b
--- /dev/null
+++ b/cpp/src/groupby/hash/compute_shared_memory_aggs.hpp
@@ -0,0 +1,41 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#pragma once
+
+#include <cudf/aggregation.hpp>
+#include <cudf/table/table_device_view.cuh>
+#include <cudf/types.hpp>
+
+#include <rmm/cuda_stream_view.hpp>
+
+namespace cudf::groupby::detail::hash {
+
+std::size_t available_shared_memory_size(cudf::size_type grid_size);
+
+void compute_shared_memory_aggs(cudf::size_type grid_size,
+                                std::size_t available_shmem_size,
+                                cudf::size_type num_input_rows,
+                                bitmask_type const* row_bitmask,
+                                bool skip_rows_with_nulls,
+                                cudf::size_type* local_mapping_index,
+                                cudf::size_type* global_mapping_index,
+                                cudf::size_type* block_cardinality,
+                                cudf::table_device_view input_values,
+                                cudf::mutable_table_device_view output_values,
+                                cudf::aggregation::Kind const* d_agg_kinds,
+                                rmm::cuda_stream_view stream);
+
+}  // namespace cudf::groupby::detail::hash
diff --git a/cpp/src/groupby/hash/helpers.cuh b/cpp/src/groupby/hash/helpers.cuh
index 0d117ca35b3..00836567b4f 100644
--- a/cpp/src/groupby/hash/helpers.cuh
+++ b/cpp/src/groupby/hash/helpers.cuh
@@ -54,15 +54,6 @@ using shmem_extent_t =
 CUDF_HOST_DEVICE auto constexpr window_extent =
   cuco::make_window_extent<GROUPBY_CG_SIZE, GROUPBY_WINDOW_SIZE>(shmem_extent_t{});
 
-/**
- * @brief Returns the smallest multiple of 8 that is greater than or equal to the given integer.
- */
-CUDF_HOST_DEVICE constexpr std::size_t round_to_multiple_of_8(std::size_t num)
-{
-  std::size_t constexpr base = 8;
-  return cudf::util::div_rounding_up_safe(num, base) * base;
-}
-
 using row_hash_t =
   cudf::experimental::row::hash::device_row_hasher<cudf::hashing::detail::default_hash,
                                                    cudf::nullate::DYNAMIC>;
diff --git a/cpp/src/groupby/hash/single_pass_functors.cuh b/cpp/src/groupby/hash/single_pass_functors.cuh
index 73791b3aa71..28a5b578e00 100644
--- a/cpp/src/groupby/hash/single_pass_functors.cuh
+++ b/cpp/src/groupby/hash/single_pass_functors.cuh
@@ -23,6 +23,87 @@
 #include <cuco/static_set_ref.cuh>
 
 namespace cudf::groupby::detail::hash {
+// TODO: TO BE REMOVED issue tracked via #17171
+template <typename T, cudf::aggregation::Kind k>
+__device__ constexpr bool is_supported()
+{
+  return cudf::is_fixed_width<T>() and
+         ((k == cudf::aggregation::SUM) or (k == cudf::aggregation::SUM_OF_SQUARES) or
+          (k == cudf::aggregation::MIN) or (k == cudf::aggregation::MAX) or
+          (k == cudf::aggregation::COUNT_VALID) or (k == cudf::aggregation::COUNT_ALL) or
+          (k == cudf::aggregation::ARGMIN) or (k == cudf::aggregation::ARGMAX) or
+          (k == cudf::aggregation::STD) or (k == cudf::aggregation::VARIANCE) or
+          (k == cudf::aggregation::PRODUCT) and cudf::detail::is_product_supported<T>());
+}
+
+template <typename T, cudf::aggregation::Kind k>
+__device__ std::enable_if_t<not std::is_same_v<cudf::detail::corresponding_operator_t<k>, void>, T>
+identity_from_operator()
+{
+  using DeviceType = cudf::device_storage_type_t<T>;
+  return cudf::detail::corresponding_operator_t<k>::template identity<DeviceType>();
+}
+
+template <typename T, cudf::aggregation::Kind k, typename Enable = void>
+__device__ std::enable_if_t<std::is_same_v<cudf::detail::corresponding_operator_t<k>, void>, T>
+identity_from_operator()
+{
+  CUDF_UNREACHABLE("Unable to get identity/sentinel from device operator");
+}
+
+template <typename T, cudf::aggregation::Kind k>
+__device__ T get_identity()
+{
+  if ((k == cudf::aggregation::ARGMAX) or (k == cudf::aggregation::ARGMIN)) {
+    if constexpr (cudf::is_timestamp<T>()) {
+      return k == cudf::aggregation::ARGMAX
+               ? T{typename T::duration(cudf::detail::ARGMAX_SENTINEL)}
+               : T{typename T::duration(cudf::detail::ARGMIN_SENTINEL)};
+    } else {
+      using DeviceType = cudf::device_storage_type_t<T>;
+      return k == cudf::aggregation::ARGMAX
+               ? static_cast<DeviceType>(cudf::detail::ARGMAX_SENTINEL)
+               : static_cast<DeviceType>(cudf::detail::ARGMIN_SENTINEL);
+    }
+  }
+  return identity_from_operator<T, k>();
+}
+
+template <typename Target, cudf::aggregation::Kind k, typename Enable = void>
+struct initialize_target_element {
+  __device__ void operator()(cuda::std::byte* target,
+                             bool* target_mask,
+                             cudf::size_type idx) const noexcept
+  {
+    CUDF_UNREACHABLE("Invalid source type and aggregation combination.");
+  }
+};
+
+template <typename Target, cudf::aggregation::Kind k>
+struct initialize_target_element<Target, k, std::enable_if_t<is_supported<Target, k>()>> {
+  __device__ void operator()(cuda::std::byte* target,
+                             bool* target_mask,
+                             cudf::size_type idx) const noexcept
+  {
+    using DeviceType          = cudf::device_storage_type_t<Target>;
+    DeviceType* target_casted = reinterpret_cast<DeviceType*>(target);
+
+    target_casted[idx] = get_identity<DeviceType, k>();
+
+    target_mask[idx] = (k == cudf::aggregation::COUNT_ALL) or (k == cudf::aggregation::COUNT_VALID);
+  }
+};
+
+struct initialize_shmem {
+  template <typename Target, cudf::aggregation::Kind k>
+  __device__ void operator()(cuda::std::byte* target,
+                             bool* target_mask,
+                             cudf::size_type idx) const noexcept
+  {
+    initialize_target_element<Target, k>{}(target, target_mask, idx);
+  }
+};
+
 /**
  * @brief Computes single-pass aggregations and store results into a sparse `output_values` table,
  * and populate `set` with indices of unique keys

From 893d0fde7c17a7f8126baddd2f1cf34600f9420e Mon Sep 17 00:00:00 2001
From: Matthew Murray <41342305+Matt711@users.noreply.github.com>
Date: Wed, 30 Oct 2024 20:46:27 -0400
Subject: [PATCH 16/41] Migrate NVText Tokenizing APIs to pylibcudf (#17100)

Part of #15162

Authors:
  - Matthew Murray (https://github.com/Matt711)

Approvers:
  - Yunsong Wang (https://github.com/PointKernel)
  - Muhammad Haseeb (https://github.com/mhaseeb123)
  - Vyas Ramasubramani (https://github.com/vyasr)

URL: https://github.com/rapidsai/cudf/pull/17100
---
 cpp/include/nvtext/tokenize.hpp               |   2 +-
 .../api_docs/pylibcudf/nvtext/index.rst       |   1 +
 .../api_docs/pylibcudf/nvtext/tokenize.rst    |   6 +
 python/cudf/cudf/_lib/nvtext/tokenize.pyx     | 161 +++-------
 python/cudf/cudf/core/tokenize_vocabulary.py  |   4 +-
 .../pylibcudf/pylibcudf/nvtext/CMakeLists.txt |   2 +-
 .../pylibcudf/pylibcudf/nvtext/__init__.pxd   |   2 +
 python/pylibcudf/pylibcudf/nvtext/__init__.py |   2 +
 .../pylibcudf/pylibcudf/nvtext/tokenize.pxd   |  31 ++
 .../pylibcudf/pylibcudf/nvtext/tokenize.pyx   | 286 ++++++++++++++++++
 .../pylibcudf/tests/test_nvtext_tokenize.py   | 101 +++++++
 11 files changed, 476 insertions(+), 122 deletions(-)
 create mode 100644 docs/cudf/source/user_guide/api_docs/pylibcudf/nvtext/tokenize.rst
 create mode 100644 python/pylibcudf/pylibcudf/nvtext/tokenize.pxd
 create mode 100644 python/pylibcudf/pylibcudf/nvtext/tokenize.pyx
 create mode 100644 python/pylibcudf/pylibcudf/tests/test_nvtext_tokenize.py

diff --git a/cpp/include/nvtext/tokenize.hpp b/cpp/include/nvtext/tokenize.hpp
index e61601c6fea..e345587f88b 100644
--- a/cpp/include/nvtext/tokenize.hpp
+++ b/cpp/include/nvtext/tokenize.hpp
@@ -292,7 +292,7 @@ std::unique_ptr<tokenize_vocabulary> load_vocabulary(
  * @throw cudf::logic_error if `delimiter` is invalid
  *
  * @param input Strings column to tokenize
- * @param vocabulary Used to lookup tokens within
+ * @param vocabulary Used to lookup tokens within `input`
  * @param delimiter Used to identify tokens within `input`
  * @param default_id The token id to be used for tokens not found in the `vocabulary`;
  *                   Default is -1
diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/nvtext/index.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/nvtext/index.rst
index e0735a197fd..8c45942ed47 100644
--- a/docs/cudf/source/user_guide/api_docs/pylibcudf/nvtext/index.rst
+++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/nvtext/index.rst
@@ -12,3 +12,4 @@ nvtext
     normalize
     replace
     stemmer
+    tokenize
diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/nvtext/tokenize.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/nvtext/tokenize.rst
new file mode 100644
index 00000000000..85c5a27b09d
--- /dev/null
+++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/nvtext/tokenize.rst
@@ -0,0 +1,6 @@
+========
+tokenize
+========
+
+.. automodule:: pylibcudf.nvtext.tokenize
+   :members:
diff --git a/python/cudf/cudf/_lib/nvtext/tokenize.pyx b/python/cudf/cudf/_lib/nvtext/tokenize.pyx
index a7e63f1e9ae..f473c48e2f7 100644
--- a/python/cudf/cudf/_lib/nvtext/tokenize.pyx
+++ b/python/cudf/cudf/_lib/nvtext/tokenize.pyx
@@ -2,162 +2,85 @@
 
 from cudf.core.buffer import acquire_spill_lock
 
-from libcpp.memory cimport unique_ptr
-from libcpp.utility cimport move
-
-from pylibcudf.libcudf.column.column cimport column
-from pylibcudf.libcudf.column.column_view cimport column_view
-from pylibcudf.libcudf.nvtext.tokenize cimport (
-    character_tokenize as cpp_character_tokenize,
-    count_tokens as cpp_count_tokens,
-    detokenize as cpp_detokenize,
-    load_vocabulary as cpp_load_vocabulary,
-    tokenize as cpp_tokenize,
-    tokenize_vocabulary as cpp_tokenize_vocabulary,
-    tokenize_with_vocabulary as cpp_tokenize_with_vocabulary,
-)
-from pylibcudf.libcudf.scalar.scalar cimport string_scalar
 from pylibcudf.libcudf.types cimport size_type
 
+from pylibcudf.nvtext.tokenize import TokenizeVocabulary  # no-cython-lint
+
 from cudf._lib.column cimport Column
-from cudf._lib.scalar cimport DeviceScalar
+
+from pylibcudf import nvtext
 
 
 @acquire_spill_lock()
 def _tokenize_scalar(Column strings, object py_delimiter):
-
-    cdef DeviceScalar delimiter = py_delimiter.device_value
-
-    cdef column_view c_strings = strings.view()
-    cdef const string_scalar* c_delimiter = <const string_scalar*>delimiter\
-        .get_raw_ptr()
-    cdef unique_ptr[column] c_result
-
-    with nogil:
-        c_result = move(
-            cpp_tokenize(
-                c_strings,
-                c_delimiter[0],
-            )
+    return Column.from_pylibcudf(
+        nvtext.tokenize.tokenize_scalar(
+            strings.to_pylibcudf(mode="read"),
+            py_delimiter.device_value.c_value
         )
-
-    return Column.from_unique_ptr(move(c_result))
+    )
 
 
 @acquire_spill_lock()
 def _tokenize_column(Column strings, Column delimiters):
-    cdef column_view c_strings = strings.view()
-    cdef column_view c_delimiters = delimiters.view()
-    cdef unique_ptr[column] c_result
-
-    with nogil:
-        c_result = move(
-            cpp_tokenize(
-                c_strings,
-                c_delimiters
-            )
+    return Column.from_pylibcudf(
+        nvtext.tokenize.tokenize_column(
+            strings.to_pylibcudf(mode="read"),
+            delimiters.to_pylibcudf(mode="read"),
         )
-
-    return Column.from_unique_ptr(move(c_result))
+    )
 
 
 @acquire_spill_lock()
 def _count_tokens_scalar(Column strings, object py_delimiter):
-
-    cdef DeviceScalar delimiter = py_delimiter.device_value
-
-    cdef column_view c_strings = strings.view()
-    cdef const string_scalar* c_delimiter = <const string_scalar*>delimiter\
-        .get_raw_ptr()
-    cdef unique_ptr[column] c_result
-
-    with nogil:
-        c_result = move(
-            cpp_count_tokens(
-                c_strings,
-                c_delimiter[0]
-            )
+    return Column.from_pylibcudf(
+        nvtext.tokenize.count_tokens_scalar(
+            strings.to_pylibcudf(mode="read"),
+            py_delimiter.device_value.c_value
         )
-
-    return Column.from_unique_ptr(move(c_result))
+    )
 
 
 @acquire_spill_lock()
 def _count_tokens_column(Column strings, Column delimiters):
-    cdef column_view c_strings = strings.view()
-    cdef column_view c_delimiters = delimiters.view()
-    cdef unique_ptr[column] c_result
-
-    with nogil:
-        c_result = move(
-            cpp_count_tokens(
-                c_strings,
-                c_delimiters
-            )
+    return Column.from_pylibcudf(
+        nvtext.tokenize.count_tokens_column(
+            strings.to_pylibcudf(mode="read"),
+            delimiters.to_pylibcudf(mode="read")
         )
-
-    return Column.from_unique_ptr(move(c_result))
+    )
 
 
 @acquire_spill_lock()
 def character_tokenize(Column strings):
-    cdef column_view c_strings = strings.view()
-    cdef unique_ptr[column] c_result
-    with nogil:
-        c_result = move(
-            cpp_character_tokenize(c_strings)
+    return Column.from_pylibcudf(
+        nvtext.tokenize.character_tokenize(
+            strings.to_pylibcudf(mode="read")
         )
-
-    return Column.from_unique_ptr(move(c_result))
+    )
 
 
 @acquire_spill_lock()
 def detokenize(Column strings, Column indices, object py_separator):
-
-    cdef DeviceScalar separator = py_separator.device_value
-
-    cdef column_view c_strings = strings.view()
-    cdef column_view c_indices = indices.view()
-    cdef const string_scalar* c_separator = <const string_scalar*>separator\
-        .get_raw_ptr()
-    cdef unique_ptr[column] c_result
-    with nogil:
-        c_result = move(
-            cpp_detokenize(c_strings, c_indices, c_separator[0])
+    return Column.from_pylibcudf(
+        nvtext.tokenize.detokenize(
+            strings.to_pylibcudf(mode="read"),
+            indices.to_pylibcudf(mode="read"),
+            py_separator.device_value.c_value
         )
-
-    return Column.from_unique_ptr(move(c_result))
-
-
-cdef class TokenizeVocabulary:
-    cdef unique_ptr[cpp_tokenize_vocabulary] c_obj
-
-    def __cinit__(self, Column vocab):
-        cdef column_view c_vocab = vocab.view()
-        with nogil:
-            self.c_obj = move(cpp_load_vocabulary(c_vocab))
+    )
 
 
 @acquire_spill_lock()
 def tokenize_with_vocabulary(Column strings,
-                             TokenizeVocabulary vocabulary,
+                             object vocabulary,
                              object py_delimiter,
                              size_type default_id):
-
-    cdef DeviceScalar delimiter = py_delimiter.device_value
-    cdef column_view c_strings = strings.view()
-    cdef const string_scalar* c_delimiter = <const string_scalar*>delimiter\
-        .get_raw_ptr()
-    cdef unique_ptr[column] c_result
-
-    with nogil:
-        c_result = move(
-            cpp_tokenize_with_vocabulary(
-                c_strings,
-                vocabulary.c_obj.get()[0],
-                c_delimiter[0],
-                default_id
-            )
+    return Column.from_pylibcudf(
+        nvtext.tokenize.tokenize_with_vocabulary(
+            strings.to_pylibcudf(mode="read"),
+            vocabulary,
+            py_delimiter.device_value.c_value,
+            default_id
         )
-
-    return Column.from_unique_ptr(move(c_result))
+    )
diff --git a/python/cudf/cudf/core/tokenize_vocabulary.py b/python/cudf/cudf/core/tokenize_vocabulary.py
index 99d85c0c5c0..f0ce6e9d5d1 100644
--- a/python/cudf/cudf/core/tokenize_vocabulary.py
+++ b/python/cudf/cudf/core/tokenize_vocabulary.py
@@ -20,7 +20,9 @@ class TokenizeVocabulary:
     """
 
     def __init__(self, vocabulary: "cudf.Series"):
-        self.vocabulary = cpp_tokenize_vocabulary(vocabulary._column)
+        self.vocabulary = cpp_tokenize_vocabulary(
+            vocabulary._column.to_pylibcudf(mode="read")
+        )
 
     def tokenize(
         self, text, delimiter: str = "", default_id: int = -1
diff --git a/python/pylibcudf/pylibcudf/nvtext/CMakeLists.txt b/python/pylibcudf/pylibcudf/nvtext/CMakeLists.txt
index d97c0a73267..8afbadc3020 100644
--- a/python/pylibcudf/pylibcudf/nvtext/CMakeLists.txt
+++ b/python/pylibcudf/pylibcudf/nvtext/CMakeLists.txt
@@ -13,7 +13,7 @@
 # =============================================================================
 
 set(cython_sources edit_distance.pyx generate_ngrams.pyx jaccard.pyx minhash.pyx
-                   ngrams_tokenize.pyx normalize.pyx replace.pyx stemmer.pyx
+                   ngrams_tokenize.pyx normalize.pyx replace.pyx stemmer.pyx tokenize.pyx
 )
 
 set(linked_libraries cudf::cudf)
diff --git a/python/pylibcudf/pylibcudf/nvtext/__init__.pxd b/python/pylibcudf/pylibcudf/nvtext/__init__.pxd
index a658e57018e..504600b5e76 100644
--- a/python/pylibcudf/pylibcudf/nvtext/__init__.pxd
+++ b/python/pylibcudf/pylibcudf/nvtext/__init__.pxd
@@ -9,6 +9,7 @@ from . cimport (
     normalize,
     replace,
     stemmer,
+    tokenize,
 )
 
 __all__ = [
@@ -20,4 +21,5 @@ __all__ = [
     "normalize",
     "replace",
     "stemmer",
+    "tokenize",
 ]
diff --git a/python/pylibcudf/pylibcudf/nvtext/__init__.py b/python/pylibcudf/pylibcudf/nvtext/__init__.py
index 2c1feb089a2..1d5246c6af7 100644
--- a/python/pylibcudf/pylibcudf/nvtext/__init__.py
+++ b/python/pylibcudf/pylibcudf/nvtext/__init__.py
@@ -9,6 +9,7 @@
     normalize,
     replace,
     stemmer,
+    tokenize,
 )
 
 __all__ = [
@@ -20,4 +21,5 @@
     "normalize",
     "replace",
     "stemmer",
+    "tokenize",
 ]
diff --git a/python/pylibcudf/pylibcudf/nvtext/tokenize.pxd b/python/pylibcudf/pylibcudf/nvtext/tokenize.pxd
new file mode 100644
index 00000000000..0254b91ad58
--- /dev/null
+++ b/python/pylibcudf/pylibcudf/nvtext/tokenize.pxd
@@ -0,0 +1,31 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+from libcpp.memory cimport unique_ptr
+from pylibcudf.column cimport Column
+from pylibcudf.libcudf.nvtext.tokenize cimport tokenize_vocabulary
+from pylibcudf.libcudf.types cimport size_type
+from pylibcudf.scalar cimport Scalar
+
+cdef class TokenizeVocabulary:
+    cdef unique_ptr[tokenize_vocabulary] c_obj
+
+cpdef Column tokenize_scalar(Column input, Scalar delimiter=*)
+
+cpdef Column tokenize_column(Column input, Column delimiters)
+
+cpdef Column count_tokens_scalar(Column input, Scalar delimiter=*)
+
+cpdef Column count_tokens_column(Column input, Column delimiters)
+
+cpdef Column character_tokenize(Column input)
+
+cpdef Column detokenize(Column input, Column row_indices, Scalar separator=*)
+
+cpdef TokenizeVocabulary load_vocabulary(Column input)
+
+cpdef Column tokenize_with_vocabulary(
+    Column input,
+    TokenizeVocabulary vocabulary,
+    Scalar delimiter,
+    size_type default_id=*
+)
diff --git a/python/pylibcudf/pylibcudf/nvtext/tokenize.pyx b/python/pylibcudf/pylibcudf/nvtext/tokenize.pyx
new file mode 100644
index 00000000000..cdecfaabca2
--- /dev/null
+++ b/python/pylibcudf/pylibcudf/nvtext/tokenize.pyx
@@ -0,0 +1,286 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+from cython.operator cimport dereference
+from libcpp.memory cimport unique_ptr
+from libcpp.utility cimport move
+from pylibcudf.column cimport Column
+from pylibcudf.libcudf.column.column cimport column
+from pylibcudf.libcudf.column.column_view cimport column_view
+from pylibcudf.libcudf.nvtext.tokenize cimport (
+    character_tokenize as cpp_character_tokenize,
+    count_tokens as cpp_count_tokens,
+    detokenize as cpp_detokenize,
+    load_vocabulary as cpp_load_vocabulary,
+    tokenize as cpp_tokenize,
+    tokenize_with_vocabulary as cpp_tokenize_with_vocabulary,
+)
+from pylibcudf.libcudf.scalar.scalar cimport string_scalar
+from pylibcudf.libcudf.scalar.scalar_factories cimport (
+    make_string_scalar as cpp_make_string_scalar,
+)
+from pylibcudf.libcudf.types cimport size_type
+
+
+cdef class TokenizeVocabulary:
+    """The Vocabulary object to be used with ``tokenize_with_vocabulary``.
+
+    For details, see :cpp:class:`cudf::nvtext::tokenize_vocabulary`.
+    """
+    def __cinit__(self, Column vocab):
+        cdef column_view c_vocab = vocab.view()
+        with nogil:
+            self.c_obj = move(cpp_load_vocabulary(c_vocab))
+
+cpdef Column tokenize_scalar(Column input, Scalar delimiter=None):
+    """
+    Returns a single column of strings by tokenizing the input
+    strings column using the provided characters as delimiters.
+
+    For details, see :cpp:func:`cudf::nvtext::tokenize`
+
+    Parameters
+    ----------
+    input : Column
+        Strings column to tokenize
+    delimiter : Scalar
+        String scalar used to separate individual
+        strings into tokens
+
+    Returns
+    -------
+    Column
+        New strings columns of tokens
+    """
+    cdef unique_ptr[column] c_result
+
+    if delimiter is None:
+        delimiter = Scalar.from_libcudf(
+            cpp_make_string_scalar("".encode())
+        )
+
+    with nogil:
+        c_result = cpp_tokenize(
+            input.view(),
+            dereference(<const string_scalar*>delimiter.c_obj.get()),
+        )
+
+    return Column.from_libcudf(move(c_result))
+
+cpdef Column tokenize_column(Column input, Column delimiters):
+    """
+    Returns a single column of strings by tokenizing the input
+    strings column using multiple strings as delimiters.
+
+    For details, see :cpp:func:`cudf::nvtext::tokenize`
+
+    Parameters
+    ----------
+    input : Column
+        Strings column to tokenize
+    delimiters : Column
+        Strings column used to separate individual strings into tokens
+
+    Returns
+    -------
+    Column
+        New strings columns of tokens
+    """
+    cdef unique_ptr[column] c_result
+
+    with nogil:
+        c_result = cpp_tokenize(
+            input.view(),
+            delimiters.view(),
+        )
+
+    return Column.from_libcudf(move(c_result))
+
+cpdef Column count_tokens_scalar(Column input, Scalar delimiter=None):
+    """
+    Returns the number of tokens in each string of a strings column
+    using the provided characters as delimiters.
+
+    For details, see :cpp:func:`cudf::nvtext::count_tokens`
+
+    Parameters
+    ----------
+    input : Column
+        Strings column to count tokens
+    delimiter : Scalar
+        String scalar used to separate each string into tokens
+
+    Returns
+    -------
+    Column
+        New column of token counts
+    """
+    cdef unique_ptr[column] c_result
+
+    if delimiter is None:
+        delimiter = Scalar.from_libcudf(
+            cpp_make_string_scalar("".encode())
+        )
+
+    with nogil:
+        c_result = cpp_count_tokens(
+            input.view(),
+            dereference(<const string_scalar*>delimiter.c_obj.get()),
+        )
+
+    return Column.from_libcudf(move(c_result))
+
+cpdef Column count_tokens_column(Column input, Column delimiters):
+    """
+    Returns the number of tokens in each string of a strings column
+    using multiple strings as delimiters.
+
+    For details, see :cpp:func:`cudf::nvtext::count_tokens`
+
+    Parameters
+    ----------
+    input : Column
+        Strings column to count tokens
+    delimiters : Column
+        Strings column used to separate
+        each string into tokens
+
+    Returns
+    -------
+    Column
+        New column of token counts
+    """
+    cdef unique_ptr[column] c_result
+
+    with nogil:
+        c_result = cpp_count_tokens(
+            input.view(),
+            delimiters.view(),
+        )
+
+    return Column.from_libcudf(move(c_result))
+
+cpdef Column character_tokenize(Column input):
+    """
+    Returns a single column of strings by converting
+    each character to a string.
+
+    For details, see :cpp:func:`cudf::nvtext::character_tokenize`
+
+    Parameters
+    ----------
+    input : Column
+        Strings column to tokenize
+
+    Returns
+    -------
+    Column
+        New strings columns of tokens
+    """
+    cdef unique_ptr[column] c_result
+    with nogil:
+        c_result = cpp_character_tokenize(input.view())
+
+    return Column.from_libcudf(move(c_result))
+
+cpdef Column detokenize(
+    Column input,
+    Column row_indices,
+    Scalar separator=None
+):
+    """
+    Creates a strings column from a strings column of tokens
+    and an associated column of row ids.
+
+    For details, see :cpp:func:`cudf::nvtext::detokenize`
+
+    Parameters
+    ----------
+    input : Column
+        Strings column to detokenize
+    row_indices : Column
+        The relative output row index assigned
+        for each token in the input column
+    separator : Scalar
+        String to append after concatenating
+        each token to the proper output row
+
+    Returns
+    -------
+    Column
+        New strings columns of tokens
+    """
+    cdef unique_ptr[column] c_result
+
+    if separator is None:
+        separator = Scalar.from_libcudf(
+            cpp_make_string_scalar(" ".encode())
+        )
+
+    with nogil:
+        c_result = cpp_detokenize(
+            input.view(),
+            row_indices.view(),
+            dereference(<const string_scalar*>separator.c_obj.get())
+        )
+
+    return Column.from_libcudf(move(c_result))
+
+cpdef TokenizeVocabulary load_vocabulary(Column input):
+    """
+    Create a ``TokenizeVocabulary`` object from a strings column.
+
+    For details, see :cpp:func:`cudf::nvtext::load_vocabulary`
+
+    Parameters
+    ----------
+    input : Column
+        Strings for the vocabulary
+
+    Returns
+    -------
+    TokenizeVocabulary
+        Object to be used with cpp:func:`cudf::nvtext::tokenize_with_vocabulary`
+    """
+    return TokenizeVocabulary(input)
+
+
+cpdef Column tokenize_with_vocabulary(
+    Column input,
+    TokenizeVocabulary vocabulary,
+    Scalar delimiter,
+    size_type default_id=-1
+):
+    """
+    Returns the token ids for the input string by looking
+    up each delimited token in the given vocabulary.
+
+    For details, see :cpp:func:`cudf::nvtext::tokenize_with_vocabulary`
+
+    Parameters
+    ----------
+    input : Column
+        Strings column to tokenize
+    vocabulary : TokenizeVocabulary
+        Used to lookup tokens within ``input``
+    delimiter : Scalar
+        Used to identify tokens within ``input``
+    default_id : size_type
+        The token id to be used for tokens not found
+        in the vocabulary; Default is -1
+
+    Returns
+    -------
+    Column
+        Lists column of token ids
+    """
+    cdef unique_ptr[column] c_result
+
+    with nogil:
+        c_result = cpp_tokenize_with_vocabulary(
+            input.view(),
+            dereference(vocabulary.c_obj.get()),
+            dereference(<const string_scalar*>delimiter.c_obj.get()),
+            default_id
+        )
+
+    return Column.from_libcudf(move(c_result))
diff --git a/python/pylibcudf/pylibcudf/tests/test_nvtext_tokenize.py b/python/pylibcudf/pylibcudf/tests/test_nvtext_tokenize.py
new file mode 100644
index 00000000000..4ec9a5ee1a5
--- /dev/null
+++ b/python/pylibcudf/pylibcudf/tests/test_nvtext_tokenize.py
@@ -0,0 +1,101 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+import pyarrow as pa
+import pytest
+from utils import assert_column_eq
+
+import pylibcudf as plc
+
+
+@pytest.fixture(scope="module")
+def input_col():
+    return pa.array(["a", "b c", "d.e:f;"])
+
+
+@pytest.mark.parametrize(
+    "delimiter", [None, plc.interop.from_arrow(pa.scalar("."))]
+)
+def test_tokenize_scalar(input_col, delimiter):
+    result = plc.nvtext.tokenize.tokenize_scalar(
+        plc.interop.from_arrow(input_col), delimiter
+    )
+    if delimiter is None:
+        expected = pa.array(["a", "b", "c", "d.e:f;"])
+    else:
+        expected = pa.array(["a", "b c", "d", "e:f;"])
+    assert_column_eq(result, expected)
+
+
+def test_tokenize_column(input_col):
+    delimiters = pa.array([" ", ".", ":", ";"])
+    result = plc.nvtext.tokenize.tokenize_column(
+        plc.interop.from_arrow(input_col), plc.interop.from_arrow(delimiters)
+    )
+    expected = pa.array(["a", "b", "c", "d", "e", "f"])
+    assert_column_eq(result, expected)
+
+
+@pytest.mark.parametrize(
+    "delimiter", [None, plc.interop.from_arrow(pa.scalar("."))]
+)
+def test_count_tokens_scalar(input_col, delimiter):
+    result = plc.nvtext.tokenize.count_tokens_scalar(
+        plc.interop.from_arrow(input_col), delimiter
+    )
+    if delimiter is None:
+        expected = pa.array([1, 2, 1], type=pa.int32())
+    else:
+        expected = pa.array([1, 1, 2], type=pa.int32())
+    assert_column_eq(result, expected)
+
+
+def test_count_tokens_column(input_col):
+    delimiters = pa.array([" ", ".", ":", ";"])
+    result = plc.nvtext.tokenize.count_tokens_column(
+        plc.interop.from_arrow(input_col), plc.interop.from_arrow(delimiters)
+    )
+    expected = pa.array([1, 2, 3], type=pa.int32())
+    assert_column_eq(result, expected)
+
+
+def test_character_tokenize(input_col):
+    result = plc.nvtext.tokenize.character_tokenize(
+        plc.interop.from_arrow(input_col)
+    )
+    expected = pa.array(["a", "b", " ", "c", "d", ".", "e", ":", "f", ";"])
+    assert_column_eq(result, expected)
+
+
+@pytest.mark.parametrize(
+    "delimiter", [None, plc.interop.from_arrow(pa.scalar("."))]
+)
+def test_detokenize(input_col, delimiter):
+    row_indices = pa.array([0, 0, 1])
+    result = plc.nvtext.tokenize.detokenize(
+        plc.interop.from_arrow(input_col), plc.interop.from_arrow(row_indices)
+    )
+    expected = pa.array(["a b c", "d.e:f;"])
+    assert_column_eq(result, expected)
+
+
+def test_load_vocabulary(input_col):
+    result = plc.nvtext.tokenize.load_vocabulary(
+        plc.interop.from_arrow(input_col)
+    )
+    assert isinstance(result, plc.nvtext.tokenize.TokenizeVocabulary)
+
+
+@pytest.mark.parametrize("default_id", [-1, 0])
+def test_tokenize_with_vocabulary(input_col, default_id):
+    result = plc.nvtext.tokenize.tokenize_with_vocabulary(
+        plc.interop.from_arrow(input_col),
+        plc.nvtext.tokenize.load_vocabulary(plc.interop.from_arrow(input_col)),
+        plc.interop.from_arrow(pa.scalar(" ")),
+        default_id,
+    )
+    pa_result = plc.interop.to_arrow(result)
+    if default_id == -1:
+        expected = pa.array([[0], [-1, -1], [2]], type=pa_result.type)
+    else:
+        expected = pa.array([[0], [0, 0], [2]], type=pa_result.type)
+    assert_column_eq(result, expected)

From 3f66087c6976433a02c05135ab9c83564118846a Mon Sep 17 00:00:00 2001
From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
Date: Wed, 30 Oct 2024 18:49:00 -0700
Subject: [PATCH 17/41] Fix some documentation rendering for pylibcudf (#17217)

* Fixed/modified some title headers
* Fixed/added pylibcudf section docstrings

Authors:
  - Matthew Roeschke (https://github.com/mroeschke)

Approvers:
  - Vyas Ramasubramani (https://github.com/vyasr)
  - Matthew Murray (https://github.com/Matt711)

URL: https://github.com/rapidsai/cudf/pull/17217
---
 .../user_guide/api_docs/pylibcudf/strings/findall.rst  |  6 +++---
 .../source/user_guide/api_docs/pylibcudf/table.rst     |  2 +-
 python/pylibcudf/pylibcudf/binaryop.pyx                |  1 +
 python/pylibcudf/pylibcudf/filling.pyx                 |  5 +++++
 python/pylibcudf/pylibcudf/strings/__init__.py         |  1 +
 python/pylibcudf/pylibcudf/strings/regex_program.pyx   |  4 ++++
 python/pylibcudf/pylibcudf/strings/replace.pyx         |  1 +
 python/pylibcudf/pylibcudf/types.pyx                   | 10 ++++++++++
 8 files changed, 26 insertions(+), 4 deletions(-)

diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/findall.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/findall.rst
index 9850ee10098..699e38ebbe5 100644
--- a/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/findall.rst
+++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/findall.rst
@@ -1,6 +1,6 @@
-====
-find
-====
+=======
+findall
+=======
 
 .. automodule:: pylibcudf.strings.findall
    :members:
diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/table.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/table.rst
index e39ca18a12b..4de9bced86f 100644
--- a/docs/cudf/source/user_guide/api_docs/pylibcudf/table.rst
+++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/table.rst
@@ -1,5 +1,5 @@
 =====
-Table
+table
 =====
 
 .. automodule:: pylibcudf.table
diff --git a/python/pylibcudf/pylibcudf/binaryop.pyx b/python/pylibcudf/pylibcudf/binaryop.pyx
index 51b2b4cfaa3..eef73bf4e9d 100644
--- a/python/pylibcudf/pylibcudf/binaryop.pyx
+++ b/python/pylibcudf/pylibcudf/binaryop.pyx
@@ -100,6 +100,7 @@ cpdef bool is_supported_operation(
         The right hand side data type.
     op : BinaryOperator
         The operation to check.
+
     Returns
     -------
     bool
diff --git a/python/pylibcudf/pylibcudf/filling.pyx b/python/pylibcudf/pylibcudf/filling.pyx
index 0372e1132cc..a47004a1e42 100644
--- a/python/pylibcudf/pylibcudf/filling.pyx
+++ b/python/pylibcudf/pylibcudf/filling.pyx
@@ -77,6 +77,10 @@ cpdef void fill_in_place(
         The index at which to stop filling.
     value : Scalar
         The value to fill with.
+
+    Returns
+    -------
+    None
     """
 
     with nogil:
@@ -101,6 +105,7 @@ cpdef Column sequence(size_type size, Scalar init, Scalar step):
         The initial value of the sequence
     step : Scalar
         The step of the sequence
+
     Returns
     -------
     pylibcudf.Column
diff --git a/python/pylibcudf/pylibcudf/strings/__init__.py b/python/pylibcudf/pylibcudf/strings/__init__.py
index 40fa8261905..fa7294c7dbd 100644
--- a/python/pylibcudf/pylibcudf/strings/__init__.py
+++ b/python/pylibcudf/pylibcudf/strings/__init__.py
@@ -32,6 +32,7 @@
     "capitalize",
     "case",
     "char_types",
+    "combine",
     "contains",
     "convert",
     "extract",
diff --git a/python/pylibcudf/pylibcudf/strings/regex_program.pyx b/python/pylibcudf/pylibcudf/strings/regex_program.pyx
index f426b6888ae..91f585cd637 100644
--- a/python/pylibcudf/pylibcudf/strings/regex_program.pyx
+++ b/python/pylibcudf/pylibcudf/strings/regex_program.pyx
@@ -37,6 +37,10 @@ cdef class RegexProgram:
         flags : Uniont[int, RegexFlags]
             Regex flags for interpreting special characters in the pattern
 
+        Returns
+        -------
+        RegexProgram
+            A new RegexProgram
         """
         cdef unique_ptr[regex_program] c_prog
         cdef regex_flags c_flags
diff --git a/python/pylibcudf/pylibcudf/strings/replace.pyx b/python/pylibcudf/pylibcudf/strings/replace.pyx
index 6db7f04fcbb..2b94f5e3fee 100644
--- a/python/pylibcudf/pylibcudf/strings/replace.pyx
+++ b/python/pylibcudf/pylibcudf/strings/replace.pyx
@@ -136,6 +136,7 @@ cpdef Column replace_slice(
         Start position where repl will be added.
     stop : size_type, default -1
         End position (exclusive) to use for replacement.
+
     Returns
     -------
     pylibcudf.Column
diff --git a/python/pylibcudf/pylibcudf/types.pyx b/python/pylibcudf/pylibcudf/types.pyx
index 58c7d97e9bc..a0c31f994a3 100644
--- a/python/pylibcudf/pylibcudf/types.pyx
+++ b/python/pylibcudf/pylibcudf/types.pyx
@@ -79,6 +79,16 @@ cpdef size_type size_of(DataType t):
     Only fixed-width types are supported.
 
     For details, see :cpp:func:`size_of`.
+
+    Parameters
+    ----------
+    t : DataType
+        The DataType to get the size of.
+
+    Returns
+    -------
+    int
+        Size in bytes of an element of the specified type.
     """
     with nogil:
         return cpp_size_of(t.c_obj)

From 0db2463a70c258086a123d93455d1061871868fb Mon Sep 17 00:00:00 2001
From: Matthew Murray <41342305+Matt711@users.noreply.github.com>
Date: Wed, 30 Oct 2024 23:20:17 -0400
Subject: [PATCH 18/41] Migrate NVText Byte Pair Encoding APIs to pylibcudf
 (#17101)

Part of #15162

Authors:
  - Matthew Murray (https://github.com/Matt711)

Approvers:
  - Vyas Ramasubramani (https://github.com/vyasr)

URL: https://github.com/rapidsai/cudf/pull/17101
---
 .../pylibcudf/nvtext/byte_pair_encode.rst     |  6 ++
 .../api_docs/pylibcudf/nvtext/index.rst       |  1 +
 .../cudf/_lib/nvtext/byte_pair_encode.pyx     | 45 +++---------
 python/cudf/cudf/core/byte_pair_encoding.py   |  7 +-
 .../pylibcudf/pylibcudf/nvtext/CMakeLists.txt |  5 +-
 .../pylibcudf/pylibcudf/nvtext/__init__.pxd   |  2 +
 python/pylibcudf/pylibcudf/nvtext/__init__.py |  2 +
 .../pylibcudf/nvtext/byte_pair_encode.pxd     | 16 +++++
 .../pylibcudf/nvtext/byte_pair_encode.pyx     | 70 +++++++++++++++++++
 .../tests/test_nvtext_byte_pair_encode.py     | 46 ++++++++++++
 10 files changed, 160 insertions(+), 40 deletions(-)
 create mode 100644 docs/cudf/source/user_guide/api_docs/pylibcudf/nvtext/byte_pair_encode.rst
 create mode 100644 python/pylibcudf/pylibcudf/nvtext/byte_pair_encode.pxd
 create mode 100644 python/pylibcudf/pylibcudf/nvtext/byte_pair_encode.pyx
 create mode 100644 python/pylibcudf/pylibcudf/tests/test_nvtext_byte_pair_encode.py

diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/nvtext/byte_pair_encode.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/nvtext/byte_pair_encode.rst
new file mode 100644
index 00000000000..908fcc4fde6
--- /dev/null
+++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/nvtext/byte_pair_encode.rst
@@ -0,0 +1,6 @@
+================
+byte_pair_encode
+================
+
+.. automodule:: pylibcudf.nvtext.byte_pair_encode
+   :members:
diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/nvtext/index.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/nvtext/index.rst
index 8c45942ed47..00314bceeb9 100644
--- a/docs/cudf/source/user_guide/api_docs/pylibcudf/nvtext/index.rst
+++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/nvtext/index.rst
@@ -8,6 +8,7 @@ nvtext
     generate_ngrams
     jaccard
     minhash
+    byte_pair_encode
     ngrams_tokenize
     normalize
     replace
diff --git a/python/cudf/cudf/_lib/nvtext/byte_pair_encode.pyx b/python/cudf/cudf/_lib/nvtext/byte_pair_encode.pyx
index 0d768e24f39..2b2762eead2 100644
--- a/python/cudf/cudf/_lib/nvtext/byte_pair_encode.pyx
+++ b/python/cudf/cudf/_lib/nvtext/byte_pair_encode.pyx
@@ -3,49 +3,22 @@
 
 from cudf.core.buffer import acquire_spill_lock
 
-from libcpp.memory cimport unique_ptr
-from libcpp.utility cimport move
-
-from pylibcudf.libcudf.column.column cimport column
-from pylibcudf.libcudf.column.column_view cimport column_view
-from pylibcudf.libcudf.nvtext.byte_pair_encode cimport (
-    bpe_merge_pairs as cpp_bpe_merge_pairs,
-    byte_pair_encoding as cpp_byte_pair_encoding,
-    load_merge_pairs as cpp_load_merge_pairs,
-)
-from pylibcudf.libcudf.scalar.scalar cimport string_scalar
-
 from cudf._lib.column cimport Column
-from cudf._lib.scalar cimport DeviceScalar
-
 
-cdef class BPEMergePairs:
-    cdef unique_ptr[cpp_bpe_merge_pairs] c_obj
-
-    def __cinit__(self, Column merge_pairs):
-        cdef column_view c_pairs = merge_pairs.view()
-        with nogil:
-            self.c_obj = move(cpp_load_merge_pairs(c_pairs))
+from pylibcudf import nvtext
+from pylibcudf.nvtext.byte_pair_encode import BPEMergePairs  # no-cython-lint
 
 
 @acquire_spill_lock()
 def byte_pair_encoding(
     Column strings,
-    BPEMergePairs merge_pairs,
+    object merge_pairs,
     object separator
 ):
-    cdef column_view c_strings = strings.view()
-    cdef DeviceScalar d_separator = separator.device_value
-    cdef const string_scalar* c_separator = <const string_scalar*>d_separator\
-        .get_raw_ptr()
-    cdef unique_ptr[column] c_result
-    with nogil:
-        c_result = move(
-            cpp_byte_pair_encoding(
-                c_strings,
-                merge_pairs.c_obj.get()[0],
-                c_separator[0]
-            )
+    return Column.from_pylibcudf(
+        nvtext.byte_pair_encode.byte_pair_encoding(
+            strings.to_pylibcudf(mode="read"),
+            merge_pairs,
+            separator.device_value.c_value
         )
-
-    return Column.from_unique_ptr(move(c_result))
+    )
diff --git a/python/cudf/cudf/core/byte_pair_encoding.py b/python/cudf/cudf/core/byte_pair_encoding.py
index 6ca64a0a2be..8d38a5f2272 100644
--- a/python/cudf/cudf/core/byte_pair_encoding.py
+++ b/python/cudf/cudf/core/byte_pair_encoding.py
@@ -2,9 +2,10 @@
 
 from __future__ import annotations
 
+import pylibcudf as plc
+
 import cudf
 from cudf._lib.nvtext.byte_pair_encode import (
-    BPEMergePairs as cpp_merge_pairs,
     byte_pair_encoding as cpp_byte_pair_encoding,
 )
 
@@ -25,7 +26,9 @@ class BytePairEncoder:
     """
 
     def __init__(self, merges_pair: "cudf.Series"):
-        self.merge_pairs = cpp_merge_pairs(merges_pair._column)
+        self.merge_pairs = plc.nvtext.byte_pair_encode.BPEMergePairs(
+            merges_pair._column.to_pylibcudf(mode="read")
+        )
 
     def __call__(self, text, separator: str = " ") -> cudf.Series:
         """
diff --git a/python/pylibcudf/pylibcudf/nvtext/CMakeLists.txt b/python/pylibcudf/pylibcudf/nvtext/CMakeLists.txt
index 8afbadc3020..fa3cd448fa3 100644
--- a/python/pylibcudf/pylibcudf/nvtext/CMakeLists.txt
+++ b/python/pylibcudf/pylibcudf/nvtext/CMakeLists.txt
@@ -12,8 +12,9 @@
 # the License.
 # =============================================================================
 
-set(cython_sources edit_distance.pyx generate_ngrams.pyx jaccard.pyx minhash.pyx
-                   ngrams_tokenize.pyx normalize.pyx replace.pyx stemmer.pyx tokenize.pyx
+set(cython_sources
+    edit_distance.pyx generate_ngrams.pyx jaccard.pyx minhash.pyx ngrams_tokenize.pyx normalize.pyx
+    replace.pyx stemmer.pyx tokenize.pyx byte_pair_encode.pyx
 )
 
 set(linked_libraries cudf::cudf)
diff --git a/python/pylibcudf/pylibcudf/nvtext/__init__.pxd b/python/pylibcudf/pylibcudf/nvtext/__init__.pxd
index 504600b5e76..a9ede17761b 100644
--- a/python/pylibcudf/pylibcudf/nvtext/__init__.pxd
+++ b/python/pylibcudf/pylibcudf/nvtext/__init__.pxd
@@ -1,6 +1,7 @@
 # Copyright (c) 2024, NVIDIA CORPORATION.
 
 from . cimport (
+    byte_pair_encode,
     edit_distance,
     generate_ngrams,
     jaccard,
@@ -17,6 +18,7 @@ __all__ = [
     "generate_ngrams",
     "jaccard",
     "minhash",
+    "byte_pair_encode",
     "ngrams_tokenize",
     "normalize",
     "replace",
diff --git a/python/pylibcudf/pylibcudf/nvtext/__init__.py b/python/pylibcudf/pylibcudf/nvtext/__init__.py
index 1d5246c6af7..87650a02c33 100644
--- a/python/pylibcudf/pylibcudf/nvtext/__init__.py
+++ b/python/pylibcudf/pylibcudf/nvtext/__init__.py
@@ -1,6 +1,7 @@
 # Copyright (c) 2024, NVIDIA CORPORATION.
 
 from . import (
+    byte_pair_encode,
     edit_distance,
     generate_ngrams,
     jaccard,
@@ -17,6 +18,7 @@
     "generate_ngrams",
     "jaccard",
     "minhash",
+    "byte_pair_encode",
     "ngrams_tokenize",
     "normalize",
     "replace",
diff --git a/python/pylibcudf/pylibcudf/nvtext/byte_pair_encode.pxd b/python/pylibcudf/pylibcudf/nvtext/byte_pair_encode.pxd
new file mode 100644
index 00000000000..e4b93e96b9d
--- /dev/null
+++ b/python/pylibcudf/pylibcudf/nvtext/byte_pair_encode.pxd
@@ -0,0 +1,16 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+from libcpp.memory cimport unique_ptr
+from pylibcudf.column cimport Column
+from pylibcudf.libcudf.nvtext.byte_pair_encode cimport bpe_merge_pairs
+from pylibcudf.scalar cimport Scalar
+
+
+cdef class BPEMergePairs:
+    cdef unique_ptr[bpe_merge_pairs] c_obj
+
+cpdef Column byte_pair_encoding(
+    Column input,
+    BPEMergePairs merge_pairs,
+    Scalar separator=*
+)
diff --git a/python/pylibcudf/pylibcudf/nvtext/byte_pair_encode.pyx b/python/pylibcudf/pylibcudf/nvtext/byte_pair_encode.pyx
new file mode 100644
index 00000000000..76caad276d4
--- /dev/null
+++ b/python/pylibcudf/pylibcudf/nvtext/byte_pair_encode.pyx
@@ -0,0 +1,70 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+from cython.operator cimport dereference
+from libcpp.memory cimport unique_ptr
+from libcpp.utility cimport move
+from pylibcudf.column cimport Column
+from pylibcudf.libcudf.column.column cimport column
+from pylibcudf.libcudf.column.column_view cimport column_view
+from pylibcudf.libcudf.nvtext.byte_pair_encode cimport (
+    byte_pair_encoding as cpp_byte_pair_encoding,
+    load_merge_pairs as cpp_load_merge_pairs,
+)
+from pylibcudf.libcudf.scalar.scalar cimport string_scalar
+from pylibcudf.libcudf.scalar.scalar_factories cimport (
+    make_string_scalar as cpp_make_string_scalar,
+)
+from pylibcudf.scalar cimport Scalar
+
+
+cdef class BPEMergePairs:
+    """The table of merge pairs for the BPE encoder.
+
+    For details, see :cpp:class:`cudf::nvtext::bpe_merge_pairs`.
+    """
+    def __cinit__(self, Column merge_pairs):
+        cdef column_view c_pairs = merge_pairs.view()
+        with nogil:
+            self.c_obj = move(cpp_load_merge_pairs(c_pairs))
+
+cpdef Column byte_pair_encoding(
+    Column input,
+    BPEMergePairs merge_pairs,
+    Scalar separator=None
+):
+    """
+    Byte pair encode the input strings.
+
+    For details, see :cpp:func:`cudf::nvtext::byte_pair_encoding`.
+
+    Parameters
+    ----------
+    input : Column
+        Strings to encode.
+    merge_pairs : BPEMergePairs
+        Substrings to rebuild each string on.
+    separator : Scalar
+        String used to build the output after encoding. Default is a space.
+
+    Returns
+    -------
+    Column
+        An encoded column of strings.
+    """
+    cdef unique_ptr[column] c_result
+
+    if separator is None:
+        separator = Scalar.from_libcudf(
+            cpp_make_string_scalar(" ".encode())
+        )
+
+    with nogil:
+        c_result = move(
+            cpp_byte_pair_encoding(
+                input.view(),
+                dereference(merge_pairs.c_obj.get()),
+                dereference(<const string_scalar*>separator.c_obj.get()),
+            )
+        )
+
+    return Column.from_libcudf(move(c_result))
diff --git a/python/pylibcudf/pylibcudf/tests/test_nvtext_byte_pair_encode.py b/python/pylibcudf/pylibcudf/tests/test_nvtext_byte_pair_encode.py
new file mode 100644
index 00000000000..7d6718a959b
--- /dev/null
+++ b/python/pylibcudf/pylibcudf/tests/test_nvtext_byte_pair_encode.py
@@ -0,0 +1,46 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+import pyarrow as pa
+import pytest
+from utils import assert_column_eq
+
+import pylibcudf as plc
+
+
+@pytest.fixture(scope="module")
+def input_col():
+    return pa.array(
+        [
+            "e n",
+            "i t",
+            "i s",
+            "e s",
+            "en t",
+            "c e",
+            "es t",
+            "en ce",
+            "t est",
+            "s ent",
+        ]
+    )
+
+
+@pytest.mark.parametrize(
+    "separator", [None, plc.interop.from_arrow(pa.scalar("e"))]
+)
+def test_byte_pair_encoding(input_col, separator):
+    plc_col = plc.interop.from_arrow(
+        pa.array(["test sentence", "thisis test"])
+    )
+    result = plc.nvtext.byte_pair_encode.byte_pair_encoding(
+        plc_col,
+        plc.nvtext.byte_pair_encode.BPEMergePairs(
+            plc.interop.from_arrow(input_col)
+        ),
+        separator,
+    )
+    if separator is None:
+        expected = pa.array(["test   sent ence", "t h is is   test"])
+    else:
+        expected = pa.array(["teste esenteence", "teheiseise etest"])
+    assert_column_eq(result, expected)

From a69de571ce28a4b71b78ce03f7d34ec70486415e Mon Sep 17 00:00:00 2001
From: brandon-b-miller <53796099+brandon-b-miller@users.noreply.github.com>
Date: Wed, 30 Oct 2024 22:21:42 -0500
Subject: [PATCH 19/41] Migrate hashing operations to `pylibcudf` (#15418)

This PR creates `pylibcudf` hashing APIs and modifies the cuDF Cython to leverage them. cc @vyasr

Authors:
  - https://github.com/brandon-b-miller

Approvers:
  - Yunsong Wang (https://github.com/PointKernel)
  - Bradley Dice (https://github.com/bdice)
  - Lawrence Mitchell (https://github.com/wence-)
  - Vyas Ramasubramani (https://github.com/vyasr)

URL: https://github.com/rapidsai/cudf/pull/15418
---
 .../all_cuda-118_arch-x86_64.yaml             |   2 +
 .../all_cuda-125_arch-x86_64.yaml             |   2 +
 cpp/include/cudf/hashing.hpp                  |  18 +-
 cpp/src/hash/md5_hash.cu                      |   3 +-
 cpp/src/hash/sha_hash.cuh                     |   3 +-
 cpp/tests/hashing/sha1_test.cpp               |   4 +-
 cpp/tests/hashing/sha224_test.cpp             |   4 +-
 cpp/tests/hashing/sha256_test.cpp             |   4 +-
 cpp/tests/hashing/sha384_test.cpp             |   4 +-
 cpp/tests/hashing/sha512_test.cpp             |   4 +-
 dependencies.yaml                             |   5 +-
 .../user_guide/api_docs/pylibcudf/hashing.rst |   6 +
 .../user_guide/api_docs/pylibcudf/index.rst   |   1 +
 python/cudf/cudf/_lib/hash.pyx                |  57 ++--
 python/cudf/pyproject.toml                    |   2 +
 python/pylibcudf/pylibcudf/CMakeLists.txt     |   1 +
 python/pylibcudf/pylibcudf/__init__.pxd       |   2 +
 python/pylibcudf/pylibcudf/__init__.py        |   2 +
 python/pylibcudf/pylibcudf/hashing.pxd        |  30 ++
 python/pylibcudf/pylibcudf/hashing.pyx        | 240 ++++++++++++++++
 python/pylibcudf/pylibcudf/libcudf/hash.pxd   |  41 +--
 python/pylibcudf/pylibcudf/libcudf/hash.pyx   |   0
 python/pylibcudf/pylibcudf/tests/conftest.py  |  12 +-
 .../pylibcudf/pylibcudf/tests/test_hashing.py | 269 ++++++++++++++++++
 24 files changed, 639 insertions(+), 77 deletions(-)
 create mode 100644 docs/cudf/source/user_guide/api_docs/pylibcudf/hashing.rst
 create mode 100644 python/pylibcudf/pylibcudf/hashing.pxd
 create mode 100644 python/pylibcudf/pylibcudf/hashing.pyx
 create mode 100644 python/pylibcudf/pylibcudf/libcudf/hash.pyx
 create mode 100644 python/pylibcudf/pylibcudf/tests/test_hashing.py

diff --git a/conda/environments/all_cuda-118_arch-x86_64.yaml b/conda/environments/all_cuda-118_arch-x86_64.yaml
index f3bbaaa8779..24dc3c9a7cc 100644
--- a/conda/environments/all_cuda-118_arch-x86_64.yaml
+++ b/conda/environments/all_cuda-118_arch-x86_64.yaml
@@ -46,6 +46,7 @@ dependencies:
 - librdkafka>=2.5.0,<2.6.0a0
 - librmm==24.12.*,>=0.0.0a0
 - make
+- mmh3
 - moto>=4.0.8
 - msgpack-python
 - myst-nb
@@ -76,6 +77,7 @@ dependencies:
 - pytest-xdist
 - pytest<8
 - python-confluent-kafka>=2.5.0,<2.6.0a0
+- python-xxhash
 - python>=3.10,<3.13
 - pytorch>=2.1.0
 - rapids-build-backend>=0.3.0,<0.4.0.dev0
diff --git a/conda/environments/all_cuda-125_arch-x86_64.yaml b/conda/environments/all_cuda-125_arch-x86_64.yaml
index 38c5b361f70..a2bb2a3fe7f 100644
--- a/conda/environments/all_cuda-125_arch-x86_64.yaml
+++ b/conda/environments/all_cuda-125_arch-x86_64.yaml
@@ -45,6 +45,7 @@ dependencies:
 - librdkafka>=2.5.0,<2.6.0a0
 - librmm==24.12.*,>=0.0.0a0
 - make
+- mmh3
 - moto>=4.0.8
 - msgpack-python
 - myst-nb
@@ -74,6 +75,7 @@ dependencies:
 - pytest-xdist
 - pytest<8
 - python-confluent-kafka>=2.5.0,<2.6.0a0
+- python-xxhash
 - python>=3.10,<3.13
 - pytorch>=2.1.0
 - rapids-build-backend>=0.3.0,<0.4.0.dev0
diff --git a/cpp/include/cudf/hashing.hpp b/cpp/include/cudf/hashing.hpp
index 0c5327edb91..307a52cd242 100644
--- a/cpp/include/cudf/hashing.hpp
+++ b/cpp/include/cudf/hashing.hpp
@@ -22,26 +22,27 @@
 
 namespace CUDF_EXPORT cudf {
 
-/**
- * @addtogroup column_hash
- * @{
- * @file
- */
-
 /**
  * @brief Type of hash value
- *
+ * @ingroup column_hash
  */
 using hash_value_type = uint32_t;
 
 /**
  * @brief The default seed value for hash functions
+ * @ingroup column_hash
  */
 static constexpr uint32_t DEFAULT_HASH_SEED = 0;
 
 //! Hash APIs
 namespace hashing {
 
+/**
+ * @addtogroup column_hash
+ * @{
+ * @file
+ */
+
 /**
  * @brief Computes the MurmurHash3 32-bit hash value of each row in the given table
  *
@@ -183,7 +184,8 @@ std::unique_ptr<column> xxhash_64(
   rmm::cuda_stream_view stream      = cudf::get_default_stream(),
   rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
+/** @} */  // end of group
+
 }  // namespace hashing
 
-/** @} */  // end of group
 }  // namespace CUDF_EXPORT cudf
diff --git a/cpp/src/hash/md5_hash.cu b/cpp/src/hash/md5_hash.cu
index c7bfd4aecf4..a0c51940c87 100644
--- a/cpp/src/hash/md5_hash.cu
+++ b/cpp/src/hash/md5_hash.cu
@@ -302,7 +302,8 @@ std::unique_ptr<column> md5(table_view const& input,
                              }
                              return md5_leaf_type_check(col.type());
                            }),
-               "Unsupported column type for hash function.");
+               "Unsupported column type for hash function.",
+               cudf::data_type_error);
 
   // Digest size in bytes
   auto constexpr digest_size = 32;
diff --git a/cpp/src/hash/sha_hash.cuh b/cpp/src/hash/sha_hash.cuh
index ebaec8e2775..eb002cf9c6f 100644
--- a/cpp/src/hash/sha_hash.cuh
+++ b/cpp/src/hash/sha_hash.cuh
@@ -513,7 +513,8 @@ std::unique_ptr<column> sha_hash(table_view const& input,
   CUDF_EXPECTS(
     std::all_of(
       input.begin(), input.end(), [](auto const& col) { return sha_leaf_type_check(col.type()); }),
-    "Unsupported column type for hash function.");
+    "Unsupported column type for hash function.",
+    cudf::data_type_error);
 
   // Result column allocation and creation
   auto begin = thrust::make_constant_iterator(Hasher::digest_size);
diff --git a/cpp/tests/hashing/sha1_test.cpp b/cpp/tests/hashing/sha1_test.cpp
index 1e86751bb4c..3aa0bda6ae8 100644
--- a/cpp/tests/hashing/sha1_test.cpp
+++ b/cpp/tests/hashing/sha1_test.cpp
@@ -136,7 +136,7 @@ TEST_F(SHA1HashTest, ListsUnsupported)
 
   auto const input = cudf::table_view({strings_list_col});
 
-  EXPECT_THROW(cudf::hashing::sha1(input), cudf::logic_error);
+  EXPECT_THROW(cudf::hashing::sha1(input), cudf::data_type_error);
 }
 
 TEST_F(SHA1HashTest, StructsUnsupported)
@@ -145,7 +145,7 @@ TEST_F(SHA1HashTest, StructsUnsupported)
   auto struct_col  = cudf::test::structs_column_wrapper{{child_col}};
   auto const input = cudf::table_view({struct_col});
 
-  EXPECT_THROW(cudf::hashing::sha1(input), cudf::logic_error);
+  EXPECT_THROW(cudf::hashing::sha1(input), cudf::data_type_error);
 }
 
 template <typename T>
diff --git a/cpp/tests/hashing/sha224_test.cpp b/cpp/tests/hashing/sha224_test.cpp
index 259e7102ee2..3f6aeb9d5e6 100644
--- a/cpp/tests/hashing/sha224_test.cpp
+++ b/cpp/tests/hashing/sha224_test.cpp
@@ -136,7 +136,7 @@ TEST_F(SHA224HashTest, ListsUnsupported)
 
   auto const input = cudf::table_view({strings_list_col});
 
-  EXPECT_THROW(cudf::hashing::sha224(input), cudf::logic_error);
+  EXPECT_THROW(cudf::hashing::sha224(input), cudf::data_type_error);
 }
 
 TEST_F(SHA224HashTest, StructsUnsupported)
@@ -145,7 +145,7 @@ TEST_F(SHA224HashTest, StructsUnsupported)
   auto struct_col  = cudf::test::structs_column_wrapper{{child_col}};
   auto const input = cudf::table_view({struct_col});
 
-  EXPECT_THROW(cudf::hashing::sha224(input), cudf::logic_error);
+  EXPECT_THROW(cudf::hashing::sha224(input), cudf::data_type_error);
 }
 
 template <typename T>
diff --git a/cpp/tests/hashing/sha256_test.cpp b/cpp/tests/hashing/sha256_test.cpp
index a4affc87874..9519e96fbae 100644
--- a/cpp/tests/hashing/sha256_test.cpp
+++ b/cpp/tests/hashing/sha256_test.cpp
@@ -135,7 +135,7 @@ TEST_F(SHA256HashTest, ListsUnsupported)
 
   auto const input = cudf::table_view({strings_list_col});
 
-  EXPECT_THROW(cudf::hashing::sha256(input), cudf::logic_error);
+  EXPECT_THROW(cudf::hashing::sha256(input), cudf::data_type_error);
 }
 
 TEST_F(SHA256HashTest, StructsUnsupported)
@@ -144,7 +144,7 @@ TEST_F(SHA256HashTest, StructsUnsupported)
   auto struct_col  = cudf::test::structs_column_wrapper{{child_col}};
   auto const input = cudf::table_view({struct_col});
 
-  EXPECT_THROW(cudf::hashing::sha256(input), cudf::logic_error);
+  EXPECT_THROW(cudf::hashing::sha256(input), cudf::data_type_error);
 }
 
 template <typename T>
diff --git a/cpp/tests/hashing/sha384_test.cpp b/cpp/tests/hashing/sha384_test.cpp
index 8a5c090eeea..9de566b9d9b 100644
--- a/cpp/tests/hashing/sha384_test.cpp
+++ b/cpp/tests/hashing/sha384_test.cpp
@@ -154,7 +154,7 @@ TEST_F(SHA384HashTest, ListsUnsupported)
 
   auto const input = cudf::table_view({strings_list_col});
 
-  EXPECT_THROW(cudf::hashing::sha384(input), cudf::logic_error);
+  EXPECT_THROW(cudf::hashing::sha384(input), cudf::data_type_error);
 }
 
 TEST_F(SHA384HashTest, StructsUnsupported)
@@ -163,7 +163,7 @@ TEST_F(SHA384HashTest, StructsUnsupported)
   auto struct_col  = cudf::test::structs_column_wrapper{{child_col}};
   auto const input = cudf::table_view({struct_col});
 
-  EXPECT_THROW(cudf::hashing::sha384(input), cudf::logic_error);
+  EXPECT_THROW(cudf::hashing::sha384(input), cudf::data_type_error);
 }
 
 template <typename T>
diff --git a/cpp/tests/hashing/sha512_test.cpp b/cpp/tests/hashing/sha512_test.cpp
index 77fc56b5f13..95e5245f38e 100644
--- a/cpp/tests/hashing/sha512_test.cpp
+++ b/cpp/tests/hashing/sha512_test.cpp
@@ -154,7 +154,7 @@ TEST_F(SHA512HashTest, ListsUnsupported)
 
   auto const input = cudf::table_view({strings_list_col});
 
-  EXPECT_THROW(cudf::hashing::sha512(input), cudf::logic_error);
+  EXPECT_THROW(cudf::hashing::sha512(input), cudf::data_type_error);
 }
 
 TEST_F(SHA512HashTest, StructsUnsupported)
@@ -163,7 +163,7 @@ TEST_F(SHA512HashTest, StructsUnsupported)
   auto struct_col  = cudf::test::structs_column_wrapper{{child_col}};
   auto const input = cudf::table_view({struct_col});
 
-  EXPECT_THROW(cudf::hashing::sha512(input), cudf::logic_error);
+  EXPECT_THROW(cudf::hashing::sha512(input), cudf::data_type_error);
 }
 
 template <typename T>
diff --git a/dependencies.yaml b/dependencies.yaml
index bd1a5deb878..12038c5e503 100644
--- a/dependencies.yaml
+++ b/dependencies.yaml
@@ -828,6 +828,7 @@ dependencies:
           - pytest-benchmark
           - pytest-cases>=3.8.2
           - scipy
+          - mmh3
       - output_types: conda
         packages:
           - aiobotocore>=2.2.0
@@ -836,12 +837,14 @@ dependencies:
           - msgpack-python
           - moto>=4.0.8
           - s3fs>=2022.3.0
-      - output_types: pyproject
+          - python-xxhash
+      - output_types: [pyproject, requirements]
         packages:
           - msgpack
           - &tokenizers tokenizers==0.15.2
           - &transformers transformers==4.39.3
           - tzdata
+          - xxhash
     specific:
       - output_types: [conda, requirements]
         matrices:
diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/hashing.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/hashing.rst
new file mode 100644
index 00000000000..6bd1fbd821b
--- /dev/null
+++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/hashing.rst
@@ -0,0 +1,6 @@
+=======
+hashing
+=======
+
+.. automodule:: pylibcudf.hashing
+    :members:
diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/index.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/index.rst
index 62e14a67ee5..997ece6d29c 100644
--- a/docs/cudf/source/user_guide/api_docs/pylibcudf/index.rst
+++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/index.rst
@@ -19,6 +19,7 @@ This page provides API documentation for pylibcudf.
     filling
     gpumemoryview
     groupby
+    hashing
     interop
     join
     json
diff --git a/python/cudf/cudf/_lib/hash.pyx b/python/cudf/cudf/_lib/hash.pyx
index 9b7ab0888d2..89309b36371 100644
--- a/python/cudf/cudf/_lib/hash.pyx
+++ b/python/cudf/cudf/_lib/hash.pyx
@@ -1,27 +1,12 @@
 # Copyright (c) 2020-2024, NVIDIA CORPORATION.
 
-from cudf.core.buffer import acquire_spill_lock
+import pylibcudf as plc
 
-from libcpp.memory cimport unique_ptr
-from libcpp.utility cimport move
+from cudf.core.buffer import acquire_spill_lock
 
-from pylibcudf.libcudf.column.column cimport column
-from pylibcudf.libcudf.hash cimport (
-    md5,
-    murmurhash3_x86_32,
-    sha1,
-    sha224,
-    sha256,
-    sha384,
-    sha512,
-    xxhash_64,
-)
-from pylibcudf.libcudf.table.table_view cimport table_view
+from pylibcudf.table cimport Table
 
 from cudf._lib.column cimport Column
-from cudf._lib.utils cimport table_view_from_columns
-
-import pylibcudf as plc
 
 
 @acquire_spill_lock()
@@ -37,32 +22,26 @@ def hash_partition(list source_columns, list columns_to_hash,
 
 @acquire_spill_lock()
 def hash(list source_columns, str method, int seed=0):
-    cdef table_view c_source_view = table_view_from_columns(source_columns)
-    cdef unique_ptr[column] c_result
+    cdef Table ctbl = Table(
+        [c.to_pylibcudf(mode="read") for c in source_columns]
+    )
     if method == "murmur3":
-        with nogil:
-            c_result = move(murmurhash3_x86_32(c_source_view, seed))
+        return Column.from_pylibcudf(plc.hashing.murmurhash3_x86_32(ctbl, seed))
+    elif method == "xxhash64":
+        return Column.from_pylibcudf(plc.hashing.xxhash_64(ctbl, seed))
     elif method == "md5":
-        with nogil:
-            c_result = move(md5(c_source_view))
+        return Column.from_pylibcudf(plc.hashing.md5(ctbl))
     elif method == "sha1":
-        with nogil:
-            c_result = move(sha1(c_source_view))
+        return Column.from_pylibcudf(plc.hashing.sha1(ctbl))
     elif method == "sha224":
-        with nogil:
-            c_result = move(sha224(c_source_view))
+        return Column.from_pylibcudf(plc.hashing.sha224(ctbl))
     elif method == "sha256":
-        with nogil:
-            c_result = move(sha256(c_source_view))
+        return Column.from_pylibcudf(plc.hashing.sha256(ctbl))
     elif method == "sha384":
-        with nogil:
-            c_result = move(sha384(c_source_view))
+        return Column.from_pylibcudf(plc.hashing.sha384(ctbl))
     elif method == "sha512":
-        with nogil:
-            c_result = move(sha512(c_source_view))
-    elif method == "xxhash64":
-        with nogil:
-            c_result = move(xxhash_64(c_source_view, seed))
+        return Column.from_pylibcudf(plc.hashing.sha512(ctbl))
     else:
-        raise ValueError(f"Unsupported hash function: {method}")
-    return Column.from_unique_ptr(move(c_result))
+        raise ValueError(
+            f"Unsupported hashing algorithm {method}."
+        )
diff --git a/python/cudf/pyproject.toml b/python/cudf/pyproject.toml
index 80201dd84db..b6105c17b3e 100644
--- a/python/cudf/pyproject.toml
+++ b/python/cudf/pyproject.toml
@@ -53,6 +53,7 @@ test = [
     "cramjam",
     "fastavro>=0.22.9",
     "hypothesis",
+    "mmh3",
     "msgpack",
     "pytest-benchmark",
     "pytest-cases>=3.8.2",
@@ -63,6 +64,7 @@ test = [
     "tokenizers==0.15.2",
     "transformers==4.39.3",
     "tzdata",
+    "xxhash",
 ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`.
 pandas-tests = [
     "ipython",
diff --git a/python/pylibcudf/pylibcudf/CMakeLists.txt b/python/pylibcudf/pylibcudf/CMakeLists.txt
index 15dd2b4c34f..b1d9656afc2 100644
--- a/python/pylibcudf/pylibcudf/CMakeLists.txt
+++ b/python/pylibcudf/pylibcudf/CMakeLists.txt
@@ -26,6 +26,7 @@ set(cython_sources
     filling.pyx
     gpumemoryview.pyx
     groupby.pyx
+    hashing.pyx
     interop.pyx
     join.pyx
     json.pyx
diff --git a/python/pylibcudf/pylibcudf/__init__.pxd b/python/pylibcudf/pylibcudf/__init__.pxd
index 9bdfdab97c2..aa2ce957173 100644
--- a/python/pylibcudf/pylibcudf/__init__.pxd
+++ b/python/pylibcudf/pylibcudf/__init__.pxd
@@ -13,6 +13,7 @@ from . cimport (
     expressions,
     filling,
     groupby,
+    hashing,
     interop,
     join,
     json,
@@ -63,6 +64,7 @@ __all__ = [
     "filling",
     "gpumemoryview",
     "groupby",
+    "hashing",
     "interop",
     "join",
     "json",
diff --git a/python/pylibcudf/pylibcudf/__init__.py b/python/pylibcudf/pylibcudf/__init__.py
index 4033062b7e2..62a2170f83e 100644
--- a/python/pylibcudf/pylibcudf/__init__.py
+++ b/python/pylibcudf/pylibcudf/__init__.py
@@ -22,6 +22,7 @@
     expressions,
     filling,
     groupby,
+    hashing,
     interop,
     io,
     join,
@@ -73,6 +74,7 @@
     "filling",
     "gpumemoryview",
     "groupby",
+    "hashing",
     "interop",
     "io",
     "join",
diff --git a/python/pylibcudf/pylibcudf/hashing.pxd b/python/pylibcudf/pylibcudf/hashing.pxd
new file mode 100644
index 00000000000..2d070ddda69
--- /dev/null
+++ b/python/pylibcudf/pylibcudf/hashing.pxd
@@ -0,0 +1,30 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+from libc.stdint cimport uint32_t, uint64_t
+
+from .column cimport Column
+from .table cimport Table
+
+
+cpdef Column murmurhash3_x86_32(
+    Table input,
+    uint32_t seed=*
+)
+
+cpdef Table murmurhash3_x64_128(
+    Table input,
+    uint64_t seed=*
+)
+
+
+cpdef Column xxhash_64(
+    Table input,
+    uint64_t seed=*
+)
+
+cpdef Column md5(Table input)
+cpdef Column sha1(Table input)
+cpdef Column sha224(Table input)
+cpdef Column sha256(Table input)
+cpdef Column sha384(Table input)
+cpdef Column sha512(Table input)
diff --git a/python/pylibcudf/pylibcudf/hashing.pyx b/python/pylibcudf/pylibcudf/hashing.pyx
new file mode 100644
index 00000000000..9ea3d4d1bda
--- /dev/null
+++ b/python/pylibcudf/pylibcudf/hashing.pyx
@@ -0,0 +1,240 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+from libc.stdint cimport uint32_t, uint64_t
+from libcpp.memory cimport unique_ptr
+from libcpp.utility cimport move
+from pylibcudf.libcudf.column.column cimport column
+from pylibcudf.libcudf.hash cimport (
+    DEFAULT_HASH_SEED,
+    md5 as cpp_md5,
+    murmurhash3_x64_128 as cpp_murmurhash3_x64_128,
+    murmurhash3_x86_32 as cpp_murmurhash3_x86_32,
+    sha1 as cpp_sha1,
+    sha224 as cpp_sha224,
+    sha256 as cpp_sha256,
+    sha384 as cpp_sha384,
+    sha512 as cpp_sha512,
+    xxhash_64 as cpp_xxhash_64,
+)
+from pylibcudf.libcudf.table.table cimport table
+
+from .column cimport Column
+from .table cimport Table
+
+LIBCUDF_DEFAULT_HASH_SEED = DEFAULT_HASH_SEED
+
+cpdef Column murmurhash3_x86_32(
+    Table input,
+    uint32_t seed=DEFAULT_HASH_SEED
+):
+    """Computes the MurmurHash3 32-bit hash value of each row in the given table.
+
+    For details, see :cpp:func:`murmurhash3_x86_32`.
+
+    Parameters
+    ----------
+    input : Table
+        The table of columns to hash
+    seed : uint32_t
+        Optional seed value to use for the hash function
+
+    Returns
+    -------
+    pylibcudf.Column
+        A column where each row is the hash of a row from the input
+    """
+    cdef unique_ptr[column] c_result
+    with nogil:
+        c_result = cpp_murmurhash3_x86_32(
+            input.view(),
+            seed
+        )
+
+    return Column.from_libcudf(move(c_result))
+
+
+cpdef Table murmurhash3_x64_128(
+    Table input,
+    uint64_t seed=DEFAULT_HASH_SEED
+):
+    """Computes the MurmurHash3 64-bit hash value of each row in the given table.
+
+    For details, see :cpp:func:`murmurhash3_x64_128`.
+
+    Parameters
+    ----------
+    input : Table
+        The table of columns to hash
+    seed : uint64_t
+        Optional seed value to use for the hash function
+
+    Returns
+    -------
+    pylibcudf.Table
+        A table of two UINT64 columns
+    """
+    cdef unique_ptr[table] c_result
+    with nogil:
+        c_result = cpp_murmurhash3_x64_128(
+            input.view(),
+            seed
+        )
+
+    return Table.from_libcudf(move(c_result))
+
+
+cpdef Column xxhash_64(
+    Table input,
+    uint64_t seed=DEFAULT_HASH_SEED
+):
+    """Computes the xxHash 64-bit hash value of each row in the given table.
+
+    For details, see :cpp:func:`xxhash_64`.
+
+    Parameters
+    ----------
+    input : Table
+        The table of columns to hash
+    seed : uint64_t
+        Optional seed value to use for the hash function
+
+    Returns
+    -------
+    pylibcudf.Column
+        A column where each row is the hash of a row from the input
+    """
+
+    cdef unique_ptr[column] c_result
+    with  nogil:
+        c_result = cpp_xxhash_64(
+            input.view(),
+            seed
+        )
+
+    return Column.from_libcudf(move(c_result))
+
+
+cpdef Column md5(Table input):
+    """Computes the MD5 hash value of each row in the given table.
+
+    For details, see :cpp:func:`md5`.
+
+    Parameters
+    ----------
+    input : Table
+        The table of columns to hash
+
+    Returns
+    -------
+    pylibcudf.Column
+        A column where each row is the md5 hash of a row from the input
+
+    """
+
+    cdef unique_ptr[column] c_result
+    with nogil:
+        c_result = cpp_md5(input.view())
+    return Column.from_libcudf(move(c_result))
+
+cpdef Column sha1(Table input):
+    """Computes the SHA-1 hash value of each row in the given table.
+
+    For details, see :cpp:func:`sha1`.
+
+    Parameters
+    ----------
+    input : Table
+        The table of columns to hash
+
+    Returns
+    -------
+    pylibcudf.Column
+        A column where each row is the hash of a row from the input
+    """
+    cdef unique_ptr[column] c_result
+    with nogil:
+        c_result = cpp_sha1(input.view())
+    return Column.from_libcudf(move(c_result))
+
+
+cpdef Column sha224(Table input):
+    """Computes the SHA-224 hash value of each row in the given table.
+
+    For details, see :cpp:func:`sha224`.
+
+    Parameters
+    ----------
+    input : Table
+        The table of columns to hash
+
+    Returns
+    -------
+    pylibcudf.Column
+        A column where each row is the hash of a row from the input
+    """
+    cdef unique_ptr[column] c_result
+    with nogil:
+        c_result = cpp_sha224(input.view())
+    return Column.from_libcudf(move(c_result))
+
+
+cpdef Column sha256(Table input):
+    """Computes the SHA-256 hash value of each row in the given table.
+
+    For details, see :cpp:func:`sha256`.
+
+    Parameters
+    ----------
+    input : Table
+        The table of columns to hash
+
+    Returns
+    -------
+    pylibcudf.Column
+        A column where each row is the hash of a row from the input
+    """
+    cdef unique_ptr[column] c_result
+    with nogil:
+        c_result = cpp_sha256(input.view())
+    return Column.from_libcudf(move(c_result))
+
+
+cpdef Column sha384(Table input):
+    """Computes the SHA-384 hash value of each row in the given table.
+
+    For details, see :cpp:func:`sha384`.
+
+    Parameters
+    ----------
+    input : Table
+        The table of columns to hash
+
+    Returns
+    -------
+    pylibcudf.Column
+        A column where each row is the hash of a row from the input
+    """
+    cdef unique_ptr[column] c_result
+    with nogil:
+        c_result = cpp_sha384(input.view())
+    return Column.from_libcudf(move(c_result))
+
+
+cpdef Column sha512(Table input):
+    """Computes the SHA-512 hash value of each row in the given table.
+
+    For details, see :cpp:func:`sha512`.
+
+    Parameters
+    ----------
+    input : Table
+        The table of columns to hash
+
+    Returns
+    -------
+    pylibcudf.Column
+        A column where each row is the hash of a row from the input
+    """
+    cdef unique_ptr[column] c_result
+    with nogil:
+        c_result = cpp_sha512(input.view())
+    return Column.from_libcudf(move(c_result))
diff --git a/python/pylibcudf/pylibcudf/libcudf/hash.pxd b/python/pylibcudf/pylibcudf/libcudf/hash.pxd
index 51678ba69d8..c4222bc9dc5 100644
--- a/python/pylibcudf/pylibcudf/libcudf/hash.pxd
+++ b/python/pylibcudf/pylibcudf/libcudf/hash.pxd
@@ -3,6 +3,7 @@
 from libc.stdint cimport uint32_t, uint64_t
 from libcpp.memory cimport unique_ptr
 from libcpp.vector cimport vector
+from pylibcudf.exception_handler cimport libcudf_exception_handler
 from pylibcudf.libcudf.column.column cimport column
 from pylibcudf.libcudf.table.table cimport table
 from pylibcudf.libcudf.table.table_view cimport table_view
@@ -10,36 +11,44 @@ from pylibcudf.libcudf.table.table_view cimport table_view
 
 cdef extern from "cudf/hashing.hpp" namespace "cudf::hashing" nogil:
 
-    cdef unique_ptr[column] murmurhash3_x86_32 "cudf::hashing::murmurhash3_x86_32" (
+    cdef unique_ptr[column] murmurhash3_x86_32(
         const table_view& input,
         const uint32_t seed
-    ) except +
+    ) except +libcudf_exception_handler
 
-    cdef unique_ptr[column] md5 "cudf::hashing::md5" (
+    cdef unique_ptr[table] murmurhash3_x64_128(
+        const table_view& input,
+        const uint64_t seed
+    ) except +libcudf_exception_handler
+
+    cdef unique_ptr[column] md5(
         const table_view& input
-    ) except +
+    ) except +libcudf_exception_handler
 
-    cdef unique_ptr[column] sha1 "cudf::hashing::sha1" (
+    cdef unique_ptr[column] sha1(
         const table_view& input
-    ) except +
+    ) except +libcudf_exception_handler
 
-    cdef unique_ptr[column] sha224 "cudf::hashing::sha224" (
+    cdef unique_ptr[column] sha224(
         const table_view& input
-    ) except +
+    ) except +libcudf_exception_handler
 
-    cdef unique_ptr[column] sha256 "cudf::hashing::sha256" (
+    cdef unique_ptr[column] sha256(
         const table_view& input
-    ) except +
+    ) except +libcudf_exception_handler
 
-    cdef unique_ptr[column] sha384 "cudf::hashing::sha384" (
+    cdef unique_ptr[column] sha384(
         const table_view& input
-    ) except +
+    ) except +libcudf_exception_handler
 
-    cdef unique_ptr[column] sha512 "cudf::hashing::sha512" (
+    cdef unique_ptr[column] sha512(
         const table_view& input
-    ) except +
+    ) except +libcudf_exception_handler
 
-    cdef unique_ptr[column] xxhash_64 "cudf::hashing::xxhash_64" (
+    cdef unique_ptr[column] xxhash_64(
         const table_view& input,
         const uint64_t seed
-    ) except +
+    ) except +libcudf_exception_handler
+
+cdef extern from "cudf/hashing.hpp" namespace "cudf" nogil:
+    cdef uint32_t DEFAULT_HASH_SEED
diff --git a/python/pylibcudf/pylibcudf/libcudf/hash.pyx b/python/pylibcudf/pylibcudf/libcudf/hash.pyx
new file mode 100644
index 00000000000..e69de29bb2d
diff --git a/python/pylibcudf/pylibcudf/tests/conftest.py b/python/pylibcudf/pylibcudf/tests/conftest.py
index a19a8835498..5265e411c7f 100644
--- a/python/pylibcudf/pylibcudf/tests/conftest.py
+++ b/python/pylibcudf/pylibcudf/tests/conftest.py
@@ -18,13 +18,23 @@
 from utils import ALL_PA_TYPES, DEFAULT_PA_TYPES, NUMERIC_PA_TYPES
 
 
-# This fixture defines the standard set of types that all tests should default to
+def _type_to_str(typ):
+    if isinstance(typ, pa.ListType):
+        return f"list[{_type_to_str(typ.value_type)}]"
+    elif isinstance(typ, pa.StructType):
+        return f"struct[{', '.join(_type_to_str(typ.field(i).type) for i in range(typ.num_fields))}]"
+    else:
+        return str(typ)
+
+
+# This fixture defines the standard set of types that all tests should default to
 # running on. If there is a need for some tests to run on a different set of types, that
 # type list fixture should also be defined below here if it is likely to be reused
 # across modules. Otherwise it may be defined on a per-module basis.
 @pytest.fixture(
     scope="session",
     params=DEFAULT_PA_TYPES,
+    ids=_type_to_str,
 )
 def pa_type(request):
     return request.param
diff --git a/python/pylibcudf/pylibcudf/tests/test_hashing.py b/python/pylibcudf/pylibcudf/tests/test_hashing.py
new file mode 100644
index 00000000000..83fb50fa4ef
--- /dev/null
+++ b/python/pylibcudf/pylibcudf/tests/test_hashing.py
@@ -0,0 +1,269 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+import hashlib
+import struct
+
+import mmh3
+import numpy as np
+import pyarrow as pa
+import pytest
+import xxhash
+from utils import assert_column_eq, assert_table_eq
+
+import pylibcudf as plc
+
+SEED = 0
+METHODS = ["md5", "sha1", "sha224", "sha256", "sha384", "sha512"]
+
+
+def scalar_to_binary(x):
+    if isinstance(x, str):
+        return x.encode()
+    elif isinstance(x, float):
+        return struct.pack("<d", x)
+    elif isinstance(x, bool):
+        return x.to_bytes(1, byteorder="little", signed=True)
+    elif isinstance(x, int):
+        return x.to_bytes(8, byteorder="little", signed=True)
+    else:
+        raise NotImplementedError
+
+
+def hash_single_uint32(val, seed=0):
+    return mmh3.hash(np.uint32(val).tobytes(), seed=seed, signed=False)
+
+
+def hash_combine_32(lhs, rhs):
+    return np.uint32(lhs ^ (rhs + 0x9E3779B9 + (lhs << 6) + (lhs >> 2)))
+
+
+def uint_hash_combine_32(lhs, rhs):
+    return hash_combine_32(np.uint32(lhs), np.uint32(rhs))
+
+
+def libcudf_mmh3_x86_32(binary):
+    seed = plc.hashing.LIBCUDF_DEFAULT_HASH_SEED
+    hashval = mmh3.hash(binary, seed)
+    return hash_combine_32(seed, hashval)
+
+
+@pytest.fixture(params=[pa.int64(), pa.float64(), pa.string(), pa.bool_()])
+def scalar_type(request):
+    return request.param
+
+
+@pytest.fixture
+def pa_scalar_input_column(scalar_type):
+    if pa.types.is_integer(scalar_type) or pa.types.is_floating(scalar_type):
+        return pa.array([1, 2, 3], type=scalar_type)
+    elif pa.types.is_string(scalar_type):
+        return pa.array(["a", "b", "c"], type=scalar_type)
+    elif pa.types.is_boolean(scalar_type):
+        return pa.array([True, True, False], type=scalar_type)
+
+
+@pytest.fixture
+def plc_scalar_input_tbl(pa_scalar_input_column):
+    return plc.interop.from_arrow(
+        pa.Table.from_arrays([pa_scalar_input_column], names=["data"])
+    )
+
+
+@pytest.fixture(scope="module")
+def list_struct_table():
+    data = pa.Table.from_pydict(
+        {
+            "list": [[1, 2, 3], [4, 5, 6], [7, 8, 9]],
+            "struct": [{"a": 1, "b": 2}, {"a": 3, "b": 4}, {"a": 5, "b": 6}],
+        }
+    )
+    return data
+
+
+def python_hash_value(x, method):
+    if method == "murmurhash3_x86_32":
+        return libcudf_mmh3_x86_32(x)
+    elif method == "murmurhash3_x64_128":
+        hasher = mmh3.mmh3_x64_128(seed=plc.hashing.LIBCUDF_DEFAULT_HASH_SEED)
+        hasher.update(x)
+        # libcudf returns a tuple of two 64-bit integers
+        return hasher.utupledigest()
+    elif method == "xxhash_64":
+        return xxhash.xxh64(
+            x, seed=plc.hashing.LIBCUDF_DEFAULT_HASH_SEED
+        ).intdigest()
+    else:
+        return getattr(hashlib, method)(x).hexdigest()
+
+
+@pytest.mark.parametrize(
+    "method", ["sha1", "sha224", "sha256", "sha384", "sha512", "md5"]
+)
+def test_hash_column_sha_md5(
+    pa_scalar_input_column, plc_scalar_input_tbl, method
+):
+    plc_hasher = getattr(plc.hashing, method)
+
+    def py_hasher(val):
+        return getattr(hashlib, method)(scalar_to_binary(val)).hexdigest()
+
+    expect = pa.array(
+        [py_hasher(val) for val in pa_scalar_input_column.to_pylist()],
+        type=pa.string(),
+    )
+    got = plc_hasher(plc_scalar_input_tbl)
+    assert_column_eq(got, expect)
+
+
+def test_hash_column_xxhash64(pa_scalar_input_column, plc_scalar_input_tbl):
+    def py_hasher(val):
+        return xxhash.xxh64(
+            scalar_to_binary(val), seed=plc.hashing.LIBCUDF_DEFAULT_HASH_SEED
+        ).intdigest()
+
+    expect = pa.array(
+        [py_hasher(val) for val in pa_scalar_input_column.to_pylist()],
+        type=pa.uint64(),
+    )
+    got = plc.hashing.xxhash_64(plc_scalar_input_tbl, 0)
+
+    assert_column_eq(got, expect)
+
+
+@pytest.mark.parametrize(
+    "method", ["sha1", "sha224", "sha256", "sha384", "sha512"]
+)
+@pytest.mark.parametrize("dtype", ["list", "struct"])
+def test_sha_list_struct_err(list_struct_table, dtype, method):
+    err_types = list_struct_table.select([dtype])
+    plc_tbl = plc.interop.from_arrow(err_types)
+    plc_hasher = getattr(plc.hashing, method)
+
+    with pytest.raises(TypeError):
+        plc_hasher(plc_tbl)
+
+
+def test_md5_struct_err(list_struct_table):
+    err_types = list_struct_table.select(["struct"])
+    plc_tbl = plc.interop.from_arrow(err_types)
+
+    with pytest.raises(TypeError):
+        plc.hashing.md5(plc_tbl)
+
+
+def test_murmurhash3_x86_32(pa_scalar_input_column, plc_scalar_input_tbl):
+    def py_hasher(val):
+        return libcudf_mmh3_x86_32(scalar_to_binary(val))
+
+    got = plc.hashing.murmurhash3_x86_32(plc_scalar_input_tbl, 0)
+    expect = pa.array(
+        [py_hasher(val) for val in pa_scalar_input_column.to_pylist()],
+        type=pa.uint32(),
+    )
+    got = plc.hashing.murmurhash3_x86_32(plc_scalar_input_tbl, 0)
+    assert_column_eq(got, expect)
+
+
+@pytest.mark.filterwarnings("ignore::RuntimeWarning")
+def test_murmurhash3_x86_32_list():
+    pa_tbl = pa.Table.from_pydict(
+        {
+            "list": pa.array(
+                [[1, 2, 3], [4, 5, 6], [7, 8, 9]], type=pa.list_(pa.uint32())
+            )
+        }
+    )
+    plc_tbl = plc.interop.from_arrow(pa_tbl)
+
+    def hash_list(list_):
+        hash_value = uint_hash_combine_32(0, hash_single_uint32(len(list_)))
+
+        for element in list_:
+            hash_value = uint_hash_combine_32(
+                hash_value,
+                hash_single_uint32(
+                    element, seed=plc.hashing.LIBCUDF_DEFAULT_HASH_SEED
+                ),
+            )
+
+        final = uint_hash_combine_32(
+            plc.hashing.LIBCUDF_DEFAULT_HASH_SEED, hash_value
+        )
+        return final
+
+    expect = pa.array(
+        [hash_list(val) for val in pa_tbl["list"].to_pylist()],
+        type=pa.uint32(),
+    )
+    got = plc.hashing.murmurhash3_x86_32(
+        plc_tbl, plc.hashing.LIBCUDF_DEFAULT_HASH_SEED
+    )
+    assert_column_eq(got, expect)
+
+
+@pytest.mark.filterwarnings("ignore::RuntimeWarning")
+def test_murmurhash3_x86_32_struct():
+    pa_tbl = pa.table(
+        {
+            "struct": pa.array(
+                [
+                    {"a": 1, "b": 2, "c": 3},
+                    {"a": 4, "b": 5, "c": 6},
+                    {"a": 7, "b": 8, "c": 9},
+                ],
+                type=pa.struct(
+                    [
+                        pa.field("a", pa.uint32()),
+                        pa.field("b", pa.uint32(), pa.field("c", pa.uint32())),
+                    ]
+                ),
+            )
+        }
+    )
+    plc_tbl = plc.interop.from_arrow(pa_tbl)
+
+    def hash_struct(s):
+        seed = plc.hashing.LIBCUDF_DEFAULT_HASH_SEED
+        keys = list(s.keys())
+
+        combined_hash = hash_single_uint32(s[keys[0]], seed=seed)
+        combined_hash = uint_hash_combine_32(0, combined_hash)
+        combined_hash = uint_hash_combine_32(seed, combined_hash)
+
+        for key in keys[1:]:
+            current_hash = hash_single_uint32(s[key], seed=seed)
+            combined_hash = uint_hash_combine_32(combined_hash, current_hash)
+
+        return combined_hash
+
+    got = plc.hashing.murmurhash3_x86_32(
+        plc_tbl, plc.hashing.LIBCUDF_DEFAULT_HASH_SEED
+    )
+
+    expect = pa.array(
+        [hash_struct(val) for val in pa_tbl["struct"].to_pylist()],
+        type=pa.uint32(),
+    )
+    assert_column_eq(got, expect)
+
+
+def test_murmurhash3_x64_128(pa_scalar_input_column, plc_scalar_input_tbl):
+    def py_hasher(val):
+        hasher = mmh3.mmh3_x64_128(seed=plc.hashing.LIBCUDF_DEFAULT_HASH_SEED)
+        hasher.update(val)
+        return hasher.utupledigest()
+
+    tuples = [
+        py_hasher(scalar_to_binary(val))
+        for val in pa_scalar_input_column.to_pylist()
+    ]
+    expect = pa.Table.from_arrays(
+        [
+            pa.array([np.uint64(t[0]) for t in tuples]),
+            pa.array([np.uint64(t[1]) for t in tuples]),
+        ],
+        names=["0", "1"],
+    )
+    got = plc.hashing.murmurhash3_x64_128(plc_scalar_input_tbl, 0)
+
+    assert_table_eq(expect, got)

From a0711d0f8492762877ea7c84e78166413f44f178 Mon Sep 17 00:00:00 2001
From: Matthew Murray <41342305+Matt711@users.noreply.github.com>
Date: Thu, 31 Oct 2024 01:18:15 -0400
Subject: [PATCH 20/41] Migrate NVtext subword tokenizing APIs to pylibcudf
 (#17096)

Part of #15162

Authors:
  - Matthew Murray (https://github.com/Matt711)

Approvers:
  - Vyas Ramasubramani (https://github.com/vyasr)

URL: https://github.com/rapidsai/cudf/pull/17096
---
 .../api_docs/pylibcudf/nvtext/index.rst       |  1 +
 .../pylibcudf/nvtext/subword_tokenize.rst     |  6 ++
 .../cudf/_lib/nvtext/subword_tokenize.pyx     | 50 +++--------
 python/cudf/cudf/core/subword_tokenizer.py    |  7 +-
 .../libcudf/nvtext/subword_tokenize.pxd       |  8 +-
 .../pylibcudf/pylibcudf/nvtext/CMakeLists.txt |  2 +-
 .../pylibcudf/pylibcudf/nvtext/__init__.pxd   |  2 +
 python/pylibcudf/pylibcudf/nvtext/__init__.py |  2 +
 .../pylibcudf/nvtext/subword_tokenize.pxd     | 20 +++++
 .../pylibcudf/nvtext/subword_tokenize.pyx     | 84 +++++++++++++++++++
 .../tests/test_nvtext_subword_tokenize.py     | 63 ++++++++++++++
 11 files changed, 202 insertions(+), 43 deletions(-)
 create mode 100644 docs/cudf/source/user_guide/api_docs/pylibcudf/nvtext/subword_tokenize.rst
 create mode 100644 python/pylibcudf/pylibcudf/nvtext/subword_tokenize.pxd
 create mode 100644 python/pylibcudf/pylibcudf/nvtext/subword_tokenize.pyx
 create mode 100644 python/pylibcudf/pylibcudf/tests/test_nvtext_subword_tokenize.py

diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/nvtext/index.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/nvtext/index.rst
index 00314bceeb9..9ba47fd8d70 100644
--- a/docs/cudf/source/user_guide/api_docs/pylibcudf/nvtext/index.rst
+++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/nvtext/index.rst
@@ -13,4 +13,5 @@ nvtext
     normalize
     replace
     stemmer
+    subword_tokenize
     tokenize
diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/nvtext/subword_tokenize.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/nvtext/subword_tokenize.rst
new file mode 100644
index 00000000000..818714bec6a
--- /dev/null
+++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/nvtext/subword_tokenize.rst
@@ -0,0 +1,6 @@
+================
+subword_tokenize
+================
+
+.. automodule:: pylibcudf.nvtext.subword_tokenize
+   :members:
diff --git a/python/cudf/cudf/_lib/nvtext/subword_tokenize.pyx b/python/cudf/cudf/_lib/nvtext/subword_tokenize.pyx
index ee442ece5c6..5e0bfb74705 100644
--- a/python/cudf/cudf/_lib/nvtext/subword_tokenize.pyx
+++ b/python/cudf/cudf/_lib/nvtext/subword_tokenize.pyx
@@ -5,35 +5,16 @@ from libc.stdint cimport uint32_t
 from cudf.core.buffer import acquire_spill_lock
 
 from libcpp cimport bool
-from libcpp.memory cimport unique_ptr
-from libcpp.string cimport string
-from libcpp.utility cimport move
-
-from pylibcudf.libcudf.column.column_view cimport column_view
-from pylibcudf.libcudf.nvtext.subword_tokenize cimport (
-    hashed_vocabulary as cpp_hashed_vocabulary,
-    load_vocabulary_file as cpp_load_vocabulary_file,
-    move as tr_move,
-    subword_tokenize as cpp_subword_tokenize,
-    tokenizer_result as cpp_tokenizer_result,
-)
 
 from cudf._lib.column cimport Column
 
-
-cdef class Hashed_Vocabulary:
-    cdef unique_ptr[cpp_hashed_vocabulary] c_obj
-
-    def __cinit__(self, hash_file):
-        cdef string c_hash_file = <string>str(hash_file).encode()
-        with nogil:
-            self.c_obj = move(cpp_load_vocabulary_file(c_hash_file))
+from pylibcudf import nvtext
 
 
 @acquire_spill_lock()
 def subword_tokenize_inmem_hash(
     Column strings,
-    Hashed_Vocabulary hashed_vocabulary,
+    object hashed_vocabulary,
     uint32_t max_sequence_length=64,
     uint32_t stride=48,
     bool do_lower=True,
@@ -42,21 +23,16 @@ def subword_tokenize_inmem_hash(
     """
     Subword tokenizes text series by using the pre-loaded hashed vocabulary
     """
-    cdef column_view c_strings = strings.view()
-    cdef cpp_tokenizer_result c_result
-    with nogil:
-        c_result = tr_move(
-            cpp_subword_tokenize(
-                c_strings,
-                hashed_vocabulary.c_obj.get()[0],
-                max_sequence_length,
-                stride,
-                do_lower,
-                do_truncate,
-            )
-        )
+    result = nvtext.subword_tokenize.subword_tokenize(
+        strings.to_pylibcudf(mode="read"),
+        hashed_vocabulary,
+        max_sequence_length,
+        stride,
+        do_lower,
+        do_truncate,
+    )
     # return the 3 tensor components
-    tokens = Column.from_unique_ptr(move(c_result.tensor_token_ids))
-    masks = Column.from_unique_ptr(move(c_result.tensor_attention_mask))
-    metadata = Column.from_unique_ptr(move(c_result.tensor_metadata))
+    tokens = Column.from_pylibcudf(result[0])
+    masks = Column.from_pylibcudf(result[1])
+    metadata = Column.from_pylibcudf(result[2])
     return tokens, masks, metadata
diff --git a/python/cudf/cudf/core/subword_tokenizer.py b/python/cudf/cudf/core/subword_tokenizer.py
index 9e59b134b73..dda1f199078 100644
--- a/python/cudf/cudf/core/subword_tokenizer.py
+++ b/python/cudf/cudf/core/subword_tokenizer.py
@@ -6,8 +6,9 @@
 
 import cupy as cp
 
+import pylibcudf as plc
+
 from cudf._lib.nvtext.subword_tokenize import (
-    Hashed_Vocabulary as cpp_hashed_vocabulary,
     subword_tokenize_inmem_hash as cpp_subword_tokenize,
 )
 
@@ -50,7 +51,9 @@ class SubwordTokenizer:
 
     def __init__(self, hash_file: str, do_lower_case: bool = True):
         self.do_lower_case = do_lower_case
-        self.vocab_file = cpp_hashed_vocabulary(hash_file)
+        self.vocab_file = plc.nvtext.subword_tokenize.HashedVocabulary(
+            hash_file
+        )
 
     def __call__(
         self,
diff --git a/python/pylibcudf/pylibcudf/libcudf/nvtext/subword_tokenize.pxd b/python/pylibcudf/pylibcudf/libcudf/nvtext/subword_tokenize.pxd
index aabac0a617b..8dac86d688d 100644
--- a/python/pylibcudf/pylibcudf/libcudf/nvtext/subword_tokenize.pxd
+++ b/python/pylibcudf/pylibcudf/libcudf/nvtext/subword_tokenize.pxd
@@ -9,14 +9,14 @@ from pylibcudf.libcudf.column.column_view cimport column_view
 
 
 cdef extern from "nvtext/subword_tokenize.hpp" namespace "nvtext" nogil:
-    cdef cppclass tokenizer_result "nvtext::tokenizer_result":
+    cdef cppclass tokenizer_result:
         uint32_t nrows_tensor
         uint32_t sequence_length
         unique_ptr[column] tensor_token_ids
         unique_ptr[column] tensor_attention_mask
         unique_ptr[column] tensor_metadata
 
-    cdef struct hashed_vocabulary "nvtext::hashed_vocabulary":
+    cdef cppclass hashed_vocabulary:
         uint16_t first_token_id
         uint16_t separator_token_id
         uint16_t unknown_token_id
@@ -26,6 +26,8 @@ cdef extern from "nvtext/subword_tokenize.hpp" namespace "nvtext" nogil:
         unique_ptr[column] table
         unique_ptr[column] bin_coefficients
         unique_ptr[column] bin_offsets
+        unique_ptr[column] cp_metadata
+        unique_ptr[column] aux_cp_table
 
     cdef unique_ptr[hashed_vocabulary] load_vocabulary_file(
         const string &filename_hashed_vocabulary
@@ -33,7 +35,7 @@ cdef extern from "nvtext/subword_tokenize.hpp" namespace "nvtext" nogil:
 
     cdef tokenizer_result subword_tokenize(
         const column_view & strings,
-        hashed_vocabulary & hashed_vocablary_obj,
+        hashed_vocabulary & hashed_vocabulary_obj,
         uint32_t max_sequence_length,
         uint32_t stride,
         bool do_lower,
diff --git a/python/pylibcudf/pylibcudf/nvtext/CMakeLists.txt b/python/pylibcudf/pylibcudf/nvtext/CMakeLists.txt
index fa3cd448fa3..93e3fb15259 100644
--- a/python/pylibcudf/pylibcudf/nvtext/CMakeLists.txt
+++ b/python/pylibcudf/pylibcudf/nvtext/CMakeLists.txt
@@ -14,7 +14,7 @@
 
 set(cython_sources
     edit_distance.pyx generate_ngrams.pyx jaccard.pyx minhash.pyx ngrams_tokenize.pyx normalize.pyx
-    replace.pyx stemmer.pyx tokenize.pyx byte_pair_encode.pyx
+    replace.pyx stemmer.pyx tokenize.pyx byte_pair_encode.pyx subword_tokenize.pyx
 )
 
 set(linked_libraries cudf::cudf)
diff --git a/python/pylibcudf/pylibcudf/nvtext/__init__.pxd b/python/pylibcudf/pylibcudf/nvtext/__init__.pxd
index a9ede17761b..ef837167eb9 100644
--- a/python/pylibcudf/pylibcudf/nvtext/__init__.pxd
+++ b/python/pylibcudf/pylibcudf/nvtext/__init__.pxd
@@ -10,6 +10,7 @@ from . cimport (
     normalize,
     replace,
     stemmer,
+    subword_tokenize,
     tokenize,
 )
 
@@ -23,5 +24,6 @@ __all__ = [
     "normalize",
     "replace",
     "stemmer",
+    "subword_tokenize",
     "tokenize",
 ]
diff --git a/python/pylibcudf/pylibcudf/nvtext/__init__.py b/python/pylibcudf/pylibcudf/nvtext/__init__.py
index 87650a02c33..4f125d3a733 100644
--- a/python/pylibcudf/pylibcudf/nvtext/__init__.py
+++ b/python/pylibcudf/pylibcudf/nvtext/__init__.py
@@ -10,6 +10,7 @@
     normalize,
     replace,
     stemmer,
+    subword_tokenize,
     tokenize,
 )
 
@@ -23,5 +24,6 @@
     "normalize",
     "replace",
     "stemmer",
+    "subword_tokenize",
     "tokenize",
 ]
diff --git a/python/pylibcudf/pylibcudf/nvtext/subword_tokenize.pxd b/python/pylibcudf/pylibcudf/nvtext/subword_tokenize.pxd
new file mode 100644
index 00000000000..091c7b897ac
--- /dev/null
+++ b/python/pylibcudf/pylibcudf/nvtext/subword_tokenize.pxd
@@ -0,0 +1,20 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+from libc.stdint cimport uint32_t
+from libcpp cimport bool
+from libcpp.memory cimport unique_ptr
+from pylibcudf.column cimport Column
+from pylibcudf.libcudf.nvtext.subword_tokenize cimport hashed_vocabulary
+
+
+cdef class HashedVocabulary:
+    cdef unique_ptr[hashed_vocabulary] c_obj
+
+cpdef tuple[Column, Column, Column] subword_tokenize(
+    Column input,
+    HashedVocabulary vocabulary_table,
+    uint32_t max_sequence_length,
+    uint32_t stride,
+    bool do_lower_case,
+    bool do_truncate,
+)
diff --git a/python/pylibcudf/pylibcudf/nvtext/subword_tokenize.pyx b/python/pylibcudf/pylibcudf/nvtext/subword_tokenize.pyx
new file mode 100644
index 00000000000..04643d3bd84
--- /dev/null
+++ b/python/pylibcudf/pylibcudf/nvtext/subword_tokenize.pyx
@@ -0,0 +1,84 @@
+# Copyright (c) 2020-2024, NVIDIA CORPORATION.
+
+from cython.operator cimport dereference
+from libc.stdint cimport uint32_t
+from libcpp cimport bool
+from libcpp.string cimport string
+from libcpp.utility cimport move
+from pylibcudf.column cimport Column
+from pylibcudf.libcudf.nvtext.subword_tokenize cimport (
+    load_vocabulary_file as cpp_load_vocabulary_file,
+    move as tr_move,
+    subword_tokenize as cpp_subword_tokenize,
+    tokenizer_result as cpp_tokenizer_result,
+)
+
+
+cdef class HashedVocabulary:
+    """The vocabulary data for use with the subword_tokenize function.
+
+    For details, see :cpp:class:`nvtext::hashed_vocabulary`.
+    """
+    def __cinit__(self, hash_file):
+        cdef string c_hash_file = <string>str(hash_file).encode()
+        with nogil:
+            self.c_obj = move(cpp_load_vocabulary_file(c_hash_file))
+
+cpdef tuple[Column, Column, Column] subword_tokenize(
+    Column input,
+    HashedVocabulary vocabulary_table,
+    uint32_t max_sequence_length,
+    uint32_t stride,
+    bool do_lower_case,
+    bool do_truncate,
+):
+    """
+    Creates a tokenizer that cleans the text, splits it into
+    tokens and returns token-ids from an input vocabulary.
+
+    For details, see :cpp:func:`subword_tokenize`.
+
+    Parameters
+    ----------
+    input : Column
+        The input strings to tokenize.
+    vocabulary_table : HashedVocabulary
+        The vocabulary table pre-loaded into this object.
+    max_sequence_length : uint32_t
+        Limit of the number of token-ids per row in final tensor for each string.
+    stride : uint32_t
+        Each row in the output token-ids will replicate
+        ``max_sequence_length - stride`` token-ids
+        from the previous row, unless it is the first string.
+    do_lower_case : bool
+        If true, the tokenizer will convert uppercase characters in the
+        input stream to lower-case and strip accents from those characters.
+        If false, accented and uppercase characters are not transformed.
+    do_truncate : bool
+        If true, the tokenizer will discard all the token-ids after
+        ``max_sequence_length`` for each input string. If false, it
+        will use a new row in the output token-ids to continue
+        generating the output.
+
+    Returns
+    -------
+    tuple[Column, Column, Column]
+        A tuple of three columns containing the
+        tokens, masks, and metadata.
+    """
+    cdef cpp_tokenizer_result c_result
+    with nogil:
+        c_result = tr_move(
+            cpp_subword_tokenize(
+                input.view(),
+                dereference(vocabulary_table.c_obj.get()),
+                max_sequence_length,
+                stride,
+                do_lower_case,
+                do_truncate,
+            )
+        )
+    cdef Column tokens = Column.from_libcudf(move(c_result.tensor_token_ids))
+    cdef Column masks = Column.from_libcudf(move(c_result.tensor_attention_mask))
+    cdef Column metadata = Column.from_libcudf(move(c_result.tensor_metadata))
+    return tokens, masks, metadata
diff --git a/python/pylibcudf/pylibcudf/tests/test_nvtext_subword_tokenize.py b/python/pylibcudf/pylibcudf/tests/test_nvtext_subword_tokenize.py
new file mode 100644
index 00000000000..516d0f7f78d
--- /dev/null
+++ b/python/pylibcudf/pylibcudf/tests/test_nvtext_subword_tokenize.py
@@ -0,0 +1,63 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+import pyarrow as pa
+import pytest
+from utils import assert_column_eq
+
+import pylibcudf as plc
+
+
+@pytest.fixture
+def vocab_file(tmpdir):
+    hash_file = tmpdir.mkdir("nvtext").join("tmp_hashed_vocab.txt")
+    content = "1\n0\n10\n"
+    coefficients = [65559] * 10
+    for c in coefficients:
+        content = content + str(c) + " 0\n"
+    table = [0] * 10
+    table[0] = 3015668
+    content = content + "10\n"
+    for v in table:
+        content = content + str(v) + "\n"
+    content = content + "100\n101\n102\n\n"
+    hash_file.write(content)
+    return str(hash_file)
+
+
+@pytest.fixture
+def column_input():
+    return pa.array(["This is a test"])
+
+
+@pytest.mark.parametrize("max_sequence_length", [64, 128])
+@pytest.mark.parametrize("stride", [32, 64])
+@pytest.mark.parametrize("do_lower_case", [True, False])
+@pytest.mark.parametrize("do_truncate", [True, False])
+def test_subword_tokenize(
+    vocab_file,
+    column_input,
+    max_sequence_length,
+    stride,
+    do_lower_case,
+    do_truncate,
+):
+    vocab = plc.nvtext.subword_tokenize.HashedVocabulary(vocab_file)
+    tokens, masks, metadata = plc.nvtext.subword_tokenize.subword_tokenize(
+        plc.interop.from_arrow(column_input),
+        vocab,
+        max_sequence_length,
+        stride,
+        do_lower_case,
+        do_truncate,
+    )
+    expected_tokens = pa.array(
+        [100] * 4 + [0] * (max_sequence_length - 4), type=pa.uint32()
+    )
+    expected_masks = pa.array(
+        [1] * 4 + [0] * (max_sequence_length - 4), type=pa.uint32()
+    )
+    expected_metadata = pa.array([0, 0, 3], type=pa.uint32())
+
+    assert_column_eq(tokens, expected_tokens)
+    assert_column_eq(masks, expected_masks)
+    assert_column_eq(metadata, expected_metadata)

From 01cfcffcf077db6d27e770d61d204257c6ca6481 Mon Sep 17 00:00:00 2001
From: David Wendt <45795991+davidwendt@users.noreply.github.com>
Date: Thu, 31 Oct 2024 03:38:46 -0400
Subject: [PATCH 21/41] Remove unsanitized nulls from input strings columns in
 reduction gtests (#17202)

Input strings column containing unsanitized nulls may result in undefined behavior.
This PR fixes the input data to not include string characters in null rows in gtests for `REDUCTION_TESTS`.

Authors:
  - David Wendt (https://github.com/davidwendt)

Approvers:
  - Bradley Dice (https://github.com/bdice)
  - Karthikeyan (https://github.com/karthikeyann)

URL: https://github.com/rapidsai/cudf/pull/17202
---
 cpp/tests/reductions/list_rank_test.cpp  |  2 +-
 cpp/tests/reductions/rank_tests.cpp      |  2 +-
 cpp/tests/reductions/reduction_tests.cpp | 29 ++++++++----
 cpp/tests/reductions/scan_tests.cpp      | 60 ++++++++++++++----------
 4 files changed, 57 insertions(+), 36 deletions(-)

diff --git a/cpp/tests/reductions/list_rank_test.cpp b/cpp/tests/reductions/list_rank_test.cpp
index 736b5081d8f..cb412f1e925 100644
--- a/cpp/tests/reductions/list_rank_test.cpp
+++ b/cpp/tests/reductions/list_rank_test.cpp
@@ -131,7 +131,7 @@ TEST_F(ListRankScanTest, ListOfStruct)
      false,
      false}};
   auto col2 = cudf::test::strings_column_wrapper{
-    {"x", "x", "a", "a", "b", "b", "a", "b", "a", "b", "a", "c", "a", "c", "a", "c", "b", "b"},
+    {"x", "x", "a", "a", "b", "", "a", "b", "a", "b", "a", "c", "a", "c", "", "", "b", "b"},
     {true,
      true,
      true,
diff --git a/cpp/tests/reductions/rank_tests.cpp b/cpp/tests/reductions/rank_tests.cpp
index 19633211192..130458548fc 100644
--- a/cpp/tests/reductions/rank_tests.cpp
+++ b/cpp/tests/reductions/rank_tests.cpp
@@ -125,7 +125,7 @@ auto make_input_column()
 {
   if constexpr (std::is_same_v<TypeParam, cudf::string_view>) {
     return cudf::test::strings_column_wrapper{
-      {"0", "0", "4", "4", "4", "5", "7", "7", "7", "9", "9", "9"},
+      {"0", "0", "4", "4", "4", "", "7", "7", "7", "9", "9", "9"},
       cudf::test::iterators::null_at(5)};
   } else {
     using fw_wrapper = cudf::test::fixed_width_column_wrapper<TypeParam>;
diff --git a/cpp/tests/reductions/reduction_tests.cpp b/cpp/tests/reductions/reduction_tests.cpp
index c09cde8f9e4..67083f19b3a 100644
--- a/cpp/tests/reductions/reduction_tests.cpp
+++ b/cpp/tests/reductions/reduction_tests.cpp
@@ -1255,6 +1255,12 @@ TEST_P(StringReductionTest, MinMax)
   // data and valid arrays
   std::vector<std::string> host_strings(GetParam());
   std::vector<bool> host_bools({true, false, true, true, true, true, false, false, true});
+  std::transform(thrust::counting_iterator<std::size_t>(0),
+                 thrust::counting_iterator<std::size_t>(host_strings.size()),
+                 host_strings.begin(),
+                 [host_strings, host_bools](auto idx) {
+                   return host_bools[idx] ? host_strings[idx] : std::string{};
+                 });
   bool succeed(true);
   std::string initial_value = "init";
 
@@ -1381,7 +1387,7 @@ TEST_F(StringReductionTest, AllNull)
   std::vector<std::string> host_strings(
     {"one", "two", "three", "four", "five", "six", "seven", "eight", "nine"});
   std::vector<bool> host_bools(host_strings.size(), false);
-  auto initial_value = cudf::make_string_scalar("init");
+  auto initial_value = cudf::make_string_scalar("");
   initial_value->set_valid_async(false);
 
   // string column with nulls
@@ -3082,21 +3088,28 @@ TEST_F(StructReductionTest, StructReductionMinMaxWithNulls)
   using cudf::test::iterators::null_at;
   using cudf::test::iterators::nulls_at;
 
-  // `null` means null at child column.
-  // `NULL` means null at parent column.
   auto const input = [] {
     auto child1 = STRINGS_CW{{"año",
                               "bit",
-                              "₹1" /*null*/,
-                              "aaa" /*NULL*/,
+                              "",     // child null
+                              "aaa",  // parent null
                               "zit",
                               "bat",
                               "aab",
-                              "$1" /*null*/,
-                              "€1" /*NULL*/,
+                              "",    // child null
+                              "€1",  // parent null
                               "wut"},
                              nulls_at({2, 7})};
-    auto child2 = INTS_CW{{1, 2, 3 /*null*/, 4 /*NULL*/, 5, 6, 7, 8 /*null*/, 9 /*NULL*/, 10},
+    auto child2 = INTS_CW{{1,
+                           2,
+                           0,  // child null
+                           4,  // parent null
+                           5,
+                           6,
+                           7,
+                           0,  // child null
+                           9,  // parent NULL
+                           10},
                           nulls_at({2, 7})};
     return STRUCTS_CW{{child1, child2}, nulls_at({3, 8})};
   }();
diff --git a/cpp/tests/reductions/scan_tests.cpp b/cpp/tests/reductions/scan_tests.cpp
index 72d92c5ac53..5f911597b02 100644
--- a/cpp/tests/reductions/scan_tests.cpp
+++ b/cpp/tests/reductions/scan_tests.cpp
@@ -412,12 +412,13 @@ TEST_F(ScanStringsTest, MoreStringsMinMax)
 {
   int row_count = 512;
 
-  auto data_begin = cudf::detail::make_counting_transform_iterator(0, [](auto idx) {
+  auto validity = cudf::detail::make_counting_transform_iterator(
+    0, [](auto idx) -> bool { return (idx % 23) != 22; });
+  auto data_begin = cudf::detail::make_counting_transform_iterator(0, [validity](auto idx) {
+    if (validity[idx] == 0) return std::string{};
     char const s = static_cast<char>('a' + (idx % 26));
     return std::string{1, s};
   });
-  auto validity   = cudf::detail::make_counting_transform_iterator(
-    0, [](auto idx) -> bool { return (idx % 23) != 22; });
   cudf::test::strings_column_wrapper col(data_begin, data_begin + row_count, validity);
 
   thrust::host_vector<std::string> v(data_begin, data_begin + row_count);
@@ -620,21 +621,28 @@ TEST_F(StructScanTest, StructScanMinMaxWithNulls)
   using cudf::test::iterators::null_at;
   using cudf::test::iterators::nulls_at;
 
-  // `null` means null at child column.
-  // `NULL` means null at parent column.
   auto const input = [] {
     auto child1 = STRINGS_CW{{"año",
                               "bit",
-                              "₹1" /*null*/,
-                              "aaa" /*NULL*/,
+                              "",     // child null
+                              "aaa",  // parent null
                               "zit",
                               "bat",
                               "aab",
-                              "$1" /*null*/,
-                              "€1" /*NULL*/,
+                              "",    // child null
+                              "€1",  // parent null
                               "wut"},
                              nulls_at({2, 7})};
-    auto child2 = INTS_CW{{1, 2, 3 /*null*/, 4 /*NULL*/, 5, 6, 7, 8 /*null*/, 9 /*NULL*/, 10},
+    auto child2 = INTS_CW{{1,
+                           2,
+                           0,  // child null
+                           4,  // parent null
+                           5,
+                           6,
+                           7,
+                           0,  // child null
+                           9,  // parent null
+                           10},
                           nulls_at({2, 7})};
     return STRUCTS_CW{{child1, child2}, nulls_at({3, 8})};
   }();
@@ -692,25 +700,25 @@ TEST_F(StructScanTest, StructScanMinMaxWithNulls)
     auto const expected = [] {
       auto child1 = STRINGS_CW{{"año",
                                 "año",
-                                "" /*null*/,
-                                "" /*NULL*/,
-                                "" /*NULL*/,
-                                "" /*NULL*/,
-                                "" /*NULL*/,
-                                "" /*NULL*/,
-                                "" /*NULL*/,
-                                "" /*NULL*/},
+                                "",   // child null
+                                "",   // parent null
+                                "",   // parent null
+                                "",   // parent null
+                                "",   // parent null
+                                "",   // parent null
+                                "",   // parent null
+                                ""},  // parent null
                                null_at(2)};
       auto child2 = INTS_CW{{1,
                              1,
-                             0 /*null*/,
-                             0 /*NULL*/,
-                             0 /*NULL*/,
-                             0 /*NULL*/,
-                             0 /*NULL*/,
-                             0 /*NULL*/,
-                             0 /*NULL*/,
-                             0 /*NULL*/},
+                             0,   // child null
+                             0,   // parent null
+                             0,   // parent null
+                             0,   // parent null
+                             0,   // parent null
+                             0,   // parent null
+                             0,   // parent null
+                             0},  // parent null
                             null_at(2)};
       return STRUCTS_CW{{child1, child2}, nulls_at({3, 4, 5, 6, 7, 8, 9})};
     }();

From cafcf6a6bc538c6eed6544ebd0e44169bfc483de Mon Sep 17 00:00:00 2001
From: David Wendt <45795991+davidwendt@users.noreply.github.com>
Date: Thu, 31 Oct 2024 08:16:26 -0400
Subject: [PATCH 22/41] Add jaccard_index to generated cuDF docs (#17199)

Adds the `jaccard_index` API to the generated docs.
Also noticed that `minhash` was missing, so it is added here as well.
Also removed the duplicate `rsplit` entry from the `.rst` file.

Authors:
  - David Wendt (https://github.com/davidwendt)

Approvers:
  - Bradley Dice (https://github.com/bdice)
  - Vyas Ramasubramani (https://github.com/vyasr)

URL: https://github.com/rapidsai/cudf/pull/17199
---
 docs/cudf/source/user_guide/api_docs/string_handling.rst | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/docs/cudf/source/user_guide/api_docs/string_handling.rst b/docs/cudf/source/user_guide/api_docs/string_handling.rst
index ab0f085e1a6..91d3e33960b 100644
--- a/docs/cudf/source/user_guide/api_docs/string_handling.rst
+++ b/docs/cudf/source/user_guide/api_docs/string_handling.rst
@@ -60,6 +60,7 @@ strings and apply several methods to it. These can be accessed like
    isupper
    istimestamp
    istitle
+   jaccard_index
    join
    len
    like
@@ -67,6 +68,7 @@ strings and apply several methods to it. These can be accessed like
    lower
    lstrip
    match
+   minhash
    ngrams
    ngrams_tokenize
    normalize_characters
@@ -90,7 +92,6 @@ strings and apply several methods to it. These can be accessed like
    slice_from
    slice_replace
    split
-   rsplit
    startswith
    strip
    swapcase

From e512258973ae50174066f7ef8bbe84a1f95437f0 Mon Sep 17 00:00:00 2001
From: David Wendt <45795991+davidwendt@users.noreply.github.com>
Date: Thu, 31 Oct 2024 08:27:47 -0400
Subject: [PATCH 23/41] Move strings::concatenate benchmark to nvbench (#17211)

Moves the `cudf::strings::concatenate` benchmark source from google-bench to nvbench.
This also removes the restrictions on the parameters to allow specifying an arbitrary number of rows and string width.

Reference #16948

Authors:
  - David Wendt (https://github.com/davidwendt)

Approvers:
  - Yunsong Wang (https://github.com/PointKernel)
  - Mark Harris (https://github.com/harrism)
  - Nghia Truong (https://github.com/ttnghia)

URL: https://github.com/rapidsai/cudf/pull/17211
---
 cpp/benchmarks/CMakeLists.txt     |  2 +-
 cpp/benchmarks/string/combine.cpp | 58 +++++++++++--------------------
 2 files changed, 22 insertions(+), 38 deletions(-)

diff --git a/cpp/benchmarks/CMakeLists.txt b/cpp/benchmarks/CMakeLists.txt
index 2a4ac789046..68781889c53 100644
--- a/cpp/benchmarks/CMakeLists.txt
+++ b/cpp/benchmarks/CMakeLists.txt
@@ -356,7 +356,6 @@ ConfigureNVBench(
 # * strings benchmark -------------------------------------------------------------------
 ConfigureBench(
   STRINGS_BENCH
-  string/combine.cpp
   string/convert_datetime.cpp
   string/convert_durations.cpp
   string/convert_fixed_point.cpp
@@ -374,6 +373,7 @@ ConfigureNVBench(
   STRINGS_NVBENCH
   string/case.cpp
   string/char_types.cpp
+  string/combine.cpp
   string/contains.cpp
   string/copy_if_else.cpp
   string/copy_range.cpp
diff --git a/cpp/benchmarks/string/combine.cpp b/cpp/benchmarks/string/combine.cpp
index 7acfb1ffb0d..d6ccfae63e8 100644
--- a/cpp/benchmarks/string/combine.cpp
+++ b/cpp/benchmarks/string/combine.cpp
@@ -14,57 +14,41 @@
  * limitations under the License.
  */
 
-#include "string_bench_args.hpp"
-
 #include <benchmarks/common/generate_input.hpp>
-#include <benchmarks/fixture/benchmark_fixture.hpp>
-#include <benchmarks/synchronization/synchronization.hpp>
 
 #include <cudf/scalar/scalar.hpp>
 #include <cudf/strings/combine.hpp>
 #include <cudf/strings/strings_column_view.hpp>
 #include <cudf/utilities/default_stream.hpp>
 
-class StringCombine : public cudf::benchmark {};
+#include <nvbench/nvbench.cuh>
 
-static void BM_combine(benchmark::State& state)
+static void bench_combine(nvbench::state& state)
 {
-  cudf::size_type const n_rows{static_cast<cudf::size_type>(state.range(0))};
-  cudf::size_type const max_str_length{static_cast<cudf::size_type>(state.range(1))};
-  data_profile const table_profile = data_profile_builder().distribution(
-    cudf::type_id::STRING, distribution_id::NORMAL, 0, max_str_length);
+  auto const num_rows  = static_cast<cudf::size_type>(state.get_int64("num_rows"));
+  auto const row_width = static_cast<cudf::size_type>(state.get_int64("row_width"));
+
+  data_profile const profile = data_profile_builder().distribution(
+    cudf::type_id::STRING, distribution_id::NORMAL, 0, row_width);
   auto const table = create_random_table(
-    {cudf::type_id::STRING, cudf::type_id::STRING}, row_count{n_rows}, table_profile);
+    {cudf::type_id::STRING, cudf::type_id::STRING}, row_count{num_rows}, profile);
   cudf::strings_column_view input1(table->view().column(0));
   cudf::strings_column_view input2(table->view().column(1));
   cudf::string_scalar separator("+");
 
-  for (auto _ : state) {
-    cuda_event_timer raii(state, true, cudf::get_default_stream());
-    cudf::strings::concatenate(table->view(), separator);
-  }
-
-  state.SetBytesProcessed(state.iterations() * (input1.chars_size(cudf::get_default_stream()) +
-                                                input2.chars_size(cudf::get_default_stream())));
-}
+  auto stream = cudf::get_default_stream();
+  state.set_cuda_stream(nvbench::make_cuda_stream_view(stream.value()));
+  auto chars_size =
+    input1.chars_size(stream) + input2.chars_size(stream) + (num_rows * separator.size());
+  state.add_global_memory_reads<nvbench::int8_t>(chars_size);  // all bytes are read;
+  state.add_global_memory_writes<nvbench::int8_t>(chars_size);
 
-static void generate_bench_args(benchmark::internal::Benchmark* b)
-{
-  int const min_rows   = 1 << 12;
-  int const max_rows   = 1 << 24;
-  int const row_mult   = 8;
-  int const min_rowlen = 1 << 4;
-  int const max_rowlen = 1 << 11;
-  int const len_mult   = 4;
-  generate_string_bench_args(b, min_rows, max_rows, row_mult, min_rowlen, max_rowlen, len_mult);
+  state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) {
+    auto result = cudf::strings::concatenate(table->view(), separator);
+  });
 }
 
-#define STRINGS_BENCHMARK_DEFINE(name)          \
-  BENCHMARK_DEFINE_F(StringCombine, name)       \
-  (::benchmark::State & st) { BM_combine(st); } \
-  BENCHMARK_REGISTER_F(StringCombine, name)     \
-    ->Apply(generate_bench_args)                \
-    ->UseManualTime()                           \
-    ->Unit(benchmark::kMillisecond);
-
-STRINGS_BENCHMARK_DEFINE(concat)
+NVBENCH_BENCH(bench_combine)
+  .set_name("concat")
+  .add_int64_axis("row_width", {32, 64, 128, 256})
+  .add_int64_axis("num_rows", {32768, 262144, 2097152});

From 9657c9a5dc4c4a1bf9fd7b55cfeb53c60dda3c66 Mon Sep 17 00:00:00 2001
From: Nghia Truong <7416935+ttnghia@users.noreply.github.com>
Date: Thu, 31 Oct 2024 07:19:48 -0700
Subject: [PATCH 24/41] Fix `Schema.Builder` does not propagate precision value
 to `Builder` instance (#17214)

When calling `Schema.Builder.build()`, the value `topLevelPrecision` should be passed into the constructor of the `Schema` class. However, it was forgotten.

Authors:
  - Nghia Truong (https://github.com/ttnghia)

Approvers:
  - Robert (Bobby) Evans (https://github.com/revans2)

URL: https://github.com/rapidsai/cudf/pull/17214
---
 java/src/main/java/ai/rapids/cudf/Schema.java | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/java/src/main/java/ai/rapids/cudf/Schema.java b/java/src/main/java/ai/rapids/cudf/Schema.java
index 6da591d659f..ae8a0e17f9d 100644
--- a/java/src/main/java/ai/rapids/cudf/Schema.java
+++ b/java/src/main/java/ai/rapids/cudf/Schema.java
@@ -36,13 +36,13 @@ public class Schema {
   private static final int UNKNOWN_PRECISION = -1;
 
   /**
-  * Store precision for the top level column, only applicable if the column is a decimal type.
-  * <p/>
-  * This variable is not designed to be used by any libcudf's APIs since libcudf does not support
-  * precisions for fixed point numbers.
-  * Instead, it is used only to pass down the precision values from Spark's DecimalType to the
-  * JNI level, where some JNI functions require these values to perform their operations.
-  */
+   * Store precision for the top level column, only applicable if the column is a decimal type.
+   * <p/>
+   * This variable is not designed to be used by any libcudf's APIs since libcudf does not support
+   * precisions for fixed point numbers.
+   * Instead, it is used only to pass down the precision values from Spark's DecimalType to the
+   * JNI level, where some JNI functions require these values to perform their operations.
+   */
   private final int topLevelPrecision;
 
   private final List<String> childNames;
@@ -429,7 +429,7 @@ public Schema build() {
           children.add(b.build());
         }
       }
-      return new Schema(topLevelType, names, children);
+      return new Schema(topLevelType, topLevelPrecision, names, children);
     }
   }
 }

From 3db6a0e4b3c29fa4818e04075301d3a4863d1bc8 Mon Sep 17 00:00:00 2001
From: David Wendt <45795991+davidwendt@users.noreply.github.com>
Date: Thu, 31 Oct 2024 11:52:49 -0400
Subject: [PATCH 25/41] Add TokenizeVocabulary to api docs (#17208)

Adds the `TokenizeVocabulary` class to the cuDF API guide.
Also removes the `SubwordTokenizer` which is to be deprecated in the future.

Authors:
  - David Wendt (https://github.com/davidwendt)

Approvers:
  - Vyas Ramasubramani (https://github.com/vyasr)

URL: https://github.com/rapidsai/cudf/pull/17208
---
 docs/cudf/source/user_guide/api_docs/index.rst       |  2 +-
 .../source/user_guide/api_docs/subword_tokenize.rst  | 12 ------------
 .../user_guide/api_docs/tokenize_vocabulary.rst      | 12 ++++++++++++
 3 files changed, 13 insertions(+), 13 deletions(-)
 delete mode 100644 docs/cudf/source/user_guide/api_docs/subword_tokenize.rst
 create mode 100644 docs/cudf/source/user_guide/api_docs/tokenize_vocabulary.rst

diff --git a/docs/cudf/source/user_guide/api_docs/index.rst b/docs/cudf/source/user_guide/api_docs/index.rst
index d05501f4a4a..f711327f9ed 100644
--- a/docs/cudf/source/user_guide/api_docs/index.rst
+++ b/docs/cudf/source/user_guide/api_docs/index.rst
@@ -19,7 +19,7 @@ This page provides a list of all publicly accessible modules, methods and classe
     general_utilities
     window
     io
-    subword_tokenize
+    tokenize_vocabulary
     string_handling
     list_handling
     struct_handling
diff --git a/docs/cudf/source/user_guide/api_docs/subword_tokenize.rst b/docs/cudf/source/user_guide/api_docs/subword_tokenize.rst
deleted file mode 100644
index cd240fe4db4..00000000000
--- a/docs/cudf/source/user_guide/api_docs/subword_tokenize.rst
+++ /dev/null
@@ -1,12 +0,0 @@
-================
-SubwordTokenizer
-================
-.. currentmodule:: cudf.core.subword_tokenizer
-
-Constructor
-~~~~~~~~~~~
-.. autosummary::
-   :toctree: api/
-
-   SubwordTokenizer
-   SubwordTokenizer.__call__
diff --git a/docs/cudf/source/user_guide/api_docs/tokenize_vocabulary.rst b/docs/cudf/source/user_guide/api_docs/tokenize_vocabulary.rst
new file mode 100644
index 00000000000..1b5c965f3c9
--- /dev/null
+++ b/docs/cudf/source/user_guide/api_docs/tokenize_vocabulary.rst
@@ -0,0 +1,12 @@
+==================
+TokenizeVocabulary
+==================
+.. currentmodule:: cudf.core.tokenize_vocabulary
+
+Constructor
+~~~~~~~~~~~
+.. autosummary::
+   :toctree: api/
+
+   TokenizeVocabulary
+   TokenizeVocabulary.tokenize

From f99ef41a8f01395635dadd4decba182e6318fc72 Mon Sep 17 00:00:00 2001
From: David Wendt <45795991+davidwendt@users.noreply.github.com>
Date: Thu, 31 Oct 2024 11:55:26 -0400
Subject: [PATCH 26/41] Move detail header floating_conversion.hpp to detail
 subdirectory (#17209)

Moves `cudf/fixed_point/floating_conversion.hpp` to the `cudf/fixed_point/detail/` subdirectory since it only contains declarations and definitions in the `detail` namespace.
It had previously been its own module. https://docs.rapids.ai/api/libcudf/stable/modules.html

Authors:
  - David Wendt (https://github.com/davidwendt)

Approvers:
  - Shruti Shivakumar (https://github.com/shrshi)
  - Vyas Ramasubramani (https://github.com/vyasr)
  - Nghia Truong (https://github.com/ttnghia)

URL: https://github.com/rapidsai/cudf/pull/17209
---
 .../fixed_point/{ => detail}/floating_conversion.hpp   | 10 ----------
 cpp/include/cudf/unary.hpp                             |  2 +-
 2 files changed, 1 insertion(+), 11 deletions(-)
 rename cpp/include/cudf/fixed_point/{ => detail}/floating_conversion.hpp (99%)

diff --git a/cpp/include/cudf/fixed_point/floating_conversion.hpp b/cpp/include/cudf/fixed_point/detail/floating_conversion.hpp
similarity index 99%
rename from cpp/include/cudf/fixed_point/floating_conversion.hpp
rename to cpp/include/cudf/fixed_point/detail/floating_conversion.hpp
index f0d50edccd1..fce08b4a5c4 100644
--- a/cpp/include/cudf/fixed_point/floating_conversion.hpp
+++ b/cpp/include/cudf/fixed_point/detail/floating_conversion.hpp
@@ -26,14 +26,6 @@
 #include <cstring>
 
 namespace CUDF_EXPORT numeric {
-
-/**
- * @addtogroup floating_conversion
- * @{
- * @file
- * @brief fixed_point <--> floating-point conversion functions.
- */
-
 namespace detail {
 
 /**
@@ -1141,6 +1133,4 @@ CUDF_HOST_DEVICE inline FloatingType convert_integral_to_floating(Rep const& val
 }
 
 }  // namespace detail
-
-/** @} */  // end of group
 }  // namespace CUDF_EXPORT numeric
diff --git a/cpp/include/cudf/unary.hpp b/cpp/include/cudf/unary.hpp
index 53e0f3a15d2..046e9745a71 100644
--- a/cpp/include/cudf/unary.hpp
+++ b/cpp/include/cudf/unary.hpp
@@ -16,8 +16,8 @@
 
 #pragma once
 
+#include <cudf/fixed_point/detail/floating_conversion.hpp>
 #include <cudf/fixed_point/fixed_point.hpp>
-#include <cudf/fixed_point/floating_conversion.hpp>
 #include <cudf/types.hpp>
 #include <cudf/utilities/default_stream.hpp>
 #include <cudf/utilities/export.hpp>

From f7020f12a5b84362b5172eb3be55c75acb04949e Mon Sep 17 00:00:00 2001
From: Shruti Shivakumar <shruti.shivakumar@gmail.com>
Date: Thu, 31 Oct 2024 14:08:37 -0400
Subject: [PATCH 27/41] Expose stream-ordering in partitioning API (#17213)

Add stream parameter to public APIs:
```
cudf::partition
cudf::round_robin_partition
```
Added stream gtests for the above two functions and for `hash_partition`.

Reference: https://github.com/rapidsai/cudf/issues/13744

Authors:
  - Shruti Shivakumar (https://github.com/shrshi)

Approvers:
  - Nghia Truong (https://github.com/ttnghia)
  - Vukasin Milovanovic (https://github.com/vuule)

URL: https://github.com/rapidsai/cudf/pull/17213
---
 cpp/include/cudf/partitioning.hpp       |  4 ++
 cpp/src/partitioning/partitioning.cu    |  3 +-
 cpp/src/partitioning/round_robin.cu     |  4 +-
 cpp/tests/CMakeLists.txt                |  1 +
 cpp/tests/streams/partitioning_test.cpp | 73 +++++++++++++++++++++++++
 5 files changed, 82 insertions(+), 3 deletions(-)
 create mode 100644 cpp/tests/streams/partitioning_test.cpp

diff --git a/cpp/include/cudf/partitioning.hpp b/cpp/include/cudf/partitioning.hpp
index 385da993262..f9a68e4fffc 100644
--- a/cpp/include/cudf/partitioning.hpp
+++ b/cpp/include/cudf/partitioning.hpp
@@ -70,6 +70,7 @@ enum class hash_id {
  * @param partition_map Non-nullable column of integer values that map each row
  * in `t` to it's partition.
  * @param num_partitions The total number of partitions
+ * @param stream CUDA stream used for device memory operations and kernel launches
  * @param mr Device memory resource used to allocate the returned table's device memory
  * @return Pair containing the reordered table and vector of `num_partitions +
  * 1` offsets to each partition such that the size of partition `i` is
@@ -79,6 +80,7 @@ std::pair<std::unique_ptr<table>, std::vector<size_type>> partition(
   table_view const& t,
   column_view const& partition_map,
   size_type num_partitions,
+  rmm::cuda_stream_view stream      = cudf::get_default_stream(),
   rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
 /**
@@ -242,6 +244,7 @@ std::pair<std::unique_ptr<table>, std::vector<size_type>> hash_partition(
  * @param[in] input The input table to be round-robin partitioned
  * @param[in] num_partitions Number of partitions for the table
  * @param[in] start_partition Index of the 1st partition
+ * @param[in] stream CUDA stream used for device memory operations and kernel launches
  * @param[in] mr Device memory resource used to allocate the returned table's device memory
  *
  * @return A std::pair consisting of a unique_ptr to the partitioned table
@@ -251,6 +254,7 @@ std::pair<std::unique_ptr<cudf::table>, std::vector<cudf::size_type>> round_robi
   table_view const& input,
   cudf::size_type num_partitions,
   cudf::size_type start_partition   = 0,
+  rmm::cuda_stream_view stream      = cudf::get_default_stream(),
   rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
 /** @} */  // end of group
diff --git a/cpp/src/partitioning/partitioning.cu b/cpp/src/partitioning/partitioning.cu
index 17008e80e79..ebab3beb08f 100644
--- a/cpp/src/partitioning/partitioning.cu
+++ b/cpp/src/partitioning/partitioning.cu
@@ -834,10 +834,11 @@ std::pair<std::unique_ptr<table>, std::vector<size_type>> partition(
   table_view const& t,
   column_view const& partition_map,
   size_type num_partitions,
+  rmm::cuda_stream_view stream,
   rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
-  return detail::partition(t, partition_map, num_partitions, cudf::get_default_stream(), mr);
+  return detail::partition(t, partition_map, num_partitions, stream, mr);
 }
 
 }  // namespace cudf
diff --git a/cpp/src/partitioning/round_robin.cu b/cpp/src/partitioning/round_robin.cu
index 5a4c90a67a5..ab6ab393878 100644
--- a/cpp/src/partitioning/round_robin.cu
+++ b/cpp/src/partitioning/round_robin.cu
@@ -273,11 +273,11 @@ std::pair<std::unique_ptr<cudf::table>, std::vector<cudf::size_type>> round_robi
   table_view const& input,
   cudf::size_type num_partitions,
   cudf::size_type start_partition,
+  rmm::cuda_stream_view stream,
   rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
-  return detail::round_robin_partition(
-    input, num_partitions, start_partition, cudf::get_default_stream(), mr);
+  return detail::round_robin_partition(input, num_partitions, start_partition, stream, mr);
 }
 
 }  // namespace cudf
diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt
index b78a64d0e55..a5e1cf646b4 100644
--- a/cpp/tests/CMakeLists.txt
+++ b/cpp/tests/CMakeLists.txt
@@ -710,6 +710,7 @@ ConfigureTest(STREAM_MULTIBYTE_SPLIT_TEST streams/io/multibyte_split_test.cpp ST
 ConfigureTest(STREAM_NULL_MASK_TEST streams/null_mask_test.cpp STREAM_MODE testing)
 ConfigureTest(STREAM_ORCIO_TEST streams/io/orc_test.cpp STREAM_MODE testing)
 ConfigureTest(STREAM_PARQUETIO_TEST streams/io/parquet_test.cpp STREAM_MODE testing)
+ConfigureTest(STREAM_PARTITIONING_TEST streams/partitioning_test.cpp STREAM_MODE testing)
 ConfigureTest(STREAM_POOL_TEST streams/pool_test.cu STREAM_MODE testing)
 ConfigureTest(STREAM_REDUCTION_TEST streams/reduction_test.cpp STREAM_MODE testing)
 ConfigureTest(STREAM_REPLACE_TEST streams/replace_test.cpp STREAM_MODE testing)
diff --git a/cpp/tests/streams/partitioning_test.cpp b/cpp/tests/streams/partitioning_test.cpp
new file mode 100644
index 00000000000..636c5c1f1f9
--- /dev/null
+++ b/cpp/tests/streams/partitioning_test.cpp
@@ -0,0 +1,73 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <cudf_test/base_fixture.hpp>
+#include <cudf_test/column_wrapper.hpp>
+#include <cudf_test/default_stream.hpp>
+#include <cudf_test/type_lists.hpp>
+
+#include <cudf/copying.hpp>
+#include <cudf/detail/iterator.cuh>
+#include <cudf/partitioning.hpp>
+#include <cudf/sorting.hpp>
+#include <cudf/table/table.hpp>
+
+using cudf::test::fixed_width_column_wrapper;
+using cudf::test::strings_column_wrapper;
+
+class PartitionTest : public cudf::test::BaseFixture {};
+
+TEST_F(PartitionTest, Struct)
+{
+  fixed_width_column_wrapper<numeric::decimal32, int32_t> A({1, 2}, {0, 1});
+  auto struct_col         = cudf::test::structs_column_wrapper({A}, {0, 1}).release();
+  auto table_to_partition = cudf::table_view{{*struct_col}};
+  fixed_width_column_wrapper<int32_t> map{9, 2};
+
+  auto num_partitions = 12;
+  auto result =
+    cudf::partition(table_to_partition, map, num_partitions, cudf::test::get_default_stream());
+}
+
+TEST_F(PartitionTest, EmptyInput)
+{
+  auto const empty_column    = fixed_width_column_wrapper<int32_t>{};
+  auto const num_partitions  = 5;
+  auto const start_partition = 0;
+  auto const [out_table, out_offsets] =
+    cudf::round_robin_partition(cudf::table_view{{empty_column}},
+                                num_partitions,
+                                start_partition,
+                                cudf::test::get_default_stream());
+}
+
+TEST_F(PartitionTest, ZeroPartitions)
+{
+  fixed_width_column_wrapper<float> floats({1.f, 2.f, 3.f, 4.f, 5.f, 6.f, 7.f, 8.f});
+  fixed_width_column_wrapper<int16_t> integers({1, 2, 3, 4, 5, 6, 7, 8});
+  strings_column_wrapper strings({"a", "bb", "ccc", "d", "ee", "fff", "gg", "h"});
+  auto input = cudf::table_view({floats, integers, strings});
+
+  auto columns_to_hash = std::vector<cudf::size_type>({2});
+
+  cudf::size_type const num_partitions = 0;
+  auto [output, offsets]               = cudf::hash_partition(input,
+                                                columns_to_hash,
+                                                num_partitions,
+                                                cudf::hash_id::HASH_MURMUR3,
+                                                cudf::DEFAULT_HASH_SEED,
+                                                cudf::test::get_default_stream());
+}

From 02a50e8aa708877f06d5a19a3aa070b08aff5b1f Mon Sep 17 00:00:00 2001
From: Matthew Murray <41342305+Matt711@users.noreply.github.com>
Date: Thu, 31 Oct 2024 14:09:22 -0400
Subject: [PATCH 28/41] Remove `nvtext::load_vocabulary` from pylibcudf
 (#17220)

This PR follows up on #17100 to address the last review comment here: https://github.com/rapidsai/cudf/pull/17100#pullrequestreview-2406700961

Authors:
  - Matthew Murray (https://github.com/Matt711)

Approvers:
  - Matthew Roeschke (https://github.com/mroeschke)
  - Vyas Ramasubramani (https://github.com/vyasr)

URL: https://github.com/rapidsai/cudf/pull/17220
---
 python/cudf/cudf/core/tokenize_vocabulary.py  |  5 +--
 .../pylibcudf/pylibcudf/nvtext/tokenize.pxd   |  2 --
 .../pylibcudf/pylibcudf/nvtext/tokenize.pyx   | 36 ++++---------------
 .../pylibcudf/tests/test_nvtext_tokenize.py   | 11 ++----
 4 files changed, 12 insertions(+), 42 deletions(-)

diff --git a/python/cudf/cudf/core/tokenize_vocabulary.py b/python/cudf/cudf/core/tokenize_vocabulary.py
index f0ce6e9d5d1..1e31376cce8 100644
--- a/python/cudf/cudf/core/tokenize_vocabulary.py
+++ b/python/cudf/cudf/core/tokenize_vocabulary.py
@@ -2,9 +2,10 @@
 
 from __future__ import annotations
 
+import pylibcudf as plc
+
 import cudf
 from cudf._lib.nvtext.tokenize import (
-    TokenizeVocabulary as cpp_tokenize_vocabulary,
     tokenize_with_vocabulary as cpp_tokenize_with_vocabulary,
 )
 
@@ -20,7 +21,7 @@ class TokenizeVocabulary:
     """
 
     def __init__(self, vocabulary: "cudf.Series"):
-        self.vocabulary = cpp_tokenize_vocabulary(
+        self.vocabulary = plc.nvtext.tokenize.TokenizeVocabulary(
             vocabulary._column.to_pylibcudf(mode="read")
         )
 
diff --git a/python/pylibcudf/pylibcudf/nvtext/tokenize.pxd b/python/pylibcudf/pylibcudf/nvtext/tokenize.pxd
index 0254b91ad58..0aed9702d61 100644
--- a/python/pylibcudf/pylibcudf/nvtext/tokenize.pxd
+++ b/python/pylibcudf/pylibcudf/nvtext/tokenize.pxd
@@ -21,8 +21,6 @@ cpdef Column character_tokenize(Column input)
 
 cpdef Column detokenize(Column input, Column row_indices, Scalar separator=*)
 
-cpdef TokenizeVocabulary load_vocabulary(Column input)
-
 cpdef Column tokenize_with_vocabulary(
     Column input,
     TokenizeVocabulary vocabulary,
diff --git a/python/pylibcudf/pylibcudf/nvtext/tokenize.pyx b/python/pylibcudf/pylibcudf/nvtext/tokenize.pyx
index cdecfaabca2..ec02e8ebf4e 100644
--- a/python/pylibcudf/pylibcudf/nvtext/tokenize.pyx
+++ b/python/pylibcudf/pylibcudf/nvtext/tokenize.pyx
@@ -43,8 +43,7 @@ cpdef Column tokenize_scalar(Column input, Scalar delimiter=None):
     input : Column
         Strings column to tokenize
     delimiter : Scalar
-        String scalar used to separate individual
-        strings into tokens
+        String scalar used to separate individual strings into tokens
 
     Returns
     -------
@@ -106,7 +105,7 @@ cpdef Column count_tokens_scalar(Column input, Scalar delimiter=None):
     ----------
     input : Column
         Strings column to count tokens
-    delimiters : Scalar]
+    delimiters : Scalar
         String scalar used to separate each string into tokens
 
     Returns
@@ -141,8 +140,7 @@ cpdef Column count_tokens_column(Column input, Column delimiters):
     input : Column
         Strings column to count tokens
     delimiters : Column
-        Strings column used to separate
-        each string into tokens
+        Strings column used to separate each string into tokens
 
     Returns
     -------
@@ -198,11 +196,9 @@ cpdef Column detokenize(
     input : Column
         Strings column to detokenize
     row_indices : Column
-        The relative output row index assigned
-        for each token in the input column
+        The relative output row index assigned for each token in the input column
     separator : Scalar
-        String to append after concatenating
-        each token to the proper output row
+        String to append after concatenating each token to the proper output row
 
     Returns
     -------
@@ -225,25 +221,6 @@ cpdef Column detokenize(
 
     return Column.from_libcudf(move(c_result))
 
-cpdef TokenizeVocabulary load_vocabulary(Column input):
-    """
-    Create a ``TokenizeVocabulary`` object from a strings column.
-
-    For details, see cpp:func:`cudf::nvtext::load_vocabulary`
-
-    Parameters
-    ----------
-    input : Column
-        Strings for the vocabulary
-
-    Returns
-    -------
-    TokenizeVocabulary
-        Object to be used with cpp:func:`cudf::nvtext::tokenize_with_vocabulary`
-    """
-    return TokenizeVocabulary(input)
-
-
 cpdef Column tokenize_with_vocabulary(
     Column input,
     TokenizeVocabulary vocabulary,
@@ -265,8 +242,7 @@ cpdef Column tokenize_with_vocabulary(
     delimiter : Scalar
         Used to identify tokens within ``input``
     default_id : size_type
-        The token id to be used for tokens not found
-        in the vocabulary; Default is -1
+        The token id to be used for tokens not found in the vocabulary; Default is -1
 
     Returns
     -------
diff --git a/python/pylibcudf/pylibcudf/tests/test_nvtext_tokenize.py b/python/pylibcudf/pylibcudf/tests/test_nvtext_tokenize.py
index 4ec9a5ee1a5..f1b4a5637e1 100644
--- a/python/pylibcudf/pylibcudf/tests/test_nvtext_tokenize.py
+++ b/python/pylibcudf/pylibcudf/tests/test_nvtext_tokenize.py
@@ -78,18 +78,13 @@ def test_detokenize(input_col, delimiter):
     assert_column_eq(result, expected)
 
 
-def test_load_vocabulary(input_col):
-    result = plc.nvtext.tokenize.load_vocabulary(
-        plc.interop.from_arrow(input_col)
-    )
-    assert isinstance(result, plc.nvtext.tokenize.TokenizeVocabulary)
-
-
 @pytest.mark.parametrize("default_id", [-1, 0])
 def test_tokenize_with_vocabulary(input_col, default_id):
     result = plc.nvtext.tokenize.tokenize_with_vocabulary(
         plc.interop.from_arrow(input_col),
-        plc.nvtext.tokenize.load_vocabulary(plc.interop.from_arrow(input_col)),
+        plc.nvtext.tokenize.TokenizeVocabulary(
+            plc.interop.from_arrow(input_col)
+        ),
         plc.interop.from_arrow(pa.scalar(" ")),
         default_id,
     )

From a83debbb22e2d82194af3430d4700b20edfaa079 Mon Sep 17 00:00:00 2001
From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
Date: Thu, 31 Oct 2024 12:38:45 -0700
Subject: [PATCH 29/41] Fix groupby.get_group with length-1 tuple with
 list-like grouper (#17216)

closes #17187

Adds logic similar to that implemented in pandas: https://github.com/pandas-dev/pandas/blob/main/pandas/core/groupby/groupby.py#L751-L758

Authors:
  - Matthew Roeschke (https://github.com/mroeschke)

Approvers:
  - Vyas Ramasubramani (https://github.com/vyasr)

URL: https://github.com/rapidsai/cudf/pull/17216
---
 python/cudf/cudf/core/groupby/groupby.py |  5 +++++
 python/cudf/cudf/tests/test_groupby.py   | 16 ++++++++++++++++
 2 files changed, 21 insertions(+)

diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py
index 6630bd96c01..e59b948aba9 100644
--- a/python/cudf/cudf/core/groupby/groupby.py
+++ b/python/cudf/cudf/core/groupby/groupby.py
@@ -481,6 +481,11 @@ def get_group(self, name, obj=None):
                 "instead of ``gb.get_group(name, obj=df)``.",
                 FutureWarning,
             )
+        if is_list_like(self._by):
+            if isinstance(name, tuple) and len(name) == 1:
+                name = name[0]
+            else:
+                raise KeyError(name)
         return obj.iloc[self.indices[name]]
 
     @_performance_tracking
diff --git a/python/cudf/cudf/tests/test_groupby.py b/python/cudf/cudf/tests/test_groupby.py
index 6b222841622..e4422e204bc 100644
--- a/python/cudf/cudf/tests/test_groupby.py
+++ b/python/cudf/cudf/tests/test_groupby.py
@@ -4059,3 +4059,19 @@ def test_ndim():
     pgb = pser.groupby([0, 0, 1])
     ggb = gser.groupby(cudf.Series([0, 0, 1]))
     assert pgb.ndim == ggb.ndim
+
+
+@pytest.mark.skipif(
+    not PANDAS_GE_220, reason="pandas behavior applicable in >=2.2"
+)
+def test_get_group_list_like():
+    df = cudf.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]})
+    result = df.groupby(["a"]).get_group((1,))
+    expected = df.to_pandas().groupby(["a"]).get_group((1,))
+    assert_eq(result, expected)
+
+    with pytest.raises(KeyError):
+        df.groupby(["a"]).get_group((1, 2))
+
+    with pytest.raises(KeyError):
+        df.groupby(["a"]).get_group([1])

From 6055393b215c62809a7248094a5848253121651e Mon Sep 17 00:00:00 2001
From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
Date: Thu, 31 Oct 2024 13:42:23 -0700
Subject: [PATCH 30/41] Fix binop with LHS numpy datetimelike scalar (#17226)

closes #17087

For binops, cudf tries to convert a 0D numpy array to a numpy scalar via `.dtype.type(value)`, but `.dtype.type` requires other parameters if it is a `numpy.datetime64` or `numpy.timedelta64`. Indexing via `[()]` will perform this conversion correctly.

Authors:
  - Matthew Roeschke (https://github.com/mroeschke)

Approvers:
  - Vyas Ramasubramani (https://github.com/vyasr)

URL: https://github.com/rapidsai/cudf/pull/17226
---
 python/cudf/cudf/core/column/column.py |  4 ++--
 python/cudf/cudf/tests/test_binops.py  | 13 +++++++++++++
 2 files changed, 15 insertions(+), 2 deletions(-)

diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py
index d2cd6e8ac8f..d2f9d208c77 100644
--- a/python/cudf/cudf/core/column/column.py
+++ b/python/cudf/cudf/core/column/column.py
@@ -580,8 +580,8 @@ def _wrap_binop_normalization(self, other):
         if cudf.utils.utils.is_na_like(other):
             return cudf.Scalar(other, dtype=self.dtype)
         if isinstance(other, np.ndarray) and other.ndim == 0:
-            # Try and maintain the dtype
-            other = other.dtype.type(other.item())
+            # Return numpy scalar
+            other = other[()]
         return self.normalize_binop_value(other)
 
     def _scatter_by_slice(
diff --git a/python/cudf/cudf/tests/test_binops.py b/python/cudf/cudf/tests/test_binops.py
index 949fa909b5b..71b6bbd688d 100644
--- a/python/cudf/cudf/tests/test_binops.py
+++ b/python/cudf/cudf/tests/test_binops.py
@@ -3431,3 +3431,16 @@ def test_binop_eq_ne_index_series(data1, data2):
     expected = gi.to_pandas() != gs.to_pandas()
 
     assert_eq(expected, actual)
+
+
+@pytest.mark.parametrize("scalar", [np.datetime64, np.timedelta64])
+def test_binop_lhs_numpy_datetimelike_scalar(scalar):
+    slr1 = scalar(1, "ms")
+    slr2 = scalar(1, "ns")
+    result = slr1 < cudf.Series([slr2])
+    expected = slr1 < pd.Series([slr2])
+    assert_eq(result, expected)
+
+    result = slr2 < cudf.Series([slr1])
+    expected = slr2 < pd.Series([slr1])
+    assert_eq(result, expected)

From 0929115e6455ed06c0e9db498cbe33ae85d5c37c Mon Sep 17 00:00:00 2001
From: Lawrence Mitchell <lmitchell@nvidia.com>
Date: Thu, 31 Oct 2024 21:52:59 +0000
Subject: [PATCH 31/41] Support for polars 1.12 in cudf-polars (#17227)

No new updates are required; we just must no longer xfail a test when running with polars 1.12.

Authors:
  - Lawrence Mitchell (https://github.com/wence-)
  - Matthew Murray (https://github.com/Matt711)

Approvers:
  - Vyas Ramasubramani (https://github.com/vyasr)

URL: https://github.com/rapidsai/cudf/pull/17227
---
 conda/environments/all_cuda-118_arch-x86_64.yaml           | 2 +-
 conda/environments/all_cuda-125_arch-x86_64.yaml           | 2 +-
 dependencies.yaml                                          | 2 +-
 python/cudf_polars/cudf_polars/containers/dataframe.py     | 3 ++-
 .../cudf_polars/cudf_polars/dsl/expressions/aggregation.py | 1 +
 python/cudf_polars/cudf_polars/dsl/expressions/binaryop.py | 4 ++--
 python/cudf_polars/cudf_polars/dsl/expressions/boolean.py  | 3 ++-
 python/cudf_polars/cudf_polars/dsl/expressions/datetime.py | 3 ++-
 python/cudf_polars/cudf_polars/dsl/expressions/literal.py  | 1 +
 .../cudf_polars/cudf_polars/dsl/expressions/selection.py   | 1 +
 python/cudf_polars/cudf_polars/dsl/expressions/string.py   | 3 ++-
 python/cudf_polars/cudf_polars/dsl/expressions/unary.py    | 1 +
 python/cudf_polars/cudf_polars/dsl/ir.py                   | 3 ++-
 python/cudf_polars/cudf_polars/dsl/to_ast.py               | 4 ++--
 python/cudf_polars/cudf_polars/dsl/translate.py            | 3 ++-
 python/cudf_polars/cudf_polars/typing/__init__.py          | 4 ++--
 python/cudf_polars/cudf_polars/utils/dtypes.py             | 3 ++-
 python/cudf_polars/cudf_polars/utils/versions.py           | 7 ++++---
 python/cudf_polars/pyproject.toml                          | 4 ++--
 python/cudf_polars/tests/containers/test_column.py         | 3 ++-
 python/cudf_polars/tests/containers/test_dataframe.py      | 3 ++-
 python/cudf_polars/tests/dsl/test_expr.py                  | 3 ++-
 python/cudf_polars/tests/dsl/test_to_ast.py                | 3 ++-
 python/cudf_polars/tests/dsl/test_traversal.py             | 4 ++--
 python/cudf_polars/tests/expressions/test_literal.py       | 3 ++-
 python/cudf_polars/tests/expressions/test_sort.py          | 3 ++-
 python/cudf_polars/tests/test_join.py                      | 3 ++-
 python/cudf_polars/tests/utils/test_broadcast.py           | 3 ++-
 28 files changed, 51 insertions(+), 31 deletions(-)

diff --git a/conda/environments/all_cuda-118_arch-x86_64.yaml b/conda/environments/all_cuda-118_arch-x86_64.yaml
index 24dc3c9a7cc..9d9fec97731 100644
--- a/conda/environments/all_cuda-118_arch-x86_64.yaml
+++ b/conda/environments/all_cuda-118_arch-x86_64.yaml
@@ -66,7 +66,7 @@ dependencies:
 - pandas
 - pandas>=2.0,<2.2.4dev0
 - pandoc
-- polars>=1.11,<1.12
+- polars>=1.11,<1.13
 - pre-commit
 - ptxcompiler
 - pyarrow>=14.0.0,<18.0.0a0
diff --git a/conda/environments/all_cuda-125_arch-x86_64.yaml b/conda/environments/all_cuda-125_arch-x86_64.yaml
index a2bb2a3fe7f..19e3eafd641 100644
--- a/conda/environments/all_cuda-125_arch-x86_64.yaml
+++ b/conda/environments/all_cuda-125_arch-x86_64.yaml
@@ -64,7 +64,7 @@ dependencies:
 - pandas
 - pandas>=2.0,<2.2.4dev0
 - pandoc
-- polars>=1.11,<1.12
+- polars>=1.11,<1.13
 - pre-commit
 - pyarrow>=14.0.0,<18.0.0a0
 - pydata-sphinx-theme!=0.14.2
diff --git a/dependencies.yaml b/dependencies.yaml
index 12038c5e503..90255ca674c 100644
--- a/dependencies.yaml
+++ b/dependencies.yaml
@@ -727,7 +727,7 @@ dependencies:
     common:
       - output_types: [conda, requirements, pyproject]
         packages:
-          - polars>=1.11,<1.12
+          - polars>=1.11,<1.13
   run_dask_cudf:
     common:
       - output_types: [conda, requirements, pyproject]
diff --git a/python/cudf_polars/cudf_polars/containers/dataframe.py b/python/cudf_polars/cudf_polars/containers/dataframe.py
index 2c195f6637c..08bc9d0ea3f 100644
--- a/python/cudf_polars/cudf_polars/containers/dataframe.py
+++ b/python/cudf_polars/cudf_polars/containers/dataframe.py
@@ -9,10 +9,11 @@
 from typing import TYPE_CHECKING, cast
 
 import pyarrow as pa
-import pylibcudf as plc
 
 import polars as pl
 
+import pylibcudf as plc
+
 from cudf_polars.containers import Column
 from cudf_polars.utils import dtypes
 
diff --git a/python/cudf_polars/cudf_polars/dsl/expressions/aggregation.py b/python/cudf_polars/cudf_polars/dsl/expressions/aggregation.py
index 41b1defab39..2af9fdaacc5 100644
--- a/python/cudf_polars/cudf_polars/dsl/expressions/aggregation.py
+++ b/python/cudf_polars/cudf_polars/dsl/expressions/aggregation.py
@@ -10,6 +10,7 @@
 from typing import TYPE_CHECKING, Any, ClassVar
 
 import pyarrow as pa
+
 import pylibcudf as plc
 
 from cudf_polars.containers import Column
diff --git a/python/cudf_polars/cudf_polars/dsl/expressions/binaryop.py b/python/cudf_polars/cudf_polars/dsl/expressions/binaryop.py
index 11a47e7ea51..245bdbefe88 100644
--- a/python/cudf_polars/cudf_polars/dsl/expressions/binaryop.py
+++ b/python/cudf_polars/cudf_polars/dsl/expressions/binaryop.py
@@ -8,10 +8,10 @@
 
 from typing import TYPE_CHECKING, ClassVar
 
-import pylibcudf as plc
-
 from polars.polars import _expr_nodes as pl_expr
 
+import pylibcudf as plc
+
 from cudf_polars.containers import Column
 from cudf_polars.dsl.expressions.base import AggInfo, ExecutionContext, Expr
 
diff --git a/python/cudf_polars/cudf_polars/dsl/expressions/boolean.py b/python/cudf_polars/cudf_polars/dsl/expressions/boolean.py
index 9c14a8386f3..8db8172ebd1 100644
--- a/python/cudf_polars/cudf_polars/dsl/expressions/boolean.py
+++ b/python/cudf_polars/cudf_polars/dsl/expressions/boolean.py
@@ -10,10 +10,11 @@
 from typing import TYPE_CHECKING, Any, ClassVar
 
 import pyarrow as pa
-import pylibcudf as plc
 
 from polars.polars import _expr_nodes as pl_expr
 
+import pylibcudf as plc
+
 from cudf_polars.containers import Column
 from cudf_polars.dsl.expressions.base import (
     ExecutionContext,
diff --git a/python/cudf_polars/cudf_polars/dsl/expressions/datetime.py b/python/cudf_polars/cudf_polars/dsl/expressions/datetime.py
index 596e193d8fe..65fa4bfa62f 100644
--- a/python/cudf_polars/cudf_polars/dsl/expressions/datetime.py
+++ b/python/cudf_polars/cudf_polars/dsl/expressions/datetime.py
@@ -9,10 +9,11 @@
 from typing import TYPE_CHECKING, Any, ClassVar
 
 import pyarrow as pa
-import pylibcudf as plc
 
 from polars.polars import _expr_nodes as pl_expr
 
+import pylibcudf as plc
+
 from cudf_polars.containers import Column
 from cudf_polars.dsl.expressions.base import ExecutionContext, Expr
 
diff --git a/python/cudf_polars/cudf_polars/dsl/expressions/literal.py b/python/cudf_polars/cudf_polars/dsl/expressions/literal.py
index c8aa993b994..c16313bf83c 100644
--- a/python/cudf_polars/cudf_polars/dsl/expressions/literal.py
+++ b/python/cudf_polars/cudf_polars/dsl/expressions/literal.py
@@ -9,6 +9,7 @@
 from typing import TYPE_CHECKING, Any
 
 import pyarrow as pa
+
 import pylibcudf as plc
 
 from cudf_polars.containers import Column
diff --git a/python/cudf_polars/cudf_polars/dsl/expressions/selection.py b/python/cudf_polars/cudf_polars/dsl/expressions/selection.py
index 0247256e507..77d7d4c0d22 100644
--- a/python/cudf_polars/cudf_polars/dsl/expressions/selection.py
+++ b/python/cudf_polars/cudf_polars/dsl/expressions/selection.py
@@ -9,6 +9,7 @@
 from typing import TYPE_CHECKING
 
 import pyarrow as pa
+
 import pylibcudf as plc
 
 from cudf_polars.containers import Column
diff --git a/python/cudf_polars/cudf_polars/dsl/expressions/string.py b/python/cudf_polars/cudf_polars/dsl/expressions/string.py
index 62b54c63a8d..8b66c9d4676 100644
--- a/python/cudf_polars/cudf_polars/dsl/expressions/string.py
+++ b/python/cudf_polars/cudf_polars/dsl/expressions/string.py
@@ -10,11 +10,12 @@
 
 import pyarrow as pa
 import pyarrow.compute as pc
-import pylibcudf as plc
 
 from polars.exceptions import InvalidOperationError
 from polars.polars import _expr_nodes as pl_expr
 
+import pylibcudf as plc
+
 from cudf_polars.containers import Column
 from cudf_polars.dsl.expressions.base import ExecutionContext, Expr
 from cudf_polars.dsl.expressions.literal import Literal, LiteralColumn
diff --git a/python/cudf_polars/cudf_polars/dsl/expressions/unary.py b/python/cudf_polars/cudf_polars/dsl/expressions/unary.py
index 53f6ed29239..6f22544c050 100644
--- a/python/cudf_polars/cudf_polars/dsl/expressions/unary.py
+++ b/python/cudf_polars/cudf_polars/dsl/expressions/unary.py
@@ -8,6 +8,7 @@
 from typing import TYPE_CHECKING, Any, ClassVar
 
 import pyarrow as pa
+
 import pylibcudf as plc
 
 from cudf_polars.containers import Column
diff --git a/python/cudf_polars/cudf_polars/dsl/ir.py b/python/cudf_polars/cudf_polars/dsl/ir.py
index 1aa6741d417..04aa74024cd 100644
--- a/python/cudf_polars/cudf_polars/dsl/ir.py
+++ b/python/cudf_polars/cudf_polars/dsl/ir.py
@@ -20,11 +20,12 @@
 from typing import TYPE_CHECKING, Any, ClassVar
 
 import pyarrow as pa
-import pylibcudf as plc
 from typing_extensions import assert_never
 
 import polars as pl
 
+import pylibcudf as plc
+
 import cudf_polars.dsl.expr as expr
 from cudf_polars.containers import Column, DataFrame
 from cudf_polars.dsl.nodebase import Node
diff --git a/python/cudf_polars/cudf_polars/dsl/to_ast.py b/python/cudf_polars/cudf_polars/dsl/to_ast.py
index ffdae81de55..9a0838631cc 100644
--- a/python/cudf_polars/cudf_polars/dsl/to_ast.py
+++ b/python/cudf_polars/cudf_polars/dsl/to_ast.py
@@ -8,11 +8,11 @@
 from functools import partial, reduce, singledispatch
 from typing import TYPE_CHECKING, TypeAlias
 
+from polars.polars import _expr_nodes as pl_expr
+
 import pylibcudf as plc
 from pylibcudf import expressions as plc_expr
 
-from polars.polars import _expr_nodes as pl_expr
-
 from cudf_polars.dsl import expr
 from cudf_polars.dsl.traversal import CachingVisitor
 from cudf_polars.typing import GenericTransformer
diff --git a/python/cudf_polars/cudf_polars/dsl/translate.py b/python/cudf_polars/cudf_polars/dsl/translate.py
index c28f2c2651a..5181214819e 100644
--- a/python/cudf_polars/cudf_polars/dsl/translate.py
+++ b/python/cudf_polars/cudf_polars/dsl/translate.py
@@ -12,13 +12,14 @@
 from typing import TYPE_CHECKING, Any
 
 import pyarrow as pa
-import pylibcudf as plc
 from typing_extensions import assert_never
 
 import polars as pl
 import polars.polars as plrs
 from polars.polars import _expr_nodes as pl_expr, _ir_nodes as pl_ir
 
+import pylibcudf as plc
+
 from cudf_polars.dsl import expr, ir
 from cudf_polars.dsl.traversal import make_recursive, reuse_if_unchanged
 from cudf_polars.typing import NodeTraverser
diff --git a/python/cudf_polars/cudf_polars/typing/__init__.py b/python/cudf_polars/cudf_polars/typing/__init__.py
index a27a3395c35..57c5fdaa7cf 100644
--- a/python/cudf_polars/cudf_polars/typing/__init__.py
+++ b/python/cudf_polars/cudf_polars/typing/__init__.py
@@ -8,10 +8,10 @@
 from collections.abc import Hashable, Mapping
 from typing import TYPE_CHECKING, Any, Literal, Protocol, TypeVar, Union
 
-import pylibcudf as plc
-
 from polars.polars import _expr_nodes as pl_expr, _ir_nodes as pl_ir
 
+import pylibcudf as plc
+
 if TYPE_CHECKING:
     from collections.abc import Callable
     from typing import TypeAlias
diff --git a/python/cudf_polars/cudf_polars/utils/dtypes.py b/python/cudf_polars/cudf_polars/utils/dtypes.py
index 4154a404e98..1d0479802ca 100644
--- a/python/cudf_polars/cudf_polars/utils/dtypes.py
+++ b/python/cudf_polars/cudf_polars/utils/dtypes.py
@@ -8,11 +8,12 @@
 from functools import cache
 
 import pyarrow as pa
-import pylibcudf as plc
 from typing_extensions import assert_never
 
 import polars as pl
 
+import pylibcudf as plc
+
 __all__ = ["from_polars", "downcast_arrow_lists", "can_cast"]
 
 
diff --git a/python/cudf_polars/cudf_polars/utils/versions.py b/python/cudf_polars/cudf_polars/utils/versions.py
index 4a7ad6b3cf2..a119cab3b74 100644
--- a/python/cudf_polars/cudf_polars/utils/versions.py
+++ b/python/cudf_polars/cudf_polars/utils/versions.py
@@ -12,11 +12,12 @@
 
 POLARS_VERSION = parse(__version__)
 
-POLARS_VERSION_LT_18 = POLARS_VERSION < parse("1.8")
+POLARS_VERSION_LT_111 = POLARS_VERSION < parse("1.11")
+POLARS_VERSION_LT_112 = POLARS_VERSION < parse("1.12")
 
 
 def _ensure_polars_version():
-    if POLARS_VERSION_LT_18:
+    if POLARS_VERSION_LT_111:
         raise ImportError(
-            "cudf_polars requires py-polars v1.8 or greater."
+            "cudf_polars requires py-polars v1.11 or greater."
         )  # pragma: no cover
diff --git a/python/cudf_polars/pyproject.toml b/python/cudf_polars/pyproject.toml
index 2afdab1be4b..a2c62ef9460 100644
--- a/python/cudf_polars/pyproject.toml
+++ b/python/cudf_polars/pyproject.toml
@@ -19,7 +19,7 @@ authors = [
 license = { text = "Apache 2.0" }
 requires-python = ">=3.10"
 dependencies = [
-    "polars>=1.11,<1.12",
+    "polars>=1.11,<1.13",
     "pylibcudf==24.12.*,>=0.0.0a0",
 ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`.
 classifiers = [
@@ -188,7 +188,7 @@ required-imports = ["from __future__ import annotations"]
 
 [tool.ruff.lint.isort.sections]
 polars = ["polars"]
-rapids = ["rmm", "cudf"]
+rapids = ["rmm", "pylibcudf"]
 
 [tool.ruff.format]
 docstring-code-format = true
diff --git a/python/cudf_polars/tests/containers/test_column.py b/python/cudf_polars/tests/containers/test_column.py
index 1f26ab1af9f..95541b4ecc3 100644
--- a/python/cudf_polars/tests/containers/test_column.py
+++ b/python/cudf_polars/tests/containers/test_column.py
@@ -4,9 +4,10 @@
 from __future__ import annotations
 
 import pyarrow
-import pylibcudf as plc
 import pytest
 
+import pylibcudf as plc
+
 from cudf_polars.containers import Column
 
 
diff --git a/python/cudf_polars/tests/containers/test_dataframe.py b/python/cudf_polars/tests/containers/test_dataframe.py
index 5c68fb8f0aa..d68c8d90163 100644
--- a/python/cudf_polars/tests/containers/test_dataframe.py
+++ b/python/cudf_polars/tests/containers/test_dataframe.py
@@ -3,11 +3,12 @@
 
 from __future__ import annotations
 
-import pylibcudf as plc
 import pytest
 
 import polars as pl
 
+import pylibcudf as plc
+
 from cudf_polars.containers import Column, DataFrame
 from cudf_polars.testing.asserts import assert_gpu_result_equal
 
diff --git a/python/cudf_polars/tests/dsl/test_expr.py b/python/cudf_polars/tests/dsl/test_expr.py
index 84e33262869..de8fec301fe 100644
--- a/python/cudf_polars/tests/dsl/test_expr.py
+++ b/python/cudf_polars/tests/dsl/test_expr.py
@@ -3,9 +3,10 @@
 
 from __future__ import annotations
 
-import pylibcudf as plc
 import pytest
 
+import pylibcudf as plc
+
 from cudf_polars.dsl import expr
 
 
diff --git a/python/cudf_polars/tests/dsl/test_to_ast.py b/python/cudf_polars/tests/dsl/test_to_ast.py
index a7b779a6ec9..57d794d4890 100644
--- a/python/cudf_polars/tests/dsl/test_to_ast.py
+++ b/python/cudf_polars/tests/dsl/test_to_ast.py
@@ -3,12 +3,13 @@
 
 from __future__ import annotations
 
-import pylibcudf as plc
 import pytest
 
 import polars as pl
 from polars.testing import assert_frame_equal
 
+import pylibcudf as plc
+
 import cudf_polars.dsl.ir as ir_nodes
 from cudf_polars import translate_ir
 from cudf_polars.containers.dataframe import DataFrame, NamedColumn
diff --git a/python/cudf_polars/tests/dsl/test_traversal.py b/python/cudf_polars/tests/dsl/test_traversal.py
index 6505a786855..15c644d7978 100644
--- a/python/cudf_polars/tests/dsl/test_traversal.py
+++ b/python/cudf_polars/tests/dsl/test_traversal.py
@@ -5,11 +5,11 @@
 
 from functools import singledispatch
 
-import pylibcudf as plc
-
 import polars as pl
 from polars.testing import assert_frame_equal
 
+import pylibcudf as plc
+
 from cudf_polars import translate_ir
 from cudf_polars.dsl import expr, ir
 from cudf_polars.dsl.traversal import (
diff --git a/python/cudf_polars/tests/expressions/test_literal.py b/python/cudf_polars/tests/expressions/test_literal.py
index ced49bdc254..52bc4a9ac71 100644
--- a/python/cudf_polars/tests/expressions/test_literal.py
+++ b/python/cudf_polars/tests/expressions/test_literal.py
@@ -2,11 +2,12 @@
 # SPDX-License-Identifier: Apache-2.0
 from __future__ import annotations
 
-import pylibcudf as plc
 import pytest
 
 import polars as pl
 
+import pylibcudf as plc
+
 from cudf_polars.testing.asserts import (
     assert_gpu_result_equal,
     assert_ir_translation_raises,
diff --git a/python/cudf_polars/tests/expressions/test_sort.py b/python/cudf_polars/tests/expressions/test_sort.py
index 2a37683478b..62df8ce1498 100644
--- a/python/cudf_polars/tests/expressions/test_sort.py
+++ b/python/cudf_polars/tests/expressions/test_sort.py
@@ -4,11 +4,12 @@
 
 import itertools
 
-import pylibcudf as plc
 import pytest
 
 import polars as pl
 
+import pylibcudf as plc
+
 from cudf_polars import translate_ir
 from cudf_polars.testing.asserts import assert_gpu_result_equal
 
diff --git a/python/cudf_polars/tests/test_join.py b/python/cudf_polars/tests/test_join.py
index 501560d15b8..8ca7a7b9264 100644
--- a/python/cudf_polars/tests/test_join.py
+++ b/python/cudf_polars/tests/test_join.py
@@ -13,6 +13,7 @@
     assert_gpu_result_equal,
     assert_ir_translation_raises,
 )
+from cudf_polars.utils.versions import POLARS_VERSION_LT_112
 
 
 @pytest.fixture(params=[False, True], ids=["nulls_not_equal", "nulls_equal"])
@@ -88,7 +89,7 @@ def test_left_join_with_slice(left, right, join_nulls, zlice):
     if zlice is not None:
         q_expect = q.collect().slice(*zlice)
         q = q.slice(*zlice)
-        if zlice == (1, 5) or zlice == (0, 2):
+        if POLARS_VERSION_LT_112 and (zlice == (1, 5) or zlice == (0, 2)):
             # https://github.com/pola-rs/polars/issues/19403
             # https://github.com/pola-rs/polars/issues/19405
             ctx = pytest.raises(AssertionError)
diff --git a/python/cudf_polars/tests/utils/test_broadcast.py b/python/cudf_polars/tests/utils/test_broadcast.py
index e7770bfadac..3b3b4f0f8db 100644
--- a/python/cudf_polars/tests/utils/test_broadcast.py
+++ b/python/cudf_polars/tests/utils/test_broadcast.py
@@ -3,9 +3,10 @@
 
 from __future__ import annotations
 
-import pylibcudf as plc
 import pytest
 
+import pylibcudf as plc
+
 from cudf_polars.containers import Column
 from cudf_polars.dsl.ir import broadcast
 

From b5b47fe24480c3a57e58ec6646db03034d8f5d4a Mon Sep 17 00:00:00 2001
From: James Lamb <jlamb@nvidia.com>
Date: Thu, 31 Oct 2024 17:30:59 -0500
Subject: [PATCH 32/41] use rapids-generate-pip-constraints to pin to oldest
 dependencies in CI (#17131)

Follow-up to https://github.com/rapidsai/cudf/pull/16570#discussion_r1735300231

Proposes using the new `rapids-generate-pip-constraints` tool from `gha-tools` to generate a list of pip constraints pinning to the oldest supported versions of dependencies here.

## Notes for Reviewers

### How I tested this

https://github.com/rapidsai/gha-tools/pull/114#issuecomment-2445377824

You can also see one of the most recent `wheel-tests-cudf` builds here:

* oldest-deps: numpy 1.x ([build link](https://github.com/rapidsai/cudf/actions/runs/11615430314/job/32347576688?pr=17131))
* latest-deps: numpy 2.x ([build link](https://github.com/rapidsai/cudf/actions/runs/11615430314/job/32347577095?pr=17131))

#

Authors:
  - James Lamb (https://github.com/jameslamb)

Approvers:
  - Bradley Dice (https://github.com/bdice)

URL: https://github.com/rapidsai/cudf/pull/17131
---
 ci/cudf_pandas_scripts/run_tests.sh | 11 ++---------
 ci/test_wheel_cudf.sh               | 11 ++---------
 ci/test_wheel_cudf_polars.sh        | 12 +++---------
 ci/test_wheel_dask_cudf.sh          | 12 +++---------
 4 files changed, 10 insertions(+), 36 deletions(-)

diff --git a/ci/cudf_pandas_scripts/run_tests.sh b/ci/cudf_pandas_scripts/run_tests.sh
index f6bdc6f9484..61361fffb07 100755
--- a/ci/cudf_pandas_scripts/run_tests.sh
+++ b/ci/cudf_pandas_scripts/run_tests.sh
@@ -54,15 +54,8 @@ else
     RAPIDS_PY_WHEEL_NAME="libcudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 cpp ./dist
     RAPIDS_PY_WHEEL_NAME="pylibcudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 python ./dist
 
-    echo "" > ./constraints.txt
-    if [[ $RAPIDS_DEPENDENCIES == "oldest" ]]; then
-        # `test_python_cudf_pandas` constraints are for `[test]` not `[cudf-pandas-tests]`
-        rapids-dependency-file-generator \
-            --output requirements \
-            --file-key test_python_cudf_pandas \
-            --matrix "cuda=${RAPIDS_CUDA_VERSION%.*};arch=$(arch);py=${RAPIDS_PY_VERSION};dependencies=${RAPIDS_DEPENDENCIES}" \
-        | tee ./constraints.txt
-    fi
+    # generate constraints (possibly pinning to oldest support versions of dependencies)
+    rapids-generate-pip-constraints test_python_cudf_pandas ./constraints.txt
 
     python -m pip install \
         -v \
diff --git a/ci/test_wheel_cudf.sh b/ci/test_wheel_cudf.sh
index a701bfe15e0..ce12744c9e3 100755
--- a/ci/test_wheel_cudf.sh
+++ b/ci/test_wheel_cudf.sh
@@ -12,15 +12,8 @@ RAPIDS_PY_WHEEL_NAME="pylibcudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels
 
 rapids-logger "Install cudf, pylibcudf, and test requirements"
 
-# Constrain to minimum dependency versions if job is set up as "oldest"
-echo "" > ./constraints.txt
-if [[ $RAPIDS_DEPENDENCIES == "oldest" ]]; then
-    rapids-dependency-file-generator \
-        --output requirements \
-        --file-key py_test_cudf \
-        --matrix "cuda=${RAPIDS_CUDA_VERSION%.*};arch=$(arch);py=${RAPIDS_PY_VERSION};dependencies=${RAPIDS_DEPENDENCIES}" \
-      | tee ./constraints.txt
-fi
+# generate constraints (possibly pinning to oldest support versions of dependencies)
+rapids-generate-pip-constraints py_test_cudf ./constraints.txt
 
 # echo to expand wildcard before adding `[extra]` requires for pip
 python -m pip install \
diff --git a/ci/test_wheel_cudf_polars.sh b/ci/test_wheel_cudf_polars.sh
index 05f882a475b..2884757e46b 100755
--- a/ci/test_wheel_cudf_polars.sh
+++ b/ci/test_wheel_cudf_polars.sh
@@ -29,15 +29,9 @@ RAPIDS_PY_WHEEL_NAME="libcudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-f
 RAPIDS_PY_WHEEL_NAME="pylibcudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 python ./dist
 
 rapids-logger "Installing cudf_polars and its dependencies"
-# Constraint to minimum dependency versions if job is set up as "oldest"
-echo "" > ./constraints.txt
-if [[ $RAPIDS_DEPENDENCIES == "oldest" ]]; then
-    rapids-dependency-file-generator \
-        --output requirements \
-        --file-key py_test_cudf_polars \
-        --matrix "cuda=${RAPIDS_CUDA_VERSION%.*};arch=$(arch);py=${RAPIDS_PY_VERSION};dependencies=${RAPIDS_DEPENDENCIES}" \
-      | tee ./constraints.txt
-fi
+
+# generate constraints (possibly pinning to oldest support versions of dependencies)
+rapids-generate-pip-constraints py_test_cudf_polars ./constraints.txt
 
 # echo to expand wildcard before adding `[test]` requires for pip
 python -m pip install \
diff --git a/ci/test_wheel_dask_cudf.sh b/ci/test_wheel_dask_cudf.sh
index 361a42ccda9..e15949f4bdb 100755
--- a/ci/test_wheel_dask_cudf.sh
+++ b/ci/test_wheel_dask_cudf.sh
@@ -12,15 +12,9 @@ RAPIDS_PY_WHEEL_NAME="libcudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-f
 RAPIDS_PY_WHEEL_NAME="pylibcudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 python ./dist
 
 rapids-logger "Install dask_cudf, cudf, pylibcudf, and test requirements"
-# Constraint to minimum dependency versions if job is set up as "oldest"
-echo "" > ./constraints.txt
-if [[ $RAPIDS_DEPENDENCIES == "oldest" ]]; then
-    rapids-dependency-file-generator \
-        --output requirements \
-        --file-key py_test_dask_cudf \
-        --matrix "cuda=${RAPIDS_CUDA_VERSION%.*};arch=$(arch);py=${RAPIDS_PY_VERSION};dependencies=${RAPIDS_DEPENDENCIES}" \
-      | tee ./constraints.txt
-fi
+
+# generate constraints (possibly pinning to oldest support versions of dependencies)
+rapids-generate-pip-constraints py_test_dask_cudf ./constraints.txt
 
 # echo to expand wildcard before adding `[extra]` requires for pip
 python -m pip install \

From 0a8728425d53866a7bd201524a8f3e32b64ad16b Mon Sep 17 00:00:00 2001
From: Matthew Murray <41342305+Matt711@users.noreply.github.com>
Date: Fri, 1 Nov 2024 09:40:00 -0400
Subject: [PATCH 33/41] Expose streams in public round APIs (#16925)

Contributes to #13744

Authors:
  - Matthew Murray (https://github.com/Matt711)

Approvers:
  - Nghia Truong (https://github.com/ttnghia)
  - Bradley Dice (https://github.com/bdice)

URL: https://github.com/rapidsai/cudf/pull/16925
---
 cpp/include/cudf/round.hpp       |  3 +++
 cpp/src/round/round.cu           |  3 ++-
 cpp/tests/CMakeLists.txt         |  1 +
 cpp/tests/streams/round_test.cpp | 40 ++++++++++++++++++++++++++++++++
 4 files changed, 46 insertions(+), 1 deletion(-)
 create mode 100644 cpp/tests/streams/round_test.cpp

diff --git a/cpp/include/cudf/round.hpp b/cpp/include/cudf/round.hpp
index ba56ff34b97..158e6df7e5f 100644
--- a/cpp/include/cudf/round.hpp
+++ b/cpp/include/cudf/round.hpp
@@ -17,6 +17,7 @@
 #pragma once
 
 #include <cudf/column/column.hpp>
+#include <cudf/utilities/default_stream.hpp>
 #include <cudf/utilities/export.hpp>
 #include <cudf/utilities/memory_resource.hpp>
 
@@ -66,6 +67,7 @@ enum class rounding_method : int32_t { HALF_UP, HALF_EVEN };
  * @param decimal_places Number of decimal places to round to (default 0). If negative, this
  * specifies the number of positions to the left of the decimal point.
  * @param method         Rounding method
+ * @param stream         CUDA stream used for device memory operations and kernel launches
  * @param mr             Device memory resource used to allocate the returned column's device memory
  *
  * @return Column with each of the values rounded
@@ -74,6 +76,7 @@ std::unique_ptr<column> round(
   column_view const& input,
   int32_t decimal_places            = 0,
   rounding_method method            = rounding_method::HALF_UP,
+  rmm::cuda_stream_view stream      = cudf::get_default_stream(),
   rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
 /** @} */  // end of group
diff --git a/cpp/src/round/round.cu b/cpp/src/round/round.cu
index 8988d73fb02..332c440aea9 100644
--- a/cpp/src/round/round.cu
+++ b/cpp/src/round/round.cu
@@ -358,10 +358,11 @@ std::unique_ptr<column> round(column_view const& input,
 std::unique_ptr<column> round(column_view const& input,
                               int32_t decimal_places,
                               rounding_method method,
+                              rmm::cuda_stream_view stream,
                               rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
-  return detail::round(input, decimal_places, method, cudf::get_default_stream(), mr);
+  return detail::round(input, decimal_places, method, stream, mr);
 }
 
 }  // namespace cudf
diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt
index a5e1cf646b4..6d3d1454462 100644
--- a/cpp/tests/CMakeLists.txt
+++ b/cpp/tests/CMakeLists.txt
@@ -716,6 +716,7 @@ ConfigureTest(STREAM_REDUCTION_TEST streams/reduction_test.cpp STREAM_MODE testi
 ConfigureTest(STREAM_REPLACE_TEST streams/replace_test.cpp STREAM_MODE testing)
 ConfigureTest(STREAM_RESHAPE_TEST streams/reshape_test.cpp STREAM_MODE testing)
 ConfigureTest(STREAM_ROLLING_TEST streams/rolling_test.cpp STREAM_MODE testing)
+ConfigureTest(STREAM_ROUND_TEST streams/round_test.cpp STREAM_MODE testing)
 ConfigureTest(STREAM_SEARCH_TEST streams/search_test.cpp STREAM_MODE testing)
 ConfigureTest(STREAM_SORTING_TEST streams/sorting_test.cpp STREAM_MODE testing)
 ConfigureTest(STREAM_STREAM_COMPACTION_TEST streams/stream_compaction_test.cpp STREAM_MODE testing)
diff --git a/cpp/tests/streams/round_test.cpp b/cpp/tests/streams/round_test.cpp
new file mode 100644
index 00000000000..b8fda022db8
--- /dev/null
+++ b/cpp/tests/streams/round_test.cpp
@@ -0,0 +1,40 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <cudf_test/base_fixture.hpp>
+#include <cudf_test/column_wrapper.hpp>
+#include <cudf_test/default_stream.hpp>
+
+#include <cudf/column/column_view.hpp>
+#include <cudf/round.hpp>
+
+#include <vector>
+
+class RoundTest : public cudf::test::BaseFixture {};
+
+TEST_F(RoundTest, RoundHalfToEven)
+{
+  std::vector<double> vals = {1.729, 17.29, 172.9, 1729};
+  cudf::test::fixed_width_column_wrapper<double> input(vals.begin(), vals.end());
+  cudf::round(input, 0, cudf::rounding_method::HALF_UP, cudf::test::get_default_stream());
+}
+
+TEST_F(RoundTest, RoundHalfAwayFromEven)
+{
+  std::vector<double> vals = {1.5, 2.5, 1.35, 1.45, 15, 25};
+  cudf::test::fixed_width_column_wrapper<double> input(vals.begin(), vals.end());
+  cudf::round(input, -1, cudf::rounding_method::HALF_EVEN, cudf::test::get_default_stream());
+}

From 8219d28161e4d193b44e1fd5d0d0417812ca6892 Mon Sep 17 00:00:00 2001
From: Tianyu Liu <kingcrimsontianyu@gmail.com>
Date: Fri, 1 Nov 2024 11:10:48 -0400
Subject: [PATCH 34/41] Minor I/O code quality improvements (#17105)

This PR makes small improvements for the I/O code. Specifically,
- Place a type constraint on a template class so that it accepts only rvalue arguments. In addition, replace `std::move` with `std::forward` to make the code more *apparently* consistent with the convention, i.e. use `std::move()` on rvalue references and `std::forward` on forwarding references (Effective Modern C++, Item 25).
- Alleviate (but not completely resolve) an existing cuFile driver close issue by removing the explicit driver close call. See #17121
- Minor typo fix (`struct` → `class`).

Authors:
  - Tianyu Liu (https://github.com/kingcrimsontianyu)

Approvers:
  - Nghia Truong (https://github.com/ttnghia)
  - Vukasin Milovanovic (https://github.com/vuule)

URL: https://github.com/rapidsai/cudf/pull/17105
---
 cpp/include/cudf/io/datasource.hpp         | 21 ++++++++++++++-------
 cpp/src/io/utilities/file_io_utilities.cpp |  6 +++++-
 cpp/src/io/utilities/file_io_utilities.hpp |  2 +-
 3 files changed, 20 insertions(+), 9 deletions(-)

diff --git a/cpp/include/cudf/io/datasource.hpp b/cpp/include/cudf/io/datasource.hpp
index 7d2cc4ad493..7bec40893fd 100644
--- a/cpp/include/cudf/io/datasource.hpp
+++ b/cpp/include/cudf/io/datasource.hpp
@@ -79,7 +79,7 @@ class datasource {
     template <typename Container>
     static std::unique_ptr<buffer> create(Container&& data_owner)
     {
-      return std::make_unique<owning_buffer<Container>>(std::move(data_owner));
+      return std::make_unique<owning_buffer<Container>>(std::forward<Container>(data_owner));
     }
   };
 
@@ -335,13 +335,19 @@ class datasource {
   template <typename Container>
   class owning_buffer : public buffer {
    public:
+    // Require that the argument passed to the constructor be an rvalue (Container&& being an rvalue
+    // reference).
+    static_assert(std::is_rvalue_reference_v<Container&&>,
+                  "The container argument passed to the constructor must be an rvalue.");
+
     /**
      * @brief Moves the input container into the newly created object.
      *
-     * @param data_owner The container to construct the buffer from (ownership is transferred)
+     * @param moved_data_owner The container to construct the buffer from. Callers should explicitly
+     * pass std::move(data_owner) to this function to transfer the ownership.
      */
-    owning_buffer(Container&& data_owner)
-      : _data(std::move(data_owner)), _data_ptr(_data.data()), _size(_data.size())
+    owning_buffer(Container&& moved_data_owner)
+      : _data(std::move(moved_data_owner)), _data_ptr(_data.data()), _size(_data.size())
     {
     }
 
@@ -349,12 +355,13 @@ class datasource {
      * @brief Moves the input container into the newly created object, and exposes a subspan of the
      * buffer.
      *
-     * @param data_owner The container to construct the buffer from (ownership is transferred)
+     * @param moved_data_owner The container to construct the buffer from. Callers should explicitly
+     * pass std::move(data_owner) to this function to transfer the ownership.
      * @param data_ptr Pointer to the start of the subspan
      * @param size The size of the subspan
      */
-    owning_buffer(Container&& data_owner, uint8_t const* data_ptr, size_t size)
-      : _data(std::move(data_owner)), _data_ptr(data_ptr), _size(size)
+    owning_buffer(Container&& moved_data_owner, uint8_t const* data_ptr, size_t size)
+      : _data(std::move(moved_data_owner)), _data_ptr(data_ptr), _size(size)
     {
     }
 
diff --git a/cpp/src/io/utilities/file_io_utilities.cpp b/cpp/src/io/utilities/file_io_utilities.cpp
index 93cdccfbb9f..cf19bc591cc 100644
--- a/cpp/src/io/utilities/file_io_utilities.cpp
+++ b/cpp/src/io/utilities/file_io_utilities.cpp
@@ -108,7 +108,11 @@ class cufile_shim {
 
   ~cufile_shim()
   {
-    if (driver_close != nullptr) driver_close();
+    // Explicit cuFile driver close should not be performed here to avoid segfault. However, in the
+    // absence of driver_close(), cuFile will implicitly do that, which in most cases causes
+    // segfault anyway. TODO: Revisit this conundrum once cuFile is fixed.
+    // https://github.com/rapidsai/cudf/issues/17121
+
     if (cf_lib != nullptr) dlclose(cf_lib);
   }
 
diff --git a/cpp/src/io/utilities/file_io_utilities.hpp b/cpp/src/io/utilities/file_io_utilities.hpp
index 7e47b5b3d10..584b6213fa3 100644
--- a/cpp/src/io/utilities/file_io_utilities.hpp
+++ b/cpp/src/io/utilities/file_io_utilities.hpp
@@ -104,7 +104,7 @@ class cufile_shim;
 /**
  * @brief Class that provides RAII for cuFile file registration.
  */
-struct cufile_registered_file {
+class cufile_registered_file {
   void register_handle();
 
  public:

From 6ce9ea4fa53aa6a5e6c55bc01a9f449d2558beb8 Mon Sep 17 00:00:00 2001
From: Tianyu Liu <kingcrimsontianyu@gmail.com>
Date: Fri, 1 Nov 2024 15:06:16 -0400
Subject: [PATCH 35/41] Change default KvikIO parameters in cuDF: set the
 thread pool size to 4, and compatibility mode to ON (#17185)

This PR adjusts the default KvikIO parameters in light of recent discussions.
- Set KvikIO compatibility mode to ON (previously unspecified). This is to avoid the overhead of KvikIO validating the cuFile library when most of the time clients are not using cuFile/GDS.
- Set KvikIO thread pool size to 4 (previously 8). See the reason below.

In addition, this PR updates the documentation on `LIBCUDF_CUFILE_POLICY`.

---
It is reported that Dask-cuDF on an 8-GPU node with a Lustre file system has a major performance regression when the KvikIO thread pool size is 8.

|KVIKIO_NTHREADS| 8 | 4 | 2 | 1 |
|----------------------------|---|----|---|----------|
|Dask-cuDF time [s]| 16 | 3.9 | 4.0 | 4.3 |
|cuDF time [s]| 3.4 | 3.4 | 3.8 | 4.9 |

Additional benchmark on Grace Hopper ([Parquet](https://docs.google.com/spreadsheets/d/1ZxuFTcu67kMVpESHwT0Cr-CAeAP7YmLDrcHxNTt22aU), [CSV](https://docs.google.com/spreadsheets/d/1yFLO-cdxG6jjPwHMtoUbPGMXilRaglush2U6KdrEAvA)) indicates no performance regression by switching the thread pool size from 8 to 4. For the time being, we choose 4 as an empirical sweet spot.

Closes #16512

Authors:
  - Tianyu Liu (https://github.com/kingcrimsontianyu)

Approvers:
  - Bradley Dice (https://github.com/bdice)
  - Nghia Truong (https://github.com/ttnghia)
  - Vukasin Milovanovic (https://github.com/vuule)

URL: https://github.com/rapidsai/cudf/pull/17185
---
 cpp/include/cudf/io/config_utils.hpp  |  9 ++++++---
 cpp/src/io/utilities/config_utils.cpp |  7 +++++--
 cpp/src/io/utilities/data_sink.cpp    |  2 +-
 cpp/src/io/utilities/datasource.cpp   |  2 +-
 docs/cudf/source/user_guide/io/io.md  | 24 ++++++++++++++++++------
 5 files changed, 31 insertions(+), 13 deletions(-)

diff --git a/cpp/include/cudf/io/config_utils.hpp b/cpp/include/cudf/io/config_utils.hpp
index 13a76d50346..070b59a117c 100644
--- a/cpp/include/cudf/io/config_utils.hpp
+++ b/cpp/include/cudf/io/config_utils.hpp
@@ -37,10 +37,13 @@ bool is_gds_enabled();
 bool is_kvikio_enabled();
 
 /**
- * @brief Set kvikIO thread pool size according to the environment variable KVIKIO_NTHREADS. If
- * KVIKIO_NTHREADS is not set, use 8 threads by default.
+ * @brief Set KvikIO parameters, including:
+ * - Compatibility mode, according to the environment variable KVIKIO_COMPAT_MODE. If
+ *   KVIKIO_COMPAT_MODE is not set, enable it by default, which enforces the use of POSIX I/O.
+ * - Thread pool size, according to the environment variable KVIKIO_NTHREADS. If KVIKIO_NTHREADS is
+ *   not set, use 4 threads by default.
  */
-void set_thread_pool_nthreads_from_env();
+void set_up_kvikio();
 
 }  // namespace cufile_integration
 
diff --git a/cpp/src/io/utilities/config_utils.cpp b/cpp/src/io/utilities/config_utils.cpp
index b66742569d9..3307b4fa539 100644
--- a/cpp/src/io/utilities/config_utils.cpp
+++ b/cpp/src/io/utilities/config_utils.cpp
@@ -52,11 +52,14 @@ bool is_gds_enabled() { return is_always_enabled() or get_env_policy() == usage_
 
 bool is_kvikio_enabled() { return get_env_policy() == usage_policy::KVIKIO; }
 
-void set_thread_pool_nthreads_from_env()
+void set_up_kvikio()
 {
   static std::once_flag flag{};
   std::call_once(flag, [] {
-    auto nthreads = getenv_or<unsigned int>("KVIKIO_NTHREADS", 8U);
+    auto const compat_mode = kvikio::detail::getenv_or<bool>("KVIKIO_COMPAT_MODE", true);
+    kvikio::defaults::compat_mode_reset(compat_mode);
+
+    auto const nthreads = getenv_or<unsigned int>("KVIKIO_NTHREADS", 4u);
     kvikio::defaults::thread_pool_nthreads_reset(nthreads);
   });
 }
diff --git a/cpp/src/io/utilities/data_sink.cpp b/cpp/src/io/utilities/data_sink.cpp
index a8a275919d8..15de5d85614 100644
--- a/cpp/src/io/utilities/data_sink.cpp
+++ b/cpp/src/io/utilities/data_sink.cpp
@@ -42,7 +42,7 @@ class file_sink : public data_sink {
     if (!_output_stream.is_open()) { detail::throw_on_file_open_failure(filepath, true); }
 
     if (cufile_integration::is_kvikio_enabled()) {
-      cufile_integration::set_thread_pool_nthreads_from_env();
+      cufile_integration::set_up_kvikio();
       _kvikio_file = kvikio::FileHandle(filepath, "w");
       CUDF_LOG_INFO("Writing a file using kvikIO, with compatibility mode {}.",
                     _kvikio_file.is_compat_mode_on() ? "on" : "off");
diff --git a/cpp/src/io/utilities/datasource.cpp b/cpp/src/io/utilities/datasource.cpp
index 9668b30e9a9..15a4a270ce0 100644
--- a/cpp/src/io/utilities/datasource.cpp
+++ b/cpp/src/io/utilities/datasource.cpp
@@ -48,7 +48,7 @@ class file_source : public datasource {
   {
     detail::force_init_cuda_context();
     if (cufile_integration::is_kvikio_enabled()) {
-      cufile_integration::set_thread_pool_nthreads_from_env();
+      cufile_integration::set_up_kvikio();
       _kvikio_file = kvikio::FileHandle(filepath);
       CUDF_LOG_INFO("Reading a file using kvikIO, with compatibility mode {}.",
                     _kvikio_file.is_compat_mode_on() ? "on" : "off");
diff --git a/docs/cudf/source/user_guide/io/io.md b/docs/cudf/source/user_guide/io/io.md
index 97b961b455b..62db062cc45 100644
--- a/docs/cudf/source/user_guide/io/io.md
+++ b/docs/cudf/source/user_guide/io/io.md
@@ -91,16 +91,28 @@ SDK is available for download
 [here](https://developer.nvidia.com/gpudirect-storage). GDS is also
 included in CUDA Toolkit 11.4 and higher.
 
-Use of GPUDirect Storage in cuDF is enabled by default, but can be
-disabled through the environment variable `LIBCUDF_CUFILE_POLICY`.
+Use of GPUDirect Storage in cuDF is disabled by default, but can be
+enabled through the environment variable `LIBCUDF_CUFILE_POLICY`.
 This variable also controls the GDS compatibility mode.
 
 There are four valid values for the environment variable:
 
-- "GDS": Enable GDS use; GDS compatibility mode is *off*.
-- "ALWAYS": Enable GDS use; GDS compatibility mode is *on*.
-- "KVIKIO": Enable GDS through [KvikIO](https://github.com/rapidsai/kvikio).
-- "OFF": Completely disable GDS use.
+- "GDS": Enable GDS use. If the cuFile library cannot be properly loaded,
+fall back to the GDS compatibility mode.
+- "ALWAYS": Enable GDS use. If the cuFile library cannot be properly loaded,
+throw an exception.
+- "KVIKIO": Enable GDS compatibility mode through [KvikIO](https://github.com/rapidsai/kvikio).
+Note that KvikIO also provides the environment variable `KVIKIO_COMPAT_MODE` for GDS
+control that may alter the effect of the "KVIKIO" option in cuDF:
+  - By default, `KVIKIO_COMPAT_MODE` is unset. In this case, cuDF enforces
+    the GDS compatibility mode, and the system configuration check for GDS I/O
+    is never performed.
+  - If `KVIKIO_COMPAT_MODE=ON`, this is the same as the above case.
+  - If `KVIKIO_COMPAT_MODE=OFF`, KvikIO enforces GDS I/O without system
+    configuration check, and will error out if GDS requirements are not met. The
+    only exceptional case is that if the system does not support files being
+    opened with the `O_DIRECT` flag, the GDS compatibility mode will be used.
+- "OFF": Completely disable GDS and kvikIO use.
 
 If no value is set, behavior will be the same as the "KVIKIO" option.
 

From 3d07509deb9f589e1f986dc7f822392467ffcdde Mon Sep 17 00:00:00 2001
From: Vukasin Milovanovic <vmilovanovic@nvidia.com>
Date: Fri, 1 Nov 2024 19:50:57 -0700
Subject: [PATCH 36/41] Add `num_iterations` axis to the multi-threaded Parquet
 benchmarks (#17231)

Added an axis that controls the number of times each thread reads its input. Running with a higher number of iterations should better show how work from different threads pipelines.
The new axis, "num_iterations", is added to all multi-threaded Parquet reader benchmarks.

Authors:
  - Vukasin Milovanovic (https://github.com/vuule)

Approvers:
  - Muhammad Haseeb (https://github.com/mhaseeb123)
  - Paul Mattione (https://github.com/pmattione-nvidia)

URL: https://github.com/rapidsai/cudf/pull/17231
---
 .../io/parquet/parquet_reader_multithread.cpp | 57 ++++++++++++-------
 1 file changed, 38 insertions(+), 19 deletions(-)

diff --git a/cpp/benchmarks/io/parquet/parquet_reader_multithread.cpp b/cpp/benchmarks/io/parquet/parquet_reader_multithread.cpp
index 7121cb9f034..bf7039269bc 100644
--- a/cpp/benchmarks/io/parquet/parquet_reader_multithread.cpp
+++ b/cpp/benchmarks/io/parquet/parquet_reader_multithread.cpp
@@ -45,6 +45,7 @@ std::string get_label(std::string const& test_name, nvbench::state const& state)
   auto const num_cols       = state.get_int64("num_cols");
   size_t const read_size_mb = get_read_size(state) / (1024 * 1024);
   return {test_name + ", " + std::to_string(num_cols) + " columns, " +
+          std::to_string(state.get_int64("num_iterations")) + " iterations, " +
           std::to_string(state.get_int64("num_threads")) + " threads " + " (" +
           std::to_string(read_size_mb) + " MB each)"};
 }
@@ -90,9 +91,10 @@ void BM_parquet_multithreaded_read_common(nvbench::state& state,
                                           std::vector<cudf::type_id> const& d_types,
                                           std::string const& label)
 {
-  size_t const data_size = state.get_int64("total_data_size");
-  auto const num_threads = state.get_int64("num_threads");
-  auto const source_type = retrieve_io_type_enum(state.get_string("io_type"));
+  size_t const data_size    = state.get_int64("total_data_size");
+  auto const num_threads    = state.get_int64("num_threads");
+  auto const num_iterations = state.get_int64("num_iterations");
+  auto const source_type    = retrieve_io_type_enum(state.get_string("io_type"));
 
   auto streams = cudf::detail::fork_streams(cudf::get_default_stream(), num_threads);
   BS::thread_pool threads(num_threads);
@@ -109,12 +111,15 @@ void BM_parquet_multithreaded_read_common(nvbench::state& state,
 
   nvtxRangePushA(("(read) " + label).c_str());
   state.exec(nvbench::exec_tag::sync | nvbench::exec_tag::timer,
-             [&](nvbench::launch& launch, auto& timer) {
+             [&, num_files = num_files](nvbench::launch& launch, auto& timer) {
                auto read_func = [&](int index) {
                  auto const stream = streams[index % num_threads];
                  cudf::io::parquet_reader_options read_opts =
                    cudf::io::parquet_reader_options::builder(source_info_vector[index]);
-                 cudf::io::read_parquet(read_opts, stream, cudf::get_current_device_resource_ref());
+                 for (int i = 0; i < num_iterations; ++i) {
+                   cudf::io::read_parquet(
+                     read_opts, stream, cudf::get_current_device_resource_ref());
+                 }
                };
 
                threads.pause();
@@ -128,7 +133,8 @@ void BM_parquet_multithreaded_read_common(nvbench::state& state,
   nvtxRangePop();
 
   auto const time = state.get_summary("nv/cold/time/gpu/mean").get_float64("value");
-  state.add_element_count(static_cast<double>(data_size) / time, "bytes_per_second");
+  state.add_element_count(num_iterations * static_cast<double>(data_size) / time,
+                          "bytes_per_second");
   state.add_buffer_size(
     mem_stats_logger.peak_memory_usage(), "peak_memory_usage", "peak_memory_usage");
   state.add_buffer_size(total_file_size, "encoded_file_size", "encoded_file_size");
@@ -173,6 +179,7 @@ void BM_parquet_multithreaded_read_chunked_common(nvbench::state& state,
 {
   size_t const data_size    = state.get_int64("total_data_size");
   auto const num_threads    = state.get_int64("num_threads");
+  auto const num_iterations = state.get_int64("num_iterations");
   size_t const input_limit  = state.get_int64("input_limit");
   size_t const output_limit = state.get_int64("output_limit");
   auto const source_type    = retrieve_io_type_enum(state.get_string("io_type"));
@@ -192,22 +199,25 @@ void BM_parquet_multithreaded_read_chunked_common(nvbench::state& state,
   nvtxRangePushA(("(read) " + label).c_str());
   std::vector<cudf::io::table_with_metadata> chunks;
   state.exec(nvbench::exec_tag::sync | nvbench::exec_tag::timer,
-             [&](nvbench::launch& launch, auto& timer) {
+             [&, num_files = num_files](nvbench::launch& launch, auto& timer) {
                auto read_func = [&](int index) {
                  auto const stream = streams[index % num_threads];
                  cudf::io::parquet_reader_options read_opts =
                    cudf::io::parquet_reader_options::builder(source_info_vector[index]);
-                 // divide chunk limits by number of threads so the number of chunks produced is the
-                 // same for all cases. this seems better than the alternative, which is to keep the
-                 // limits the same. if we do that, as the number of threads goes up, the number of
-                 // chunks goes down - so are actually benchmarking the same thing in that case?
-                 auto reader = cudf::io::chunked_parquet_reader(
-                   output_limit / num_threads, input_limit / num_threads, read_opts, stream);
-
-                 // read all the chunks
-                 do {
-                   auto table = reader.read_chunk();
-                 } while (reader.has_next());
+                 for (int i = 0; i < num_iterations; ++i) {
+                   // divide chunk limits by number of threads so the number of chunks produced is
+                   // the same for all cases. this seems better than the alternative, which is to
+                   // keep the limits the same. if we do that, as the number of threads goes up, the
+                   // number of chunks goes down - so are actually benchmarking the same thing in
+                   // that case?
+                   auto reader = cudf::io::chunked_parquet_reader(
+                     output_limit / num_threads, input_limit / num_threads, read_opts, stream);
+
+                   // read all the chunks
+                   do {
+                     auto table = reader.read_chunk();
+                   } while (reader.has_next());
+                 }
                };
 
                threads.pause();
@@ -221,7 +231,8 @@ void BM_parquet_multithreaded_read_chunked_common(nvbench::state& state,
   nvtxRangePop();
 
   auto const time = state.get_summary("nv/cold/time/gpu/mean").get_float64("value");
-  state.add_element_count(static_cast<double>(data_size) / time, "bytes_per_second");
+  state.add_element_count(num_iterations * static_cast<double>(data_size) / time,
+                          "bytes_per_second");
   state.add_buffer_size(
     mem_stats_logger.peak_memory_usage(), "peak_memory_usage", "peak_memory_usage");
   state.add_buffer_size(total_file_size, "encoded_file_size", "encoded_file_size");
@@ -267,6 +278,7 @@ NVBENCH_BENCH(BM_parquet_multithreaded_read_mixed)
   .add_int64_axis("cardinality", {1000})
   .add_int64_axis("total_data_size", {512 * 1024 * 1024, 1024 * 1024 * 1024})
   .add_int64_axis("num_threads", {1, 2, 4, 8})
+  .add_int64_axis("num_iterations", {1})
   .add_int64_axis("num_cols", {4})
   .add_int64_axis("run_length", {8})
   .add_string_axis("io_type", {"PINNED_BUFFER"});
@@ -277,6 +289,7 @@ NVBENCH_BENCH(BM_parquet_multithreaded_read_fixed_width)
   .add_int64_axis("cardinality", {1000})
   .add_int64_axis("total_data_size", {512 * 1024 * 1024, 1024 * 1024 * 1024})
   .add_int64_axis("num_threads", {1, 2, 4, 8})
+  .add_int64_axis("num_iterations", {1})
   .add_int64_axis("num_cols", {4})
   .add_int64_axis("run_length", {8})
   .add_string_axis("io_type", {"PINNED_BUFFER"});
@@ -287,6 +300,7 @@ NVBENCH_BENCH(BM_parquet_multithreaded_read_string)
   .add_int64_axis("cardinality", {1000})
   .add_int64_axis("total_data_size", {512 * 1024 * 1024, 1024 * 1024 * 1024})
   .add_int64_axis("num_threads", {1, 2, 4, 8})
+  .add_int64_axis("num_iterations", {1})
   .add_int64_axis("num_cols", {4})
   .add_int64_axis("run_length", {8})
   .add_string_axis("io_type", {"PINNED_BUFFER"});
@@ -297,6 +311,7 @@ NVBENCH_BENCH(BM_parquet_multithreaded_read_list)
   .add_int64_axis("cardinality", {1000})
   .add_int64_axis("total_data_size", {512 * 1024 * 1024, 1024 * 1024 * 1024})
   .add_int64_axis("num_threads", {1, 2, 4, 8})
+  .add_int64_axis("num_iterations", {1})
   .add_int64_axis("num_cols", {4})
   .add_int64_axis("run_length", {8})
   .add_string_axis("io_type", {"PINNED_BUFFER"});
@@ -308,6 +323,7 @@ NVBENCH_BENCH(BM_parquet_multithreaded_read_chunked_mixed)
   .add_int64_axis("cardinality", {1000})
   .add_int64_axis("total_data_size", {512 * 1024 * 1024, 1024 * 1024 * 1024})
   .add_int64_axis("num_threads", {1, 2, 4, 8})
+  .add_int64_axis("num_iterations", {1})
   .add_int64_axis("num_cols", {4})
   .add_int64_axis("run_length", {8})
   .add_int64_axis("input_limit", {640 * 1024 * 1024})
@@ -320,6 +336,7 @@ NVBENCH_BENCH(BM_parquet_multithreaded_read_chunked_fixed_width)
   .add_int64_axis("cardinality", {1000})
   .add_int64_axis("total_data_size", {512 * 1024 * 1024, 1024 * 1024 * 1024})
   .add_int64_axis("num_threads", {1, 2, 4, 8})
+  .add_int64_axis("num_iterations", {1})
   .add_int64_axis("num_cols", {4})
   .add_int64_axis("run_length", {8})
   .add_int64_axis("input_limit", {640 * 1024 * 1024})
@@ -332,6 +349,7 @@ NVBENCH_BENCH(BM_parquet_multithreaded_read_chunked_string)
   .add_int64_axis("cardinality", {1000})
   .add_int64_axis("total_data_size", {512 * 1024 * 1024, 1024 * 1024 * 1024})
   .add_int64_axis("num_threads", {1, 2, 4, 8})
+  .add_int64_axis("num_iterations", {1})
   .add_int64_axis("num_cols", {4})
   .add_int64_axis("run_length", {8})
   .add_int64_axis("input_limit", {640 * 1024 * 1024})
@@ -344,6 +362,7 @@ NVBENCH_BENCH(BM_parquet_multithreaded_read_chunked_list)
   .add_int64_axis("cardinality", {1000})
   .add_int64_axis("total_data_size", {512 * 1024 * 1024, 1024 * 1024 * 1024})
   .add_int64_axis("num_threads", {1, 2, 4, 8})
+  .add_int64_axis("num_iterations", {1})
   .add_int64_axis("num_cols", {4})
   .add_int64_axis("run_length", {8})
   .add_int64_axis("input_limit", {640 * 1024 * 1024})

From 0d37506682453a3849fffc74cec4778d609a18ff Mon Sep 17 00:00:00 2001
From: Shruti Shivakumar <shruti.shivakumar@gmail.com>
Date: Sun, 3 Nov 2024 23:07:09 -0500
Subject: [PATCH 37/41] Expose stream-ordering in subword tokenizer API
 (#17206)

Add stream parameter to public APIs:
```
nvtext::subword_tokenize
nvtext::load_vocabulary_file
```
Added stream gtest.

Reference: #13744

Authors:
  - Shruti Shivakumar (https://github.com/shrshi)
  - Muhammad Haseeb (https://github.com/mhaseeb123)

Approvers:
  - Nghia Truong (https://github.com/ttnghia)
  - David Wendt (https://github.com/davidwendt)
  - Muhammad Haseeb (https://github.com/mhaseeb123)

URL: https://github.com/rapidsai/cudf/pull/17206
---
 cpp/include/nvtext/subword_tokenize.hpp       |  4 +
 cpp/src/text/subword/load_hash_file.cu        |  6 +-
 cpp/src/text/subword/subword_tokenize.cu      | 11 +--
 cpp/tests/CMakeLists.txt                      |  1 +
 .../streams/text/subword_tokenize_test.cpp    | 81 +++++++++++++++++++
 5 files changed, 93 insertions(+), 10 deletions(-)
 create mode 100644 cpp/tests/streams/text/subword_tokenize_test.cpp

diff --git a/cpp/include/nvtext/subword_tokenize.hpp b/cpp/include/nvtext/subword_tokenize.hpp
index c4210699975..4d06aa5d4bc 100644
--- a/cpp/include/nvtext/subword_tokenize.hpp
+++ b/cpp/include/nvtext/subword_tokenize.hpp
@@ -62,11 +62,13 @@ struct hashed_vocabulary {
  * @param filename_hashed_vocabulary A path to the preprocessed vocab.txt file.
  *        Note that this is the file AFTER python/perfect_hash.py has been used
  *        for preprocessing.
+ * @param stream CUDA stream used for device memory operations and kernel launches
  * @param mr Memory resource to allocate any returned objects.
  * @return vocabulary hash-table elements
  */
 std::unique_ptr<hashed_vocabulary> load_vocabulary_file(
   std::string const& filename_hashed_vocabulary,
+  rmm::cuda_stream_view stream      = cudf::get_default_stream(),
   rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
 /**
@@ -147,6 +149,7 @@ struct tokenizer_result {
  * @param do_truncate If true, the tokenizer will discard all the token-ids after
  *        `max_sequence_length` for each input string. If false, it will use a new row
  *        in the output token-ids to continue generating the output.
+ * @param stream CUDA stream used for device memory operations and kernel launches
  * @param mr Memory resource to allocate any returned objects.
  * @return token-ids, attention-mask, and metadata
  */
@@ -157,6 +160,7 @@ tokenizer_result subword_tokenize(
   uint32_t stride,
   bool do_lower_case,
   bool do_truncate,
+  rmm::cuda_stream_view stream      = cudf::get_default_stream(),
   rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
 /** @} */  // end of group
diff --git a/cpp/src/text/subword/load_hash_file.cu b/cpp/src/text/subword/load_hash_file.cu
index eca703e2604..b13ad0a7de8 100644
--- a/cpp/src/text/subword/load_hash_file.cu
+++ b/cpp/src/text/subword/load_hash_file.cu
@@ -289,10 +289,12 @@ std::unique_ptr<hashed_vocabulary> load_vocabulary_file(
 }  // namespace detail
 
 std::unique_ptr<hashed_vocabulary> load_vocabulary_file(
-  std::string const& filename_hashed_vocabulary, rmm::device_async_resource_ref mr)
+  std::string const& filename_hashed_vocabulary,
+  rmm::cuda_stream_view stream,
+  rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
-  return detail::load_vocabulary_file(filename_hashed_vocabulary, cudf::get_default_stream(), mr);
+  return detail::load_vocabulary_file(filename_hashed_vocabulary, stream, mr);
 }
 
 }  // namespace nvtext
diff --git a/cpp/src/text/subword/subword_tokenize.cu b/cpp/src/text/subword/subword_tokenize.cu
index d7e04a0c208..dee589d6daf 100644
--- a/cpp/src/text/subword/subword_tokenize.cu
+++ b/cpp/src/text/subword/subword_tokenize.cu
@@ -293,17 +293,12 @@ tokenizer_result subword_tokenize(cudf::strings_column_view const& strings,
                                   uint32_t stride,
                                   bool do_lower_case,
                                   bool do_truncate,
+                                  rmm::cuda_stream_view stream,
                                   rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
-  return detail::subword_tokenize(strings,
-                                  vocabulary_table,
-                                  max_sequence_length,
-                                  stride,
-                                  do_lower_case,
-                                  do_truncate,
-                                  cudf::get_default_stream(),
-                                  mr);
+  return detail::subword_tokenize(
+    strings, vocabulary_table, max_sequence_length, stride, do_lower_case, do_truncate, stream, mr);
 }
 
 }  // namespace nvtext
diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt
index 6d3d1454462..23632f6fbba 100644
--- a/cpp/tests/CMakeLists.txt
+++ b/cpp/tests/CMakeLists.txt
@@ -743,6 +743,7 @@ ConfigureTest(
   streams/text/ngrams_test.cpp
   streams/text/replace_test.cpp
   streams/text/stemmer_test.cpp
+  streams/text/subword_tokenize_test.cpp
   streams/text/tokenize_test.cpp
   STREAM_MODE
   testing
diff --git a/cpp/tests/streams/text/subword_tokenize_test.cpp b/cpp/tests/streams/text/subword_tokenize_test.cpp
new file mode 100644
index 00000000000..9474e6b269c
--- /dev/null
+++ b/cpp/tests/streams/text/subword_tokenize_test.cpp
@@ -0,0 +1,81 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <cudf_test/base_fixture.hpp>
+#include <cudf_test/column_utilities.hpp>
+#include <cudf_test/column_wrapper.hpp>
+#include <cudf_test/default_stream.hpp>
+
+#include <cudf/column/column.hpp>
+#include <cudf/strings/strings_column_view.hpp>
+
+#include <nvtext/subword_tokenize.hpp>
+
+#include <fstream>
+#include <vector>
+
+// Global environment for temporary files
+auto const temp_env = static_cast<cudf::test::TempDirTestEnvironment*>(
+  ::testing::AddGlobalTestEnvironment(new cudf::test::TempDirTestEnvironment));
+
+struct TextSubwordTest : public cudf::test::BaseFixture {};
+
+// Writes a small fake hashed-vocabulary text file to `hash_file` for the test in this file.
+// The vocab only includes the following words:
+//  'this', 'is', 'a', 'test', 'tést'
+// The period '.' character also has a token id.
+void create_hashed_vocab(std::string const& hash_file)
+{
+  constexpr size_t num_bins = 23;
+  std::vector<std::pair<int, int>> const bin_coefficients(num_bins, {65559, 0});
+  // Non-zero entries are based on values from the bert_hash_table.txt file for the words above.
+  std::vector<uint64_t> bins(num_bins, 0);
+  bins[0]  = 3015668L;
+  bins[1]  = 6205475701751155871L;
+  bins[5]  = 6358029;
+  bins[16] = 451412625363L;
+  bins[20] = 6206321707968235495L;
+
+  std::ofstream vocab_out(hash_file, std::ofstream::out);
+  vocab_out << "1\n0\n" << bin_coefficients.size() << "\n";
+  for (auto const& [first, second] : bin_coefficients) {
+    vocab_out << first << " " << second << "\n";
+  }
+  vocab_out << bins.size() << "\n";
+  for (auto const value : bins) { vocab_out << value << "\n"; }
+  vocab_out << "100\n101\n102\n\n";
+}
+
+TEST_F(TextSubwordTest, Tokenize)
+{
+  uint32_t const nrows = 100;
+  std::vector<char const*> h_strings(nrows, "This is a test. A test this is.");
+  cudf::test::strings_column_wrapper strings(h_strings.cbegin(), h_strings.cend());
+  std::string const hash_file = temp_env->get_temp_filepath("hashed_vocab.txt");
+  create_hashed_vocab(hash_file);
+  auto vocab = nvtext::load_vocabulary_file(hash_file, cudf::test::get_default_stream());
+
+  uint32_t const max_sequence_length = 16;
+  uint32_t const stride              = 16;
+  // Both APIs receive the test stream explicitly; the returned result is not inspected here.
+  auto result = nvtext::subword_tokenize(cudf::strings_column_view{strings},
+                                         *vocab,
+                                         max_sequence_length,
+                                         stride,
+                                         true,   // do_lower_case
+                                         false,  // do_truncate
+                                         cudf::test::get_default_stream());
+}

From 31c2b94427a447bf3b4d708d8d3bf08d430042b1 Mon Sep 17 00:00:00 2001
From: "Mads R. B. Kristensen" <madsbk@gmail.com>
Date: Mon, 4 Nov 2024 13:28:19 +0100
Subject: [PATCH 38/41] auditwheel: --exclude libkvikio.so

---
 ci/build_wheel_cudf.sh      | 1 +
 ci/build_wheel_libcudf.sh   | 1 +
 ci/build_wheel_pylibcudf.sh | 1 +
 3 files changed, 3 insertions(+)

diff --git a/ci/build_wheel_cudf.sh b/ci/build_wheel_cudf.sh
index fef4416a366..ae4eb0d5c66 100755
--- a/ci/build_wheel_cudf.sh
+++ b/ci/build_wheel_cudf.sh
@@ -23,6 +23,7 @@ export PIP_CONSTRAINT="/tmp/constraints.txt"
 python -m auditwheel repair \
     --exclude libcudf.so \
     --exclude libnvcomp.so \
+    --exclude libkvikio.so \
     -w ${package_dir}/final_dist \
     ${package_dir}/dist/*
 
diff --git a/ci/build_wheel_libcudf.sh b/ci/build_wheel_libcudf.sh
index b3d6778ea04..aabd3814a24 100755
--- a/ci/build_wheel_libcudf.sh
+++ b/ci/build_wheel_libcudf.sh
@@ -33,6 +33,7 @@ RAPIDS_PY_CUDA_SUFFIX="$(rapids-wheel-ctk-name-gen ${RAPIDS_CUDA_VERSION})"
 mkdir -p ${package_dir}/final_dist
 python -m auditwheel repair \
     --exclude libnvcomp.so.4 \
+    --exclude libkvikio.so \
     -w ${package_dir}/final_dist \
     ${package_dir}/dist/*
 
diff --git a/ci/build_wheel_pylibcudf.sh b/ci/build_wheel_pylibcudf.sh
index 839d98846fe..c4a89f20f5f 100755
--- a/ci/build_wheel_pylibcudf.sh
+++ b/ci/build_wheel_pylibcudf.sh
@@ -21,6 +21,7 @@ export PIP_CONSTRAINT="/tmp/constraints.txt"
 python -m auditwheel repair \
     --exclude libcudf.so \
     --exclude libnvcomp.so \
+    --exclude libkvikio.so \
     -w ${package_dir}/final_dist \
     ${package_dir}/dist/*
 

From af71f88144f3c39273812c8c64c8db8787eb5bd5 Mon Sep 17 00:00:00 2001
From: "Mads R. B. Kristensen" <madsbk@gmail.com>
Date: Mon, 4 Nov 2024 13:30:14 +0100
Subject: [PATCH 39/41] USE_LIBKVIKIO_RUNTIME_WHEEL

---
 ci/build_wheel_libcudf.sh     |  2 +-
 python/libcudf/CMakeLists.txt | 12 ++++++++++++
 2 files changed, 13 insertions(+), 1 deletion(-)

diff --git a/ci/build_wheel_libcudf.sh b/ci/build_wheel_libcudf.sh
index aabd3814a24..a96f2971a96 100755
--- a/ci/build_wheel_libcudf.sh
+++ b/ci/build_wheel_libcudf.sh
@@ -25,7 +25,7 @@ python -m pip install \
 # 0 really means "add --no-build-isolation" (ref: https://github.com/pypa/pip/issues/5735)
 export PIP_NO_BUILD_ISOLATION=0
 
-export SKBUILD_CMAKE_ARGS="-DUSE_NVCOMP_RUNTIME_WHEEL=ON"
+export SKBUILD_CMAKE_ARGS="-DUSE_NVCOMP_RUNTIME_WHEEL=ON;-DUSE_LIBKVIKIO_RUNTIME_WHEEL=ON"
 ./ci/build_wheel.sh "${package_name}" "${package_dir}"
 
 RAPIDS_PY_CUDA_SUFFIX="$(rapids-wheel-ctk-name-gen ${RAPIDS_CUDA_VERSION})"
diff --git a/python/libcudf/CMakeLists.txt b/python/libcudf/CMakeLists.txt
index 5f9a04d3cee..bedc3dafcab 100644
--- a/python/libcudf/CMakeLists.txt
+++ b/python/libcudf/CMakeLists.txt
@@ -23,6 +23,9 @@ project(
 )
 
 option(USE_NVCOMP_RUNTIME_WHEEL "Use the nvcomp wheel at runtime instead of the system library" OFF)
+option(USE_LIBKVIKIO_RUNTIME_WHEEL
+       "Use the libkvikio wheel at runtime instead of the system library" OFF
+)
 
 # Check if cudf is already available. If so, it is the user's responsibility to ensure that the
 # CMake package is also available at build time of the Python cudf package.
@@ -58,3 +61,12 @@ if(USE_NVCOMP_RUNTIME_WHEEL)
     APPEND
   )
 endif()
+
+if(USE_LIBKVIKIO_RUNTIME_WHEEL) # option declared near the top of this file; OFF by default
+  set(rpaths "$ORIGIN/../../libkvikio/lib64") # NOTE(review): assumes the libkvikio wheel keeps its library in libkvikio/lib64 - confirm
+  set_property(
+    TARGET cudf
+    PROPERTY INSTALL_RPATH ${rpaths}
+    APPEND # add to the target's existing INSTALL_RPATH entries instead of overwriting them
+  )
+endif()

From 6b3c996679e6ad8dceceb51da2ab4fe2c58cb331 Mon Sep 17 00:00:00 2001
From: "Mads R. B. Kristensen" <madsbk@gmail.com>
Date: Mon, 4 Nov 2024 13:30:37 +0100
Subject: [PATCH 40/41] depends_on_libkvikio

---
 dependencies.yaml             | 1 +
 python/libcudf/pyproject.toml | 1 +
 2 files changed, 2 insertions(+)

diff --git a/dependencies.yaml b/dependencies.yaml
index 90255ca674c..a92f7258f4b 100644
--- a/dependencies.yaml
+++ b/dependencies.yaml
@@ -178,6 +178,7 @@ files:
       table: project
     includes:
       - depends_on_nvcomp
+      - depends_on_libkvikio
   py_build_pylibcudf:
     output: pyproject
     pyproject_dir: python/pylibcudf
diff --git a/python/libcudf/pyproject.toml b/python/libcudf/pyproject.toml
index c6d9ae56467..62726bb0df4 100644
--- a/python/libcudf/pyproject.toml
+++ b/python/libcudf/pyproject.toml
@@ -38,6 +38,7 @@ classifiers = [
     "Environment :: GPU :: NVIDIA CUDA",
 ]
 dependencies = [
+    "libkvikio==24.12.*,>=0.0.0a0",
     "nvidia-nvcomp==4.1.0.6",
 ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`.
 

From 2d8eeafe4959357a17f6ad488811837e0a07ba65 Mon Sep 17 00:00:00 2001
From: "Mads R. B. Kristensen" <madsbk@gmail.com>
Date: Mon, 4 Nov 2024 13:31:09 +0100
Subject: [PATCH 41/41] Download wheel from
 <https://github.com/rapidsai/kvikio/pull/527>

Revert before PR merge
---
 ci/build_wheel_libcudf.sh                  | 7 +++++++
 ci/cudf_pandas_scripts/pandas-tests/run.sh | 8 ++++++++
 ci/cudf_pandas_scripts/run_tests.sh        | 7 +++++++
 ci/test_wheel_cudf.sh                      | 7 +++++++
 ci/test_wheel_cudf_polars.sh               | 7 +++++++
 ci/test_wheel_dask_cudf.sh                 | 7 +++++++
 6 files changed, 43 insertions(+)

diff --git a/ci/build_wheel_libcudf.sh b/ci/build_wheel_libcudf.sh
index a96f2971a96..c010eb40ded 100755
--- a/ci/build_wheel_libcudf.sh
+++ b/ci/build_wheel_libcudf.sh
@@ -15,6 +15,13 @@ rapids-dependency-file-generator \
   --matrix "cuda=${RAPIDS_CUDA_VERSION%.*};arch=$(arch);py=${RAPIDS_PY_VERSION};cuda_suffixed=true" \
 | tee /tmp/requirements-build.txt
 
+
+# Download wheel from <https://github.com/rapidsai/kvikio/pull/527>
+LIBKVIKIO_CHANNEL=$(
+  RAPIDS_PY_WHEEL_NAME=libkvikio_cu12 rapids-get-pr-wheel-artifact kvikio 527 cpp
+)
+echo ${LIBKVIKIO_CHANNEL}/libkvikio_*.whl >> /tmp/requirements-build.txt
+
 rapids-logger "Installing build requirements"
 python -m pip install \
     -v \
diff --git a/ci/cudf_pandas_scripts/pandas-tests/run.sh b/ci/cudf_pandas_scripts/pandas-tests/run.sh
index e5cd4436a3a..639a873193e 100755
--- a/ci/cudf_pandas_scripts/pandas-tests/run.sh
+++ b/ci/cudf_pandas_scripts/pandas-tests/run.sh
@@ -17,8 +17,16 @@ RAPIDS_PY_WHEEL_NAME="cudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from
 RAPIDS_PY_WHEEL_NAME="libcudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 cpp ./dist
 RAPIDS_PY_WHEEL_NAME="pylibcudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 python ./dist
 
+# Download wheel from <https://github.com/rapidsai/kvikio/pull/527>
+LIBKVIKIO_CHANNEL=$(
+RAPIDS_PY_WHEEL_NAME=libkvikio_${RAPIDS_PY_CUDA_SUFFIX} rapids-get-pr-wheel-artifact kvikio 527 cpp  # also python?
+)
+echo ${LIBKVIKIO_CHANNEL}/libkvikio_*.whl >> /tmp/requirements-build.txt
+
 # echo to expand wildcard before adding `[extra]` requires for pip
 python -m pip install \
+  -v \
+  -r /tmp/requirements-build.txt \
   "$(echo ./dist/cudf_${RAPIDS_PY_CUDA_SUFFIX}*.whl)[test,pandas-tests]" \
   "$(echo ./dist/libcudf_${RAPIDS_PY_CUDA_SUFFIX}*.whl)" \
   "$(echo ./dist/pylibcudf_${RAPIDS_PY_CUDA_SUFFIX}*.whl)"
diff --git a/ci/cudf_pandas_scripts/run_tests.sh b/ci/cudf_pandas_scripts/run_tests.sh
index 61361fffb07..37089815410 100755
--- a/ci/cudf_pandas_scripts/run_tests.sh
+++ b/ci/cudf_pandas_scripts/run_tests.sh
@@ -57,8 +57,15 @@ else
     # generate constraints (possibly pinning to oldest support versions of dependencies)
     rapids-generate-pip-constraints test_python_cudf_pandas ./constraints.txt
 
+    # Temporary (revert before merge): use the libkvikio wheel built by <https://github.com/rapidsai/kvikio/pull/527>
+    LIBKVIKIO_CHANNEL=$(
+      RAPIDS_PY_WHEEL_NAME="libkvikio_${RAPIDS_PY_CUDA_SUFFIX}" rapids-get-pr-wheel-artifact kvikio 527 cpp  # also python?
+    )
+    echo "${LIBKVIKIO_CHANNEL}"/libkvikio_*.whl >> /tmp/requirements-build.txt
+
     python -m pip install \
         -v \
+        -r /tmp/requirements-build.txt \
         --constraint ./constraints.txt \
         "$(echo ./dist/cudf_${RAPIDS_PY_CUDA_SUFFIX}*.whl)[test,cudf-pandas-tests]" \
         "$(echo ./dist/libcudf_${RAPIDS_PY_CUDA_SUFFIX}*.whl)" \
diff --git a/ci/test_wheel_cudf.sh b/ci/test_wheel_cudf.sh
index ce12744c9e3..ac5acf9089d 100755
--- a/ci/test_wheel_cudf.sh
+++ b/ci/test_wheel_cudf.sh
@@ -15,9 +15,16 @@ rapids-logger "Install cudf, pylibcudf, and test requirements"
 # generate constraints (possibly pinning to oldest support versions of dependencies)
 rapids-generate-pip-constraints py_test_cudf ./constraints.txt
 
+# Download wheel from <https://github.com/rapidsai/kvikio/pull/527>
+LIBKVIKIO_CHANNEL=$(
+  RAPIDS_PY_WHEEL_NAME=libkvikio_${RAPIDS_PY_CUDA_SUFFIX} rapids-get-pr-wheel-artifact kvikio 527 cpp  # also python?
+)
+echo ${LIBKVIKIO_CHANNEL}/libkvikio_*.whl >> /tmp/requirements-build.txt
+
 # echo to expand wildcard before adding `[extra]` requires for pip
 python -m pip install \
     -v \
+    -r /tmp/requirements-build.txt \
     --constraint ./constraints.txt \
   "$(echo ./dist/cudf_${RAPIDS_PY_CUDA_SUFFIX}*.whl)[test]" \
   "$(echo ./dist/libcudf_${RAPIDS_PY_CUDA_SUFFIX}*.whl)" \
diff --git a/ci/test_wheel_cudf_polars.sh b/ci/test_wheel_cudf_polars.sh
index 2884757e46b..340c29eae4c 100755
--- a/ci/test_wheel_cudf_polars.sh
+++ b/ci/test_wheel_cudf_polars.sh
@@ -33,9 +33,16 @@ rapids-logger "Installing cudf_polars and its dependencies"
 # generate constraints (possibly pinning to oldest support versions of dependencies)
 rapids-generate-pip-constraints py_test_cudf_polars ./constraints.txt
 
+# Download wheel from <https://github.com/rapidsai/kvikio/pull/527>
+LIBKVIKIO_CHANNEL=$(
+  RAPIDS_PY_WHEEL_NAME=libkvikio_${RAPIDS_PY_CUDA_SUFFIX} rapids-get-pr-wheel-artifact kvikio 527 cpp  # also python?
+)
+echo ${LIBKVIKIO_CHANNEL}/libkvikio_*.whl >> /tmp/requirements-build.txt
+
 # echo to expand wildcard before adding `[test]` requires for pip
 python -m pip install \
     -v \
+    -r /tmp/requirements-build.txt \
     --constraint ./constraints.txt \
     "$(echo ./dist/cudf_polars_${RAPIDS_PY_CUDA_SUFFIX}*.whl)[test]" \
     "$(echo ./dist/libcudf_${RAPIDS_PY_CUDA_SUFFIX}*.whl)" \
diff --git a/ci/test_wheel_dask_cudf.sh b/ci/test_wheel_dask_cudf.sh
index e15949f4bdb..93c8262fc32 100755
--- a/ci/test_wheel_dask_cudf.sh
+++ b/ci/test_wheel_dask_cudf.sh
@@ -16,9 +16,16 @@ rapids-logger "Install dask_cudf, cudf, pylibcudf, and test requirements"
 # generate constraints (possibly pinning to oldest support versions of dependencies)
 rapids-generate-pip-constraints py_test_dask_cudf ./constraints.txt
 
+# Download wheel from <https://github.com/rapidsai/kvikio/pull/527>
+LIBKVIKIO_CHANNEL=$(
+RAPIDS_PY_WHEEL_NAME=libkvikio_${RAPIDS_PY_CUDA_SUFFIX} rapids-get-pr-wheel-artifact kvikio 527 cpp  # also python?
+)
+echo ${LIBKVIKIO_CHANNEL}/libkvikio_*.whl >> /tmp/requirements-build.txt
+
 # echo to expand wildcard before adding `[extra]` requires for pip
 python -m pip install \
   -v \
+  -r /tmp/requirements-build.txt \
   --constraint ./constraints.txt \
   "$(echo ./dist/cudf_${RAPIDS_PY_CUDA_SUFFIX}*.whl)" \
   "$(echo ./dist/dask_cudf_${RAPIDS_PY_CUDA_SUFFIX}*.whl)[test]" \