From 2e0d2d6a0859b2cad34a36513b6977cf2bbe172f Mon Sep 17 00:00:00 2001
From: Yunsong Wang <yunsongw@nvidia.com>
Date: Fri, 8 Nov 2024 13:15:26 -0800
Subject: [PATCH 01/19] Improve the performance of low cardinality groupby
 (#16619)

This PR enhances groupby performance for low-cardinality input cases. When applicable, it leverages shared memory for initial aggregation, followed by global memory aggregation to reduce atomic contention and improve performance.

Authors:
  - Yunsong Wang (https://github.com/PointKernel)
  - Mike Wilson (https://github.com/hyperbolic2346)

Approvers:
  - David Wendt (https://github.com/davidwendt)
  - Mike Wilson (https://github.com/hyperbolic2346)
  - Vyas Ramasubramani (https://github.com/vyasr)

URL: https://github.com/rapidsai/cudf/pull/16619
---
 cpp/CMakeLists.txt                            |   5 +-
 cpp/src/groupby/groupby.cu                    |   1 -
 cpp/src/groupby/hash/compute_aggregations.cu  |  29 +++
 cpp/src/groupby/hash/compute_aggregations.cuh | 185 ++++++++++++++++++
 ...pass_aggs.hpp => compute_aggregations.hpp} |  16 +-
 .../groupby/hash/compute_aggregations_null.cu |  29 +++
 .../hash/compute_global_memory_aggs.cu        |  32 +++
 .../hash/compute_global_memory_aggs.cuh       |  89 +++++++++
 .../hash/compute_global_memory_aggs.hpp       |  42 ++++
 .../hash/compute_global_memory_aggs_null.cu   |  32 +++
 cpp/src/groupby/hash/compute_groupby.cu       |  43 +---
 cpp/src/groupby/hash/compute_groupby.hpp      |  17 --
 .../hash/compute_shared_memory_aggs.cu        |  19 +-
 .../hash/compute_shared_memory_aggs.hpp       |   7 +-
 .../groupby/hash/compute_single_pass_aggs.cu  |  99 ----------
 .../hash/create_sparse_results_table.cu       | 115 ++++++++---
 .../hash/create_sparse_results_table.hpp      |  27 ++-
 cpp/src/groupby/hash/helpers.cuh              |   2 -
 cpp/src/groupby/hash/single_pass_functors.cuh | 118 ++++++++++-
 19 files changed, 699 insertions(+), 208 deletions(-)
 create mode 100644 cpp/src/groupby/hash/compute_aggregations.cu
 create mode 100644 cpp/src/groupby/hash/compute_aggregations.cuh
 rename cpp/src/groupby/hash/{compute_single_pass_aggs.hpp => compute_aggregations.hpp} (70%)
 create mode 100644 cpp/src/groupby/hash/compute_aggregations_null.cu
 create mode 100644 cpp/src/groupby/hash/compute_global_memory_aggs.cu
 create mode 100644 cpp/src/groupby/hash/compute_global_memory_aggs.cuh
 create mode 100644 cpp/src/groupby/hash/compute_global_memory_aggs.hpp
 create mode 100644 cpp/src/groupby/hash/compute_global_memory_aggs_null.cu
 delete mode 100644 cpp/src/groupby/hash/compute_single_pass_aggs.cu

diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
index d3bf7019e35..559826ac232 100644
--- a/cpp/CMakeLists.txt
+++ b/cpp/CMakeLists.txt
@@ -394,11 +394,14 @@ add_library(
   src/filling/repeat.cu
   src/filling/sequence.cu
   src/groupby/groupby.cu
+  src/groupby/hash/compute_aggregations.cu
+  src/groupby/hash/compute_aggregations_null.cu
+  src/groupby/hash/compute_global_memory_aggs.cu
+  src/groupby/hash/compute_global_memory_aggs_null.cu
   src/groupby/hash/compute_groupby.cu
   src/groupby/hash/compute_mapping_indices.cu
   src/groupby/hash/compute_mapping_indices_null.cu
   src/groupby/hash/compute_shared_memory_aggs.cu
-  src/groupby/hash/compute_single_pass_aggs.cu
   src/groupby/hash/create_sparse_results_table.cu
   src/groupby/hash/flatten_single_pass_aggs.cpp
   src/groupby/hash/groupby.cu
diff --git a/cpp/src/groupby/groupby.cu b/cpp/src/groupby/groupby.cu
index cc0682b68b9..6eb82618e2a 100644
--- a/cpp/src/groupby/groupby.cu
+++ b/cpp/src/groupby/groupby.cu
@@ -29,7 +29,6 @@
 #include <cudf/dictionary/dictionary_column_view.hpp>
 #include <cudf/groupby.hpp>
 #include <cudf/reduction/detail/histogram.hpp>
-#include <cudf/strings/string_view.hpp>
 #include <cudf/table/table.hpp>
 #include <cudf/table/table_view.hpp>
 #include <cudf/types.hpp>
diff --git a/cpp/src/groupby/hash/compute_aggregations.cu b/cpp/src/groupby/hash/compute_aggregations.cu
new file mode 100644
index 00000000000..cac6c2224f0
--- /dev/null
+++ b/cpp/src/groupby/hash/compute_aggregations.cu
@@ -0,0 +1,29 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "compute_aggregations.cuh"
+#include "compute_aggregations.hpp"
+
+namespace cudf::groupby::detail::hash {
+template rmm::device_uvector<cudf::size_type> compute_aggregations<global_set_t>(
+  int64_t num_rows,
+  bool skip_rows_with_nulls,
+  bitmask_type const* row_bitmask,
+  global_set_t& global_set,
+  cudf::host_span<cudf::groupby::aggregation_request const> requests,
+  cudf::detail::result_cache* sparse_results,
+  rmm::cuda_stream_view stream);
+}  // namespace cudf::groupby::detail::hash
diff --git a/cpp/src/groupby/hash/compute_aggregations.cuh b/cpp/src/groupby/hash/compute_aggregations.cuh
new file mode 100644
index 00000000000..e8b29a0e7a8
--- /dev/null
+++ b/cpp/src/groupby/hash/compute_aggregations.cuh
@@ -0,0 +1,185 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#pragma once
+
+#include "compute_aggregations.hpp"
+#include "compute_global_memory_aggs.hpp"
+#include "compute_mapping_indices.hpp"
+#include "compute_shared_memory_aggs.hpp"
+#include "create_sparse_results_table.hpp"
+#include "flatten_single_pass_aggs.hpp"
+#include "helpers.cuh"
+#include "single_pass_functors.cuh"
+
+#include <cudf/detail/aggregation/result_cache.hpp>
+#include <cudf/detail/utilities/vector_factories.hpp>
+#include <cudf/groupby.hpp>
+#include <cudf/table/table_device_view.cuh>
+#include <cudf/types.hpp>
+#include <cudf/utilities/span.hpp>
+
+#include <rmm/cuda_stream_view.hpp>
+#include <rmm/device_scalar.hpp>
+#include <rmm/device_uvector.hpp>
+#include <rmm/exec_policy.hpp>
+
+#include <cuco/static_set.cuh>
+#include <cuda/std/atomic>
+#include <thrust/for_each.h>
+
+#include <algorithm>
+#include <memory>
+#include <vector>
+
+namespace cudf::groupby::detail::hash {
+/**
+ * @brief Computes all aggregations from `requests` that require a single pass over the data,
+ * stores the results in `sparse_results`, and returns the populated keys of `global_set`
+ */
+template <typename SetType>
+rmm::device_uvector<cudf::size_type> compute_aggregations(
+  int64_t num_rows,
+  bool skip_rows_with_nulls,
+  bitmask_type const* row_bitmask,
+  SetType& global_set,
+  cudf::host_span<cudf::groupby::aggregation_request const> requests,
+  cudf::detail::result_cache* sparse_results,
+  rmm::cuda_stream_view stream)
+{
+  // Flatten the requested aggregations into a table that can be operated on by aggregate_row
+  auto [flattened_values, agg_kinds, aggs] = flatten_single_pass_aggs(requests);
+  auto const d_agg_kinds                   = cudf::detail::make_device_uvector_async(
+    agg_kinds, stream, rmm::mr::get_current_device_resource());
+
+  auto const grid_size =
+    max_occupancy_grid_size<typename SetType::ref_type<cuco::insert_and_find_tag>>(num_rows);
+  auto const available_shmem_size = get_available_shared_memory_size(grid_size);
+  auto const has_sufficient_shmem =
+    available_shmem_size > (compute_shmem_offsets_size(flattened_values.num_columns()) * 2);
+  auto const has_dictionary_request = std::any_of(
+    requests.begin(), requests.end(), [](cudf::groupby::aggregation_request const& request) {
+      return cudf::is_dictionary(request.values.type());
+    });
+  auto const is_shared_memory_compatible = !has_dictionary_request and has_sufficient_shmem;
+
+  // Performs naive global memory aggregations when the workload is not compatible with shared
+  // memory, such as when aggregating dictionary columns or when there is insufficient dynamic
+  // shared memory for shared memory aggregations.
+  if (!is_shared_memory_compatible) {
+    return compute_global_memory_aggs(num_rows,
+                                      skip_rows_with_nulls,
+                                      row_bitmask,
+                                      flattened_values,
+                                      d_agg_kinds.data(),
+                                      agg_kinds,
+                                      global_set,
+                                      aggs,
+                                      sparse_results,
+                                      stream);
+  }
+
+  // 'populated_keys' will hold the row indices (keys) inserted into the global hash set
+  rmm::device_uvector<cudf::size_type> populated_keys(num_rows, stream);
+  // 'local_mapping_index' maps from the global row index of the input table to its block-wise rank
+  rmm::device_uvector<cudf::size_type> local_mapping_index(num_rows, stream);
+  // 'global_mapping_index' maps from the block-wise rank to the row index of global aggregate table
+  rmm::device_uvector<cudf::size_type> global_mapping_index(grid_size * GROUPBY_SHM_MAX_ELEMENTS,
+                                                            stream);
+  rmm::device_uvector<cudf::size_type> block_cardinality(grid_size, stream);
+
+  // Whether a global memory fallback is required. NOTE(review): confirm the flag starts cleared
+  rmm::device_scalar<cuda::std::atomic_flag> needs_global_memory_fallback(stream);
+
+  auto global_set_ref = global_set.ref(cuco::op::insert_and_find);
+
+  compute_mapping_indices(grid_size,
+                          num_rows,
+                          global_set_ref,
+                          row_bitmask,
+                          skip_rows_with_nulls,
+                          local_mapping_index.data(),
+                          global_mapping_index.data(),
+                          block_cardinality.data(),
+                          needs_global_memory_fallback.data(),
+                          stream);
+
+  cuda::std::atomic_flag h_needs_fallback;
+  // Cannot use `device_scalar::value` as it requires a copy constructor, which
+  // `atomic_flag` doesn't have.
+  CUDF_CUDA_TRY(cudaMemcpyAsync(&h_needs_fallback,
+                                needs_global_memory_fallback.data(),
+                                sizeof(cuda::std::atomic_flag),
+                                cudaMemcpyDefault,
+                                stream.value()));
+  stream.synchronize();
+  auto const needs_fallback = h_needs_fallback.test();
+
+  // Make the table that will hold the sparse results
+  cudf::table sparse_table = create_sparse_results_table(flattened_values,
+                                                         d_agg_kinds.data(),
+                                                         agg_kinds,
+                                                         needs_fallback,
+                                                         global_set,
+                                                         populated_keys,
+                                                         stream);
+  // Create device views of the input values and the sparse output for the aggregation passes
+  auto d_values       = table_device_view::create(flattened_values, stream);
+  auto d_sparse_table = mutable_table_device_view::create(sparse_table, stream);
+
+  compute_shared_memory_aggs(grid_size,
+                             available_shmem_size,
+                             num_rows,
+                             row_bitmask,
+                             skip_rows_with_nulls,
+                             local_mapping_index.data(),
+                             global_mapping_index.data(),
+                             block_cardinality.data(),
+                             *d_values,
+                             *d_sparse_table,
+                             d_agg_kinds.data(),
+                             stream);
+
+  // The shared memory groupby is designed so that each thread block can handle up to 128 unique
+  // keys. When a block reaches this cardinality limit, shared memory becomes insufficient to store
+  // the temporary aggregation results. In these situations, we must fall back to a global memory
+  // aggregator to process the remaining aggregation requests.
+  if (needs_fallback) {
+    auto const stride = GROUPBY_BLOCK_SIZE * grid_size;
+    thrust::for_each_n(rmm::exec_policy_nosync(stream),
+                       thrust::counting_iterator{0},
+                       num_rows,
+                       global_memory_fallback_fn{global_set_ref,
+                                                 *d_values,
+                                                 *d_sparse_table,
+                                                 d_agg_kinds.data(),
+                                                 block_cardinality.data(),
+                                                 stride,
+                                                 row_bitmask,
+                                                 skip_rows_with_nulls});
+    extract_populated_keys(global_set, populated_keys, stream);
+  }
+
+  // Move each sparse result column into the sparse_results cache, keyed by value column and agg
+  auto sparse_result_cols = sparse_table.release();
+  for (size_t i = 0; i < aggs.size(); i++) {
+    // Note that the cache will make a copy of this temporary aggregation
+    sparse_results->add_result(
+      flattened_values.column(i), *aggs[i], std::move(sparse_result_cols[i]));
+  }
+
+  return populated_keys;
+}
+}  // namespace cudf::groupby::detail::hash
diff --git a/cpp/src/groupby/hash/compute_single_pass_aggs.hpp b/cpp/src/groupby/hash/compute_aggregations.hpp
similarity index 70%
rename from cpp/src/groupby/hash/compute_single_pass_aggs.hpp
rename to cpp/src/groupby/hash/compute_aggregations.hpp
index a7434bdf61a..829c3c808b0 100644
--- a/cpp/src/groupby/hash/compute_single_pass_aggs.hpp
+++ b/cpp/src/groupby/hash/compute_aggregations.hpp
@@ -21,6 +21,7 @@
 #include <cudf/utilities/span.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
+#include <rmm/device_uvector.hpp>
 
 namespace cudf::groupby::detail::hash {
 /**
@@ -28,11 +29,12 @@ namespace cudf::groupby::detail::hash {
  * over the data and stores the results in `sparse_results`
  */
 template <typename SetType>
-void compute_single_pass_aggs(int64_t num_keys,
-                              bool skip_rows_with_nulls,
-                              bitmask_type const* row_bitmask,
-                              SetType set,
-                              cudf::host_span<cudf::groupby::aggregation_request const> requests,
-                              cudf::detail::result_cache* sparse_results,
-                              rmm::cuda_stream_view stream);
+rmm::device_uvector<cudf::size_type> compute_aggregations(
+  int64_t num_rows,
+  bool skip_rows_with_nulls,
+  bitmask_type const* row_bitmask,
+  SetType& global_set,
+  cudf::host_span<cudf::groupby::aggregation_request const> requests,
+  cudf::detail::result_cache* sparse_results,
+  rmm::cuda_stream_view stream);
 }  // namespace cudf::groupby::detail::hash
diff --git a/cpp/src/groupby/hash/compute_aggregations_null.cu b/cpp/src/groupby/hash/compute_aggregations_null.cu
new file mode 100644
index 00000000000..1d7184227ea
--- /dev/null
+++ b/cpp/src/groupby/hash/compute_aggregations_null.cu
@@ -0,0 +1,29 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "compute_aggregations.cuh"
+#include "compute_aggregations.hpp"
+
+namespace cudf::groupby::detail::hash {
+template rmm::device_uvector<cudf::size_type> compute_aggregations<nullable_global_set_t>(
+  int64_t num_rows,
+  bool skip_rows_with_nulls,
+  bitmask_type const* row_bitmask,
+  nullable_global_set_t& global_set,
+  cudf::host_span<cudf::groupby::aggregation_request const> requests,
+  cudf::detail::result_cache* sparse_results,
+  rmm::cuda_stream_view stream);
+}  // namespace cudf::groupby::detail::hash
diff --git a/cpp/src/groupby/hash/compute_global_memory_aggs.cu b/cpp/src/groupby/hash/compute_global_memory_aggs.cu
new file mode 100644
index 00000000000..6025686953e
--- /dev/null
+++ b/cpp/src/groupby/hash/compute_global_memory_aggs.cu
@@ -0,0 +1,32 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "compute_global_memory_aggs.cuh"
+#include "compute_global_memory_aggs.hpp"
+
+namespace cudf::groupby::detail::hash {
+template rmm::device_uvector<cudf::size_type> compute_global_memory_aggs<global_set_t>(
+  cudf::size_type num_rows,
+  bool skip_rows_with_nulls,
+  bitmask_type const* row_bitmask,
+  cudf::table_view const& flattened_values,
+  cudf::aggregation::Kind const* d_agg_kinds,
+  std::vector<cudf::aggregation::Kind> const& agg_kinds,
+  global_set_t& global_set,
+  std::vector<std::unique_ptr<aggregation>>& aggregations,
+  cudf::detail::result_cache* sparse_results,
+  rmm::cuda_stream_view stream);
+}  // namespace cudf::groupby::detail::hash
diff --git a/cpp/src/groupby/hash/compute_global_memory_aggs.cuh b/cpp/src/groupby/hash/compute_global_memory_aggs.cuh
new file mode 100644
index 00000000000..00db149c6d9
--- /dev/null
+++ b/cpp/src/groupby/hash/compute_global_memory_aggs.cuh
@@ -0,0 +1,89 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#pragma once
+
+#include "compute_global_memory_aggs.hpp"
+#include "create_sparse_results_table.hpp"
+#include "flatten_single_pass_aggs.hpp"
+#include "helpers.cuh"
+#include "single_pass_functors.cuh"
+
+#include <cudf/detail/aggregation/result_cache.hpp>
+#include <cudf/groupby.hpp>
+#include <cudf/table/table_device_view.cuh>
+#include <cudf/types.hpp>
+
+#include <rmm/cuda_stream_view.hpp>
+#include <rmm/device_uvector.hpp>
+#include <rmm/exec_policy.hpp>
+
+#include <cuco/static_set.cuh>
+#include <thrust/for_each.h>
+
+#include <memory>
+#include <vector>
+
+namespace cudf::groupby::detail::hash {
+template <typename SetType>
+rmm::device_uvector<cudf::size_type> compute_global_memory_aggs(
+  cudf::size_type num_rows,
+  bool skip_rows_with_nulls,
+  bitmask_type const* row_bitmask,
+  cudf::table_view const& flattened_values,
+  cudf::aggregation::Kind const* d_agg_kinds,
+  std::vector<cudf::aggregation::Kind> const& agg_kinds,
+  SetType& global_set,
+  std::vector<std::unique_ptr<aggregation>>& aggregations,
+  cudf::detail::result_cache* sparse_results,
+  rmm::cuda_stream_view stream)
+{
+  auto constexpr uses_global_memory_aggs = true;
+  // 'populated_keys' will hold the row indices (keys) inserted into the global hash set
+  rmm::device_uvector<cudf::size_type> populated_keys(num_rows, stream);
+
+  // Make the table that will hold the sparse results
+  cudf::table sparse_table = create_sparse_results_table(flattened_values,
+                                                         d_agg_kinds,
+                                                         agg_kinds,
+                                                         uses_global_memory_aggs,
+                                                         global_set,
+                                                         populated_keys,
+                                                         stream);
+
+  // Create the device views and the set ref consumed by the per-row aggregation functor below
+  auto d_values       = table_device_view::create(flattened_values, stream);
+  auto d_sparse_table = mutable_table_device_view::create(sparse_table, stream);
+  auto global_set_ref = global_set.ref(cuco::op::insert_and_find);
+
+  thrust::for_each_n(
+    rmm::exec_policy_nosync(stream),
+    thrust::counting_iterator{0},
+    num_rows,
+    hash::compute_single_pass_aggs_fn{
+      global_set_ref, *d_values, *d_sparse_table, d_agg_kinds, row_bitmask, skip_rows_with_nulls});
+  extract_populated_keys(global_set, populated_keys, stream);
+
+  // Move each sparse result column into the sparse_results cache, keyed by value column and agg
+  auto sparse_result_cols = sparse_table.release();
+  for (size_t i = 0; i < aggregations.size(); i++) {
+    // Note that the cache will make a copy of this temporary aggregation
+    sparse_results->add_result(
+      flattened_values.column(i), *aggregations[i], std::move(sparse_result_cols[i]));
+  }
+
+  return populated_keys;
+}
+}  // namespace cudf::groupby::detail::hash
diff --git a/cpp/src/groupby/hash/compute_global_memory_aggs.hpp b/cpp/src/groupby/hash/compute_global_memory_aggs.hpp
new file mode 100644
index 00000000000..0777b9ffd93
--- /dev/null
+++ b/cpp/src/groupby/hash/compute_global_memory_aggs.hpp
@@ -0,0 +1,42 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#pragma once
+
+#include <cudf/detail/aggregation/result_cache.hpp>
+#include <cudf/groupby.hpp>
+#include <cudf/table/table_view.hpp>
+#include <cudf/types.hpp>
+
+#include <rmm/cuda_stream_view.hpp>
+#include <rmm/device_uvector.hpp>
+
+#include <memory>
+#include <vector>
+
+namespace cudf::groupby::detail::hash {
+template <typename SetType>
+rmm::device_uvector<cudf::size_type> compute_global_memory_aggs(
+  cudf::size_type num_rows,
+  bool skip_rows_with_nulls,
+  bitmask_type const* row_bitmask,
+  cudf::table_view const& flattened_values,
+  cudf::aggregation::Kind const* d_agg_kinds,
+  std::vector<cudf::aggregation::Kind> const& agg_kinds,
+  SetType& global_set,
+  std::vector<std::unique_ptr<aggregation>>& aggregations,
+  cudf::detail::result_cache* sparse_results,
+  rmm::cuda_stream_view stream);
+}  // namespace cudf::groupby::detail::hash
diff --git a/cpp/src/groupby/hash/compute_global_memory_aggs_null.cu b/cpp/src/groupby/hash/compute_global_memory_aggs_null.cu
new file mode 100644
index 00000000000..209e2b7f20a
--- /dev/null
+++ b/cpp/src/groupby/hash/compute_global_memory_aggs_null.cu
@@ -0,0 +1,32 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "compute_global_memory_aggs.cuh"
+#include "compute_global_memory_aggs.hpp"
+
+namespace cudf::groupby::detail::hash {
+template rmm::device_uvector<cudf::size_type> compute_global_memory_aggs<nullable_global_set_t>(
+  cudf::size_type num_rows,
+  bool skip_rows_with_nulls,
+  bitmask_type const* row_bitmask,
+  cudf::table_view const& flattened_values,
+  cudf::aggregation::Kind const* d_agg_kinds,
+  std::vector<cudf::aggregation::Kind> const& agg_kinds,
+  nullable_global_set_t& global_set,
+  std::vector<std::unique_ptr<aggregation>>& aggregations,
+  cudf::detail::result_cache* sparse_results,
+  rmm::cuda_stream_view stream);
+}  // namespace cudf::groupby::detail::hash
diff --git a/cpp/src/groupby/hash/compute_groupby.cu b/cpp/src/groupby/hash/compute_groupby.cu
index 59457bea694..e1dbf2a3d9e 100644
--- a/cpp/src/groupby/hash/compute_groupby.cu
+++ b/cpp/src/groupby/hash/compute_groupby.cu
@@ -14,8 +14,8 @@
  * limitations under the License.
  */
 
+#include "compute_aggregations.hpp"
 #include "compute_groupby.hpp"
-#include "compute_single_pass_aggs.hpp"
 #include "helpers.cuh"
 #include "sparse_to_dense_results.hpp"
 
@@ -29,7 +29,6 @@
 #include <cudf/utilities/span.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
-#include <rmm/device_uvector.hpp>
 #include <rmm/mr/device/device_memory_resource.hpp>
 
 #include <cuco/static_set.cuh>
@@ -38,18 +37,6 @@
 #include <memory>
 
 namespace cudf::groupby::detail::hash {
-template <typename SetType>
-rmm::device_uvector<size_type> extract_populated_keys(SetType const& key_set,
-                                                      size_type num_keys,
-                                                      rmm::cuda_stream_view stream)
-{
-  rmm::device_uvector<size_type> populated_keys(num_keys, stream);
-  auto const keys_end = key_set.retrieve_all(populated_keys.begin(), stream.value());
-
-  populated_keys.resize(std::distance(populated_keys.begin(), keys_end), stream);
-  return populated_keys;
-}
-
 template <typename Equal, typename Hash>
 std::unique_ptr<table> compute_groupby(table_view const& keys,
                                        host_span<aggregation_request const> requests,
@@ -67,8 +54,8 @@ std::unique_ptr<table> compute_groupby(table_view const& keys,
   // column is indexed by the hash set
   cudf::detail::result_cache sparse_results(requests.size());
 
-  auto const set = cuco::static_set{
-    num_keys,
+  auto set = cuco::static_set{
+    cuco::extent<int64_t>{num_keys},
     cudf::detail::CUCO_DESIRED_LOAD_FACTOR,  // 50% load factor
     cuco::empty_key{cudf::detail::CUDF_SIZE_TYPE_SENTINEL},
     d_row_equal,
@@ -84,17 +71,13 @@ std::unique_ptr<table> compute_groupby(table_view const& keys,
       : rmm::device_buffer{};
 
   // Compute all single pass aggs first
-  compute_single_pass_aggs(num_keys,
-                           skip_rows_with_nulls,
-                           static_cast<bitmask_type*>(row_bitmask.data()),
-                           set.ref(cuco::insert_and_find),
-                           requests,
-                           &sparse_results,
-                           stream);
-
-  // Extract the populated indices from the hash set and create a gather map.
-  // Gathering using this map from sparse results will give dense results.
-  auto gather_map = extract_populated_keys(set, keys.num_rows(), stream);
+  auto gather_map = compute_aggregations(num_keys,
+                                         skip_rows_with_nulls,
+                                         static_cast<bitmask_type*>(row_bitmask.data()),
+                                         set,
+                                         requests,
+                                         &sparse_results,
+                                         stream);
 
   // Compact all results from sparse_results and insert into cache
   sparse_to_dense_results(requests,
@@ -114,12 +97,6 @@ std::unique_ptr<table> compute_groupby(table_view const& keys,
                               mr);
 }
 
-template rmm::device_uvector<size_type> extract_populated_keys<global_set_t>(
-  global_set_t const& key_set, size_type num_keys, rmm::cuda_stream_view stream);
-
-template rmm::device_uvector<size_type> extract_populated_keys<nullable_global_set_t>(
-  nullable_global_set_t const& key_set, size_type num_keys, rmm::cuda_stream_view stream);
-
 template std::unique_ptr<table> compute_groupby<row_comparator_t, row_hash_t>(
   table_view const& keys,
   host_span<aggregation_request const> requests,
diff --git a/cpp/src/groupby/hash/compute_groupby.hpp b/cpp/src/groupby/hash/compute_groupby.hpp
index 7bb3a60ff07..77243dc0a4f 100644
--- a/cpp/src/groupby/hash/compute_groupby.hpp
+++ b/cpp/src/groupby/hash/compute_groupby.hpp
@@ -22,28 +22,11 @@
 #include <cudf/utilities/span.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
-#include <rmm/device_uvector.hpp>
 #include <rmm/mr/device/device_memory_resource.hpp>
 
 #include <memory>
 
 namespace cudf::groupby::detail::hash {
-/**
- * @brief Computes and returns a device vector containing all populated keys in
- * `key_set`.
- *
- * @tparam SetType Type of key hash set
- *
- * @param key_set Key hash set
- * @param num_keys Number of input keys
- * @param stream CUDA stream used for device memory operations and kernel launches
- * @return An array of unique keys contained in `key_set`
- */
-template <typename SetType>
-rmm::device_uvector<size_type> extract_populated_keys(SetType const& key_set,
-                                                      size_type num_keys,
-                                                      rmm::cuda_stream_view stream);
-
 /**
  * @brief Computes groupby using hash table.
  *
diff --git a/cpp/src/groupby/hash/compute_shared_memory_aggs.cu b/cpp/src/groupby/hash/compute_shared_memory_aggs.cu
index 12c02a1865e..f0361ccced2 100644
--- a/cpp/src/groupby/hash/compute_shared_memory_aggs.cu
+++ b/cpp/src/groupby/hash/compute_shared_memory_aggs.cu
@@ -47,9 +47,8 @@ struct size_of_functor {
 /// Shared memory data alignment
 CUDF_HOST_DEVICE cudf::size_type constexpr ALIGNMENT = 8;
 
-// Prepares shared memory data required by each output column, exits if
-// no enough memory space to perform the shared memory aggregation for the
-// current output column
+// Allocates shared memory required for output columns. Exits if there is insufficient memory to
+// perform shared memory aggregation for the current output column.
 __device__ void calculate_columns_to_aggregate(cudf::size_type& col_start,
                                                cudf::size_type& col_end,
                                                cudf::mutable_table_device_view output_values,
@@ -74,9 +73,7 @@ __device__ void calculate_columns_to_aggregate(cudf::size_type& col_start,
                                 ALIGNMENT);
     auto const next_col_total_size = next_col_size + valid_col_size;
 
-    if (bytes_allocated + next_col_total_size > total_agg_size) {
-      CUDF_UNREACHABLE("Not enough memory for shared memory aggregations");
-    }
+    if (bytes_allocated + next_col_total_size > total_agg_size) { break; }
 
     shmem_agg_res_offsets[col_end]  = bytes_allocated;
     shmem_agg_mask_offsets[col_end] = bytes_allocated + next_col_size;
@@ -275,7 +272,7 @@ CUDF_KERNEL void single_pass_shmem_aggs_kernel(cudf::size_type num_rows,
 }
 }  // namespace
 
-std::size_t available_shared_memory_size(cudf::size_type grid_size)
+std::size_t get_available_shared_memory_size(cudf::size_type grid_size)
 {
   auto const active_blocks_per_sm =
     cudf::util::div_rounding_up_safe(grid_size, cudf::detail::num_multiprocessors());
@@ -302,11 +299,11 @@ void compute_shared_memory_aggs(cudf::size_type grid_size,
 {
   // For each aggregation, need one offset determining where the aggregation is
   // performed, another indicating the validity of the aggregation
-  auto const shmem_offsets_size = output_values.num_columns() * sizeof(cudf::size_type);
+  auto const offsets_size = compute_shmem_offsets_size(output_values.num_columns());
   // The rest of shmem is utilized for the actual arrays in shmem
-  CUDF_EXPECTS(available_shmem_size > shmem_offsets_size * 2,
+  CUDF_EXPECTS(available_shmem_size > offsets_size * 2,
                "No enough space for shared memory aggregations");
-  auto const shmem_agg_size = available_shmem_size - shmem_offsets_size * 2;
+  auto const shmem_agg_size = available_shmem_size - offsets_size * 2;
   single_pass_shmem_aggs_kernel<<<grid_size, GROUPBY_BLOCK_SIZE, available_shmem_size, stream>>>(
     num_input_rows,
     row_bitmask,
@@ -318,6 +315,6 @@ void compute_shared_memory_aggs(cudf::size_type grid_size,
     output_values,
     d_agg_kinds,
     shmem_agg_size,
-    shmem_offsets_size);
+    offsets_size);
 }
 }  // namespace cudf::groupby::detail::hash
diff --git a/cpp/src/groupby/hash/compute_shared_memory_aggs.hpp b/cpp/src/groupby/hash/compute_shared_memory_aggs.hpp
index 653821fd53b..346956cdab0 100644
--- a/cpp/src/groupby/hash/compute_shared_memory_aggs.hpp
+++ b/cpp/src/groupby/hash/compute_shared_memory_aggs.hpp
@@ -22,8 +22,12 @@
 #include <rmm/cuda_stream_view.hpp>
 
 namespace cudf::groupby::detail::hash {
+std::size_t get_available_shared_memory_size(cudf::size_type grid_size);
 
-std::size_t available_shared_memory_size(cudf::size_type grid_size);
+std::size_t constexpr compute_shmem_offsets_size(cudf::size_type num_cols)
+{
+  return sizeof(cudf::size_type) * num_cols;
+}
 
 void compute_shared_memory_aggs(cudf::size_type grid_size,
                                 std::size_t available_shmem_size,
@@ -37,5 +41,4 @@ void compute_shared_memory_aggs(cudf::size_type grid_size,
                                 cudf::mutable_table_device_view output_values,
                                 cudf::aggregation::Kind const* d_agg_kinds,
                                 rmm::cuda_stream_view stream);
-
 }  // namespace cudf::groupby::detail::hash
diff --git a/cpp/src/groupby/hash/compute_single_pass_aggs.cu b/cpp/src/groupby/hash/compute_single_pass_aggs.cu
deleted file mode 100644
index e292543e6e9..00000000000
--- a/cpp/src/groupby/hash/compute_single_pass_aggs.cu
+++ /dev/null
@@ -1,99 +0,0 @@
-/*
- * Copyright (c) 2024, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "compute_single_pass_aggs.hpp"
-#include "create_sparse_results_table.hpp"
-#include "flatten_single_pass_aggs.hpp"
-#include "helpers.cuh"
-#include "single_pass_functors.cuh"
-#include "var_hash_functor.cuh"
-
-#include <cudf/column/column_factories.hpp>
-#include <cudf/detail/aggregation/aggregation.cuh>
-#include <cudf/detail/aggregation/result_cache.hpp>
-#include <cudf/detail/gather.hpp>
-#include <cudf/detail/utilities/vector_factories.hpp>
-#include <cudf/groupby.hpp>
-#include <cudf/null_mask.hpp>
-#include <cudf/table/table_view.hpp>
-#include <cudf/types.hpp>
-#include <cudf/utilities/span.hpp>
-
-#include <rmm/cuda_stream_view.hpp>
-#include <rmm/exec_policy.hpp>
-
-#include <algorithm>
-#include <memory>
-#include <vector>
-
-namespace cudf::groupby::detail::hash {
-/**
- * @brief Computes all aggregations from `requests` that require a single pass
- * over the data and stores the results in `sparse_results`
- */
-template <typename SetType>
-void compute_single_pass_aggs(int64_t num_keys,
-                              bool skip_rows_with_nulls,
-                              bitmask_type const* row_bitmask,
-                              SetType set,
-                              host_span<aggregation_request const> requests,
-                              cudf::detail::result_cache* sparse_results,
-                              rmm::cuda_stream_view stream)
-{
-  // flatten the aggs to a table that can be operated on by aggregate_row
-  auto const [flattened_values, agg_kinds, aggs] = flatten_single_pass_aggs(requests);
-
-  // make table that will hold sparse results
-  table sparse_table = create_sparse_results_table(flattened_values, agg_kinds, stream);
-  // prepare to launch kernel to do the actual aggregation
-  auto d_sparse_table = mutable_table_device_view::create(sparse_table, stream);
-  auto d_values       = table_device_view::create(flattened_values, stream);
-  auto const d_aggs   = cudf::detail::make_device_uvector_async(
-    agg_kinds, stream, cudf::get_current_device_resource_ref());
-
-  thrust::for_each_n(
-    rmm::exec_policy_nosync(stream),
-    thrust::make_counting_iterator(0),
-    num_keys,
-    hash::compute_single_pass_aggs_fn{
-      set, *d_values, *d_sparse_table, d_aggs.data(), row_bitmask, skip_rows_with_nulls});
-  // Add results back to sparse_results cache
-  auto sparse_result_cols = sparse_table.release();
-  for (size_t i = 0; i < aggs.size(); i++) {
-    // Note that the cache will make a copy of this temporary aggregation
-    sparse_results->add_result(
-      flattened_values.column(i), *aggs[i], std::move(sparse_result_cols[i]));
-  }
-}
-
-template void compute_single_pass_aggs<hash_set_ref_t<cuco::insert_and_find_tag>>(
-  int64_t num_keys,
-  bool skip_rows_with_nulls,
-  bitmask_type const* row_bitmask,
-  hash_set_ref_t<cuco::insert_and_find_tag> set,
-  host_span<aggregation_request const> requests,
-  cudf::detail::result_cache* sparse_results,
-  rmm::cuda_stream_view stream);
-
-template void compute_single_pass_aggs<nullable_hash_set_ref_t<cuco::insert_and_find_tag>>(
-  int64_t num_keys,
-  bool skip_rows_with_nulls,
-  bitmask_type const* row_bitmask,
-  nullable_hash_set_ref_t<cuco::insert_and_find_tag> set,
-  host_span<aggregation_request const> requests,
-  cudf::detail::result_cache* sparse_results,
-  rmm::cuda_stream_view stream);
-}  // namespace cudf::groupby::detail::hash
diff --git a/cpp/src/groupby/hash/create_sparse_results_table.cu b/cpp/src/groupby/hash/create_sparse_results_table.cu
index 22fa4fc584c..bc32e306b3f 100644
--- a/cpp/src/groupby/hash/create_sparse_results_table.cu
+++ b/cpp/src/groupby/hash/create_sparse_results_table.cu
@@ -15,53 +15,110 @@
  */
 
 #include "create_sparse_results_table.hpp"
+#include "helpers.cuh"
+#include "single_pass_functors.cuh"
 
+#include <cudf/aggregation.hpp>
 #include <cudf/column/column_factories.hpp>
 #include <cudf/detail/aggregation/aggregation.cuh>
-#include <cudf/dictionary/dictionary_column_view.hpp>
-#include <cudf/groupby.hpp>
-#include <cudf/table/table.hpp>
 #include <cudf/table/table_view.hpp>
 #include <cudf/types.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
+#include <rmm/device_uvector.hpp>
+
+#include <cuco/static_set.cuh>
 
 #include <algorithm>
 #include <memory>
 #include <vector>
 
 namespace cudf::groupby::detail::hash {
+template <typename SetType>
+void extract_populated_keys(SetType const& key_set,
+                            rmm::device_uvector<cudf::size_type>& populated_keys,
+                            rmm::cuda_stream_view stream)
+{
+  auto const keys_end = key_set.retrieve_all(populated_keys.begin(), stream.value());
+
+  populated_keys.resize(std::distance(populated_keys.begin(), keys_end), stream);
+}
+
 // make table that will hold sparse results
-cudf::table create_sparse_results_table(table_view const& flattened_values,
-                                        std::vector<aggregation::Kind> aggs,
+template <typename GlobalSetType>
+cudf::table create_sparse_results_table(cudf::table_view const& flattened_values,
+                                        cudf::aggregation::Kind const* d_agg_kinds,
+                                        std::vector<cudf::aggregation::Kind> agg_kinds,
+                                        bool direct_aggregations,
+                                        GlobalSetType const& global_set,
+                                        rmm::device_uvector<cudf::size_type>& populated_keys,
                                         rmm::cuda_stream_view stream)
 {
   // TODO single allocation - room for performance improvement
-  std::vector<std::unique_ptr<column>> sparse_columns;
-  sparse_columns.reserve(flattened_values.num_columns());
-  std::transform(
-    flattened_values.begin(),
-    flattened_values.end(),
-    aggs.begin(),
-    std::back_inserter(sparse_columns),
-    [stream](auto const& col, auto const& agg) {
-      bool nullable =
-        (agg == aggregation::COUNT_VALID or agg == aggregation::COUNT_ALL)
-          ? false
-          : (col.has_nulls() or agg == aggregation::VARIANCE or agg == aggregation::STD);
-      auto mask_flag = (nullable) ? mask_state::ALL_NULL : mask_state::UNALLOCATED;
+  std::vector<std::unique_ptr<cudf::column>> sparse_columns;
+  std::transform(flattened_values.begin(),
+                 flattened_values.end(),
+                 agg_kinds.begin(),
+                 std::back_inserter(sparse_columns),
+                 [stream](auto const& col, auto const& agg) {
+                   auto const nullable =
+                     (agg == cudf::aggregation::COUNT_VALID or agg == cudf::aggregation::COUNT_ALL)
+                       ? false
+                       : (col.has_nulls() or agg == cudf::aggregation::VARIANCE or
+                          agg == cudf::aggregation::STD);
+                   auto const mask_flag =
+                     (nullable) ? cudf::mask_state::ALL_NULL : cudf::mask_state::UNALLOCATED;
+                   auto const col_type = cudf::is_dictionary(col.type())
+                                           ? cudf::dictionary_column_view(col).keys().type()
+                                           : col.type();
+                   return make_fixed_width_column(
+                     cudf::detail::target_type(col_type, agg), col.size(), mask_flag, stream);
+                 });
+  cudf::table sparse_table(std::move(sparse_columns));
+  // If no direct aggregations, initialize the sparse table
+  // only for the keys inserted in global hash set
+  if (!direct_aggregations) {
+    auto d_sparse_table = cudf::mutable_table_device_view::create(sparse_table, stream);
+    extract_populated_keys(global_set, populated_keys, stream);
+    thrust::for_each_n(
+      rmm::exec_policy(stream),
+      thrust::make_counting_iterator(0),
+      populated_keys.size(),
+      initialize_sparse_table{populated_keys.data(), *d_sparse_table, d_agg_kinds});
+  }
+  // Else initialize the whole table
+  else {
+    cudf::mutable_table_view sparse_table_view = sparse_table.mutable_view();
+    cudf::detail::initialize_with_identity(sparse_table_view, agg_kinds, stream);
+  }
+  return sparse_table;
+}
 
-      auto col_type = cudf::is_dictionary(col.type())
-                        ? cudf::dictionary_column_view(col).keys().type()
-                        : col.type();
+template void extract_populated_keys<global_set_t>(
+  global_set_t const& key_set,
+  rmm::device_uvector<cudf::size_type>& populated_keys,
+  rmm::cuda_stream_view stream);
 
-      return make_fixed_width_column(
-        cudf::detail::target_type(col_type, agg), col.size(), mask_flag, stream);
-    });
+template void extract_populated_keys<nullable_global_set_t>(
+  nullable_global_set_t const& key_set,
+  rmm::device_uvector<cudf::size_type>& populated_keys,
+  rmm::cuda_stream_view stream);
 
-  table sparse_table(std::move(sparse_columns));
-  mutable_table_view table_view = sparse_table.mutable_view();
-  cudf::detail::initialize_with_identity(table_view, aggs, stream);
-  return sparse_table;
-}
+template cudf::table create_sparse_results_table<global_set_t>(
+  cudf::table_view const& flattened_values,
+  cudf::aggregation::Kind const* d_agg_kinds,
+  std::vector<cudf::aggregation::Kind> agg_kinds,
+  bool direct_aggregations,
+  global_set_t const& global_set,
+  rmm::device_uvector<cudf::size_type>& populated_keys,
+  rmm::cuda_stream_view stream);
+
+template cudf::table create_sparse_results_table<nullable_global_set_t>(
+  cudf::table_view const& flattened_values,
+  cudf::aggregation::Kind const* d_agg_kinds,
+  std::vector<cudf::aggregation::Kind> agg_kinds,
+  bool direct_aggregations,
+  nullable_global_set_t const& global_set,
+  rmm::device_uvector<cudf::size_type>& populated_keys,
+  rmm::cuda_stream_view stream);
 }  // namespace cudf::groupby::detail::hash
diff --git a/cpp/src/groupby/hash/create_sparse_results_table.hpp b/cpp/src/groupby/hash/create_sparse_results_table.hpp
index c1d4e0d3f20..8155ce852e0 100644
--- a/cpp/src/groupby/hash/create_sparse_results_table.hpp
+++ b/cpp/src/groupby/hash/create_sparse_results_table.hpp
@@ -15,18 +15,41 @@
  */
 #pragma once
 
+#include <cudf/aggregation.hpp>
 #include <cudf/groupby.hpp>
 #include <cudf/table/table.hpp>
 #include <cudf/table/table_view.hpp>
 #include <cudf/types.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
+#include <rmm/device_uvector.hpp>
 
 #include <vector>
 
 namespace cudf::groupby::detail::hash {
+/**
+ * @brief Retrieves all populated keys in `key_set` and stores them in
+ * `populated_keys`.
+ *
+ * @tparam SetType Type of the key hash set
+ *
+ * @param key_set Key hash set
+ * @param[out] populated_keys Device vector that receives the unique keys in `key_set` and is
+ * resized to the number of keys retrieved
+ * @param stream CUDA stream used for device memory operations and kernel launches
+ */
+template <typename SetType>
+void extract_populated_keys(SetType const& key_set,
+                            rmm::device_uvector<cudf::size_type>& populated_keys,
+                            rmm::cuda_stream_view stream);
+
 // make table that will hold sparse results
-cudf::table create_sparse_results_table(table_view const& flattened_values,
-                                        std::vector<aggregation::Kind> aggs_kinds,
+template <typename GlobalSetType>
+cudf::table create_sparse_results_table(cudf::table_view const& flattened_values,
+                                        cudf::aggregation::Kind const* d_agg_kinds,
+                                        std::vector<cudf::aggregation::Kind> agg_kinds,
+                                        bool direct_aggregations,
+                                        GlobalSetType const& global_set,
+                                        rmm::device_uvector<cudf::size_type>& populated_keys,
                                         rmm::cuda_stream_view stream);
 }  // namespace cudf::groupby::detail::hash
diff --git a/cpp/src/groupby/hash/helpers.cuh b/cpp/src/groupby/hash/helpers.cuh
index 00836567b4f..f950e03e0fb 100644
--- a/cpp/src/groupby/hash/helpers.cuh
+++ b/cpp/src/groupby/hash/helpers.cuh
@@ -23,8 +23,6 @@
 #include <cuco/static_set.cuh>
 
 namespace cudf::groupby::detail::hash {
-// TODO: similar to `contains_table`, using larger CG size like 2 or 4 for nested
-// types and `cg_size = 1`for flat data to improve performance
 /// Number of threads to handle each input element
 CUDF_HOST_DEVICE auto constexpr GROUPBY_CG_SIZE = 1;
 
diff --git a/cpp/src/groupby/hash/single_pass_functors.cuh b/cpp/src/groupby/hash/single_pass_functors.cuh
index 28a5b578e00..048c9252773 100644
--- a/cpp/src/groupby/hash/single_pass_functors.cuh
+++ b/cpp/src/groupby/hash/single_pass_functors.cuh
@@ -15,12 +15,14 @@
  */
 #pragma once
 
-#include <cudf/detail/aggregation/aggregation.hpp>
+#include "helpers.cuh"
+
 #include <cudf/detail/aggregation/device_aggregators.cuh>
-#include <cudf/groupby.hpp>
-#include <cudf/utilities/bit.hpp>
+#include <cudf/detail/utilities/assert.cuh>
+#include <cudf/detail/utilities/device_atomics.cuh>
+#include <cudf/utilities/traits.cuh>
 
-#include <cuco/static_set_ref.cuh>
+#include <cuda/std/cstddef>
 
 namespace cudf::groupby::detail::hash {
 // TODO: TO BE REMOVED issue tracked via #17171
@@ -104,6 +106,114 @@ struct initialize_shmem {
   }
 };
 
+template <typename Target, cudf::aggregation::Kind k, typename Enable = void>
+struct initialize_target_element_gmem {
+  __device__ void operator()(cudf::mutable_column_device_view, cudf::size_type) const noexcept
+  {
+    CUDF_UNREACHABLE("Invalid source type and aggregation combination.");
+  }
+};
+
+template <typename Target, cudf::aggregation::Kind k>
+struct initialize_target_element_gmem<
+  Target,
+  k,
+  std::enable_if_t<is_supported<Target, k>() && cudf::is_fixed_width<Target>() &&
+                   !cudf::is_fixed_point<Target>()>> {
+  __device__ void operator()(cudf::mutable_column_device_view target,
+                             cudf::size_type target_index) const noexcept
+  {
+    using DeviceType                     = cudf::device_storage_type_t<Target>;
+    target.element<Target>(target_index) = get_identity<DeviceType, k>();
+  }
+};
+
+template <typename Target, cudf::aggregation::Kind k>
+struct initialize_target_element_gmem<
+  Target,
+  k,
+  std::enable_if_t<is_supported<Target, k>() && cudf::is_fixed_point<Target>()>> {
+  __device__ void operator()(cudf::mutable_column_device_view target,
+                             cudf::size_type target_index) const noexcept
+  {
+    using DeviceType                         = cudf::device_storage_type_t<Target>;
+    target.element<DeviceType>(target_index) = get_identity<DeviceType, k>();
+  }
+};
+
+struct initialize_gmem {
+  template <typename Target, cudf::aggregation::Kind k>
+  __device__ void operator()(cudf::mutable_column_device_view target,
+                             cudf::size_type target_index) const noexcept
+  {
+    initialize_target_element_gmem<Target, k>{}(target, target_index);
+  }
+};
+
+struct initialize_sparse_table {
+  cudf::size_type const* row_indices;
+  cudf::mutable_table_device_view sparse_table;
+  cudf::aggregation::Kind const* __restrict__ aggs;
+  initialize_sparse_table(cudf::size_type const* row_indices,
+                          cudf::mutable_table_device_view sparse_table,
+                          cudf::aggregation::Kind const* aggs)
+    : row_indices(row_indices), sparse_table(sparse_table), aggs(aggs)
+  {
+  }
+  __device__ void operator()(cudf::size_type i)
+  {
+    auto key_idx = row_indices[i];
+    for (auto col_idx = 0; col_idx < sparse_table.num_columns(); col_idx++) {
+      cudf::detail::dispatch_type_and_aggregation(sparse_table.column(col_idx).type(),
+                                                  aggs[col_idx],
+                                                  initialize_gmem{},
+                                                  sparse_table.column(col_idx),
+                                                  key_idx);
+    }
+  }
+};
+
+template <typename SetType>
+struct global_memory_fallback_fn {
+  SetType set;
+  cudf::table_device_view input_values;
+  cudf::mutable_table_device_view output_values;
+  cudf::aggregation::Kind const* __restrict__ aggs;
+  cudf::size_type* block_cardinality;
+  cudf::size_type stride;
+  bitmask_type const* __restrict__ row_bitmask;
+  bool skip_rows_with_nulls;
+
+  global_memory_fallback_fn(SetType set,
+                            cudf::table_device_view input_values,
+                            cudf::mutable_table_device_view output_values,
+                            cudf::aggregation::Kind const* aggs,
+                            cudf::size_type* block_cardinality,
+                            cudf::size_type stride,
+                            bitmask_type const* row_bitmask,
+                            bool skip_rows_with_nulls)
+    : set(set),
+      input_values(input_values),
+      output_values(output_values),
+      aggs(aggs),
+      block_cardinality(block_cardinality),
+      stride(stride),
+      row_bitmask(row_bitmask),
+      skip_rows_with_nulls(skip_rows_with_nulls)
+  {
+  }
+
+  __device__ void operator()(cudf::size_type i)
+  {
+    auto const block_id = (i % stride) / GROUPBY_BLOCK_SIZE;
+    if (block_cardinality[block_id] >= GROUPBY_CARDINALITY_THRESHOLD and
+        (not skip_rows_with_nulls or cudf::bit_is_set(row_bitmask, i))) {
+      auto const result = set.insert_and_find(i);
+      cudf::detail::aggregate_row(output_values, *result.first, input_values, i, aggs);
+    }
+  }
+};
+
 /**
  * @brief Computes single-pass aggregations and store results into a sparse `output_values` table,
  * and populate `set` with indices of unique keys

From d295f17f4468004367fe60088854ac5513519d32 Mon Sep 17 00:00:00 2001
From: Matthew Murray <41342305+Matt711@users.noreply.github.com>
Date: Fri, 8 Nov 2024 16:22:08 -0500
Subject: [PATCH 02/19] Add `cudf::calendrical_month_sequence` to pylibcudf
 (#17277)

Part of #15162. Also adds tests for `pylibcudf.filling`.

Authors:
  - Matthew Murray (https://github.com/Matt711)

Approvers:
  - Matthew Roeschke (https://github.com/mroeschke)

URL: https://github.com/rapidsai/cudf/pull/17277
---
 python/cudf/cudf/_lib/datetime.pyx            | 21 ++---
 python/pylibcudf/pylibcudf/filling.pxd        |  6 ++
 python/pylibcudf/pylibcudf/filling.pyx        | 37 ++++++++
 .../pylibcudf/pylibcudf/tests/test_filling.py | 91 +++++++++++++++++++
 4 files changed, 140 insertions(+), 15 deletions(-)
 create mode 100644 python/pylibcudf/pylibcudf/tests/test_filling.py

diff --git a/python/cudf/cudf/_lib/datetime.pyx b/python/cudf/cudf/_lib/datetime.pyx
index 2c7a585f4b1..7e8f29dac93 100644
--- a/python/cudf/cudf/_lib/datetime.pyx
+++ b/python/cudf/cudf/_lib/datetime.pyx
@@ -4,13 +4,7 @@ import warnings
 
 from cudf.core.buffer import acquire_spill_lock
 
-from libcpp.memory cimport unique_ptr
-from libcpp.utility cimport move
-
 cimport pylibcudf.libcudf.datetime as libcudf_datetime
-from pylibcudf.libcudf.column.column cimport column
-from pylibcudf.libcudf.filling cimport calendrical_month_sequence
-from pylibcudf.libcudf.scalar.scalar cimport scalar
 from pylibcudf.libcudf.types cimport size_type
 from pylibcudf.datetime import DatetimeComponent, RoundingFrequency
 
@@ -143,20 +137,17 @@ def is_leap_year(Column col):
 
 @acquire_spill_lock()
 def date_range(DeviceScalar start, size_type n, offset):
-    cdef unique_ptr[column] c_result
     cdef size_type months = (
         offset.kwds.get("years", 0) * 12
         + offset.kwds.get("months", 0)
     )
-
-    cdef const scalar* c_start = start.get_raw_ptr()
-    with nogil:
-        c_result = move(calendrical_month_sequence(
+    return Column.from_pylibcudf(
+        plc.filling.calendrical_month_sequence(
             n,
-            c_start[0],
-            months
-        ))
-    return Column.from_unique_ptr(move(c_result))
+            start.c_value,
+            months,
+        )
+    )
 
 
 @acquire_spill_lock()
diff --git a/python/pylibcudf/pylibcudf/filling.pxd b/python/pylibcudf/pylibcudf/filling.pxd
index b9345f8cd42..56aef086e1b 100644
--- a/python/pylibcudf/pylibcudf/filling.pxd
+++ b/python/pylibcudf/pylibcudf/filling.pxd
@@ -33,3 +33,9 @@ cpdef Table repeat(
     Table input_table,
     ColumnOrSize count
 )
+
+cpdef Column calendrical_month_sequence(
+    size_type n,
+    Scalar init,
+    size_type months,
+)
diff --git a/python/pylibcudf/pylibcudf/filling.pyx b/python/pylibcudf/pylibcudf/filling.pyx
index a47004a1e42..313605ead16 100644
--- a/python/pylibcudf/pylibcudf/filling.pyx
+++ b/python/pylibcudf/pylibcudf/filling.pyx
@@ -9,6 +9,7 @@ from pylibcudf.libcudf.filling cimport (
     fill_in_place as cpp_fill_in_place,
     repeat as cpp_repeat,
     sequence as cpp_sequence,
+    calendrical_month_sequence as cpp_calendrical_month_sequence
 )
 from pylibcudf.libcudf.table.table cimport table
 from pylibcudf.libcudf.types cimport size_type
@@ -164,3 +165,39 @@ cpdef Table repeat(
                 count
             )
     return Table.from_libcudf(move(result))
+
+
+cpdef Column calendrical_month_sequence(
+    size_type n,
+    Scalar init,
+    size_type months,
+):
+
+    """Generate `n` timestamps starting at `init`, stepping by `months` months.
+
+    For details, see :cpp:func:`calendrical_month_sequence`.
+
+    Parameters
+    ----------
+    n : size_type
+        Number of timestamps to generate
+    init : Scalar
+        The initial timestamp
+    months : size_type
+        Months to increment
+
+    Returns
+    -------
+    pylibcudf.Column
+        Timestamps column with sequences of months
+    """
+
+    cdef unique_ptr[column] c_result
+
+    with nogil:
+        c_result = cpp_calendrical_month_sequence(
+            n,
+            dereference(init.c_obj),
+            months
+        )
+    return Column.from_libcudf(move(c_result))
diff --git a/python/pylibcudf/pylibcudf/tests/test_filling.py b/python/pylibcudf/pylibcudf/tests/test_filling.py
new file mode 100644
index 00000000000..91c7e42a0a0
--- /dev/null
+++ b/python/pylibcudf/pylibcudf/tests/test_filling.py
@@ -0,0 +1,91 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+from datetime import datetime
+
+import pyarrow as pa
+import pytest
+from utils import assert_column_eq, assert_table_eq
+
+import pylibcudf as plc
+
+
+@pytest.fixture
+def pa_col():
+    return pa.array([2, 3, 5, 7, 11])
+
+
+@pytest.fixture
+def pa_table():
+    pa_col = pa.array([1, 2, 3])
+    return pa.table([pa_col], names=["a"])
+
+
+def test_fill(pa_col):
+    result = plc.filling.fill(
+        plc.interop.from_arrow(pa_col),
+        1,
+        3,
+        plc.interop.from_arrow(pa.scalar(5)),
+    )
+    expect = pa.array([2, 5, 5, 7, 11])
+    assert_column_eq(result, expect)
+
+
+def test_fill_in_place(pa_col):
+    result = plc.interop.from_arrow(pa_col)
+    plc.filling.fill_in_place(
+        result,
+        1,
+        3,
+        plc.interop.from_arrow(pa.scalar(5)),
+    )
+    expect = pa.array([2, 5, 5, 7, 11])
+    assert_column_eq(result, expect)
+
+
+def test_sequence():
+    size = 5
+    init_scalar = plc.interop.from_arrow(pa.scalar(10))
+    step_scalar = plc.interop.from_arrow(pa.scalar(2))
+    result = plc.filling.sequence(
+        size,
+        init_scalar,
+        step_scalar,
+    )
+    expect = pa.array([10, 12, 14, 16, 18])
+    assert_column_eq(result, expect)
+
+
+def test_repeat_with_count_int(pa_table):
+    input_table = plc.interop.from_arrow(pa_table)
+    count = 2
+    result = plc.filling.repeat(input_table, count)
+    expect = pa.table([[1, 1, 2, 2, 3, 3]], names=["a"])
+    assert_table_eq(expect, result)
+
+
+def test_repeat_with_count_column(pa_table):
+    input_table = plc.interop.from_arrow(pa_table)
+    count = plc.interop.from_arrow(pa.array([1, 2, 3]))
+    result = plc.filling.repeat(input_table, count)
+    expect = pa.table([[1] + [2] * 2 + [3] * 3], names=["a"])
+    assert_table_eq(expect, result)
+
+
+def test_calendrical_month_sequence():
+    n = 5
+    init_date = datetime(2020, 1, 31)
+    init = plc.interop.from_arrow(
+        pa.scalar(init_date, type=pa.timestamp("ms"))
+    )
+    months = 1
+    result = plc.filling.calendrical_month_sequence(n, init, months)
+    expected_dates = [
+        datetime(2020, 1, 31),
+        datetime(2020, 2, 29),
+        datetime(2020, 3, 31),
+        datetime(2020, 4, 30),
+        datetime(2020, 5, 31),
+    ]
+    expect = pa.array(expected_dates, type=pa.timestamp("ms"))
+    assert_column_eq(result, expect)

From fea46cd869bac0e312a898ca959783aa8db2ad5f Mon Sep 17 00:00:00 2001
From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
Date: Fri, 8 Nov 2024 14:14:55 -0800
Subject: [PATCH 03/19] Add read_parquet_metadata to pylibcudf (#17245)

Contributes to https://github.com/rapidsai/cudf/issues/15162

Authors:
  - Matthew Roeschke (https://github.com/mroeschke)
  - Vyas Ramasubramani (https://github.com/vyasr)

Approvers:
  - Lawrence Mitchell (https://github.com/wence-)

URL: https://github.com/rapidsai/cudf/pull/17245
---
 docs/cudf/source/conf.py                      |   2 +
 .../api_docs/pylibcudf/io/index.rst           |   1 +
 .../pylibcudf/io/parquet_metadata.rst         |   6 +
 python/cudf/cudf/_lib/io/utils.pxd            |   1 -
 python/cudf/cudf/_lib/io/utils.pyx            |  56 -----
 python/cudf/cudf/_lib/parquet.pyx             |  67 ++----
 python/cudf/cudf/tests/test_parquet.py        |   4 +-
 python/pylibcudf/pylibcudf/io/CMakeLists.txt  |   4 +-
 python/pylibcudf/pylibcudf/io/__init__.pxd    |  12 +-
 python/pylibcudf/pylibcudf/io/__init__.py     |  13 +-
 .../pylibcudf/io/parquet_metadata.pxd         |  51 +++++
 .../pylibcudf/io/parquet_metadata.pyx         | 207 ++++++++++++++++++
 .../pylibcudf/libcudf/io/parquet_metadata.pxd |   4 +-
 13 files changed, 318 insertions(+), 110 deletions(-)
 create mode 100644 docs/cudf/source/user_guide/api_docs/pylibcudf/io/parquet_metadata.rst
 create mode 100644 python/pylibcudf/pylibcudf/io/parquet_metadata.pxd
 create mode 100644 python/pylibcudf/pylibcudf/io/parquet_metadata.pyx

diff --git a/docs/cudf/source/conf.py b/docs/cudf/source/conf.py
index 5942cc16850..0d463b918d3 100644
--- a/docs/cudf/source/conf.py
+++ b/docs/cudf/source/conf.py
@@ -554,6 +554,8 @@ def on_missing_reference(app, env, node, contnode):
 
 
 nitpick_ignore = [
+    # Erroneously warned in ParquetColumnSchema.name
+    ("py:class", "unicode"),
     ("py:class", "SeriesOrIndex"),
     ("py:class", "Dtype"),
     # The following are erroneously warned due to
diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/io/index.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/io/index.rst
index cd5c5a5f77e..1c1c8040972 100644
--- a/docs/cudf/source/user_guide/api_docs/pylibcudf/io/index.rst
+++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/io/index.rst
@@ -19,5 +19,6 @@ I/O Functions
     csv
     json
     parquet
+    parquet_metadata
     text
     timezone
diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/io/parquet_metadata.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/io/parquet_metadata.rst
new file mode 100644
index 00000000000..fce964f9714
--- /dev/null
+++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/io/parquet_metadata.rst
@@ -0,0 +1,6 @@
+================
+Parquet Metadata
+================
+
+.. automodule:: pylibcudf.io.parquet_metadata
+   :members:
diff --git a/python/cudf/cudf/_lib/io/utils.pxd b/python/cudf/cudf/_lib/io/utils.pxd
index 76a6e32fde0..96504ebdd66 100644
--- a/python/cudf/cudf/_lib/io/utils.pxd
+++ b/python/cudf/cudf/_lib/io/utils.pxd
@@ -13,7 +13,6 @@ from pylibcudf.libcudf.io.types cimport (
 from cudf._lib.column cimport Column
 
 
-cdef source_info make_source_info(list src) except*
 cdef sink_info make_sinks_info(
     list src, vector[unique_ptr[data_sink]] & data) except*
 cdef sink_info make_sink_info(src, unique_ptr[data_sink] & data) except*
diff --git a/python/cudf/cudf/_lib/io/utils.pyx b/python/cudf/cudf/_lib/io/utils.pyx
index 564daefbae2..f23980b387a 100644
--- a/python/cudf/cudf/_lib/io/utils.pyx
+++ b/python/cudf/cudf/_lib/io/utils.pyx
@@ -7,76 +7,20 @@ from libcpp.string cimport string
 from libcpp.utility cimport move
 from libcpp.vector cimport vector
 
-from pylibcudf.io.datasource cimport Datasource
 from pylibcudf.libcudf.io.data_sink cimport data_sink
-from pylibcudf.libcudf.io.datasource cimport datasource
 from pylibcudf.libcudf.io.types cimport (
     column_name_info,
-    host_buffer,
     sink_info,
-    source_info,
 )
 
 from cudf._lib.column cimport Column
 
 import codecs
-import errno
 import io
 import os
 
 from cudf.core.dtypes import StructDtype
 
-
-# Converts the Python source input to libcudf IO source_info
-# with the appropriate type and source values
-cdef source_info make_source_info(list src) except*:
-    if not src:
-        raise ValueError("Need to pass at least one source")
-
-    cdef const unsigned char[::1] c_buffer
-    cdef vector[host_buffer] c_host_buffers
-    cdef vector[string] c_files
-    cdef Datasource csrc
-    cdef vector[datasource*] c_datasources
-    empty_buffer = False
-    if isinstance(src[0], bytes):
-        empty_buffer = True
-        for buffer in src:
-            if (len(buffer) > 0):
-                c_buffer = buffer
-                c_host_buffers.push_back(host_buffer(<char*>&c_buffer[0],
-                                                     c_buffer.shape[0]))
-                empty_buffer = False
-    elif isinstance(src[0], io.BytesIO):
-        for bio in src:
-            c_buffer = bio.getbuffer()  # check if empty?
-            c_host_buffers.push_back(host_buffer(<char*>&c_buffer[0],
-                                                 c_buffer.shape[0]))
-    # Otherwise src is expected to be a numeric fd, string path, or PathLike.
-    # TODO (ptaylor): Might need to update this check if accepted input types
-    #                 change when UCX and/or cuStreamz support is added.
-    elif isinstance(src[0], Datasource):
-        for csrc in src:
-            c_datasources.push_back(csrc.get_datasource())
-        return source_info(c_datasources)
-    elif isinstance(src[0], (int, float, complex, basestring, os.PathLike)):
-        # If source is a file, return source_info where type=FILEPATH
-        if not all(os.path.isfile(file) for file in src):
-            raise FileNotFoundError(errno.ENOENT,
-                                    os.strerror(errno.ENOENT),
-                                    src)
-
-        files = [<string> str(elem).encode() for elem in src]
-        c_files = files
-        return source_info(c_files)
-    else:
-        raise TypeError("Unrecognized input type: {}".format(type(src[0])))
-
-    if empty_buffer is True:
-        c_host_buffers.push_back(host_buffer(<char*>NULL, 0))
-
-    return source_info(c_host_buffers)
-
 # Converts the Python sink input to libcudf IO sink_info.
 cdef sink_info make_sinks_info(
     list src, vector[unique_ptr[data_sink]] & sink
diff --git a/python/cudf/cudf/_lib/parquet.pyx b/python/cudf/cudf/_lib/parquet.pyx
index 1212637d330..d4bd0cd306c 100644
--- a/python/cudf/cudf/_lib/parquet.pyx
+++ b/python/cudf/cudf/_lib/parquet.pyx
@@ -27,7 +27,6 @@ from libcpp cimport bool
 from libcpp.map cimport map
 from libcpp.memory cimport make_unique, unique_ptr
 from libcpp.string cimport string
-from libcpp.unordered_map cimport unordered_map
 from libcpp.utility cimport move
 from libcpp.vector cimport vector
 
@@ -41,12 +40,7 @@ from pylibcudf.libcudf.io.parquet cimport (
     parquet_writer_options,
     write_parquet as parquet_writer,
 )
-from pylibcudf.libcudf.io.parquet_metadata cimport (
-    parquet_metadata,
-    read_parquet_metadata as parquet_metadata_reader,
-)
 from pylibcudf.libcudf.io.types cimport (
-    source_info,
     sink_info,
     column_in_metadata,
     table_input_metadata,
@@ -62,7 +56,6 @@ from cudf._lib.column cimport Column
 from cudf._lib.io.utils cimport (
     add_df_col_struct_names,
     make_sinks_info,
-    make_source_info,
 )
 from cudf._lib.utils cimport table_view_from_table
 
@@ -373,7 +366,7 @@ cpdef read_parquet(filepaths_or_buffers, columns=None, row_groups=None,
                            nrows=nrows, skip_rows=skip_rows)
     return df
 
-cpdef read_parquet_metadata(filepaths_or_buffers):
+cpdef read_parquet_metadata(list filepaths_or_buffers):
     """
     Cython function to call into libcudf API, see `read_parquet_metadata`.
 
@@ -382,56 +375,40 @@ cpdef read_parquet_metadata(filepaths_or_buffers):
     cudf.io.parquet.read_parquet
     cudf.io.parquet.to_parquet
     """
-    cdef source_info source = make_source_info(filepaths_or_buffers)
-
-    args = move(source)
-
-    cdef parquet_metadata c_result
-
-    # Read Parquet metadata
-    with nogil:
-        c_result = move(parquet_metadata_reader(args))
-
-    # access and return results
-    num_rows = c_result.num_rows()
-    num_rowgroups = c_result.num_rowgroups()
-
-    # extract row group metadata and sanitize keys
-    row_group_metadata = [{k.decode(): v for k, v in metadata}
-                          for metadata in c_result.rowgroup_metadata()]
+    parquet_metadata = plc.io.parquet_metadata.read_parquet_metadata(
+        plc.io.SourceInfo(filepaths_or_buffers)
+    )
 
     # read all column names including index column, if any
-    col_names = [info.name().decode() for info in c_result.schema().root().children()]
-
-    # access the Parquet file_footer to find the index
-    index_col = None
-    cdef unordered_map[string, string] file_footer = c_result.metadata()
+    col_names = [info.name() for info in parquet_metadata.schema().root().children()]
 
-    # get index column name(s)
-    index_col_names = None
-    json_str = file_footer[b'pandas'].decode('utf-8')
-    meta = None
+    index_col_names = set()
+    json_str = parquet_metadata.metadata().get('pandas', "")  # key may be absent
     if json_str != "":
         meta = json.loads(json_str)
         file_is_range_index, index_col, _ = _parse_metadata(meta)
-        if not file_is_range_index and index_col is not None \
-                and index_col_names is None:
-            index_col_names = {}
+        if (
+            not file_is_range_index
+            and index_col is not None
+        ):
+            columns = meta['columns']
             for idx_col in index_col:
-                for c in meta['columns']:
+                for c in columns:
                     if c['field_name'] == idx_col:
-                        index_col_names[idx_col] = c['name']
+                        index_col_names.add(idx_col)
 
     # remove the index column from the list of column names
     # only if index_col_names is not None
-    if index_col_names is not None:
+    if len(index_col_names) >= 0:
         col_names = [name for name in col_names if name not in index_col_names]
 
-    # num_columns = length of list(col_names)
-    num_columns = len(col_names)
-
-    # return the metadata
-    return num_rows, num_rowgroups, col_names, num_columns, row_group_metadata
+    return (
+        parquet_metadata.num_rows(),
+        parquet_metadata.num_rowgroups(),
+        col_names,
+        len(col_names),
+        parquet_metadata.rowgroup_metadata()
+    )
 
 
 @acquire_spill_lock()
diff --git a/python/cudf/cudf/tests/test_parquet.py b/python/cudf/cudf/tests/test_parquet.py
index c9ce24d2a5b..3c4398a87de 100644
--- a/python/cudf/cudf/tests/test_parquet.py
+++ b/python/cudf/cudf/tests/test_parquet.py
@@ -405,14 +405,14 @@ def test_parquet_range_index_pandas_metadata(tmpdir, pandas_compat, as_bytes):
     assert_eq(expect, got)
 
 
-def test_parquet_read_metadata(tmpdir, pdf):
+def test_parquet_read_metadata(tmp_path, pdf):
     if len(pdf) > 100:
         pytest.skip("Skipping long setup test")
 
     def num_row_groups(rows, group_size):
         return max(1, (rows + (group_size - 1)) // group_size)
 
-    fname = tmpdir.join("metadata.parquet")
+    fname = tmp_path / "metadata.parquet"
     row_group_size = 5
     pdf.to_parquet(fname, compression="snappy", row_group_size=row_group_size)
 
diff --git a/python/pylibcudf/pylibcudf/io/CMakeLists.txt b/python/pylibcudf/pylibcudf/io/CMakeLists.txt
index f78d97ef4d1..664faef718f 100644
--- a/python/pylibcudf/pylibcudf/io/CMakeLists.txt
+++ b/python/pylibcudf/pylibcudf/io/CMakeLists.txt
@@ -12,8 +12,8 @@
 # the License.
 # =============================================================================
 
-set(cython_sources avro.pyx csv.pyx datasource.pyx json.pyx orc.pyx parquet.pyx timezone.pyx
-                   text.pyx types.pyx
+set(cython_sources avro.pyx csv.pyx datasource.pyx json.pyx orc.pyx parquet.pyx
+                   parquet_metadata.pyx text.pyx timezone.pyx types.pyx
 )
 
 set(linked_libraries cudf::cudf)
diff --git a/python/pylibcudf/pylibcudf/io/__init__.pxd b/python/pylibcudf/pylibcudf/io/__init__.pxd
index 6ba7f78a013..663804e714d 100644
--- a/python/pylibcudf/pylibcudf/io/__init__.pxd
+++ b/python/pylibcudf/pylibcudf/io/__init__.pxd
@@ -1,5 +1,15 @@
 # Copyright (c) 2024, NVIDIA CORPORATION.
 
 # CSV is removed since it is def not cpdef (to force kw-only arguments)
-from . cimport avro, datasource, json, orc, parquet, timezone, text, types
+from . cimport (
+    avro,
+    datasource,
+    json,
+    orc,
+    parquet,
+    parquet_metadata,
+    text,
+    timezone,
+    types,
+)
 from .types cimport SourceInfo, TableWithMetadata
diff --git a/python/pylibcudf/pylibcudf/io/__init__.py b/python/pylibcudf/pylibcudf/io/__init__.py
index 0fc77dd0f57..9e8e0f6e080 100644
--- a/python/pylibcudf/pylibcudf/io/__init__.py
+++ b/python/pylibcudf/pylibcudf/io/__init__.py
@@ -1,4 +1,15 @@
 # Copyright (c) 2024, NVIDIA CORPORATION.
 
-from . import avro, csv, datasource, json, orc, parquet, timezone, text, types
+from . import (
+    avro,
+    csv,
+    datasource,
+    json,
+    orc,
+    parquet,
+    parquet_metadata,
+    text,
+    timezone,
+    types,
+)
 from .types import SinkInfo, SourceInfo, TableWithMetadata
diff --git a/python/pylibcudf/pylibcudf/io/parquet_metadata.pxd b/python/pylibcudf/pylibcudf/io/parquet_metadata.pxd
new file mode 100644
index 00000000000..e421a64adc8
--- /dev/null
+++ b/python/pylibcudf/pylibcudf/io/parquet_metadata.pxd
@@ -0,0 +1,51 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+from pylibcudf.io.types cimport SourceInfo
+from pylibcudf.libcudf.io.parquet_metadata cimport(
+    parquet_metadata,
+    parquet_schema,
+    parquet_column_schema,
+)
+
+cdef class ParquetColumnSchema:
+    cdef parquet_column_schema column_schema
+
+    @staticmethod
+    cdef from_column_schema(parquet_column_schema column_schema)
+
+    cpdef str name(self)
+
+    cpdef int num_children(self)
+
+    cpdef ParquetColumnSchema child(self, int idx)
+
+    cpdef list children(self)
+
+
+cdef class ParquetSchema:
+    cdef parquet_schema schema
+
+    @staticmethod
+    cdef from_schema(parquet_schema schema)
+
+    cpdef ParquetColumnSchema root(self)
+
+
+cdef class ParquetMetadata:
+    cdef parquet_metadata meta
+
+    @staticmethod
+    cdef from_metadata(parquet_metadata meta)
+
+    cpdef ParquetSchema schema(self)
+
+    cpdef int num_rows(self)
+
+    cpdef int num_rowgroups(self)
+
+    cpdef dict metadata(self)
+
+    cpdef list rowgroup_metadata(self)
+
+
+cpdef ParquetMetadata read_parquet_metadata(SourceInfo src_info)
diff --git a/python/pylibcudf/pylibcudf/io/parquet_metadata.pyx b/python/pylibcudf/pylibcudf/io/parquet_metadata.pyx
new file mode 100644
index 00000000000..352905ff0f8
--- /dev/null
+++ b/python/pylibcudf/pylibcudf/io/parquet_metadata.pyx
@@ -0,0 +1,207 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+from pylibcudf.io.types cimport SourceInfo
+from pylibcudf.libcudf.io cimport parquet_metadata as cpp_parquet_metadata
+
+
+cdef class ParquetColumnSchema:
+    """
+    Schema of a parquet column, including the nested columns.
+
+    Notes
+    -----
+    Calling the constructor raises ``ValueError``; use ``from_column_schema``.
+    """
+    def __init__(self):
+        raise ValueError("Construct ParquetColumnSchema with from_column_schema.")
+
+    @staticmethod
+    cdef from_column_schema(cpp_parquet_metadata.parquet_column_schema column_schema):
+        cdef ParquetColumnSchema result = ParquetColumnSchema.__new__(
+            ParquetColumnSchema
+        )
+        result.column_schema = column_schema
+        return result
+
+    cpdef str name(self):
+        """
+        Returns parquet column name; can be empty.
+
+        Returns
+        -------
+        str
+            Column name
+        """
+        return self.column_schema.name().decode()
+
+    cpdef int num_children(self):
+        """
+        Returns the number of child columns.
+
+        Returns
+        -------
+        int
+            Children count
+        """
+        return self.column_schema.num_children()
+
+    cpdef ParquetColumnSchema child(self, int idx):
+        """
+        Returns schema of the child with the given index.
+
+        Parameters
+        ----------
+        idx : int
+            Child Index
+
+        Returns
+        -------
+        ParquetColumnSchema
+            Child schema
+        """
+        return ParquetColumnSchema.from_column_schema(self.column_schema.child(idx))
+
+    cpdef list children(self):
+        """
+        Returns schemas of all child columns.
+
+        Returns
+        -------
+        list[ParquetColumnSchema]
+            Child schemas.
+        """
+        cdef cpp_parquet_metadata.parquet_column_schema child
+        return [
+            ParquetColumnSchema.from_column_schema(child)
+            for child in self.column_schema.children()
+        ]
+
+
+cdef class ParquetSchema:
+    """
+    Schema of a parquet file.
+
+    Notes
+    -----
+    Calling the constructor raises ``ValueError``; use ``from_schema``.
+    """
+
+    def __init__(self):
+        raise ValueError("Construct ParquetSchema with from_schema.")
+
+    @staticmethod
+    cdef from_schema(cpp_parquet_metadata.parquet_schema schema):
+        cdef ParquetSchema result = ParquetSchema.__new__(ParquetSchema)
+        result.schema = schema
+        return result
+
+    cpdef ParquetColumnSchema root(self):
+        """
+        Returns the schema of the struct column that contains all columns as fields.
+
+        Returns
+        -------
+        ParquetColumnSchema
+            Root column schema
+        """
+        return ParquetColumnSchema.from_column_schema(self.schema.root())
+
+
+cdef class ParquetMetadata:
+    """
+    Information about content of a parquet file.
+
+    Notes
+    -----
+    Calling the constructor raises ``ValueError``; use ``from_metadata``.
+    """
+
+    def __init__(self):
+        raise ValueError("Construct ParquetMetadata with from_metadata.")
+
+    @staticmethod
+    cdef from_metadata(cpp_parquet_metadata.parquet_metadata meta):
+        cdef ParquetMetadata result = ParquetMetadata.__new__(ParquetMetadata)
+        result.meta = meta
+        return result
+
+    cpdef ParquetSchema schema(self):
+        """
+        Returns the parquet schema.
+
+        Returns
+        -------
+        ParquetSchema
+            Parquet schema
+        """
+        return ParquetSchema.from_schema(self.meta.schema())
+
+    cpdef int num_rows(self):
+        """
+        Returns the number of rows of the root column.
+
+        Returns
+        -------
+        int
+            Number of rows
+        """
+        return self.meta.num_rows()
+
+    cpdef int num_rowgroups(self):
+        """
+        Returns the number of rowgroups in the file.
+
+        Returns
+        -------
+        int
+            Number of row groups.
+        """
+        return self.meta.num_rowgroups()
+
+    cpdef dict metadata(self):
+        """
+        Returns the key-value metadata in the file footer.
+
+        Returns
+        -------
+        dict[str, str]
+            Key value metadata as a map (keys and values decoded to str).
+        """
+        return {key.decode(): val.decode() for key, val in self.meta.metadata()}
+
+    cpdef list rowgroup_metadata(self):
+        """
+        Returns the row group metadata in the file footer.
+
+        Returns
+        -------
+        list[dict[str, int]]
+            List of row group metadata as maps (keys decoded to str).
+        """
+        return [
+            {key.decode(): val for key, val in metadata}
+            for metadata in self.meta.rowgroup_metadata()
+        ]
+
+
+cpdef ParquetMetadata read_parquet_metadata(SourceInfo src_info):
+    """
+    Reads metadata of parquet dataset.
+
+    Parameters
+    ----------
+    src_info : SourceInfo
+        Dataset source.
+
+    Returns
+    -------
+    ParquetMetadata
+        Parquet_metadata with parquet schema, number of rows,
+        number of row groups and key-value metadata.
+    """
+    cdef cpp_parquet_metadata.parquet_metadata c_result
+
+    with nogil:
+        c_result = cpp_parquet_metadata.read_parquet_metadata(src_info.c_obj)
+
+    return ParquetMetadata.from_metadata(c_result)
diff --git a/python/pylibcudf/pylibcudf/libcudf/io/parquet_metadata.pxd b/python/pylibcudf/pylibcudf/libcudf/io/parquet_metadata.pxd
index 8e6da56c9a6..b0ce13e4492 100644
--- a/python/pylibcudf/pylibcudf/libcudf/io/parquet_metadata.pxd
+++ b/python/pylibcudf/pylibcudf/libcudf/io/parquet_metadata.pxd
@@ -1,11 +1,11 @@
 # Copyright (c) 2024, NVIDIA CORPORATION.
 
-cimport pylibcudf.libcudf.io.types as cudf_io_types
 from libc.stdint cimport int64_t
 from libcpp.string cimport string
 from libcpp.unordered_map cimport unordered_map
 from libcpp.vector cimport vector
 from pylibcudf.libcudf.types cimport size_type
+from pylibcudf.libcudf.io.types cimport source_info
 
 
 cdef extern from "cudf/io/parquet_metadata.hpp" namespace "cudf::io" nogil:
@@ -28,4 +28,4 @@ cdef extern from "cudf/io/parquet_metadata.hpp" namespace "cudf::io" nogil:
         unordered_map[string, string] metadata() except+
         vector[unordered_map[string, int64_t]] rowgroup_metadata() except+
 
-    cdef parquet_metadata read_parquet_metadata(cudf_io_types.source_info src) except+
+    cdef parquet_metadata read_parquet_metadata(source_info src_info) except+

From db69c52d9140d909aeb4af3a5b3db1e7c44c92bc Mon Sep 17 00:00:00 2001
From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
Date: Fri, 8 Nov 2024 14:46:27 -0800
Subject: [PATCH 04/19] Follow up making Python tests more deterministic
 (#17272)

Addressing comments in https://github.com/rapidsai/cudf/pull/17008/files#r1823318321 and https://github.com/rapidsai/cudf/pull/17008/files#r1823318898

Didn't touch the `_fuzz_testing` directory because maybe we don't want that to be deterministic?

Authors:
  - Matthew Roeschke (https://github.com/mroeschke)

Approvers:
  - Matthew Murray (https://github.com/Matt711)
  - GALI PREM SAGAR (https://github.com/galipremsagar)
  - James Lamb (https://github.com/jameslamb)
  - Vyas Ramasubramani (https://github.com/vyasr)

URL: https://github.com/rapidsai/cudf/pull/17272
---
 .pre-commit-config.yaml                         |  4 ++--
 python/cudf/cudf/tests/test_parquet.py          | 11 +++--------
 .../dask_cudf/tests/test_reductions.py          | 17 +----------------
 python/dask_cudf/dask_cudf/tests/utils.py       |  2 +-
 4 files changed, 7 insertions(+), 27 deletions(-)

diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index f5234f58efe..6d070a8a14c 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -90,8 +90,8 @@ repos:
         entry: |
           # Check for usage of default_rng without seeding
           default_rng\(\)|
-          # Check for usage of np.random.seed
-          np.random.seed\(
+          # Check for usage of np.random.seed (NPY002 only disallows this being called)
+          np.random.seed
         language: pygrep
         types: [python]
       - id: cmake-format
diff --git a/python/cudf/cudf/tests/test_parquet.py b/python/cudf/cudf/tests/test_parquet.py
index 3c4398a87de..96512dacb69 100644
--- a/python/cudf/cudf/tests/test_parquet.py
+++ b/python/cudf/cudf/tests/test_parquet.py
@@ -193,11 +193,6 @@ def parquet_file(request, tmp_path_factory, pdf):
     return fname
 
 
-@pytest.fixture(scope="module")
-def rdg_seed():
-    return int(os.environ.get("TEST_CUDF_RDG_SEED", "42"))
-
-
 def make_pdf(nrows, ncolumns=1, nvalids=0, dtype=np.int64):
     test_pdf = pd.DataFrame(
         [list(range(ncolumns * i, ncolumns * (i + 1))) for i in range(nrows)],
@@ -431,7 +426,7 @@ def num_row_groups(rows, group_size):
         assert a == b
 
 
-def test_parquet_read_filtered(tmpdir, rdg_seed):
+def test_parquet_read_filtered(tmpdir):
     # Generate data
     fname = tmpdir.join("filtered.parquet")
     dg.generate(
@@ -455,13 +450,13 @@ def test_parquet_read_filtered(tmpdir, rdg_seed):
                 dg.ColumnParameters(
                     40,
                     0.2,
-                    lambda: np.random.default_rng(seed=None).integers(
+                    lambda: np.random.default_rng(seed=0).integers(
                         0, 100, size=40
                     ),
                     True,
                 ),
             ],
-            seed=rdg_seed,
+            seed=42,
         ),
         format={"name": "parquet", "row_group_size": 64},
     )
diff --git a/python/dask_cudf/dask_cudf/tests/test_reductions.py b/python/dask_cudf/dask_cudf/tests/test_reductions.py
index 4351b672151..f11a5252080 100644
--- a/python/dask_cudf/dask_cudf/tests/test_reductions.py
+++ b/python/dask_cudf/dask_cudf/tests/test_reductions.py
@@ -1,7 +1,5 @@
 # Copyright (c) 2021-2024, NVIDIA CORPORATION.
 
-import numpy as np
-import pandas as pd
 import pytest
 
 import dask
@@ -10,20 +8,7 @@
 import cudf
 
 import dask_cudf
-
-
-def _make_random_frame(nelem, npartitions=2):
-    rng = np.random.default_rng(seed=0)
-    df = pd.DataFrame(
-        {
-            "x": rng.integers(0, 5, size=nelem),
-            "y": rng.normal(loc=1.0, scale=1.0, size=nelem),
-        }
-    )
-    gdf = cudf.DataFrame.from_pandas(df)
-    dgf = dask_cudf.from_cudf(gdf, npartitions=npartitions)
-    return df, dgf
-
+from dask_cudf.tests.utils import _make_random_frame
 
 _reducers = ["sum", "count", "mean", "var", "std", "min", "max"]
 
diff --git a/python/dask_cudf/dask_cudf/tests/utils.py b/python/dask_cudf/dask_cudf/tests/utils.py
index a9f61f75762..b44b3f939e7 100644
--- a/python/dask_cudf/dask_cudf/tests/utils.py
+++ b/python/dask_cudf/dask_cudf/tests/utils.py
@@ -19,7 +19,7 @@
 
 
 def _make_random_frame(nelem, npartitions=2, include_na=False):
-    rng = np.random.default_rng(seed=None)
+    rng = np.random.default_rng(seed=0)
     df = pd.DataFrame(
         {"x": rng.random(size=nelem), "y": rng.random(size=nelem)}
     )

From 0fc5fab825ece5b605d84a3d5ef04d7dde31b39f Mon Sep 17 00:00:00 2001
From: Graham Markall <535640+gmarkall@users.noreply.github.com>
Date: Sat, 9 Nov 2024 00:01:26 +0000
Subject: [PATCH 05/19] Use numba-cuda<0.0.18 (#17280)

Numba-cuda 0.0.18 (not yet released) contains some changes that might break pynvjitlink patching. To avoid breaking RAPIDS CI while that is worked through (after numba-cuda 0.0.18 is released but before the next pynvjitlink release), this PR requires numba-cuda 0.0.17 or earlier.

Authors:
  - Graham Markall (https://github.com/gmarkall)
  - Vyas Ramasubramani (https://github.com/vyasr)

Approvers:
  - https://github.com/brandon-b-miller
  - Vyas Ramasubramani (https://github.com/vyasr)

URL: https://github.com/rapidsai/cudf/pull/17280
---
 conda/environments/all_cuda-118_arch-x86_64.yaml | 2 +-
 conda/environments/all_cuda-125_arch-x86_64.yaml | 2 +-
 conda/recipes/cudf/meta.yaml                     | 2 +-
 dependencies.yaml                                | 2 +-
 python/cudf/pyproject.toml                       | 2 +-
 python/dask_cudf/pyproject.toml                  | 2 +-
 6 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/conda/environments/all_cuda-118_arch-x86_64.yaml b/conda/environments/all_cuda-118_arch-x86_64.yaml
index 6fbdd4ba568..01764411346 100644
--- a/conda/environments/all_cuda-118_arch-x86_64.yaml
+++ b/conda/environments/all_cuda-118_arch-x86_64.yaml
@@ -55,7 +55,7 @@ dependencies:
 - nbsphinx
 - ninja
 - notebook
-- numba-cuda>=0.0.13
+- numba-cuda>=0.0.13,<0.0.18
 - numpy>=1.23,<3.0a0
 - numpydoc
 - nvcc_linux-64=11.8
diff --git a/conda/environments/all_cuda-125_arch-x86_64.yaml b/conda/environments/all_cuda-125_arch-x86_64.yaml
index 4aafa12fdae..9074e6332d9 100644
--- a/conda/environments/all_cuda-125_arch-x86_64.yaml
+++ b/conda/environments/all_cuda-125_arch-x86_64.yaml
@@ -54,7 +54,7 @@ dependencies:
 - nbsphinx
 - ninja
 - notebook
-- numba-cuda>=0.0.13
+- numba-cuda>=0.0.13,<0.0.18
 - numpy>=1.23,<3.0a0
 - numpydoc
 - nvcomp==4.1.0.6
diff --git a/conda/recipes/cudf/meta.yaml b/conda/recipes/cudf/meta.yaml
index 2aafcae072d..04904e95630 100644
--- a/conda/recipes/cudf/meta.yaml
+++ b/conda/recipes/cudf/meta.yaml
@@ -80,7 +80,7 @@ requirements:
     - typing_extensions >=4.0.0
     - pandas >=2.0,<2.2.4dev0
     - cupy >=12.0.0
-    - numba-cuda >=0.0.13
+    - numba-cuda >=0.0.13,<0.0.18
     - numpy >=1.23,<3.0a0
     - pyarrow>=14.0.0,<18.0.0a0
     - libcudf ={{ version }}
diff --git a/dependencies.yaml b/dependencies.yaml
index 59f8f2fda49..e47e0c7523c 100644
--- a/dependencies.yaml
+++ b/dependencies.yaml
@@ -675,7 +675,7 @@ dependencies:
       - output_types: [conda, requirements, pyproject]
         packages:
           - cachetools
-          - &numba-cuda-dep numba-cuda>=0.0.13
+          - &numba-cuda-dep numba-cuda>=0.0.13,<0.0.18
           - nvtx>=0.2.1
           - packaging
           - rich
diff --git a/python/cudf/pyproject.toml b/python/cudf/pyproject.toml
index 41dedc4ff20..ca6dbddfecc 100644
--- a/python/cudf/pyproject.toml
+++ b/python/cudf/pyproject.toml
@@ -24,7 +24,7 @@ dependencies = [
     "cupy-cuda11x>=12.0.0",
     "fsspec>=0.6.0",
     "libcudf==24.12.*,>=0.0.0a0",
-    "numba-cuda>=0.0.13",
+    "numba-cuda>=0.0.13,<0.0.18",
     "numpy>=1.23,<3.0a0",
     "nvtx>=0.2.1",
     "packaging",
diff --git a/python/dask_cudf/pyproject.toml b/python/dask_cudf/pyproject.toml
index c7e4cbc45ea..c4bfc3054bc 100644
--- a/python/dask_cudf/pyproject.toml
+++ b/python/dask_cudf/pyproject.toml
@@ -46,7 +46,7 @@ cudf = "dask_cudf.backends:CudfDXBackendEntrypoint"
 [project.optional-dependencies]
 test = [
     "dask-cuda==24.12.*,>=0.0.0a0",
-    "numba-cuda>=0.0.13",
+    "numba-cuda>=0.0.13,<0.0.18",
     "pytest-cov",
     "pytest-xdist",
     "pytest<8",

From e399e9596d9fe1cf2df0ff1270e2c0c764331b8e Mon Sep 17 00:00:00 2001
From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
Date: Fri, 8 Nov 2024 16:23:25 -0800
Subject: [PATCH 06/19] Use pylibcudf enums in cudf Python quantile (#17287)

Shouldn't need to use the "private" `pylibcudf.libcudf` types anymore now that the Python side enums are exposed

Authors:
  - Matthew Roeschke (https://github.com/mroeschke)

Approvers:
  - Matthew Murray (https://github.com/Matt711)

URL: https://github.com/rapidsai/cudf/pull/17287
---
 python/cudf/cudf/_lib/quantiles.pyx | 28 +++---------------
 python/cudf/cudf/_lib/types.pxd     |  5 ----
 python/cudf/cudf/_lib/types.pyx     | 44 -----------------------------
 python/cudf/cudf/core/frame.py      | 12 ++++----
 4 files changed, 10 insertions(+), 79 deletions(-)

diff --git a/python/cudf/cudf/_lib/quantiles.pyx b/python/cudf/cudf/_lib/quantiles.pyx
index 7666b7ff8da..509cfe5e9f8 100644
--- a/python/cudf/cudf/_lib/quantiles.pyx
+++ b/python/cudf/cudf/_lib/quantiles.pyx
@@ -6,14 +6,6 @@ from libcpp cimport bool
 from libcpp.vector cimport vector
 
 from cudf._lib.column cimport Column
-from cudf._lib.types cimport (
-    underlying_type_t_interpolation,
-    underlying_type_t_sorted,
-)
-
-from cudf._lib.types import Interpolation
-
-from pylibcudf.libcudf.types cimport interpolation, sorted
 
 from cudf._lib.utils cimport columns_from_pylibcudf_table
 
@@ -28,17 +20,13 @@ def quantile(
     Column ordered_indices,
     bool exact,
 ):
-    cdef interpolation c_interp = <interpolation>(
-        <underlying_type_t_interpolation> Interpolation[interp.upper()]
-    )
-
     return Column.from_pylibcudf(
         plc.quantiles.quantile(
             input.to_pylibcudf(mode="read"),
             q,
-            c_interp,
+            plc.types.Interpolation[interp.upper()],
             ordered_indices.to_pylibcudf(mode="read"),
-            <bool>exact
+            exact
         )
     )
 
@@ -51,22 +39,14 @@ def quantile_table(
     list column_order,
     list null_precedence,
 ):
-
-    cdef interpolation c_interp = <interpolation>(
-        <underlying_type_t_interpolation> interp
-    )
-    cdef sorted c_is_input_sorted = <sorted>(
-        <underlying_type_t_sorted> is_input_sorted
-    )
-
     return columns_from_pylibcudf_table(
         plc.quantiles.quantiles(
             plc.Table([
                 c.to_pylibcudf(mode="read") for c in source_columns
             ]),
             q,
-            c_interp,
-            c_is_input_sorted,
+            interp,
+            is_input_sorted,
             column_order,
             null_precedence
         )
diff --git a/python/cudf/cudf/_lib/types.pxd b/python/cudf/cudf/_lib/types.pxd
index 4fd3d31841e..c2b760490c1 100644
--- a/python/cudf/cudf/_lib/types.pxd
+++ b/python/cudf/cudf/_lib/types.pxd
@@ -7,12 +7,7 @@ cimport pylibcudf.libcudf.types as libcudf_types
 from pylibcudf.libcudf.column.column_view cimport column_view
 from pylibcudf.libcudf.lists.lists_column_view cimport lists_column_view
 
-ctypedef bool underlying_type_t_order
-ctypedef bool underlying_type_t_null_order
-ctypedef bool underlying_type_t_sorted
-ctypedef int32_t underlying_type_t_interpolation
 ctypedef int32_t underlying_type_t_type_id
-ctypedef bool underlying_type_t_null_policy
 
 cdef dtype_from_column_view(column_view cv)
 
diff --git a/python/cudf/cudf/_lib/types.pyx b/python/cudf/cudf/_lib/types.pyx
index 861bb063707..f169ea12b10 100644
--- a/python/cudf/cudf/_lib/types.pyx
+++ b/python/cudf/cudf/_lib/types.pyx
@@ -11,12 +11,6 @@ cimport pylibcudf.libcudf.types as libcudf_types
 from pylibcudf.libcudf.column.column_view cimport column_view
 from pylibcudf.libcudf.lists.lists_column_view cimport lists_column_view
 
-from cudf._lib.types cimport (
-    underlying_type_t_interpolation,
-    underlying_type_t_order,
-    underlying_type_t_sorted,
-)
-
 import pylibcudf
 
 import cudf
@@ -151,44 +145,6 @@ datetime_unit_map = {
 size_type_dtype = LIBCUDF_TO_SUPPORTED_NUMPY_TYPES[pylibcudf.types.SIZE_TYPE_ID]
 
 
-class Interpolation(IntEnum):
-    LINEAR = (
-        <underlying_type_t_interpolation> libcudf_types.interpolation.LINEAR
-    )
-    LOWER = (
-        <underlying_type_t_interpolation> libcudf_types.interpolation.LOWER
-    )
-    HIGHER = (
-        <underlying_type_t_interpolation> libcudf_types.interpolation.HIGHER
-    )
-    MIDPOINT = (
-        <underlying_type_t_interpolation> libcudf_types.interpolation.MIDPOINT
-    )
-    NEAREST = (
-        <underlying_type_t_interpolation> libcudf_types.interpolation.NEAREST
-    )
-
-
-class Order(IntEnum):
-    ASCENDING = <underlying_type_t_order> libcudf_types.order.ASCENDING
-    DESCENDING = <underlying_type_t_order> libcudf_types.order.DESCENDING
-
-
-class Sorted(IntEnum):
-    YES = <underlying_type_t_sorted> libcudf_types.sorted.YES
-    NO = <underlying_type_t_sorted> libcudf_types.sorted.NO
-
-
-class NullOrder(IntEnum):
-    BEFORE = <underlying_type_t_order> libcudf_types.null_order.BEFORE
-    AFTER = <underlying_type_t_order> libcudf_types.null_order.AFTER
-
-
-class NullHandling(IntEnum):
-    INCLUDE = <underlying_type_t_null_policy> libcudf_types.null_policy.INCLUDE
-    EXCLUDE = <underlying_type_t_null_policy> libcudf_types.null_policy.EXCLUDE
-
-
 cdef dtype_from_lists_column_view(column_view cv):
     # lists_column_view have no default constructor, so we heap
     # allocate it to get around Cython's limitation of requiring
diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py
index 205edd91d9d..2b4a17f9559 100644
--- a/python/cudf/cudf/core/frame.py
+++ b/python/cudf/cudf/core/frame.py
@@ -16,6 +16,8 @@
 import pyarrow as pa
 from typing_extensions import Self
 
+import pylibcudf as plc
+
 import cudf
 from cudf import _lib as libcudf
 from cudf.api.types import is_dtype_equal, is_scalar
@@ -789,15 +791,13 @@ def _quantile_table(
         column_order=(),
         null_precedence=(),
     ):
-        interpolation = libcudf.types.Interpolation[interpolation]
+        interpolation = plc.types.Interpolation[interpolation]
 
-        is_sorted = libcudf.types.Sorted["YES" if is_sorted else "NO"]
+        is_sorted = plc.types.Sorted["YES" if is_sorted else "NO"]
 
-        column_order = [libcudf.types.Order[key] for key in column_order]
+        column_order = [plc.types.Order[key] for key in column_order]
 
-        null_precedence = [
-            libcudf.types.NullOrder[key] for key in null_precedence
-        ]
+        null_precedence = [plc.types.NullOrder[key] for key in null_precedence]
 
         return self._from_columns_like_self(
             libcudf.quantiles.quantile_table(

From 7a499f645c040c300e466721a39be65e3e1b054e Mon Sep 17 00:00:00 2001
From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
Date: Fri, 8 Nov 2024 17:38:47 -0800
Subject: [PATCH 07/19] Use more pylibcudf Python enums in cudf._lib (#17288)

Similar to https://github.com/rapidsai/cudf/pull/17287. Also renames a comprehension variable that shadowed the `plc` name.

Authors:
  - Matthew Roeschke (https://github.com/mroeschke)

Approvers:
  - Matthew Murray (https://github.com/Matt711)

URL: https://github.com/rapidsai/cudf/pull/17288
---
 python/cudf/cudf/_lib/groupby.pyx | 7 ++-----
 python/cudf/cudf/_lib/json.pyx    | 2 +-
 python/cudf/cudf/_lib/lists.pyx   | 8 ++++++--
 3 files changed, 9 insertions(+), 8 deletions(-)

diff --git a/python/cudf/cudf/_lib/groupby.pyx b/python/cudf/cudf/_lib/groupby.pyx
index c199ed96d4f..1ce6dfab15e 100644
--- a/python/cudf/cudf/_lib/groupby.pyx
+++ b/python/cudf/cudf/_lib/groupby.pyx
@@ -18,7 +18,6 @@ from cudf._lib.utils cimport columns_from_pylibcudf_table
 
 from cudf._lib.scalar import as_device_scalar
 
-from pylibcudf.libcudf.replace cimport replace_policy
 from pylibcudf.libcudf.scalar.scalar cimport scalar
 
 import pylibcudf
@@ -244,13 +243,11 @@ cdef class GroupBy:
         return columns_from_pylibcudf_table(shifts), columns_from_pylibcudf_table(keys)
 
     def replace_nulls(self, list values, object method):
-        # TODO: This is using an enum (replace_policy) that has not been exposed in
-        # pylibcudf yet. We'll want to fix that import once it is in pylibcudf.
         _, replaced = self._groupby.replace_nulls(
             pylibcudf.table.Table([c.to_pylibcudf(mode="read") for c in values]),
             [
-                replace_policy.PRECEDING
-                if method == 'ffill' else replace_policy.FOLLOWING
+                pylibcudf.replace.ReplacePolicy.PRECEDING
+                if method == 'ffill' else pylibcudf.replace.ReplacePolicy.FOLLOWING
             ] * len(values),
         )
 
diff --git a/python/cudf/cudf/_lib/json.pyx b/python/cudf/cudf/_lib/json.pyx
index fb149603960..7dc9cd01a00 100644
--- a/python/cudf/cudf/_lib/json.pyx
+++ b/python/cudf/cudf/_lib/json.pyx
@@ -104,7 +104,7 @@ cpdef read_json(object filepaths_or_buffers,
         )
         df = cudf.DataFrame._from_data(
             *_data_from_columns(
-                columns=[Column.from_pylibcudf(plc) for plc in res_cols],
+                columns=[Column.from_pylibcudf(col) for col in res_cols],
                 column_names=res_col_names,
                 index_names=None
                )
diff --git a/python/cudf/cudf/_lib/lists.pyx b/python/cudf/cudf/_lib/lists.pyx
index 12432ac6d5d..a91d44274e5 100644
--- a/python/cudf/cudf/_lib/lists.pyx
+++ b/python/cudf/cudf/_lib/lists.pyx
@@ -4,7 +4,7 @@ from cudf.core.buffer import acquire_spill_lock
 
 from libcpp cimport bool
 
-from pylibcudf.libcudf.types cimport null_order, size_type
+from pylibcudf.libcudf.types cimport size_type
 
 from cudf._lib.column cimport Column
 from cudf._lib.utils cimport columns_from_pylibcudf_table
@@ -49,7 +49,11 @@ def sort_lists(Column col, bool ascending, str na_position):
         plc.lists.sort_lists(
             col.to_pylibcudf(mode="read"),
             ascending,
-            null_order.BEFORE if na_position == "first" else null_order.AFTER,
+            (
+                plc.types.NullOrder.BEFORE
+                if na_position == "first"
+                else plc.types.NullOrder.AFTER
+            ),
             False,
         )
     )

From 5cbdcd07a71fd63813840fdf270d7aec62f1c844 Mon Sep 17 00:00:00 2001
From: Shruti Shivakumar <shruti.shivakumar@gmail.com>
Date: Fri, 8 Nov 2024 21:53:45 -0500
Subject: [PATCH 08/19] Expose delimiter character in JSON reader options to
 JSON reader APIs (#17266)

Fixes #17261
Removes the delimiter symbol group from the whitespace normalization FST, since that FST now runs after tokenization.

Authors:
  - Shruti Shivakumar (https://github.com/shrshi)
  - Nghia Truong (https://github.com/ttnghia)
  - Karthikeyan (https://github.com/karthikeyann)

Approvers:
  - Nghia Truong (https://github.com/ttnghia)
  - David Wendt (https://github.com/davidwendt)
  - Karthikeyan (https://github.com/karthikeyann)

URL: https://github.com/rapidsai/cudf/pull/17266
---
 cpp/include/cudf/io/detail/json.hpp           |  8 +--
 cpp/src/io/json/json_normalization.cu         | 49 ++++++++++---------
 cpp/src/io/json/read_json.cu                  |  3 +-
 .../io/json/json_quote_normalization_test.cpp | 21 ++++++--
 4 files changed, 49 insertions(+), 32 deletions(-)

diff --git a/cpp/include/cudf/io/detail/json.hpp b/cpp/include/cudf/io/detail/json.hpp
index 940d03cdb41..2e2ac43d6fe 100644
--- a/cpp/include/cudf/io/detail/json.hpp
+++ b/cpp/include/cudf/io/detail/json.hpp
@@ -57,11 +57,13 @@ void write_json(data_sink* sink,
 /**
  * @brief Normalize single quotes to double quotes using FST
  *
- * @param indata Input device buffer
- * @param stream CUDA stream used for device memory operations and kernel launches
- * @param mr Device memory resource to use for device memory allocation
+ * @param indata    Input device buffer
+ * @param delimiter Line-separating delimiter character in JSONL inputs
+ * @param stream    CUDA stream used for device memory operations and kernel launches
+ * @param mr        Device memory resource to use for device memory allocation
  */
 void normalize_single_quotes(datasource::owning_buffer<rmm::device_buffer>& indata,
+                             char delimiter,
                              rmm::cuda_stream_view stream,
                              rmm::device_async_resource_ref mr);
 
diff --git a/cpp/src/io/json/json_normalization.cu b/cpp/src/io/json/json_normalization.cu
index 34a87918e57..1b61be20202 100644
--- a/cpp/src/io/json/json_normalization.cu
+++ b/cpp/src/io/json/json_normalization.cu
@@ -58,7 +58,7 @@ enum class dfa_symbol_group_id : uint32_t {
   DOUBLE_QUOTE_CHAR,  ///< Quote character SG: "
   SINGLE_QUOTE_CHAR,  ///< Quote character SG: '
   ESCAPE_CHAR,        ///< Escape character SG: '\'
-  NEWLINE_CHAR,       ///< Newline character SG: '\n'
+  DELIM_CHAR,         ///< Delimiter character SG
   OTHER_SYMBOLS,      ///< SG implicitly matching all other characters
   NUM_SYMBOL_GROUPS   ///< Total number of symbol groups
 };
@@ -72,13 +72,17 @@ constexpr auto TT_SEC            = dfa_states::TT_SEC;
 constexpr auto TT_NUM_STATES     = static_cast<StateT>(dfa_states::TT_NUM_STATES);
 constexpr auto NUM_SYMBOL_GROUPS = static_cast<uint32_t>(dfa_symbol_group_id::NUM_SYMBOL_GROUPS);
 
-// The i-th string representing all the characters of a symbol group
-std::array<std::vector<SymbolT>, NUM_SYMBOL_GROUPS - 1> const qna_sgs{
-  {{'\"'}, {'\''}, {'\\'}, {'\n'}}};
+auto get_sgid_lut(SymbolT delim)
+{
+  // The i-th string representing all the characters of a symbol group
+  std::array<std::vector<SymbolT>, NUM_SYMBOL_GROUPS - 1> symbol_groups{
+    {{'\"'}, {'\''}, {'\\'}, {delim}}};
+  return symbol_groups;
+}
 
 // Transition table
 std::array<std::array<dfa_states, NUM_SYMBOL_GROUPS>, TT_NUM_STATES> const qna_state_tt{{
-  /* IN_STATE      "       '       \       \n    OTHER  */
+  /* IN_STATE      "       '       \     <delim>    OTHER  */
   /* TT_OOS */ {{TT_DQS, TT_SQS, TT_OOS, TT_OOS, TT_OOS}},
   /* TT_DQS */ {{TT_OOS, TT_DQS, TT_DEC, TT_OOS, TT_DQS}},
   /* TT_SQS */ {{TT_SQS, TT_OOS, TT_SEC, TT_OOS, TT_SQS}},
@@ -199,28 +203,26 @@ struct TransduceToNormalizedQuotes {
 
 namespace normalize_whitespace {
 
+// We do not need a symbol group for the delimiter character since whitespace normalization
+// now occurs after tokenization.
 enum class dfa_symbol_group_id : uint32_t {
   DOUBLE_QUOTE_CHAR,   ///< Quote character SG: "
   ESCAPE_CHAR,         ///< Escape character SG: '\\'
-  NEWLINE_CHAR,        ///< Newline character SG: '\n'
   WHITESPACE_SYMBOLS,  ///< Whitespace characters SG: '\t' or ' '
   OTHER_SYMBOLS,       ///< SG implicitly matching all other characters
   NUM_SYMBOL_GROUPS    ///< Total number of symbol groups
 };
 // Alias for readability of symbol group ids
 constexpr auto NUM_SYMBOL_GROUPS = static_cast<uint32_t>(dfa_symbol_group_id::NUM_SYMBOL_GROUPS);
-// The i-th string representing all the characters of a symbol group
-std::array<std::vector<SymbolT>, NUM_SYMBOL_GROUPS - 1> const wna_sgs{
-  {{'"'}, {'\\'}, {'\n'}, {' ', '\t'}}};
+
+std::array<std::vector<SymbolT>, NUM_SYMBOL_GROUPS - 1> const wna_sgs{{{'"'}, {'\\'}, {' ', '\t'}}};
 
 /**
  * -------- FST states ---------
  * -----------------------------
  * TT_OOS | Out-of-string state handling whitespace and non-whitespace chars outside double
- *        |   quotes as well as any other character not enclosed by a string. Also handles
- *        |   newline character present within a string
- * TT_DQS | Double-quoted string state handling all characters within double quotes except
- *        |   newline character
+ *        |   quotes as well as any other character not enclosed by a string.
+ * TT_DQS | Double-quoted string state handling all characters within double quotes
  * TT_DEC | State handling escaped characters inside double-quoted string. Note that this
  *        |   state is necessary to process escaped double-quote characters. Without this
  *        |   state, whitespaces following escaped double quotes inside strings may be removed.
@@ -235,10 +237,10 @@ constexpr auto TT_NUM_STATES = static_cast<StateT>(dfa_states::TT_NUM_STATES);
 
 // Transition table
 std::array<std::array<dfa_states, NUM_SYMBOL_GROUPS>, TT_NUM_STATES> const wna_state_tt{
-  {/* IN_STATE      "       \       \n    <SPC>   OTHER  */
-   /* TT_OOS */ {{TT_DQS, TT_OOS, TT_OOS, TT_OOS, TT_OOS}},
-   /* TT_DQS */ {{TT_OOS, TT_DEC, TT_OOS, TT_DQS, TT_DQS}},
-   /* TT_DEC */ {{TT_DQS, TT_DQS, TT_DQS, TT_DQS, TT_DQS}}}};
+  {/* IN_STATE      "       \     <SPC>   OTHER  */
+   /* TT_OOS */ {{TT_DQS, TT_OOS, TT_OOS, TT_OOS}},
+   /* TT_DQS */ {{TT_OOS, TT_DEC, TT_DQS, TT_DQS}},
+   /* TT_DEC */ {{TT_DQS, TT_DQS, TT_DQS, TT_DQS}}}};
 
 // The DFA's starting state
 constexpr StateT start_state = static_cast<StateT>(TT_OOS);
@@ -302,18 +304,19 @@ struct TransduceToNormalizedWS {
 namespace detail {
 
 void normalize_single_quotes(datasource::owning_buffer<rmm::device_buffer>& indata,
+                             char delimiter,
                              rmm::cuda_stream_view stream,
                              rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
   static constexpr std::int32_t min_out = 0;
   static constexpr std::int32_t max_out = 2;
-  auto parser =
-    fst::detail::make_fst(fst::detail::make_symbol_group_lut(normalize_quotes::qna_sgs),
-                          fst::detail::make_transition_table(normalize_quotes::qna_state_tt),
-                          fst::detail::make_translation_functor<SymbolT, min_out, max_out>(
-                            normalize_quotes::TransduceToNormalizedQuotes{}),
-                          stream);
+  auto parser                           = fst::detail::make_fst(
+    fst::detail::make_symbol_group_lut(normalize_quotes::get_sgid_lut(delimiter)),
+    fst::detail::make_transition_table(normalize_quotes::qna_state_tt),
+    fst::detail::make_translation_functor<SymbolT, min_out, max_out>(
+      normalize_quotes::TransduceToNormalizedQuotes{}),
+    stream);
 
   rmm::device_buffer outbuf(indata.size() * 2, stream, mr);
   cudf::detail::device_scalar<SymbolOffsetT> outbuf_size(stream, mr);
diff --git a/cpp/src/io/json/read_json.cu b/cpp/src/io/json/read_json.cu
index 2bc15ea19cb..279f5e71351 100644
--- a/cpp/src/io/json/read_json.cu
+++ b/cpp/src/io/json/read_json.cu
@@ -248,7 +248,8 @@ table_with_metadata read_batch(host_span<std::unique_ptr<datasource>> sources,
   // If input JSON buffer has single quotes and option to normalize single quotes is enabled,
   // invoke pre-processing FST
   if (reader_opts.is_enabled_normalize_single_quotes()) {
-    normalize_single_quotes(bufview, stream, cudf::get_current_device_resource_ref());
+    normalize_single_quotes(
+      bufview, reader_opts.get_delimiter(), stream, cudf::get_current_device_resource_ref());
   }
 
   auto buffer =
diff --git a/cpp/tests/io/json/json_quote_normalization_test.cpp b/cpp/tests/io/json/json_quote_normalization_test.cpp
index c8c2d18903f..0fbd7da7f4d 100644
--- a/cpp/tests/io/json/json_quote_normalization_test.cpp
+++ b/cpp/tests/io/json/json_quote_normalization_test.cpp
@@ -34,7 +34,9 @@
 // Base test fixture for tests
 struct JsonNormalizationTest : public cudf::test::BaseFixture {};
 
-void run_test(std::string const& host_input, std::string const& expected_host_output)
+void run_test(std::string const& host_input,
+              std::string const& expected_host_output,
+              char delimiter = '\n')
 {
   // RMM memory resource
   std::shared_ptr<rmm::mr::device_memory_resource> rsc =
@@ -46,7 +48,7 @@ void run_test(std::string const& host_input, std::string const& expected_host_ou
 
   // Preprocessing FST
   cudf::io::datasource::owning_buffer<rmm::device_buffer> device_data(std::move(device_input));
-  cudf::io::json::detail::normalize_single_quotes(device_data, stream_view, rsc.get());
+  cudf::io::json::detail::normalize_single_quotes(device_data, delimiter, stream_view, rsc.get());
 
   std::string preprocessed_host_output(device_data.size(), 0);
   CUDF_CUDA_TRY(cudaMemcpyAsync(preprocessed_host_output.data(),
@@ -172,6 +174,13 @@ TEST_F(JsonNormalizationTest, GroundTruth_QuoteNormalization_Invalid_WrongBraces
   run_test(input, output);
 }
 
+TEST_F(JsonNormalizationTest, GroundTruth_QuoteNormalization_NonNewlineDelimiter)
+{
+  std::string input{"{\"a\": \"1\n2\"}z{\'a\': 12}"};
+  std::string output{"{\"a\": \"1\n2\"}z{\"a\": 12}"};
+  run_test(input, output, 'z');
+}
+
 TEST_F(JsonNormalizationTest, ReadJsonOption)
 {
   // RMM memory resource
@@ -179,22 +188,24 @@ TEST_F(JsonNormalizationTest, ReadJsonOption)
     std::make_shared<rmm::mr::cuda_memory_resource>();
 
   // Test input
-  std::string const host_input = R"({"A":'TEST"'})";
+  std::string const host_input = R"({"a": "1\n2"}h{'a': 12})";
   cudf::io::json_reader_options input_options =
     cudf::io::json_reader_options::builder(
       cudf::io::source_info{host_input.data(), host_input.size()})
       .lines(true)
+      .delimiter('h')
       .normalize_single_quotes(true);
 
   cudf::io::table_with_metadata processed_table =
     cudf::io::read_json(input_options, cudf::test::get_default_stream(), rsc.get());
 
   // Expected table
-  std::string const expected_input = R"({"A":"TEST\""})";
+  std::string const expected_input = R"({"a": "1\n2"}h{"a": 12})";
   cudf::io::json_reader_options expected_input_options =
     cudf::io::json_reader_options::builder(
       cudf::io::source_info{expected_input.data(), expected_input.size()})
-      .lines(true);
+      .lines(true)
+      .delimiter('h');
 
   cudf::io::table_with_metadata expected_table =
     cudf::io::read_json(expected_input_options, cudf::test::get_default_stream(), rsc.get());

From 84743c3d413f386077ff6f5f162e5d5159449ccd Mon Sep 17 00:00:00 2001
From: GALI PREM SAGAR <sagarprem75@gmail.com>
Date: Mon, 11 Nov 2024 18:19:28 -0600
Subject: [PATCH 09/19] Fix `Dataframe.__setitem__` slow-downs (#17222)

Fixes: #17140

This PR fixes slow-downs in `DataFrame.__setitem__` by passing CPU objects where needed, instead of passing a GPU object, failing, and then performing a GPU -> CPU transfer.

The first argument of `DataFrame.__setitem__` can be a `pd.Index` (for example, `df.columns`). In our fast path this argument is converted to a `cudf.Index`, which causes a failure on the cudf side; the data is then transferred to the CPU and the slow path executes. This is the primary reason for the slowdown. This PR maintains a dict of such special functions, mapping each one to the arguments that should not be converted to fast-path objects.

Authors:
  - GALI PREM SAGAR (https://github.com/galipremsagar)

Approvers:
  - Matthew Murray (https://github.com/Matt711)

URL: https://github.com/rapidsai/cudf/pull/17222
---
 python/cudf/cudf/pandas/fast_slow_proxy.py    | 49 ++++++++++++++++++-
 .../cudf_pandas_tests/test_cudf_pandas.py     | 23 +++++++++
 2 files changed, 71 insertions(+), 1 deletion(-)

diff --git a/python/cudf/cudf/pandas/fast_slow_proxy.py b/python/cudf/cudf/pandas/fast_slow_proxy.py
index 99c0cb82f41..9768a6c4a2f 100644
--- a/python/cudf/cudf/pandas/fast_slow_proxy.py
+++ b/python/cudf/cudf/pandas/fast_slow_proxy.py
@@ -33,6 +33,20 @@ def call_operator(fn, args, kwargs):
     "EXECUTE_SLOW": 0x0571B0,
 }
 
+# This is a dict of functions that are known to have arguments that
+# need to be transformed from fast to slow only, i.e., some cudf functions
+# error on passing a device object but don't error on passing a host object.
+# For example: DataFrame.__setitem__(arg, value) errors on passing a
+# cudf.Index object but doesn't error on passing a pd.Index object.
+# Hence we need to transform the arg from fast to slow only. So, we use
+# a dictionary like:
+# {"DataFrame.__setitem__": {0}}
+# where the keys are the function names and the values are the indices
+# (0-based) of the arguments that need to be transformed.
+
+_SPECIAL_FUNCTIONS_ARGS_MAP = {
+    "DataFrame.__setitem__": {0},
+}
 
 _WRAPPER_ASSIGNMENTS = tuple(
     attr
@@ -875,6 +889,10 @@ def __name__(self, value):
             pass
         setattr(self._fsproxy_slow, "__name__", value)
 
+    @property
+    def _customqualname(self):
+        return self._fsproxy_slow.__qualname__
+
 
 def _assert_fast_slow_eq(left, right):
     if _is_final_type(type(left)) or type(left) in NUMPY_TYPES:
@@ -1011,7 +1029,36 @@ def _transform_arg(
         # use __reduce_ex__ instead...
         if type(arg) is tuple:
             # Must come first to avoid infinite recursion
-            return tuple(_transform_arg(a, attribute_name, seen) for a in arg)
+            if (
+                len(arg) > 0
+                and isinstance(arg[0], _MethodProxy)
+                and arg[0]._customqualname in _SPECIAL_FUNCTIONS_ARGS_MAP
+            ):
+                indices_map = _SPECIAL_FUNCTIONS_ARGS_MAP[
+                    arg[0]._customqualname
+                ]
+                method_proxy, original_args, original_kwargs = arg
+
+                original_args = tuple(
+                    _transform_arg(a, "_fsproxy_slow", seen)
+                    if i - 1 in indices_map
+                    else _transform_arg(a, attribute_name, seen)
+                    for i, a in enumerate(original_args)
+                )
+                original_kwargs = _transform_arg(
+                    original_kwargs, attribute_name, seen
+                )
+                return tuple(
+                    (
+                        _transform_arg(method_proxy, attribute_name, seen),
+                        original_args,
+                        original_kwargs,
+                    )
+                )
+            else:
+                return tuple(
+                    _transform_arg(a, attribute_name, seen) for a in arg
+                )
         elif hasattr(arg, "__getnewargs_ex__"):
             # Partial implementation of to reconstruct with
             # transformed pieces
diff --git a/python/cudf/cudf_pandas_tests/test_cudf_pandas.py b/python/cudf/cudf_pandas_tests/test_cudf_pandas.py
index e260b448219..d48fbad0ec3 100644
--- a/python/cudf/cudf_pandas_tests/test_cudf_pandas.py
+++ b/python/cudf/cudf_pandas_tests/test_cudf_pandas.py
@@ -12,6 +12,7 @@
 import pickle
 import subprocess
 import tempfile
+import time
 import types
 from io import BytesIO, StringIO
 
@@ -1795,3 +1796,25 @@ def test_iter_doesnot_raise(monkeypatch):
         monkeycontext.setenv("CUDF_PANDAS_FAIL_ON_FALLBACK", "True")
         for _ in s:
             pass
+
+
+def test_dataframe_setitem_slowdown():
+    # We are explicitly testing the slowdown of the setitem operation
+    df = xpd.DataFrame(
+        {"a": [1, 2, 3] * 100000, "b": [1, 2, 3] * 100000}
+    ).astype("float64")
+    df = xpd.DataFrame({"a": df["a"].repeat(1000), "b": df["b"].repeat(1000)})
+    new_df = df + 1
+    start_time = time.time()
+    df[df.columns] = new_df
+    end_time = time.time()
+    delta = int(end_time - start_time)
+    if delta > 5:
+        pytest.fail(f"Test took too long to run, runtime: {delta}")
+
+
+def test_dataframe_setitem():
+    df = xpd.DataFrame({"a": [1, 2, 3], "b": [1, 2, 3]}).astype("float64")
+    new_df = df + 1
+    df[df.columns] = new_df
+    tm.assert_equal(df, new_df)

From 61031ccd5977d5d85bf0b8e9c32bea1c853a25ae Mon Sep 17 00:00:00 2001
From: Shruti Shivakumar <shruti.shivakumar@gmail.com>
Date: Mon, 11 Nov 2024 21:57:47 -0500
Subject: [PATCH 10/19] Expose streams in public quantile APIs (#17257)

Adds stream parameter to
```
cudf::quantile
cudf::quantiles
cudf::percentile_approx
```
Added stream gtests to verify correct stream forwarding.

Reference: #13744

Authors:
  - Shruti Shivakumar (https://github.com/shrshi)

Approvers:
  - Paul Mattione (https://github.com/pmattione-nvidia)
  - David Wendt (https://github.com/davidwendt)

URL: https://github.com/rapidsai/cudf/pull/17257
---
 cpp/include/cudf/quantiles.hpp       |  6 +++
 cpp/src/quantiles/quantile.cu        |  3 +-
 cpp/src/quantiles/quantiles.cu       | 11 ++---
 cpp/src/quantiles/tdigest/tdigest.cu |  3 +-
 cpp/tests/CMakeLists.txt             |  1 +
 cpp/tests/streams/quantile_test.cpp  | 74 ++++++++++++++++++++++++++++
 6 files changed, 88 insertions(+), 10 deletions(-)
 create mode 100644 cpp/tests/streams/quantile_test.cpp

diff --git a/cpp/include/cudf/quantiles.hpp b/cpp/include/cudf/quantiles.hpp
index f6bae170f03..f0039734519 100644
--- a/cpp/include/cudf/quantiles.hpp
+++ b/cpp/include/cudf/quantiles.hpp
@@ -48,6 +48,7 @@ namespace CUDF_EXPORT cudf {
  *                            ignored.
  * @param[in] exact           If true, returns doubles.
  *                            If false, returns same type as input.
+ * @param[in] stream          CUDA stream used for device memory operations and kernel launches
  * @param[in] mr              Device memory resource used to allocate the returned column's device
  memory
  * @returns Column of specified quantiles, with nulls for indeterminable values
@@ -59,6 +60,7 @@ std::unique_ptr<column> quantile(
   interpolation interp               = interpolation::LINEAR,
   column_view const& ordered_indices = {},
   bool exact                         = true,
+  rmm::cuda_stream_view stream       = cudf::get_default_stream(),
   rmm::device_async_resource_ref mr  = cudf::get_current_device_resource_ref());
 
 /**
@@ -85,6 +87,7 @@ std::unique_ptr<column> quantile(
  * @param is_input_sorted Indicates if the input has been pre-sorted
  * @param column_order    The desired sort order for each column
  * @param null_precedence The desired order of null compared to other elements
+ * @param stream          CUDA stream used for device memory operations and kernel launches
  * @param mr              Device memory resource used to allocate the returned table's device memory
  *
  * @returns Table of specified quantiles, with nulls for indeterminable values
@@ -98,6 +101,7 @@ std::unique_ptr<table> quantiles(
   cudf::sorted is_input_sorted                   = sorted::NO,
   std::vector<order> const& column_order         = {},
   std::vector<null_order> const& null_precedence = {},
+  rmm::cuda_stream_view stream                   = cudf::get_default_stream(),
   rmm::device_async_resource_ref mr              = cudf::get_current_device_resource_ref());
 
 /**
@@ -114,6 +118,7 @@ std::unique_ptr<table> quantiles(
  *
  * @param input           tdigest input data. One tdigest per row
  * @param percentiles     Desired percentiles in range [0, 1]
+ * @param stream          CUDA stream used for device memory operations and kernel launches
  * @param mr              Device memory resource used to allocate the returned column's device
  * memory
  *
@@ -125,6 +130,7 @@ std::unique_ptr<table> quantiles(
 std::unique_ptr<column> percentile_approx(
   tdigest::tdigest_column_view const& input,
   column_view const& percentiles,
+  rmm::cuda_stream_view stream      = cudf::get_default_stream(),
   rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
 /** @} */  // end of group
diff --git a/cpp/src/quantiles/quantile.cu b/cpp/src/quantiles/quantile.cu
index 80fd72a3088..21f6fe87a62 100644
--- a/cpp/src/quantiles/quantile.cu
+++ b/cpp/src/quantiles/quantile.cu
@@ -195,10 +195,11 @@ std::unique_ptr<column> quantile(column_view const& input,
                                  interpolation interp,
                                  column_view const& ordered_indices,
                                  bool exact,
+                                 rmm::cuda_stream_view stream,
                                  rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
-  return detail::quantile(input, q, interp, ordered_indices, exact, cudf::get_default_stream(), mr);
+  return detail::quantile(input, q, interp, ordered_indices, exact, stream, mr);
 }
 
 }  // namespace cudf
diff --git a/cpp/src/quantiles/quantiles.cu b/cpp/src/quantiles/quantiles.cu
index 69421f3bfc4..a94fb9362b9 100644
--- a/cpp/src/quantiles/quantiles.cu
+++ b/cpp/src/quantiles/quantiles.cu
@@ -103,17 +103,12 @@ std::unique_ptr<table> quantiles(table_view const& input,
                                  cudf::sorted is_input_sorted,
                                  std::vector<order> const& column_order,
                                  std::vector<null_order> const& null_precedence,
+                                 rmm::cuda_stream_view stream,
                                  rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
-  return detail::quantiles(input,
-                           q,
-                           interp,
-                           is_input_sorted,
-                           column_order,
-                           null_precedence,
-                           cudf::get_default_stream(),
-                           mr);
+  return detail::quantiles(
+    input, q, interp, is_input_sorted, column_order, null_precedence, stream, mr);
 }
 
 }  // namespace cudf
diff --git a/cpp/src/quantiles/tdigest/tdigest.cu b/cpp/src/quantiles/tdigest/tdigest.cu
index 43c3b0a291b..fb5aebb4b39 100644
--- a/cpp/src/quantiles/tdigest/tdigest.cu
+++ b/cpp/src/quantiles/tdigest/tdigest.cu
@@ -410,10 +410,11 @@ std::unique_ptr<column> percentile_approx(tdigest_column_view const& input,
 
 std::unique_ptr<column> percentile_approx(tdigest_column_view const& input,
                                           column_view const& percentiles,
+                                          rmm::cuda_stream_view stream,
                                           rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
-  return tdigest::percentile_approx(input, percentiles, cudf::get_default_stream(), mr);
+  return tdigest::percentile_approx(input, percentiles, stream, mr);
 }
 
 }  // namespace cudf
diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt
index f502195aea4..3a9b930830b 100644
--- a/cpp/tests/CMakeLists.txt
+++ b/cpp/tests/CMakeLists.txt
@@ -711,6 +711,7 @@ ConfigureTest(STREAM_ORCIO_TEST streams/io/orc_test.cpp STREAM_MODE testing)
 ConfigureTest(STREAM_PARQUETIO_TEST streams/io/parquet_test.cpp STREAM_MODE testing)
 ConfigureTest(STREAM_PARTITIONING_TEST streams/partitioning_test.cpp STREAM_MODE testing)
 ConfigureTest(STREAM_POOL_TEST streams/pool_test.cu STREAM_MODE testing)
+ConfigureTest(STREAM_QUANTILE_TEST streams/quantile_test.cpp STREAM_MODE testing)
 ConfigureTest(STREAM_REDUCTION_TEST streams/reduction_test.cpp STREAM_MODE testing)
 ConfigureTest(STREAM_REPLACE_TEST streams/replace_test.cpp STREAM_MODE testing)
 ConfigureTest(STREAM_RESHAPE_TEST streams/reshape_test.cpp STREAM_MODE testing)
diff --git a/cpp/tests/streams/quantile_test.cpp b/cpp/tests/streams/quantile_test.cpp
new file mode 100644
index 00000000000..4f4f16a9e70
--- /dev/null
+++ b/cpp/tests/streams/quantile_test.cpp
@@ -0,0 +1,74 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <cudf_test/base_fixture.hpp>
+#include <cudf_test/column_wrapper.hpp>
+
+#include <cudf/column/column_view.hpp>
+#include <cudf/detail/tdigest/tdigest.hpp>
+#include <cudf/quantiles.hpp>
+#include <cudf/table/table_view.hpp>
+#include <cudf/types.hpp>
+
+#include <stdexcept>
+
+struct QuantileTest : public cudf::test::BaseFixture {};
+
+TEST_F(QuantileTest, TestMultiColumnUnsorted)
+{
+  auto input_a = cudf::test::strings_column_wrapper(
+    {"C", "B", "A", "A", "D", "B", "D", "B", "D", "C", "C", "C",
+     "D", "B", "D", "B", "C", "C", "A", "D", "B", "A", "A", "A"},
+    {true, true, true, true, true, true, true, true, true, true, true, true,
+     true, true, true, true, true, true, true, true, true, true, true, true});
+
+  cudf::test::fixed_width_column_wrapper<numeric::decimal32, int32_t> input_b(
+    {4, 3, 5, 0, 1, 0, 4, 1, 5, 3, 0, 5, 2, 4, 3, 2, 1, 2, 3, 0, 5, 1, 4, 2},
+    {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1});
+
+  auto input = cudf::table_view({input_a, input_b});
+
+  auto actual = cudf::quantiles(input,
+                                {0.0f, 0.5f, 0.7f, 0.25f, 1.0f},
+                                cudf::interpolation::NEAREST,
+                                cudf::sorted::NO,
+                                {cudf::order::ASCENDING, cudf::order::DESCENDING},
+                                {},
+                                cudf::test::get_default_stream());
+}
+
+TEST_F(QuantileTest, TestEmpty)
+{
+  auto input = cudf::test::fixed_width_column_wrapper<numeric::decimal32>({});
+  cudf::quantile(
+    input, {0.5, 0.25}, cudf::interpolation::LINEAR, {}, true, cudf::test::get_default_stream());
+}
+
+TEST_F(QuantileTest, EmptyInput)
+{
+  auto empty_ = cudf::tdigest::detail::make_empty_tdigests_column(
+    1, cudf::test::get_default_stream(), cudf::get_current_device_resource_ref());
+  cudf::test::fixed_width_column_wrapper<double> percentiles{0.0, 0.25, 0.3};
+
+  std::vector<cudf::column_view> input;
+  input.push_back(*empty_);
+  input.push_back(*empty_);
+  input.push_back(*empty_);
+  auto empty = cudf::concatenate(input, cudf::test::get_default_stream());
+
+  cudf::tdigest::tdigest_column_view tdv(*empty);
+  auto result = cudf::percentile_approx(tdv, percentiles, cudf::test::get_default_stream());
+}

From bdddab39826c061d3fad932aa306ba9313b1d062 Mon Sep 17 00:00:00 2001
From: "Mads R. B. Kristensen" <madsbk@gmail.com>
Date: Tue, 12 Nov 2024 04:52:11 +0100
Subject: [PATCH 11/19] cmake option: `CUDF_KVIKIO_REMOTE_IO` (#17291)

Compile flag to enable/disable remote IO through KvikIO: `CUDF_KVIKIO_REMOTE_IO`

Authors:
  - Mads R. B. Kristensen (https://github.com/madsbk)

Approvers:
  - Tianyu Liu (https://github.com/kingcrimsontianyu)
  - Bradley Dice (https://github.com/bdice)

URL: https://github.com/rapidsai/cudf/pull/17291
---
 cpp/CMakeLists.txt                    | 12 ++++++++++++
 cpp/cmake/thirdparty/get_kvikio.cmake |  2 +-
 cpp/src/io/utilities/datasource.cpp   | 19 ++++++++++++++++---
 3 files changed, 29 insertions(+), 4 deletions(-)

diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
index 559826ac232..65b05fd518b 100644
--- a/cpp/CMakeLists.txt
+++ b/cpp/CMakeLists.txt
@@ -90,6 +90,12 @@ option(
 mark_as_advanced(CUDF_BUILD_STREAMS_TEST_UTIL)
 option(CUDF_STATIC_LINTERS "Enable static linters during compilation" OFF)
 
+option(
+  CUDF_KVIKIO_REMOTE_IO
+  "Enable remote IO (e.g. AWS S3) support through KvikIO. If disabled, cudf-python will still be able to do remote IO through fsspec."
+  ON
+)
+
 message(VERBOSE "CUDF: Build with NVTX support: ${USE_NVTX}")
 message(VERBOSE "CUDF: Configure CMake to build tests: ${BUILD_TESTS}")
 message(VERBOSE "CUDF: Configure CMake to build (google & nvbench) benchmarks: ${BUILD_BENCHMARKS}")
@@ -109,6 +115,9 @@ message(
   "CUDF: Enable the -lineinfo option for nvcc (useful for cuda-memcheck / profiler): ${CUDA_ENABLE_LINEINFO}"
 )
 message(VERBOSE "CUDF: Statically link the CUDA runtime: ${CUDA_STATIC_RUNTIME}")
+message(VERBOSE
+        "CUDF: Build with remote IO (e.g. AWS S3) support through KvikIO: ${CUDF_KVIKIO_REMOTE_IO}"
+)
 
 # Set a default build type if none was specified
 rapids_cmake_build_type("Release")
@@ -890,6 +899,9 @@ target_compile_definitions(cudf PRIVATE "RMM_LOGGING_LEVEL=LIBCUDF_LOGGING_LEVEL
 # Define spdlog level
 target_compile_definitions(cudf PUBLIC "SPDLOG_ACTIVE_LEVEL=SPDLOG_LEVEL_${LIBCUDF_LOGGING_LEVEL}")
 
+# Enable remote IO through KvikIO
+target_compile_definitions(cudf PRIVATE $<$<BOOL:${CUDF_KVIKIO_REMOTE_IO}>:CUDF_KVIKIO_REMOTE_IO>)
+
 # Compile stringified JIT sources first
 add_dependencies(cudf jitify_preprocess_run)
 
diff --git a/cpp/cmake/thirdparty/get_kvikio.cmake b/cpp/cmake/thirdparty/get_kvikio.cmake
index c949f48505e..73f875b46c2 100644
--- a/cpp/cmake/thirdparty/get_kvikio.cmake
+++ b/cpp/cmake/thirdparty/get_kvikio.cmake
@@ -22,7 +22,7 @@ function(find_and_configure_kvikio VERSION)
     GIT_REPOSITORY https://github.com/rapidsai/kvikio.git
     GIT_TAG branch-${VERSION}
     GIT_SHALLOW TRUE SOURCE_SUBDIR cpp
-    OPTIONS "KvikIO_BUILD_EXAMPLES OFF"
+    OPTIONS "KvikIO_BUILD_EXAMPLES OFF" "KvikIO_REMOTE_SUPPORT ${CUDF_KVIKIO_REMOTE_IO}"
   )
 
   include("${rapids-cmake-dir}/export/find_package_root.cmake")
diff --git a/cpp/src/io/utilities/datasource.cpp b/cpp/src/io/utilities/datasource.cpp
index 9ea39e692b6..5ccc91e4220 100644
--- a/cpp/src/io/utilities/datasource.cpp
+++ b/cpp/src/io/utilities/datasource.cpp
@@ -26,7 +26,6 @@
 #include <cudf/utilities/span.hpp>
 
 #include <kvikio/file_handle.hpp>
-#include <kvikio/remote_handle.hpp>
 
 #include <rmm/device_buffer.hpp>
 
@@ -37,6 +36,10 @@
 #include <regex>
 #include <vector>
 
+#ifdef CUDF_KVIKIO_REMOTE_IO
+#include <kvikio/remote_handle.hpp>
+#endif
+
 namespace cudf {
 namespace io {
 namespace {
@@ -391,6 +394,7 @@ class user_datasource_wrapper : public datasource {
   datasource* const source;  ///< A non-owning pointer to the user-implemented datasource
 };
 
+#ifdef CUDF_KVIKIO_REMOTE_IO
 /**
  * @brief Remote file source backed by KvikIO, which handles S3 filepaths seamlessly.
  */
@@ -463,14 +467,23 @@ class remote_file_source : public datasource {
   static bool is_supported_remote_url(std::string const& url)
   {
     // Regular expression to match "s3://"
-    std::regex pattern{R"(^s3://)", std::regex_constants::icase};
+    static std::regex pattern{R"(^s3://)", std::regex_constants::icase};
     return std::regex_search(url, pattern);
   }
 
  private:
   kvikio::RemoteHandle _kvikio_file;
 };
-
+#else
+/**
+ * @brief When KvikIO remote IO is disabled, `is_supported_remote_url()` always returns false.
+ */
+class remote_file_source : public file_source {
+ public:
+  explicit remote_file_source(char const* filepath) : file_source(filepath) {}
+  static constexpr bool is_supported_remote_url(std::string const&) { return false; }
+};
+#endif
 }  // namespace
 
 std::unique_ptr<datasource> datasource::create(std::string const& filepath,

From 202c2318282e859c8a156a48cfbc133dd2941117 Mon Sep 17 00:00:00 2001
From: Peixin <pxli@nyu.edu>
Date: Tue, 12 Nov 2024 12:36:44 +0800
Subject: [PATCH 12/19] Replace workaround of JNI build with
 CUDF_KVIKIO_REMOTE_IO=OFF (#17293)

The JNI build does not require KvikIO; to unblock the build, use `CUDF_KVIKIO_REMOTE_IO=OFF` in the cpp build phase.

This should be merged after https://github.com/rapidsai/cudf/pull/17291

Authors:
  - Peixin (https://github.com/pxLi)

Approvers:
  - Nghia Truong (https://github.com/ttnghia)

URL: https://github.com/rapidsai/cudf/pull/17293
---
 java/ci/build-in-docker.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/java/ci/build-in-docker.sh b/java/ci/build-in-docker.sh
index 4b5379cf0f1..b85c215d7d1 100755
--- a/java/ci/build-in-docker.sh
+++ b/java/ci/build-in-docker.sh
@@ -65,7 +65,7 @@ cmake .. -G"${CMAKE_GENERATOR}" \
          -DCUDF_USE_PER_THREAD_DEFAULT_STREAM=$ENABLE_PTDS \
          -DRMM_LOGGING_LEVEL=$RMM_LOGGING_LEVEL \
          -DBUILD_SHARED_LIBS=OFF \
-         -DKvikIO_REMOTE_SUPPORT=OFF
+         -DCUDF_KVIKIO_REMOTE_IO=OFF
 
 if [[ -z "${PARALLEL_LEVEL}" ]]; then
     cmake --build .

From 043bcbdf28aa9f7213c3f1f2b4170f4940c9d39e Mon Sep 17 00:00:00 2001
From: Matthew Murray <41342305+Matt711@users.noreply.github.com>
Date: Tue, 12 Nov 2024 07:12:05 -0500
Subject: [PATCH 13/19] [FEA] Report all unsupported operations for a query in
 cudf.polars (#16960)

Closes #16690. The purpose of this PR is to list all of the unique operations that are unsupported by `cudf.polars` when running a query.

1. Question: How to traverse the tree to report the error nodes? Should this be done upstream in Polars?
2. Instead of traversing the query afterwards, we should probably catch each unsupported feature as we translate the IR.

Authors:
  - Matthew Murray (https://github.com/Matt711)

Approvers:
  - Vyas Ramasubramani (https://github.com/vyasr)
  - Lawrence Mitchell (https://github.com/wence-)

URL: https://github.com/rapidsai/cudf/pull/16960
---
 python/cudf_polars/cudf_polars/__init__.py    |   4 +-
 python/cudf_polars/cudf_polars/callback.py    |  32 +-
 python/cudf_polars/cudf_polars/dsl/expr.py    |   2 +
 .../cudf_polars/dsl/expressions/base.py       |  11 +
 python/cudf_polars/cudf_polars/dsl/ir.py      |  19 +-
 .../cudf_polars/cudf_polars/dsl/translate.py  | 382 ++++++++++--------
 .../cudf_polars/testing/asserts.py            |  14 +-
 .../cudf_polars/cudf_polars/utils/dtypes.py   |  11 +-
 python/cudf_polars/docs/overview.md           |   4 +-
 python/cudf_polars/tests/dsl/test_to_ast.py   |   4 +-
 .../cudf_polars/tests/dsl/test_traversal.py   |   8 +-
 .../tests/expressions/test_sort.py            |   4 +-
 python/cudf_polars/tests/test_mapfunction.py  |  13 -
 13 files changed, 297 insertions(+), 211 deletions(-)

diff --git a/python/cudf_polars/cudf_polars/__init__.py b/python/cudf_polars/cudf_polars/__init__.py
index 66c15f694ee..ba4858c5619 100644
--- a/python/cudf_polars/cudf_polars/__init__.py
+++ b/python/cudf_polars/cudf_polars/__init__.py
@@ -12,7 +12,7 @@
 
 from cudf_polars._version import __git_commit__, __version__
 from cudf_polars.callback import execute_with_cudf
-from cudf_polars.dsl.translate import translate_ir
+from cudf_polars.dsl.translate import Translator
 
 # Check we have a supported polars version
 from cudf_polars.utils.versions import _ensure_polars_version
@@ -22,7 +22,7 @@
 
 __all__: list[str] = [
     "execute_with_cudf",
-    "translate_ir",
+    "Translator",
     "__git_commit__",
     "__version__",
 ]
diff --git a/python/cudf_polars/cudf_polars/callback.py b/python/cudf_polars/cudf_polars/callback.py
index 76816ee0a61..ff4933c7564 100644
--- a/python/cudf_polars/cudf_polars/callback.py
+++ b/python/cudf_polars/cudf_polars/callback.py
@@ -18,7 +18,7 @@
 import rmm
 from rmm._cuda import gpu
 
-from cudf_polars.dsl.translate import translate_ir
+from cudf_polars.dsl.translate import Translator
 
 if TYPE_CHECKING:
     from collections.abc import Generator
@@ -180,14 +180,30 @@ def execute_with_cudf(
         )
     try:
         with nvtx.annotate(message="ConvertIR", domain="cudf_polars"):
-            nt.set_udf(
-                partial(
-                    _callback,
-                    translate_ir(nt),
-                    device=device,
-                    memory_resource=memory_resource,
+            translator = Translator(nt)
+            ir = translator.translate_ir()
+            ir_translation_errors = translator.errors
+            if len(ir_translation_errors):
+                # TODO: Display these errors in user-friendly way.
+                # tracked in https://github.com/rapidsai/cudf/issues/17051
+                unique_errors = sorted(set(ir_translation_errors), key=str)
+                error_message = "Query contained unsupported operations"
+                verbose_error_message = (
+                    f"{error_message}\nThe errors were:\n{unique_errors}"
+                )
+                unsupported_ops_exception = NotImplementedError(
+                    error_message, unique_errors
+                )
+                if bool(int(os.environ.get("POLARS_VERBOSE", 0))):
+                    warnings.warn(verbose_error_message, UserWarning, stacklevel=2)
+                if raise_on_fail:
+                    raise unsupported_ops_exception
+            else:
+                nt.set_udf(
+                    partial(
+                        _callback, ir, device=device, memory_resource=memory_resource
+                    )
                 )
-            )
     except exception as e:
         if bool(int(os.environ.get("POLARS_VERBOSE", 0))):
             warnings.warn(
diff --git a/python/cudf_polars/cudf_polars/dsl/expr.py b/python/cudf_polars/cudf_polars/dsl/expr.py
index 1881286ccbb..326d6b65cbe 100644
--- a/python/cudf_polars/cudf_polars/dsl/expr.py
+++ b/python/cudf_polars/cudf_polars/dsl/expr.py
@@ -20,6 +20,7 @@
     AggInfo,
     Col,
     ColRef,
+    ErrorExpr,
     Expr,
     NamedExpr,
 )
@@ -36,6 +37,7 @@
 
 __all__ = [
     "Expr",
+    "ErrorExpr",
     "NamedExpr",
     "Literal",
     "LiteralColumn",
diff --git a/python/cudf_polars/cudf_polars/dsl/expressions/base.py b/python/cudf_polars/cudf_polars/dsl/expressions/base.py
index 21ba7aea707..23851f91938 100644
--- a/python/cudf_polars/cudf_polars/dsl/expressions/base.py
+++ b/python/cudf_polars/cudf_polars/dsl/expressions/base.py
@@ -155,6 +155,17 @@ def collect_agg(self, *, depth: int) -> AggInfo:
         )  # pragma: no cover; check_agg trips first
 
 
+class ErrorExpr(Expr):
+    __slots__ = ("error",)
+    _non_child = ("dtype", "error")
+    error: str
+
+    def __init__(self, dtype: plc.DataType, error: str) -> None:
+        self.dtype = dtype
+        self.error = error
+        self.children = ()
+
+
 class NamedExpr:
     # NamedExpr does not inherit from Expr since it does not appear
     # when evaluating expressions themselves, only when constructing
diff --git a/python/cudf_polars/cudf_polars/dsl/ir.py b/python/cudf_polars/cudf_polars/dsl/ir.py
index bc42b4a254f..beea5908e56 100644
--- a/python/cudf_polars/cudf_polars/dsl/ir.py
+++ b/python/cudf_polars/cudf_polars/dsl/ir.py
@@ -42,6 +42,7 @@
 
 __all__ = [
     "IR",
+    "ErrorNode",
     "PythonScan",
     "Scan",
     "Cache",
@@ -212,6 +213,22 @@ def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame:
         )
 
 
+class ErrorNode(IR):
+    """Represents an error translating the IR."""
+
+    __slots__ = ("error",)
+    _non_child = (
+        "schema",
+        "error",
+    )
+    error: str
+    """The error."""
+
+    def __init__(self, schema: Schema, error: str):
+        self.schema = schema
+        self.error = error
+
+
 class PythonScan(IR):
     """Representation of input from a python function."""
 
@@ -1532,7 +1549,7 @@ def __init__(self, schema: Schema, name: str, options: Any, df: IR):
                 raise NotImplementedError(
                     "Unpivot cannot cast all input columns to "
                     f"{self.schema[value_name].id()}"
-                )
+                )  # pragma: no cover
             self.options = (
                 tuple(indices),
                 tuple(pivotees),
diff --git a/python/cudf_polars/cudf_polars/dsl/translate.py b/python/cudf_polars/cudf_polars/dsl/translate.py
index 2711676d31e..e8ed009cdf2 100644
--- a/python/cudf_polars/cudf_polars/dsl/translate.py
+++ b/python/cudf_polars/cudf_polars/dsl/translate.py
@@ -9,7 +9,7 @@
 import json
 from contextlib import AbstractContextManager, nullcontext
 from functools import singledispatch
-from typing import Any
+from typing import TYPE_CHECKING, Any
 
 import pyarrow as pa
 from typing_extensions import assert_never
@@ -25,7 +25,123 @@
 from cudf_polars.typing import NodeTraverser
 from cudf_polars.utils import dtypes, sorting
 
-__all__ = ["translate_ir", "translate_named_expr"]
+if TYPE_CHECKING:
+    from cudf_polars.typing import NodeTraverser
+
+__all__ = ["Translator", "translate_named_expr"]
+
+
+class Translator:
+    """
+    Translates polars-internal IR nodes and expressions to our representation.
+
+    Parameters
+    ----------
+    visitor
+        Polars NodeTraverser object
+    """
+
+    def __init__(self, visitor: NodeTraverser):
+        self.visitor = visitor
+        self.errors: list[Exception] = []
+
+    def translate_ir(self, *, n: int | None = None) -> ir.IR:
+        """
+        Translate a polars-internal IR node to our representation.
+
+        Parameters
+        ----------
+        n
+            Optional node to start traversing from, if not provided uses
+            current polars-internal node. Traversal goes through the
+            NodeTraverser held in ``self.visitor`` (set at construction);
+            it is not an argument of this method.
+
+        Returns
+        -------
+        Translated IR object
+
+        Raises
+        ------
+        NotImplementedError
+            If the version of Polars IR is unsupported.
+
+        Notes
+        -----
+        Any IR nodes that cannot be translated are replaced by
+        :class:`ir.ErrorNode` nodes and collected in the `errors` attribute.
+        After translation is complete, this list of errors should be inspected
+        to determine if the query is supported.
+        """
+        ctx: AbstractContextManager[None] = (
+            set_node(self.visitor, n) if n is not None else noop_context
+        )
+        # IR is versioned with major.minor, minor is bumped for backwards
+        # compatible changes (e.g. adding new nodes), major is bumped for
+        # incompatible changes (e.g. renaming nodes).
+        if (version := self.visitor.version()) >= (4, 0):
+            e = NotImplementedError(
+                f"No support for polars IR {version=}"
+            )  # pragma: no cover; no such version for now.
+            self.errors.append(e)  # pragma: no cover
+            raise e  # pragma: no cover
+
+        with ctx:
+            polars_schema = self.visitor.get_schema()
+            try:
+                schema = {k: dtypes.from_polars(v) for k, v in polars_schema.items()}
+            except Exception as e:
+                self.errors.append(NotImplementedError(str(e)))
+                return ir.ErrorNode({}, str(e))
+            try:
+                node = self.visitor.view_current_node()
+            except Exception as e:
+                self.errors.append(e)
+                return ir.ErrorNode(schema, str(e))
+            try:
+                result = _translate_ir(node, self, schema)
+            except Exception as e:
+                self.errors.append(e)
+                return ir.ErrorNode(schema, str(e))
+            if any(
+                isinstance(dtype, pl.Null)
+                for dtype in pl.datatypes.unpack_dtypes(*polars_schema.values())
+            ):
+                error = NotImplementedError(
+                    f"No GPU support for {result} with Null column dtype."
+                )
+                self.errors.append(error)
+                return ir.ErrorNode(schema, str(error))
+
+            return result
+
+    def translate_expr(self, *, n: int) -> expr.Expr:
+        """
+        Translate a polars-internal expression IR into our representation.
+
+        Parameters
+        ----------
+        n
+            Node to translate, an integer referencing a polars internal node.
+
+        Returns
+        -------
+        Translated IR object.
+
+        Notes
+        -----
+        Any expression nodes that cannot be translated are replaced by
+        :class:`expr.ErrorExpr` nodes and collected in the `errors` attribute.
+        After translation is complete, this list of errors should be inspected
+        to determine if the query is supported.
+        """
+        node = self.visitor.view_expression(n)
+        dtype = dtypes.from_polars(self.visitor.get_dtype(n))
+        try:
+            return _translate_expr(node, self, dtype)
+        except Exception as e:
+            self.errors.append(e)
+            return expr.ErrorExpr(dtype, str(e))
 
 
 class set_node(AbstractContextManager[None]):
@@ -67,7 +183,7 @@ def __exit__(self, *args: Any) -> None:
 
 @singledispatch
 def _translate_ir(
-    node: Any, visitor: NodeTraverser, schema: dict[str, plc.DataType]
+    node: Any, translator: Translator, schema: dict[str, plc.DataType]
 ) -> ir.IR:
     raise NotImplementedError(
         f"Translation for {type(node).__name__}"
@@ -76,19 +192,19 @@ def _translate_ir(
 
 @_translate_ir.register
 def _(
-    node: pl_ir.PythonScan, visitor: NodeTraverser, schema: dict[str, plc.DataType]
+    node: pl_ir.PythonScan, translator: Translator, schema: dict[str, plc.DataType]
 ) -> ir.IR:
     scan_fn, with_columns, source_type, predicate, nrows = node.options
     options = (scan_fn, with_columns, source_type, nrows)
     predicate = (
-        translate_named_expr(visitor, n=predicate) if predicate is not None else None
+        translate_named_expr(translator, n=predicate) if predicate is not None else None
     )
     return ir.PythonScan(schema, options, predicate)
 
 
 @_translate_ir.register
 def _(
-    node: pl_ir.Scan, visitor: NodeTraverser, schema: dict[str, plc.DataType]
+    node: pl_ir.Scan, translator: Translator, schema: dict[str, plc.DataType]
 ) -> ir.IR:
     typ, *options = node.scan_type
     if typ == "ndjson":
@@ -117,7 +233,7 @@ def _(
         skip_rows,
         n_rows,
         row_index,
-        translate_named_expr(visitor, n=node.predicate)
+        translate_named_expr(translator, n=node.predicate)
         if node.predicate is not None
         else None,
     )
@@ -125,20 +241,20 @@ def _(
 
 @_translate_ir.register
 def _(
-    node: pl_ir.Cache, visitor: NodeTraverser, schema: dict[str, plc.DataType]
+    node: pl_ir.Cache, translator: Translator, schema: dict[str, plc.DataType]
 ) -> ir.IR:
-    return ir.Cache(schema, node.id_, translate_ir(visitor, n=node.input))
+    return ir.Cache(schema, node.id_, translator.translate_ir(n=node.input))
 
 
 @_translate_ir.register
 def _(
-    node: pl_ir.DataFrameScan, visitor: NodeTraverser, schema: dict[str, plc.DataType]
+    node: pl_ir.DataFrameScan, translator: Translator, schema: dict[str, plc.DataType]
 ) -> ir.IR:
     return ir.DataFrameScan(
         schema,
         node.df,
         node.projection,
-        translate_named_expr(visitor, n=node.selection)
+        translate_named_expr(translator, n=node.selection)
         if node.selection is not None
         else None,
     )
@@ -146,22 +262,22 @@ def _(
 
 @_translate_ir.register
 def _(
-    node: pl_ir.Select, visitor: NodeTraverser, schema: dict[str, plc.DataType]
+    node: pl_ir.Select, translator: Translator, schema: dict[str, plc.DataType]
 ) -> ir.IR:
-    with set_node(visitor, node.input):
-        inp = translate_ir(visitor, n=None)
-        exprs = [translate_named_expr(visitor, n=e) for e in node.expr]
+    with set_node(translator.visitor, node.input):
+        inp = translator.translate_ir(n=None)
+        exprs = [translate_named_expr(translator, n=e) for e in node.expr]
     return ir.Select(schema, exprs, node.should_broadcast, inp)
 
 
 @_translate_ir.register
 def _(
-    node: pl_ir.GroupBy, visitor: NodeTraverser, schema: dict[str, plc.DataType]
+    node: pl_ir.GroupBy, translator: Translator, schema: dict[str, plc.DataType]
 ) -> ir.IR:
-    with set_node(visitor, node.input):
-        inp = translate_ir(visitor, n=None)
-        aggs = [translate_named_expr(visitor, n=e) for e in node.aggs]
-        keys = [translate_named_expr(visitor, n=e) for e in node.keys]
+    with set_node(translator.visitor, node.input):
+        inp = translator.translate_ir(n=None)
+        aggs = [translate_named_expr(translator, n=e) for e in node.aggs]
+        keys = [translate_named_expr(translator, n=e) for e in node.keys]
     return ir.GroupBy(
         schema,
         keys,
@@ -174,17 +290,17 @@ def _(
 
 @_translate_ir.register
 def _(
-    node: pl_ir.Join, visitor: NodeTraverser, schema: dict[str, plc.DataType]
+    node: pl_ir.Join, translator: Translator, schema: dict[str, plc.DataType]
 ) -> ir.IR:
     # Join key dtypes are dependent on the schema of the left and
     # right inputs, so these must be translated with the relevant
     # input active.
-    with set_node(visitor, node.input_left):
-        inp_left = translate_ir(visitor, n=None)
-        left_on = [translate_named_expr(visitor, n=e) for e in node.left_on]
-    with set_node(visitor, node.input_right):
-        inp_right = translate_ir(visitor, n=None)
-        right_on = [translate_named_expr(visitor, n=e) for e in node.right_on]
+    with set_node(translator.visitor, node.input_left):
+        inp_left = translator.translate_ir(n=None)
+        left_on = [translate_named_expr(translator, n=e) for e in node.left_on]
+    with set_node(translator.visitor, node.input_right):
+        inp_right = translator.translate_ir(n=None)
+        right_on = [translate_named_expr(translator, n=e) for e in node.right_on]
     if (how := node.options[0]) in {
         "inner",
         "left",
@@ -239,27 +355,27 @@ def _(
 
 @_translate_ir.register
 def _(
-    node: pl_ir.HStack, visitor: NodeTraverser, schema: dict[str, plc.DataType]
+    node: pl_ir.HStack, translator: Translator, schema: dict[str, plc.DataType]
 ) -> ir.IR:
-    with set_node(visitor, node.input):
-        inp = translate_ir(visitor, n=None)
-        exprs = [translate_named_expr(visitor, n=e) for e in node.exprs]
+    with set_node(translator.visitor, node.input):
+        inp = translator.translate_ir(n=None)
+        exprs = [translate_named_expr(translator, n=e) for e in node.exprs]
     return ir.HStack(schema, exprs, node.should_broadcast, inp)
 
 
 @_translate_ir.register
 def _(
-    node: pl_ir.Reduce, visitor: NodeTraverser, schema: dict[str, plc.DataType]
+    node: pl_ir.Reduce, translator: Translator, schema: dict[str, plc.DataType]
 ) -> ir.IR:  # pragma: no cover; polars doesn't emit this node yet
-    with set_node(visitor, node.input):
-        inp = translate_ir(visitor, n=None)
-        exprs = [translate_named_expr(visitor, n=e) for e in node.expr]
+    with set_node(translator.visitor, node.input):
+        inp = translator.translate_ir(n=None)
+        exprs = [translate_named_expr(translator, n=e) for e in node.expr]
     return ir.Reduce(schema, exprs, inp)
 
 
 @_translate_ir.register
 def _(
-    node: pl_ir.Distinct, visitor: NodeTraverser, schema: dict[str, plc.DataType]
+    node: pl_ir.Distinct, translator: Translator, schema: dict[str, plc.DataType]
 ) -> ir.IR:
     (keep, subset, maintain_order, zlice) = node.options
     keep = ir.Distinct._KEEP_MAP[keep]
@@ -270,17 +386,17 @@ def _(
         subset,
         zlice,
         maintain_order,
-        translate_ir(visitor, n=node.input),
+        translator.translate_ir(n=node.input),
     )
 
 
 @_translate_ir.register
 def _(
-    node: pl_ir.Sort, visitor: NodeTraverser, schema: dict[str, plc.DataType]
+    node: pl_ir.Sort, translator: Translator, schema: dict[str, plc.DataType]
 ) -> ir.IR:
-    with set_node(visitor, node.input):
-        inp = translate_ir(visitor, n=None)
-        by = [translate_named_expr(visitor, n=e) for e in node.by_column]
+    with set_node(translator.visitor, node.input):
+        inp = translator.translate_ir(n=None)
+        by = [translate_named_expr(translator, n=e) for e in node.by_column]
     stable, nulls_last, descending = node.sort_options
     order, null_order = sorting.sort_order(
         descending, nulls_last=nulls_last, num_keys=len(by)
@@ -290,33 +406,35 @@ def _(
 
 @_translate_ir.register
 def _(
-    node: pl_ir.Slice, visitor: NodeTraverser, schema: dict[str, plc.DataType]
+    node: pl_ir.Slice, translator: Translator, schema: dict[str, plc.DataType]
 ) -> ir.IR:
-    return ir.Slice(schema, node.offset, node.len, translate_ir(visitor, n=node.input))
+    return ir.Slice(
+        schema, node.offset, node.len, translator.translate_ir(n=node.input)
+    )
 
 
 @_translate_ir.register
 def _(
-    node: pl_ir.Filter, visitor: NodeTraverser, schema: dict[str, plc.DataType]
+    node: pl_ir.Filter, translator: Translator, schema: dict[str, plc.DataType]
 ) -> ir.IR:
-    with set_node(visitor, node.input):
-        inp = translate_ir(visitor, n=None)
-        mask = translate_named_expr(visitor, n=node.predicate)
+    with set_node(translator.visitor, node.input):
+        inp = translator.translate_ir(n=None)
+        mask = translate_named_expr(translator, n=node.predicate)
     return ir.Filter(schema, mask, inp)
 
 
 @_translate_ir.register
 def _(
     node: pl_ir.SimpleProjection,
-    visitor: NodeTraverser,
+    translator: Translator,
     schema: dict[str, plc.DataType],
 ) -> ir.IR:
-    return ir.Projection(schema, translate_ir(visitor, n=node.input))
+    return ir.Projection(schema, translator.translate_ir(n=node.input))
 
 
 @_translate_ir.register
 def _(
-    node: pl_ir.MapFunction, visitor: NodeTraverser, schema: dict[str, plc.DataType]
+    node: pl_ir.MapFunction, translator: Translator, schema: dict[str, plc.DataType]
 ) -> ir.IR:
     name, *options = node.function
     return ir.MapFunction(
@@ -324,83 +442,36 @@ def _(
         name,
         options,
         # TODO: merge_sorted breaks this pattern
-        translate_ir(visitor, n=node.input),
+        translator.translate_ir(n=node.input),
     )
 
 
 @_translate_ir.register
 def _(
-    node: pl_ir.Union, visitor: NodeTraverser, schema: dict[str, plc.DataType]
+    node: pl_ir.Union, translator: Translator, schema: dict[str, plc.DataType]
 ) -> ir.IR:
     return ir.Union(
-        schema, node.options, *(translate_ir(visitor, n=n) for n in node.inputs)
+        schema, node.options, *(translator.translate_ir(n=n) for n in node.inputs)
     )
 
 
 @_translate_ir.register
 def _(
-    node: pl_ir.HConcat, visitor: NodeTraverser, schema: dict[str, plc.DataType]
+    node: pl_ir.HConcat, translator: Translator, schema: dict[str, plc.DataType]
 ) -> ir.IR:
-    return ir.HConcat(schema, *(translate_ir(visitor, n=n) for n in node.inputs))
-
-
-def translate_ir(visitor: NodeTraverser, *, n: int | None = None) -> ir.IR:
-    """
-    Translate a polars-internal IR node to our representation.
-
-    Parameters
-    ----------
-    visitor
-        Polars NodeTraverser object
-    n
-        Optional node to start traversing from, if not provided uses
-        current polars-internal node.
-
-    Returns
-    -------
-    Translated IR object
-
-    Raises
-    ------
-    NotImplementedError
-        If we can't translate the nodes due to unsupported functionality.
-    """
-    ctx: AbstractContextManager[None] = (
-        set_node(visitor, n) if n is not None else noop_context
-    )
-    # IR is versioned with major.minor, minor is bumped for backwards
-    # compatible changes (e.g. adding new nodes), major is bumped for
-    # incompatible changes (e.g. renaming nodes).
-    if (version := visitor.version()) >= (4, 0):
-        raise NotImplementedError(
-            f"No support for polars IR {version=}"
-        )  # pragma: no cover; no such version for now.
-
-    with ctx:
-        polars_schema = visitor.get_schema()
-        node = visitor.view_current_node()
-        schema = {k: dtypes.from_polars(v) for k, v in polars_schema.items()}
-        result = _translate_ir(node, visitor, schema)
-        if any(
-            isinstance(dtype, pl.Null)
-            for dtype in pl.datatypes.unpack_dtypes(*polars_schema.values())
-        ):
-            raise NotImplementedError(
-                f"No GPU support for {result} with Null column dtype."
-            )
-        return result
+    return ir.HConcat(schema, *(translator.translate_ir(n=n) for n in node.inputs))
 
 
 def translate_named_expr(
-    visitor: NodeTraverser, *, n: pl_expr.PyExprIR
+    translator: Translator, *, n: pl_expr.PyExprIR
 ) -> expr.NamedExpr:
     """
     Translate a polars-internal named expression IR object into our representation.
 
     Parameters
     ----------
-    visitor
-        Polars NodeTraverser object
+    translator
+        Translator object
     n
         Node to translate, a named expression node.
 
@@ -420,12 +491,12 @@ def translate_named_expr(
     NotImplementedError
         If any translation fails due to unsupported functionality.
     """
-    return expr.NamedExpr(n.output_name, translate_expr(visitor, n=n.node))
+    return expr.NamedExpr(n.output_name, translator.translate_expr(n=n.node))
 
 
 @singledispatch
 def _translate_expr(
-    node: Any, visitor: NodeTraverser, dtype: plc.DataType
+    node: Any, translator: Translator, dtype: plc.DataType
 ) -> expr.Expr:
     raise NotImplementedError(
         f"Translation for {type(node).__name__}"
@@ -433,7 +504,7 @@ def _translate_expr(
 
 
 @_translate_expr.register
-def _(node: pl_expr.Function, visitor: NodeTraverser, dtype: plc.DataType) -> expr.Expr:
+def _(node: pl_expr.Function, translator: Translator, dtype: plc.DataType) -> expr.Expr:
     name, *options = node.function_data
     options = tuple(options)
     if isinstance(name, pl_expr.StringFunction):
@@ -442,7 +513,7 @@ def _(node: pl_expr.Function, visitor: NodeTraverser, dtype: plc.DataType) -> ex
             pl_expr.StringFunction.StripCharsStart,
             pl_expr.StringFunction.StripCharsEnd,
         }:
-            column, chars = (translate_expr(visitor, n=n) for n in node.input)
+            column, chars = (translator.translate_expr(n=n) for n in node.input)
             if isinstance(chars, expr.Literal):
                 if chars.value == pa.scalar(""):
                     # No-op in polars, but libcudf uses empty string
@@ -459,11 +530,11 @@ def _(node: pl_expr.Function, visitor: NodeTraverser, dtype: plc.DataType) -> ex
             dtype,
             name,
             options,
-            *(translate_expr(visitor, n=n) for n in node.input),
+            *(translator.translate_expr(n=n) for n in node.input),
         )
     elif isinstance(name, pl_expr.BooleanFunction):
         if name == pl_expr.BooleanFunction.IsBetween:
-            column, lo, hi = (translate_expr(visitor, n=n) for n in node.input)
+            column, lo, hi = (translator.translate_expr(n=n) for n in node.input)
             (closed,) = options
             lop, rop = expr.BooleanFunction._BETWEEN_OPS[closed]
             return expr.BinOp(
@@ -476,7 +547,7 @@ def _(node: pl_expr.Function, visitor: NodeTraverser, dtype: plc.DataType) -> ex
             dtype,
             name,
             options,
-            *(translate_expr(visitor, n=n) for n in node.input),
+            *(translator.translate_expr(n=n) for n in node.input),
         )
     elif isinstance(name, pl_expr.TemporalFunction):
         # functions for which evaluation of the expression may not return
@@ -496,14 +567,14 @@ def _(node: pl_expr.Function, visitor: NodeTraverser, dtype: plc.DataType) -> ex
             dtype,
             name,
             options,
-            *(translate_expr(visitor, n=n) for n in node.input),
+            *(translator.translate_expr(n=n) for n in node.input),
         )
         if name in needs_cast:
             return expr.Cast(dtype, result_expr)
         return result_expr
 
     elif isinstance(name, str):
-        children = (translate_expr(visitor, n=n) for n in node.input)
+        children = (translator.translate_expr(n=n) for n in node.input)
         if name == "log":
             (base,) = options
             (child,) = children
@@ -522,26 +593,26 @@ def _(node: pl_expr.Function, visitor: NodeTraverser, dtype: plc.DataType) -> ex
 
 
 @_translate_expr.register
-def _(node: pl_expr.Window, visitor: NodeTraverser, dtype: plc.DataType) -> expr.Expr:
+def _(node: pl_expr.Window, translator: Translator, dtype: plc.DataType) -> expr.Expr:
     # TODO: raise in groupby?
     if isinstance(node.options, pl_expr.RollingGroupOptions):
         # pl.col("a").rolling(...)
         return expr.RollingWindow(
-            dtype, node.options, translate_expr(visitor, n=node.function)
+            dtype, node.options, translator.translate_expr(n=node.function)
         )
     elif isinstance(node.options, pl_expr.WindowMapping):
         # pl.col("a").over(...)
         return expr.GroupedRollingWindow(
             dtype,
             node.options,
-            translate_expr(visitor, n=node.function),
-            *(translate_expr(visitor, n=n) for n in node.partition_by),
+            translator.translate_expr(n=node.function),
+            *(translator.translate_expr(n=n) for n in node.partition_by),
         )
     assert_never(node.options)
 
 
 @_translate_expr.register
-def _(node: pl_expr.Literal, visitor: NodeTraverser, dtype: plc.DataType) -> expr.Expr:
+def _(node: pl_expr.Literal, translator: Translator, dtype: plc.DataType) -> expr.Expr:
     if isinstance(node.value, plrs.PySeries):
         return expr.LiteralColumn(dtype, pl.Series._from_pyseries(node.value))
     value = pa.scalar(node.value, type=plc.interop.to_arrow(dtype))
@@ -549,42 +620,42 @@ def _(node: pl_expr.Literal, visitor: NodeTraverser, dtype: plc.DataType) -> exp
 
 
 @_translate_expr.register
-def _(node: pl_expr.Sort, visitor: NodeTraverser, dtype: plc.DataType) -> expr.Expr:
+def _(node: pl_expr.Sort, translator: Translator, dtype: plc.DataType) -> expr.Expr:
     # TODO: raise in groupby
-    return expr.Sort(dtype, node.options, translate_expr(visitor, n=node.expr))
+    return expr.Sort(dtype, node.options, translator.translate_expr(n=node.expr))
 
 
 @_translate_expr.register
-def _(node: pl_expr.SortBy, visitor: NodeTraverser, dtype: plc.DataType) -> expr.Expr:
+def _(node: pl_expr.SortBy, translator: Translator, dtype: plc.DataType) -> expr.Expr:
     return expr.SortBy(
         dtype,
         node.sort_options,
-        translate_expr(visitor, n=node.expr),
-        *(translate_expr(visitor, n=n) for n in node.by),
+        translator.translate_expr(n=node.expr),
+        *(translator.translate_expr(n=n) for n in node.by),
     )
 
 
 @_translate_expr.register
-def _(node: pl_expr.Gather, visitor: NodeTraverser, dtype: plc.DataType) -> expr.Expr:
+def _(node: pl_expr.Gather, translator: Translator, dtype: plc.DataType) -> expr.Expr:
     return expr.Gather(
         dtype,
-        translate_expr(visitor, n=node.expr),
-        translate_expr(visitor, n=node.idx),
+        translator.translate_expr(n=node.expr),
+        translator.translate_expr(n=node.idx),
     )
 
 
 @_translate_expr.register
-def _(node: pl_expr.Filter, visitor: NodeTraverser, dtype: plc.DataType) -> expr.Expr:
+def _(node: pl_expr.Filter, translator: Translator, dtype: plc.DataType) -> expr.Expr:
     return expr.Filter(
         dtype,
-        translate_expr(visitor, n=node.input),
-        translate_expr(visitor, n=node.by),
+        translator.translate_expr(n=node.input),
+        translator.translate_expr(n=node.by),
     )
 
 
 @_translate_expr.register
-def _(node: pl_expr.Cast, visitor: NodeTraverser, dtype: plc.DataType) -> expr.Expr:
-    inner = translate_expr(visitor, n=node.expr)
+def _(node: pl_expr.Cast, translator: Translator, dtype: plc.DataType) -> expr.Expr:
+    inner = translator.translate_expr(n=node.expr)
     # Push casts into literals so we can handle Cast(Literal(Null))
     if isinstance(inner, expr.Literal):
         return expr.Literal(dtype, inner.value.cast(plc.interop.to_arrow(dtype)))
@@ -596,17 +667,17 @@ def _(node: pl_expr.Cast, visitor: NodeTraverser, dtype: plc.DataType) -> expr.E
 
 
 @_translate_expr.register
-def _(node: pl_expr.Column, visitor: NodeTraverser, dtype: plc.DataType) -> expr.Expr:
+def _(node: pl_expr.Column, translator: Translator, dtype: plc.DataType) -> expr.Expr:
     return expr.Col(dtype, node.name)
 
 
 @_translate_expr.register
-def _(node: pl_expr.Agg, visitor: NodeTraverser, dtype: plc.DataType) -> expr.Expr:
+def _(node: pl_expr.Agg, translator: Translator, dtype: plc.DataType) -> expr.Expr:
     value = expr.Agg(
         dtype,
         node.name,
         node.options,
-        *(translate_expr(visitor, n=n) for n in node.arguments),
+        *(translator.translate_expr(n=n) for n in node.arguments),
     )
     if value.name == "count" and value.dtype.id() != plc.TypeId.INT32:
         return expr.Cast(value.dtype, value)
@@ -614,55 +685,30 @@ def _(node: pl_expr.Agg, visitor: NodeTraverser, dtype: plc.DataType) -> expr.Ex
 
 
 @_translate_expr.register
-def _(node: pl_expr.Ternary, visitor: NodeTraverser, dtype: plc.DataType) -> expr.Expr:
+def _(node: pl_expr.Ternary, translator: Translator, dtype: plc.DataType) -> expr.Expr:
     return expr.Ternary(
         dtype,
-        translate_expr(visitor, n=node.predicate),
-        translate_expr(visitor, n=node.truthy),
-        translate_expr(visitor, n=node.falsy),
+        translator.translate_expr(n=node.predicate),
+        translator.translate_expr(n=node.truthy),
+        translator.translate_expr(n=node.falsy),
     )
 
 
 @_translate_expr.register
 def _(
-    node: pl_expr.BinaryExpr, visitor: NodeTraverser, dtype: plc.DataType
+    node: pl_expr.BinaryExpr, translator: Translator, dtype: plc.DataType
 ) -> expr.Expr:
     return expr.BinOp(
         dtype,
         expr.BinOp._MAPPING[node.op],
-        translate_expr(visitor, n=node.left),
-        translate_expr(visitor, n=node.right),
+        translator.translate_expr(n=node.left),
+        translator.translate_expr(n=node.right),
     )
 
 
 @_translate_expr.register
-def _(node: pl_expr.Len, visitor: NodeTraverser, dtype: plc.DataType) -> expr.Expr:
+def _(node: pl_expr.Len, translator: Translator, dtype: plc.DataType) -> expr.Expr:
     value = expr.Len(dtype)
     if dtype.id() != plc.TypeId.INT32:
         return expr.Cast(dtype, value)
     return value  # pragma: no cover; never reached since polars len has uint32 dtype
-
-
-def translate_expr(visitor: NodeTraverser, *, n: int) -> expr.Expr:
-    """
-    Translate a polars-internal expression IR into our representation.
-
-    Parameters
-    ----------
-    visitor
-        Polars NodeTraverser object
-    n
-        Node to translate, an integer referencing a polars internal node.
-
-    Returns
-    -------
-    Translated IR object.
-
-    Raises
-    ------
-    NotImplementedError
-        If any translation fails due to unsupported functionality.
-    """
-    node = visitor.view_expression(n)
-    dtype = dtypes.from_polars(visitor.get_dtype(n))
-    return _translate_expr(node, visitor, dtype)
diff --git a/python/cudf_polars/cudf_polars/testing/asserts.py b/python/cudf_polars/cudf_polars/testing/asserts.py
index 7b45c1eaa06..2207545aa60 100644
--- a/python/cudf_polars/cudf_polars/testing/asserts.py
+++ b/python/cudf_polars/cudf_polars/testing/asserts.py
@@ -10,7 +10,7 @@
 from polars import GPUEngine
 from polars.testing.asserts import assert_frame_equal
 
-from cudf_polars.dsl.translate import translate_ir
+from cudf_polars.dsl.translate import Translator
 
 if TYPE_CHECKING:
     import polars as pl
@@ -117,12 +117,14 @@ def assert_ir_translation_raises(q: pl.LazyFrame, *exceptions: type[Exception])
     AssertionError
        If the specified exceptions were not raised.
     """
-    try:
-        _ = translate_ir(q._ldf.visit())
-    except exceptions:
+    translator = Translator(q._ldf.visit())
+    translator.translate_ir()
+    if errors := translator.errors:
+        for err in errors:
+            assert any(
+                isinstance(err, err_type) for err_type in exceptions
+            ), f"Translation DID NOT RAISE {exceptions}"
         return
-    except Exception as e:
-        raise AssertionError(f"Translation DID NOT RAISE {exceptions}") from e
     else:
         raise AssertionError(f"Translation DID NOT RAISE {exceptions}")
 
diff --git a/python/cudf_polars/cudf_polars/utils/dtypes.py b/python/cudf_polars/cudf_polars/utils/dtypes.py
index a90c283ee54..e7ac72df609 100644
--- a/python/cudf_polars/cudf_polars/utils/dtypes.py
+++ b/python/cudf_polars/cudf_polars/utils/dtypes.py
@@ -71,11 +71,16 @@ def can_cast(from_: plc.DataType, to: plc.DataType) -> bool:
     -------
     True if casting is supported, False otherwise
     """
+    has_empty = from_.id() == plc.TypeId.EMPTY or to.id() == plc.TypeId.EMPTY
     return (
         (
-            plc.traits.is_fixed_width(to)
-            and plc.traits.is_fixed_width(from_)
-            and plc.unary.is_supported_cast(from_, to)
+            from_ == to
+            or not has_empty
+            and (
+                plc.traits.is_fixed_width(to)
+                and plc.traits.is_fixed_width(from_)
+                and plc.unary.is_supported_cast(from_, to)
+            )
         )
         or (from_.id() == plc.TypeId.STRING and is_numeric_not_bool(to))
         or (to.id() == plc.TypeId.STRING and is_numeric_not_bool(from_))
diff --git a/python/cudf_polars/docs/overview.md b/python/cudf_polars/docs/overview.md
index 17a94c633f8..2f2361223d2 100644
--- a/python/cudf_polars/docs/overview.md
+++ b/python/cudf_polars/docs/overview.md
@@ -458,12 +458,12 @@ translate it to our intermediate representation (IR), and then execute
 and convert back to polars:
 
 ```python
-from cudf_polars.dsl.translate import translate_ir
+from cudf_polars.dsl.translate import Translator
 
 q = ...
 
 # Convert to our IR
-ir = translate_ir(q._ldf.visit())
+ir = Translator(q._ldf.visit()).translate_ir()
 
 # DataFrame living on the device
 result = ir.evaluate(cache={})
diff --git a/python/cudf_polars/tests/dsl/test_to_ast.py b/python/cudf_polars/tests/dsl/test_to_ast.py
index 8f10f119199..f6c24da0180 100644
--- a/python/cudf_polars/tests/dsl/test_to_ast.py
+++ b/python/cudf_polars/tests/dsl/test_to_ast.py
@@ -13,7 +13,7 @@
 
 import cudf_polars.dsl.expr as expr_nodes
 import cudf_polars.dsl.ir as ir_nodes
-from cudf_polars import translate_ir
+from cudf_polars import Translator
 from cudf_polars.containers.dataframe import DataFrame, NamedColumn
 from cudf_polars.dsl.to_ast import insert_colrefs, to_ast, to_parquet_filter
 
@@ -60,7 +60,7 @@ def df():
 )
 def test_compute_column(expr, df):
     q = df.select(expr)
-    ir = translate_ir(q._ldf.visit())
+    ir = Translator(q._ldf.visit()).translate_ir()
 
     assert isinstance(ir, ir_nodes.Select)
     table = ir.children[0].evaluate(cache={})
diff --git a/python/cudf_polars/tests/dsl/test_traversal.py b/python/cudf_polars/tests/dsl/test_traversal.py
index 15c644d7978..8958c2a0f84 100644
--- a/python/cudf_polars/tests/dsl/test_traversal.py
+++ b/python/cudf_polars/tests/dsl/test_traversal.py
@@ -10,7 +10,7 @@
 
 import pylibcudf as plc
 
-from cudf_polars import translate_ir
+from cudf_polars import Translator
 from cudf_polars.dsl import expr, ir
 from cudf_polars.dsl.traversal import (
     CachingVisitor,
@@ -109,7 +109,7 @@ def test_rewrite_ir_node():
     df = pl.LazyFrame({"a": [1, 2, 1], "b": [1, 3, 4]})
     q = df.group_by("a").agg(pl.col("b").sum()).sort("b")
 
-    orig = translate_ir(q._ldf.visit())
+    orig = Translator(q._ldf.visit()).translate_ir()
 
     new_df = pl.DataFrame({"a": [1, 1, 2], "b": [-1, -2, -4]})
 
@@ -150,7 +150,7 @@ def replace_scan(node, rec):
 
     mapper = CachingVisitor(replace_scan)
 
-    orig = translate_ir(q._ldf.visit())
+    orig = Translator(q._ldf.visit()).translate_ir()
     new = mapper(orig)
 
     result = new.evaluate(cache={}).to_polars()
@@ -174,7 +174,7 @@ def test_rewrite_names_and_ops():
         .collect()
     )
 
-    qir = translate_ir(q._ldf.visit())
+    qir = Translator(q._ldf.visit()).translate_ir()
 
     @singledispatch
     def _transform(e: expr.Expr, fn: ExprTransformer) -> expr.Expr:
diff --git a/python/cudf_polars/tests/expressions/test_sort.py b/python/cudf_polars/tests/expressions/test_sort.py
index 62df8ce1498..6170281ad54 100644
--- a/python/cudf_polars/tests/expressions/test_sort.py
+++ b/python/cudf_polars/tests/expressions/test_sort.py
@@ -10,7 +10,7 @@
 
 import pylibcudf as plc
 
-from cudf_polars import translate_ir
+from cudf_polars import Translator
 from cudf_polars.testing.asserts import assert_gpu_result_equal
 
 
@@ -68,7 +68,7 @@ def test_setsorted(descending, nulls_last, with_nulls):
 
     assert_gpu_result_equal(q)
 
-    df = translate_ir(q._ldf.visit()).evaluate(cache={})
+    df = Translator(q._ldf.visit()).translate_ir().evaluate(cache={})
 
     a = df.column_map["a"]
 
diff --git a/python/cudf_polars/tests/test_mapfunction.py b/python/cudf_polars/tests/test_mapfunction.py
index e895f27f637..63aa1c573a9 100644
--- a/python/cudf_polars/tests/test_mapfunction.py
+++ b/python/cudf_polars/tests/test_mapfunction.py
@@ -93,16 +93,3 @@ def test_unpivot_defaults():
     )
     q = df.unpivot(index="d")
     assert_gpu_result_equal(q)
-
-
-def test_unpivot_unsupported_cast_raises():
-    df = pl.LazyFrame(
-        {
-            "a": ["x", "y", "z"],
-            "b": pl.Series([1, 3, 5], dtype=pl.Int16),
-        }
-    )
-
-    q = df.unpivot(["a", "b"])
-
-    assert_ir_translation_raises(q, NotImplementedError)

From ccfc95a623e13d59a6e4f640ee7c022bda35f763 Mon Sep 17 00:00:00 2001
From: David Wendt <45795991+davidwendt@users.noreply.github.com>
Date: Tue, 12 Nov 2024 10:03:06 -0500
Subject: [PATCH 14/19] Add new nvtext minhash_permuted API (#16756)

Introduce a new nvtext minhash API that takes a single seed for hashing and two parameter vectors used to calculate the minhash results from the seeded hash values:
```
std::unique_ptr<cudf::column> minhash_permuted(
  cudf::strings_column_view const& input,
  uint32_t seed,
  cudf::device_span<uint32_t const> parameter_a,
  cudf::device_span<uint32_t const> parameter_b,
  cudf::size_type width,
  rmm::cuda_stream_view stream,
  rmm::device_async_resource_ref mr);
```
The `seed` is used to hash the `input` using a rolling set of substrings `width` characters wide.
The hashes are then combined with the values in `parameter_a` and `parameter_b` to calculate a set of 32-bit (or 64-bit) values for each row. For each pair of elements from `a` and `b`, only the minimum value over all the hashes for a row is returned. Each output row is a set of M values where `M = parameter_a.size() = parameter_b.size()`.

This implementation is significantly faster than the current minhash which computes hashes for multiple seeds.

Included in this PR is also the `minhash64_permuted()` API that is identical but uses 64-bit values for the seed and the parameter values. Also included are new tests and a benchmark as well as the pylibcudf and cudf interfaces.

Authors:
  - David Wendt (https://github.com/davidwendt)

Approvers:
  - Matthew Murray (https://github.com/Matt711)
  - Lawrence Mitchell (https://github.com/wence-)
  - Karthikeyan (https://github.com/karthikeyann)
  - Yunsong Wang (https://github.com/PointKernel)

URL: https://github.com/rapidsai/cudf/pull/16756
---
 cpp/benchmarks/CMakeLists.txt                 |   4 +-
 cpp/benchmarks/text/minhash.cpp               |  38 +-
 cpp/include/nvtext/minhash.hpp                |  94 +++++
 cpp/src/text/minhash.cu                       | 390 ++++++++++++++++++
 cpp/tests/CMakeLists.txt                      |   1 +
 cpp/tests/text/minhash_tests.cpp              | 267 ++++++------
 python/cudf/cudf/_lib/nvtext/minhash.pyx      |  28 ++
 python/cudf/cudf/_lib/strings/__init__.py     |   2 +
 python/cudf/cudf/core/column/string.py        | 107 +++++
 .../cudf/cudf/tests/text/test_text_methods.py |  48 +--
 .../pylibcudf/libcudf/nvtext/minhash.pxd      |  16 +
 python/pylibcudf/pylibcudf/nvtext/minhash.pxd |  16 +
 python/pylibcudf/pylibcudf/nvtext/minhash.pyx | 103 +++++
 .../pylibcudf/tests/test_nvtext_minhash.py    |  12 +-
 14 files changed, 949 insertions(+), 177 deletions(-)

diff --git a/cpp/benchmarks/CMakeLists.txt b/cpp/benchmarks/CMakeLists.txt
index ad090be99f3..59f5602fd5a 100644
--- a/cpp/benchmarks/CMakeLists.txt
+++ b/cpp/benchmarks/CMakeLists.txt
@@ -348,8 +348,8 @@ ConfigureNVBench(BINARYOP_NVBENCH binaryop/binaryop.cpp binaryop/compiled_binary
 ConfigureBench(TEXT_BENCH text/subword.cpp)
 
 ConfigureNVBench(
-  TEXT_NVBENCH text/edit_distance.cpp text/hash_ngrams.cpp text/jaccard.cpp text/ngrams.cpp
-  text/normalize.cpp text/replace.cpp text/tokenize.cpp text/vocab.cpp
+  TEXT_NVBENCH text/edit_distance.cpp text/hash_ngrams.cpp text/jaccard.cpp text/minhash.cpp
+  text/ngrams.cpp text/normalize.cpp text/replace.cpp text/tokenize.cpp text/vocab.cpp
 )
 
 # ##################################################################################################
diff --git a/cpp/benchmarks/text/minhash.cpp b/cpp/benchmarks/text/minhash.cpp
index 31ce60d8f9a..a80d0dcbdb8 100644
--- a/cpp/benchmarks/text/minhash.cpp
+++ b/cpp/benchmarks/text/minhash.cpp
@@ -20,8 +20,6 @@
 
 #include <nvtext/minhash.hpp>
 
-#include <rmm/device_buffer.hpp>
-
 #include <nvbench/nvbench.cuh>
 
 static void bench_minhash(nvbench::state& state)
@@ -29,26 +27,25 @@ static void bench_minhash(nvbench::state& state)
   auto const num_rows   = static_cast<cudf::size_type>(state.get_int64("num_rows"));
   auto const row_width  = static_cast<cudf::size_type>(state.get_int64("row_width"));
   auto const hash_width = static_cast<cudf::size_type>(state.get_int64("hash_width"));
-  auto const seed_count = static_cast<cudf::size_type>(state.get_int64("seed_count"));
+  auto const parameters = static_cast<cudf::size_type>(state.get_int64("parameters"));
   auto const base64     = state.get_int64("hash_type") == 64;
 
-  if (static_cast<std::size_t>(num_rows) * static_cast<std::size_t>(row_width) >=
-      static_cast<std::size_t>(std::numeric_limits<cudf::size_type>::max())) {
-    state.skip("Skip benchmarks greater than size_type limit");
-  }
-
   data_profile const strings_profile = data_profile_builder().distribution(
     cudf::type_id::STRING, distribution_id::NORMAL, 0, row_width);
   auto const strings_table =
     create_random_table({cudf::type_id::STRING}, row_count{num_rows}, strings_profile);
   cudf::strings_column_view input(strings_table->view().column(0));
 
-  data_profile const seeds_profile = data_profile_builder().null_probability(0).distribution(
-    cudf::type_to_id<cudf::hash_value_type>(), distribution_id::NORMAL, 0, row_width);
-  auto const seed_type   = base64 ? cudf::type_id::UINT64 : cudf::type_id::UINT32;
-  auto const seeds_table = create_random_table({seed_type}, row_count{seed_count}, seeds_profile);
-  auto seeds             = seeds_table->get_column(0);
-  seeds.set_null_mask(rmm::device_buffer{}, 0);
+  data_profile const param_profile = data_profile_builder().no_validity().distribution(
+    cudf::type_to_id<cudf::hash_value_type>(),
+    distribution_id::NORMAL,
+    0u,
+    std::numeric_limits<cudf::hash_value_type>::max());
+  auto const param_type = base64 ? cudf::type_id::UINT64 : cudf::type_id::UINT32;
+  auto const param_table =
+    create_random_table({param_type, param_type}, row_count{parameters}, param_profile);
+  auto const parameters_a = param_table->view().column(0);
+  auto const parameters_b = param_table->view().column(1);
 
   state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value()));
 
@@ -57,15 +54,16 @@ static void bench_minhash(nvbench::state& state)
   state.add_global_memory_writes<nvbench::int32_t>(num_rows);  // output are hashes
 
   state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) {
-    auto result = base64 ? nvtext::minhash64(input, seeds.view(), hash_width)
-                         : nvtext::minhash(input, seeds.view(), hash_width);
+    auto result = base64
+                    ? nvtext::minhash64_permuted(input, 0, parameters_a, parameters_b, hash_width)
+                    : nvtext::minhash_permuted(input, 0, parameters_a, parameters_b, hash_width);
   });
 }
 
 NVBENCH_BENCH(bench_minhash)
   .set_name("minhash")
-  .add_int64_axis("num_rows", {1024, 8192, 16364, 131072})
-  .add_int64_axis("row_width", {128, 512, 2048})
-  .add_int64_axis("hash_width", {5, 10})
-  .add_int64_axis("seed_count", {2, 26})
+  .add_int64_axis("num_rows", {15000, 30000, 60000})
+  .add_int64_axis("row_width", {6000, 28000, 50000})
+  .add_int64_axis("hash_width", {12, 24})
+  .add_int64_axis("parameters", {26, 260})
   .add_int64_axis("hash_type", {32, 64});
diff --git a/cpp/include/nvtext/minhash.hpp b/cpp/include/nvtext/minhash.hpp
index 42124461cdf..b2c1a23f57e 100644
--- a/cpp/include/nvtext/minhash.hpp
+++ b/cpp/include/nvtext/minhash.hpp
@@ -94,6 +94,53 @@ namespace CUDF_EXPORT nvtext {
   rmm::cuda_stream_view stream      = cudf::get_default_stream(),
   rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
+/**
+ * @brief Returns the minhash values for each string
+ *
+ * This function uses MurmurHash3_x86_32 for the hash algorithm.
+ *
+ * The input strings are first hashed using the given `seed` over substrings
+ * of `width` characters. These hash values are then combined with the `a`
+ * and `b` parameter values using the following formula:
+ * ```
+ *   max_hash = max of uint32
+ *   mp = (1 << 61) - 1
+ *   hv[i] = hash value of a substring at i
+ *   pv[i] = ((hv[i] * a[i] + b[i]) % mp) & max_hash
+ * ```
+ *
+ * This calculation is performed on each substring and the minimum value is computed
+ * as follows:
+ * ```
+ *   mh[j,i] = min(pv[i]) for all substrings in row j
+ *                        and where i=[0,a.size())
+ * ```
+ *
+ * Any null row entries result in corresponding null output rows.
+ *
+ * @throw std::invalid_argument if the width < 2
+ * @throw std::invalid_argument if parameter_a is empty
+ * @throw std::invalid_argument if `parameter_b.size() != parameter_a.size()`
+ * @throw std::overflow_error if `parameter_a.size() * input.size()` exceeds the column size limit
+ *
+ * @param input Strings column to compute minhash
+ * @param seed Seed value used for the hash algorithm
+ * @param parameter_a Values used for the permuted calculation
+ * @param parameter_b Values used for the permuted calculation
+ * @param width The character width of substrings to hash for each row
+ * @param stream CUDA stream used for device memory operations and kernel launches
+ * @param mr Device memory resource used to allocate the returned column's device memory
+ * @return List column of minhash values for each string per parameter pair
+ */
+std::unique_ptr<cudf::column> minhash_permuted(
+  cudf::strings_column_view const& input,
+  uint32_t seed,
+  cudf::device_span<uint32_t const> parameter_a,
+  cudf::device_span<uint32_t const> parameter_b,
+  cudf::size_type width,
+  rmm::cuda_stream_view stream      = cudf::get_default_stream(),
+  rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
+
 /**
  * @brief Returns the minhash value for each string
  *
@@ -159,6 +206,53 @@ namespace CUDF_EXPORT nvtext {
   rmm::cuda_stream_view stream      = cudf::get_default_stream(),
   rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
+/**
+ * @brief Returns the minhash values for each string
+ *
+ * This function uses MurmurHash3_x64_128 for the hash algorithm.
+ *
+ * The input strings are first hashed using the given `seed` over substrings
+ * of `width` characters. These hash values are then combined with the `a`
+ * and `b` parameter values using the following formula:
+ * ```
+ *   max_hash = max of uint64
+ *   mp = (1 << 61) - 1
+ *   hv[i] = hash value of a substring at i
+ *   pv[i] = ((hv[i] * a[i] + b[i]) % mp) & max_hash
+ * ```
+ *
+ * This calculation is performed on each substring and the minimum value is computed
+ * as follows:
+ * ```
+ *   mh[j,i] = min(pv[i]) for all substrings in row j
+ *                        and where i=[0,a.size())
+ * ```
+ *
+ * Any null row entries result in corresponding null output rows.
+ *
+ * @throw std::invalid_argument if the width < 2
+ * @throw std::invalid_argument if parameter_a is empty
+ * @throw std::invalid_argument if `parameter_b.size() != parameter_a.size()`
+ * @throw std::overflow_error if `parameter_a.size() * input.size()` exceeds the column size limit
+ *
+ * @param input Strings column to compute minhash
+ * @param seed Seed value used for the hash algorithm
+ * @param parameter_a Values used for the permuted calculation
+ * @param parameter_b Values used for the permuted calculation
+ * @param width The character width of substrings to hash for each row
+ * @param stream CUDA stream used for device memory operations and kernel launches
+ * @param mr Device memory resource used to allocate the returned column's device memory
+ * @return List column of minhash values for each string per parameter pair
+ */
+std::unique_ptr<cudf::column> minhash64_permuted(
+  cudf::strings_column_view const& input,
+  uint64_t seed,
+  cudf::device_span<uint64_t const> parameter_a,
+  cudf::device_span<uint64_t const> parameter_b,
+  cudf::size_type width,
+  rmm::cuda_stream_view stream      = cudf::get_default_stream(),
+  rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
+
 /**
  * @brief Returns the minhash values for each row of strings per seed
  *
diff --git a/cpp/src/text/minhash.cu b/cpp/src/text/minhash.cu
index a03a34f5fa7..aee83ab35ed 100644
--- a/cpp/src/text/minhash.cu
+++ b/cpp/src/text/minhash.cu
@@ -18,6 +18,7 @@
 #include <cudf/column/column_device_view.cuh>
 #include <cudf/column/column_factories.hpp>
 #include <cudf/detail/copy.hpp>
+#include <cudf/detail/device_scalar.hpp>
 #include <cudf/detail/null_mask.hpp>
 #include <cudf/detail/nvtx/ranges.hpp>
 #include <cudf/detail/sequence.hpp>
@@ -37,9 +38,13 @@
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/exec_policy.hpp>
 
+#include <cooperative_groups.h>
 #include <cuda/atomic>
+#include <thrust/binary_search.h>
 #include <thrust/execution_policy.h>
 #include <thrust/fill.h>
+#include <thrust/sequence.h>
+#include <thrust/sort.h>
 
 #include <limits>
 
@@ -162,6 +167,339 @@ std::unique_ptr<cudf::column> minhash_fn(cudf::strings_column_view const& input,
   return hashes;
 }
 
+constexpr cudf::thread_index_type block_size = 256;
+// threads per string in minhash_seed_kernel; can be tuned independently from block_size
+constexpr cudf::thread_index_type tile_size = block_size;
+
+// Number of a/b parameter values to process per thread.
+// The intermediate values are stored in shared memory, which limits this count.
+// This value was found to be an efficient size for both uint32 and uint64
+// hash types based on benchmarks.
+constexpr cuda::std::size_t params_per_thread = 16;
+
+// Strings with at least this many bytes are processed by a separate, multi-block kernel launch.
+constexpr cudf::size_type wide_string_threshold = 1 << 18;  // 256K
+// The number of blocks per string for the above-threshold kernel processing.
+constexpr cudf::size_type blocks_per_string = 64;
+// The above values were determined using the redpajama and books_sample datasets
+
+/**
+ * @brief Hashing kernel launched with tile_size threads per input string
+ *
+ * This kernel computes the hashes for each string using the seed and the specified
+ * hash function. The width is used to compute rolling substrings to hash over.
+ * The hashes are stored in d_hashes to be used in the minhash_permuted_kernel.
+ *
+ * This kernel also counts the number of strings at or above the wide_string_threshold
+ * and proactively initializes the output values for those strings.
+ *
+ * @tparam HashFunction The hash function to use for this kernel
+ * @tparam hash_value_type Derived from HashFunction result_type
+ *
+ * @param d_strings The input strings to hash
+ * @param seed The seed used for the hash function
+ * @param width Width in characters used for determining substrings to hash
+ * @param d_hashes The resulting hash values are stored here (one entry per input byte)
+ * @param threshold_count Stores the number of strings at or above wide_string_threshold
+ * @param param_count Number of parameters (used for the proactive initialize)
+ * @param d_results Final results vector (used for the proactive initialize)
+ */
+template <typename HashFunction, typename hash_value_type = typename HashFunction::result_type>
+CUDF_KERNEL void minhash_seed_kernel(cudf::column_device_view const d_strings,
+                                     hash_value_type seed,
+                                     cudf::size_type width,
+                                     hash_value_type* d_hashes,
+                                     cudf::size_type* threshold_count,
+                                     cudf::size_type param_count,
+                                     hash_value_type* d_results)
+{
+  auto const tid     = cudf::detail::grid_1d::global_thread_id();
+  auto const str_idx = tid / tile_size;
+  if (str_idx >= d_strings.size()) { return; }
+  if (d_strings.is_null(str_idx)) { return; }
+
+  // retrieve this string's byte offset to locate its output position in d_hashes
+  auto const offsets = d_strings.child(cudf::strings_column_view::offsets_column_index);
+  auto const offsets_itr =
+    cudf::detail::input_offsetalator(offsets.head(), offsets.type(), d_strings.offset());
+  auto const offset     = offsets_itr[str_idx];
+  auto const size_bytes = static_cast<cudf::size_type>(offsets_itr[str_idx + 1] - offset);
+  if (size_bytes == 0) { return; }
+
+  auto const d_str    = cudf::string_view(d_strings.head<char>() + offset, size_bytes);
+  auto const lane_idx = tid % tile_size;
+
+  // hashes for this string/thread are stored here (relative to the first string's offset)
+  auto seed_hashes = d_hashes + offset - offsets_itr[0] + lane_idx;
+
+  auto const begin  = d_str.data() + lane_idx;
+  auto const end    = d_str.data() + d_str.size_bytes();
+  auto const hasher = HashFunction(seed);
+
+  for (auto itr = begin; itr < end; itr += tile_size, seed_hashes += tile_size) {
+    if (cudf::strings::detail::is_utf8_continuation_char(*itr)) {
+      *seed_hashes = 0;  // 0 marks a byte to skip (not the start of a character)
+      continue;
+    }
+    auto const check_str =  // used for counting 'width' characters
+      cudf::string_view(itr, static_cast<cudf::size_type>(thrust::distance(itr, end)));
+    auto const [bytes, left] = cudf::strings::detail::bytes_to_character_position(check_str, width);
+    if ((itr != d_str.data()) && (left > 0)) {
+      // true itr+width is past the end of the string
+      *seed_hashes = 0;
+      continue;
+    }
+
+    auto const hash_str = cudf::string_view(itr, bytes);
+    hash_value_type hv;
+    if constexpr (std::is_same_v<hash_value_type, uint32_t>) {
+      hv = hasher(hash_str);
+    } else {
+      hv = thrust::get<0>(hasher(hash_str));
+    }
+    // a computed hash of 0 is stored as 1 since 0 is reserved as the skip value
+    *seed_hashes = cuda::std::max(hv, hash_value_type{1});
+  }
+
+  // logic appended here so an extra kernel is not required
+  if (size_bytes >= wide_string_threshold) {
+    if (lane_idx == 0) {
+      // count the number of wide strings
+      cuda::atomic_ref<cudf::size_type, cuda::thread_scope_device> ref{*threshold_count};
+      ref.fetch_add(1, cuda::std::memory_order_relaxed);
+    }
+    // initialize the output -- only needed for wide strings (results are merged with fetch_min)
+    auto d_output = d_results + (str_idx * param_count);
+    for (auto i = lane_idx; i < param_count; i += tile_size) {
+      d_output[i] = std::numeric_limits<hash_value_type>::max();
+    }
+  }
+}
+
+/**
+ * @brief Permutation calculation kernel
+ *
+ * This kernel uses the hashes from the minhash_seed_kernel and the parameter_a and
+ * parameter_b values to compute the final output results.
+ * The output is the number of input rows (N) by the number of parameter values (M).
+ * Each output[i] is the calculated result for parameter_a/b[0:M].
+ *
+ * This kernel is launched with either 1 block per string for strings
+ * below the wide_string_threshold or blocks_per_string blocks per string
+ * for strings at or above the wide_string_threshold.
+ *
+ * @tparam hash_value_type Derived from HashFunction result_type
+ * @tparam blocks_per_string Number of blocks used to process each string
+ *
+ * @param d_strings The input strings to hash
+ * @param indices The indices of the strings in d_strings to process
+ * @param parameter_a 1st set of parameters for the calculation result
+ * @param parameter_b 2nd set of parameters for the calculation result
+ * @param width Used for calculating the number of available hashes in each string
+ * @param d_hashes The hash values computed in minhash_seed_kernel
+ * @param d_results Final results vector of calculated values
+ */
+template <typename hash_value_type, int blocks_per_string>
+CUDF_KERNEL void minhash_permuted_kernel(cudf::column_device_view const d_strings,
+                                         cudf::device_span<cudf::size_type const> indices,
+                                         cudf::device_span<hash_value_type const> parameter_a,
+                                         cudf::device_span<hash_value_type const> parameter_b,
+                                         cudf::size_type width,
+                                         hash_value_type const* d_hashes,
+                                         hash_value_type* d_results)
+{
+  auto const tid = cudf::detail::grid_1d::global_thread_id();
+  auto const idx = (tid / blocks_per_string) / block_size;
+  if (idx >= indices.size()) { return; }
+  auto const str_idx = indices[idx];
+  if (d_strings.is_null(str_idx)) { return; }
+
+  auto const block      = cooperative_groups::this_thread_block();
+  int const section_idx = block.group_index().x % blocks_per_string;
+
+  auto const offsets = d_strings.child(cudf::strings_column_view::offsets_column_index);
+  auto const offsets_itr =
+    cudf::detail::input_offsetalator(offsets.head(), offsets.type(), d_strings.offset());
+  auto const offset     = offsets_itr[str_idx];
+  auto const size_bytes = static_cast<cudf::size_type>(offsets_itr[str_idx + 1] - offset);
+
+  // number of items to process in this block;
+  // last block also includes any remainder values from the size_bytes/blocks_per_string truncation
+  // example:
+  //  each section_size for string with size 588090 and blocks_per_string=64 is 9188
+  //  except the last section which is 9188 + (588090 % 64) = 9246
+  auto const section_size =
+    (size_bytes / blocks_per_string) +
+    (section_idx < (blocks_per_string - 1) ? 0 : size_bytes % blocks_per_string);
+  auto const section_offset = section_idx * (size_bytes / blocks_per_string);
+
+  // hash values for this block/section
+  auto const seed_hashes = d_hashes + offset - offsets_itr[0] + section_offset;
+  // width used here as a max value since a string's char-count <= byte-count
+  auto const hashes_size =
+    section_idx < (blocks_per_string - 1)
+      ? section_size
+      : cuda::std::max(static_cast<cudf::size_type>(size_bytes > 0), section_size - width + 1);
+
+  auto const init     = size_bytes == 0 ? 0 : std::numeric_limits<hash_value_type>::max();
+  auto const lane_idx = block.thread_rank();
+  auto const d_output = d_results + (str_idx * parameter_a.size());
+
+  auto const begin = seed_hashes + lane_idx;
+  auto const end   = seed_hashes + hashes_size;
+
+  // constants used in the permutation calculations
+  constexpr uint64_t mersenne_prime  = (1UL << 61) - 1;
+  constexpr hash_value_type hash_max = std::numeric_limits<hash_value_type>::max();
+
+  // found to be an efficient shared memory size for both hash types
+  __shared__ hash_value_type block_values[block_size * params_per_thread];
+
+  for (std::size_t i = 0; i < parameter_a.size(); i += params_per_thread) {
+    // initialize this block's chunk of shared memory
+    // each thread handles params_per_thread of values
+    auto const chunk_values = block_values + (lane_idx * params_per_thread);
+    thrust::uninitialized_fill(thrust::seq, chunk_values, chunk_values + params_per_thread, init);
+    block.sync();
+
+    auto const param_count =
+      cuda::std::min(static_cast<cuda::std::size_t>(params_per_thread), parameter_a.size() - i);
+
+    // each lane accumulates min hashes in its shared memory
+    for (auto itr = begin; itr < end; itr += block_size) {
+      auto const hv = *itr;
+      // 0 is used as a skip sentinel for UTF-8 and trailing bytes
+      if (hv == 0) { continue; }
+
+      for (std::size_t param_idx = i; param_idx < (i + param_count); ++param_idx) {
+        // permutation formula used by datatrove
+        hash_value_type const v =
+          ((hv * parameter_a[param_idx] + parameter_b[param_idx]) % mersenne_prime) & hash_max;
+        auto const block_idx    = ((param_idx % params_per_thread) * block_size) + lane_idx;
+        block_values[block_idx] = cuda::std::min(v, block_values[block_idx]);
+      }
+    }
+    block.sync();
+
+    // reduce each parameter values vector to a single min value;
+    // assumes that the block_size > params_per_thread;
+    // each thread reduces a block_size of parameter values (thread per parameter)
+    if (lane_idx < param_count) {
+      auto const values = block_values + (lane_idx * block_size);
+      // cooperative groups does not have a min function and cub::BlockReduce was slower
+      auto const minv =
+        thrust::reduce(thrust::seq, values, values + block_size, init, thrust::minimum{});
+      if constexpr (blocks_per_string > 1) {
+        // several blocks merge their mins into the same d_output entry: needs device scope
+        cuda::atomic_ref<hash_value_type, cuda::thread_scope_device> ref{d_output[lane_idx + i]};
+        ref.fetch_min(minv, cuda::std::memory_order_relaxed);
+      } else {
+        d_output[lane_idx + i] = minv;
+      }
+    }
+    block.sync();
+  }
+}
+
+template <typename HashFunction, typename hash_value_type = typename HashFunction::result_type>
+std::unique_ptr<cudf::column> minhash_fn(cudf::strings_column_view const& input,
+                                         hash_value_type seed,
+                                         cudf::device_span<hash_value_type const> parameter_a,
+                                         cudf::device_span<hash_value_type const> parameter_b,
+                                         cudf::size_type width,
+                                         rmm::cuda_stream_view stream,
+                                         rmm::device_async_resource_ref mr)
+{
+  CUDF_EXPECTS(width >= 2,
+               "Parameter width should be an integer value of 2 or greater",
+               std::invalid_argument);
+  CUDF_EXPECTS(!parameter_a.empty(), "Parameters A and B cannot be empty", std::invalid_argument);
+  CUDF_EXPECTS(parameter_a.size() == parameter_b.size(),
+               "Parameters A and B should have the same number of elements",
+               std::invalid_argument);
+  CUDF_EXPECTS(
+    (static_cast<std::size_t>(input.size()) * parameter_a.size()) <
+      static_cast<std::size_t>(std::numeric_limits<cudf::size_type>::max()),
+    "The number of parameters times the number of input rows exceeds the column size limit",
+    std::overflow_error);
+
+  auto const output_type = cudf::data_type{cudf::type_to_id<hash_value_type>()};
+  if (input.is_empty()) { return cudf::make_empty_column(output_type); }
+
+  auto const d_strings = cudf::column_device_view::create(input.parent(), stream);
+
+  auto results =
+    cudf::make_numeric_column(output_type,
+                              input.size() * static_cast<cudf::size_type>(parameter_a.size()),
+                              cudf::mask_state::UNALLOCATED,
+                              stream,
+                              mr);
+  auto d_results = results->mutable_view().data<hash_value_type>();
+
+  cudf::detail::grid_1d grid{static_cast<cudf::thread_index_type>(input.size()) * tile_size,
+                             block_size};  // minhash_seed_kernel uses tile_size threads per string
+  auto const hashes_size = input.chars_size(stream);
+  auto d_hashes          = rmm::device_uvector<hash_value_type>(hashes_size, stream);
+  auto d_threshold_count = cudf::detail::device_scalar<cudf::size_type>(0, stream);
+
+  minhash_seed_kernel<HashFunction>
+    <<<grid.num_blocks, grid.num_threads_per_block, 0, stream.value()>>>(*d_strings,
+                                                                         seed,
+                                                                         width,
+                                                                         d_hashes.data(),
+                                                                         d_threshold_count.data(),
+                                                                         parameter_a.size(),
+                                                                         d_results);
+  auto const threshold_count = d_threshold_count.value(stream);  // number of wide strings
+
+  auto indices = rmm::device_uvector<cudf::size_type>(input.size(), stream);
+  thrust::sequence(rmm::exec_policy(stream), indices.begin(), indices.end());
+  cudf::size_type threshold_index = threshold_count < input.size() ? input.size() : 0;
+
+  // if there is a mix of wide and non-wide strings then sort the indices by string size
+  // so that the entries from threshold_index onward refer to the wide strings
+  if ((threshold_count > 0) && (threshold_count < input.size())) {
+    auto sizes = rmm::device_uvector<cudf::size_type>(input.size(), stream);
+    thrust::transform(rmm::exec_policy_nosync(stream),
+                      thrust::counting_iterator<cudf::size_type>(0),
+                      thrust::counting_iterator<cudf::size_type>(input.size()),
+                      sizes.data(),
+                      cuda::proclaim_return_type<cudf::size_type>(
+                        [d_strings = *d_strings] __device__(auto idx) -> cudf::size_type {
+                          if (d_strings.is_null(idx)) { return 0; }
+                          return d_strings.element<cudf::string_view>(idx).size_bytes();
+                        }));
+    thrust::sort_by_key(
+      rmm::exec_policy_nosync(stream), sizes.begin(), sizes.end(), indices.begin());
+    auto const lb = thrust::lower_bound(
+      rmm::exec_policy_nosync(stream), sizes.begin(), sizes.end(), wide_string_threshold);
+    threshold_index = static_cast<cudf::size_type>(thrust::distance(sizes.begin(), lb));
+  }
+
+  // handle the strings below the threshold size using one block per string
+  if (threshold_index > 0) {
+    auto d_indices = cudf::device_span<cudf::size_type const>(indices.data(), threshold_index);
+    cudf::detail::grid_1d grid{static_cast<cudf::thread_index_type>(d_indices.size()) * block_size,
+                               block_size};
+    minhash_permuted_kernel<hash_value_type, 1>
+      <<<grid.num_blocks, grid.num_threads_per_block, 0, stream.value()>>>(
+        *d_strings, d_indices, parameter_a, parameter_b, width, d_hashes.data(), d_results);
+  }
+
+  // handle the strings at or above the threshold size using blocks_per_string blocks per string
+  if (threshold_index < input.size()) {
+    auto const count = static_cast<cudf::thread_index_type>(input.size() - threshold_index);
+    auto d_indices =
+      cudf::device_span<cudf::size_type const>(indices.data() + threshold_index, count);
+    cudf::detail::grid_1d grid{count * block_size * blocks_per_string, block_size};
+    minhash_permuted_kernel<hash_value_type, blocks_per_string>
+      <<<grid.num_blocks, grid.num_threads_per_block, 0, stream.value()>>>(
+        *d_strings, d_indices, parameter_a, parameter_b, width, d_hashes.data(), d_results);
+  }
+
+  return results;
+}
+
 /**
  * @brief Compute the minhash of each list row of strings for each seed
  *
@@ -309,6 +647,20 @@ std::unique_ptr<cudf::column> minhash(cudf::strings_column_view const& input,
   return build_list_result(input.parent(), std::move(hashes), seeds.size(), stream, mr);
 }
 
+std::unique_ptr<cudf::column> minhash(cudf::strings_column_view const& input,
+                                      uint32_t seed,
+                                      cudf::device_span<uint32_t const> parameter_a,
+                                      cudf::device_span<uint32_t const> parameter_b,
+                                      cudf::size_type width,
+                                      rmm::cuda_stream_view stream,
+                                      rmm::device_async_resource_ref mr)
+{
+  using hasher_t = cudf::hashing::detail::MurmurHash3_x86_32<cudf::string_view>;
+  auto flat_values =  // parameter_a.size() values per input row, regrouped into lists below
+    detail::minhash_fn<hasher_t>(input, seed, parameter_a, parameter_b, width, stream, mr);
+  return build_list_result(input.parent(), std::move(flat_values), parameter_a.size(), stream, mr);
+}
+
 std::unique_ptr<cudf::column> minhash64(cudf::strings_column_view const& input,
                                         cudf::numeric_scalar<uint64_t> const& seed,
                                         cudf::size_type width,
@@ -333,6 +685,20 @@ std::unique_ptr<cudf::column> minhash64(cudf::strings_column_view const& input,
   return build_list_result(input.parent(), std::move(hashes), seeds.size(), stream, mr);
 }
 
+std::unique_ptr<cudf::column> minhash64(cudf::strings_column_view const& input,
+                                        uint64_t seed,
+                                        cudf::device_span<uint64_t const> parameter_a,
+                                        cudf::device_span<uint64_t const> parameter_b,
+                                        cudf::size_type width,
+                                        rmm::cuda_stream_view stream,
+                                        rmm::device_async_resource_ref mr)
+{
+  using hasher_t = cudf::hashing::detail::MurmurHash3_x64_128<cudf::string_view>;
+  auto flat_values =  // parameter_a.size() values per input row, regrouped into lists below
+    detail::minhash_fn<hasher_t>(input, seed, parameter_a, parameter_b, width, stream, mr);
+  return build_list_result(input.parent(), std::move(flat_values), parameter_a.size(), stream, mr);
+}
+
 std::unique_ptr<cudf::column> word_minhash(cudf::lists_column_view const& input,
                                            cudf::device_span<uint32_t const> seeds,
                                            rmm::cuda_stream_view stream,
@@ -374,6 +740,18 @@ std::unique_ptr<cudf::column> minhash(cudf::strings_column_view const& input,
   return detail::minhash(input, seeds, width, stream, mr);
 }
 
+std::unique_ptr<cudf::column> minhash_permuted(cudf::strings_column_view const& input,
+                                               uint32_t seed,
+                                               cudf::device_span<uint32_t const> parameter_a,
+                                               cudf::device_span<uint32_t const> parameter_b,
+                                               cudf::size_type width,
+                                               rmm::cuda_stream_view stream,
+                                               rmm::device_async_resource_ref mr)
+{
+  CUDF_FUNC_RANGE();  // profiling range marker (presumably NVTX, from cudf/detail/nvtx/ranges.hpp)
+  return detail::minhash(input, seed, parameter_a, parameter_b, width, stream, mr);  // uint32 path
+}
+
 std::unique_ptr<cudf::column> minhash64(cudf::strings_column_view const& input,
                                         cudf::numeric_scalar<uint64_t> seed,
                                         cudf::size_type width,
@@ -394,6 +772,18 @@ std::unique_ptr<cudf::column> minhash64(cudf::strings_column_view const& input,
   return detail::minhash64(input, seeds, width, stream, mr);
 }
 
+std::unique_ptr<cudf::column> minhash64_permuted(cudf::strings_column_view const& input,
+                                                 uint64_t seed,
+                                                 cudf::device_span<uint64_t const> parameter_a,
+                                                 cudf::device_span<uint64_t const> parameter_b,
+                                                 cudf::size_type width,
+                                                 rmm::cuda_stream_view stream,
+                                                 rmm::device_async_resource_ref mr)
+{
+  CUDF_FUNC_RANGE();  // profiling range marker (presumably NVTX, from cudf/detail/nvtx/ranges.hpp)
+  return detail::minhash64(input, seed, parameter_a, parameter_b, width, stream, mr);  // uint64
+}
+
 std::unique_ptr<cudf::column> word_minhash(cudf::lists_column_view const& input,
                                            cudf::device_span<uint32_t const> seeds,
                                            rmm::cuda_stream_view stream,
diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt
index 3a9b930830b..cbca0ceef77 100644
--- a/cpp/tests/CMakeLists.txt
+++ b/cpp/tests/CMakeLists.txt
@@ -610,6 +610,7 @@ ConfigureTest(
   text/bpe_tests.cpp
   text/edit_distance_tests.cpp
   text/jaccard_tests.cpp
+  text/minhash_tests.cpp
   text/ngrams_tests.cpp
   text/ngrams_tokenize_tests.cpp
   text/normalize_tests.cpp
diff --git a/cpp/tests/text/minhash_tests.cpp b/cpp/tests/text/minhash_tests.cpp
index ef35a4472cf..042ac44621e 100644
--- a/cpp/tests/text/minhash_tests.cpp
+++ b/cpp/tests/text/minhash_tests.cpp
@@ -28,155 +28,169 @@
 
 struct MinHashTest : public cudf::test::BaseFixture {};
 
-TEST_F(MinHashTest, Basic)
+TEST_F(MinHashTest, Permuted)
 {
-  auto validity = cudf::test::iterators::null_at(1);
   auto input =
     cudf::test::strings_column_wrapper({"doc 1",
-                                        "",
                                         "this is doc 2",
-                                        "",
                                         "doc 3",
                                         "d",
-                                        "The quick brown fox jumpéd over the lazy brown dog."},
-                                       validity);
+                                        "The quick brown fox jumpéd over the lazy brown dog.",
+                                        "line six",
+                                        "line seven",
+                                        "line eight",
+                                        "line nine",
+                                        "line ten"});
 
   auto view = cudf::strings_column_view(input);
 
-  auto results = nvtext::minhash(view);
+  auto first  = thrust::counting_iterator<uint32_t>(10);  // parameter values are 10, 11, 12
+  auto params = cudf::test::fixed_width_column_wrapper<uint32_t>(first, first + 3);
+  auto results =
+    nvtext::minhash_permuted(view, 0, cudf::column_view(params), cudf::column_view(params), 4);
 
-  auto expected = cudf::test::fixed_width_column_wrapper<uint32_t>(
-    {1207251914u, 0u, 21141582u, 0u, 1207251914u, 655955059u, 86520422u}, validity);
+  using LCW32 = cudf::test::lists_column_wrapper<uint32_t>;
+  // clang-format off
+  LCW32 expected({  // one row per input string, one value per parameter
+    LCW32{1392101586u,  394869177u,  811528444u},
+    LCW32{ 211415830u,  187088503u,  130291444u},
+    LCW32{2098117052u,  394869177u,  799753544u},
+    LCW32{2264583304u, 2920538364u, 3576493424u},
+    LCW32{ 253327882u,   41747273u,  302030804u},
+    LCW32{2109809594u, 1017470651u,  326988172u},
+    LCW32{1303819864u,  850676747u,  147107852u},
+    LCW32{ 736021564u,  720812292u, 1405158760u},
+    LCW32{ 902780242u,  134064807u, 1613944636u},
+    LCW32{ 547084870u, 1748895564u,  656501844u}
+  });
+  // clang-format on
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected);
 
-  auto results64  = nvtext::minhash64(view);
-  auto expected64 = cudf::test::fixed_width_column_wrapper<uint64_t>({774489391575805754ul,
-                                                                      0ul,
-                                                                      3232308021562742685ul,
-                                                                      0ul,
-                                                                      13145552576991307582ul,
-                                                                      14660046701545912182ul,
-                                                                      398062025280761388ul},
-                                                                     validity);
-  CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results64, expected64);
-}
+  auto params64  = cudf::test::fixed_width_column_wrapper<uint64_t, uint32_t>(first, first + 3);
+  auto results64 = nvtext::minhash64_permuted(
+    view, 0, cudf::column_view(params64), cudf::column_view(params64), 4);
 
-TEST_F(MinHashTest, LengthEqualsWidth)
-{
-  auto input   = cudf::test::strings_column_wrapper({"abcdé", "fghjk", "lmnop", "qrstu", "vwxyz"});
-  auto view    = cudf::strings_column_view(input);
-  auto results = nvtext::minhash(view, 0, 5);
-  auto expected = cudf::test::fixed_width_column_wrapper<uint32_t>(
-    {3825281041u, 2728681928u, 1984332911u, 3965004915u, 192452857u});
-  CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected);
+  using LCW64 = cudf::test::lists_column_wrapper<uint64_t>;
+  // clang-format off
+  LCW64 expected64({  // same layout as the 32-bit expectations above
+    LCW64{ 827364888116975697ul, 1601854279692781452ul,  70500662054893256ul},
+    LCW64{  18312093741021833ul,  133793446674258329ul,  21974512489226198ul},
+    LCW64{  22474244732520567ul, 1638811775655358395ul, 949306297364502264ul},
+    LCW64{1332357434996402861ul, 2157346081260151330ul, 676491718310205848ul},
+    LCW64{  65816830624808020ul,   43323600380520789ul,  63511816333816345ul},
+    LCW64{ 629657184954525200ul,   49741036507643002ul,  97466271004074331ul},
+    LCW64{ 301611977846331113ul,  101188874709594830ul,  97466271004074331ul},
+    LCW64{ 121498891461700668ul,  171065800427907402ul,  97466271004074331ul},
+    LCW64{  54617739511834072ul,  231454301607238929ul,  97466271004074331ul},
+    LCW64{ 576418665851990314ul,  231454301607238929ul,  97466271004074331ul}
+  });
+  // clang-format on
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results64, expected64);
 }
 
-TEST_F(MinHashTest, MultiSeed)
+TEST_F(MinHashTest, PermutedWide)
 {
-  auto input =
-    cudf::test::strings_column_wrapper({"doc 1",
-                                        "this is doc 2",
-                                        "doc 3",
-                                        "d",
-                                        "The quick brown fox jumpéd over the lazy brown dog."});
-
-  auto view = cudf::strings_column_view(input);
+  std::string const small(2 << 10, 'x');  // 2KiB: below wide_string_threshold (256KiB)
+  std::string const wide(2 << 19, 'y');   // 1MiB: above wide_string_threshold (256KiB)
+  auto input = cudf::test::strings_column_wrapper({small, wide});
+  auto view  = cudf::strings_column_view(input);
 
-  auto seeds   = cudf::test::fixed_width_column_wrapper<uint32_t>({0, 1, 2});
-  auto results = nvtext::minhash(view, cudf::column_view(seeds));
+  auto first  = thrust::counting_iterator<uint32_t>(20);  // parameter values are 20, 21, 22
+  auto params = cudf::test::fixed_width_column_wrapper<uint32_t>(first, first + 3);
+  auto results =
+    nvtext::minhash_permuted(view, 0, cudf::column_view(params), cudf::column_view(params), 4);
 
-  using LCW = cudf::test::lists_column_wrapper<uint32_t>;
+  using LCW32 = cudf::test::lists_column_wrapper<uint32_t>;
   // clang-format off
-  LCW expected({LCW{1207251914u, 1677652962u, 1061355987u},
-                LCW{  21141582u,  580916568u, 1258052021u},
-                LCW{1207251914u,  943567174u, 1109272887u},
-                LCW{ 655955059u,  488346356u, 2394664816u},
-                LCW{  86520422u,  236622901u,  102546228u}});
+  LCW32 expected({  // row 0 is the small string, row 1 is the wide string
+    LCW32{1731998032u,  315359380u, 3193688024u},
+    LCW32{1293098788u, 2860992281u,  133918478u}
+  });
   // clang-format on
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected);
 
-  auto seeds64   = cudf::test::fixed_width_column_wrapper<uint64_t>({0, 1, 2});
-  auto results64 = nvtext::minhash64(view, cudf::column_view(seeds64));
+  auto params64  = cudf::test::fixed_width_column_wrapper<uint64_t, uint32_t>(first, first + 3);
+  auto results64 = nvtext::minhash64_permuted(
+    view, 0, cudf::column_view(params64), cudf::column_view(params64), 4);
 
   using LCW64 = cudf::test::lists_column_wrapper<uint64_t>;
   // clang-format off
-  LCW64 expected64({LCW64{  774489391575805754ul, 10435654231793485448ul, 1188598072697676120ul},
-                    LCW64{ 3232308021562742685ul,  4445611509348165860ul, 1188598072697676120ul},
-                    LCW64{13145552576991307582ul,  6846192680998069919ul, 1188598072697676120ul},
-                    LCW64{14660046701545912182ul, 17106501326045553694ul, 17713478494106035784ul},
-                    LCW64{  398062025280761388ul,   377720198157450084ul,  984941365662009329ul}});
+   LCW64 expected64({
+     LCW64{1818322427062143853ul, 641024893347719371ul, 1769570368846988848ul},
+     LCW64{1389920339306667795ul, 421787002125838902ul, 1759496674158703968ul}
+   });
   // clang-format on
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results64, expected64);
 }
 
-TEST_F(MinHashTest, MultiSeedWithNullInputRow)
+TEST_F(MinHashTest, PermutedManyParameters)
 {
-  auto validity = cudf::test::iterators::null_at(1);
-  auto input    = cudf::test::strings_column_wrapper({"abcdéfgh", "", "", "stuvwxyz"}, validity);
-  auto view     = cudf::strings_column_view(input);
+  std::string const small(2 << 10, 'x');
+  std::string const wide(2 << 19, 'y');
+  auto input = cudf::test::strings_column_wrapper({small, wide});
+  auto view  = cudf::strings_column_view(input);
 
-  auto seeds   = cudf::test::fixed_width_column_wrapper<uint32_t>({1, 2});
-  auto results = nvtext::minhash(view, cudf::column_view(seeds));
+  auto first = thrust::counting_iterator<uint32_t>(20);
+  // more than params_per_thread
+  auto params = cudf::test::fixed_width_column_wrapper<uint32_t>(first, first + 31);
+  auto results =
+    nvtext::minhash_permuted(view, 0, cudf::column_view(params), cudf::column_view(params), 4);
 
-  using LCW = cudf::test::lists_column_wrapper<uint32_t>;
-  LCW expected({LCW{484984072u, 1074168784u}, LCW{}, LCW{0u, 0u}, LCW{571652169u, 173528385u}},
-               validity);
+  using LCW32 = cudf::test::lists_column_wrapper<uint32_t>;
+  // clang-format off
+  LCW32 expected({
+    LCW32{1731998032u,  315359380u, 3193688024u, 1777049372u,  360410720u, 3238739364u, 1822100712u,  405462060u,
+          3283790704u, 1867152052u,  450513400u, 3328842044u, 1912203392u,  495564740u, 3373893384u, 1957254732u,
+           540616080u, 3418944724u, 2002306072u,  585667420u, 3463996064u, 2047357412u,  630718760u, 3509047404u,
+          2092408752u,  675770100u, 3554098744u, 2137460092u,  720821440u, 3599150084u, 2182511432u},
+    LCW32{1293098788u, 2860992281u,  133918478u, 1701811971u, 3269705464u,  542631661u, 2110525154u, 3678418647u,
+           951344844u, 2519238337u, 4087131830u, 1360058027u, 2927951520u,  200877717u, 1768771210u, 3336664703u,
+           609590900u, 2177484393u, 3745377886u, 1018304083u, 2586197576u, 4154091069u, 1427017266u, 2994910759u,
+           267836956u, 1835730449u, 3403623942u,  676550139u, 2244443632u, 3812337125u, 1085263322u}
+  });
+  // clang-format on
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected);
 
-  auto seeds64   = cudf::test::fixed_width_column_wrapper<uint64_t>({11, 22});
-  auto results64 = nvtext::minhash64(view, cudf::column_view(seeds64));
+  // more than params_per_thread
+  auto params64  = cudf::test::fixed_width_column_wrapper<uint64_t, uint32_t>(first, first + 31);
+  auto results64 = nvtext::minhash64_permuted(
+    view, 0, cudf::column_view(params64), cudf::column_view(params64), 4);
 
   using LCW64 = cudf::test::lists_column_wrapper<uint64_t>;
-  LCW64 expected64({LCW64{2597399324547032480ul, 4461410998582111052ul},
-                    LCW64{},
-                    LCW64{0ul, 0ul},
-                    LCW64{2717781266371273264ul, 6977325820868387259ul}},
-                   validity);
-  CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results64, expected64);
-}
-
-TEST_F(MinHashTest, WordsMinHash)
-{
-  using LCWS    = cudf::test::lists_column_wrapper<cudf::string_view>;
-  auto validity = cudf::test::iterators::null_at(1);
-
-  LCWS input(
-    {LCWS({"hello", "abcdéfgh"}),
-     LCWS{},
-     LCWS({"rapids", "moré", "test", "text"}),
-     LCWS({"The", "quick", "brown", "fox", "jumpéd", "over", "the", "lazy", "brown", "dog"})},
-    validity);
-
-  auto view = cudf::lists_column_view(input);
-
-  auto seeds   = cudf::test::fixed_width_column_wrapper<uint32_t>({1, 2});
-  auto results = nvtext::word_minhash(view, cudf::column_view(seeds));
-  using LCW32  = cudf::test::lists_column_wrapper<uint32_t>;
-  LCW32 expected({LCW32{2069617641u, 1975382903u},
-                  LCW32{},
-                  LCW32{657297235u, 1010955999u},
-                  LCW32{644643885u, 310002789u}},
-                 validity);
-  CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected);
-
-  auto seeds64   = cudf::test::fixed_width_column_wrapper<uint64_t>({11, 22});
-  auto results64 = nvtext::word_minhash64(view, cudf::column_view(seeds64));
-  using LCW64    = cudf::test::lists_column_wrapper<uint64_t>;
-  LCW64 expected64({LCW64{1940333969930105370ul, 272615362982418219ul},
-                    LCW64{},
-                    LCW64{5331949571924938590ul, 2088583894581919741ul},
-                    LCW64{3400468157617183341ul, 2398577492366130055ul}},
-                   validity);
+  // clang-format off
+   LCW64 expected64({
+     LCW64{1818322427062143853,  641024893347719371, 1769570368846988848, 592272835132564366,
+           1720818310631833835,  543520776917409353, 1672066252416678822, 494768718702254348,
+           1623314194201523817,  446016660487099335, 1574562135986368804, 397264602271944322,
+           1525810077771213799,  348512544056789317, 1477058019556058786, 299760485841634304,
+           1428305961340903773,  251008427626479291, 1379553903125748768, 202256369411324286,
+           1330801844910593755,  153504311196169273, 1282049786695438742, 104752252981014268,
+           1233297728480283737,   56000194765859255, 1184545670265128724,   7248136550704242,
+           1135793612049973719, 2264339087549243188, 1087041553834818706},
+     LCW64{1389920339306667795,  421787002125838902, 1759496674158703968,  791363336977875075,
+           2129073009010740141, 1160939671829911248,  192806334649082363, 1530516006681947421,
+            562382669501118536, 1900092341533983602,  931959004353154709, 2269668676386019775,
+           1301535339205190882,  333402002024361997, 1671111674057227055,  702978336876398170,
+           2040688008909263228, 1072554671728434343,  104421334547605450, 1442131006580470516,
+            473997669399641631, 1811707341432506689,  843574004251677804, 2181283676284542862,
+           1213150339103713977,  245017001922885084, 1582726673955750150,  614593336774921257,
+           1952303008807786323,  984169671626957438,   16036334446128545}
+   });
+  // clang-format on
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results64, expected64);
 }
 
 TEST_F(MinHashTest, EmptyTest)
 {
-  auto input   = cudf::make_empty_column(cudf::data_type{cudf::type_id::STRING});
-  auto view    = cudf::strings_column_view(input->view());
-  auto results = nvtext::minhash(view);
+  auto input  = cudf::make_empty_column(cudf::data_type{cudf::type_id::STRING});
+  auto view   = cudf::strings_column_view(input->view());
+  auto params = cudf::test::fixed_width_column_wrapper<uint32_t>({1, 2, 3});
+  auto results =
+    nvtext::minhash_permuted(view, 0, cudf::column_view(params), cudf::column_view(params), 4);
   EXPECT_EQ(results->size(), 0);
-  results = nvtext::minhash64(view);
+  auto params64 = cudf::test::fixed_width_column_wrapper<uint64_t>({1, 2, 3});
+  results       = nvtext::minhash64_permuted(
+    view, 0, cudf::column_view(params64), cudf::column_view(params64), 4);
   EXPECT_EQ(results->size(), 0);
 }
 
@@ -184,20 +198,39 @@ TEST_F(MinHashTest, ErrorsTest)
 {
   auto input = cudf::test::strings_column_wrapper({"this string intentionally left blank"});
   auto view  = cudf::strings_column_view(input);
-  EXPECT_THROW(nvtext::minhash(view, 0, 0), std::invalid_argument);
-  EXPECT_THROW(nvtext::minhash64(view, 0, 0), std::invalid_argument);
-  auto seeds = cudf::test::fixed_width_column_wrapper<uint32_t>();
-  EXPECT_THROW(nvtext::minhash(view, cudf::column_view(seeds)), std::invalid_argument);
-  auto seeds64 = cudf::test::fixed_width_column_wrapper<uint64_t>();
-  EXPECT_THROW(nvtext::minhash64(view, cudf::column_view(seeds64)), std::invalid_argument);
+  auto empty = cudf::test::fixed_width_column_wrapper<uint32_t>();
+  EXPECT_THROW(
+    nvtext::minhash_permuted(view, 0, cudf::column_view(empty), cudf::column_view(empty), 0),
+    std::invalid_argument);
+  auto empty64 = cudf::test::fixed_width_column_wrapper<uint64_t>();
+  EXPECT_THROW(
+    nvtext::minhash64_permuted(view, 0, cudf::column_view(empty64), cudf::column_view(empty64), 0),
+    std::invalid_argument);
+  EXPECT_THROW(
+    nvtext::minhash_permuted(view, 0, cudf::column_view(empty), cudf::column_view(empty), 4),
+    std::invalid_argument);
+  EXPECT_THROW(
+    nvtext::minhash64_permuted(view, 0, cudf::column_view(empty64), cudf::column_view(empty64), 4),
+    std::invalid_argument);
 
   std::vector<std::string> h_input(50000, "");
   input = cudf::test::strings_column_wrapper(h_input.begin(), h_input.end());
   view  = cudf::strings_column_view(input);
 
   auto const zeroes = thrust::constant_iterator<uint32_t>(0);
-  seeds             = cudf::test::fixed_width_column_wrapper<uint32_t>(zeroes, zeroes + 50000);
-  EXPECT_THROW(nvtext::minhash(view, cudf::column_view(seeds)), std::overflow_error);
-  seeds64 = cudf::test::fixed_width_column_wrapper<uint64_t>(zeroes, zeroes + 50000);
-  EXPECT_THROW(nvtext::minhash64(view, cudf::column_view(seeds64)), std::overflow_error);
+  auto params       = cudf::test::fixed_width_column_wrapper<uint32_t>(zeroes, zeroes + 50000);
+  EXPECT_THROW(
+    nvtext::minhash_permuted(view, 0, cudf::column_view(params), cudf::column_view(params), 4),
+    std::overflow_error);
+  auto params64 = cudf::test::fixed_width_column_wrapper<uint64_t>(zeroes, zeroes + 50000);
+  EXPECT_THROW(nvtext::minhash64_permuted(
+                 view, 0, cudf::column_view(params64), cudf::column_view(params64), 4),
+               std::overflow_error);
+
+  EXPECT_THROW(
+    nvtext::minhash_permuted(view, 0, cudf::column_view(params), cudf::column_view(empty), 4),
+    std::invalid_argument);
+  EXPECT_THROW(
+    nvtext::minhash64_permuted(view, 0, cudf::column_view(params64), cudf::column_view(empty64), 4),
+    std::invalid_argument);
 }
diff --git a/python/cudf/cudf/_lib/nvtext/minhash.pyx b/python/cudf/cudf/_lib/nvtext/minhash.pyx
index 5e39cafa47b..25cfcf99ca6 100644
--- a/python/cudf/cudf/_lib/nvtext/minhash.pyx
+++ b/python/cudf/cudf/_lib/nvtext/minhash.pyx
@@ -1,5 +1,7 @@
 # Copyright (c) 2023-2024, NVIDIA CORPORATION.
 
+from libc.stdint cimport uint32_t, uint64_t
+
 from cudf.core.buffer import acquire_spill_lock
 
 from cudf._lib.column cimport Column
@@ -17,6 +19,19 @@ def minhash(Column input, Column seeds, int width=4):
     return Column.from_pylibcudf(result)
 
 
+@acquire_spill_lock()
+def minhash_permuted(Column input, uint32_t seed, Column a, Column b, int width):
+    return Column.from_pylibcudf(
+        nvtext.minhash.minhash_permuted(
+            input.to_pylibcudf(mode="read"),
+            seed,
+            a.to_pylibcudf(mode="read"),
+            b.to_pylibcudf(mode="read"),
+            width,
+        )
+    )
+
+
 @acquire_spill_lock()
 def minhash64(Column input, Column seeds, int width=4):
     result = nvtext.minhash.minhash64(
@@ -27,6 +42,19 @@ def minhash64(Column input, Column seeds, int width=4):
     return Column.from_pylibcudf(result)
 
 
+@acquire_spill_lock()
+def minhash64_permuted(Column input, uint64_t seed, Column a, Column b, int width):
+    return Column.from_pylibcudf(
+        nvtext.minhash.minhash64_permuted(
+            input.to_pylibcudf(mode="read"),
+            seed,
+            a.to_pylibcudf(mode="read"),
+            b.to_pylibcudf(mode="read"),
+            width,
+        )
+    )
+
+
 @acquire_spill_lock()
 def word_minhash(Column input, Column seeds):
     result = nvtext.minhash.word_minhash(
diff --git a/python/cudf/cudf/_lib/strings/__init__.py b/python/cudf/cudf/_lib/strings/__init__.py
index ffa5e603408..4c0ec2d9ac5 100644
--- a/python/cudf/cudf/_lib/strings/__init__.py
+++ b/python/cudf/cudf/_lib/strings/__init__.py
@@ -9,6 +9,8 @@
 from cudf._lib.nvtext.minhash import (
     minhash,
     minhash64,
+    minhash64_permuted,
+    minhash_permuted,
     word_minhash,
     word_minhash64,
 )
diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py
index 856ce0f75de..3d70b01b7e4 100644
--- a/python/cudf/cudf/core/column/string.py
+++ b/python/cudf/cudf/core/column/string.py
@@ -5350,11 +5350,65 @@ def minhash(
             libstrings.minhash(self._column, seeds_column, width)
         )
 
+    def minhash_permuted(
+        self, seed: np.uint32, a: ColumnLike, b: ColumnLike, width: int
+    ) -> SeriesOrIndex:
+        """
+        Compute the minhash of a strings column.
+
+        This uses the MurmurHash3_x86_32 algorithm for the hash function.
+
+        Calculation uses the formula (hv * a + b) % mersenne_prime
+        where hv is the hash of a substring of width characters,
+        a and b are provided values and mersenne_prime is 2^61-1.
+
+        Parameters
+        ----------
+        seed : uint32
+            The seed used for the hash algorithm.
+        a : ColumnLike
+            Values for minhash calculation.
+            Must be of type uint32.
+        b : ColumnLike
+            Values for minhash calculation.
+            Must be of type uint32.
+        width : int
+            The width of the substring to hash.
+
+        Examples
+        --------
+        >>> import cudf
+        >>> import numpy as np
+        >>> s = cudf.Series(['this is my', 'favorite book'])
+        >>> a = cudf.Series([1, 2, 3], dtype=np.uint32)
+        >>> b = cudf.Series([4, 5, 6], dtype=np.uint32)
+        >>> s.str.minhash_permuted(0, a=a, b=b, width=5)
+        0    [1305480171, 462824409, 74608232]
+        1       [32665388, 65330773, 97996158]
+        dtype: list
+        """
+        a_column = column.as_column(a)
+        if a_column.dtype != np.uint32:
+            raise ValueError(
+                f"Expecting a Series with dtype uint32, got {type(a)}"
+            )
+        b_column = column.as_column(b)
+        if b_column.dtype != np.uint32:
+            raise ValueError(
+                f"Expecting a Series with dtype uint32, got {type(b)}"
+            )
+        return self._return_or_inplace(
+            libstrings.minhash_permuted(
+                self._column, seed, a_column, b_column, width
+            )
+        )
+
     def minhash64(
         self, seeds: ColumnLike | None = None, width: int = 4
     ) -> SeriesOrIndex:
         """
         Compute the minhash of a strings column.
+
         This uses the MurmurHash3_x64_128 algorithm for the hash function.
         This function generates 2 uint64 values but only the first
         uint64 value is used.
@@ -5390,6 +5444,59 @@ def minhash64(
             libstrings.minhash64(self._column, seeds_column, width)
         )
 
+    def minhash64_permuted(
+        self, seed: np.uint64, a: ColumnLike, b: ColumnLike, width: int
+    ) -> SeriesOrIndex:
+        """
+        Compute the minhash of a strings column.
+        This uses the MurmurHash3_x64_128 algorithm for the hash function.
+
+        Calculation uses the formula (hv * a + b) % mersenne_prime
+        where hv is the hash of a substring of width characters,
+        a and b are provided values and mersenne_prime is 2^61-1.
+
+        Parameters
+        ----------
+        seed : uint64
+            The seed used for the hash algorithm.
+        a : ColumnLike
+            Values for minhash calculation.
+            Must be of type uint64.
+        b : ColumnLike
+            Values for minhash calculation.
+            Must be of type uint64.
+        width : int
+            The width of the substring to hash.
+
+        Examples
+        --------
+        >>> import cudf
+        >>> import numpy as np
+        >>> s = cudf.Series(['this is my', 'favorite book', 'to read'])
+        >>> a = cudf.Series([2, 3], dtype=np.uint64)
+        >>> b = cudf.Series([5, 6], dtype=np.uint64)
+        >>> s.str.minhash64_permuted(0, a=a, b=b, width=5)
+        0    [172452388517576012, 316595762085180527]
+        1      [71427536958126239, 58787297728258215]
+        2    [423885828176437114, 1140588505926961370]
+        dtype: list
+        """
+        a_column = column.as_column(a)
+        if a_column.dtype != np.uint64:
+            raise ValueError(
+                f"Expecting a Series with dtype uint64, got {type(a)}"
+            )
+        b_column = column.as_column(b)
+        if b_column.dtype != np.uint64:
+            raise ValueError(
+                f"Expecting a Series with dtype uint64, got {type(b)}"
+            )
+        return self._return_or_inplace(
+            libstrings.minhash64_permuted(
+                self._column, seed, a_column, b_column, width
+            )
+        )
+
     def word_minhash(self, seeds: ColumnLike | None = None) -> SeriesOrIndex:
         """
         Compute the minhash of a list column of strings.
diff --git a/python/cudf/cudf/tests/text/test_text_methods.py b/python/cudf/cudf/tests/text/test_text_methods.py
index 997ca357986..47e541fdcef 100644
--- a/python/cudf/cudf/tests/text/test_text_methods.py
+++ b/python/cudf/cudf/tests/text/test_text_methods.py
@@ -882,68 +882,48 @@ def test_is_vowel_consonant():
     assert_eq(expected, actual)
 
 
-def test_minhash():
+def test_minhash_permuted():
     strings = cudf.Series(["this is my", "favorite book", None, ""])
 
+    params = cudf.Series([1, 2, 3], dtype=np.uint32)
     expected = cudf.Series(
         [
-            cudf.Series([21141582], dtype=np.uint32),
-            cudf.Series([962346254], dtype=np.uint32),
-            None,
-            cudf.Series([0], dtype=np.uint32),
-        ]
-    )
-    actual = strings.str.minhash()
-    assert_eq(expected, actual)
-    seeds = cudf.Series([0, 1, 2], dtype=np.uint32)
-    expected = cudf.Series(
-        [
-            cudf.Series([1305480167, 668155704, 34311509], dtype=np.uint32),
-            cudf.Series([32665384, 3470118, 363147162], dtype=np.uint32),
+            cudf.Series([1305480168, 462824406, 74608229], dtype=np.uint32),
+            cudf.Series([32665385, 65330770, 97996155], dtype=np.uint32),
             None,
             cudf.Series([0, 0, 0], dtype=np.uint32),
         ]
     )
-    actual = strings.str.minhash(seeds=seeds, width=5)
+    actual = strings.str.minhash_permuted(0, a=params, b=params, width=5)
     assert_eq(expected, actual)
 
-    expected = cudf.Series(
-        [
-            cudf.Series([3232308021562742685], dtype=np.uint64),
-            cudf.Series([23008204270530356], dtype=np.uint64),
-            None,
-            cudf.Series([0], dtype=np.uint64),
-        ]
-    )
-    actual = strings.str.minhash64()
-    assert_eq(expected, actual)
-    seeds = cudf.Series([0, 1, 2], dtype=np.uint64)
+    params = cudf.Series([1, 2, 3], dtype=np.uint64)
     expected = cudf.Series(
         [
             cudf.Series(
-                [7082801294247314046, 185949556058924788, 167570629329462454],
+                [105531920695060180, 172452388517576009, 316595762085180524],
                 dtype=np.uint64,
             ),
             cudf.Series(
-                [382665377781028452, 86243762733551437, 7688750597953083512],
+                [35713768479063122, 71427536958126236, 58787297728258212],
                 dtype=np.uint64,
             ),
             None,
             cudf.Series([0, 0, 0], dtype=np.uint64),
         ]
     )
-    actual = strings.str.minhash64(seeds=seeds, width=5)
+    actual = strings.str.minhash64_permuted(0, a=params, b=params, width=5)
     assert_eq(expected, actual)
 
     # test wrong seed types
     with pytest.raises(ValueError):
-        strings.str.minhash(seeds="a")
+        strings.str.minhash_permuted(1, a="a", b="b", width=7)
     with pytest.raises(ValueError):
-        seeds = cudf.Series([0, 1, 2], dtype=np.int32)
-        strings.str.minhash(seeds=seeds)
+        params = cudf.Series([0, 1, 2], dtype=np.int32)
+        strings.str.minhash_permuted(1, a=params, b=params, width=6)
     with pytest.raises(ValueError):
-        seeds = cudf.Series([0, 1, 2], dtype=np.uint32)
-        strings.str.minhash64(seeds=seeds)
+        params = cudf.Series([0, 1, 2], dtype=np.uint32)
+        strings.str.minhash64_permuted(1, a=params, b=params, width=8)
 
 
 def test_word_minhash():
diff --git a/python/pylibcudf/pylibcudf/libcudf/nvtext/minhash.pxd b/python/pylibcudf/pylibcudf/libcudf/nvtext/minhash.pxd
index 41250037dcf..ebf8eda1ce3 100644
--- a/python/pylibcudf/pylibcudf/libcudf/nvtext/minhash.pxd
+++ b/python/pylibcudf/pylibcudf/libcudf/nvtext/minhash.pxd
@@ -22,6 +22,14 @@ cdef extern from "nvtext/minhash.hpp" namespace "nvtext" nogil:
         const size_type width,
     ) except +
 
+    cdef unique_ptr[column] minhash_permuted(
+        const column_view &strings,
+        const uint32_t seed,
+        const column_view &a,
+        const column_view &b,
+        const size_type width,
+    ) except +
+
     cdef unique_ptr[column] minhash64(
         const column_view &strings,
         const column_view &seeds,
@@ -34,6 +42,14 @@ cdef extern from "nvtext/minhash.hpp" namespace "nvtext" nogil:
         const size_type width,
     ) except +
 
+    cdef unique_ptr[column] minhash64_permuted(
+        const column_view &strings,
+        const uint64_t seed,
+        const column_view &a,
+        const column_view &b,
+        const size_type width,
+    ) except +
+
     cdef unique_ptr[column] word_minhash(
         const column_view &input,
         const column_view &seeds
diff --git a/python/pylibcudf/pylibcudf/nvtext/minhash.pxd b/python/pylibcudf/pylibcudf/nvtext/minhash.pxd
index 97e8c9dc83c..6b544282f44 100644
--- a/python/pylibcudf/pylibcudf/nvtext/minhash.pxd
+++ b/python/pylibcudf/pylibcudf/nvtext/minhash.pxd
@@ -11,8 +11,24 @@ ctypedef fused ColumnOrScalar:
 
 cpdef Column minhash(Column input, ColumnOrScalar seeds, size_type width=*)
 
+cpdef Column minhash_permuted(
+    Column input,
+    uint32_t seed,
+    Column a,
+    Column b,
+    size_type width
+)
+
 cpdef Column minhash64(Column input, ColumnOrScalar seeds, size_type width=*)
 
+cpdef Column minhash64_permuted(
+    Column input,
+    uint64_t seed,
+    Column a,
+    Column b,
+    size_type width
+)
+
 cpdef Column word_minhash(Column input, Column seeds)
 
 cpdef Column word_minhash64(Column input, Column seeds)
diff --git a/python/pylibcudf/pylibcudf/nvtext/minhash.pyx b/python/pylibcudf/pylibcudf/nvtext/minhash.pyx
index f1e012e60e5..5a51e32b287 100644
--- a/python/pylibcudf/pylibcudf/nvtext/minhash.pyx
+++ b/python/pylibcudf/pylibcudf/nvtext/minhash.pyx
@@ -8,6 +8,8 @@ from pylibcudf.libcudf.column.column cimport column
 from pylibcudf.libcudf.nvtext.minhash cimport (
     minhash as cpp_minhash,
     minhash64 as cpp_minhash64,
+    minhash64_permuted as cpp_minhash64_permuted,
+    minhash_permuted as cpp_minhash_permuted,
     word_minhash as cpp_word_minhash,
     word_minhash64 as cpp_word_minhash64,
 )
@@ -16,6 +18,7 @@ from pylibcudf.libcudf.types cimport size_type
 from pylibcudf.scalar cimport Scalar
 
 from cython.operator import dereference
+import warnings
 
 
 cpdef Column minhash(Column input, ColumnOrScalar seeds, size_type width=4):
@@ -40,6 +43,12 @@ cpdef Column minhash(Column input, ColumnOrScalar seeds, size_type width=4):
     Column
         List column of minhash values for each string per seed
     """
+    warnings.warn(
+        "Starting in version 25.02, the signature of this function will "
+        "be changed to match pylibcudf.nvtext.minhash_permuted.",
+        FutureWarning
+    )
+
     cdef unique_ptr[column] c_result
 
     if not isinstance(seeds, (Column, Scalar)):
@@ -55,6 +64,50 @@ cpdef Column minhash(Column input, ColumnOrScalar seeds, size_type width=4):
 
     return Column.from_libcudf(move(c_result))
 
+cpdef Column minhash_permuted(
+    Column input,
+    uint32_t seed,
+    Column a,
+    Column b,
+    size_type width
+):
+    """
+    Returns the minhash values for each string.
+    This function uses MurmurHash3_x86_32 for the hash algorithm.
+
+    For details, see :cpp:func:`minhash_permuted`.
+
+    Parameters
+    ----------
+    input : Column
+        Strings column to compute minhash
+    seed : uint32_t
+        Seed used for the hash function
+    a : Column
+        1st parameter value used for the minhash algorithm.
+    b : Column
+        2nd parameter value used for the minhash algorithm.
+    width : size_type
+        Character width of the substrings to hash
+
+    Returns
+    -------
+    Column
+        List column of minhash values for each string, one per (a, b) pair
+    """
+    cdef unique_ptr[column] c_result
+
+    with nogil:
+        c_result = cpp_minhash_permuted(
+            input.view(),
+            seed,
+            a.view(),
+            b.view(),
+            width
+        )
+
+    return Column.from_libcudf(move(c_result))
+
 cpdef Column minhash64(Column input, ColumnOrScalar seeds, size_type width=4):
     """
     Returns the minhash values for each string per seed.
@@ -77,6 +130,12 @@ cpdef Column minhash64(Column input, ColumnOrScalar seeds, size_type width=4):
     Column
         List column of minhash values for each string per seed
     """
+    warnings.warn(
+        "Starting in version 25.02, the signature of this function will "
+        "be changed to match pylibcudf.nvtext.minhash64_permuted.",
+        FutureWarning
+    )
+
     cdef unique_ptr[column] c_result
 
     if not isinstance(seeds, (Column, Scalar)):
@@ -92,6 +151,50 @@ cpdef Column minhash64(Column input, ColumnOrScalar seeds, size_type width=4):
 
     return Column.from_libcudf(move(c_result))
 
+cpdef Column minhash64_permuted(
+    Column input,
+    uint64_t seed,
+    Column a,
+    Column b,
+    size_type width
+):
+    """
+    Returns the minhash values for each string.
+    This function uses MurmurHash3_x64_128 for the hash algorithm.
+
+    For details, see :cpp:func:`minhash64_permuted`.
+
+    Parameters
+    ----------
+    input : Column
+        Strings column to compute minhash
+    seed : uint64_t
+        Seed used for the hash function
+    a : Column
+        1st parameter value used for the minhash algorithm.
+    b : Column
+        2nd parameter value used for the minhash algorithm.
+    width : size_type
+        Character width of the substrings to hash
+
+    Returns
+    -------
+    Column
+        List column of minhash values for each string, one per (a, b) pair
+    """
+    cdef unique_ptr[column] c_result
+
+    with nogil:
+        c_result = cpp_minhash64_permuted(
+            input.view(),
+            seed,
+            a.view(),
+            b.view(),
+            width
+        )
+
+    return Column.from_libcudf(move(c_result))
+
 cpdef Column word_minhash(Column input, Column seeds):
     """
     Returns the minhash values for each row of strings per seed.
diff --git a/python/pylibcudf/pylibcudf/tests/test_nvtext_minhash.py b/python/pylibcudf/pylibcudf/tests/test_nvtext_minhash.py
index ead9ee094af..ec533e64307 100644
--- a/python/pylibcudf/pylibcudf/tests/test_nvtext_minhash.py
+++ b/python/pylibcudf/pylibcudf/tests/test_nvtext_minhash.py
@@ -21,15 +21,19 @@ def word_minhash_input_data(request):
 
 
 @pytest.mark.parametrize("width", [5, 12])
-def test_minhash(minhash_input_data, width):
+def test_minhash_permuted(minhash_input_data, width):
     input_arr, seeds, seed_type = minhash_input_data
     minhash_func = (
-        plc.nvtext.minhash.minhash
+        plc.nvtext.minhash.minhash_permuted
         if seed_type == pa.uint32()
-        else plc.nvtext.minhash.minhash64
+        else plc.nvtext.minhash.minhash64_permuted
     )
     result = minhash_func(
-        plc.interop.from_arrow(input_arr), plc.interop.from_arrow(seeds), width
+        plc.interop.from_arrow(input_arr),
+        0,
+        plc.interop.from_arrow(seeds),
+        plc.interop.from_arrow(seeds),
+        width,
     )
     pa_result = plc.interop.to_arrow(result)
     assert all(len(got) == len(seeds) for got, s in zip(pa_result, input_arr))

From 7682edbfd418cf30c0f5494dbed36a5dbb102c06 Mon Sep 17 00:00:00 2001
From: Lawrence Mitchell <lmitchell@nvidia.com>
Date: Tue, 12 Nov 2024 15:57:36 +0000
Subject: [PATCH 15/19] Add type stubs for pylibcudf (#17258)

Having looked at a bunch of the automation options, I just did it by hand.

A followup will add some automation to add docstrings (so we can see those via LSP integration in editors) and do some simple validation.

- Closes #15190

Authors:
  - Lawrence Mitchell (https://github.com/wence-)
  - Vyas Ramasubramani (https://github.com/vyasr)

Approvers:
  - Vyas Ramasubramani (https://github.com/vyasr)
  - Matthew Murray (https://github.com/Matt711)

URL: https://github.com/rapidsai/cudf/pull/17258
---
 docs/cudf/source/conf.py                      |  73 ++++++-
 docs/cudf/source/developer_guide/pylibcudf.md |  73 ++++++-
 python/cudf/cudf/_lib/labeling.pyx            |   4 +-
 python/cudf/cudf/_lib/lists.pyx               |  24 +--
 .../cudf_polars/containers/dataframe.py       |   2 +-
 .../cudf_polars/dsl/expressions/datetime.py   |   4 +-
 .../cudf_polars/dsl/expressions/literal.py    |   2 +-
 python/cudf_polars/cudf_polars/dsl/ir.py      |   2 +-
 python/pylibcudf/pylibcudf/aggregation.pyi    | 110 +++++++++++
 python/pylibcudf/pylibcudf/aggregation.pyx    |  34 ++++
 python/pylibcudf/pylibcudf/binaryop.pyi       |  54 +++++
 python/pylibcudf/pylibcudf/binaryop.pyx       |   1 +
 python/pylibcudf/pylibcudf/column.pyi         |  48 +++++
 python/pylibcudf/pylibcudf/column.pyx         |   5 +
 .../pylibcudf/pylibcudf/column_factories.pyi  |  20 ++
 .../pylibcudf/pylibcudf/column_factories.pyx  |   9 +
 python/pylibcudf/pylibcudf/concatenate.pyi    |   8 +
 python/pylibcudf/pylibcudf/concatenate.pyx    |   1 +
 .../pylibcudf/pylibcudf/contiguous_split.pyi  |  14 ++
 .../pylibcudf/pylibcudf/contiguous_split.pyx  |  11 ++
 python/pylibcudf/pylibcudf/copying.pyi        |  54 +++++
 python/pylibcudf/pylibcudf/copying.pyx        |  17 ++
 python/pylibcudf/pylibcudf/datetime.pyi       |  45 +++++
 python/pylibcudf/pylibcudf/datetime.pyx       |  18 ++
 python/pylibcudf/pylibcudf/experimental.pyi   |   5 +
 python/pylibcudf/pylibcudf/experimental.pyx   |   2 +
 python/pylibcudf/pylibcudf/expressions.pyi    |  79 ++++++++
 python/pylibcudf/pylibcudf/expressions.pyx    |  12 +-
 python/pylibcudf/pylibcudf/filling.pyi        |  17 ++
 python/pylibcudf/pylibcudf/filling.pyx        |   8 +
 python/pylibcudf/pylibcudf/gpumemoryview.pyi  |   9 +
 python/pylibcudf/pylibcudf/gpumemoryview.pyx  |   3 +
 python/pylibcudf/pylibcudf/groupby.pyi        |  38 ++++
 python/pylibcudf/pylibcudf/groupby.pyx        |   6 +
 python/pylibcudf/pylibcudf/hashing.pyi        |  18 ++
 python/pylibcudf/pylibcudf/hashing.pyx        |  13 ++
 python/pylibcudf/pylibcudf/interop.pyi        |  52 +++++
 python/pylibcudf/pylibcudf/interop.pyx        |   8 +
 python/pylibcudf/pylibcudf/io/__init__.py     |  16 ++
 python/pylibcudf/pylibcudf/io/avro.pyi        |  11 ++
 python/pylibcudf/pylibcudf/io/avro.pyx        |   2 +
 python/pylibcudf/pylibcudf/io/csv.pyi         |  54 +++++
 python/pylibcudf/pylibcudf/io/csv.pyx         |   2 +
 python/pylibcudf/pylibcudf/io/datasource.pyi  |   4 +
 python/pylibcudf/pylibcudf/io/datasource.pyx  |   2 +
 python/pylibcudf/pylibcudf/io/json.pyi        |  50 +++++
 python/pylibcudf/pylibcudf/io/json.pyx        |   1 +
 python/pylibcudf/pylibcudf/io/orc.pyi         |  41 ++++
 python/pylibcudf/pylibcudf/io/orc.pyx         |  10 +
 python/pylibcudf/pylibcudf/io/parquet.pyi     |  36 ++++
 python/pylibcudf/pylibcudf/io/parquet.pyx     |   4 +
 .../pylibcudf/io/parquet_metadata.pyx         |   9 +-
 python/pylibcudf/pylibcudf/io/text.pyx        |   9 +
 python/pylibcudf/pylibcudf/io/timezone.pyi    |   7 +
 python/pylibcudf/pylibcudf/io/timezone.pyx    |   1 +
 python/pylibcudf/pylibcudf/io/types.pyi       |  97 +++++++++
 python/pylibcudf/pylibcudf/io/types.pyx       |  18 ++
 python/pylibcudf/pylibcudf/join.pyi           |  78 ++++++++
 python/pylibcudf/pylibcudf/join.pyx           |  18 ++
 python/pylibcudf/pylibcudf/json.pyi           |  23 +++
 python/pylibcudf/pylibcudf/json.pyx           |   3 +
 python/pylibcudf/pylibcudf/labeling.pxd       |   4 +-
 python/pylibcudf/pylibcudf/labeling.pyi       |  17 ++
 python/pylibcudf/pylibcudf/labeling.pyx       |  24 +--
 .../pylibcudf/libcudf/CMakeLists.txt          |   1 +
 .../pylibcudf/libcudf/lists/CMakeLists.txt    |  23 +++
 .../pylibcudf/libcudf/lists/combine.pxd       |   8 +-
 .../pylibcudf/libcudf/lists/combine.pyx       |   0
 .../pylibcudf/libcudf/lists/contains.pyx      |   0
 python/pylibcudf/pylibcudf/lists.pxd          |  30 ++-
 python/pylibcudf/pylibcudf/lists.pyi          |  70 +++++++
 python/pylibcudf/pylibcudf/lists.pyx          | 185 ++++++++----------
 python/pylibcudf/pylibcudf/merge.pyi          |  11 ++
 python/pylibcudf/pylibcudf/merge.pyx          |   1 +
 python/pylibcudf/pylibcudf/null_mask.pyi      |  14 ++
 python/pylibcudf/pylibcudf/null_mask.pyx      |   7 +
 .../pylibcudf/nvtext/byte_pair_encode.pyi     |  11 ++
 .../pylibcudf/nvtext/byte_pair_encode.pyx     |   3 +
 .../pylibcudf/nvtext/edit_distance.pyi        |   6 +
 .../pylibcudf/nvtext/edit_distance.pyx        |   1 +
 .../pylibcudf/nvtext/generate_ngrams.pyi      |  10 +
 .../pylibcudf/nvtext/generate_ngrams.pyx      |   5 +
 python/pylibcudf/pylibcudf/nvtext/jaccard.pyi |   5 +
 python/pylibcudf/pylibcudf/nvtext/jaccard.pyx |   1 +
 python/pylibcudf/pylibcudf/nvtext/minhash.pyi |  13 ++
 python/pylibcudf/pylibcudf/nvtext/minhash.pyx |   6 +
 .../pylibcudf/nvtext/ngrams_tokenize.pyi      |   8 +
 .../pylibcudf/nvtext/ngrams_tokenize.pyx      |   1 +
 .../pylibcudf/pylibcudf/nvtext/normalize.pyi  |   6 +
 .../pylibcudf/pylibcudf/nvtext/normalize.pyx  |   1 +
 python/pylibcudf/pylibcudf/nvtext/replace.pyi |  17 ++
 python/pylibcudf/pylibcudf/nvtext/replace.pyx |   1 +
 python/pylibcudf/pylibcudf/nvtext/stemmer.pyi |   8 +
 python/pylibcudf/pylibcudf/nvtext/stemmer.pyx |   1 +
 .../pylibcudf/nvtext/subword_tokenize.pyi     |  15 ++
 .../pylibcudf/nvtext/subword_tokenize.pyx     |   3 +
 .../pylibcudf/pylibcudf/nvtext/tokenize.pyi   |  26 +++
 .../pylibcudf/pylibcudf/nvtext/tokenize.pyx   |  12 ++
 python/pylibcudf/pylibcudf/partitioning.pyi   |  14 ++
 python/pylibcudf/pylibcudf/partitioning.pyx   |   5 +
 python/pylibcudf/pylibcudf/py.typed           |   0
 python/pylibcudf/pylibcudf/quantiles.pyi      |  23 +++
 python/pylibcudf/pylibcudf/quantiles.pyx      |   1 +
 python/pylibcudf/pylibcudf/reduce.pyi         |  16 ++
 python/pylibcudf/pylibcudf/reduce.pyx         |   1 +
 python/pylibcudf/pylibcudf/replace.pyi        |  29 +++
 python/pylibcudf/pylibcudf/replace.pyx        |   8 +
 python/pylibcudf/pylibcudf/reshape.pyi        |   7 +
 python/pylibcudf/pylibcudf/reshape.pyx        |   1 +
 python/pylibcudf/pylibcudf/rolling.pyi        |  12 ++
 python/pylibcudf/pylibcudf/rolling.pyx        |   1 +
 python/pylibcudf/pylibcudf/round.pyi          |  15 ++
 python/pylibcudf/pylibcudf/round.pyx          |   1 +
 python/pylibcudf/pylibcudf/scalar.pyi         |  10 +
 python/pylibcudf/pylibcudf/scalar.pyx         |   4 +
 python/pylibcudf/pylibcudf/search.pyi         |  19 ++
 python/pylibcudf/pylibcudf/search.pyx         |   1 +
 python/pylibcudf/pylibcudf/sorting.pyi        |  64 ++++++
 python/pylibcudf/pylibcudf/sorting.pyx        |  12 ++
 .../pylibcudf/pylibcudf/stream_compaction.pxd |   2 +
 .../pylibcudf/pylibcudf/stream_compaction.pyi |  53 +++++
 .../pylibcudf/pylibcudf/stream_compaction.pyx |  12 ++
 .../pylibcudf/pylibcudf/strings/__init__.py   |   4 +-
 .../pylibcudf/strings/attributes.pyi          |   7 +
 .../pylibcudf/strings/attributes.pyx          |   1 +
 .../pylibcudf/strings/capitalize.pyi          |  12 ++
 .../pylibcudf/strings/capitalize.pyx          |   1 +
 python/pylibcudf/pylibcudf/strings/case.pyi   |   7 +
 python/pylibcudf/pylibcudf/strings/case.pyx   |   1 +
 .../pylibcudf/strings/char_types.pyi          |  30 +++
 .../pylibcudf/strings/char_types.pyx          |   5 +
 .../pylibcudf/pylibcudf/strings/combine.pyi   |  34 ++++
 .../pylibcudf/pylibcudf/strings/combine.pyx   |   7 +
 .../pylibcudf/pylibcudf/strings/contains.pyi  |  14 ++
 .../pylibcudf/pylibcudf/strings/contains.pyx  |   1 +
 .../pylibcudf/strings/convert/__init__.py     |  12 ++
 .../strings/convert/convert_booleans.pyi      |   9 +
 .../strings/convert/convert_booleans.pyx      |   1 +
 .../strings/convert/convert_datetime.pyi      |  12 ++
 .../strings/convert/convert_datetime.pyx      |   1 +
 .../strings/convert/convert_durations.pyi     |   9 +
 .../strings/convert/convert_durations.pyx     |   1 +
 .../strings/convert/convert_fixed_point.pyi   |  10 +
 .../strings/convert/convert_fixed_point.pyx   |   2 +
 .../strings/convert/convert_floats.pyi        |   8 +
 .../strings/convert/convert_floats.pyx        |   1 +
 .../strings/convert/convert_integers.pyi      |  11 ++
 .../strings/convert/convert_integers.pyx      |   8 +
 .../strings/convert/convert_ipv4.pyi          |   7 +
 .../strings/convert/convert_ipv4.pyx          |   1 +
 .../strings/convert/convert_lists.pyi         |  10 +
 .../strings/convert/convert_lists.pyx         |   1 +
 .../strings/convert/convert_urls.pyi          |   6 +
 .../strings/convert/convert_urls.pyx          |   1 +
 .../pylibcudf/pylibcudf/strings/extract.pyi   |   8 +
 .../pylibcudf/pylibcudf/strings/extract.pyx   |   1 +
 python/pylibcudf/pylibcudf/strings/find.pyi   |  14 ++
 python/pylibcudf/pylibcudf/strings/find.pyx   |   1 +
 .../pylibcudf/strings/find_multiple.pyi       |   5 +
 .../pylibcudf/strings/find_multiple.pyx       |   1 +
 .../pylibcudf/pylibcudf/strings/findall.pyi   |   7 +
 .../pylibcudf/pylibcudf/strings/findall.pyx   |   1 +
 .../pylibcudf/pylibcudf/strings/padding.pyi   |   9 +
 .../pylibcudf/pylibcudf/strings/padding.pyx   |   1 +
 .../pylibcudf/strings/regex_flags.pyi         |   7 +
 .../pylibcudf/strings/regex_flags.pyx         |   2 +
 .../pylibcudf/strings/regex_program.pyi       |   8 +
 .../pylibcudf/strings/regex_program.pyx       |   3 +
 python/pylibcudf/pylibcudf/strings/repeat.pyi |   5 +
 python/pylibcudf/pylibcudf/strings/repeat.pyx |   1 +
 .../pylibcudf/pylibcudf/strings/replace.pyi   |  14 ++
 .../pylibcudf/pylibcudf/strings/replace.pyx   |   1 +
 .../pylibcudf/strings/replace_re.pyi          |  27 +++
 .../pylibcudf/strings/replace_re.pyx          |   1 +
 .../pylibcudf/pylibcudf/strings/side_type.pyi |   7 +
 .../pylibcudf/pylibcudf/strings/side_type.pyx |   2 +
 python/pylibcudf/pylibcudf/strings/slice.pyi  |  11 ++
 python/pylibcudf/pylibcudf/strings/slice.pyx  |   1 +
 .../pylibcudf/strings/split/__init__.py       |   2 +
 .../pylibcudf/strings/split/partition.pyi     |   8 +
 .../pylibcudf/strings/split/partition.pyx     |   1 +
 .../pylibcudf/strings/split/split.pyi         |  27 +++
 .../pylibcudf/strings/split/split.pyx         |  10 +
 python/pylibcudf/pylibcudf/strings/strip.pyi  |  11 ++
 python/pylibcudf/pylibcudf/strings/strip.pyx  |   1 +
 .../pylibcudf/pylibcudf/strings/translate.pyi |  20 ++
 .../pylibcudf/pylibcudf/strings/translate.pyx |   1 +
 python/pylibcudf/pylibcudf/strings/wrap.pyi   |   5 +
 python/pylibcudf/pylibcudf/strings/wrap.pyx   |   1 +
 python/pylibcudf/pylibcudf/table.pyi          |   9 +
 python/pylibcudf/pylibcudf/table.pyx          |   3 +
 .../pylibcudf/tests/test_binaryops.py         |  14 --
 .../pylibcudf/tests/test_labeling.py          |   8 +-
 .../pylibcudf/pylibcudf/tests/test_lists.py   |  83 ++++----
 .../pylibcudf/tests/test_string_attributes.py |   2 +-
 python/pylibcudf/pylibcudf/traits.pyi         |  23 +++
 python/pylibcudf/pylibcudf/traits.pyx         |  21 ++
 python/pylibcudf/pylibcudf/transform.pyi      |  16 ++
 python/pylibcudf/pylibcudf/transform.pyx      |   9 +
 python/pylibcudf/pylibcudf/transpose.pyi      |   4 +
 python/pylibcudf/pylibcudf/transpose.pyx      |   1 +
 python/pylibcudf/pylibcudf/types.pyi          |  86 ++++++++
 python/pylibcudf/pylibcudf/types.pyx          |  16 ++
 python/pylibcudf/pylibcudf/unary.pyi          |  38 ++++
 python/pylibcudf/pylibcudf/unary.pyx          |  10 +
 python/pylibcudf/pyproject.toml               |  23 ++-
 206 files changed, 2863 insertions(+), 228 deletions(-)
 create mode 100644 python/pylibcudf/pylibcudf/aggregation.pyi
 create mode 100644 python/pylibcudf/pylibcudf/binaryop.pyi
 create mode 100644 python/pylibcudf/pylibcudf/column.pyi
 create mode 100644 python/pylibcudf/pylibcudf/column_factories.pyi
 create mode 100644 python/pylibcudf/pylibcudf/concatenate.pyi
 create mode 100644 python/pylibcudf/pylibcudf/contiguous_split.pyi
 create mode 100644 python/pylibcudf/pylibcudf/copying.pyi
 create mode 100644 python/pylibcudf/pylibcudf/datetime.pyi
 create mode 100644 python/pylibcudf/pylibcudf/experimental.pyi
 create mode 100644 python/pylibcudf/pylibcudf/expressions.pyi
 create mode 100644 python/pylibcudf/pylibcudf/filling.pyi
 create mode 100644 python/pylibcudf/pylibcudf/gpumemoryview.pyi
 create mode 100644 python/pylibcudf/pylibcudf/groupby.pyi
 create mode 100644 python/pylibcudf/pylibcudf/hashing.pyi
 create mode 100644 python/pylibcudf/pylibcudf/interop.pyi
 create mode 100644 python/pylibcudf/pylibcudf/io/avro.pyi
 create mode 100644 python/pylibcudf/pylibcudf/io/csv.pyi
 create mode 100644 python/pylibcudf/pylibcudf/io/datasource.pyi
 create mode 100644 python/pylibcudf/pylibcudf/io/json.pyi
 create mode 100644 python/pylibcudf/pylibcudf/io/orc.pyi
 create mode 100644 python/pylibcudf/pylibcudf/io/parquet.pyi
 create mode 100644 python/pylibcudf/pylibcudf/io/timezone.pyi
 create mode 100644 python/pylibcudf/pylibcudf/io/types.pyi
 create mode 100644 python/pylibcudf/pylibcudf/join.pyi
 create mode 100644 python/pylibcudf/pylibcudf/json.pyi
 create mode 100644 python/pylibcudf/pylibcudf/labeling.pyi
 create mode 100644 python/pylibcudf/pylibcudf/libcudf/lists/CMakeLists.txt
 create mode 100644 python/pylibcudf/pylibcudf/libcudf/lists/combine.pyx
 create mode 100644 python/pylibcudf/pylibcudf/libcudf/lists/contains.pyx
 create mode 100644 python/pylibcudf/pylibcudf/lists.pyi
 create mode 100644 python/pylibcudf/pylibcudf/merge.pyi
 create mode 100644 python/pylibcudf/pylibcudf/null_mask.pyi
 create mode 100644 python/pylibcudf/pylibcudf/nvtext/byte_pair_encode.pyi
 create mode 100644 python/pylibcudf/pylibcudf/nvtext/edit_distance.pyi
 create mode 100644 python/pylibcudf/pylibcudf/nvtext/generate_ngrams.pyi
 create mode 100644 python/pylibcudf/pylibcudf/nvtext/jaccard.pyi
 create mode 100644 python/pylibcudf/pylibcudf/nvtext/minhash.pyi
 create mode 100644 python/pylibcudf/pylibcudf/nvtext/ngrams_tokenize.pyi
 create mode 100644 python/pylibcudf/pylibcudf/nvtext/normalize.pyi
 create mode 100644 python/pylibcudf/pylibcudf/nvtext/replace.pyi
 create mode 100644 python/pylibcudf/pylibcudf/nvtext/stemmer.pyi
 create mode 100644 python/pylibcudf/pylibcudf/nvtext/subword_tokenize.pyi
 create mode 100644 python/pylibcudf/pylibcudf/nvtext/tokenize.pyi
 create mode 100644 python/pylibcudf/pylibcudf/partitioning.pyi
 create mode 100644 python/pylibcudf/pylibcudf/py.typed
 create mode 100644 python/pylibcudf/pylibcudf/quantiles.pyi
 create mode 100644 python/pylibcudf/pylibcudf/reduce.pyi
 create mode 100644 python/pylibcudf/pylibcudf/replace.pyi
 create mode 100644 python/pylibcudf/pylibcudf/reshape.pyi
 create mode 100644 python/pylibcudf/pylibcudf/rolling.pyi
 create mode 100644 python/pylibcudf/pylibcudf/round.pyi
 create mode 100644 python/pylibcudf/pylibcudf/scalar.pyi
 create mode 100644 python/pylibcudf/pylibcudf/search.pyi
 create mode 100644 python/pylibcudf/pylibcudf/sorting.pyi
 create mode 100644 python/pylibcudf/pylibcudf/stream_compaction.pyi
 create mode 100644 python/pylibcudf/pylibcudf/strings/attributes.pyi
 create mode 100644 python/pylibcudf/pylibcudf/strings/capitalize.pyi
 create mode 100644 python/pylibcudf/pylibcudf/strings/case.pyi
 create mode 100644 python/pylibcudf/pylibcudf/strings/char_types.pyi
 create mode 100644 python/pylibcudf/pylibcudf/strings/combine.pyi
 create mode 100644 python/pylibcudf/pylibcudf/strings/contains.pyi
 create mode 100644 python/pylibcudf/pylibcudf/strings/convert/convert_booleans.pyi
 create mode 100644 python/pylibcudf/pylibcudf/strings/convert/convert_datetime.pyi
 create mode 100644 python/pylibcudf/pylibcudf/strings/convert/convert_durations.pyi
 create mode 100644 python/pylibcudf/pylibcudf/strings/convert/convert_fixed_point.pyi
 create mode 100644 python/pylibcudf/pylibcudf/strings/convert/convert_floats.pyi
 create mode 100644 python/pylibcudf/pylibcudf/strings/convert/convert_integers.pyi
 create mode 100644 python/pylibcudf/pylibcudf/strings/convert/convert_ipv4.pyi
 create mode 100644 python/pylibcudf/pylibcudf/strings/convert/convert_lists.pyi
 create mode 100644 python/pylibcudf/pylibcudf/strings/convert/convert_urls.pyi
 create mode 100644 python/pylibcudf/pylibcudf/strings/extract.pyi
 create mode 100644 python/pylibcudf/pylibcudf/strings/find.pyi
 create mode 100644 python/pylibcudf/pylibcudf/strings/find_multiple.pyi
 create mode 100644 python/pylibcudf/pylibcudf/strings/findall.pyi
 create mode 100644 python/pylibcudf/pylibcudf/strings/padding.pyi
 create mode 100644 python/pylibcudf/pylibcudf/strings/regex_flags.pyi
 create mode 100644 python/pylibcudf/pylibcudf/strings/regex_program.pyi
 create mode 100644 python/pylibcudf/pylibcudf/strings/repeat.pyi
 create mode 100644 python/pylibcudf/pylibcudf/strings/replace.pyi
 create mode 100644 python/pylibcudf/pylibcudf/strings/replace_re.pyi
 create mode 100644 python/pylibcudf/pylibcudf/strings/side_type.pyi
 create mode 100644 python/pylibcudf/pylibcudf/strings/slice.pyi
 create mode 100644 python/pylibcudf/pylibcudf/strings/split/partition.pyi
 create mode 100644 python/pylibcudf/pylibcudf/strings/split/split.pyi
 create mode 100644 python/pylibcudf/pylibcudf/strings/strip.pyi
 create mode 100644 python/pylibcudf/pylibcudf/strings/translate.pyi
 create mode 100644 python/pylibcudf/pylibcudf/strings/wrap.pyi
 create mode 100644 python/pylibcudf/pylibcudf/table.pyi
 create mode 100644 python/pylibcudf/pylibcudf/traits.pyi
 create mode 100644 python/pylibcudf/pylibcudf/transform.pyi
 create mode 100644 python/pylibcudf/pylibcudf/transpose.pyi
 create mode 100644 python/pylibcudf/pylibcudf/types.pyi
 create mode 100644 python/pylibcudf/pylibcudf/unary.pyi

diff --git a/docs/cudf/source/conf.py b/docs/cudf/source/conf.py
index 0d463b918d3..fbb9ca4b128 100644
--- a/docs/cudf/source/conf.py
+++ b/docs/cudf/source/conf.py
@@ -26,16 +26,18 @@
 import tempfile
 import warnings
 import xml.etree.ElementTree as ET
+from enum import IntEnum
+from typing import Any
 
+import cudf
 from docutils.nodes import Text
 from packaging.version import Version
-from sphinx.addnodes import pending_xref
-from sphinx.highlighting import lexers
-from sphinx.ext import intersphinx
 from pygments.lexer import RegexLexer
 from pygments.token import Text as PText
-
-import cudf
+from sphinx.addnodes import pending_xref
+from sphinx.ext import intersphinx
+from sphinx.ext.autodoc import ClassDocumenter, bool_option
+from sphinx.highlighting import lexers
 
 
 class PseudoLexer(RegexLexer):
@@ -342,7 +344,10 @@ def clean_all_xml_files(path):
     "cudf.Series": ("cudf.core.series.Series", "cudf.Series"),
     "cudf.Index": ("cudf.core.index.Index", "cudf.Index"),
     "cupy.core.core.ndarray": ("cupy.ndarray", "cupy.ndarray"),
-    "DeviceBuffer": ("rmm.pylibrmm.device_buffer.DeviceBuffer", "rmm.DeviceBuffer"),
+    "DeviceBuffer": (
+        "rmm.pylibrmm.device_buffer.DeviceBuffer",
+        "rmm.DeviceBuffer",
+    ),
 }
 
 
@@ -373,7 +378,14 @@ def _generate_namespaces(namespaces):
 _all_namespaces = _generate_namespaces(
     {
         # Note that io::datasource is actually a nested class
-        "cudf": {"io", "io::datasource", "strings", "ast", "ast::expression", "io::text"},
+        "cudf": {
+            "io",
+            "io::datasource",
+            "strings",
+            "ast",
+            "ast::expression",
+            "io::text",
+        },
         "numeric": {},
         "nvtext": {},
     }
@@ -642,9 +654,54 @@ def linkcode_resolve(domain, info) -> str | None:
         f"branch-{version}/python/cudf/cudf/{fn}{linespec}"
     )
 
+
 # Needed for avoid build warning for PandasCompat extension
 suppress_warnings = ["myst.domains"]
 
+
+class PLCIntEnumDocumenter(ClassDocumenter):  # autodoc documenter for pylibcudf IntEnum classes
+    objtype = "enum"
+    directivetype = "attribute"
+    priority = 10 + ClassDocumenter.priority  # 10 above the stock class documenter's priority
+
+    option_spec = dict(ClassDocumenter.option_spec)  # own copy of the base option table
+
+    @classmethod
+    def can_document_member(
+        cls, member: Any, membername: str, isattr: bool, parent: Any
+    ) -> bool:
+        try:
+            return issubclass(
+                member, IntEnum
+            ) and member.__module__.startswith("pylibcudf")
+        except TypeError:  # issubclass raises TypeError when member is not a class
+            return False
+
+    def add_directive_header(self, sig: str) -> None:
+        self.directivetype = "attribute"
+        super().add_directive_header(sig)
+
+    def add_content(self, more_content) -> None:
+        doc_as_attr = self.doc_as_attr
+        self.doc_as_attr = False  # cleared only for the super() call; restored below
+        super().add_content(more_content)
+        self.doc_as_attr = doc_as_attr
+        source_name = self.get_sourcename()
+        enum_object: IntEnum = self.object
+
+        if self.object.__name__ != "Kind":  # every enum except "Kind" gets a C++ cross-reference
+            self.add_line(f"See also :cpp:enum:`cudf::{self.object.__name__}`.", source_name)
+        self.add_line("", source_name)
+        self.add_line("Enum members", source_name)
+        self.add_line("", source_name)
+
+        for the_member_name in enum_object.__members__:  # type: ignore[attr-defined]
+            self.add_line(
+                f"* ``{the_member_name}``", source_name
+            )
+            self.add_line("", source_name)  # blank line after each bullet
+
+
 def setup(app):
     app.add_css_file("https://docs.rapids.ai/assets/css/custom.css")
     app.add_js_file(
@@ -652,3 +709,5 @@ def setup(app):
     )
     app.connect("doctree-read", resolve_aliases)
     app.connect("missing-reference", on_missing_reference)
+    app.setup_extension("sphinx.ext.autodoc")
+    app.add_autodocumenter(PLCIntEnumDocumenter)
diff --git a/docs/cudf/source/developer_guide/pylibcudf.md b/docs/cudf/source/developer_guide/pylibcudf.md
index 39840e72e21..1ee828e7c4e 100644
--- a/docs/cudf/source/developer_guide/pylibcudf.md
+++ b/docs/cudf/source/developer_guide/pylibcudf.md
@@ -15,7 +15,8 @@ To satisfy the goals of pylibcudf, we impose the following set of design princip
 - All typing in code should be written using Cython syntax, not PEP 484 Python typing syntax. Not only does this ensure compatibility with Cython < 3, but even with Cython 3 PEP 484 support remains incomplete as of this writing.
 - All cudf code should interact only with pylibcudf, never with libcudf directly. This is not currently the case, but is the direction that the library is moving towards.
 - Ideally, pylibcudf should depend on no RAPIDS component other than rmm, and should in general have minimal runtime dependencies.
-
+- Type stubs are provided and maintained manually. When adding new
+  functionality, ensure that the matching type stub is appropriately updated.
 
 ## Relationship to libcudf
 
@@ -249,3 +250,73 @@ In the event that libcudf provides multiple overloads for the same function with
 and set arguments not shared between overloads to `None`. If a user tries to pass in an unsupported argument for a specific overload type, you should raise `ValueError`.
 
 Finally, consider making an libcudf issue if you think this inconsistency can be addressed on the libcudf side.
+
+### Type stubs
+
+Since static type checkers like `mypy` and `pyright` cannot parse
+Cython code, we provide type stubs for the pylibcudf package. These
+are currently maintained manually, alongside the matching pylibcudf
+files.
+
+Every `pyx` file should have a matching `pyi` file that provides the
+type stubs. Most functions can be exposed straightforwardly. Some
+guiding principles:
+
+- For typed integer arguments in libcudf, use `int` as a type
+  annotation.
+- For functions which are annotated as a `list` in Cython, but the
+  function body does more detailed checking, try and encode the
+  detailed information in the type.
+- For Cython fused types there are two options:
+    1. If the fused type appears only once in the function signature,
+       use a `Union` type;
+    2. If the fused type appears more than once (or as both an input
+       and output type), use a `TypeVar` with
+       the variants in the fused type provided as constraints.
+
+
+As an example, `pylibcudf.copying.split` is typed in Cython as:
+
+```cython
+ctypedef fused ColumnOrTable:
+    Table
+    Column
+
+cpdef list split(ColumnOrTable input, list splits): ...
+```
+
+Here the fused type appears only once in the Cython signature, and the
+bare `list` types do not specify their element types. However, a `Column`
+input produces a `list[Column]` output, and a `Table` input produces a
+`list[Table]` output, so the input and output types are linked.
+
+In the type stub, we can encode this with a `TypeVar`. We can also
+provide typing for the `splits` argument that indicates that the split
+values must be integers:
+
+```python
+ColumnOrTable = TypeVar("ColumnOrTable", Column, Table)
+
+def split(input: ColumnOrTable, splits: list[int]) -> list[ColumnOrTable]: ...
+```
+
+Conversely, `pylibcudf.copying.scatter` uses a fused type only once in
+its input:
+
+```cython
+ctypedef fused TableOrListOfScalars:
+    Table
+    list
+
+cpdef Table scatter(
+    TableOrListOfScalars source, Column scatter_map, Table target
+)
+```
+
+In the type stub, we can use a normal union in this case:
+
+```python
+def scatter(
+    source: Table | list[Scalar], scatter_map: Column, target: Table
+) -> Table: ...
+```
diff --git a/python/cudf/cudf/_lib/labeling.pyx b/python/cudf/cudf/_lib/labeling.pyx
index 3966cce8981..524bfd3b2e8 100644
--- a/python/cudf/cudf/_lib/labeling.pyx
+++ b/python/cudf/cudf/_lib/labeling.pyx
@@ -17,8 +17,8 @@ def label_bins(Column input, Column left_edges, cbool left_inclusive,
     plc_column = plc.labeling.label_bins(
         input.to_pylibcudf(mode="read"),
         left_edges.to_pylibcudf(mode="read"),
-        left_inclusive,
+        plc.labeling.Inclusive.YES if left_inclusive else plc.labeling.Inclusive.NO,
         right_edges.to_pylibcudf(mode="read"),
-        right_inclusive
+        plc.labeling.Inclusive.YES if right_inclusive else plc.labeling.Inclusive.NO,
     )
     return Column.from_pylibcudf(plc_column)
diff --git a/python/cudf/cudf/_lib/lists.pyx b/python/cudf/cudf/_lib/lists.pyx
index a91d44274e5..9a2aa4a6130 100644
--- a/python/cudf/cudf/_lib/lists.pyx
+++ b/python/cudf/cudf/_lib/lists.pyx
@@ -4,7 +4,9 @@ from cudf.core.buffer import acquire_spill_lock
 
 from libcpp cimport bool
 
-from pylibcudf.libcudf.types cimport size_type
+from pylibcudf.libcudf.types cimport (
+    nan_equality, null_equality, null_order, order, size_type
+)
 
 from cudf._lib.column cimport Column
 from cudf._lib.utils cimport columns_from_pylibcudf_table
@@ -37,8 +39,8 @@ def distinct(Column col, bool nulls_equal, bool nans_all_equal):
     return Column.from_pylibcudf(
         plc.lists.distinct(
             col.to_pylibcudf(mode="read"),
-            nulls_equal,
-            nans_all_equal,
+            null_equality.EQUAL if nulls_equal else null_equality.UNEQUAL,
+            nan_equality.ALL_EQUAL if nans_all_equal else nan_equality.UNEQUAL,
         )
     )
 
@@ -48,12 +50,8 @@ def sort_lists(Column col, bool ascending, str na_position):
     return Column.from_pylibcudf(
         plc.lists.sort_lists(
             col.to_pylibcudf(mode="read"),
-            ascending,
-            (
-                plc.types.NullOrder.BEFORE
-                if na_position == "first"
-                else plc.types.NullOrder.AFTER
-            ),
+            order.ASCENDING if ascending else order.DESCENDING,
+            null_order.BEFORE if na_position == "first" else null_order.AFTER,
             False,
         )
     )
@@ -95,7 +93,7 @@ def index_of_scalar(Column col, object py_search_key):
         plc.lists.index_of(
             col.to_pylibcudf(mode="read"),
             <Scalar> py_search_key.device_value.c_value,
-            True,
+            plc.lists.DuplicateFindOption.FIND_FIRST,
         )
     )
 
@@ -106,7 +104,7 @@ def index_of_column(Column col, Column search_keys):
         plc.lists.index_of(
             col.to_pylibcudf(mode="read"),
             search_keys.to_pylibcudf(mode="read"),
-            True,
+            plc.lists.DuplicateFindOption.FIND_FIRST,
         )
     )
 
@@ -127,7 +125,9 @@ def concatenate_list_elements(Column input_column, dropna=False):
     return Column.from_pylibcudf(
         plc.lists.concatenate_list_elements(
             input_column.to_pylibcudf(mode="read"),
-            dropna,
+            plc.lists.ConcatenateNullPolicy.IGNORE
+            if dropna
+            else plc.lists.ConcatenateNullPolicy.NULLIFY_OUTPUT_ROW,
         )
     )
 
diff --git a/python/cudf_polars/cudf_polars/containers/dataframe.py b/python/cudf_polars/cudf_polars/containers/dataframe.py
index 08bc9d0ea3f..7560a0f5a64 100644
--- a/python/cudf_polars/cudf_polars/containers/dataframe.py
+++ b/python/cudf_polars/cudf_polars/containers/dataframe.py
@@ -60,7 +60,7 @@ def to_polars(self) -> pl.DataFrame:
         # To guarantee we produce correct names, we therefore
         # serialise with names we control and rename with that map.
         name_map = {f"column_{i}": name for i, name in enumerate(self.column_map)}
-        table: pa.Table = plc.interop.to_arrow(
+        table = plc.interop.to_arrow(
             self.table,
             [plc.interop.ColumnMetadata(name=name) for name in name_map],
         )
diff --git a/python/cudf_polars/cudf_polars/dsl/expressions/datetime.py b/python/cudf_polars/cudf_polars/dsl/expressions/datetime.py
index 65fa4bfa62f..cd8e5c6a4eb 100644
--- a/python/cudf_polars/cudf_polars/dsl/expressions/datetime.py
+++ b/python/cudf_polars/cudf_polars/dsl/expressions/datetime.py
@@ -27,7 +27,9 @@
 
 class TemporalFunction(Expr):
     __slots__ = ("name", "options")
-    _COMPONENT_MAP: ClassVar[dict[pl_expr.TemporalFunction, str]] = {
+    _COMPONENT_MAP: ClassVar[
+        dict[pl_expr.TemporalFunction, plc.datetime.DatetimeComponent]
+    ] = {
         pl_expr.TemporalFunction.Year: plc.datetime.DatetimeComponent.YEAR,
         pl_expr.TemporalFunction.Month: plc.datetime.DatetimeComponent.MONTH,
         pl_expr.TemporalFunction.Day: plc.datetime.DatetimeComponent.DAY,
diff --git a/python/cudf_polars/cudf_polars/dsl/expressions/literal.py b/python/cudf_polars/cudf_polars/dsl/expressions/literal.py
index c16313bf83c..7eba0c110ab 100644
--- a/python/cudf_polars/cudf_polars/dsl/expressions/literal.py
+++ b/python/cudf_polars/cudf_polars/dsl/expressions/literal.py
@@ -58,7 +58,7 @@ def collect_agg(self, *, depth: int) -> AggInfo:
 class LiteralColumn(Expr):
     __slots__ = ("value",)
     _non_child = ("dtype", "value")
-    value: pa.Array[Any, Any]
+    value: pa.Array[Any]
 
     def __init__(self, dtype: plc.DataType, value: pl.Series) -> None:
         self.dtype = dtype
diff --git a/python/cudf_polars/cudf_polars/dsl/ir.py b/python/cudf_polars/cudf_polars/dsl/ir.py
index beea5908e56..1f935190f28 100644
--- a/python/cudf_polars/cudf_polars/dsl/ir.py
+++ b/python/cudf_polars/cudf_polars/dsl/ir.py
@@ -517,7 +517,7 @@ def do_evaluate(
                 # Mask must have been applied.
                 return df
         elif typ == "ndjson":
-            json_schema: list[tuple[str, str, list]] = [
+            json_schema: list[plc.io.json.NameAndType] = [
                 (name, typ, []) for name, typ in schema.items()
             ]
             plc_tbl_w_meta = plc.io.json.read_json(
diff --git a/python/pylibcudf/pylibcudf/aggregation.pyi b/python/pylibcudf/pylibcudf/aggregation.pyi
new file mode 100644
index 00000000000..a59e2a9dc93
--- /dev/null
+++ b/python/pylibcudf/pylibcudf/aggregation.pyi
@@ -0,0 +1,110 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+from enum import IntEnum
+
+from pylibcudf.types import (
+    DataType,
+    Interpolation,
+    NanEquality,
+    NullEquality,
+    NullOrder,
+    NullPolicy,
+    Order,
+)
+
+class Kind(IntEnum):
+    SUM = ...
+    PRODUCT = ...
+    MIN = ...
+    MAX = ...
+    COUNT_VALID = ...
+    COUNT_ALL = ...
+    ANY = ...
+    ALL = ...
+    SUM_OF_SQUARES = ...
+    MEAN = ...
+    VARIANCE = ...
+    STD = ...
+    MEDIAN = ...
+    QUANTILE = ...
+    ARGMAX = ...
+    ARGMIN = ...
+    NUNIQUE = ...
+    NTH_ELEMENT = ...
+    RANK = ...
+    COLLECT_LIST = ...
+    COLLECT_SET = ...
+    PTX = ...
+    CUDA = ...
+    CORRELATION = ...
+    COVARIANCE = ...
+
+class CorrelationType(IntEnum):
+    PEARSON = ...
+    KENDALL = ...
+    SPEARMAN = ...
+
+class EWMHistory(IntEnum):
+    INFINITE = ...
+    FINITE = ...
+
+class RankMethod(IntEnum):
+    FIRST = ...
+    AVERAGE = ...
+    MIN = ...
+    MAX = ...
+    DENSE = ...
+
+class RankPercentage(IntEnum):
+    NONE = ...
+    ZERO_NORMALIZED = ...
+    ONE_NORMALIZED = ...
+
+class UdfType(IntEnum):
+    CUDA = ...
+    PTX = ...
+
+class Aggregation:
+    def __init__(self): ...
+    def kind(self) -> Kind: ...
+
+def sum() -> Aggregation: ...
+def product() -> Aggregation: ...
+def min() -> Aggregation: ...
+def max() -> Aggregation: ...
+def count(null_handling: NullPolicy = NullPolicy.INCLUDE) -> Aggregation: ...
+def any() -> Aggregation: ...
+def all() -> Aggregation: ...
+def sum_of_squares() -> Aggregation: ...
+def mean() -> Aggregation: ...
+def variance(ddof: int = 1) -> Aggregation: ...
+def std(ddof: int = 1) -> Aggregation: ...
+def median() -> Aggregation: ...
+def quantile(
+    quantiles: list[float], interp: Interpolation = Interpolation.LINEAR
+) -> Aggregation: ...
+def argmax() -> Aggregation: ...
+def argmin() -> Aggregation: ...
+def ewma(center_of_mass: float, history: EWMHistory) -> Aggregation: ...
+def nunique(null_handling: NullPolicy = NullPolicy.EXCLUDE) -> Aggregation: ...
+def nth_element(
+    n: int, null_handling: NullPolicy = NullPolicy.INCLUDE
+) -> Aggregation: ...
+def collect_list(
+    null_handling: NullPolicy = NullPolicy.INCLUDE,
+) -> Aggregation: ...
+def collect_set(
+    null_handling: NullPolicy = NullPolicy.INCLUDE,
+    nulls_equal: NullEquality = NullEquality.EQUAL,
+    nans_equal: NanEquality = NanEquality.ALL_EQUAL,
+) -> Aggregation: ...
+def udf(operation: str, output_type: DataType) -> Aggregation: ...
+def correlation(type: CorrelationType, min_periods: int) -> Aggregation: ...
+def covariance(min_periods: int, ddof: int) -> Aggregation: ...
+def rank(
+    method: RankMethod,
+    column_order: Order = Order.ASCENDING,
+    null_handling: NullPolicy = NullPolicy.EXCLUDE,
+    null_precedence: NullOrder = NullOrder.AFTER,
+    percentage: RankPercentage = RankPercentage.NONE,
+) -> Aggregation: ...
diff --git a/python/pylibcudf/pylibcudf/aggregation.pyx b/python/pylibcudf/pylibcudf/aggregation.pyx
index e510b738f70..662f76d5c8e 100644
--- a/python/pylibcudf/pylibcudf/aggregation.pyx
+++ b/python/pylibcudf/pylibcudf/aggregation.pyx
@@ -64,6 +64,40 @@ from pylibcudf.libcudf.aggregation import udf_type as UdfType  # no-cython-lint
 from .types cimport DataType
 
 
+__all__ = [
+    "Aggregation",
+    "CorrelationType",
+    "EWMHistory",
+    "Kind",
+    "RankMethod",
+    "RankPercentage",
+    "UdfType",
+    "all",
+    "any",
+    "argmax",
+    "argmin",
+    "collect_list",
+    "collect_set",
+    "correlation",
+    "count",
+    "covariance",
+    "ewma",
+    "max",
+    "mean",
+    "median",
+    "min",
+    "nth_element",
+    "nunique",
+    "product",
+    "quantile",
+    "rank",
+    "std",
+    "sum",
+    "sum_of_squares",
+    "udf",
+    "variance",
+]
+
 cdef class Aggregation:
     """A type of aggregation to perform.
 
diff --git a/python/pylibcudf/pylibcudf/binaryop.pyi b/python/pylibcudf/pylibcudf/binaryop.pyi
new file mode 100644
index 00000000000..f745e6c6854
--- /dev/null
+++ b/python/pylibcudf/pylibcudf/binaryop.pyi
@@ -0,0 +1,54 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+from enum import IntEnum
+
+from pylibcudf.column import Column
+from pylibcudf.scalar import Scalar
+from pylibcudf.types import DataType
+
+class BinaryOperator(IntEnum):
+    ADD = ...
+    SUB = ...
+    MUL = ...
+    DIV = ...
+    TRUE_DIV = ...
+    FLOOR_DIV = ...
+    MOD = ...
+    PMOD = ...
+    PYMOD = ...
+    POW = ...
+    INT_POW = ...
+    LOG_BASE = ...
+    ATAN2 = ...
+    SHIFT_LEFT = ...
+    SHIFT_RIGHT = ...
+    SHIFT_RIGHT_UNSIGNED = ...
+    BITWISE_AND = ...
+    BITWISE_OR = ...
+    BITWISE_XOR = ...
+    LOGICAL_AND = ...
+    LOGICAL_OR = ...
+    EQUAL = ...
+    NOT_EQUAL = ...
+    LESS = ...
+    GREATER = ...
+    LESS_EQUAL = ...
+    GREATER_EQUAL = ...
+    NULL_EQUALS = ...
+    NULL_MAX = ...
+    NULL_MIN = ...
+    NULL_NOT_EQUALS = ...
+    GENERIC_BINARY = ...
+    NULL_LOGICAL_AND = ...
+    NULL_LOGICAL_OR = ...
+    INVALID_BINARY = ...
+
+def binary_operation(
+    lhs: Column | Scalar,
+    rhs: Column | Scalar,
+    op: BinaryOperator,
+    output_type: DataType,
+) -> Column: ...
+def is_supported_operation(
+    out: DataType, lhs: DataType, rhs: DataType, op: BinaryOperator
+) -> bool: ...
diff --git a/python/pylibcudf/pylibcudf/binaryop.pyx b/python/pylibcudf/pylibcudf/binaryop.pyx
index eef73bf4e9d..b7b4ecc6e83 100644
--- a/python/pylibcudf/pylibcudf/binaryop.pyx
+++ b/python/pylibcudf/pylibcudf/binaryop.pyx
@@ -16,6 +16,7 @@ from .column cimport Column
 from .scalar cimport Scalar
 from .types cimport DataType
 
+__all__ = ["BinaryOperator", "binary_operation", "is_supported_operation"]
 
 cpdef Column binary_operation(
     LeftBinaryOperand lhs,
diff --git a/python/pylibcudf/pylibcudf/column.pyi b/python/pylibcudf/pylibcudf/column.pyi
new file mode 100644
index 00000000000..c9f70de3dbf
--- /dev/null
+++ b/python/pylibcudf/pylibcudf/column.pyi
@@ -0,0 +1,48 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+from collections.abc import Sequence
+from typing import Any
+
+from pylibcudf.gpumemoryview import gpumemoryview
+from pylibcudf.scalar import Scalar
+from pylibcudf.types import DataType
+
+class Column:
+    def __init__(
+        self,
+        data_type: DataType,
+        size: int,
+        data: gpumemoryview | None,
+        mask: gpumemoryview | None,
+        null_count: int,
+        offset: int,
+        children: list[Column],
+    ) -> None: ...
+    def type(self) -> DataType: ...
+    def child(self, index: int) -> Column: ...
+    def size(self) -> int: ...
+    def null_count(self) -> int: ...
+    def offset(self) -> int: ...
+    def data(self) -> gpumemoryview | None: ...
+    def null_mask(self) -> gpumemoryview | None: ...
+    def children(self) -> list[Column]: ...
+    def copy(self) -> Column: ...
+    def with_mask(
+        self, mask: gpumemoryview | None, null_count: int
+    ) -> Column: ...
+    def list_view(self) -> ListColumnView: ...
+    @staticmethod
+    def from_scalar(scalar: Scalar, size: int) -> Column: ...
+    @staticmethod
+    def all_null_like(like: Column, size: int) -> Column: ...
+    @staticmethod
+    def from_cuda_array_interface_obj(obj: Any) -> Column: ...
+
+class ListColumnView:
+    def __init__(self, column: Column): ...
+    def child(self) -> Column: ...
+    def offsets(self) -> Column: ...
+
+def is_c_contiguous(
+    shape: Sequence[int], strides: Sequence[int], itemsize: int
+) -> bool: ...
diff --git a/python/pylibcudf/pylibcudf/column.pyx b/python/pylibcudf/pylibcudf/column.pyx
index 4e5698566d0..9bb5574608e 100644
--- a/python/pylibcudf/pylibcudf/column.pyx
+++ b/python/pylibcudf/pylibcudf/column.pyx
@@ -17,6 +17,7 @@ from .utils cimport int_to_bitmask_ptr, int_to_void_ptr
 
 import functools
 
+__all__ = ["Column", "ListColumnView", "is_c_contiguous"]
 
 cdef class Column:
     """A container of nullable device data as a column of elements.
@@ -61,6 +62,8 @@ cdef class Column:
         self._children = children
         self._num_children = len(children)
 
+    __hash__ = None
+
     cdef column_view view(self) nogil:
         """Generate a libcudf column_view to pass to libcudf algorithms.
 
@@ -384,6 +387,8 @@ cdef class ListColumnView:
             raise TypeError("Column is not a list type")
         self._column = col
 
+    __hash__ = None
+
     cpdef child(self):
         """The data column of the underlying list column."""
         return self._column.child(1)
diff --git a/python/pylibcudf/pylibcudf/column_factories.pyi b/python/pylibcudf/pylibcudf/column_factories.pyi
new file mode 100644
index 00000000000..c87fe423acb
--- /dev/null
+++ b/python/pylibcudf/pylibcudf/column_factories.pyi
@@ -0,0 +1,20 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+from pylibcudf.column import Column
+from pylibcudf.types import DataType, MaskState, TypeId
+
+def make_empty_column(type_or_id: DataType | TypeId) -> Column: ...
+def make_numeric_column(
+    type_: DataType, size: int, mstate: MaskState
+) -> Column: ...
+def make_fixed_point_column(
+    type_: DataType, size: int, mstate: MaskState
+) -> Column: ...
+def make_timestamp_column(
+    type_: DataType, size: int, mstate: MaskState
+) -> Column: ...
+def make_duration_column(
+    type_: DataType, size: int, mstate: MaskState
+) -> Column: ...
+def make_fixed_width_column(
+    type_: DataType, size: int, mstate: MaskState
+) -> Column: ...
diff --git a/python/pylibcudf/pylibcudf/column_factories.pyx b/python/pylibcudf/pylibcudf/column_factories.pyx
index ac942a620b5..c4969a7f502 100644
--- a/python/pylibcudf/pylibcudf/column_factories.pyx
+++ b/python/pylibcudf/pylibcudf/column_factories.pyx
@@ -17,6 +17,15 @@ from .types cimport DataType, type_id
 from .types import MaskState, TypeId
 
 
+__all__ = [
+    "make_duration_column",
+    "make_empty_column",
+    "make_fixed_point_column",
+    "make_fixed_width_column",
+    "make_numeric_column",
+    "make_timestamp_column",
+]
+
 cpdef Column make_empty_column(MakeEmptyColumnOperand type_or_id):
     """Creates an empty column of the specified type.
 
diff --git a/python/pylibcudf/pylibcudf/concatenate.pyi b/python/pylibcudf/pylibcudf/concatenate.pyi
new file mode 100644
index 00000000000..79076f509e0
--- /dev/null
+++ b/python/pylibcudf/pylibcudf/concatenate.pyi
@@ -0,0 +1,8 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+from pylibcudf.column import Column
+from pylibcudf.table import Table
+
+def concatenate[ColumnOrTable: (Column, Table)](
+    objects: list[ColumnOrTable],
+) -> ColumnOrTable: ...
diff --git a/python/pylibcudf/pylibcudf/concatenate.pyx b/python/pylibcudf/pylibcudf/concatenate.pyx
index 10c860d97bb..42c5f34cf3e 100644
--- a/python/pylibcudf/pylibcudf/concatenate.pyx
+++ b/python/pylibcudf/pylibcudf/concatenate.pyx
@@ -12,6 +12,7 @@ from pylibcudf.libcudf.table.table_view cimport table_view
 from .column cimport Column
 from .table cimport Table
 
+__all__ = ["concatenate"]
 
 cpdef concatenate(list objects):
     """Concatenate columns or tables.
diff --git a/python/pylibcudf/pylibcudf/contiguous_split.pyi b/python/pylibcudf/pylibcudf/contiguous_split.pyi
new file mode 100644
index 00000000000..dd6328fbf23
--- /dev/null
+++ b/python/pylibcudf/pylibcudf/contiguous_split.pyi
@@ -0,0 +1,14 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+from pylibcudf.gpumemoryview import gpumemoryview
+from pylibcudf.table import Table
+
+class PackedColumns:
+    def __init__(self): ...
+    def release(self) -> tuple[memoryview, gpumemoryview]: ...
+
+def pack(input: Table) -> PackedColumns: ...
+def unpack(input: PackedColumns) -> Table: ...
+def unpack_from_memoryviews(
+    metadata: memoryview, gpu_data: gpumemoryview
+) -> Table: ...
diff --git a/python/pylibcudf/pylibcudf/contiguous_split.pyx b/python/pylibcudf/pylibcudf/contiguous_split.pyx
index ed926a3fcc0..94873e079c9 100644
--- a/python/pylibcudf/pylibcudf/contiguous_split.pyx
+++ b/python/pylibcudf/pylibcudf/contiguous_split.pyx
@@ -20,6 +20,13 @@ from .table cimport Table
 from .utils cimport int_to_void_ptr
 
 
+__all__ = [
+    "PackedColumns",
+    "pack",
+    "unpack",
+    "unpack_from_memoryviews",
+]
+
 cdef class HostBuffer:
     """Owning host buffer that implements the buffer protocol"""
     cdef unique_ptr[vector[uint8_t]] c_obj
@@ -38,6 +45,8 @@ cdef class HostBuffer:
         out.strides[0] = 1
         return out
 
+    __hash__ = None
+
     def __getbuffer__(self, Py_buffer *buffer, int flags):
         buffer.buf = dereference(self.c_obj).data()
         buffer.format = NULL  # byte
@@ -69,6 +78,8 @@ cdef class PackedColumns:
             "Use one of the factories."
         )
 
+    __hash__ = None
+
     @staticmethod
     cdef PackedColumns from_libcudf(unique_ptr[packed_columns] data):
         """Create a Python PackedColumns from a libcudf packed_columns."""
diff --git a/python/pylibcudf/pylibcudf/copying.pyi b/python/pylibcudf/pylibcudf/copying.pyi
new file mode 100644
index 00000000000..6cf4ed48724
--- /dev/null
+++ b/python/pylibcudf/pylibcudf/copying.pyi
@@ -0,0 +1,54 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+from enum import IntEnum
+from typing import TypeVar
+
+from pylibcudf.column import Column
+from pylibcudf.scalar import Scalar
+from pylibcudf.table import Table
+
+class MaskAllocationPolicy(IntEnum):
+    NEVER = ...
+    RETAIN = ...
+    ALWAYS = ...
+
+class OutOfBoundsPolicy(IntEnum):
+    NULLIFY = ...
+    DONT_CHECK = ...
+
+ColumnOrTable = TypeVar("ColumnOrTable", Column, Table)
+
+def gather(
+    source_table: Table, gather_map: Column, bounds_policy: OutOfBoundsPolicy
+) -> Table: ...
+def scatter(
+    source: Table | list[Scalar], scatter_map: Column, target_table: Table
+) -> Table: ...
+def empty_like(input: ColumnOrTable) -> ColumnOrTable: ...
+def allocate_like(
+    input_column: Column, policy: MaskAllocationPolicy, size: int | None = None
+) -> Column: ...
+def copy_range_in_place(
+    input_column: Column,
+    target_column: Column,
+    input_begin: int,
+    input_end: int,
+    target_begin: int,
+) -> Column: ...
+def copy_range(
+    input_column: Column,
+    target_column: Column,
+    input_begin: int,
+    input_end: int,
+    target_begin: int,
+) -> Column: ...
+def shift(input: Column, offset: int, fill_value: Scalar) -> Column: ...
+def slice(input: ColumnOrTable, indices: list[int]) -> list[ColumnOrTable]: ...
+def split(input: ColumnOrTable, splits: list[int]) -> list[ColumnOrTable]: ...
+def copy_if_else(
+    lhs: Column | Scalar, rhs: Column | Scalar, boolean_mask: Column
+) -> Column: ...
+def boolean_mask_scatter(
+    input: Table | list[Scalar], target: Table, boolean_mask: Column
+) -> Table: ...
+def get_element(input_column: Column, index: int) -> Scalar: ...
diff --git a/python/pylibcudf/pylibcudf/copying.pyx b/python/pylibcudf/pylibcudf/copying.pyx
index 4938f1a3dda..fb8b6f9890e 100644
--- a/python/pylibcudf/pylibcudf/copying.pyx
+++ b/python/pylibcudf/pylibcudf/copying.pyx
@@ -36,6 +36,23 @@ from .table cimport Table
 from .utils cimport _as_vector
 
 
+__all__ = [
+    "MaskAllocationPolicy",
+    "OutOfBoundsPolicy",
+    "allocate_like",
+    "boolean_mask_scatter",
+    "copy_if_else",
+    "copy_range",
+    "copy_range_in_place",
+    "empty_like",
+    "gather",
+    "get_element",
+    "scatter",
+    "shift",
+    "slice",
+    "split",
+]
+
 cpdef Table gather(
     Table source_table,
     Column gather_map,
diff --git a/python/pylibcudf/pylibcudf/datetime.pyi b/python/pylibcudf/pylibcudf/datetime.pyi
new file mode 100644
index 00000000000..6a3ae7953d9
--- /dev/null
+++ b/python/pylibcudf/pylibcudf/datetime.pyi
@@ -0,0 +1,45 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+from enum import IntEnum
+
+from pylibcudf.column import Column
+from pylibcudf.scalar import Scalar
+
+class DatetimeComponent(IntEnum):
+    YEAR = ...
+    MONTH = ...
+    DAY = ...
+    WEEKDAY = ...
+    HOUR = ...
+    MINUTE = ...
+    SECOND = ...
+    MILLISECOND = ...
+    MICROSECOND = ...
+    NANOSECOND = ...
+
+class RoundingFrequency(IntEnum):
+    DAY = ...
+    HOUR = ...
+    MINUTE = ...
+    SECOND = ...
+    MILLISECOND = ...
+    MICROSECOND = ...
+    NANOSECOND = ...
+
+def extract_millisecond_fraction(input: Column) -> Column: ...
+def extract_microsecond_fraction(input: Column) -> Column: ...
+def extract_nanosecond_fraction(input: Column) -> Column: ...
+def extract_datetime_component(
+    input: Column, component: DatetimeComponent
+) -> Column: ...
+def ceil_datetimes(input: Column, freq: RoundingFrequency) -> Column: ...
+def floor_datetimes(input: Column, freq: RoundingFrequency) -> Column: ...
+def round_datetimes(input: Column, freq: RoundingFrequency) -> Column: ...
+def add_calendrical_months(
+    input: Column, months: Column | Scalar
+) -> Column: ...
+def day_of_year(input: Column) -> Column: ...
+def is_leap_year(input: Column) -> Column: ...
+def last_day_of_month(input: Column) -> Column: ...
+def extract_quarter(input: Column) -> Column: ...
+def days_in_month(input: Column) -> Column: ...
diff --git a/python/pylibcudf/pylibcudf/datetime.pyx b/python/pylibcudf/pylibcudf/datetime.pyx
index 9e5e709d81d..b100e3e22d0 100644
--- a/python/pylibcudf/pylibcudf/datetime.pyx
+++ b/python/pylibcudf/pylibcudf/datetime.pyx
@@ -29,6 +29,24 @@ from cython.operator cimport dereference
 
 from .column cimport Column
 
+__all__ = [
+    "DatetimeComponent",
+    "RoundingFrequency",
+    "add_calendrical_months",
+    "ceil_datetimes",
+    "day_of_year",
+    "days_in_month",
+    "extract_datetime_component",
+    "extract_microsecond_fraction",
+    "extract_millisecond_fraction",
+    "extract_nanosecond_fraction",
+    "extract_quarter",
+    "floor_datetimes",
+    "is_leap_year",
+    "last_day_of_month",
+    "round_datetimes",
+]
+
 cpdef Column extract_millisecond_fraction(
     Column input
 ):
diff --git a/python/pylibcudf/pylibcudf/experimental.pyi b/python/pylibcudf/pylibcudf/experimental.pyi
new file mode 100644
index 00000000000..bbfb86b0ff6
--- /dev/null
+++ b/python/pylibcudf/pylibcudf/experimental.pyi
@@ -0,0 +1,5 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+def enable_prefetching(key: str) -> None: ...
+def disable_prefetching(key: str) -> None: ...
+def prefetch_debugging(enable: bool) -> None: ...
diff --git a/python/pylibcudf/pylibcudf/experimental.pyx b/python/pylibcudf/pylibcudf/experimental.pyx
index b25a53e13b2..d94d6d087ac 100644
--- a/python/pylibcudf/pylibcudf/experimental.pyx
+++ b/python/pylibcudf/pylibcudf/experimental.pyx
@@ -5,6 +5,8 @@ from libcpp.string cimport string
 from pylibcudf.libcudf cimport experimental as cpp_experimental
 
 
+__all__ = ["disable_prefetching", "enable_prefetching", "prefetch_debugging"]
+
 cpdef enable_prefetching(str key):
     """Turn on prefetch instructions for the given key.
 
diff --git a/python/pylibcudf/pylibcudf/expressions.pyi b/python/pylibcudf/pylibcudf/expressions.pyi
new file mode 100644
index 00000000000..12b473d8605
--- /dev/null
+++ b/python/pylibcudf/pylibcudf/expressions.pyi
@@ -0,0 +1,79 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+from enum import IntEnum
+
+from pylibcudf.scalar import Scalar
+
+class TableReference(IntEnum):
+    LEFT = ...
+    RIGHT = ...
+
+class ASTOperator(IntEnum):
+    ADD = ...
+    SUB = ...
+    MUL = ...
+    DIV = ...
+    TRUE_DIV = ...
+    FLOOR_DIV = ...
+    MOD = ...
+    PYMOD = ...
+    POW = ...
+    EQUAL = ...
+    NULL_EQUAL = ...
+    NOT_EQUAL = ...
+    LESS = ...
+    GREATER = ...
+    LESS_EQUAL = ...
+    GREATER_EQUAL = ...
+    BITWISE_AND = ...
+    BITWISE_OR = ...
+    BITWISE_XOR = ...
+    NULL_LOGICAL_AND = ...
+    LOGICAL_AND = ...
+    NULL_LOGICAL_OR = ...
+    LOGICAL_OR = ...
+    IDENTITY = ...
+    IS_NULL = ...
+    SIN = ...
+    COS = ...
+    TAN = ...
+    ARCSIN = ...
+    ARCCOS = ...
+    ARCTAN = ...
+    SINH = ...
+    COSH = ...
+    TANH = ...
+    ARCSINH = ...
+    ARCCOSH = ...
+    ARCTANH = ...
+    EXP = ...
+    LOG = ...
+    SQRT = ...
+    CBRT = ...
+    CEIL = ...
+    FLOOR = ...
+    ABS = ...
+    RINT = ...
+    BIT_INVERT = ...
+    NOT = ...
+
+class Expression:
+    def __init__(self): ...
+
+class Literal(Expression):
+    def __init__(self, value: Scalar): ...
+
+class ColumnReference(Expression):
+    def __init__(
+        self, index: int, table_source: TableReference = TableReference.LEFT
+    ): ...
+
+class ColumnNameReference(Expression):
+    def __init__(self, name: str): ...
+
+class Operation(Expression):
+    def __init__(
+        self,
+        op: ASTOperator,
+        left: Expression,
+        right: Expression | None = None,
+    ): ...
diff --git a/python/pylibcudf/pylibcudf/expressions.pyx b/python/pylibcudf/pylibcudf/expressions.pyx
index 1535f68366b..0f12cfe313c 100644
--- a/python/pylibcudf/pylibcudf/expressions.pyx
+++ b/python/pylibcudf/pylibcudf/expressions.pyx
@@ -49,6 +49,16 @@ from .types cimport DataType
 # Aliases for simplicity
 ctypedef unique_ptr[libcudf_exp.expression] expression_ptr
 
+__all__ = [
+    "ASTOperator",
+    "ColumnNameReference",
+    "ColumnReference",
+    "Expression",
+    "Literal",
+    "Operation",
+    "TableReference",
+]
+
 # Define this class just to have a docstring for it
 cdef class Expression:
     """
@@ -58,7 +68,7 @@ cdef class Expression:
 
     For details, see :cpp:class:`cudf::ast::expression`.
     """
-    pass
+    __hash__ = None
 
 cdef class Literal(Expression):
     """
diff --git a/python/pylibcudf/pylibcudf/filling.pyi b/python/pylibcudf/pylibcudf/filling.pyi
new file mode 100644
index 00000000000..0b5e29bdc32
--- /dev/null
+++ b/python/pylibcudf/pylibcudf/filling.pyi
@@ -0,0 +1,17 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+from pylibcudf.column import Column
+from pylibcudf.scalar import Scalar
+from pylibcudf.table import Table
+
+def fill(
+    destination: Column, begin: int, end: int, value: Scalar
+) -> Column: ...
+def fill_in_place(
+    destination: Column, begin: int, end: int, value: Scalar
+) -> None: ...
+def sequence(size: int, init: Scalar, step: Scalar) -> Column: ...
+def repeat(input_table: Table, count: Column | int) -> Table: ...
+def calendrical_month_sequence(
+    n: int, init: Scalar, months: int
+) -> Column: ...
diff --git a/python/pylibcudf/pylibcudf/filling.pyx b/python/pylibcudf/pylibcudf/filling.pyx
index 313605ead16..ea5b45ff7c2 100644
--- a/python/pylibcudf/pylibcudf/filling.pyx
+++ b/python/pylibcudf/pylibcudf/filling.pyx
@@ -19,6 +19,14 @@ from .scalar cimport Scalar
 from .table cimport Table
 
 
+__all__ = [
+    "fill",
+    "fill_in_place",
+    "repeat",
+    "sequence",
+    "calendrical_month_sequence",
+]
+
 cpdef Column fill(
     Column destination,
     size_type begin,
diff --git a/python/pylibcudf/pylibcudf/gpumemoryview.pyi b/python/pylibcudf/pylibcudf/gpumemoryview.pyi
new file mode 100644
index 00000000000..50f1f39a515
--- /dev/null
+++ b/python/pylibcudf/pylibcudf/gpumemoryview.pyi
@@ -0,0 +1,9 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+from collections.abc import Mapping
+from typing import Any
+
+class gpumemoryview:
+    def __init__(self, data: Any): ...
+    @property
+    def __cuda_array_interface__(self) -> Mapping[str, Any]: ...
diff --git a/python/pylibcudf/pylibcudf/gpumemoryview.pyx b/python/pylibcudf/pylibcudf/gpumemoryview.pyx
index 0904022a944..41316eddb60 100644
--- a/python/pylibcudf/pylibcudf/gpumemoryview.pyx
+++ b/python/pylibcudf/pylibcudf/gpumemoryview.pyx
@@ -1,5 +1,6 @@
 # Copyright (c) 2023-2024, NVIDIA CORPORATION.
 
+__all__ = ["gpumemoryview"]
 
 cdef class gpumemoryview:
     """Minimal representation of a memory buffer.
@@ -25,3 +26,5 @@ cdef class gpumemoryview:
     @property
     def __cuda_array_interface__(self):
         return self.obj.__cuda_array_interface__
+
+    __hash__ = None
diff --git a/python/pylibcudf/pylibcudf/groupby.pyi b/python/pylibcudf/pylibcudf/groupby.pyi
new file mode 100644
index 00000000000..883ad6e34cf
--- /dev/null
+++ b/python/pylibcudf/pylibcudf/groupby.pyi
@@ -0,0 +1,38 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+from pylibcudf.aggregation import Aggregation
+from pylibcudf.column import Column
+from pylibcudf.replace import ReplacePolicy
+from pylibcudf.scalar import Scalar
+from pylibcudf.table import Table
+from pylibcudf.types import NullOrder, NullPolicy, Order, Sorted
+
+class GroupByRequest:
+    def __init__(
+        self, values: Column, aggregations: list[Aggregation]
+    ) -> None: ...
+
+class GroupBy:
+    def __init__(
+        self,
+        keys: Table,
+        null_handling: NullPolicy = NullPolicy.EXCLUDE,
+        keys_are_sorted: Sorted = Sorted.NO,
+        column_order: list[Order] | None = None,
+        null_precedence: list[NullOrder] | None = None,
+    ) -> None: ...
+    def aggregate(
+        self, requests: list[GroupByRequest]
+    ) -> tuple[Table, list[Table]]: ...
+    def scan(
+        self, requests: list[GroupByRequest]
+    ) -> tuple[Table, list[Table]]: ...
+    def shift(
+        self, values: Table, offset: list[int], fill_values: list[Scalar]
+    ) -> tuple[Table, Table]: ...
+    def replace_nulls(
+        self, value: Table, replace_policies: list[ReplacePolicy]
+    ) -> tuple[Table, Table]: ...
+    def get_groups(
+        self, values: Table | None = None
+    ) -> tuple[list[int], Table, Table]: ...
diff --git a/python/pylibcudf/pylibcudf/groupby.pyx b/python/pylibcudf/pylibcudf/groupby.pyx
index 71f9ecb0453..e6cb3ac81a7 100644
--- a/python/pylibcudf/pylibcudf/groupby.pyx
+++ b/python/pylibcudf/pylibcudf/groupby.pyx
@@ -25,6 +25,8 @@ from .types cimport null_order, null_policy, order, sorted
 from .utils cimport _as_vector
 
 
+__all__ = ["GroupBy", "GroupByRequest"]
+
 cdef class GroupByRequest:
     """A request for a groupby aggregation or scan.
 
@@ -45,6 +47,8 @@ cdef class GroupByRequest:
         self._values = values
         self._aggregations = aggregations
 
+    __hash__ = None
+
     cdef aggregation_request _to_libcudf_agg_request(self) except *:
         """Convert to a libcudf aggregation_request object.
 
@@ -127,6 +131,8 @@ cdef class GroupBy:
         # deallocated from under us:
         self._keys = keys
 
+    __hash__ = None
+
     @staticmethod
     cdef tuple _parse_outputs(
         pair[unique_ptr[table], vector[aggregation_result]] c_res
diff --git a/python/pylibcudf/pylibcudf/hashing.pyi b/python/pylibcudf/pylibcudf/hashing.pyi
new file mode 100644
index 00000000000..a849f5d0729
--- /dev/null
+++ b/python/pylibcudf/pylibcudf/hashing.pyi
@@ -0,0 +1,18 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+from typing import Final
+
+from pylibcudf.column import Column
+from pylibcudf.table import Table
+
+LIBCUDF_DEFAULT_HASH_SEED: Final[int]
+
+def murmurhash3_x86_32(input: Table, seed: int = ...) -> Column: ...
+def murmurhash3_x64_128(input: Table, seed: int = ...) -> Table: ...
+def xxhash_64(input: Table, seed: int = ...) -> Column: ...
+def md5(input: Table) -> Column: ...
+def sha1(input: Table) -> Column: ...
+def sha224(input: Table) -> Column: ...
+def sha256(input: Table) -> Column: ...
+def sha384(input: Table) -> Column: ...
+def sha512(input: Table) -> Column: ...
diff --git a/python/pylibcudf/pylibcudf/hashing.pyx b/python/pylibcudf/pylibcudf/hashing.pyx
index 9ea3d4d1bda..548cffc0ce8 100644
--- a/python/pylibcudf/pylibcudf/hashing.pyx
+++ b/python/pylibcudf/pylibcudf/hashing.pyx
@@ -20,6 +20,19 @@ from pylibcudf.libcudf.table.table cimport table
 from .column cimport Column
 from .table cimport Table
 
+__all__ = [
+    "LIBCUDF_DEFAULT_HASH_SEED",
+    "md5",
+    "murmurhash3_x64_128",
+    "murmurhash3_x86_32",
+    "sha1",
+    "sha224",
+    "sha256",
+    "sha384",
+    "sha512",
+    "xxhash_64",
+]
+
 LIBCUDF_DEFAULT_HASH_SEED = DEFAULT_HASH_SEED
 
 cpdef Column murmurhash3_x86_32(
diff --git a/python/pylibcudf/pylibcudf/interop.pyi b/python/pylibcudf/pylibcudf/interop.pyi
new file mode 100644
index 00000000000..63de816010b
--- /dev/null
+++ b/python/pylibcudf/pylibcudf/interop.pyi
@@ -0,0 +1,52 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+from collections.abc import Iterable, Mapping
+from dataclasses import dataclass
+from typing import Any, overload
+
+import pyarrow as pa
+
+from pylibcudf.column import Column
+from pylibcudf.scalar import Scalar
+from pylibcudf.table import Table
+from pylibcudf.types import DataType
+
+@dataclass
+class ColumnMetadata:
+    name: str = ...
+    children_meta: list[ColumnMetadata] = ...
+
+@overload
+def from_arrow(obj: pa.DataType) -> DataType: ...
+@overload
+def from_arrow(
+    obj: pa.Scalar[Any], *, data_type: DataType | None = None
+) -> Scalar: ...
+@overload
+def from_arrow(obj: pa.Array[Any]) -> Column: ...
+@overload
+def from_arrow(obj: pa.Table) -> Table: ...
+@overload
+def to_arrow(
+    obj: DataType,
+    *,
+    precision: int | None = None,
+    fields: Iterable[pa.Field[pa.DataType] | tuple[str, pa.DataType]]
+    | Mapping[str, pa.DataType]
+    | None = None,
+    value_type: pa.DataType | None = None,
+) -> pa.DataType: ...
+@overload
+def to_arrow(
+    obj: Table, metadata: list[ColumnMetadata | str] | None = None
+) -> pa.Table: ...
+@overload
+def to_arrow(
+    obj: Column, metadata: ColumnMetadata | str | None = None
+) -> pa.Array[Any]: ...
+@overload
+def to_arrow(
+    obj: Scalar, metadata: ColumnMetadata | str | None = None
+) -> pa.Scalar[Any]: ...
+def from_dlpack(managed_tensor: Any) -> Table: ...
+def to_dlpack(input: Table) -> Any: ...
diff --git a/python/pylibcudf/pylibcudf/interop.pyx b/python/pylibcudf/pylibcudf/interop.pyx
index 61e812353b7..bd5397ac328 100644
--- a/python/pylibcudf/pylibcudf/interop.pyx
+++ b/python/pylibcudf/pylibcudf/interop.pyx
@@ -38,6 +38,14 @@ from .scalar cimport Scalar
 from .table cimport Table
 from .types cimport DataType, type_id
 
+__all__ = [
+    "ColumnMetadata",
+    "from_arrow",
+    "from_dlpack",
+    "to_arrow",
+    "to_dlpack",
+]
+
 ARROW_TO_PYLIBCUDF_TYPES = {
     pa.int8(): type_id.INT8,
     pa.int16(): type_id.INT16,
diff --git a/python/pylibcudf/pylibcudf/io/__init__.py b/python/pylibcudf/pylibcudf/io/__init__.py
index 9e8e0f6e080..f913a400684 100644
--- a/python/pylibcudf/pylibcudf/io/__init__.py
+++ b/python/pylibcudf/pylibcudf/io/__init__.py
@@ -13,3 +13,19 @@
     types,
 )
 from .types import SinkInfo, SourceInfo, TableWithMetadata
+
+__all__ = [
+    "SinkInfo",
+    "SourceInfo",
+    "TableWithMetadata",
+    "avro",
+    "csv",
+    "datasource",
+    "json",
+    "orc",
+    "parquet",
+    "parquet_metadata",
+    "text",
+    "timezone",
+    "types",
+]
diff --git a/python/pylibcudf/pylibcudf/io/avro.pyi b/python/pylibcudf/pylibcudf/io/avro.pyi
new file mode 100644
index 00000000000..49c2f083702
--- /dev/null
+++ b/python/pylibcudf/pylibcudf/io/avro.pyi
@@ -0,0 +1,11 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+from pylibcudf.io.types import SourceInfo, TableWithMetadata
+
+__all__ = ["read_avro"]
+
+def read_avro(
+    source_info: SourceInfo,
+    columns: list[str] | None = None,
+    skip_rows: int = 0,
+    num_rows: int = -1,
+) -> TableWithMetadata: ...
diff --git a/python/pylibcudf/pylibcudf/io/avro.pyx b/python/pylibcudf/pylibcudf/io/avro.pyx
index fe765b34f82..4271333511a 100644
--- a/python/pylibcudf/pylibcudf/io/avro.pyx
+++ b/python/pylibcudf/pylibcudf/io/avro.pyx
@@ -10,6 +10,8 @@ from pylibcudf.libcudf.io.avro cimport (
 )
 from pylibcudf.libcudf.types cimport size_type
 
+__all__ = ["read_avro"]
+
 
 cpdef TableWithMetadata read_avro(
     SourceInfo source_info,
diff --git a/python/pylibcudf/pylibcudf/io/csv.pyi b/python/pylibcudf/pylibcudf/io/csv.pyi
new file mode 100644
index 00000000000..356825a927d
--- /dev/null
+++ b/python/pylibcudf/pylibcudf/io/csv.pyi
@@ -0,0 +1,54 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+from collections.abc import Mapping
+
+from pylibcudf.io.types import (
+    CompressionType,
+    QuoteStyle,
+    SourceInfo,
+    TableWithMetadata,
+)
+from pylibcudf.types import DataType
+
+def read_csv(
+    source_info: SourceInfo,
+    *,
+    compression: CompressionType = CompressionType.AUTO,
+    byte_range_offset: int = 0,
+    byte_range_size: int = 0,
+    col_names: list[str] | None = None,
+    prefix: str = "",
+    mangle_dupe_cols: bool = True,
+    usecols: list[int] | list[str] | None = None,
+    nrows: int = -1,
+    skiprows: int = 0,
+    skipfooter: int = 0,
+    header: int = 0,
+    lineterminator: str = "\n",
+    delimiter: str | None = None,
+    thousands: str | None = None,
+    decimal: str = ".",
+    comment: str | None = None,
+    delim_whitespace: bool = False,
+    skipinitialspace: bool = False,
+    skip_blank_lines: bool = True,
+    quoting: QuoteStyle = QuoteStyle.MINIMAL,
+    quotechar: str = '"',
+    doublequote: bool = True,
+    parse_dates: list[str] | list[int] | None = None,
+    parse_hex: list[str] | list[int] | None = None,
+    # Technically this should be dict/list
+    # but using a fused type prevents using None as default
+    dtypes: Mapping[str, DataType] | list[DataType] | None = None,
+    true_values: list[str] | None = None,
+    false_values: list[str] | None = None,
+    na_values: list[str] | None = None,
+    keep_default_na: bool = True,
+    na_filter: bool = True,
+    dayfirst: bool = False,
+    # Note: These options are supported by the libcudf reader
+    # but are not exposed here since there is no demand for them
+    # on the Python side yet.
+    # detect_whitespace_around_quotes: bool = False,
+    # timestamp_type: DataType = DataType(type_id.EMPTY),
+) -> TableWithMetadata: ...
diff --git a/python/pylibcudf/pylibcudf/io/csv.pyx b/python/pylibcudf/pylibcudf/io/csv.pyx
index 2c61cc42d82..858e580ab34 100644
--- a/python/pylibcudf/pylibcudf/io/csv.pyx
+++ b/python/pylibcudf/pylibcudf/io/csv.pyx
@@ -19,6 +19,8 @@ from pylibcudf.libcudf.types cimport data_type, size_type
 from pylibcudf.types cimport DataType
 
 
+__all__ = ["read_csv"]
+
 cdef tuple _process_parse_dates_hex(list cols):
     cdef vector[string] str_cols
     cdef vector[int] int_cols
diff --git a/python/pylibcudf/pylibcudf/io/datasource.pyi b/python/pylibcudf/pylibcudf/io/datasource.pyi
new file mode 100644
index 00000000000..e52197f793b
--- /dev/null
+++ b/python/pylibcudf/pylibcudf/io/datasource.pyi
@@ -0,0 +1,4 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+class Datasource:
+    def __init__(self): ...
diff --git a/python/pylibcudf/pylibcudf/io/datasource.pyx b/python/pylibcudf/pylibcudf/io/datasource.pyx
index 02418444caa..aac1c0d1014 100644
--- a/python/pylibcudf/pylibcudf/io/datasource.pyx
+++ b/python/pylibcudf/pylibcudf/io/datasource.pyx
@@ -2,8 +2,10 @@
 
 from pylibcudf.libcudf.io.datasource cimport datasource
 
+__all__ = ["Datasource"]
 
 cdef class Datasource:
+    __hash__ = None
     cdef datasource* get_datasource(self) except * nogil:
         with gil:
             raise NotImplementedError("get_datasource() should not "
diff --git a/python/pylibcudf/pylibcudf/io/json.pyi b/python/pylibcudf/pylibcudf/io/json.pyi
new file mode 100644
index 00000000000..b2bc6a43700
--- /dev/null
+++ b/python/pylibcudf/pylibcudf/io/json.pyi
@@ -0,0 +1,50 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+from collections.abc import Mapping
+from typing import TypeAlias
+
+from pylibcudf.column import Column
+from pylibcudf.io.types import (
+    CompressionType,
+    JSONRecoveryMode,
+    SinkInfo,
+    SourceInfo,
+    TableWithMetadata,
+)
+from pylibcudf.types import DataType
+
+ChildNameToTypeMap: TypeAlias = Mapping[str, ChildNameToTypeMap]
+
+NameAndType: TypeAlias = tuple[str, DataType, list[NameAndType]]
+
+def read_json(
+    source_info: SourceInfo,
+    dtypes: list[NameAndType] | None = None,
+    compression: CompressionType = CompressionType.AUTO,
+    lines: bool = False,
+    byte_range_offset: int = 0,
+    byte_range_size: int = 0,
+    keep_quotes: bool = False,
+    mixed_types_as_string: bool = False,
+    prune_columns: bool = False,
+    recovery_mode: JSONRecoveryMode = JSONRecoveryMode.FAIL,
+) -> TableWithMetadata: ...
+def write_json(
+    sink_info: SinkInfo,
+    table_w_meta: TableWithMetadata,
+    na_rep: str = "",
+    include_nulls: bool = False,
+    lines: bool = False,
+    rows_per_chunk: int = 2**31 - 1,  # size_type max; confirm in .pyx
+    true_value: str = "true",
+    false_value: str = "false",
+) -> None: ...
+def chunked_read_json(
+    source_info: SourceInfo,
+    dtypes: list[NameAndType] | None = None,
+    compression: CompressionType = CompressionType.AUTO,
+    keep_quotes: bool = False,
+    mixed_types_as_string: bool = False,
+    prune_columns: bool = False,
+    recovery_mode: JSONRecoveryMode = JSONRecoveryMode.FAIL,
+    chunk_size: int = 100_000_000,
+) -> tuple[list[Column], list[str], ChildNameToTypeMap]: ...
diff --git a/python/pylibcudf/pylibcudf/io/json.pyx b/python/pylibcudf/pylibcudf/io/json.pyx
index 65f78f830f1..ad2989925c9 100644
--- a/python/pylibcudf/pylibcudf/io/json.pyx
+++ b/python/pylibcudf/pylibcudf/io/json.pyx
@@ -23,6 +23,7 @@ from pylibcudf.libcudf.io.types cimport (
 from pylibcudf.libcudf.types cimport data_type, size_type
 from pylibcudf.types cimport DataType
 
+__all__ = ["chunked_read_json", "read_json", "write_json"]
 
 cdef map[string, schema_element] _generate_schema_map(list dtypes):
     cdef map[string, schema_element] schema_map
diff --git a/python/pylibcudf/pylibcudf/io/orc.pyi b/python/pylibcudf/pylibcudf/io/orc.pyi
new file mode 100644
index 00000000000..4cf87f1a832
--- /dev/null
+++ b/python/pylibcudf/pylibcudf/io/orc.pyi
@@ -0,0 +1,41 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+from typing import Any
+
+from pylibcudf.io.types import SourceInfo, TableWithMetadata
+from pylibcudf.types import DataType
+
+def read_orc(
+    source_info: SourceInfo,
+    columns: list[str] | None = None,
+    stripes: list[list[int]] | None = None,
+    skip_rows: int = 0,
+    nrows: int = -1,
+    use_index: bool = True,
+    use_np_dtypes: bool = True,
+    timestamp_type: DataType | None = None,
+    decimal128_columns: list[str] | None = None,
+) -> TableWithMetadata: ...
+
+class OrcColumnStatistics:
+    def __init__(self): ...
+    @property
+    def number_of_values(self) -> int | None: ...
+    @property
+    def has_null(self) -> bool | None: ...
+    def __getitem__(self, item: str) -> Any: ...
+    def __contains__(self, item: str) -> bool: ...
+    def get[T](self, item: str, default: None | T = None) -> T | None: ...
+
+class ParsedOrcStatistics:
+    def __init__(self): ...
+    @property
+    def column_names(self) -> list[str]: ...
+    @property
+    def file_stats(self) -> list[OrcColumnStatistics]: ...
+    @property
+    def stripes_stats(self) -> list[OrcColumnStatistics]: ...
+
+def read_parsed_orc_statistics(
+    source_info: SourceInfo,
+) -> ParsedOrcStatistics: ...
diff --git a/python/pylibcudf/pylibcudf/io/orc.pyx b/python/pylibcudf/pylibcudf/io/orc.pyx
index 70e0a7995a2..4270f5b4f95 100644
--- a/python/pylibcudf/pylibcudf/io/orc.pyx
+++ b/python/pylibcudf/pylibcudf/io/orc.pyx
@@ -30,6 +30,12 @@ from pylibcudf.libcudf.types cimport size_type
 from pylibcudf.types cimport DataType
 from pylibcudf.variant cimport get_if, holds_alternative
 
+__all__ = [
+    "OrcColumnStatistics",
+    "ParsedOrcStatistics",
+    "read_orc",
+    "read_parsed_orc_statistics",
+]
 
 cdef class OrcColumnStatistics:
     def __init__(self):
@@ -39,6 +45,8 @@ cdef class OrcColumnStatistics:
             "use `OrcColumnStatistics.from_libcudf` instead."
         )
 
+    __hash__ = None
+
     @property
     def number_of_values(self):
         if self.number_of_values_c.has_value():
@@ -183,6 +191,8 @@ cdef class OrcColumnStatistics:
 
 cdef class ParsedOrcStatistics:
 
+    __hash__ = None
+
     @property
     def column_names(self):
         return [name.decode() for name in self.c_obj.column_names]
diff --git a/python/pylibcudf/pylibcudf/io/parquet.pyi b/python/pylibcudf/pylibcudf/io/parquet.pyi
new file mode 100644
index 00000000000..bcf1d1cce09
--- /dev/null
+++ b/python/pylibcudf/pylibcudf/io/parquet.pyi
@@ -0,0 +1,36 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+from pylibcudf.expressions import Expression
+from pylibcudf.io.types import SourceInfo, TableWithMetadata
+
+class ChunkedParquetReader:
+    def __init__(
+        self,
+        source_info: SourceInfo,
+        columns: list[str] | None = None,
+        row_groups: list[list[int]] | None = None,
+        use_pandas_metadata: bool = True,
+        convert_strings_to_categories: bool = False,
+        skip_rows: int = 0,
+        nrows: int = 0,
+        chunk_read_limit: int = 0,
+        pass_read_limit: int = 1024000000,
+        allow_mismatched_pq_schemas: bool = False,
+    ) -> None: ...
+    def has_next(self) -> bool: ...
+    def read_chunk(self) -> TableWithMetadata: ...
+
+def read_parquet(
+    source_info: SourceInfo,
+    columns: list[str] | None = None,
+    row_groups: list[list[int]] | None = None,
+    filters: Expression | None = None,
+    convert_strings_to_categories: bool = False,
+    use_pandas_metadata: bool = True,
+    skip_rows: int = 0,
+    nrows: int = -1,
+    allow_mismatched_pq_schemas: bool = False,
+    # Disabled: see the comment in parquet.pyx for details.
+    # reader_column_schema: ReaderColumnSchema = *,
+    # timestamp_type: DataType = *
+) -> TableWithMetadata: ...
diff --git a/python/pylibcudf/pylibcudf/io/parquet.pyx b/python/pylibcudf/pylibcudf/io/parquet.pyx
index 981ca7b8159..b76a352d633 100644
--- a/python/pylibcudf/pylibcudf/io/parquet.pyx
+++ b/python/pylibcudf/pylibcudf/io/parquet.pyx
@@ -16,6 +16,8 @@ from pylibcudf.libcudf.io.parquet cimport (
 from pylibcudf.libcudf.io.types cimport table_with_metadata
 from pylibcudf.libcudf.types cimport size_type
 
+__all__ = ["ChunkedParquetReader", "read_parquet"]
+
 
 cdef parquet_reader_options _setup_parquet_reader_options(
     SourceInfo source_info,
@@ -123,6 +125,8 @@ cdef class ChunkedParquetReader:
                 )
             )
 
+    __hash__ = None
+
     cpdef bool has_next(self):
         """
         Returns True if there is another chunk in the Parquet file
diff --git a/python/pylibcudf/pylibcudf/io/parquet_metadata.pyx b/python/pylibcudf/pylibcudf/io/parquet_metadata.pyx
index 352905ff0f8..0ad4dafb0cf 100644
--- a/python/pylibcudf/pylibcudf/io/parquet_metadata.pyx
+++ b/python/pylibcudf/pylibcudf/io/parquet_metadata.pyx
@@ -4,6 +4,13 @@ from pylibcudf.io.types cimport SourceInfo
 from pylibcudf.libcudf.io cimport parquet_metadata as cpp_parquet_metadata
 
 
+__all__ = [
+    "ParquetColumnSchema",
+    "ParquetMetadata",
+    "ParquetSchema",
+    "read_parquet_metadata",
+]
+
 cdef class ParquetColumnSchema:
     """
     Schema of a parquet column, including the nested columns.
@@ -164,7 +171,7 @@ cdef class ParquetMetadata:
 
         Returns
         -------
-        dict[bytes, bytes]
+        dict[str, str]
             Key value metadata as a map.
         """
         return {key.decode(): val.decode() for key, val in self.meta.metadata()}
diff --git a/python/pylibcudf/pylibcudf/io/text.pyx b/python/pylibcudf/pylibcudf/io/text.pyx
index 667a054baaa..d3cbdc4cd60 100644
--- a/python/pylibcudf/pylibcudf/io/text.pyx
+++ b/python/pylibcudf/pylibcudf/io/text.pyx
@@ -10,6 +10,15 @@ from pylibcudf.column cimport Column
 from pylibcudf.libcudf.column.column cimport column
 from pylibcudf.libcudf.io cimport text as cpp_text
 
+__all__ = [
+    "DataChunkSource",
+    "ParseOptions",
+    "make_source",
+    "make_source_from_bgzip_file",
+    "make_source_from_file",
+    "multibyte_split",
+]
+
 cdef class ParseOptions:
     """
     Parsing options for `multibyte_split`
diff --git a/python/pylibcudf/pylibcudf/io/timezone.pyi b/python/pylibcudf/pylibcudf/io/timezone.pyi
new file mode 100644
index 00000000000..0582800c4af
--- /dev/null
+++ b/python/pylibcudf/pylibcudf/io/timezone.pyi
@@ -0,0 +1,7 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+from pylibcudf.table import Table
+
+def make_timezone_transition_table(
+    tzif_dir: str, timezone_name: str
+) -> Table: ...
diff --git a/python/pylibcudf/pylibcudf/io/timezone.pyx b/python/pylibcudf/pylibcudf/io/timezone.pyx
index f120b65fb2c..af7cf8a4ee5 100644
--- a/python/pylibcudf/pylibcudf/io/timezone.pyx
+++ b/python/pylibcudf/pylibcudf/io/timezone.pyx
@@ -11,6 +11,7 @@ from pylibcudf.libcudf.table.table cimport table
 
 from ..table cimport Table
 
+__all__ = ["make_timezone_transition_table"]
 
 cpdef Table make_timezone_transition_table(str tzif_dir, str timezone_name):
     """
diff --git a/python/pylibcudf/pylibcudf/io/types.pyi b/python/pylibcudf/pylibcudf/io/types.pyi
new file mode 100644
index 00000000000..a4f4fc13bdc
--- /dev/null
+++ b/python/pylibcudf/pylibcudf/io/types.pyi
@@ -0,0 +1,97 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+import io
+import os
+from collections.abc import Mapping
+from enum import IntEnum
+from typing import Any, Literal, TypeAlias, overload
+
+from pylibcudf.column import Column
+from pylibcudf.io.datasource import Datasource
+from pylibcudf.table import Table
+
+class JSONRecoveryMode(IntEnum):
+    FAIL = ...
+    RECOVER_WITH_NULL = ...
+
+class CompressionType(IntEnum):
+    NONE = ...
+    AUTO = ...
+    SNAPPY = ...
+    GZIP = ...
+    BZIP2 = ...
+    BROTLI = ...
+    ZIP = ...
+    XZ = ...
+    ZLIB = ...
+    LZ4 = ...
+    LZO = ...
+    ZSTD = ...
+
+class ColumnEncoding(IntEnum):
+    USE_DEFAULT = ...
+    DICTIONARY = ...
+    PLAIN = ...
+    DELTA_BINARY_PACKED = ...
+    DELTA_LENGTH_BYTE_ARRAY = ...
+    DELTA_BYTE_ARRAY = ...
+    BYTE_STREAM_SPLIT = ...
+    DIRECT = ...
+    DIRECT_V2 = ...
+    DICTIONARY_V2 = ...
+
+class DictionaryPolicy(IntEnum):
+    NEVER = ...
+    ADAPTIVE = ...
+    ALWAYS = ...
+
+class StatisticsFreq(IntEnum):
+    STATISTICS_NONE = ...
+    STATISTICS_ROWGROUP = ...
+    STATISTICS_PAGE = ...
+    STATISTICS_COLUMN = ...
+
+class QuoteStyle(IntEnum):
+    MINIMAL = ...
+    ALL = ...
+    NONNUMERIC = ...
+    NONE = ...
+
+ColumnNameSpec: TypeAlias = tuple[str, list[ColumnNameSpec]]
+ChildNameSpec: TypeAlias = Mapping[str, ChildNameSpec]
+
+class TableWithMetadata:
+    tbl: Table
+    def __init__(
+        self, tbl: Table, column_names: list[ColumnNameSpec]
+    ) -> None: ...
+    @property
+    def columns(self) -> list[Column]: ...
+    @overload
+    def column_names(self, include_children: Literal[False]) -> list[str]: ...
+    @overload
+    def column_names(
+        self, include_children: Literal[True]
+    ) -> list[ColumnNameSpec]: ...
+    @overload
+    def column_names(
+        self, include_children: bool = False
+    ) -> list[str] | list[ColumnNameSpec]: ...
+    @property
+    def child_names(self) -> ChildNameSpec: ...
+    @property
+    def per_file_user_data(self) -> list[Mapping[str, str]]: ...
+
+class SourceInfo:
+    def __init__(
+        self, sources: list[str] | list[os.PathLike[Any]] | list[Datasource]
+    ) -> None: ...
+
+class SinkInfo:
+    def __init__(
+        self,
+        sinks: list[os.PathLike[Any]]
+        | list[io.StringIO]
+        | list[io.BytesIO]
+        | list[io.TextIOBase]
+        | list[str],
+    ) -> None: ...
diff --git a/python/pylibcudf/pylibcudf/io/types.pyx b/python/pylibcudf/pylibcudf/io/types.pyx
index c129903f8f1..5db4eeb9583 100644
--- a/python/pylibcudf/pylibcudf/io/types.pyx
+++ b/python/pylibcudf/pylibcudf/io/types.pyx
@@ -28,9 +28,21 @@ from pylibcudf.libcudf.io.types import (
     compression_type as CompressionType,  # no-cython-lint
     column_encoding as ColumnEncoding,  # no-cython-lint
     dictionary_policy as DictionaryPolicy,  # no-cython-lint
+    quote_style as QuoteStyle,  # no-cython-lint
     statistics_freq as StatisticsFreq, # no-cython-lint
 )
 
+__all__ = [
+    "ColumnEncoding",
+    "CompressionType",
+    "DictionaryPolicy",
+    "JSONRecoveryMode",
+    "QuoteStyle",
+    "SinkInfo",
+    "SourceInfo",
+    "StatisticsFreq",
+    "TableWithMetadata",
+]
 
 cdef class TableWithMetadata:
     """A container holding a table and its associated metadata
@@ -54,6 +66,8 @@ cdef class TableWithMetadata:
 
         self.metadata.schema_info = self._make_column_info(column_names)
 
+    __hash__ = None
+
     cdef vector[column_name_info] _make_column_info(self, list column_names):
         cdef vector[column_name_info] col_name_infos
         cdef column_name_info info
@@ -219,6 +233,8 @@ cdef class SourceInfo:
 
         self.c_obj = source_info(c_host_buffers)
 
+    __hash__ = None
+
 
 # Adapts a python io.IOBase object as a libcudf IO data_sink. This lets you
 # write from cudf to any python file-like object (File/BytesIO/SocketIO etc)
@@ -301,3 +317,5 @@ cdef class SinkInfo:
         else:
             # we don't have sinks so we must have paths to sinks
             self.c_obj = sink_info(paths)
+
+    __hash__ = None
diff --git a/python/pylibcudf/pylibcudf/join.pyi b/python/pylibcudf/pylibcudf/join.pyi
new file mode 100644
index 00000000000..f34357baa67
--- /dev/null
+++ b/python/pylibcudf/pylibcudf/join.pyi
@@ -0,0 +1,78 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+from pylibcudf.column import Column
+from pylibcudf.expressions import Expression
+from pylibcudf.table import Table
+from pylibcudf.types import NullEquality
+
+def inner_join(
+    left_keys: Table, right_keys: Table, nulls_equal: NullEquality
+) -> tuple[Column, Column]: ...
+def left_join(
+    left_keys: Table, right_keys: Table, nulls_equal: NullEquality
+) -> tuple[Column, Column]: ...
+def full_join(
+    left_keys: Table, right_keys: Table, nulls_equal: NullEquality
+) -> tuple[Column, Column]: ...
+def left_semi_join(
+    left_keys: Table, right_keys: Table, nulls_equal: NullEquality
+) -> Column: ...
+def left_anti_join(
+    left_keys: Table, right_keys: Table, nulls_equal: NullEquality
+) -> Column: ...
+def cross_join(left: Table, right: Table) -> Table: ...
+def conditional_inner_join(
+    left: Table, right: Table, binary_predicate: Expression
+) -> tuple[Column, Column]: ...
+def conditional_left_join(
+    left: Table, right: Table, binary_predicate: Expression
+) -> tuple[Column, Column]: ...
+def conditional_full_join(
+    left: Table, right: Table, binary_predicate: Expression
+) -> tuple[Column, Column]: ...
+def conditional_left_semi_join(
+    left: Table, right: Table, binary_predicate: Expression
+) -> Column: ...
+def conditional_left_anti_join(
+    left: Table, right: Table, binary_predicate: Expression
+) -> Column: ...
+def mixed_inner_join(
+    left_keys: Table,
+    right_keys: Table,
+    left_conditional: Table,
+    right_conditional: Table,
+    binary_predicate: Expression,
+    nulls_equal: NullEquality,
+) -> tuple[Column, Column]: ...
+def mixed_left_join(
+    left_keys: Table,
+    right_keys: Table,
+    left_conditional: Table,
+    right_conditional: Table,
+    binary_predicate: Expression,
+    nulls_equal: NullEquality,
+) -> tuple[Column, Column]: ...
+def mixed_full_join(
+    left_keys: Table,
+    right_keys: Table,
+    left_conditional: Table,
+    right_conditional: Table,
+    binary_predicate: Expression,
+    nulls_equal: NullEquality,
+) -> tuple[Column, Column]: ...
+def mixed_left_semi_join(
+    left_keys: Table,
+    right_keys: Table,
+    left_conditional: Table,
+    right_conditional: Table,
+    binary_predicate: Expression,
+    nulls_equal: NullEquality,
+) -> Column: ...
+def mixed_left_anti_join(
+    left_keys: Table,
+    right_keys: Table,
+    left_conditional: Table,
+    right_conditional: Table,
+    binary_predicate: Expression,
+    nulls_equal: NullEquality,
+) -> Column: ...
diff --git a/python/pylibcudf/pylibcudf/join.pyx b/python/pylibcudf/pylibcudf/join.pyx
index 0d841eee194..c2efe05ffc4 100644
--- a/python/pylibcudf/pylibcudf/join.pyx
+++ b/python/pylibcudf/pylibcudf/join.pyx
@@ -15,6 +15,24 @@ from .column cimport Column
 from .expressions cimport Expression
 from .table cimport Table
 
+__all__ = [
+    "conditional_full_join",
+    "conditional_inner_join",
+    "conditional_left_anti_join",
+    "conditional_left_join",
+    "conditional_left_semi_join",
+    "cross_join",
+    "full_join",
+    "inner_join",
+    "left_anti_join",
+    "left_join",
+    "left_semi_join",
+    "mixed_full_join",
+    "mixed_inner_join",
+    "mixed_left_anti_join",
+    "mixed_left_join",
+    "mixed_left_semi_join",
+]
 
 cdef Column _column_from_gather_map(cpp_join.gather_map_type gather_map):
     # helper to convert a gather map to a Column
diff --git a/python/pylibcudf/pylibcudf/json.pyi b/python/pylibcudf/pylibcudf/json.pyi
new file mode 100644
index 00000000000..b93d4876dab
--- /dev/null
+++ b/python/pylibcudf/pylibcudf/json.pyi
@@ -0,0 +1,23 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+from pylibcudf.column import Column
+from pylibcudf.scalar import Scalar
+
+class GetJsonObjectOptions:
+    def __init__(
+        self,
+        *,
+        allow_single_quotes: bool = False,
+        strip_quotes_from_single_strings: bool = True,
+        missing_fields_as_nulls: bool = False,
+    ) -> None: ...
+    def get_allow_single_quotes(self) -> bool: ...
+    def get_strip_quotes_from_single_strings(self) -> bool: ...
+    def get_missing_fields_as_nulls(self) -> bool: ...
+    def set_allow_single_quotes(self, val: bool) -> None: ...
+    def set_strip_quotes_from_single_strings(self, val: bool) -> None: ...
+    def set_missing_fields_as_nulls(self, val: bool) -> None: ...
+
+def get_json_object(
+    col: Column, json_path: Scalar, options: GetJsonObjectOptions | None = None
+) -> Column: ...
diff --git a/python/pylibcudf/pylibcudf/json.pyx b/python/pylibcudf/pylibcudf/json.pyx
index ebb82f80408..5ec1e1be971 100644
--- a/python/pylibcudf/pylibcudf/json.pyx
+++ b/python/pylibcudf/pylibcudf/json.pyx
@@ -10,6 +10,7 @@ from pylibcudf.libcudf.column.column cimport column
 from pylibcudf.libcudf.scalar.scalar cimport string_scalar
 from pylibcudf.scalar cimport Scalar
 
+__all__ = ["GetJsonObjectOptions", "get_json_object"]
 
 cdef class GetJsonObjectOptions:
     """Settings for ``get_json_object()``"""
@@ -26,6 +27,8 @@ cdef class GetJsonObjectOptions:
         )
         self.set_missing_fields_as_nulls(missing_fields_as_nulls)
 
+    __hash__ = None
+
     def get_allow_single_quotes(self):
         """
         Returns true/false depending on whether single-quotes for representing strings
diff --git a/python/pylibcudf/pylibcudf/labeling.pxd b/python/pylibcudf/pylibcudf/labeling.pxd
index 6f8797ae7d3..b1f9f2e806d 100644
--- a/python/pylibcudf/pylibcudf/labeling.pxd
+++ b/python/pylibcudf/pylibcudf/labeling.pxd
@@ -8,7 +8,7 @@ from .column cimport Column
 cpdef Column label_bins(
     Column input,
     Column left_edges,
-    bool left_inclusive,
+    inclusive left_inclusive,
     Column right_edges,
-    bool right_inclusive
+    inclusive right_inclusive
 )
diff --git a/python/pylibcudf/pylibcudf/labeling.pyi b/python/pylibcudf/pylibcudf/labeling.pyi
new file mode 100644
index 00000000000..c3a75d10baf
--- /dev/null
+++ b/python/pylibcudf/pylibcudf/labeling.pyi
@@ -0,0 +1,17 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+from enum import IntEnum
+
+from pylibcudf.column import Column
+
+class Inclusive(IntEnum):
+    YES = ...
+    NO = ...
+
+def label_bins(
+    input: Column,
+    left_edges: Column,
+    left_inclusive: Inclusive,
+    right_edges: Column,
+    right_inclusive: Inclusive,
+) -> Column: ...
diff --git a/python/pylibcudf/pylibcudf/labeling.pyx b/python/pylibcudf/pylibcudf/labeling.pyx
index 226a9e14172..cae1830f6b9 100644
--- a/python/pylibcudf/pylibcudf/labeling.pyx
+++ b/python/pylibcudf/pylibcudf/labeling.pyx
@@ -10,13 +10,14 @@ from pylibcudf.libcudf.labeling import inclusive as Inclusive  # no-cython-lint
 
 from .column cimport Column
 
+__all__ = ["Inclusive", "label_bins"]
 
 cpdef Column label_bins(
     Column input,
     Column left_edges,
-    bool left_inclusive,
+    inclusive left_inclusive,
     Column right_edges,
-    bool right_inclusive
+    inclusive right_inclusive
 ):
     """Labels elements based on membership in the specified bins.
 
@@ -28,11 +29,11 @@ cpdef Column label_bins(
         Column of input elements to label according to the specified bins.
     left_edges : Column
         Column of the left edge of each bin.
-    left_inclusive : bool
+    left_inclusive : Inclusive
         Whether or not the left edge is inclusive.
     right_edges : Column
         Column of the right edge of each bin.
-    right_inclusive : bool
+    right_inclusive : Inclusive
         Whether or not the right edge is inclusive.
 
     Returns
@@ -42,24 +43,13 @@ cpdef Column label_bins(
         according to the specified bins.
     """
     cdef unique_ptr[column] c_result
-    cdef inclusive c_left_inclusive = (
-        inclusive.YES
-        if left_inclusive
-        else inclusive.NO
-    )
-    cdef inclusive c_right_inclusive = (
-        inclusive.YES
-        if right_inclusive
-        else inclusive.NO
-    )
-
     with nogil:
         c_result = cpp_labeling.label_bins(
             input.view(),
             left_edges.view(),
-            c_left_inclusive,
+            left_inclusive,
             right_edges.view(),
-            c_right_inclusive,
+            right_inclusive,
         )
 
     return Column.from_libcudf(move(c_result))
diff --git a/python/pylibcudf/pylibcudf/libcudf/CMakeLists.txt b/python/pylibcudf/pylibcudf/libcudf/CMakeLists.txt
index 15beaee47d4..00669ff579a 100644
--- a/python/pylibcudf/pylibcudf/libcudf/CMakeLists.txt
+++ b/python/pylibcudf/pylibcudf/libcudf/CMakeLists.txt
@@ -24,4 +24,5 @@ rapids_cython_create_modules(
   LINKED_LIBRARIES "${linked_libraries}" ASSOCIATED_TARGETS cudf MODULE_PREFIX cpp
 )
 add_subdirectory(io)
+add_subdirectory(lists)
 add_subdirectory(strings)
diff --git a/python/pylibcudf/pylibcudf/libcudf/lists/CMakeLists.txt b/python/pylibcudf/pylibcudf/libcudf/lists/CMakeLists.txt
new file mode 100644
index 00000000000..c896db2c85a
--- /dev/null
+++ b/python/pylibcudf/pylibcudf/libcudf/lists/CMakeLists.txt
@@ -0,0 +1,23 @@
+# =============================================================================
+# Copyright (c) 2024, NVIDIA CORPORATION.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
+# in compliance with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software distributed under the License
+# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
+# or implied. See the License for the specific language governing permissions and limitations under
+# the License.
+# =============================================================================
+
+set(cython_sources combine.pyx contains.pyx)
+
+set(linked_libraries cudf::cudf)
+
+rapids_cython_create_modules(
+  CXX
+  SOURCE_FILES "${cython_sources}"
+  LINKED_LIBRARIES "${linked_libraries}" ASSOCIATED_TARGETS cudf MODULE_PREFIX cpp_lists
+)
diff --git a/python/pylibcudf/pylibcudf/libcudf/lists/combine.pxd b/python/pylibcudf/pylibcudf/libcudf/lists/combine.pxd
index d077958ce03..09a5d84c64f 100644
--- a/python/pylibcudf/pylibcudf/libcudf/lists/combine.pxd
+++ b/python/pylibcudf/pylibcudf/libcudf/lists/combine.pxd
@@ -1,5 +1,6 @@
 # Copyright (c) 2021-2024, NVIDIA CORPORATION.
 
+from libc.stdint cimport int32_t
 from libcpp.memory cimport unique_ptr
 from pylibcudf.libcudf.column.column cimport column
 from pylibcudf.libcudf.column.column_view cimport column_view
@@ -9,10 +10,9 @@ from pylibcudf.libcudf.table.table_view cimport table_view
 cdef extern from "cudf/lists/combine.hpp" namespace \
         "cudf::lists" nogil:
 
-    ctypedef enum concatenate_null_policy:
-        IGNORE "cudf::lists::concatenate_null_policy::IGNORE"
-        NULLIFY_OUTPUT_ROW \
-            "cudf::lists::concatenate_null_policy::NULLIFY_OUTPUT_ROW"
+    cpdef enum class concatenate_null_policy(int32_t):
+        IGNORE
+        NULLIFY_OUTPUT_ROW
 
     cdef unique_ptr[column] concatenate_rows(
         const table_view input_table
diff --git a/python/pylibcudf/pylibcudf/libcudf/lists/combine.pyx b/python/pylibcudf/pylibcudf/libcudf/lists/combine.pyx
new file mode 100644
index 00000000000..e69de29bb2d
diff --git a/python/pylibcudf/pylibcudf/libcudf/lists/contains.pyx b/python/pylibcudf/pylibcudf/libcudf/lists/contains.pyx
new file mode 100644
index 00000000000..e69de29bb2d
diff --git a/python/pylibcudf/pylibcudf/lists.pxd b/python/pylibcudf/pylibcudf/lists.pxd
index e7d006e6e2e..10c1c26e24e 100644
--- a/python/pylibcudf/pylibcudf/lists.pxd
+++ b/python/pylibcudf/pylibcudf/lists.pxd
@@ -1,7 +1,11 @@
 # Copyright (c) 2024, NVIDIA CORPORATION.
 
 from libcpp cimport bool
-from pylibcudf.libcudf.types cimport null_order, size_type
+from pylibcudf.libcudf.types cimport (
+    nan_equality, null_equality, null_order, order, size_type
+)
+from pylibcudf.libcudf.lists.combine cimport concatenate_null_policy
+from pylibcudf.libcudf.lists.contains cimport duplicate_find_option
 
 from .column cimport Column
 from .scalar cimport Scalar
@@ -19,13 +23,13 @@ cpdef Table explode_outer(Table, size_type explode_column_idx)
 
 cpdef Column concatenate_rows(Table)
 
-cpdef Column concatenate_list_elements(Column, bool dropna)
+cpdef Column concatenate_list_elements(Column, concatenate_null_policy null_policy)
 
 cpdef Column contains(Column, ColumnOrScalar)
 
 cpdef Column contains_nulls(Column)
 
-cpdef Column index_of(Column, ColumnOrScalar, bool)
+cpdef Column index_of(Column, ColumnOrScalar, duplicate_find_option)
 
 cpdef Column reverse(Column)
 
@@ -37,16 +41,24 @@ cpdef Column count_elements(Column)
 
 cpdef Column sequences(Column, Column, Column steps = *)
 
-cpdef Column sort_lists(Column, bool, null_order, bool stable = *)
+cpdef Column sort_lists(Column, order, null_order, bool stable = *)
 
-cpdef Column difference_distinct(Column, Column, bool nulls_equal=*, bool nans_equal=*)
+cpdef Column difference_distinct(
+    Column, Column, null_equality nulls_equal=*, nan_equality nans_equal=*
+)
 
-cpdef Column have_overlap(Column, Column, bool nulls_equal=*, bool nans_equal=*)
+cpdef Column have_overlap(
+    Column, Column, null_equality nulls_equal=*, nan_equality nans_equal=*
+)
 
-cpdef Column intersect_distinct(Column, Column, bool nulls_equal=*, bool nans_equal=*)
+cpdef Column intersect_distinct(
+    Column, Column, null_equality nulls_equal=*, nan_equality nans_equal=*
+)
 
-cpdef Column union_distinct(Column, Column, bool nulls_equal=*, bool nans_equal=*)
+cpdef Column union_distinct(
+    Column, Column, null_equality nulls_equal=*, nan_equality nans_equal=*
+)
 
 cpdef Column apply_boolean_mask(Column, Column)
 
-cpdef Column distinct(Column, bool, bool)
+cpdef Column distinct(Column, null_equality, nan_equality)
diff --git a/python/pylibcudf/pylibcudf/lists.pyi b/python/pylibcudf/pylibcudf/lists.pyi
new file mode 100644
index 00000000000..dff6c400638
--- /dev/null
+++ b/python/pylibcudf/pylibcudf/lists.pyi
@@ -0,0 +1,70 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+from enum import IntEnum
+
+from pylibcudf.column import Column
+from pylibcudf.scalar import Scalar
+from pylibcudf.table import Table
+from pylibcudf.types import NanEquality, NullEquality, NullOrder, Order
+
+class ConcatenateNullPolicy(IntEnum):
+    IGNORE = ...
+    NULLIFY_OUTPUT_ROW = ...
+
+class DuplicateFindOption(IntEnum):
+    FIND_FIRST = ...
+    FIND_LAST = ...
+
+def explode_outer(input: Table, explode_column_idx: int) -> Table: ...
+def concatenate_rows(input: Table) -> Column: ...
+def concatenate_list_elements(
+    input: Column, null_policy: ConcatenateNullPolicy
+) -> Column: ...
+def contains(input: Column, search_key: Column | Scalar) -> Column: ...
+def contains_nulls(input: Column) -> Column: ...
+def index_of(
+    input: Column,
+    search_key: Column | Scalar,
+    find_option: DuplicateFindOption,
+) -> Column: ...
+def reverse(input: Column) -> Column: ...
+def segmented_gather(input: Column, gather_map_list: Column) -> Column: ...
+def extract_list_element(input: Column, index: Column | int) -> Column: ...
+def count_elements(input: Column) -> Column: ...
+def sequences(
+    starts: Column, sizes: Column, steps: Column | None = None
+) -> Column: ...
+def sort_lists(
+    input: Column,
+    sort_order: Order,
+    na_position: NullOrder,
+    stable: bool = False,
+) -> Column: ...
+def difference_distinct(
+    lhs: Column,
+    rhs: Column,
+    nulls_equal: NullEquality = NullEquality.EQUAL,
+    nans_equal: NanEquality = NanEquality.ALL_EQUAL,
+) -> Column: ...
+def have_overlap(
+    lhs: Column,
+    rhs: Column,
+    nulls_equal: NullEquality = NullEquality.EQUAL,
+    nans_equal: NanEquality = NanEquality.ALL_EQUAL,
+) -> Column: ...
+def intersect_distinct(
+    lhs: Column,
+    rhs: Column,
+    nulls_equal: NullEquality = NullEquality.EQUAL,
+    nans_equal: NanEquality = NanEquality.ALL_EQUAL,
+) -> Column: ...
+def union_distinct(
+    lhs: Column,
+    rhs: Column,
+    nulls_equal: NullEquality = NullEquality.EQUAL,
+    nans_equal: NanEquality = NanEquality.ALL_EQUAL,
+) -> Column: ...
+def apply_boolean_mask(input: Column, boolean_mask: Column) -> Column: ...
+def distinct(
+    input: Column, nulls_equal: NullEquality, nans_equal: NanEquality
+) -> Column: ...
diff --git a/python/pylibcudf/pylibcudf/lists.pyx b/python/pylibcudf/pylibcudf/lists.pyx
index ecaf62d6895..ccc56eaa520 100644
--- a/python/pylibcudf/pylibcudf/lists.pyx
+++ b/python/pylibcudf/pylibcudf/lists.pyx
@@ -42,10 +42,35 @@ from pylibcudf.libcudf.types cimport (
 )
 from pylibcudf.lists cimport ColumnOrScalar, ColumnOrSizeType
 
+from pylibcudf.libcudf.lists.combine import concatenate_null_policy as ConcatenateNullPolicy # no-cython-lint
+from pylibcudf.libcudf.lists.contains import duplicate_find_option as DuplicateFindOption # no-cython-lint
+
 from .column cimport Column, ListColumnView
 from .scalar cimport Scalar
 from .table cimport Table
 
+__all__ = [
+    "ConcatenateNullPolicy",
+    "DuplicateFindOption",
+    "apply_boolean_mask",
+    "concatenate_list_elements",
+    "concatenate_rows",
+    "contains",
+    "contains_nulls",
+    "count_elements",
+    "difference_distinct",
+    "distinct",
+    "explode_outer",
+    "extract_list_element",
+    "have_overlap",
+    "index_of",
+    "intersect_distinct",
+    "reverse",
+    "segmented_gather",
+    "sequences",
+    "sort_lists",
+    "union_distinct",
+]
 
 cpdef Table explode_outer(Table input, size_type explode_column_idx):
     """Explode a column of lists into rows.
@@ -97,7 +122,9 @@ cpdef Column concatenate_rows(Table input):
     return Column.from_libcudf(move(c_result))
 
 
-cpdef Column concatenate_list_elements(Column input, bool dropna):
+cpdef Column concatenate_list_elements(
+    Column input, concatenate_null_policy null_policy
+):
     """Concatenate multiple lists on the same row into a single list.
 
     For details, see :cpp:func:`concatenate_list_elements`.
@@ -106,20 +133,14 @@ cpdef Column concatenate_list_elements(Column input, bool dropna):
     ----------
     input : Column
         The input column
-    dropna : bool
-        If true, null list elements will be ignored
-        from concatenation. Otherwise any input null values will result in
-        the corresponding output row being set to null.
+    null_policy : ConcatenateNullPolicy
+        How to treat null list elements.
 
     Returns
     -------
     Column
         A new Column of concatenated list elements
     """
-    cdef concatenate_null_policy null_policy = (
-        concatenate_null_policy.IGNORE if dropna
-        else concatenate_null_policy.NULLIFY_OUTPUT_ROW
-    )
     cdef unique_ptr[column] c_result
 
     with nogil:
@@ -191,7 +212,9 @@ cpdef Column contains_nulls(Column input):
     return Column.from_libcudf(move(c_result))
 
 
-cpdef Column index_of(Column input, ColumnOrScalar search_key, bool find_first_option):
+cpdef Column index_of(
+    Column input, ColumnOrScalar search_key, duplicate_find_option find_option
+):
     """Create a column of index values indicating the position of a search
     key row within the corresponding list row in the lists column.
 
@@ -207,9 +230,8 @@ cpdef Column index_of(Column input, ColumnOrScalar search_key, bool find_first_o
         The input column.
     search_key : Union[Column, Scalar]
         The search key.
-    find_first_option : bool
-        If true, index_of returns the first match.
-        Otherwise the last match is returned.
+    find_option : DuplicateFindOption
+        Which match to return if there are duplicates.
 
     Returns
     -------
@@ -220,11 +242,6 @@ cpdef Column index_of(Column input, ColumnOrScalar search_key, bool find_first_o
     """
     cdef unique_ptr[column] c_result
     cdef ListColumnView list_view = input.list_view()
-    cdef cpp_contains.duplicate_find_option find_option = (
-        cpp_contains.duplicate_find_option.FIND_FIRST if find_first_option
-        else cpp_contains.duplicate_find_option.FIND_LAST
-    )
-
     with nogil:
         c_result = cpp_contains.index_of(
             list_view.view(),
@@ -380,7 +397,7 @@ cpdef Column sequences(Column starts, Column sizes, Column steps = None):
 
 cpdef Column sort_lists(
     Column input,
-    bool ascending,
+    order sort_order,
     null_order na_position,
     bool stable = False
 ):
@@ -392,8 +409,8 @@ cpdef Column sort_lists(
     ----------
     input : Column
         The input column.
-    ascending : bool
-        If true, the sort order is ascending. Otherwise, the sort order is descending.
+    sort_order : Order
+        Sort order in the list.
     na_position : NullOrder
         If na_position equals NullOrder.FIRST, then the null values in the output
         column are placed first. Otherwise, they are be placed after.
@@ -409,21 +426,17 @@ cpdef Column sort_lists(
     cdef unique_ptr[column] c_result
     cdef ListColumnView list_view = input.list_view()
 
-    cdef order c_sort_order = (
-        order.ASCENDING if ascending else order.DESCENDING
-    )
-
     with nogil:
         if stable:
             c_result = cpp_stable_sort_lists(
                     list_view.view(),
-                    c_sort_order,
+                    sort_order,
                     na_position,
             )
         else:
             c_result = cpp_sort_lists(
                     list_view.view(),
-                    c_sort_order,
+                    sort_order,
                     na_position,
             )
     return Column.from_libcudf(move(c_result))
@@ -432,8 +445,8 @@ cpdef Column sort_lists(
 cpdef Column difference_distinct(
     Column lhs,
     Column rhs,
-    bool nulls_equal=True,
-    bool nans_equal=True
+    null_equality nulls_equal=null_equality.EQUAL,
+    nan_equality nans_equal=nan_equality.ALL_EQUAL,
 ):
     """Create a column of index values indicating the position of a search
     key row within the corresponding list row in the lists column.
@@ -446,11 +459,10 @@ cpdef Column difference_distinct(
         The input lists column of elements that may be included.
     rhs : Column
         The input lists column of elements to exclude.
-    nulls_equal : bool, default True
-        If true, null elements are considered equal. Otherwise, unequal.
-    nans_equal : bool, default True
-        If true, libcudf will treat nan elements from {-nan, +nan}
-        as equal. Otherwise, unequal. Otherwise, unequal.
+    nulls_equal : NullEquality, default EQUAL
+        Whether null elements are considered equal.
+    nans_equal : NanEquality, default ALL_EQUAL
+        Whether NaN elements are considered equal.
 
     Returns
     -------
@@ -461,19 +473,12 @@ cpdef Column difference_distinct(
     cdef ListColumnView lhs_view = lhs.list_view()
     cdef ListColumnView rhs_view = rhs.list_view()
 
-    cdef null_equality c_nulls_equal = (
-        null_equality.EQUAL if nulls_equal else null_equality.UNEQUAL
-    )
-    cdef nan_equality c_nans_equal = (
-        nan_equality.ALL_EQUAL if nans_equal else nan_equality.UNEQUAL
-    )
-
     with nogil:
         c_result = cpp_set_operations.difference_distinct(
             lhs_view.view(),
             rhs_view.view(),
-            c_nulls_equal,
-            c_nans_equal,
+            nulls_equal,
+            nans_equal,
         )
     return Column.from_libcudf(move(c_result))
 
@@ -481,8 +486,8 @@ cpdef Column difference_distinct(
 cpdef Column have_overlap(
     Column lhs,
     Column rhs,
-    bool nulls_equal=True,
-    bool nans_equal=True
+    null_equality nulls_equal=null_equality.EQUAL,
+    nan_equality nans_equal=nan_equality.ALL_EQUAL,
 ):
     """Check if lists at each row of the given lists columns overlap.
 
@@ -494,11 +499,10 @@ cpdef Column have_overlap(
         The input lists column for one side.
     rhs : Column
         The input lists column for the other side.
-    nulls_equal : bool, default True
-        If true, null elements are considered equal. Otherwise, unequal.
-    nans_equal : bool, default True
-        If true, libcudf will treat nan elements from {-nan, +nan}
-        as equal. Otherwise, unequal. Otherwise, unequal.
+    nulls_equal : NullEquality, default EQUAL
+        Whether null elements are considered equal.
+    nans_equal : NanEquality, default ALL_EQUAL
+        Whether NaN elements are considered equal.
 
     Returns
     -------
@@ -509,19 +513,12 @@ cpdef Column have_overlap(
     cdef ListColumnView lhs_view = lhs.list_view()
     cdef ListColumnView rhs_view = rhs.list_view()
 
-    cdef null_equality c_nulls_equal = (
-        null_equality.EQUAL if nulls_equal else null_equality.UNEQUAL
-    )
-    cdef nan_equality c_nans_equal = (
-        nan_equality.ALL_EQUAL if nans_equal else nan_equality.UNEQUAL
-    )
-
     with nogil:
         c_result = cpp_set_operations.have_overlap(
             lhs_view.view(),
             rhs_view.view(),
-            c_nulls_equal,
-            c_nans_equal,
+            nulls_equal,
+            nans_equal,
         )
     return Column.from_libcudf(move(c_result))
 
@@ -529,8 +526,8 @@ cpdef Column have_overlap(
 cpdef Column intersect_distinct(
     Column lhs,
     Column rhs,
-    bool nulls_equal=True,
-    bool nans_equal=True
+    null_equality nulls_equal=null_equality.EQUAL,
+    nan_equality nans_equal=nan_equality.ALL_EQUAL,
 ):
     """Create a lists column of distinct elements common to two input lists columns.
 
@@ -542,11 +539,10 @@ cpdef Column intersect_distinct(
         The input lists column of elements that may be included.
     rhs : Column
         The input lists column of elements to exclude.
-    nulls_equal : bool, default True
-        If true, null elements are considered equal. Otherwise, unequal.
-    nans_equal : bool, default True
-        If true, libcudf will treat nan elements from {-nan, +nan}
-        as equal. Otherwise, unequal. Otherwise, unequal.
+    nulls_equal : NullEquality, default EQUAL
+        Whether null elements are considered equal.
+    nans_equal : NanEquality, default ALL_EQUAL
+        Whether NaN elements are considered equal.
 
     Returns
     -------
@@ -557,19 +553,12 @@ cpdef Column intersect_distinct(
     cdef ListColumnView lhs_view = lhs.list_view()
     cdef ListColumnView rhs_view = rhs.list_view()
 
-    cdef null_equality c_nulls_equal = (
-        null_equality.EQUAL if nulls_equal else null_equality.UNEQUAL
-    )
-    cdef nan_equality c_nans_equal = (
-        nan_equality.ALL_EQUAL if nans_equal else nan_equality.UNEQUAL
-    )
-
     with nogil:
         c_result = cpp_set_operations.intersect_distinct(
             lhs_view.view(),
             rhs_view.view(),
-            c_nulls_equal,
-            c_nans_equal,
+            nulls_equal,
+            nans_equal,
         )
     return Column.from_libcudf(move(c_result))
 
@@ -577,8 +566,8 @@ cpdef Column intersect_distinct(
 cpdef Column union_distinct(
     Column lhs,
     Column rhs,
-    bool nulls_equal=True,
-    bool nans_equal=True
+    null_equality nulls_equal=null_equality.EQUAL,
+    nan_equality nans_equal=nan_equality.ALL_EQUAL,
 ):
     """Create a lists column of distinct elements found in
     either of two input lists columns.
@@ -591,11 +580,10 @@ cpdef Column union_distinct(
         The input lists column of elements that may be included.
     rhs : Column
         The input lists column of elements to exclude.
-    nulls_equal : bool, default True
-        If true, null elements are considered equal. Otherwise, unequal.
-    nans_equal : bool, default True
-        If true, libcudf will treat nan elements from {-nan, +nan}
-        as equal. Otherwise, unequal. Otherwise, unequal.
+    nulls_equal : NullEquality, default EQUAL
+        Whether null elements are considered equal.
+    nans_equal : NanEquality, default ALL_EQUAL
+        Whether NaN elements are considered equal.
 
     Returns
     -------
@@ -606,19 +594,12 @@ cpdef Column union_distinct(
     cdef ListColumnView lhs_view = lhs.list_view()
     cdef ListColumnView rhs_view = rhs.list_view()
 
-    cdef null_equality c_nulls_equal = (
-        null_equality.EQUAL if nulls_equal else null_equality.UNEQUAL
-    )
-    cdef nan_equality c_nans_equal = (
-        nan_equality.ALL_EQUAL if nans_equal else nan_equality.UNEQUAL
-    )
-
     with nogil:
         c_result = cpp_set_operations.union_distinct(
             lhs_view.view(),
             rhs_view.view(),
-            c_nulls_equal,
-            c_nans_equal,
+            nulls_equal,
+            nans_equal,
         )
     return Column.from_libcudf(move(c_result))
 
@@ -651,7 +632,7 @@ cpdef Column apply_boolean_mask(Column input, Column boolean_mask):
     return Column.from_libcudf(move(c_result))
 
 
-cpdef Column distinct(Column input, bool nulls_equal, bool nans_equal):
+cpdef Column distinct(Column input, null_equality nulls_equal, nan_equality nans_equal):
     """Create a new list column without duplicate elements in each list.
 
     For details, see :cpp:func:`distinct`.
@@ -660,11 +641,10 @@ cpdef Column distinct(Column input, bool nulls_equal, bool nans_equal):
     ----------
     input : Column
         The input column.
-    nulls_equal : bool
-        If true, null elements are considered equal. Otherwise, unequal.
-    nans_equal : bool
-        If true, libcudf will treat nan elements from {-nan, +nan}
-        as equal. Otherwise, unequal. Otherwise, unequal.
+    nulls_equal : NullEquality
+        Whether null elements are considered equal.
+    nans_equal : NanEquality
+        Whether NaN elements are considered equal.
 
     Returns
     -------
@@ -674,17 +654,10 @@ cpdef Column distinct(Column input, bool nulls_equal, bool nans_equal):
     cdef unique_ptr[column] c_result
     cdef ListColumnView list_view = input.list_view()
 
-    cdef null_equality c_nulls_equal = (
-        null_equality.EQUAL if nulls_equal else null_equality.UNEQUAL
-    )
-    cdef nan_equality c_nans_equal = (
-        nan_equality.ALL_EQUAL if nans_equal else nan_equality.UNEQUAL
-    )
-
     with nogil:
         c_result = cpp_distinct(
             list_view.view(),
-            c_nulls_equal,
-            c_nans_equal,
+            nulls_equal,
+            nans_equal,
         )
     return Column.from_libcudf(move(c_result))
diff --git a/python/pylibcudf/pylibcudf/merge.pyi b/python/pylibcudf/pylibcudf/merge.pyi
new file mode 100644
index 00000000000..b18eb01f8a2
--- /dev/null
+++ b/python/pylibcudf/pylibcudf/merge.pyi
@@ -0,0 +1,11 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+from pylibcudf.table import Table
+from pylibcudf.types import NullOrder, Order
+
+def merge(
+    tables_to_merge: list[Table],
+    key_cols: list[int],
+    column_order: list[Order],
+    null_precedence: list[NullOrder],
+) -> Table: ...
diff --git a/python/pylibcudf/pylibcudf/merge.pyx b/python/pylibcudf/pylibcudf/merge.pyx
index 61a21aafdb2..c051cdc0c66 100644
--- a/python/pylibcudf/pylibcudf/merge.pyx
+++ b/python/pylibcudf/pylibcudf/merge.pyx
@@ -10,6 +10,7 @@ from pylibcudf.libcudf.types cimport null_order, order, size_type
 
 from .table cimport Table
 
+__all__ = ["merge"]
 
 cpdef Table merge (
     list tables_to_merge,
diff --git a/python/pylibcudf/pylibcudf/null_mask.pyi b/python/pylibcudf/pylibcudf/null_mask.pyi
new file mode 100644
index 00000000000..1a6d96a0822
--- /dev/null
+++ b/python/pylibcudf/pylibcudf/null_mask.pyi
@@ -0,0 +1,14 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+from rmm.pylibrmm.device_buffer import DeviceBuffer
+
+from pylibcudf.column import Column
+from pylibcudf.types import MaskState
+
+def copy_bitmask(col: Column) -> DeviceBuffer: ...
+def bitmask_allocation_size_bytes(number_of_bits: int) -> int: ...
+def create_null_mask(
+    size: int, state: MaskState = MaskState.UNINITIALIZED
+) -> DeviceBuffer: ...
+def bitmask_and(columns: list[Column]) -> tuple[DeviceBuffer, int]: ...
+def bitmask_or(columns: list[Column]) -> tuple[DeviceBuffer, int]: ...
diff --git a/python/pylibcudf/pylibcudf/null_mask.pyx b/python/pylibcudf/pylibcudf/null_mask.pyx
index 74180951562..adc264e9af6 100644
--- a/python/pylibcudf/pylibcudf/null_mask.pyx
+++ b/python/pylibcudf/pylibcudf/null_mask.pyx
@@ -14,6 +14,13 @@ from pylibcudf.libcudf.types import mask_state as MaskState  # no-cython-lint
 from .column cimport Column
 from .table cimport Table
 
+__all__ = [
+    "bitmask_allocation_size_bytes",
+    "bitmask_and",
+    "bitmask_or",
+    "copy_bitmask",
+    "create_null_mask",
+]
 
 cdef DeviceBuffer buffer_to_python(device_buffer buf):
     return DeviceBuffer.c_from_unique_ptr(make_unique[device_buffer](move(buf)))
diff --git a/python/pylibcudf/pylibcudf/nvtext/byte_pair_encode.pyi b/python/pylibcudf/pylibcudf/nvtext/byte_pair_encode.pyi
new file mode 100644
index 00000000000..ca39aa16d7e
--- /dev/null
+++ b/python/pylibcudf/pylibcudf/nvtext/byte_pair_encode.pyi
@@ -0,0 +1,11 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+from pylibcudf.column import Column
+from pylibcudf.scalar import Scalar
+
+class BPEMergePairs:
+    def __init__(self, merge_pairs: Column): ...
+
+def byte_pair_encoding(
+    input: Column, merge_pairs: BPEMergePairs, separator: Scalar | None = None
+) -> Column: ...
diff --git a/python/pylibcudf/pylibcudf/nvtext/byte_pair_encode.pyx b/python/pylibcudf/pylibcudf/nvtext/byte_pair_encode.pyx
index 76caad276d4..7565b21084f 100644
--- a/python/pylibcudf/pylibcudf/nvtext/byte_pair_encode.pyx
+++ b/python/pylibcudf/pylibcudf/nvtext/byte_pair_encode.pyx
@@ -16,6 +16,7 @@ from pylibcudf.libcudf.scalar.scalar_factories cimport (
 )
 from pylibcudf.scalar cimport Scalar
 
+__all__ = ["BPEMergePairs", "byte_pair_encoding"]
 
 cdef class BPEMergePairs:
     """The table of merge pairs for the BPE encoder.
@@ -27,6 +28,8 @@ cdef class BPEMergePairs:
         with nogil:
             self.c_obj = move(cpp_load_merge_pairs(c_pairs))
 
+    __hash__ = None
+
 cpdef Column byte_pair_encoding(
     Column input,
     BPEMergePairs merge_pairs,
diff --git a/python/pylibcudf/pylibcudf/nvtext/edit_distance.pyi b/python/pylibcudf/pylibcudf/nvtext/edit_distance.pyi
new file mode 100644
index 00000000000..85bbbb880ee
--- /dev/null
+++ b/python/pylibcudf/pylibcudf/nvtext/edit_distance.pyi
@@ -0,0 +1,6 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+from pylibcudf.column import Column
+
+def edit_distance(input: Column, targets: Column) -> Column: ...
+def edit_distance_matrix(input: Column) -> Column: ...
diff --git a/python/pylibcudf/pylibcudf/nvtext/edit_distance.pyx b/python/pylibcudf/pylibcudf/nvtext/edit_distance.pyx
index dcacb2e1267..eceeaff24e3 100644
--- a/python/pylibcudf/pylibcudf/nvtext/edit_distance.pyx
+++ b/python/pylibcudf/pylibcudf/nvtext/edit_distance.pyx
@@ -9,6 +9,7 @@ from pylibcudf.libcudf.nvtext.edit_distance cimport (
     edit_distance_matrix as cpp_edit_distance_matrix,
 )
 
+__all__ = ["edit_distance", "edit_distance_matrix"]
 
 cpdef Column edit_distance(Column input, Column targets):
     """
diff --git a/python/pylibcudf/pylibcudf/nvtext/generate_ngrams.pyi b/python/pylibcudf/pylibcudf/nvtext/generate_ngrams.pyi
new file mode 100644
index 00000000000..2757518379d
--- /dev/null
+++ b/python/pylibcudf/pylibcudf/nvtext/generate_ngrams.pyi
@@ -0,0 +1,10 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+from pylibcudf.column import Column
+from pylibcudf.scalar import Scalar
+
+def generate_ngrams(
+    input: Column, ngrams: int, separator: Scalar
+) -> Column: ...
+def generate_character_ngrams(input: Column, ngrams: int = 2) -> Column: ...
+def hash_character_ngrams(input: Column, ngrams: int = 2) -> Column: ...
diff --git a/python/pylibcudf/pylibcudf/nvtext/generate_ngrams.pyx b/python/pylibcudf/pylibcudf/nvtext/generate_ngrams.pyx
index 09859d09e9e..521bc0ef4a4 100644
--- a/python/pylibcudf/pylibcudf/nvtext/generate_ngrams.pyx
+++ b/python/pylibcudf/pylibcudf/nvtext/generate_ngrams.pyx
@@ -14,6 +14,11 @@ from pylibcudf.libcudf.scalar.scalar cimport string_scalar
 from pylibcudf.libcudf.types cimport size_type
 from pylibcudf.scalar cimport Scalar
 
+__all__ = [
+    "generate_ngrams",
+    "generate_character_ngrams",
+    "hash_character_ngrams",
+]
 
 cpdef Column generate_ngrams(Column input, size_type ngrams, Scalar separator):
     """
diff --git a/python/pylibcudf/pylibcudf/nvtext/jaccard.pyi b/python/pylibcudf/pylibcudf/nvtext/jaccard.pyi
new file mode 100644
index 00000000000..18263c5c8fd
--- /dev/null
+++ b/python/pylibcudf/pylibcudf/nvtext/jaccard.pyi
@@ -0,0 +1,5 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+from pylibcudf.column import Column
+
+def jaccard_index(input1: Column, input2: Column, width: int) -> Column: ...
diff --git a/python/pylibcudf/pylibcudf/nvtext/jaccard.pyx b/python/pylibcudf/pylibcudf/nvtext/jaccard.pyx
index 3d8669865d9..90cace088f7 100644
--- a/python/pylibcudf/pylibcudf/nvtext/jaccard.pyx
+++ b/python/pylibcudf/pylibcudf/nvtext/jaccard.pyx
@@ -10,6 +10,7 @@ from pylibcudf.libcudf.nvtext.jaccard cimport (
 )
 from pylibcudf.libcudf.types cimport size_type
 
+__all__ = ["jaccard_index"]
 
 cpdef Column jaccard_index(Column input1, Column input2, size_type width):
     """
diff --git a/python/pylibcudf/pylibcudf/nvtext/minhash.pyi b/python/pylibcudf/pylibcudf/nvtext/minhash.pyi
new file mode 100644
index 00000000000..a2d9b6364f7
--- /dev/null
+++ b/python/pylibcudf/pylibcudf/nvtext/minhash.pyi
@@ -0,0 +1,13 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+from pylibcudf.column import Column
+from pylibcudf.scalar import Scalar
+
+def minhash(
+    input: Column, seeds: Column | Scalar, width: int = 4
+) -> Column: ...
+def minhash64(
+    input: Column, seeds: Column | Scalar, width: int = 4
+) -> Column: ...
+def word_minhash(input: Column, seeds: Column) -> Column: ...
+def word_minhash64(input: Column, seeds: Column) -> Column: ...
diff --git a/python/pylibcudf/pylibcudf/nvtext/minhash.pyx b/python/pylibcudf/pylibcudf/nvtext/minhash.pyx
index 5a51e32b287..5448cc6de9b 100644
--- a/python/pylibcudf/pylibcudf/nvtext/minhash.pyx
+++ b/python/pylibcudf/pylibcudf/nvtext/minhash.pyx
@@ -20,6 +20,12 @@ from pylibcudf.scalar cimport Scalar
 from cython.operator import dereference
 import warnings
 
+__all__ = [
+    "minhash",
+    "minhash64",
+    "word_minhash",
+    "word_minhash64",
+]
 
 cpdef Column minhash(Column input, ColumnOrScalar seeds, size_type width=4):
     """
diff --git a/python/pylibcudf/pylibcudf/nvtext/ngrams_tokenize.pyi b/python/pylibcudf/pylibcudf/nvtext/ngrams_tokenize.pyi
new file mode 100644
index 00000000000..224640ed44d
--- /dev/null
+++ b/python/pylibcudf/pylibcudf/nvtext/ngrams_tokenize.pyi
@@ -0,0 +1,8 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+from pylibcudf.column import Column
+from pylibcudf.scalar import Scalar
+
+def ngrams_tokenize(
+    input: Column, ngrams: int, delimiter: Scalar, separator: Scalar
+) -> Column: ...
diff --git a/python/pylibcudf/pylibcudf/nvtext/ngrams_tokenize.pyx b/python/pylibcudf/pylibcudf/nvtext/ngrams_tokenize.pyx
index 8a1854c5f0d..771c7c019fc 100644
--- a/python/pylibcudf/pylibcudf/nvtext/ngrams_tokenize.pyx
+++ b/python/pylibcudf/pylibcudf/nvtext/ngrams_tokenize.pyx
@@ -12,6 +12,7 @@ from pylibcudf.libcudf.scalar.scalar cimport string_scalar
 from pylibcudf.libcudf.types cimport size_type
 from pylibcudf.scalar cimport Scalar
 
+__all__ = ["ngrams_tokenize"]
 
 cpdef Column ngrams_tokenize(
     Column input,
diff --git a/python/pylibcudf/pylibcudf/nvtext/normalize.pyi b/python/pylibcudf/pylibcudf/nvtext/normalize.pyi
new file mode 100644
index 00000000000..1d90a5a8960
--- /dev/null
+++ b/python/pylibcudf/pylibcudf/nvtext/normalize.pyi
@@ -0,0 +1,6 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+from pylibcudf.column import Column
+
+def normalize_spaces(input: Column) -> Column: ...
+def normalize_characters(input: Column, do_lower_case: bool) -> Column: ...
diff --git a/python/pylibcudf/pylibcudf/nvtext/normalize.pyx b/python/pylibcudf/pylibcudf/nvtext/normalize.pyx
index 637d900b659..b259ccaefa6 100644
--- a/python/pylibcudf/pylibcudf/nvtext/normalize.pyx
+++ b/python/pylibcudf/pylibcudf/nvtext/normalize.pyx
@@ -10,6 +10,7 @@ from pylibcudf.libcudf.nvtext.normalize cimport (
     normalize_spaces as cpp_normalize_spaces,
 )
 
+__all__ = ["normalize_characters", "normalize_spaces"]
 
 cpdef Column normalize_spaces(Column input):
     """
diff --git a/python/pylibcudf/pylibcudf/nvtext/replace.pyi b/python/pylibcudf/pylibcudf/nvtext/replace.pyi
new file mode 100644
index 00000000000..1f1ac72ce7c
--- /dev/null
+++ b/python/pylibcudf/pylibcudf/nvtext/replace.pyi
@@ -0,0 +1,17 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+from pylibcudf.column import Column
+from pylibcudf.scalar import Scalar
+
+def replace_tokens(
+    input: Column,
+    targets: Column,
+    replacements: Column,
+    delimiter: Scalar | None = None,
+) -> Column: ...
+def filter_tokens(
+    input: Column,
+    min_token_length: int,
+    replacement: Scalar | None = None,
+    delimiter: Scalar | None = None,
+) -> Column: ...
diff --git a/python/pylibcudf/pylibcudf/nvtext/replace.pyx b/python/pylibcudf/pylibcudf/nvtext/replace.pyx
index b65348ce14d..a27592fb434 100644
--- a/python/pylibcudf/pylibcudf/nvtext/replace.pyx
+++ b/python/pylibcudf/pylibcudf/nvtext/replace.pyx
@@ -16,6 +16,7 @@ from pylibcudf.libcudf.scalar.scalar_factories cimport (
 from pylibcudf.libcudf.types cimport size_type
 from pylibcudf.scalar cimport Scalar
 
+__all__ = ["filter_tokens", "replace_tokens"]
 
 cpdef Column replace_tokens(
     Column input,
diff --git a/python/pylibcudf/pylibcudf/nvtext/stemmer.pyi b/python/pylibcudf/pylibcudf/nvtext/stemmer.pyi
new file mode 100644
index 00000000000..d6ba1d189bd
--- /dev/null
+++ b/python/pylibcudf/pylibcudf/nvtext/stemmer.pyi
@@ -0,0 +1,8 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+from pylibcudf.column import Column
+
+def is_letter(
+    input: Column, check_vowels: bool, indices: Column | int
+) -> Column: ...
+def porter_stemmer_measure(input: Column) -> Column: ...
diff --git a/python/pylibcudf/pylibcudf/nvtext/stemmer.pyx b/python/pylibcudf/pylibcudf/nvtext/stemmer.pyx
index 854d1053624..c9e4f1274e4 100644
--- a/python/pylibcudf/pylibcudf/nvtext/stemmer.pyx
+++ b/python/pylibcudf/pylibcudf/nvtext/stemmer.pyx
@@ -12,6 +12,7 @@ from pylibcudf.libcudf.nvtext.stemmer cimport (
 )
 from pylibcudf.libcudf.types cimport size_type
 
+__all__ = ["is_letter", "porter_stemmer_measure"]
 
 cpdef Column is_letter(
     Column input,
diff --git a/python/pylibcudf/pylibcudf/nvtext/subword_tokenize.pyi b/python/pylibcudf/pylibcudf/nvtext/subword_tokenize.pyi
new file mode 100644
index 00000000000..f6618e296b1
--- /dev/null
+++ b/python/pylibcudf/pylibcudf/nvtext/subword_tokenize.pyi
@@ -0,0 +1,15 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+from pylibcudf.column import Column
+
+class HashedVocabulary:
+    def __init__(self, hash_file: str): ...
+
+def subword_tokenize(
+    input: Column,
+    vocabulary_table: HashedVocabulary,
+    max_sequence_length: int,
+    stride: int,
+    do_lower_case: bool,
+    do_truncate: bool,
+) -> tuple[Column, Column, Column]: ...
diff --git a/python/pylibcudf/pylibcudf/nvtext/subword_tokenize.pyx b/python/pylibcudf/pylibcudf/nvtext/subword_tokenize.pyx
index 04643d3bd84..14fb6f5fe1e 100644
--- a/python/pylibcudf/pylibcudf/nvtext/subword_tokenize.pyx
+++ b/python/pylibcudf/pylibcudf/nvtext/subword_tokenize.pyx
@@ -13,6 +13,7 @@ from pylibcudf.libcudf.nvtext.subword_tokenize cimport (
     tokenizer_result as cpp_tokenizer_result,
 )
 
+__all__ = ["HashedVocabulary", "subword_tokenize"]
 
 cdef class HashedVocabulary:
     """The vocabulary data for use with the subword_tokenize function.
@@ -24,6 +25,8 @@ cdef class HashedVocabulary:
         with nogil:
             self.c_obj = move(cpp_load_vocabulary_file(c_hash_file))
 
+    __hash__ = None
+
 cpdef tuple[Column, Column, Column] subword_tokenize(
     Column input,
     HashedVocabulary vocabulary_table,
diff --git a/python/pylibcudf/pylibcudf/nvtext/tokenize.pyi b/python/pylibcudf/pylibcudf/nvtext/tokenize.pyi
new file mode 100644
index 00000000000..b9aa2393514
--- /dev/null
+++ b/python/pylibcudf/pylibcudf/nvtext/tokenize.pyi
@@ -0,0 +1,26 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+from pylibcudf.column import Column
+from pylibcudf.scalar import Scalar
+
+class TokenizeVocabulary:
+    def __init__(self, vocab: Column): ...
+
+def tokenize_scalar(
+    input: Column, delimiter: Scalar | None = None
+) -> Column: ...
+def tokenize_column(input: Column, delimiters: Column) -> Column: ...
+def count_tokens_scalar(
+    input: Column, delimiter: Scalar | None = None
+) -> Column: ...
+def count_tokens_column(input: Column, delimiters: Column) -> Column: ...
+def character_tokenize(input: Column) -> Column: ...
+def detokenize(
+    input: Column, row_indices: Column, separator: Scalar | None = None
+) -> Column: ...
+def tokenize_with_vocabulary(
+    input: Column,
+    vocabulary: TokenizeVocabulary,
+    delimiter: Scalar,
+    default_id: int = -1,
+) -> Column: ...
diff --git a/python/pylibcudf/pylibcudf/nvtext/tokenize.pyx b/python/pylibcudf/pylibcudf/nvtext/tokenize.pyx
index ec02e8ebf4e..43d426489b4 100644
--- a/python/pylibcudf/pylibcudf/nvtext/tokenize.pyx
+++ b/python/pylibcudf/pylibcudf/nvtext/tokenize.pyx
@@ -20,6 +20,16 @@ from pylibcudf.libcudf.scalar.scalar_factories cimport (
 )
 from pylibcudf.libcudf.types cimport size_type
 
+__all__ = [
+    "TokenizeVocabulary",
+    "character_tokenize",
+    "count_tokens_column",
+    "count_tokens_scalar",
+    "detokenize",
+    "tokenize_column",
+    "tokenize_scalar",
+    "tokenize_with_vocabulary",
+]
 
 cdef class TokenizeVocabulary:
     """The Vocabulary object to be used with ``tokenize_with_vocabulary``.
@@ -31,6 +41,8 @@ cdef class TokenizeVocabulary:
         with nogil:
             self.c_obj = move(cpp_load_vocabulary(c_vocab))
 
+    __hash__ = None
+
 cpdef Column tokenize_scalar(Column input, Scalar delimiter=None):
     """
     Returns a single column of strings by tokenizing the input
diff --git a/python/pylibcudf/pylibcudf/partitioning.pyi b/python/pylibcudf/pylibcudf/partitioning.pyi
new file mode 100644
index 00000000000..48a2ade23f1
--- /dev/null
+++ b/python/pylibcudf/pylibcudf/partitioning.pyi
@@ -0,0 +1,14 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+from pylibcudf.column import Column
+from pylibcudf.table import Table
+
+def hash_partition(
+    input: Table, columns_to_hash: list[int], num_partitions: int
+) -> tuple[Table, list[int]]: ...
+def partition(
+    t: Table, partition_map: Column, num_partitions: int
+) -> tuple[Table, list[int]]: ...
+def round_robin_partition(
+    input: Table, num_partitions: int, start_partition: int = 0
+) -> tuple[Table, list[int]]: ...
diff --git a/python/pylibcudf/pylibcudf/partitioning.pyx b/python/pylibcudf/pylibcudf/partitioning.pyx
index 3cff4843735..1dacabceb06 100644
--- a/python/pylibcudf/pylibcudf/partitioning.pyx
+++ b/python/pylibcudf/pylibcudf/partitioning.pyx
@@ -11,6 +11,11 @@ from pylibcudf.libcudf.table.table cimport table
 from .column cimport Column
 from .table cimport Table
 
+__all__ = [
+    "hash_partition",
+    "partition",
+    "round_robin_partition",
+]
 
 cpdef tuple[Table, list] hash_partition(
     Table input,
diff --git a/python/pylibcudf/pylibcudf/py.typed b/python/pylibcudf/pylibcudf/py.typed
new file mode 100644
index 00000000000..e69de29bb2d
diff --git a/python/pylibcudf/pylibcudf/quantiles.pyi b/python/pylibcudf/pylibcudf/quantiles.pyi
new file mode 100644
index 00000000000..dca6eed013a
--- /dev/null
+++ b/python/pylibcudf/pylibcudf/quantiles.pyi
@@ -0,0 +1,23 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+from collections.abc import Sequence
+
+from pylibcudf.column import Column
+from pylibcudf.table import Table
+from pylibcudf.types import Interpolation, NullOrder, Order, Sorted
+
+def quantile(
+    input: Column,
+    q: Sequence[float],
+    interp: Interpolation = Interpolation.LINEAR,
+    ordered_indices: Column | None = None,
+    exact: bool = True,
+) -> Column: ...
+def quantiles(
+    input: Table,
+    q: Sequence[float],
+    interp: Interpolation = Interpolation.NEAREST,
+    is_input_sorted: Sorted = Sorted.NO,
+    column_order: list[Order] | None = None,
+    null_precedence: list[NullOrder] | None = None,
+) -> Table: ...
diff --git a/python/pylibcudf/pylibcudf/quantiles.pyx b/python/pylibcudf/pylibcudf/quantiles.pyx
index 7d92b598bd0..634218586ac 100644
--- a/python/pylibcudf/pylibcudf/quantiles.pyx
+++ b/python/pylibcudf/pylibcudf/quantiles.pyx
@@ -17,6 +17,7 @@ from .column cimport Column
 from .table cimport Table
 from .types cimport interpolation
 
+__all__ = ["quantile", "quantiles"]
 
 cpdef Column quantile(
     Column input,
diff --git a/python/pylibcudf/pylibcudf/reduce.pyi b/python/pylibcudf/pylibcudf/reduce.pyi
new file mode 100644
index 00000000000..a09949b7b30
--- /dev/null
+++ b/python/pylibcudf/pylibcudf/reduce.pyi
@@ -0,0 +1,16 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+from enum import IntEnum
+
+from pylibcudf.aggregation import Aggregation
+from pylibcudf.column import Column
+from pylibcudf.scalar import Scalar
+from pylibcudf.types import DataType
+
+class ScanType(IntEnum):
+    INCLUSIVE = ...
+    EXCLUSIVE = ...
+
+def reduce(col: Column, agg: Aggregation, data_type: DataType) -> Scalar: ...
+def scan(col: Column, agg: Aggregation, inclusive: ScanType) -> Column: ...
+def minmax(col: Column) -> tuple[Scalar, Scalar]: ...
diff --git a/python/pylibcudf/pylibcudf/reduce.pyx b/python/pylibcudf/pylibcudf/reduce.pyx
index d9ec3a9bdc4..1d6ffd9de10 100644
--- a/python/pylibcudf/pylibcudf/reduce.pyx
+++ b/python/pylibcudf/pylibcudf/reduce.pyx
@@ -16,6 +16,7 @@ from .types cimport DataType
 
 from pylibcudf.libcudf.reduce import scan_type as ScanType  # no-cython-lint
 
+__all__ = ["ScanType", "minmax", "reduce", "scan"]
 
 cpdef Scalar reduce(Column col, Aggregation agg, DataType data_type):
     """Perform a reduction on a column
diff --git a/python/pylibcudf/pylibcudf/replace.pyi b/python/pylibcudf/pylibcudf/replace.pyi
new file mode 100644
index 00000000000..eed7a2a6c52
--- /dev/null
+++ b/python/pylibcudf/pylibcudf/replace.pyi
@@ -0,0 +1,29 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+from enum import IntEnum
+
+from pylibcudf.column import Column
+from pylibcudf.scalar import Scalar
+
+class ReplacePolicy(IntEnum):
+    PRECEDING = ...
+    FOLLOWING = ...
+
+def replace_nulls(
+    source_column: Column, replacement: Column | Scalar | ReplacePolicy
+) -> Column: ...
+def find_and_replace_all(
+    source_column: Column,
+    values_to_replace: Column,
+    replacement_values: Column,
+) -> Column: ...
+def clamp(
+    source_column: Column,
+    lo: Scalar,
+    hi: Scalar,
+    lo_replace: Scalar | None = None,
+    hi_replace: Scalar | None = None,
+) -> Column: ...
+def normalize_nans_and_zeros(
+    source_column: Column, inplace: bool = False
+) -> Column: ...
diff --git a/python/pylibcudf/pylibcudf/replace.pyx b/python/pylibcudf/pylibcudf/replace.pyx
index f77eba7ace5..51be2b29277 100644
--- a/python/pylibcudf/pylibcudf/replace.pyx
+++ b/python/pylibcudf/pylibcudf/replace.pyx
@@ -15,6 +15,14 @@ from pylibcudf.libcudf.replace import \
 from .column cimport Column
 from .scalar cimport Scalar
 
+__all__ = [
+    "ReplacePolicy",
+    "clamp",
+    "find_and_replace_all",
+    "normalize_nans_and_zeros",
+    "replace_nulls",
+]
+
 
 cpdef Column replace_nulls(Column source_column, ReplacementType replacement):
     """Replace nulls in source_column.
diff --git a/python/pylibcudf/pylibcudf/reshape.pyi b/python/pylibcudf/pylibcudf/reshape.pyi
new file mode 100644
index 00000000000..d8d0ffcc3e0
--- /dev/null
+++ b/python/pylibcudf/pylibcudf/reshape.pyi
@@ -0,0 +1,7 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+from pylibcudf.column import Column
+from pylibcudf.table import Table
+
+def interleave_columns(source_table: Table) -> Column: ...
+def tile(source_table: Table, count: int) -> Table: ...
diff --git a/python/pylibcudf/pylibcudf/reshape.pyx b/python/pylibcudf/pylibcudf/reshape.pyx
index 6540b5198ab..bdc212a1985 100644
--- a/python/pylibcudf/pylibcudf/reshape.pyx
+++ b/python/pylibcudf/pylibcudf/reshape.pyx
@@ -13,6 +13,7 @@ from pylibcudf.libcudf.types cimport size_type
 from .column cimport Column
 from .table cimport Table
 
+__all__ = ["interleave_columns", "tile"]
 
 cpdef Column interleave_columns(Table source_table):
     """Interleave columns of a table into a single column.
diff --git a/python/pylibcudf/pylibcudf/rolling.pyi b/python/pylibcudf/pylibcudf/rolling.pyi
new file mode 100644
index 00000000000..ca0111e01ec
--- /dev/null
+++ b/python/pylibcudf/pylibcudf/rolling.pyi
@@ -0,0 +1,12 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+from pylibcudf.aggregation import Aggregation
+from pylibcudf.column import Column
+
+def rolling_window[WindowType: (Column, int)](
+    source: Column,
+    preceding_window: WindowType,
+    following_window: WindowType,
+    min_periods: int,
+    agg: Aggregation,
+) -> Column: ...
diff --git a/python/pylibcudf/pylibcudf/rolling.pyx b/python/pylibcudf/pylibcudf/rolling.pyx
index 4fd0b005431..11acf57ccf4 100644
--- a/python/pylibcudf/pylibcudf/rolling.pyx
+++ b/python/pylibcudf/pylibcudf/rolling.pyx
@@ -11,6 +11,7 @@ from pylibcudf.libcudf.types cimport size_type
 from .aggregation cimport Aggregation
 from .column cimport Column
 
+__all__ = ["rolling_window"]
 
 cpdef Column rolling_window(
     Column source,
diff --git a/python/pylibcudf/pylibcudf/round.pyi b/python/pylibcudf/pylibcudf/round.pyi
new file mode 100644
index 00000000000..410cf5de586
--- /dev/null
+++ b/python/pylibcudf/pylibcudf/round.pyi
@@ -0,0 +1,15 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+from enum import IntEnum
+
+from pylibcudf.column import Column
+
+class RoundingMethod(IntEnum):
+    HALF_UP = ...
+    HALF_EVEN = ...
+
+def round(
+    source: Column,
+    decimal_places: int = 0,
+    round_method: RoundingMethod = RoundingMethod.HALF_UP,
+) -> Column: ...
diff --git a/python/pylibcudf/pylibcudf/round.pyx b/python/pylibcudf/pylibcudf/round.pyx
index 689363e652d..09e5a9cc3bc 100644
--- a/python/pylibcudf/pylibcudf/round.pyx
+++ b/python/pylibcudf/pylibcudf/round.pyx
@@ -11,6 +11,7 @@ from pylibcudf.libcudf.column.column cimport column
 
 from .column cimport Column
 
+__all__ = ["RoundingMethod", "round"]
 
 cpdef Column round(
     Column source,
diff --git a/python/pylibcudf/pylibcudf/scalar.pyi b/python/pylibcudf/pylibcudf/scalar.pyi
new file mode 100644
index 00000000000..0b72b10ef86
--- /dev/null
+++ b/python/pylibcudf/pylibcudf/scalar.pyi
@@ -0,0 +1,10 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+from pylibcudf.column import Column
+from pylibcudf.types import DataType
+
+class Scalar:
+    def type(self) -> DataType: ...
+    def is_valid(self) -> bool: ...
+    @staticmethod
+    def empty_like(column: Column) -> Scalar: ...
diff --git a/python/pylibcudf/pylibcudf/scalar.pyx b/python/pylibcudf/pylibcudf/scalar.pyx
index d4888a62ad1..1ac014e891e 100644
--- a/python/pylibcudf/pylibcudf/scalar.pyx
+++ b/python/pylibcudf/pylibcudf/scalar.pyx
@@ -11,6 +11,8 @@ from rmm.pylibrmm.memory_resource cimport get_current_device_resource
 from .column cimport Column
 from .types cimport DataType
 
+__all__ = ["Scalar"]
+
 
 # The DeviceMemoryResource attribute could be released prematurely
 # by the gc if the Scalar is in a reference cycle. Removing the tp_clear
@@ -37,6 +39,8 @@ cdef class Scalar:
         # DeviceScalar.
         raise ValueError("Scalar should be constructed with a factory")
 
+    __hash__ = None
+
     cdef const scalar* get(self) noexcept nogil:
         return self.c_obj.get()
 
diff --git a/python/pylibcudf/pylibcudf/search.pyi b/python/pylibcudf/pylibcudf/search.pyi
new file mode 100644
index 00000000000..7f292b129b2
--- /dev/null
+++ b/python/pylibcudf/pylibcudf/search.pyi
@@ -0,0 +1,19 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+from pylibcudf.column import Column
+from pylibcudf.table import Table
+from pylibcudf.types import NullOrder, Order
+
+def lower_bound(
+    haystack: Table,
+    needles: Table,
+    column_order: list[Order],
+    null_precedence: list[NullOrder],
+) -> Column: ...
+def upper_bound(
+    haystack: Table,
+    needles: Table,
+    column_order: list[Order],
+    null_precedence: list[NullOrder],
+) -> Column: ...
+def contains(haystack: Column, needles: Column) -> Column: ...
diff --git a/python/pylibcudf/pylibcudf/search.pyx b/python/pylibcudf/pylibcudf/search.pyx
index 1a870248046..50353fcd0cc 100644
--- a/python/pylibcudf/pylibcudf/search.pyx
+++ b/python/pylibcudf/pylibcudf/search.pyx
@@ -10,6 +10,7 @@ from pylibcudf.libcudf.types cimport null_order, order
 from .column cimport Column
 from .table cimport Table
 
+__all__ = ["contains", "lower_bound", "upper_bound"]
 
 cpdef Column lower_bound(
     Table haystack,
diff --git a/python/pylibcudf/pylibcudf/sorting.pyi b/python/pylibcudf/pylibcudf/sorting.pyi
new file mode 100644
index 00000000000..5255d869a4d
--- /dev/null
+++ b/python/pylibcudf/pylibcudf/sorting.pyi
@@ -0,0 +1,64 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+from pylibcudf.aggregation import RankMethod
+from pylibcudf.column import Column
+from pylibcudf.table import Table
+from pylibcudf.types import NullOrder, NullPolicy, Order
+
+def sorted_order(
+    source_table: Table,
+    column_order: list[Order],
+    null_precedence: list[NullOrder],
+) -> Column: ...
+def stable_sorted_order(
+    source_table: Table,
+    column_order: list[Order],
+    null_precedence: list[NullOrder],
+) -> Column: ...
+def rank(
+    input_view: Column,
+    method: RankMethod,
+    column_order: Order,
+    null_handling: NullPolicy,
+    null_precedence: NullOrder,
+    percentage: bool,
+) -> Column: ...
+def is_sorted(
+    tbl: Table, column_order: list[Order], null_precedence: list[NullOrder]
+) -> bool: ...
+def segmented_sort_by_key(
+    values: Table,
+    keys: Table,
+    segment_offsets: Column,
+    column_order: list[Order],
+    null_precedence: list[NullOrder],
+) -> Table: ...
+def stable_segmented_sort_by_key(
+    values: Table,
+    keys: Table,
+    segment_offsets: Column,
+    column_order: list[Order],
+    null_precedence: list[NullOrder],
+) -> Table: ...
+def sort_by_key(
+    values: Table,
+    keys: Table,
+    column_order: list[Order],
+    null_precedence: list[NullOrder],
+) -> Table: ...
+def stable_sort_by_key(
+    values: Table,
+    keys: Table,
+    column_order: list[Order],
+    null_precedence: list[NullOrder],
+) -> Table: ...
+def sort(
+    source_table: Table,
+    column_order: list[Order],
+    null_precedence: list[NullOrder],
+) -> Table: ...
+def stable_sort(
+    source_table: Table,
+    column_order: list[Order],
+    null_precedence: list[NullOrder],
+) -> Table: ...
diff --git a/python/pylibcudf/pylibcudf/sorting.pyx b/python/pylibcudf/pylibcudf/sorting.pyx
index fc40f03e1fd..fb29ef8c571 100644
--- a/python/pylibcudf/pylibcudf/sorting.pyx
+++ b/python/pylibcudf/pylibcudf/sorting.pyx
@@ -12,6 +12,18 @@ from pylibcudf.libcudf.types cimport null_order, null_policy, order
 from .column cimport Column
 from .table cimport Table
 
+__all__ = [
+    "is_sorted",
+    "rank",
+    "segmented_sort_by_key",
+    "sort",
+    "sort_by_key",
+    "sorted_order",
+    "stable_segmented_sort_by_key",
+    "stable_sort",
+    "stable_sort_by_key",
+    "stable_sorted_order",
+]
 
 cpdef Column sorted_order(Table source_table, list column_order, list null_precedence):
     """Computes the row indices required to sort the table.
diff --git a/python/pylibcudf/pylibcudf/stream_compaction.pxd b/python/pylibcudf/pylibcudf/stream_compaction.pxd
index a4f39792f0c..a20a23e2e58 100644
--- a/python/pylibcudf/pylibcudf/stream_compaction.pxd
+++ b/python/pylibcudf/pylibcudf/stream_compaction.pxd
@@ -17,6 +17,8 @@ cpdef Table drop_nulls(Table source_table, list keys, size_type keep_threshold)
 
 cpdef Table drop_nans(Table source_table, list keys, size_type keep_threshold)
 
+cpdef Table apply_boolean_mask(Table source_table, Column boolean_mask)
+
 cpdef Table unique(
     Table input,
     list keys,
diff --git a/python/pylibcudf/pylibcudf/stream_compaction.pyi b/python/pylibcudf/pylibcudf/stream_compaction.pyi
new file mode 100644
index 00000000000..99cade48309
--- /dev/null
+++ b/python/pylibcudf/pylibcudf/stream_compaction.pyi
@@ -0,0 +1,53 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+from enum import IntEnum
+
+from pylibcudf.column import Column
+from pylibcudf.table import Table
+from pylibcudf.types import NanEquality, NanPolicy, NullEquality, NullPolicy
+
+class DuplicateKeepOption(IntEnum):
+    KEEP_ANY = ...
+    KEEP_FIRST = ...
+    KEEP_LAST = ...
+    KEEP_NONE = ...
+
+def drop_nulls(
+    source_table: Table, keys: list[int], keep_threshold: int
+) -> Table: ...
+def drop_nans(
+    source_table: Table, keys: list[int], keep_threshold: int
+) -> Table: ...
+def apply_boolean_mask(source_table: Table, boolean_mask: Column) -> Table: ...
+def unique(
+    input: Table,
+    keys: list[int],
+    keep: DuplicateKeepOption,
+    nulls_equal: NullEquality,
+) -> Table: ...
+def distinct(
+    input: Table,
+    keys: list[int],
+    keep: DuplicateKeepOption,
+    nulls_equal: NullEquality,
+    nans_equal: NanEquality,
+) -> Table: ...
+def distinct_indices(
+    input: Table,
+    keep: DuplicateKeepOption,
+    nulls_equal: NullEquality,
+    nans_equal: NanEquality,
+) -> Column: ...
+def stable_distinct(
+    input: Table,
+    keys: list[int],
+    keep: DuplicateKeepOption,
+    nulls_equal: NullEquality,
+    nans_equal: NanEquality,
+) -> Table: ...
+def unique_count(
+    source: Column, null_handling: NullPolicy, nan_handling: NanPolicy
+) -> int: ...
+def distinct_count(
+    source: Column, null_handling: NullPolicy, nan_handling: NanPolicy
+) -> int: ...
diff --git a/python/pylibcudf/pylibcudf/stream_compaction.pyx b/python/pylibcudf/pylibcudf/stream_compaction.pyx
index 2145398a191..6e403ca1b07 100644
--- a/python/pylibcudf/pylibcudf/stream_compaction.pyx
+++ b/python/pylibcudf/pylibcudf/stream_compaction.pyx
@@ -21,6 +21,18 @@ from pylibcudf.libcudf.stream_compaction import \
 from .column cimport Column
 from .table cimport Table
 
+__all__ = [
+    "DuplicateKeepOption",
+    "apply_boolean_mask",
+    "distinct",
+    "distinct_count",
+    "distinct_indices",
+    "drop_nans",
+    "drop_nulls",
+    "stable_distinct",
+    "unique",
+    "unique_count",
+]
 
 cpdef Table drop_nulls(Table source_table, list keys, size_type keep_threshold):
     """Filters out rows from the input table based on the presence of nulls.
diff --git a/python/pylibcudf/pylibcudf/strings/__init__.py b/python/pylibcudf/pylibcudf/strings/__init__.py
index fa7294c7dbd..67054f0b447 100644
--- a/python/pylibcudf/pylibcudf/strings/__init__.py
+++ b/python/pylibcudf/pylibcudf/strings/__init__.py
@@ -28,6 +28,7 @@
 from .side_type import SideType
 
 __all__ = [
+    "SideType",
     "attributes",
     "capitalize",
     "case",
@@ -46,9 +47,8 @@
     "replace",
     "replace_re",
     "slice",
-    "strip",
     "split",
-    "SideType",
+    "strip",
     "translate",
     "wrap",
 ]
diff --git a/python/pylibcudf/pylibcudf/strings/attributes.pyi b/python/pylibcudf/pylibcudf/strings/attributes.pyi
new file mode 100644
index 00000000000..7fd5c9773d4
--- /dev/null
+++ b/python/pylibcudf/pylibcudf/strings/attributes.pyi
@@ -0,0 +1,7 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+from pylibcudf.column import Column
+
+def count_characters(source_strings: Column) -> Column: ...
+def count_bytes(source_strings: Column) -> Column: ...
+def code_points(source_strings: Column) -> Column: ...
diff --git a/python/pylibcudf/pylibcudf/strings/attributes.pyx b/python/pylibcudf/pylibcudf/strings/attributes.pyx
index 8e46a32835d..f1eb09b4965 100644
--- a/python/pylibcudf/pylibcudf/strings/attributes.pyx
+++ b/python/pylibcudf/pylibcudf/strings/attributes.pyx
@@ -6,6 +6,7 @@ from pylibcudf.column cimport Column
 from pylibcudf.libcudf.column.column cimport column
 from pylibcudf.libcudf.strings cimport attributes as cpp_attributes
 
+__all__ = ["code_points", "count_bytes", "count_characters"]
 
 cpdef Column count_characters(Column source_strings):
     """
diff --git a/python/pylibcudf/pylibcudf/strings/capitalize.pyi b/python/pylibcudf/pylibcudf/strings/capitalize.pyi
new file mode 100644
index 00000000000..5c6689418e2
--- /dev/null
+++ b/python/pylibcudf/pylibcudf/strings/capitalize.pyi
@@ -0,0 +1,12 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+from pylibcudf.column import Column
+from pylibcudf.scalar import Scalar
+from pylibcudf.strings.char_types import StringCharacterTypes
+
+def capitalize(input: Column, delimiters: Scalar | None = None) -> Column: ...
+def title(
+    input: Column,
+    sequence_type: StringCharacterTypes = StringCharacterTypes.ALPHA,
+) -> Column: ...
+def is_title(input: Column) -> Column: ...
diff --git a/python/pylibcudf/pylibcudf/strings/capitalize.pyx b/python/pylibcudf/pylibcudf/strings/capitalize.pyx
index 06b991c3cf1..a54480b8e4a 100644
--- a/python/pylibcudf/pylibcudf/strings/capitalize.pyx
+++ b/python/pylibcudf/pylibcudf/strings/capitalize.pyx
@@ -14,6 +14,7 @@ from pylibcudf.strings.char_types cimport string_character_types
 
 from cython.operator import dereference
 
+__all__ = ["capitalize", "is_title", "title"]
 
 cpdef Column capitalize(
     Column input,
diff --git a/python/pylibcudf/pylibcudf/strings/case.pyi b/python/pylibcudf/pylibcudf/strings/case.pyi
new file mode 100644
index 00000000000..4e50db4d1da
--- /dev/null
+++ b/python/pylibcudf/pylibcudf/strings/case.pyi
@@ -0,0 +1,7 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+from pylibcudf.column import Column
+
+def to_lower(input: Column) -> Column: ...
+def to_upper(input: Column) -> Column: ...
+def swapcase(input: Column) -> Column: ...
diff --git a/python/pylibcudf/pylibcudf/strings/case.pyx b/python/pylibcudf/pylibcudf/strings/case.pyx
index 9e6cd7717d3..d0e054bef72 100644
--- a/python/pylibcudf/pylibcudf/strings/case.pyx
+++ b/python/pylibcudf/pylibcudf/strings/case.pyx
@@ -6,6 +6,7 @@ from pylibcudf.column cimport Column
 from pylibcudf.libcudf.column.column cimport column
 from pylibcudf.libcudf.strings cimport case as cpp_case
 
+__all__ = ["swapcase", "to_lower", "to_upper"]
 
 cpdef Column to_lower(Column input):
     cdef unique_ptr[column] c_result
diff --git a/python/pylibcudf/pylibcudf/strings/char_types.pyi b/python/pylibcudf/pylibcudf/strings/char_types.pyi
new file mode 100644
index 00000000000..daa36cbb68d
--- /dev/null
+++ b/python/pylibcudf/pylibcudf/strings/char_types.pyi
@@ -0,0 +1,30 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+from enum import IntEnum
+
+from pylibcudf.column import Column
+from pylibcudf.scalar import Scalar
+
+class StringCharacterTypes(IntEnum):
+    DECIMAL = ...
+    NUMERIC = ...
+    DIGIT = ...
+    ALPHA = ...
+    SPACE = ...
+    UPPER = ...
+    LOWER = ...
+    ALPHANUM = ...
+    CASE_TYPES = ...
+    ALL_TYPES = ...
+
+def all_characters_of_type(
+    source_strings: Column,
+    types: StringCharacterTypes,
+    verify_types: StringCharacterTypes,
+) -> Column: ...
+def filter_characters_of_type(
+    source_strings: Column,
+    types_to_remove: StringCharacterTypes,
+    replacement: Scalar,
+    types_to_keep: StringCharacterTypes,
+) -> Column: ...
diff --git a/python/pylibcudf/pylibcudf/strings/char_types.pyx b/python/pylibcudf/pylibcudf/strings/char_types.pyx
index cb04efe5e8f..0af4a1f9c37 100644
--- a/python/pylibcudf/pylibcudf/strings/char_types.pyx
+++ b/python/pylibcudf/pylibcudf/strings/char_types.pyx
@@ -12,6 +12,11 @@ from cython.operator import dereference
 from pylibcudf.libcudf.strings.char_types import \
     string_character_types as StringCharacterTypes  # no-cython-lint
 
+__all__ = [
+    "StringCharacterTypes",
+    "all_characters_of_type",
+    "filter_characters_of_type",
+]
 
 cpdef Column all_characters_of_type(
     Column source_strings,
diff --git a/python/pylibcudf/pylibcudf/strings/combine.pyi b/python/pylibcudf/pylibcudf/strings/combine.pyi
new file mode 100644
index 00000000000..3094b20f141
--- /dev/null
+++ b/python/pylibcudf/pylibcudf/strings/combine.pyi
@@ -0,0 +1,34 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+from enum import IntEnum
+
+from pylibcudf.column import Column
+from pylibcudf.scalar import Scalar
+from pylibcudf.table import Table
+
+class SeparatorOnNulls(IntEnum):
+    YES = ...
+    NO = ...
+
+class OutputIfEmptyList(IntEnum):
+    EMPTY_STRING = ...
+    NULL_ELEMENT = ...
+
+def concatenate(
+    strings_columns: Table,
+    separator: Column | Scalar,
+    narep: Scalar | None = None,
+    col_narep: Scalar | None = None,
+    separate_nulls: SeparatorOnNulls = SeparatorOnNulls.YES,
+) -> Column: ...
+def join_strings(
+    input: Column, separator: Scalar, narep: Scalar
+) -> Column: ...
+def join_list_elements(
+    lists_strings_column: Column,
+    separator: Column | Scalar,
+    separator_narep: Scalar,
+    string_narep: Scalar,
+    separate_nulls: SeparatorOnNulls,
+    empty_list_policy: OutputIfEmptyList,
+) -> Column: ...
diff --git a/python/pylibcudf/pylibcudf/strings/combine.pyx b/python/pylibcudf/pylibcudf/strings/combine.pyx
index f17d5265ab4..dc1e72c799b 100644
--- a/python/pylibcudf/pylibcudf/strings/combine.pyx
+++ b/python/pylibcudf/pylibcudf/strings/combine.pyx
@@ -17,6 +17,13 @@ from pylibcudf.libcudf.strings.combine import \
 from pylibcudf.libcudf.strings.combine import \
     separator_on_nulls as SeparatorOnNulls  # no-cython-lint
 
+__all__ = [
+    "OutputIfEmptyList",
+    "SeparatorOnNulls",
+    "concatenate",
+    "join_list_elements",
+    "join_strings",
+]
 
 cpdef Column concatenate(
     Table strings_columns,
diff --git a/python/pylibcudf/pylibcudf/strings/contains.pyi b/python/pylibcudf/pylibcudf/strings/contains.pyi
new file mode 100644
index 00000000000..1f0620383b3
--- /dev/null
+++ b/python/pylibcudf/pylibcudf/strings/contains.pyi
@@ -0,0 +1,14 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+from pylibcudf.column import Column
+from pylibcudf.scalar import Scalar
+from pylibcudf.strings.regex_program import RegexProgram
+
+def contains_re(input: Column, prog: RegexProgram) -> Column: ...
+def count_re(input: Column, prog: RegexProgram) -> Column: ...
+def matches_re(input: Column, prog: RegexProgram) -> Column: ...
+def like(
+    input: Column,
+    pattern: Column | Scalar,
+    escape_character: Scalar | None = None,
+) -> Column: ...
diff --git a/python/pylibcudf/pylibcudf/strings/contains.pyx b/python/pylibcudf/pylibcudf/strings/contains.pyx
index d4b1130241d..7b4c53ed853 100644
--- a/python/pylibcudf/pylibcudf/strings/contains.pyx
+++ b/python/pylibcudf/pylibcudf/strings/contains.pyx
@@ -12,6 +12,7 @@ from pylibcudf.libcudf.scalar.scalar_factories cimport (
 from pylibcudf.libcudf.strings cimport contains as cpp_contains
 from pylibcudf.strings.regex_program cimport RegexProgram
 
+__all__ = ["contains_re", "count_re", "like", "matches_re"]
 
 cpdef Column contains_re(
     Column input,
diff --git a/python/pylibcudf/pylibcudf/strings/convert/__init__.py b/python/pylibcudf/pylibcudf/strings/convert/__init__.py
index aa27a7c8929..08b5034456e 100644
--- a/python/pylibcudf/pylibcudf/strings/convert/__init__.py
+++ b/python/pylibcudf/pylibcudf/strings/convert/__init__.py
@@ -10,3 +10,15 @@
     convert_lists,
     convert_urls,
 )
+
+__all__ = [
+    "convert_booleans",
+    "convert_datetime",
+    "convert_durations",
+    "convert_fixed_point",
+    "convert_floats",
+    "convert_integers",
+    "convert_ipv4",
+    "convert_lists",
+    "convert_urls",
+]
diff --git a/python/pylibcudf/pylibcudf/strings/convert/convert_booleans.pyi b/python/pylibcudf/pylibcudf/strings/convert/convert_booleans.pyi
new file mode 100644
index 00000000000..77c09242e9a
--- /dev/null
+++ b/python/pylibcudf/pylibcudf/strings/convert/convert_booleans.pyi
@@ -0,0 +1,9 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+from pylibcudf.column import Column
+from pylibcudf.scalar import Scalar
+
+def to_booleans(input: Column, true_string: Scalar) -> Column: ...
+def from_booleans(
+    booleans: Column, true_string: Scalar, false_string: Scalar
+) -> Column: ...
diff --git a/python/pylibcudf/pylibcudf/strings/convert/convert_booleans.pyx b/python/pylibcudf/pylibcudf/strings/convert/convert_booleans.pyx
index dc12b291b11..1899a3b27cc 100644
--- a/python/pylibcudf/pylibcudf/strings/convert/convert_booleans.pyx
+++ b/python/pylibcudf/pylibcudf/strings/convert/convert_booleans.pyx
@@ -12,6 +12,7 @@ from pylibcudf.scalar cimport Scalar
 
 from cython.operator import dereference
 
+__all__ = ["from_booleans", "to_booleans"]
 
 cpdef Column to_booleans(Column input, Scalar true_string):
     """
diff --git a/python/pylibcudf/pylibcudf/strings/convert/convert_datetime.pyi b/python/pylibcudf/pylibcudf/strings/convert/convert_datetime.pyi
new file mode 100644
index 00000000000..c6857169765
--- /dev/null
+++ b/python/pylibcudf/pylibcudf/strings/convert/convert_datetime.pyi
@@ -0,0 +1,12 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+from pylibcudf.column import Column
+from pylibcudf.types import DataType
+
+def to_timestamps(
+    input: Column, timestamp_type: DataType, format: str
+) -> Column: ...
+def from_timestamps(
+    timestamps: Column, format: str, input_strings_names: Column
+) -> Column: ...
+def is_timestamp(input: Column, format: str) -> Column: ...
diff --git a/python/pylibcudf/pylibcudf/strings/convert/convert_datetime.pyx b/python/pylibcudf/pylibcudf/strings/convert/convert_datetime.pyx
index 0ee60812e00..f1cd684166c 100644
--- a/python/pylibcudf/pylibcudf/strings/convert/convert_datetime.pyx
+++ b/python/pylibcudf/pylibcudf/strings/convert/convert_datetime.pyx
@@ -11,6 +11,7 @@ from pylibcudf.libcudf.strings.convert cimport (
 
 from pylibcudf.types import DataType
 
+__all__ = ["from_timestamps", "is_timestamp", "to_timestamps"]
 
 cpdef Column to_timestamps(
     Column input,
diff --git a/python/pylibcudf/pylibcudf/strings/convert/convert_durations.pyi b/python/pylibcudf/pylibcudf/strings/convert/convert_durations.pyi
new file mode 100644
index 00000000000..a5787a5fe49
--- /dev/null
+++ b/python/pylibcudf/pylibcudf/strings/convert/convert_durations.pyi
@@ -0,0 +1,9 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+from pylibcudf.column import Column
+from pylibcudf.types import DataType
+
+def to_durations(
+    input: Column, duration_type: DataType, format: str
+) -> Column: ...
+def from_durations(durations: Column, format: str | None = None) -> Column: ...
diff --git a/python/pylibcudf/pylibcudf/strings/convert/convert_durations.pyx b/python/pylibcudf/pylibcudf/strings/convert/convert_durations.pyx
index 31980ace418..a9654afd00a 100644
--- a/python/pylibcudf/pylibcudf/strings/convert/convert_durations.pyx
+++ b/python/pylibcudf/pylibcudf/strings/convert/convert_durations.pyx
@@ -11,6 +11,7 @@ from pylibcudf.libcudf.strings.convert cimport (
 
 from pylibcudf.types import DataType
 
+__all__ = ["from_durations", "to_durations"]
 
 cpdef Column to_durations(
     Column input,
diff --git a/python/pylibcudf/pylibcudf/strings/convert/convert_fixed_point.pyi b/python/pylibcudf/pylibcudf/strings/convert/convert_fixed_point.pyi
new file mode 100644
index 00000000000..1192d3dfcd6
--- /dev/null
+++ b/python/pylibcudf/pylibcudf/strings/convert/convert_fixed_point.pyi
@@ -0,0 +1,10 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+from pylibcudf.column import Column
+from pylibcudf.types import DataType
+
+def to_fixed_point(input: Column, output_type: DataType) -> Column: ...
+def from_fixed_point(input: Column) -> Column: ...
+def is_fixed_point(
+    input: Column, decimal_type: DataType | None = None
+) -> Column: ...
diff --git a/python/pylibcudf/pylibcudf/strings/convert/convert_fixed_point.pyx b/python/pylibcudf/pylibcudf/strings/convert/convert_fixed_point.pyx
index 962a47dfadf..00cbc822f36 100644
--- a/python/pylibcudf/pylibcudf/strings/convert/convert_fixed_point.pyx
+++ b/python/pylibcudf/pylibcudf/strings/convert/convert_fixed_point.pyx
@@ -9,6 +9,8 @@ from pylibcudf.libcudf.strings.convert cimport (
 )
 from pylibcudf.types cimport DataType, type_id
 
+__all__ = ["from_fixed_point", "is_fixed_point", "to_fixed_point"]
+
 
 cpdef Column to_fixed_point(Column input, DataType output_type):
     """
diff --git a/python/pylibcudf/pylibcudf/strings/convert/convert_floats.pyi b/python/pylibcudf/pylibcudf/strings/convert/convert_floats.pyi
new file mode 100644
index 00000000000..ddf4042e10d
--- /dev/null
+++ b/python/pylibcudf/pylibcudf/strings/convert/convert_floats.pyi
@@ -0,0 +1,8 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+from pylibcudf.column import Column
+from pylibcudf.types import DataType
+
+def to_floats(strings: Column, output_type: DataType) -> Column: ...
+def from_floats(floats: Column) -> Column: ...
+def is_float(input: Column) -> Column: ...
diff --git a/python/pylibcudf/pylibcudf/strings/convert/convert_floats.pyx b/python/pylibcudf/pylibcudf/strings/convert/convert_floats.pyx
index 1296f4f9db5..b5199aac577 100644
--- a/python/pylibcudf/pylibcudf/strings/convert/convert_floats.pyx
+++ b/python/pylibcudf/pylibcudf/strings/convert/convert_floats.pyx
@@ -9,6 +9,7 @@ from pylibcudf.libcudf.strings.convert cimport (
 )
 from pylibcudf.types cimport DataType
 
+__all__ = ["from_floats", "is_float", "to_floats"]
 
 cpdef Column to_floats(Column strings, DataType output_type):
     """
diff --git a/python/pylibcudf/pylibcudf/strings/convert/convert_integers.pyi b/python/pylibcudf/pylibcudf/strings/convert/convert_integers.pyi
new file mode 100644
index 00000000000..b96226fba90
--- /dev/null
+++ b/python/pylibcudf/pylibcudf/strings/convert/convert_integers.pyi
@@ -0,0 +1,11 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+from pylibcudf.column import Column
+from pylibcudf.types import DataType
+
+def to_integers(input: Column, output_type: DataType) -> Column: ...
+def from_integers(integers: Column) -> Column: ...
+def is_integer(input: Column, int_type: DataType | None = None) -> Column: ...
+def hex_to_integers(input: Column, output_type: DataType) -> Column: ...
+def is_hex(input: Column) -> Column: ...
+def integers_to_hex(input: Column) -> Column: ...
diff --git a/python/pylibcudf/pylibcudf/strings/convert/convert_integers.pyx b/python/pylibcudf/pylibcudf/strings/convert/convert_integers.pyx
index 5558683a502..12984e15ce9 100644
--- a/python/pylibcudf/pylibcudf/strings/convert/convert_integers.pyx
+++ b/python/pylibcudf/pylibcudf/strings/convert/convert_integers.pyx
@@ -9,6 +9,14 @@ from pylibcudf.libcudf.strings.convert cimport (
 )
 from pylibcudf.types cimport DataType
 
+__all__ = [
+    "from_integers",
+    "hex_to_integers",
+    "integers_to_hex",
+    "is_hex",
+    "is_integer",
+    "to_integers",
+]
 
 cpdef Column to_integers(Column input, DataType output_type):
     """
diff --git a/python/pylibcudf/pylibcudf/strings/convert/convert_ipv4.pyi b/python/pylibcudf/pylibcudf/strings/convert/convert_ipv4.pyi
new file mode 100644
index 00000000000..b017b32598c
--- /dev/null
+++ b/python/pylibcudf/pylibcudf/strings/convert/convert_ipv4.pyi
@@ -0,0 +1,7 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+from pylibcudf.column import Column
+
+def ipv4_to_integers(input: Column) -> Column: ...
+def integers_to_ipv4(integers: Column) -> Column: ...
+def is_ipv4(input: Column) -> Column: ...
diff --git a/python/pylibcudf/pylibcudf/strings/convert/convert_ipv4.pyx b/python/pylibcudf/pylibcudf/strings/convert/convert_ipv4.pyx
index 834781f95f3..e7c6aae4fa8 100644
--- a/python/pylibcudf/pylibcudf/strings/convert/convert_ipv4.pyx
+++ b/python/pylibcudf/pylibcudf/strings/convert/convert_ipv4.pyx
@@ -6,6 +6,7 @@ from pylibcudf.column cimport Column
 from pylibcudf.libcudf.column.column cimport column
 from pylibcudf.libcudf.strings.convert cimport convert_ipv4 as cpp_convert_ipv4
 
+__all__ = ["integers_to_ipv4", "ipv4_to_integers", "is_ipv4"]
 
 cpdef Column ipv4_to_integers(Column input):
     """
diff --git a/python/pylibcudf/pylibcudf/strings/convert/convert_lists.pyi b/python/pylibcudf/pylibcudf/strings/convert/convert_lists.pyi
new file mode 100644
index 00000000000..6ab3a4183e9
--- /dev/null
+++ b/python/pylibcudf/pylibcudf/strings/convert/convert_lists.pyi
@@ -0,0 +1,10 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+from pylibcudf.column import Column
+from pylibcudf.scalar import Scalar
+
+def format_list_column(
+    input: Column,
+    na_rep: Scalar | None = None,
+    separators: Column | None = None,
+) -> Column: ...
diff --git a/python/pylibcudf/pylibcudf/strings/convert/convert_lists.pyx b/python/pylibcudf/pylibcudf/strings/convert/convert_lists.pyx
index cbfe5f5aa8b..518f72f6644 100644
--- a/python/pylibcudf/pylibcudf/strings/convert/convert_lists.pyx
+++ b/python/pylibcudf/pylibcudf/strings/convert/convert_lists.pyx
@@ -17,6 +17,7 @@ from pylibcudf.types cimport type_id
 
 from cython.operator import dereference
 
+__all__ = ["format_list_column"]
 
 cpdef Column format_list_column(
     Column input,
diff --git a/python/pylibcudf/pylibcudf/strings/convert/convert_urls.pyi b/python/pylibcudf/pylibcudf/strings/convert/convert_urls.pyi
new file mode 100644
index 00000000000..49b8468957c
--- /dev/null
+++ b/python/pylibcudf/pylibcudf/strings/convert/convert_urls.pyi
@@ -0,0 +1,6 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+from pylibcudf.column import Column
+
+def url_encode(input: Column) -> Column: ...
+def url_decode(input: Column) -> Column: ...
diff --git a/python/pylibcudf/pylibcudf/strings/convert/convert_urls.pyx b/python/pylibcudf/pylibcudf/strings/convert/convert_urls.pyx
index 82f8a75f1d9..bd5e23bca43 100644
--- a/python/pylibcudf/pylibcudf/strings/convert/convert_urls.pyx
+++ b/python/pylibcudf/pylibcudf/strings/convert/convert_urls.pyx
@@ -6,6 +6,7 @@ from pylibcudf.column cimport Column
 from pylibcudf.libcudf.column.column cimport column
 from pylibcudf.libcudf.strings.convert cimport convert_urls as cpp_convert_urls
 
+__all__ = ["url_decode", "url_encode"]
 
 cpdef Column url_encode(Column input):
     """
diff --git a/python/pylibcudf/pylibcudf/strings/extract.pyi b/python/pylibcudf/pylibcudf/strings/extract.pyi
new file mode 100644
index 00000000000..4354bd3072d
--- /dev/null
+++ b/python/pylibcudf/pylibcudf/strings/extract.pyi
@@ -0,0 +1,8 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+from pylibcudf.column import Column
+from pylibcudf.strings.regex_program import RegexProgram
+from pylibcudf.table import Table
+
+def extract(input: Column, prog: RegexProgram) -> Table: ...
+def extract_all_record(input: Column, prog: RegexProgram) -> Column: ...
diff --git a/python/pylibcudf/pylibcudf/strings/extract.pyx b/python/pylibcudf/pylibcudf/strings/extract.pyx
index b56eccc8287..0ce70666e92 100644
--- a/python/pylibcudf/pylibcudf/strings/extract.pyx
+++ b/python/pylibcudf/pylibcudf/strings/extract.pyx
@@ -9,6 +9,7 @@ from pylibcudf.libcudf.table.table cimport table
 from pylibcudf.strings.regex_program cimport RegexProgram
 from pylibcudf.table cimport Table
 
+__all__ = ["extract", "extract_all_record"]
 
 cpdef Table extract(Column input, RegexProgram prog):
     """
diff --git a/python/pylibcudf/pylibcudf/strings/find.pyi b/python/pylibcudf/pylibcudf/strings/find.pyi
new file mode 100644
index 00000000000..3d04a9c3161
--- /dev/null
+++ b/python/pylibcudf/pylibcudf/strings/find.pyi
@@ -0,0 +1,14 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+from pylibcudf.column import Column
+from pylibcudf.scalar import Scalar
+
+def find(
+    input: Column, target: Column | Scalar, start: int = 0, stop: int = -1
+) -> Column: ...
+def rfind(
+    input: Column, target: Scalar, start: int = 0, stop: int = -1
+) -> Column: ...
+def contains(input: Column, target: Column | Scalar) -> Column: ...
+def starts_with(input: Column, target: Column | Scalar) -> Column: ...
+def ends_with(input: Column, target: Column | Scalar) -> Column: ...
diff --git a/python/pylibcudf/pylibcudf/strings/find.pyx b/python/pylibcudf/pylibcudf/strings/find.pyx
index 6fc6dca24fd..f0af339ff08 100644
--- a/python/pylibcudf/pylibcudf/strings/find.pyx
+++ b/python/pylibcudf/pylibcudf/strings/find.pyx
@@ -10,6 +10,7 @@ from cython.operator import dereference
 
 from pylibcudf.libcudf.scalar.scalar cimport string_scalar
 
+__all__ = ["contains", "ends_with", "find", "rfind", "starts_with"]
 
 cpdef Column find(
     Column input,
diff --git a/python/pylibcudf/pylibcudf/strings/find_multiple.pyi b/python/pylibcudf/pylibcudf/strings/find_multiple.pyi
new file mode 100644
index 00000000000..3d46fd2fa6d
--- /dev/null
+++ b/python/pylibcudf/pylibcudf/strings/find_multiple.pyi
@@ -0,0 +1,5 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+from pylibcudf.column import Column
+
+def find_multiple(input: Column, targets: Column) -> Column: ...
diff --git a/python/pylibcudf/pylibcudf/strings/find_multiple.pyx b/python/pylibcudf/pylibcudf/strings/find_multiple.pyx
index 672aa606bd0..c9ce734b4be 100644
--- a/python/pylibcudf/pylibcudf/strings/find_multiple.pyx
+++ b/python/pylibcudf/pylibcudf/strings/find_multiple.pyx
@@ -6,6 +6,7 @@ from pylibcudf.column cimport Column
 from pylibcudf.libcudf.column.column cimport column
 from pylibcudf.libcudf.strings cimport find_multiple as cpp_find_multiple
 
+__all__ = ["find_multiple"]
 
 cpdef Column find_multiple(Column input, Column targets):
     """
diff --git a/python/pylibcudf/pylibcudf/strings/findall.pyi b/python/pylibcudf/pylibcudf/strings/findall.pyi
new file mode 100644
index 00000000000..77e38581d22
--- /dev/null
+++ b/python/pylibcudf/pylibcudf/strings/findall.pyi
@@ -0,0 +1,7 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+from pylibcudf.column import Column
+from pylibcudf.strings.regex_program import RegexProgram
+
+def find_re(input: Column, pattern: RegexProgram) -> Column: ...
+def findall(input: Column, pattern: RegexProgram) -> Column: ...
diff --git a/python/pylibcudf/pylibcudf/strings/findall.pyx b/python/pylibcudf/pylibcudf/strings/findall.pyx
index 89fa4302824..23c84675a16 100644
--- a/python/pylibcudf/pylibcudf/strings/findall.pyx
+++ b/python/pylibcudf/pylibcudf/strings/findall.pyx
@@ -7,6 +7,7 @@ from pylibcudf.libcudf.column.column cimport column
 from pylibcudf.libcudf.strings cimport findall as cpp_findall
 from pylibcudf.strings.regex_program cimport RegexProgram
 
+__all__ = ["find_re", "findall"]
 
 cpdef Column findall(Column input, RegexProgram pattern):
     """
diff --git a/python/pylibcudf/pylibcudf/strings/padding.pyi b/python/pylibcudf/pylibcudf/strings/padding.pyi
new file mode 100644
index 00000000000..a991935e6e5
--- /dev/null
+++ b/python/pylibcudf/pylibcudf/strings/padding.pyi
@@ -0,0 +1,9 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+from pylibcudf.column import Column
+from pylibcudf.strings.side_type import SideType
+
+def pad(
+    input: Column, width: int, side: SideType, fill_char: str
+) -> Column: ...
+def zfill(input: Column, width: int) -> Column: ...
diff --git a/python/pylibcudf/pylibcudf/strings/padding.pyx b/python/pylibcudf/pylibcudf/strings/padding.pyx
index f6950eecf60..0e349a7be47 100644
--- a/python/pylibcudf/pylibcudf/strings/padding.pyx
+++ b/python/pylibcudf/pylibcudf/strings/padding.pyx
@@ -6,6 +6,7 @@ from pylibcudf.libcudf.column.column cimport column
 from pylibcudf.libcudf.strings cimport padding as cpp_padding
 from pylibcudf.libcudf.strings.side_type cimport side_type
 
+__all__ = ["pad", "zfill"]
 
 cpdef Column pad(Column input, size_type width, side_type side, str fill_char):
     """
diff --git a/python/pylibcudf/pylibcudf/strings/regex_flags.pyi b/python/pylibcudf/pylibcudf/strings/regex_flags.pyi
new file mode 100644
index 00000000000..c551cebf181
--- /dev/null
+++ b/python/pylibcudf/pylibcudf/strings/regex_flags.pyi
@@ -0,0 +1,7 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+from enum import IntEnum
+
+class RegexFlags(IntEnum):
+    DEFAULT = ...
+    MULTILINE = ...
+    DOTALL = ...
diff --git a/python/pylibcudf/pylibcudf/strings/regex_flags.pyx b/python/pylibcudf/pylibcudf/strings/regex_flags.pyx
index ce3b6b10a42..65b504e0dc7 100644
--- a/python/pylibcudf/pylibcudf/strings/regex_flags.pyx
+++ b/python/pylibcudf/pylibcudf/strings/regex_flags.pyx
@@ -2,3 +2,5 @@
 
 from pylibcudf.libcudf.strings.regex_flags import \
     regex_flags as RegexFlags  # no-cython-lint
+
+__all__ = ["RegexFlags"]
diff --git a/python/pylibcudf/pylibcudf/strings/regex_program.pyi b/python/pylibcudf/pylibcudf/strings/regex_program.pyi
new file mode 100644
index 00000000000..9abd6fa7802
--- /dev/null
+++ b/python/pylibcudf/pylibcudf/strings/regex_program.pyi
@@ -0,0 +1,8 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+from pylibcudf.strings.regex_flags import RegexFlags
+
+class RegexProgram:
+    def __init__(self): ...
+    @staticmethod
+    def create(pattern: str, flags: RegexFlags) -> RegexProgram: ...
diff --git a/python/pylibcudf/pylibcudf/strings/regex_program.pyx b/python/pylibcudf/pylibcudf/strings/regex_program.pyx
index 91f585cd637..46bfde074d2 100644
--- a/python/pylibcudf/pylibcudf/strings/regex_program.pyx
+++ b/python/pylibcudf/pylibcudf/strings/regex_program.pyx
@@ -11,6 +11,7 @@ from pylibcudf.strings.regex_flags import RegexFlags
 
 from pylibcudf.strings.regex_flags cimport regex_flags
 
+__all__ = ["RegexProgram"]
 
 cdef class RegexProgram:
     """Regex program class.
@@ -24,6 +25,8 @@ cdef class RegexProgram:
     def __init__(self, *args, **kwargs):
         raise ValueError("Do not instantiate RegexProgram directly, use create")
 
+    __hash__ = None
+
     @staticmethod
     def create(str pattern, int flags):
         """Create a program from a pattern.
diff --git a/python/pylibcudf/pylibcudf/strings/repeat.pyi b/python/pylibcudf/pylibcudf/strings/repeat.pyi
new file mode 100644
index 00000000000..93a46b71caa
--- /dev/null
+++ b/python/pylibcudf/pylibcudf/strings/repeat.pyi
@@ -0,0 +1,5 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+from pylibcudf.column import Column
+
+def repeat_strings(input: Column, repeat_times: Column | int) -> Column: ...
diff --git a/python/pylibcudf/pylibcudf/strings/repeat.pyx b/python/pylibcudf/pylibcudf/strings/repeat.pyx
index fb2bb13c666..a497b1f438e 100644
--- a/python/pylibcudf/pylibcudf/strings/repeat.pyx
+++ b/python/pylibcudf/pylibcudf/strings/repeat.pyx
@@ -6,6 +6,7 @@ from pylibcudf.libcudf.column.column cimport column
 from pylibcudf.libcudf.strings cimport repeat as cpp_repeat
 from pylibcudf.libcudf.types cimport size_type
 
+__all__ = ["repeat_strings"]
 
 cpdef Column repeat_strings(Column input, ColumnorSizeType repeat_times):
     """
diff --git a/python/pylibcudf/pylibcudf/strings/replace.pyi b/python/pylibcudf/pylibcudf/strings/replace.pyi
new file mode 100644
index 00000000000..64df09ef7e8
--- /dev/null
+++ b/python/pylibcudf/pylibcudf/strings/replace.pyi
@@ -0,0 +1,14 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+from pylibcudf.column import Column
+from pylibcudf.scalar import Scalar
+
+def replace(
+    input: Column, target: Scalar, repl: Scalar, maxrepl: int = -1
+) -> Column: ...
+def replace_multiple(
+    input: Column, target: Column, repl: Column, maxrepl: int = -1
+) -> Column: ...
+def replace_slice(
+    input: Column, repl: Scalar | None = None, start: int = 0, stop: int = -1
+) -> Column: ...
diff --git a/python/pylibcudf/pylibcudf/strings/replace.pyx b/python/pylibcudf/pylibcudf/strings/replace.pyx
index 2b94f5e3fee..3ba6c1b5530 100644
--- a/python/pylibcudf/pylibcudf/strings/replace.pyx
+++ b/python/pylibcudf/pylibcudf/strings/replace.pyx
@@ -16,6 +16,7 @@ from pylibcudf.libcudf.strings.replace cimport (
 from pylibcudf.libcudf.types cimport size_type
 from pylibcudf.scalar cimport Scalar
 
+__all__ = ["replace", "replace_multiple", "replace_slice"]
 
 cpdef Column replace(
     Column input,
diff --git a/python/pylibcudf/pylibcudf/strings/replace_re.pyi b/python/pylibcudf/pylibcudf/strings/replace_re.pyi
new file mode 100644
index 00000000000..056bafbf7ef
--- /dev/null
+++ b/python/pylibcudf/pylibcudf/strings/replace_re.pyi
@@ -0,0 +1,27 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+from typing import overload
+
+from pylibcudf.column import Column
+from pylibcudf.scalar import Scalar
+from pylibcudf.strings.regex_flags import RegexFlags
+from pylibcudf.strings.regex_program import RegexProgram
+
+@overload
+def replace_re(
+    input: Column,
+    pattern: RegexProgram,
+    replacement: Scalar,
+    max_replace_count: int = -1,
+) -> Column: ...
+@overload
+def replace_re(
+    input: Column,
+    patterns: list[str],
+    replacement: Column,
+    max_replace_count: int = -1,
+    flags: RegexFlags = RegexFlags.DEFAULT,
+) -> Column: ...
+def replace_with_backrefs(
+    input: Column, prog: RegexProgram, replacement: str
+) -> Column: ...
diff --git a/python/pylibcudf/pylibcudf/strings/replace_re.pyx b/python/pylibcudf/pylibcudf/strings/replace_re.pyx
index ccc33fd4425..bdabc779ddf 100644
--- a/python/pylibcudf/pylibcudf/strings/replace_re.pyx
+++ b/python/pylibcudf/pylibcudf/strings/replace_re.pyx
@@ -16,6 +16,7 @@ from pylibcudf.scalar cimport Scalar
 from pylibcudf.strings.regex_flags cimport regex_flags
 from pylibcudf.strings.regex_program cimport RegexProgram
 
+__all__ = ["replace_re", "replace_with_backrefs"]
 
 cpdef Column replace_re(
     Column input,
diff --git a/python/pylibcudf/pylibcudf/strings/side_type.pyi b/python/pylibcudf/pylibcudf/strings/side_type.pyi
new file mode 100644
index 00000000000..532edd60077
--- /dev/null
+++ b/python/pylibcudf/pylibcudf/strings/side_type.pyi
@@ -0,0 +1,7 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+from enum import IntEnum
+
+class SideType(IntEnum):
+    LEFT = ...
+    RIGHT = ...
+    BOTH = ...
diff --git a/python/pylibcudf/pylibcudf/strings/side_type.pyx b/python/pylibcudf/pylibcudf/strings/side_type.pyx
index cf0c770cc11..87db4206a9c 100644
--- a/python/pylibcudf/pylibcudf/strings/side_type.pyx
+++ b/python/pylibcudf/pylibcudf/strings/side_type.pyx
@@ -1,3 +1,5 @@
 # Copyright (c) 2024, NVIDIA CORPORATION.
 from pylibcudf.libcudf.strings.side_type import \
     side_type as SideType  # no-cython-lint
+
+__all__ = ["SideType"]
diff --git a/python/pylibcudf/pylibcudf/strings/slice.pyi b/python/pylibcudf/pylibcudf/strings/slice.pyi
new file mode 100644
index 00000000000..7bf9a7cb8c6
--- /dev/null
+++ b/python/pylibcudf/pylibcudf/strings/slice.pyi
@@ -0,0 +1,11 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+from pylibcudf.column import Column
+from pylibcudf.scalar import Scalar
+
+def slice_strings(
+    input: Column,
+    start: Column | Scalar | None = None,
+    stop: Column | Scalar | None = None,
+    step: Scalar | None = None,
+) -> Column: ...
diff --git a/python/pylibcudf/pylibcudf/strings/slice.pyx b/python/pylibcudf/pylibcudf/strings/slice.pyx
index 70d10cab36c..d32de7c50e0 100644
--- a/python/pylibcudf/pylibcudf/strings/slice.pyx
+++ b/python/pylibcudf/pylibcudf/strings/slice.pyx
@@ -14,6 +14,7 @@ from pylibcudf.scalar cimport Scalar
 
 from cython.operator import dereference
 
+__all__ = ["slice_strings"]
 
 cpdef Column slice_strings(
     Column input,
diff --git a/python/pylibcudf/pylibcudf/strings/split/__init__.py b/python/pylibcudf/pylibcudf/strings/split/__init__.py
index 2033e5e275b..db2a597882e 100644
--- a/python/pylibcudf/pylibcudf/strings/split/__init__.py
+++ b/python/pylibcudf/pylibcudf/strings/split/__init__.py
@@ -1,2 +1,4 @@
 # Copyright (c) 2024, NVIDIA CORPORATION.
 from . import partition, split
+
+__all__ = ["partition", "split"]
diff --git a/python/pylibcudf/pylibcudf/strings/split/partition.pyi b/python/pylibcudf/pylibcudf/strings/split/partition.pyi
new file mode 100644
index 00000000000..f19a463bd7e
--- /dev/null
+++ b/python/pylibcudf/pylibcudf/strings/split/partition.pyi
@@ -0,0 +1,8 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+from pylibcudf.column import Column
+from pylibcudf.scalar import Scalar
+from pylibcudf.table import Table
+
+def partition(input: Column, delimiter: Scalar | None = None) -> Table: ...
+def rpartition(input: Column, delimiter: Scalar | None = None) -> Table: ...
diff --git a/python/pylibcudf/pylibcudf/strings/split/partition.pyx b/python/pylibcudf/pylibcudf/strings/split/partition.pyx
index 0fb4f186c41..75537ea46d3 100644
--- a/python/pylibcudf/pylibcudf/strings/split/partition.pyx
+++ b/python/pylibcudf/pylibcudf/strings/split/partition.pyx
@@ -13,6 +13,7 @@ from pylibcudf.table cimport Table
 
 from cython.operator import dereference
 
+__all__ = ["partition", "rpartition"]
 
 cpdef Table partition(Column input, Scalar delimiter=None):
     """
diff --git a/python/pylibcudf/pylibcudf/strings/split/split.pyi b/python/pylibcudf/pylibcudf/strings/split/split.pyi
new file mode 100644
index 00000000000..3ccf0bc2a01
--- /dev/null
+++ b/python/pylibcudf/pylibcudf/strings/split/split.pyi
@@ -0,0 +1,27 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+from pylibcudf.column import Column
+from pylibcudf.scalar import Scalar
+from pylibcudf.strings.regex_program import RegexProgram
+from pylibcudf.table import Table
+
+def split(
+    strings_column: Column, delimiter: Scalar, maxsplit: int
+) -> Table: ...
+def rsplit(
+    strings_column: Column, delimiter: Scalar, maxsplit: int
+) -> Table: ...
+def split_record(
+    strings: Column, delimiter: Scalar, maxsplit: int
+) -> Column: ...
+def rsplit_record(
+    strings: Column, delimiter: Scalar, maxsplit: int
+) -> Column: ...
+def split_re(input: Column, prog: RegexProgram, maxsplit: int) -> Table: ...
+def rsplit_re(input: Column, prog: RegexProgram, maxsplit: int) -> Table: ...
+def split_record_re(
+    input: Column, prog: RegexProgram, maxsplit: int
+) -> Column: ...
+def rsplit_record_re(
+    input: Column, prog: RegexProgram, maxsplit: int
+) -> Column: ...
diff --git a/python/pylibcudf/pylibcudf/strings/split/split.pyx b/python/pylibcudf/pylibcudf/strings/split/split.pyx
index e3827f6645e..90087f996f0 100644
--- a/python/pylibcudf/pylibcudf/strings/split/split.pyx
+++ b/python/pylibcudf/pylibcudf/strings/split/split.pyx
@@ -13,6 +13,16 @@ from pylibcudf.table cimport Table
 
 from cython.operator import dereference
 
+__all__ = [
+    "rsplit",
+    "rsplit_re",
+    "rsplit_record",
+    "rsplit_record_re",
+    "split",
+    "split_re",
+    "split_record",
+    "split_record_re",
+]
 
 cpdef Table split(Column strings_column, Scalar delimiter, size_type maxsplit):
     """
diff --git a/python/pylibcudf/pylibcudf/strings/strip.pyi b/python/pylibcudf/pylibcudf/strings/strip.pyi
new file mode 100644
index 00000000000..680355fc88f
--- /dev/null
+++ b/python/pylibcudf/pylibcudf/strings/strip.pyi
@@ -0,0 +1,11 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+from pylibcudf.column import Column
+from pylibcudf.scalar import Scalar
+from pylibcudf.strings.side_type import SideType
+
+def strip(
+    input: Column,
+    side: SideType = SideType.BOTH,
+    to_strip: Scalar | None = None,
+) -> Column: ...
diff --git a/python/pylibcudf/pylibcudf/strings/strip.pyx b/python/pylibcudf/pylibcudf/strings/strip.pyx
index 429a23c3cdf..805d959891b 100644
--- a/python/pylibcudf/pylibcudf/strings/strip.pyx
+++ b/python/pylibcudf/pylibcudf/strings/strip.pyx
@@ -13,6 +13,7 @@ from pylibcudf.libcudf.strings cimport strip as cpp_strip
 from pylibcudf.scalar cimport Scalar
 from pylibcudf.strings.side_type cimport side_type
 
+__all__ = ["strip"]
 
 cpdef Column strip(
     Column input,
diff --git a/python/pylibcudf/pylibcudf/strings/translate.pyi b/python/pylibcudf/pylibcudf/strings/translate.pyi
new file mode 100644
index 00000000000..7158b6eb05c
--- /dev/null
+++ b/python/pylibcudf/pylibcudf/strings/translate.pyi
@@ -0,0 +1,20 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+from collections.abc import Mapping
+from enum import IntEnum
+
+from pylibcudf.column import Column
+from pylibcudf.scalar import Scalar
+
+class FilterType(IntEnum):
+    KEEP = ...
+    REMOVE = ...
+
+def translate(
+    input: Column, chars_table: Mapping[int | str, int | str]
+) -> Column: ...
+def filter_characters(
+    input: Column,
+    characters_to_filter: Mapping[int | str, int | str],
+    keep_characters: FilterType,
+    replacement: Scalar,
+) -> Column: ...
diff --git a/python/pylibcudf/pylibcudf/strings/translate.pyx b/python/pylibcudf/pylibcudf/strings/translate.pyx
index d85da8e6cdd..ba1e8dc5d27 100644
--- a/python/pylibcudf/pylibcudf/strings/translate.pyx
+++ b/python/pylibcudf/pylibcudf/strings/translate.pyx
@@ -14,6 +14,7 @@ from cython.operator import dereference
 from pylibcudf.libcudf.strings.translate import \
     filter_type as FilterType  # no-cython-lint
 
+__all__ = ["FilterType", "filter_characters", "translate"]
 
 cdef vector[pair[char_utf8, char_utf8]] _table_to_c_table(dict table):
     """
diff --git a/python/pylibcudf/pylibcudf/strings/wrap.pyi b/python/pylibcudf/pylibcudf/strings/wrap.pyi
new file mode 100644
index 00000000000..5658f279197
--- /dev/null
+++ b/python/pylibcudf/pylibcudf/strings/wrap.pyi
@@ -0,0 +1,5 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+from pylibcudf.column import Column
+
+def wrap(input: Column, width: int) -> Column: ...
diff --git a/python/pylibcudf/pylibcudf/strings/wrap.pyx b/python/pylibcudf/pylibcudf/strings/wrap.pyx
index 2ced250f837..b696eb48e47 100644
--- a/python/pylibcudf/pylibcudf/strings/wrap.pyx
+++ b/python/pylibcudf/pylibcudf/strings/wrap.pyx
@@ -7,6 +7,7 @@ from pylibcudf.libcudf.column.column cimport column
 from pylibcudf.libcudf.strings cimport wrap as cpp_wrap
 from pylibcudf.libcudf.types cimport size_type
 
+__all__ = ["wrap"]
 
 cpdef Column wrap(Column input, size_type width):
     """
diff --git a/python/pylibcudf/pylibcudf/table.pyi b/python/pylibcudf/pylibcudf/table.pyi
new file mode 100644
index 00000000000..5aef7e009c8
--- /dev/null
+++ b/python/pylibcudf/pylibcudf/table.pyi
@@ -0,0 +1,9 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+from pylibcudf.column import Column
+
+class Table:
+    def __init__(self, columns: list[Column]): ...
+    def num_columns(self) -> int: ...
+    def num_rows(self) -> int: ...
+    def columns(self) -> list[Column]: ...
diff --git a/python/pylibcudf/pylibcudf/table.pyx b/python/pylibcudf/pylibcudf/table.pyx
index d0d6f2343d0..0c1e88a927c 100644
--- a/python/pylibcudf/pylibcudf/table.pyx
+++ b/python/pylibcudf/pylibcudf/table.pyx
@@ -10,6 +10,7 @@ from pylibcudf.libcudf.table.table cimport table
 
 from .column cimport Column
 
+__all__ = ["Table"]
 
 cdef class Table:
     """A list of columns of the same size.
@@ -24,6 +25,8 @@ cdef class Table:
             raise ValueError("All columns must be pylibcudf Column objects")
         self._columns = columns
 
+    __hash__ = None
+
     cdef table_view view(self) nogil:
         """Generate a libcudf table_view to pass to libcudf algorithms.
 
diff --git a/python/pylibcudf/pylibcudf/tests/test_binaryops.py b/python/pylibcudf/pylibcudf/tests/test_binaryops.py
index bbb08e8b95a..a33122221f6 100644
--- a/python/pylibcudf/pylibcudf/tests/test_binaryops.py
+++ b/python/pylibcudf/pylibcudf/tests/test_binaryops.py
@@ -541,13 +541,6 @@ def py_shift_right_unsigned(x, y):
             plc.binaryop.BinaryOperator.LOGICAL_AND,
             pa.compute.and_,
         ),
-        (
-            "int64",
-            "int64",
-            "int64",
-            plc.binaryop.BinaryOperator.LOGICAL_AND,
-            pa.compute.and_,
-        ),
         (
             "int64",
             "int64",
@@ -562,13 +555,6 @@ def py_shift_right_unsigned(x, y):
             plc.binaryop.BinaryOperator.LOGICAL_OR,
             pa.compute.or_,
         ),
-        (
-            "int64",
-            "int64",
-            "int64",
-            plc.binaryop.BinaryOperator.LOGICAL_OR,
-            pa.compute.or_,
-        ),
         (
             "int64",
             "int64",
diff --git a/python/pylibcudf/pylibcudf/tests/test_labeling.py b/python/pylibcudf/pylibcudf/tests/test_labeling.py
index beacfc63ce5..946d583d1cc 100644
--- a/python/pylibcudf/pylibcudf/tests/test_labeling.py
+++ b/python/pylibcudf/pylibcudf/tests/test_labeling.py
@@ -6,8 +6,12 @@
 import pylibcudf as plc
 
 
-@pytest.mark.parametrize("left_inclusive", [True, False])
-@pytest.mark.parametrize("right_inclusive", [True, False])
+@pytest.mark.parametrize(
+    "left_inclusive", [plc.labeling.Inclusive.YES, plc.labeling.Inclusive.NO]
+)
+@pytest.mark.parametrize(
+    "right_inclusive", [plc.labeling.Inclusive.YES, plc.labeling.Inclusive.NO]
+)
 def test_label_bins(left_inclusive, right_inclusive):
     in_col = plc.interop.from_arrow(pa.array([1, 2, 3]))
     left_edges = plc.interop.from_arrow(pa.array([0, 5]))
diff --git a/python/pylibcudf/pylibcudf/tests/test_lists.py b/python/pylibcudf/pylibcudf/tests/test_lists.py
index f3ef555f11d..8c1229c2a04 100644
--- a/python/pylibcudf/pylibcudf/tests/test_lists.py
+++ b/python/pylibcudf/pylibcudf/tests/test_lists.py
@@ -62,12 +62,12 @@ def test_concatenate_rows(test_data):
     [
         (
             [[[1, 2], [3, 4], [5]], [[6], None, [7, 8, 9]]],
-            False,
+            plc.lists.ConcatenateNullPolicy.NULLIFY_OUTPUT_ROW,
             [[1, 2, 3, 4, 5], None],
         ),
         (
             [[[1, 2], [3, 4], [5, None]], [[6], [None], [7, 8, 9]]],
-            True,
+            plc.lists.ConcatenateNullPolicy.IGNORE,
             [[1, 2, 3, 4, 5, None], [6, None, 7, 8, 9]],
         ),
     ],
@@ -138,7 +138,9 @@ def test_index_of_scalar(list_column, scalar):
 
     plc_column = plc.interop.from_arrow(arr)
     plc_scalar = plc.interop.from_arrow(scalar)
-    res = plc.lists.index_of(plc_column, plc_scalar, True)
+    res = plc.lists.index_of(
+        plc_column, plc_scalar, plc.lists.DuplicateFindOption.FIND_FIRST
+    )
 
     expect = pa.array([1, -1, -1, -1], type=pa.int32())
 
@@ -150,7 +152,9 @@ def test_index_of_list_column(list_column, search_key_column):
     arr2, expect = search_key_column
     plc_column1 = plc.interop.from_arrow(arr1)
     plc_column2 = plc.interop.from_arrow(arr2)
-    res = plc.lists.index_of(plc_column1, plc_column2, True)
+    res = plc.lists.index_of(
+        plc_column1, plc_column2, plc.lists.DuplicateFindOption.FIND_FIRST
+    )
 
     expect = pa.array(search_key_column[1], type=pa.int32())
 
@@ -227,39 +231,34 @@ def test_sequences():
 
 
 @pytest.mark.parametrize(
-    "ascending,na_position,expected",
+    "order,na_position,expected",
     [
         (
-            True,
+            plc.types.Order.ASCENDING,
             plc.types.NullOrder.BEFORE,
             [[1, 2, 3, 4], [None, 1, 2, 4], [-10, 0, 10, 10]],
         ),
         (
-            True,
+            plc.types.Order.ASCENDING,
             plc.types.NullOrder.AFTER,
             [[1, 2, 3, 4], [1, 2, 4, None], [-10, 0, 10, 10]],
         ),
         (
-            False,
+            plc.types.Order.DESCENDING,
             plc.types.NullOrder.BEFORE,
             [[4, 3, 2, 1], [4, 2, 1, None], [10, 10, 0, -10]],
         ),
         (
-            False,
-            plc.types.NullOrder.AFTER,
-            [[4, 3, 2, 1], [None, 4, 2, 1], [10, 10, 0, -10]],
-        ),
-        (
-            False,
+            plc.types.Order.DESCENDING,
             plc.types.NullOrder.AFTER,
             [[4, 3, 2, 1], [None, 4, 2, 1], [10, 10, 0, -10]],
         ),
     ],
 )
-def test_sort_lists(lists_column, ascending, na_position, expected):
+def test_sort_lists(lists_column, order, na_position, expected):
     plc_column = plc.interop.from_arrow(pa.array(lists_column))
-    res = plc.lists.sort_lists(plc_column, ascending, na_position, False)
-    res_stable = plc.lists.sort_lists(plc_column, ascending, na_position, True)
+    res = plc.lists.sort_lists(plc_column, order, na_position, False)
+    res_stable = plc.lists.sort_lists(plc_column, order, na_position, True)
 
     expect = pa.array(expected)
 
@@ -272,44 +271,44 @@ def test_sort_lists(lists_column, ascending, na_position, expected):
     [
         (
             plc.lists.difference_distinct,
-            True,
-            True,
+            plc.types.NanEquality.ALL_EQUAL,
+            plc.types.NullEquality.EQUAL,
             [[], [1, 2, 3], None, [4, 5]],
         ),
         (
             plc.lists.difference_distinct,
-            False,
-            True,
+            plc.types.NanEquality.UNEQUAL,
+            plc.types.NullEquality.EQUAL,
             [[], [1, 2, 3], None, [4, None, 5]],
         ),
         (
             plc.lists.have_overlap,
-            True,
-            True,
+            plc.types.NanEquality.ALL_EQUAL,
+            plc.types.NullEquality.EQUAL,
             [True, False, None, True],
         ),
         (
             plc.lists.have_overlap,
-            False,
-            False,
+            plc.types.NanEquality.UNEQUAL,
+            plc.types.NullEquality.UNEQUAL,
             [True, False, None, False],
         ),
         (
             plc.lists.intersect_distinct,
-            True,
-            True,
+            plc.types.NanEquality.ALL_EQUAL,
+            plc.types.NullEquality.EQUAL,
             [[np.nan, 1, 2], [], None, [None]],
         ),
         (
             plc.lists.intersect_distinct,
-            True,
-            False,
+            plc.types.NanEquality.ALL_EQUAL,
+            plc.types.NullEquality.UNEQUAL,
             [[1, 2], [], None, [None]],
         ),
         (
             plc.lists.union_distinct,
-            False,
-            True,
+            plc.types.NanEquality.UNEQUAL,
+            plc.types.NullEquality.EQUAL,
             [
                 [np.nan, 2, 1, 3],
                 [1, 2, 3, 4, 5],
@@ -319,8 +318,8 @@ def test_sort_lists(lists_column, ascending, na_position, expected):
         ),
         (
             plc.lists.union_distinct,
-            False,
-            False,
+            plc.types.NanEquality.UNEQUAL,
+            plc.types.NullEquality.UNEQUAL,
             [
                 [np.nan, np.nan, 2, 1, np.nan, 3],
                 [1, 2, 3, 4, 5],
@@ -352,20 +351,24 @@ def test_set_operations(
 @pytest.mark.parametrize(
     "nans_equal,nulls_equal,expected",
     [
-        (True, True, [[np.nan, 0, 1, 2, 3], [3, 1, 2], None, [4, None, 5]]),
         (
-            False,
-            True,
+            plc.types.NanEquality.ALL_EQUAL,
+            plc.types.NullEquality.EQUAL,
+            [[np.nan, 0, 1, 2, 3], [3, 1, 2], None, [4, None, 5]],
+        ),
+        (
+            plc.types.NanEquality.UNEQUAL,
+            plc.types.NullEquality.EQUAL,
             [[np.nan, 0, 1, 2, 3], [3, 1, 2], None, [4, None, None, 5]],
         ),
         (
-            True,
-            False,
+            plc.types.NanEquality.ALL_EQUAL,
+            plc.types.NullEquality.UNEQUAL,
             [[np.nan, np.nan, 0, 1, 2, 3], [3, 1, 2], None, [4, None, 5]],
         ),
         (
-            False,
-            False,
+            plc.types.NanEquality.UNEQUAL,
+            plc.types.NullEquality.UNEQUAL,
             [
                 [np.nan, np.nan, 0, 1, 2, 3],
                 [3, 1, 2],
diff --git a/python/pylibcudf/pylibcudf/tests/test_string_attributes.py b/python/pylibcudf/pylibcudf/tests/test_string_attributes.py
index f461657281a..e85cd1cc443 100644
--- a/python/pylibcudf/pylibcudf/tests/test_string_attributes.py
+++ b/python/pylibcudf/pylibcudf/tests/test_string_attributes.py
@@ -8,7 +8,7 @@
 import pylibcudf as plc
 
 
-@pytest.fixture()
+@pytest.fixture
 def str_data():
     pa_data = pa.array(["A", None])
     return pa_data, plc.interop.from_arrow(pa_data)
diff --git a/python/pylibcudf/pylibcudf/traits.pyi b/python/pylibcudf/pylibcudf/traits.pyi
new file mode 100644
index 00000000000..fdb31a262cf
--- /dev/null
+++ b/python/pylibcudf/pylibcudf/traits.pyi
@@ -0,0 +1,23 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+from pylibcudf.types import DataType
+
+def is_relationally_comparable(typ: DataType) -> bool: ...
+def is_equality_comparable(typ: DataType) -> bool: ...
+def is_numeric(typ: DataType) -> bool: ...
+def is_numeric_not_bool(typ: DataType) -> bool: ...
+def is_index_type(typ: DataType) -> bool: ...
+def is_unsigned(typ: DataType) -> bool: ...
+def is_integral(typ: DataType) -> bool: ...
+def is_integral_not_bool(typ: DataType) -> bool: ...
+def is_floating_point(typ: DataType) -> bool: ...
+def is_boolean(typ: DataType) -> bool: ...
+def is_timestamp(typ: DataType) -> bool: ...
+def is_fixed_point(typ: DataType) -> bool: ...
+def is_duration(typ: DataType) -> bool: ...
+def is_chrono(typ: DataType) -> bool: ...
+def is_dictionary(typ: DataType) -> bool: ...
+def is_fixed_width(typ: DataType) -> bool: ...
+def is_compound(typ: DataType) -> bool: ...
+def is_nested(typ: DataType) -> bool: ...
+def is_bit_castable(source: DataType, target: DataType) -> bool: ...
diff --git a/python/pylibcudf/pylibcudf/traits.pyx b/python/pylibcudf/pylibcudf/traits.pyx
index 9c52e0ac1ab..3cf0a3a4b3b 100644
--- a/python/pylibcudf/pylibcudf/traits.pyx
+++ b/python/pylibcudf/pylibcudf/traits.pyx
@@ -5,6 +5,27 @@ from pylibcudf.libcudf.utilities cimport traits
 
 from .types cimport DataType
 
+__all__ = [
+    "is_bit_castable",
+    "is_boolean",
+    "is_chrono",
+    "is_compound",
+    "is_dictionary",
+    "is_duration",
+    "is_equality_comparable",
+    "is_fixed_point",
+    "is_fixed_width",
+    "is_floating_point",
+    "is_index_type",
+    "is_integral",
+    "is_integral_not_bool",
+    "is_nested",
+    "is_numeric",
+    "is_numeric_not_bool",
+    "is_relationally_comparable",
+    "is_timestamp",
+    "is_unsigned",
+]
 
 cpdef bool is_relationally_comparable(DataType typ):
     """Checks if the given data type supports relational comparisons.
diff --git a/python/pylibcudf/pylibcudf/transform.pyi b/python/pylibcudf/pylibcudf/transform.pyi
new file mode 100644
index 00000000000..5cbd2e635f0
--- /dev/null
+++ b/python/pylibcudf/pylibcudf/transform.pyi
@@ -0,0 +1,16 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+from pylibcudf.column import Column
+from pylibcudf.expressions import Expression
+from pylibcudf.gpumemoryview import gpumemoryview
+from pylibcudf.table import Table
+from pylibcudf.types import DataType
+
+def nans_to_nulls(input: Column) -> tuple[gpumemoryview, int]: ...
+def compute_column(input: Table, expr: Expression) -> Column: ...
+def bools_to_mask(input: Column) -> tuple[gpumemoryview, int]: ...
+def mask_to_bools(bitmask: int, begin_bit: int, end_bit: int) -> Column: ...
+def transform(
+    input: Column, unary_udf: str, output_type: DataType, is_ptx: bool
+) -> Column: ...
+def encode(input: Table) -> tuple[Table, Column]: ...
+def one_hot_encode(input: Column, categories: Column) -> Table: ...
diff --git a/python/pylibcudf/pylibcudf/transform.pyx b/python/pylibcudf/pylibcudf/transform.pyx
index e8d95cadb0c..9700bcff221 100644
--- a/python/pylibcudf/pylibcudf/transform.pyx
+++ b/python/pylibcudf/pylibcudf/transform.pyx
@@ -18,6 +18,15 @@ from .gpumemoryview cimport gpumemoryview
 from .types cimport DataType
 from .utils cimport int_to_bitmask_ptr
 
+__all__ = [
+    "bools_to_mask",
+    "compute_column",
+    "encode",
+    "mask_to_bools",
+    "nans_to_nulls",
+    "one_hot_encode",
+    "transform",
+]
 
 cpdef tuple[gpumemoryview, int] nans_to_nulls(Column input):
     """Create a null mask preserving existing nulls and converting nans to null.
diff --git a/python/pylibcudf/pylibcudf/transpose.pyi b/python/pylibcudf/pylibcudf/transpose.pyi
new file mode 100644
index 00000000000..a84ab8a60ea
--- /dev/null
+++ b/python/pylibcudf/pylibcudf/transpose.pyi
@@ -0,0 +1,4 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+from pylibcudf.table import Table
+
+def transpose(input_table: Table) -> Table: ...
diff --git a/python/pylibcudf/pylibcudf/transpose.pyx b/python/pylibcudf/pylibcudf/transpose.pyx
index a24f937ced3..5eb3e58cebc 100644
--- a/python/pylibcudf/pylibcudf/transpose.pyx
+++ b/python/pylibcudf/pylibcudf/transpose.pyx
@@ -9,6 +9,7 @@ from pylibcudf.libcudf.table.table_view cimport table_view
 from .column cimport Column
 from .table cimport Table
 
+__all__ = ["transpose"]
 
 cpdef Table transpose(Table input_table):
     """Transpose a Table.
diff --git a/python/pylibcudf/pylibcudf/types.pyi b/python/pylibcudf/pylibcudf/types.pyi
new file mode 100644
index 00000000000..c91a95414bd
--- /dev/null
+++ b/python/pylibcudf/pylibcudf/types.pyi
@@ -0,0 +1,86 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+from enum import IntEnum
+from typing import Final
+
+class Interpolation(IntEnum):
+    LINEAR = ...
+    LOWER = ...
+    HIGHER = ...
+    MIDPOINT = ...
+    NEAREST = ...
+
+class MaskState(IntEnum):
+    UNALLOCATED = ...
+    UNINITIALIZED = ...
+    ALL_VALID = ...
+    ALL_NULL = ...
+
+class NanEquality(IntEnum):
+    ALL_EQUAL = ...
+    UNEQUAL = ...
+
+class NanPolicy(IntEnum):
+    NAN_IS_NULL = ...
+    NAN_IS_VALID = ...
+
+class NullEquality(IntEnum):
+    EQUAL = ...
+    UNEQUAL = ...
+
+class NullOrder(IntEnum):
+    AFTER = ...
+    BEFORE = ...
+
+class NullPolicy(IntEnum):
+    EXCLUDE = ...
+    INCLUDE = ...
+
+class Order(IntEnum):
+    ASCENDING = ...
+    DESCENDING = ...
+
+class Sorted(IntEnum):
+    NO = ...
+    YES = ...
+
+class TypeId(IntEnum):
+    EMPTY = ...
+    INT8 = ...
+    INT16 = ...
+    INT32 = ...
+    INT64 = ...
+    UINT8 = ...
+    UINT16 = ...
+    UINT32 = ...
+    UINT64 = ...
+    FLOAT32 = ...
+    FLOAT64 = ...
+    BOOL8 = ...
+    TIMESTAMP_DAYS = ...
+    TIMESTAMP_SECONDS = ...
+    TIMESTAMP_MILLISECONDS = ...
+    TIMESTAMP_MICROSECONDS = ...
+    TIMESTAMP_NANOSECONDS = ...
+    DURATION_DAYS = ...
+    DURATION_SECONDS = ...
+    DURATION_MILLISECONDS = ...
+    DURATION_MICROSECONDS = ...
+    DURATION_NANOSECONDS = ...
+    DICTIONARY32 = ...
+    STRING = ...
+    LIST = ...
+    DECIMAL32 = ...
+    DECIMAL64 = ...
+    DECIMAL128 = ...
+    STRUCT = ...
+    NUM_TYPE_IDS = ...
+
+class DataType:
+    def __init__(self, type_id: TypeId, scale: int = 0): ...
+    def id(self) -> TypeId: ...
+    def scale(self) -> int: ...
+
+def size_of(t: DataType) -> int: ...
+
+SIZE_TYPE: Final[DataType]
+SIZE_TYPE_ID: Final[TypeId]
diff --git a/python/pylibcudf/pylibcudf/types.pyx b/python/pylibcudf/pylibcudf/types.pyx
index a0c31f994a3..afa1b56f38a 100644
--- a/python/pylibcudf/pylibcudf/types.pyx
+++ b/python/pylibcudf/pylibcudf/types.pyx
@@ -20,6 +20,22 @@ from pylibcudf.libcudf.types import null_order as NullOrder  # no-cython-lint, i
 from pylibcudf.libcudf.types import order as Order  # no-cython-lint, isort:skip
 from pylibcudf.libcudf.types import sorted as Sorted  # no-cython-lint, isort:skip
 
+__all__ = [
+    "DataType",
+    "Interpolation",
+    "MaskState",
+    "NanEquality",
+    "NanPolicy",
+    "NullEquality",
+    "NullOrder",
+    "NullPolicy",
+    "Order",
+    "SIZE_TYPE",
+    "SIZE_TYPE_ID",
+    "Sorted",
+    "TypeId",
+    "size_of"
+]
 
 cdef class DataType:
     """Indicator for the logical data type of an element in a column.
diff --git a/python/pylibcudf/pylibcudf/unary.pyi b/python/pylibcudf/pylibcudf/unary.pyi
new file mode 100644
index 00000000000..7aa23b618f4
--- /dev/null
+++ b/python/pylibcudf/pylibcudf/unary.pyi
@@ -0,0 +1,38 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+from enum import IntEnum
+
+from pylibcudf.column import Column
+from pylibcudf.types import DataType
+
+class UnaryOperator(IntEnum):
+    SIN = ...
+    COS = ...
+    TAN = ...
+    ARCSIN = ...
+    ARCCOS = ...
+    ARCTAN = ...
+    SINH = ...
+    COSH = ...
+    TANH = ...
+    ARCSINH = ...
+    ARCCOSH = ...
+    ARCTANH = ...
+    EXP = ...
+    LOG = ...
+    SQRT = ...
+    CBRT = ...
+    CEIL = ...
+    FLOOR = ...
+    ABS = ...
+    RINT = ...
+    BIT_INVERT = ...
+    NOT = ...
+
+def unary_operation(input: Column, op: UnaryOperator) -> Column: ...
+def is_null(input: Column) -> Column: ...
+def is_valid(input: Column) -> Column: ...
+def cast(input: Column, data_type: DataType) -> Column: ...
+def is_nan(input: Column) -> Column: ...
+def is_not_nan(input: Column) -> Column: ...
+def is_supported_cast(from_: DataType, to: DataType) -> bool: ...
diff --git a/python/pylibcudf/pylibcudf/unary.pyx b/python/pylibcudf/pylibcudf/unary.pyx
index 53e8c382b5e..b738ab53d1b 100644
--- a/python/pylibcudf/pylibcudf/unary.pyx
+++ b/python/pylibcudf/pylibcudf/unary.pyx
@@ -13,6 +13,16 @@ from pylibcudf.libcudf.unary import \
 from .column cimport Column
 from .types cimport DataType
 
+__all__ = [
+    "UnaryOperator",
+    "cast",
+    "is_nan",
+    "is_not_nan",
+    "is_null",
+    "is_supported_cast",
+    "is_valid",
+    "unary_operation",
+]
 
 cpdef Column unary_operation(Column input, unary_operator op):
     """Perform a unary operation on a column.
diff --git a/python/pylibcudf/pyproject.toml b/python/pylibcudf/pyproject.toml
index ac3018b9333..83ed95823da 100644
--- a/python/pylibcudf/pyproject.toml
+++ b/python/pylibcudf/pyproject.toml
@@ -56,13 +56,30 @@ Documentation = "https://docs.rapids.ai/api/cudf/stable/"
 [tool.ruff]
 extend = "../../pyproject.toml"
 
+[tool.ruff.lint]
+extend-select = [
+  "TCH", # flake8-type-checking
+  "TID", # flake8-tidy-imports
+  "PT", # flake8-pytest-style
+]
+extend-ignore = [
+  "PT011", # pytest.raises(...) is too broad
+]
+
+[tool.ruff.lint.flake8-pytest-style]
+# https://docs.astral.sh/ruff/settings/#lintflake8-pytest-style
+fixture-parentheses = false
+mark-parentheses = false
+parametrize-names-type = "csv"
+parametrize-values-type = "list"
+parametrize-values-row-type = "tuple"
+
 [tool.ruff.lint.isort]
 combine-as-imports = true
-known-first-party = ["cudf"]
-section-order = ["future", "standard-library", "third-party", "dask", "rapids", "first-party", "local-folder"]
+known-first-party = ["pylibcudf"]
+section-order = ["future", "standard-library", "third-party", "rapids", "first-party", "local-folder"]
 
 [tool.ruff.lint.isort.sections]
-dask = ["dask", "distributed", "dask_cuda"]
 rapids = ["rmm"]
 
 [tool.ruff.lint.per-file-ignores]

From 796de4bd5131c38428b609c543323193f298624e Mon Sep 17 00:00:00 2001
From: David Wendt <45795991+davidwendt@users.noreply.github.com>
Date: Tue, 12 Nov 2024 11:59:04 -0500
Subject: [PATCH 16/19] Add cudf::strings::contains_multiple (#16900)

Add new `cudf::strings::contains_multiple` API to search multiple targets within a strings column.
The output is a table with one column per target, where each row is a boolean indicating whether that target was found in the corresponding input row.
This PR is to help in collaboration with #16641

Authors:
  - David Wendt (https://github.com/davidwendt)
  - GALI PREM SAGAR (https://github.com/galipremsagar)
  - Chong Gao (https://github.com/res-life)
  - Bradley Dice (https://github.com/bdice)

Approvers:
  - Chong Gao (https://github.com/res-life)
  - Yunsong Wang (https://github.com/PointKernel)
  - MithunR (https://github.com/mythrocks)
  - Tianyu Liu (https://github.com/kingcrimsontianyu)
  - Bradley Dice (https://github.com/bdice)

URL: https://github.com/rapidsai/cudf/pull/16900
---
 cpp/CMakeLists.txt                          |   1 +
 cpp/benchmarks/CMakeLists.txt               |   1 +
 cpp/benchmarks/string/find.cpp              |  14 +-
 cpp/benchmarks/string/find_multiple.cpp     |  77 +++++
 cpp/include/cudf/strings/find_multiple.hpp  |  40 ++-
 cpp/src/strings/search/contains_multiple.cu | 316 ++++++++++++++++++++
 cpp/src/strings/search/find_multiple.cu     |   5 +-
 cpp/tests/strings/find_multiple_tests.cpp   | 155 +++++++++-
 cpp/tests/strings/find_tests.cpp            |   4 +-
 9 files changed, 592 insertions(+), 21 deletions(-)
 create mode 100644 cpp/benchmarks/string/find_multiple.cpp
 create mode 100644 cpp/src/strings/search/contains_multiple.cu

diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
index 65b05fd518b..e237b0b2856 100644
--- a/cpp/CMakeLists.txt
+++ b/cpp/CMakeLists.txt
@@ -705,6 +705,7 @@ add_library(
   src/strings/replace/replace_slice.cu
   src/strings/reverse.cu
   src/strings/scan/scan_inclusive.cu
+  src/strings/search/contains_multiple.cu
   src/strings/search/findall.cu
   src/strings/search/find.cu
   src/strings/search/find_multiple.cu
diff --git a/cpp/benchmarks/CMakeLists.txt b/cpp/benchmarks/CMakeLists.txt
index 59f5602fd5a..419b78db9b0 100644
--- a/cpp/benchmarks/CMakeLists.txt
+++ b/cpp/benchmarks/CMakeLists.txt
@@ -375,6 +375,7 @@ ConfigureNVBench(
   string/count.cpp
   string/extract.cpp
   string/find.cpp
+  string/find_multiple.cpp
   string/join_strings.cpp
   string/lengths.cpp
   string/like.cpp
diff --git a/cpp/benchmarks/string/find.cpp b/cpp/benchmarks/string/find.cpp
index 996bdcf0332..3ea3ff13a2f 100644
--- a/cpp/benchmarks/string/find.cpp
+++ b/cpp/benchmarks/string/find.cpp
@@ -20,9 +20,7 @@
 #include <cudf_test/column_wrapper.hpp>
 
 #include <cudf/scalar/scalar.hpp>
-#include <cudf/strings/combine.hpp>
 #include <cudf/strings/find.hpp>
-#include <cudf/strings/find_multiple.hpp>
 #include <cudf/strings/strings_column_view.hpp>
 #include <cudf/utilities/default_stream.hpp>
 
@@ -44,15 +42,13 @@ static void bench_find_string(nvbench::state& state)
   auto const col    = create_string_column(n_rows, row_width, hit_rate);
   auto const input  = cudf::strings_column_view(col->view());
 
-  std::vector<std::string> h_targets({"5W", "5W43", "0987 5W43"});
-  cudf::string_scalar target(h_targets[2]);
-  cudf::test::strings_column_wrapper targets(h_targets.begin(), h_targets.end());
+  cudf::string_scalar target("0987 5W43");
 
   state.set_cuda_stream(nvbench::make_cuda_stream_view(stream.value()));
   auto const chars_size = input.chars_size(stream);
   state.add_element_count(chars_size, "chars_size");
   state.add_global_memory_reads<nvbench::int8_t>(chars_size);
-  if (api.substr(0, 4) == "find") {
+  if (api == "find") {
     state.add_global_memory_writes<nvbench::int32_t>(input.size());
   } else {
     state.add_global_memory_writes<nvbench::int8_t>(input.size());
@@ -61,10 +57,6 @@ static void bench_find_string(nvbench::state& state)
   if (api == "find") {
     state.exec(nvbench::exec_tag::sync,
                [&](nvbench::launch& launch) { cudf::strings::find(input, target); });
-  } else if (api == "find_multi") {
-    state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) {
-      cudf::strings::find_multiple(input, cudf::strings_column_view(targets));
-    });
   } else if (api == "contains") {
     state.exec(nvbench::exec_tag::sync,
                [&](nvbench::launch& launch) { cudf::strings::contains(input, target); });
@@ -79,7 +71,7 @@ static void bench_find_string(nvbench::state& state)
 
 NVBENCH_BENCH(bench_find_string)
   .set_name("find_string")
-  .add_string_axis("api", {"find", "find_multi", "contains", "starts_with", "ends_with"})
+  .add_string_axis("api", {"find", "contains", "starts_with", "ends_with"})
   .add_int64_axis("row_width", {32, 64, 128, 256, 512, 1024})
   .add_int64_axis("num_rows", {260'000, 1'953'000, 16'777'216})
   .add_int64_axis("hit_rate", {20, 80});  // percentage
diff --git a/cpp/benchmarks/string/find_multiple.cpp b/cpp/benchmarks/string/find_multiple.cpp
new file mode 100644
index 00000000000..0e780fdb302
--- /dev/null
+++ b/cpp/benchmarks/string/find_multiple.cpp
@@ -0,0 +1,77 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <benchmarks/common/generate_input.hpp>
+#include <benchmarks/fixture/benchmark_fixture.hpp>
+
+#include <cudf_test/column_wrapper.hpp>
+
+#include <cudf/strings/find.hpp>
+#include <cudf/strings/find_multiple.hpp>
+#include <cudf/strings/strings_column_view.hpp>
+#include <cudf/utilities/default_stream.hpp>
+
+#include <nvbench/nvbench.cuh>
+
+static void bench_find_string(nvbench::state& state)
+{
+  auto const n_rows       = static_cast<cudf::size_type>(state.get_int64("num_rows"));
+  auto const row_width    = static_cast<cudf::size_type>(state.get_int64("row_width"));
+  auto const hit_rate     = static_cast<cudf::size_type>(state.get_int64("hit_rate"));
+  auto const target_count = static_cast<cudf::size_type>(state.get_int64("targets"));
+  auto const api          = state.get_string("api");
+
+  auto const stream = cudf::get_default_stream();
+  auto const col    = create_string_column(n_rows, row_width, hit_rate);
+  auto const input  = cudf::strings_column_view(col->view());
+
+  // Note that these all match the first row of the raw_data in create_string_column.
+  // This is so the hit_rate can be properly accounted for.
+  std::vector<std::string> const target_data(
+    {" abc", "W43", "0987 5W43", "123 abc", "23 abc", "3 abc", "7 5W43", "87 5W43", "987 5W43"});
+  auto h_targets = std::vector<std::string>{};
+  for (cudf::size_type i = 0; i < target_count; i++) {
+    h_targets.emplace_back(target_data[i % target_data.size()]);
+  }
+  cudf::test::strings_column_wrapper targets(h_targets.begin(), h_targets.end());
+
+  state.set_cuda_stream(nvbench::make_cuda_stream_view(stream.value()));
+  auto const chars_size = input.chars_size(stream);
+  state.add_global_memory_reads<nvbench::int8_t>(chars_size);
+  if (api == "find") {
+    state.add_global_memory_writes<nvbench::int32_t>(input.size());
+  } else {
+    state.add_global_memory_writes<nvbench::int8_t>(input.size());
+  }
+
+  if (api == "find") {
+    state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) {
+      cudf::strings::find_multiple(input, cudf::strings_column_view(targets));
+    });
+  } else if (api == "contains") {
+    state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) {
+      cudf::strings::contains_multiple(input, cudf::strings_column_view(targets));
+    });
+  }
+}
+
+NVBENCH_BENCH(bench_find_string)
+  .set_name("find_multiple")
+  .add_string_axis("api", {"find", "contains"})
+  .add_int64_axis("targets", {10, 20, 40})
+  .add_int64_axis("row_width", {32, 64, 128, 256})
+  .add_int64_axis("num_rows", {32768, 262144, 2097152})
+  .add_int64_axis("hit_rate", {20, 80});  // percentage
diff --git a/cpp/include/cudf/strings/find_multiple.hpp b/cpp/include/cudf/strings/find_multiple.hpp
index 1fe446db8da..e090766dd07 100644
--- a/cpp/include/cudf/strings/find_multiple.hpp
+++ b/cpp/include/cudf/strings/find_multiple.hpp
@@ -28,8 +28,42 @@ namespace strings {
  */
 
 /**
- * @brief Returns a lists column with character position values where each
- * of the target strings are found in each string.
+ * @brief Searches for the given target strings within each string in the provided column
+ *
+ * Each column in the result table corresponds to the result for the target string at the same
+ * ordinal. i.e. 0th column is the BOOL8 column result for the 0th target string, 1st for 1st,
+ * etc.
+ *
+ * If the target is not found for a string, false is returned for that entry in the output column.
+ * If the target is an empty string, true is returned for all non-null entries in the output column.
+ *
+ * Any null input strings return corresponding null entries in the output columns.
+ *
+ * @code{.pseudo}
+ * input = ["a", "b", "c"]
+ * targets = ["a", "c"]
+ * output is a table with two boolean columns:
+ *   column 0: [true, false, false]
+ *   column 1: [false, false, true]
+ * @endcode
+ *
+ * @throw std::invalid_argument if `targets` is empty or contains nulls
+ *
+ * @param input Strings instance for this operation
+ * @param targets UTF-8 encoded strings to search for in each string in `input`
+ * @param stream CUDA stream used for device memory operations and kernel launches
+ * @param mr Device memory resource used to allocate the returned table's device memory
+ * @return Table of BOOL8 columns
+ */
+std::unique_ptr<table> contains_multiple(
+  strings_column_view const& input,
+  strings_column_view const& targets,
+  rmm::cuda_stream_view stream        = cudf::get_default_stream(),
+  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+
+/**
+ * @brief Searches for the given target strings within each string in the provided column
+ * and returns the positions where the targets were found
  *
  * The size of the output column is `input.size()`.
  * Each row of the output column is of size `targets.size()`.
@@ -45,7 +79,7 @@ namespace strings {
  *           [-1,-1, 1 ]}  // for "def": "a" and "b" not found, "e" at  pos 1
  * @endcode
  *
- * @throw cudf::logic_error if `targets` is empty or contains nulls
+ * @throw std::invalid_argument if `targets` is empty or contains nulls
  *
  * @param input Strings instance for this operation
  * @param targets Strings to search for in each string
diff --git a/cpp/src/strings/search/contains_multiple.cu b/cpp/src/strings/search/contains_multiple.cu
new file mode 100644
index 00000000000..1183e3e4038
--- /dev/null
+++ b/cpp/src/strings/search/contains_multiple.cu
@@ -0,0 +1,316 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include <cudf/column/column_device_view.cuh>
+#include <cudf/column/column_factories.hpp>
+#include <cudf/detail/iterator.cuh>
+#include <cudf/detail/null_mask.hpp>
+#include <cudf/detail/nvtx/ranges.hpp>
+#include <cudf/detail/utilities/cuda.cuh>
+#include <cudf/detail/utilities/vector_factories.hpp>
+#include <cudf/strings/detail/utilities.hpp>
+#include <cudf/strings/find_multiple.hpp>
+#include <cudf/strings/string_view.cuh>
+#include <cudf/strings/strings_column_view.hpp>
+#include <cudf/utilities/default_stream.hpp>
+#include <cudf/utilities/error.hpp>
+#include <cudf/utilities/memory_resource.hpp>
+
+#include <rmm/cuda_stream_view.hpp>
+#include <rmm/exec_policy.hpp>
+
+#include <cooperative_groups.h>
+#include <cub/cub.cuh>
+#include <cuda/functional>
+#include <thrust/binary_search.h>
+#include <thrust/equal.h>
+#include <thrust/fill.h>
+#include <thrust/iterator/counting_iterator.h>
+#include <thrust/iterator/transform_iterator.h>
+#include <thrust/logical.h>
+#include <thrust/sequence.h>
+#include <thrust/unique.h>
+
+#include <vector>
+
+namespace cudf {
+namespace strings {
+namespace detail {
+namespace {
+
+/**
+ * @brief Threshold to decide on using string or warp parallel functions.
+ *
+ * If the average byte length of a string in a column exceeds this value then
+ * a warp-parallel function is used.
+ */
+constexpr size_type AVG_CHAR_BYTES_THRESHOLD = 64;
+
+/**
+ * @brief Kernel for finding multiple targets in each row of input strings
+ *
+ * The d_first_bytes is sorted and unique so the d_indices and d_offsets
+ * are used to map the corresponding character to its d_targets entry.
+ *
+ * Example
+ * d_targets = ["foo", "hello", "world", "hi"]
+ *  - sorted first-chars: ['f','h','h','w']
+ * d_indices = [0, 3, 1, 2]
+ * d_first_bytes = ['f', 'h', 'w']   (unique)
+ * d_offsets = [0, 1, 3]
+ * unique_count = 3
+ *
+ * If 'h' is found, lower_bound produces pos=1 in d_first_bytes.
+ * This corresponds to d_offsets[1]==1 which has two values:
+ * - (d_offsets[2] - d_offsets[1]) = (3 - 1) = 2.
+ * Set map_idx = d_offsets[1] = 1 and the two targets to check are sequential
+ * in the d_indices array:
+ * - tgt1_idx = d_indices[map_idx]   = 3 --> d_targets[3] == 'hi'
+ * - tgt2_idx = d_indices[map_idx+1] = 1 --> d_targets[1] == 'hello'
+ * The logic now only needs to check for either of these 2 targets.
+ *
+ * This kernel works in either thread-per-string or warp-per-string depending
+ * on the template parameter. If tile_size==1, then this kernel executes as
+ * thread-per-string. If tile_size==32, then it executes as warp-per-string.
+ * No other options are supported for now.
+ *
+ * @tparam tile_size Number of threads per string
+ * @param d_strings Input strings
+ * @param d_targets Target strings to search within input strings
+ * @param d_first_bytes Sorted, unique list of first bytes of the target strings
+ * @param d_indices Indices to map sorted d_first_bytes to d_targets
+ * @param d_offsets Offsets to map d_indices to d_targets
+ * @param unique_count Number of unique values in d_first_bytes (and d_offsets)
+ * @param working_memory Global memory to use if shared-memory is too small
+ * @param d_results Bool results for each target within each string row
+ */
+template <cudf::thread_index_type tile_size>
+CUDF_KERNEL void multi_contains_kernel(column_device_view const d_strings,
+                                       column_device_view const d_targets,
+                                       u_char const* d_first_bytes,
+                                       size_type const* d_indices,
+                                       size_type const* d_offsets,
+                                       size_type unique_count,
+                                       bool* working_memory,
+                                       cudf::device_span<bool*> d_results)
+{
+  auto const idx     = cudf::detail::grid_1d::global_thread_id();
+  auto const str_idx = idx / tile_size;
+  if (str_idx >= d_strings.size()) { return; }
+  if (d_strings.is_null(str_idx)) { return; }
+
+  // get the string for this tile
+  auto const d_str = d_strings.element<string_view>(str_idx);
+
+  namespace cg           = cooperative_groups;
+  auto const tile        = cg::tiled_partition<tile_size>(cg::this_thread_block());
+  auto const lane_idx    = tile.thread_rank();
+  auto const num_targets = d_targets.size();
+
+  // size of shared_bools = num_targets * block_size
+  // each thread uses num_targets bools
+  extern __shared__ bool shared_bools[];
+  // bools for the current string
+  auto bools = working_memory == nullptr
+                 ? (shared_bools + (tile.meta_group_rank() * tile_size * num_targets))
+                 : (working_memory + (str_idx * tile_size * num_targets));
+
+  // initialize result: set true if target is empty, false otherwise
+  for (auto target_idx = lane_idx; target_idx < num_targets; target_idx += tile_size) {
+    auto const d_target = d_targets.element<string_view>(target_idx);
+    if constexpr (tile_size == 1) {
+      d_results[target_idx][str_idx] = d_target.empty();
+    } else {
+      auto const begin = bools + (target_idx * tile_size);
+      thrust::uninitialized_fill(thrust::seq, begin, begin + tile_size, d_target.empty());
+    }
+  }
+  tile.sync();
+
+  auto const last_ptr = d_first_bytes + unique_count;
+  for (size_type str_byte_idx = lane_idx; str_byte_idx < d_str.size_bytes();
+       str_byte_idx += tile_size) {
+    // search for byte in first_bytes array
+    auto const sptr     = d_str.data() + str_byte_idx;
+    auto const chr      = static_cast<u_char>(*sptr);
+    auto const byte_ptr = thrust::lower_bound(thrust::seq, d_first_bytes, last_ptr, chr);
+    // if not found, continue to next byte
+    if ((byte_ptr == last_ptr) || (*byte_ptr != chr)) { continue; }
+    // compute index of matched byte
+    auto const offset_idx = static_cast<size_type>(thrust::distance(d_first_bytes, byte_ptr));
+    auto map_idx          = d_offsets[offset_idx];
+    auto const last_idx = (offset_idx + 1) < unique_count ? d_offsets[offset_idx + 1] : num_targets;
+    // check for targets that begin with chr
+    while (map_idx < last_idx) {
+      auto const target_idx = d_indices[map_idx++];
+      auto const bool_idx   = (target_idx * tile_size) + lane_idx;
+      auto const found      = tile_size == 1 ? d_results[target_idx][str_idx] : bools[bool_idx];
+      if (!found) {  // not found before
+        auto const d_target = d_targets.element<string_view>(target_idx);
+        if ((d_str.size_bytes() - str_byte_idx) >= d_target.size_bytes()) {
+          // first char already checked, so just check the [1, end) chars match
+          auto const tp = d_target.data();
+          if (thrust::equal(thrust::seq, tp + 1, tp + d_target.size_bytes(), sptr + 1)) {
+            if constexpr (tile_size == 1) {
+              d_results[target_idx][str_idx] = true;
+            } else {
+              bools[bool_idx] = true;
+            }
+          }
+        }
+      }
+    }
+  }
+
+  if constexpr (tile_size > 1) {
+    tile.sync();
+    // reduce the bools for each target to store in the result
+    for (auto target_idx = lane_idx; target_idx < num_targets; target_idx += tile_size) {
+      auto const begin = bools + (target_idx * tile_size);
+      d_results[target_idx][str_idx] =
+        thrust::any_of(thrust::seq, begin, begin + tile_size, thrust::identity<bool>{});
+      // cooperative_group any() implementation was almost 3x slower than this parallel reduce
+    }
+  }
+}
+}  // namespace
+
+std::unique_ptr<table> contains_multiple(strings_column_view const& input,
+                                         strings_column_view const& targets,
+                                         rmm::cuda_stream_view stream,
+                                         rmm::mr::device_memory_resource* mr)
+{
+  CUDF_EXPECTS(
+    not targets.is_empty(), "Must specify at least one target string.", std::invalid_argument);
+  CUDF_EXPECTS(not targets.has_nulls(), "Target strings cannot be null", std::invalid_argument);
+
+  auto const d_strings = column_device_view::create(input.parent(), stream);
+  auto const d_targets = column_device_view::create(targets.parent(), stream);
+
+  // copy the first byte of each target (0 if empty) and sort them, tracking target indices
+  auto first_bytes = rmm::device_uvector<u_char>(targets.size(), stream);
+  auto indices     = rmm::device_uvector<size_type>(targets.size(), stream);
+  {
+    auto tgt_itr = thrust::make_transform_iterator(
+      d_targets->begin<string_view>(),
+      cuda::proclaim_return_type<u_char>([] __device__(auto const& d_tgt) -> u_char {
+        return d_tgt.empty() ? u_char{0} : static_cast<u_char>(d_tgt.data()[0]);
+      }));
+    auto count_itr = thrust::make_counting_iterator<size_type>(0);
+    auto keys_out  = first_bytes.begin();
+    auto vals_out  = indices.begin();
+    auto num_items = targets.size();
+    auto cmp_op    = thrust::less();
+    auto sv        = stream.value();
+    // the first call (nullptr storage) only fills tmp_bytes with the required temp size
+    std::size_t tmp_bytes = 0;
+    cub::DeviceMergeSort::SortPairsCopy(
+      nullptr, tmp_bytes, tgt_itr, count_itr, keys_out, vals_out, num_items, cmp_op, sv);
+    auto tmp_stg = rmm::device_buffer(tmp_bytes, stream);
+    cub::DeviceMergeSort::SortPairsCopy(
+      tmp_stg.data(), tmp_bytes, tgt_itr, count_itr, keys_out, vals_out, num_items, cmp_op, sv);
+  }
+
+  // remove duplicate first bytes to help speed up lower_bound; offsets marks each run's start
+  auto offsets = rmm::device_uvector<size_type>(targets.size(), stream);
+  thrust::sequence(rmm::exec_policy_nosync(stream), offsets.begin(), offsets.end());
+  auto const end = thrust::unique_by_key(
+    rmm::exec_policy_nosync(stream), first_bytes.begin(), first_bytes.end(), offsets.begin());
+  auto const unique_count =
+    static_cast<size_type>(thrust::distance(first_bytes.begin(), end.first));
+
+  // create one BOOL8 output column per target; each copies the input's null mask and null count
+  auto const results_iter = cudf::detail::make_counting_transform_iterator(0, [&](int i) {
+    return make_numeric_column(data_type{type_id::BOOL8},
+                               input.size(),
+                               cudf::detail::copy_bitmask(input.parent(), stream, mr),
+                               input.null_count(),
+                               stream,
+                               mr);
+  });
+  auto results = std::vector<std::unique_ptr<column>>(results_iter, results_iter + targets.size());
+  auto d_results = [&] {  // device array of each output column's bool data pointer
+    auto host_results_pointer_iter =
+      thrust::make_transform_iterator(results.begin(), [](auto const& results_column) {
+        return results_column->mutable_view().template data<bool>();
+      });
+    auto host_results_pointers =
+      std::vector<bool*>(host_results_pointer_iter, host_results_pointer_iter + results.size());
+    return cudf::detail::make_device_uvector_async(host_results_pointers, stream, mr);
+  }();
+
+  constexpr cudf::thread_index_type block_size = 256;
+  // calculated (benchmarked) for efficient use of shared-memory
+  constexpr size_type targets_threshold = 32;
+
+  auto d_first_bytes = first_bytes.data();
+  auto d_indices     = indices.data();
+  auto d_offsets     = offsets.data();
+
+  bool const row_parallel = ((input.null_count() == input.size()) ||
+                             ((input.chars_size(stream) / (input.size() - input.null_count())) <=
+                              AVG_CHAR_BYTES_THRESHOLD));  // vs. average bytes per non-null row
+
+  if (row_parallel) {
+    // Smaller strings perform better with one thread per string (tile_size == 1)
+    cudf::detail::grid_1d grid{static_cast<cudf::thread_index_type>(input.size()), block_size};
+    multi_contains_kernel<1>
+      <<<grid.num_blocks, grid.num_threads_per_block, 0, stream.value()>>>(*d_strings,
+                                                                           *d_targets,
+                                                                           d_first_bytes,
+                                                                           d_indices,
+                                                                           d_offsets,
+                                                                           unique_count,
+                                                                           nullptr,
+                                                                           d_results);
+  } else {
+    constexpr cudf::thread_index_type tile_size = cudf::detail::warp_size;
+    // few targets: per-block dynamic shared memory; otherwise a global scratch buffer is used
+    auto const shared_mem_size =
+      (targets.size() <= targets_threshold) ? (block_size * targets.size()) : 0;
+    auto const work_mem_size =
+      (targets.size() <= targets_threshold) ? 0 : tile_size * targets.size() * input.size();
+    auto working_memory = rmm::device_uvector<bool>(work_mem_size, stream);
+
+    cudf::detail::grid_1d grid{static_cast<cudf::thread_index_type>(input.size()) * tile_size,
+                               block_size};
+    multi_contains_kernel<tile_size>
+      <<<grid.num_blocks, grid.num_threads_per_block, shared_mem_size, stream.value()>>>(
+        *d_strings,
+        *d_targets,
+        d_first_bytes,
+        d_indices,
+        d_offsets,
+        unique_count,
+        working_memory.data(),
+        d_results);
+  }
+
+  return std::make_unique<table>(std::move(results));
+}
+
+}  // namespace detail
+
+std::unique_ptr<table> contains_multiple(strings_column_view const& strings,
+                                         strings_column_view const& targets,
+                                         rmm::cuda_stream_view stream,
+                                         rmm::mr::device_memory_resource* mr)
+{
+  CUDF_FUNC_RANGE();  // presumably opens a profiling range for this call -- confirm macro
+  return detail::contains_multiple(strings, targets, stream, mr);  // forwards all arguments
+}
+
+}  // namespace strings
+}  // namespace cudf
diff --git a/cpp/src/strings/search/find_multiple.cu b/cpp/src/strings/search/find_multiple.cu
index ec7015878dd..67226b259d4 100644
--- a/cpp/src/strings/search/find_multiple.cu
+++ b/cpp/src/strings/search/find_multiple.cu
@@ -42,8 +42,9 @@ std::unique_ptr<column> find_multiple(strings_column_view const& input,
 {
   auto const strings_count = input.size();
   auto const targets_count = targets.size();
-  CUDF_EXPECTS(targets_count > 0, "Must include at least one search target");
-  CUDF_EXPECTS(!targets.has_nulls(), "Search targets cannot contain null strings");
+  CUDF_EXPECTS(targets_count > 0, "Must include at least one search target", std::invalid_argument);
+  CUDF_EXPECTS(
+    !targets.has_nulls(), "Search targets cannot contain null strings", std::invalid_argument);
 
   auto strings_column = column_device_view::create(input.parent(), stream);
   auto d_strings      = *strings_column;
diff --git a/cpp/tests/strings/find_multiple_tests.cpp b/cpp/tests/strings/find_multiple_tests.cpp
index 41a5940c880..3c8483b153d 100644
--- a/cpp/tests/strings/find_multiple_tests.cpp
+++ b/cpp/tests/strings/find_multiple_tests.cpp
@@ -17,6 +17,7 @@
 #include <cudf_test/base_fixture.hpp>
 #include <cudf_test/column_utilities.hpp>
 #include <cudf_test/column_wrapper.hpp>
+#include <cudf_test/table_utilities.hpp>
 
 #include <cudf/column/column.hpp>
 #include <cudf/column/column_factories.hpp>
@@ -75,8 +76,158 @@ TEST_F(StringsFindMultipleTest, ErrorTest)
   auto const zero_size_strings_column = cudf::make_empty_column(cudf::type_id::STRING)->view();
   auto empty_view                     = cudf::strings_column_view(zero_size_strings_column);
   // targets must have at least one string
-  EXPECT_THROW(cudf::strings::find_multiple(strings_view, empty_view), cudf::logic_error);
+  EXPECT_THROW(cudf::strings::find_multiple(strings_view, empty_view), std::invalid_argument);
+  EXPECT_THROW(cudf::strings::contains_multiple(strings_view, empty_view), std::invalid_argument);
 
   // targets cannot have nulls
-  EXPECT_THROW(cudf::strings::find_multiple(strings_view, strings_view), cudf::logic_error);
+  EXPECT_THROW(cudf::strings::find_multiple(strings_view, strings_view), std::invalid_argument);
+  EXPECT_THROW(cudf::strings::contains_multiple(strings_view, strings_view), std::invalid_argument);
+}
+
+TEST_F(StringsFindMultipleTest, MultiContains)
+{
+  constexpr int num_rows = 1024 + 1;
+  // replicate the following 9 rows:
+  std::vector<std::string> s = {
+    "Héllo, there world and goodbye",
+    "quick brown fox jumped over the lazy brown dog; the fat cats jump in place without moving",
+    "the following code snippet demonstrates how to use search for values in an ordered range",
+    "it returns the last position where value could be inserted without violating the ordering",
+    "algorithms execution is parallelized as determined by an execution policy. t",
+    "he this is a continuation of previous row to make sure string boundaries are honored",
+    "abcdefghijklmnopqrstuvwxyz 0123456789 ABCDEFGHIJKLMNOPQRSTUVWXYZ !@#$%^&*()~",
+    "",
+    ""};
+
+  // replicate strings
+  auto string_itr =
+    cudf::detail::make_counting_transform_iterator(0, [&](auto i) { return s[i % s.size()]; });
+
+  // nulls: 8, 8 + 1 * 9, 8 + 2 * 9 ......
+  auto string_v = cudf::detail::make_counting_transform_iterator(
+    0, [&](auto i) { return (i + 1) % s.size() != 0; });
+
+  auto const strings =
+    cudf::test::strings_column_wrapper(string_itr, string_itr + num_rows, string_v);
+  auto strings_view = cudf::strings_column_view(strings);
+  std::vector<std::string> match_targets({" the ", "a", "", "é"});
+  cudf::test::strings_column_wrapper multi_targets_column(match_targets.begin(),
+                                                          match_targets.end());
+  auto results =
+    cudf::strings::contains_multiple(strings_view, cudf::strings_column_view(multi_targets_column));
+
+  std::vector<bool> ret_0 = {0, 1, 0, 1, 0, 0, 0, 0, 0};  // target " the "
+  std::vector<bool> ret_1 = {1, 1, 1, 1, 1, 1, 1, 0, 0};  // target "a"
+  std::vector<bool> ret_2 = {1, 1, 1, 1, 1, 1, 1, 1, 0};  // target "" (last row is null)
+  std::vector<bool> ret_3 = {1, 0, 0, 0, 0, 0, 0, 0, 0};  // target "é"
+
+  auto make_bool_col_fn = [&string_v, &num_rows](std::vector<bool> bools) {
+    auto iter = cudf::detail::make_counting_transform_iterator(
+      0, [&](auto i) { return bools[i % bools.size()]; });
+    return cudf::test::fixed_width_column_wrapper<bool>(iter, iter + num_rows, string_v);
+  };
+
+  auto expected_0 = make_bool_col_fn(ret_0);
+  auto expected_1 = make_bool_col_fn(ret_1);
+  auto expected_2 = make_bool_col_fn(ret_2);
+  auto expected_3 = make_bool_col_fn(ret_3);
+
+  auto expected = cudf::table_view({expected_0, expected_1, expected_2, expected_3});
+  CUDF_TEST_EXPECT_TABLES_EQUIVALENT(results->view(), expected);
+}
+
+TEST_F(StringsFindMultipleTest, MultiContainsMoreTargets)
+{
+  auto const strings = cudf::test::strings_column_wrapper{
+    "quick brown fox jumped over the lazy brown dog; the fat cats jump in place without moving "
+    "quick brown fox jumped",
+    "the following code snippet demonstrates how to use search for values in an ordered rangethe "
+    "following code snippet",
+    "thé it returns the last position where value could be inserted without violating ordering thé "
+    "it returns the last position"};
+  auto strings_view = cudf::strings_column_view(strings);
+  std::vector<std::string> targets({"lazy brown", "non-exist", ""});
+
+  std::vector<cudf::test::fixed_width_column_wrapper<bool>> expects;
+  expects.push_back(cudf::test::fixed_width_column_wrapper<bool>({1, 0, 0}));  // "lazy brown"
+  expects.push_back(cudf::test::fixed_width_column_wrapper<bool>({0, 0, 0}));  // "non-exist"
+  expects.push_back(cudf::test::fixed_width_column_wrapper<bool>({1, 1, 1}));  // ""
+
+  std::vector<std::string> match_targets;
+  int max_num_targets = 50;
+  // try every target count in [1, max_num_targets), cycling through the three targets above
+  for (int num_targets = 1; num_targets < max_num_targets; num_targets++) {
+    match_targets.clear();
+    for (int i = 0; i < num_targets; i++) {
+      match_targets.push_back(targets[i % targets.size()]);
+    }
+
+    cudf::test::strings_column_wrapper multi_targets_column(match_targets.begin(),
+                                                            match_targets.end());
+    auto results = cudf::strings::contains_multiple(
+      strings_view, cudf::strings_column_view(multi_targets_column));
+    EXPECT_EQ(results->num_columns(), num_targets);
+    for (int i = 0; i < num_targets; i++) {
+      CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(results->get_column(i), expects[i % expects.size()]);
+    }
+  }
+}
+
+TEST_F(StringsFindMultipleTest, MultiContainsLongStrings)
+{
+  constexpr int num_rows = 1024 + 1;
+  // replicate the following 7 rows:
+  std::vector<std::string> s = {
+    "quick brown fox jumped over the lazy brown dog; the fat cats jump in place without moving "
+    "quick brown fox jumped",
+    "the following code snippet demonstrates how to use search for values in an ordered rangethe "
+    "following code snippet",
+    "thé it returns the last position where value could be inserted without violating ordering thé "
+    "it returns the last position",
+    "algorithms execution is parallelized as determined by an execution policy. t algorithms "
+    "execution is parallelized as ",
+    "he this is a continuation of previous row to make sure string boundaries are honored he this "
+    "is a continuation of previous row",
+    "abcdefghijklmnopqrstuvwxyz 0123456789 ABCDEFGHIJKLMNOPQRSTUVWXYZ "
+    "!@#$%^&*()~abcdefghijklmnopqrstuvwxyz 0123456789 ABCDEFGHIJKL",
+    ""};
+
+  // replicate strings
+  auto string_itr =
+    cudf::detail::make_counting_transform_iterator(0, [&](auto i) { return s[i % s.size()]; });
+
+  // nulls: 6, 6 + 1 * 7, 6 + 2 * 7 ......
+  auto string_v = cudf::detail::make_counting_transform_iterator(
+    0, [&](auto i) { return (i + 1) % s.size() != 0; });
+
+  auto const strings =
+    cudf::test::strings_column_wrapper(string_itr, string_itr + num_rows, string_v);
+
+  auto sv      = cudf::strings_column_view(strings);
+  auto targets = cudf::test::strings_column_wrapper({" the ", "search", "", "string", "ox", "é "});
+  auto results = cudf::strings::contains_multiple(sv, cudf::strings_column_view(targets));
+
+  std::vector<bool> ret_0 = {1, 0, 1, 0, 0, 0, 0};  // target " the "
+  std::vector<bool> ret_1 = {0, 1, 0, 0, 0, 0, 0};  // target "search"
+  std::vector<bool> ret_2 = {1, 1, 1, 1, 1, 1, 0};  // target "" (last row is null)
+  std::vector<bool> ret_3 = {0, 0, 0, 0, 1, 0, 0};  // target "string"
+  std::vector<bool> ret_4 = {1, 0, 0, 0, 0, 0, 0};  // target "ox"
+  std::vector<bool> ret_5 = {0, 0, 1, 0, 0, 0, 0};  // target "é "
+
+  auto make_bool_col_fn = [&string_v, &num_rows](std::vector<bool> bools) {
+    auto iter = cudf::detail::make_counting_transform_iterator(
+      0, [&](auto i) { return bools[i % bools.size()]; });
+    return cudf::test::fixed_width_column_wrapper<bool>(iter, iter + num_rows, string_v);
+  };
+
+  auto expected_0 = make_bool_col_fn(ret_0);
+  auto expected_1 = make_bool_col_fn(ret_1);
+  auto expected_2 = make_bool_col_fn(ret_2);
+  auto expected_3 = make_bool_col_fn(ret_3);
+  auto expected_4 = make_bool_col_fn(ret_4);
+  auto expected_5 = make_bool_col_fn(ret_5);
+
+  auto expected =
+    cudf::table_view({expected_0, expected_1, expected_2, expected_3, expected_4, expected_5});
+  CUDF_TEST_EXPECT_TABLES_EQUIVALENT(results->view(), expected);
 }
diff --git a/cpp/tests/strings/find_tests.cpp b/cpp/tests/strings/find_tests.cpp
index 2da95ba5c27..a3066c40650 100644
--- a/cpp/tests/strings/find_tests.cpp
+++ b/cpp/tests/strings/find_tests.cpp
@@ -17,16 +17,14 @@
 #include <cudf_test/base_fixture.hpp>
 #include <cudf_test/column_utilities.hpp>
 #include <cudf_test/column_wrapper.hpp>
+#include <cudf_test/iterator_utilities.hpp>
 
-#include <cudf/column/column.hpp>
 #include <cudf/column/column_factories.hpp>
 #include <cudf/scalar/scalar.hpp>
 #include <cudf/strings/attributes.hpp>
 #include <cudf/strings/find.hpp>
 #include <cudf/strings/strings_column_view.hpp>
 
-#include <thrust/iterator/transform_iterator.h>
-
 #include <vector>
 
 struct StringsFindTest : public cudf::test::BaseFixture {};

From 1f9ad2f33867789d734c9be9bbacaabe1e348884 Mon Sep 17 00:00:00 2001
From: James Lamb <jlamb@nvidia.com>
Date: Tue, 12 Nov 2024 16:20:29 -0600
Subject: [PATCH 17/19] enforce wheel size limits, README formatting in CI
 (#17284)

Contributes to https://github.com/rapidsai/build-planning/issues/110

Proposes adding 2 types of validation on wheels in CI, to ensure we continue to produce wheels that are suitable for PyPI.

* checks on wheel size (compressed),
  - *to be sure they're under PyPI limits*
  - *and to prompt discussion on PRs that significantly increase wheel sizes*
* checks on README formatting
  - *to ensure they'll render properly as the PyPI project homepages*
  - *e.g. like how https://github.com/scikit-learn/scikit-learn/blob/main/README.rst becomes https://pypi.org/project/scikit-learn/*

## Notes for Reviewers

### How I tested this

Initially set the size threshold for `libcudf` to a value that I knew it'd violate (75MB compressed, when the wheels are 400+ MB compressed).

Saw CI fail as expected, and print a summary with the expected contents.

```text
checking 'final_dist/libcudf_cu11-24.12.0a333-py3-none-manylinux_2_28_aarch64.whl'
----- package inspection summary -----
file size
  * compressed size: 0.4G
  * uncompressed size: 0.6G
  * compression space saving: 34.6%
contents
  * directories: 164
  * files: 1974 (2 compiled)
size by extension
  * .so - 0.6G (97.0%)
  * .h - 6.7M (1.0%)
  * no-extension - 4.8M (0.7%)
  * .cuh - 3.8M (0.6%)
  * .hpp - 2.2M (0.3%)
  * .a - 1.1M (0.2%)
  * .inl - 0.8M (0.1%)
  * .cmake - 0.1M (0.0%)
  * .md - 8.3K (0.0%)
  * .py - 4.0K (0.0%)
  * .pc - 0.2K (0.0%)
  * .txt - 34.0B (0.0%)
largest files
  * (0.6G) libcudf/lib64/libcudf.so
  * (3.3M) libcudf/bin/flatc
  * (1.0M) libcudf/lib64/libflatbuffers.a
  * (0.5M) libcudf/include/libcudf/rapids/libcudacxx/cuda/std/__atomic/functions/cuda_ptx_generated.h
  * (0.2M) libcudf_cu11-24.12.0a333.dist-info/RECORD
------------ check results -----------
1. [distro-too-large-compressed] Compressed size 0.4G is larger than the allowed size (75.0M).
errors found while checking: 1
```

([build link](https://github.com/rapidsai/cudf/actions/runs/11748370606/job/32732391718?pr=17284#step:13:3062))

Updated that threshold in `python/libcudf/pyproject.toml`, and saw the build succeed (but the summary still printed).

#

Authors:
  - James Lamb (https://github.com/jameslamb)

Approvers:
  - Bradley Dice (https://github.com/bdice)

URL: https://github.com/rapidsai/cudf/pull/17284
---
 ci/build_wheel_cudf.sh            |  2 ++
 ci/build_wheel_cudf_polars.sh     |  1 +
 ci/build_wheel_dask_cudf.sh       |  1 +
 ci/build_wheel_libcudf.sh         |  2 ++
 ci/build_wheel_pylibcudf.sh       |  2 ++
 ci/validate_wheel.sh              | 21 +++++++++++++++++++++
 python/cudf/pyproject.toml        |  8 ++++++++
 python/cudf_kafka/pyproject.toml  |  8 ++++++++
 python/cudf_polars/pyproject.toml |  8 ++++++++
 python/custreamz/pyproject.toml   |  8 ++++++++
 python/dask_cudf/pyproject.toml   |  8 ++++++++
 python/libcudf/pyproject.toml     |  8 ++++++++
 python/pylibcudf/pyproject.toml   |  8 ++++++++
 13 files changed, 85 insertions(+)
 create mode 100755 ci/validate_wheel.sh

diff --git a/ci/build_wheel_cudf.sh b/ci/build_wheel_cudf.sh
index ae4eb0d5c66..32dd5a7fa62 100755
--- a/ci/build_wheel_cudf.sh
+++ b/ci/build_wheel_cudf.sh
@@ -27,4 +27,6 @@ python -m auditwheel repair \
     -w ${package_dir}/final_dist \
     ${package_dir}/dist/*
 
+./ci/validate_wheel.sh ${package_dir} final_dist
+
 RAPIDS_PY_WHEEL_NAME="cudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-upload-wheels-to-s3 python ${package_dir}/final_dist
diff --git a/ci/build_wheel_cudf_polars.sh b/ci/build_wheel_cudf_polars.sh
index 79853cdbdb2..38048125247 100755
--- a/ci/build_wheel_cudf_polars.sh
+++ b/ci/build_wheel_cudf_polars.sh
@@ -6,6 +6,7 @@ set -euo pipefail
 package_dir="python/cudf_polars"
 
 ./ci/build_wheel.sh cudf-polars ${package_dir}
+./ci/validate_wheel.sh ${package_dir} dist
 
 RAPIDS_PY_CUDA_SUFFIX="$(rapids-wheel-ctk-name-gen ${RAPIDS_CUDA_VERSION})"
 RAPIDS_PY_WHEEL_NAME="cudf_polars_${RAPIDS_PY_CUDA_SUFFIX}" RAPIDS_PY_WHEEL_PURE="1" rapids-upload-wheels-to-s3 python ${package_dir}/dist
diff --git a/ci/build_wheel_dask_cudf.sh b/ci/build_wheel_dask_cudf.sh
index 00c64afa2ef..b0ae2f23abc 100755
--- a/ci/build_wheel_dask_cudf.sh
+++ b/ci/build_wheel_dask_cudf.sh
@@ -6,6 +6,7 @@ set -euo pipefail
 package_dir="python/dask_cudf"
 
 ./ci/build_wheel.sh dask-cudf ${package_dir}
+./ci/validate_wheel.sh ${package_dir} dist
 
 RAPIDS_PY_CUDA_SUFFIX="$(rapids-wheel-ctk-name-gen ${RAPIDS_CUDA_VERSION})"
 RAPIDS_PY_WHEEL_NAME="dask_cudf_${RAPIDS_PY_CUDA_SUFFIX}" RAPIDS_PY_WHEEL_PURE="1" rapids-upload-wheels-to-s3 python ${package_dir}/dist
diff --git a/ci/build_wheel_libcudf.sh b/ci/build_wheel_libcudf.sh
index aabd3814a24..af49942c8cd 100755
--- a/ci/build_wheel_libcudf.sh
+++ b/ci/build_wheel_libcudf.sh
@@ -37,4 +37,6 @@ python -m auditwheel repair \
     -w ${package_dir}/final_dist \
     ${package_dir}/dist/*
 
+./ci/validate_wheel.sh ${package_dir} final_dist
+
 RAPIDS_PY_WHEEL_NAME="${package_name}_${RAPIDS_PY_CUDA_SUFFIX}" rapids-upload-wheels-to-s3 cpp "${package_dir}/final_dist"
diff --git a/ci/build_wheel_pylibcudf.sh b/ci/build_wheel_pylibcudf.sh
index c4a89f20f5f..5a8f3397714 100755
--- a/ci/build_wheel_pylibcudf.sh
+++ b/ci/build_wheel_pylibcudf.sh
@@ -25,4 +25,6 @@ python -m auditwheel repair \
     -w ${package_dir}/final_dist \
     ${package_dir}/dist/*
 
+./ci/validate_wheel.sh ${package_dir} final_dist
+
 RAPIDS_PY_WHEEL_NAME="pylibcudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-upload-wheels-to-s3 python ${package_dir}/final_dist
diff --git a/ci/validate_wheel.sh b/ci/validate_wheel.sh
new file mode 100755
index 00000000000..5910a5c59fe
--- /dev/null
+++ b/ci/validate_wheel.sh
@@ -0,0 +1,21 @@
+#!/bin/bash
+# Copyright (c) 2024, NVIDIA CORPORATION.
+# Usage: validate_wheel.sh <package_dir> <wheel_dir_relative_to_package_dir>
+set -euo pipefail
+
+package_dir=$1
+wheel_dir_relative_path=$2
+
+cd "${package_dir}"
+
+rapids-logger "validate packages with 'pydistcheck'"
+# Quote only the directory: the glob must expand so each wheel is passed as its own argument.
+pydistcheck \
+    --inspect \
+    "${wheel_dir_relative_path}"/*.whl
+
+rapids-logger "validate packages with 'twine'"
+
+twine check \
+    --strict \
+    "${wheel_dir_relative_path}"/*.whl
diff --git a/python/cudf/pyproject.toml b/python/cudf/pyproject.toml
index ca6dbddfecc..280dd52bb22 100644
--- a/python/cudf/pyproject.toml
+++ b/python/cudf/pyproject.toml
@@ -83,6 +83,14 @@ cudf-pandas-tests = [
 Homepage = "https://github.com/rapidsai/cudf"
 Documentation = "https://docs.rapids.ai/api/cudf/stable/"
 
+[tool.pydistcheck]
+select = [
+    "distro-too-large-compressed",
+]
+
+# PyPI limit is 100 MiB, fail CI before we get too close to that
+max_allowed_size_compressed = '75M'
+
 [tool.pytest.ini_options]
 addopts = "--tb=native --strict-config --strict-markers"
 empty_parameter_set_mark = "fail_at_collect"
diff --git a/python/cudf_kafka/pyproject.toml b/python/cudf_kafka/pyproject.toml
index ec0bc0eb22b..b2ea3f06e48 100644
--- a/python/cudf_kafka/pyproject.toml
+++ b/python/cudf_kafka/pyproject.toml
@@ -47,6 +47,14 @@ rapids = ["rmm", "cudf", "dask_cudf"]
 [tool.ruff.lint.per-file-ignores]
 "__init__.py" = ["E402", "F401"]
 
+[tool.pydistcheck]
+select = [
+    "distro-too-large-compressed",
+]
+
+# PyPI limit is 100 MiB, fail CI before we get too close to that
+max_allowed_size_compressed = '75M'
+
 [tool.pytest.ini_options]
 addopts = "--tb=native --strict-config --strict-markers"
 empty_parameter_set_mark = "fail_at_collect"
diff --git a/python/cudf_polars/pyproject.toml b/python/cudf_polars/pyproject.toml
index 2e75dff5c9e..32ea142a96c 100644
--- a/python/cudf_polars/pyproject.toml
+++ b/python/cudf_polars/pyproject.toml
@@ -49,6 +49,14 @@ license-files = ["LICENSE"]
 [tool.setuptools.dynamic]
 version = {file = "cudf_polars/VERSION"}
 
+[tool.pydistcheck]
+select = [
+    "distro-too-large-compressed",
+]
+
+# PyPI limit is 100 MiB, fail CI before we get too close to that
+max_allowed_size_compressed = '75M'
+
 [tool.pytest.ini_options]
 addopts = "--tb=native --strict-config --strict-markers"
 empty_parameter_set_mark = "fail_at_collect"
diff --git a/python/custreamz/pyproject.toml b/python/custreamz/pyproject.toml
index d3baf3bf4d2..dd67a019c77 100644
--- a/python/custreamz/pyproject.toml
+++ b/python/custreamz/pyproject.toml
@@ -65,6 +65,14 @@ include = [
 ]
 exclude = ["*tests*"]
 
+[tool.pydistcheck]
+select = [
+    "distro-too-large-compressed",
+]
+
+# PyPI limit is 100 MiB, fail CI before we get too close to that
+max_allowed_size_compressed = '75M'
+
 [tool.ruff]
 extend = "../../pyproject.toml"
 
diff --git a/python/dask_cudf/pyproject.toml b/python/dask_cudf/pyproject.toml
index c4bfc3054bc..07d9143db36 100644
--- a/python/dask_cudf/pyproject.toml
+++ b/python/dask_cudf/pyproject.toml
@@ -81,6 +81,14 @@ section-order = ["future", "standard-library", "third-party", "dask", "rapids",
 dask = ["dask", "distributed", "dask_cuda"]
 rapids = ["rmm", "cudf"]
 
+[tool.pydistcheck]
+select = [
+    "distro-too-large-compressed",
+]
+
+# PyPI limit is 100 MiB, fail CI before we get too close to that
+max_allowed_size_compressed = '75M'
+
 [tool.pytest.ini_options]
 addopts = "--tb=native --strict-config --strict-markers"
 empty_parameter_set_mark = "fail_at_collect"
diff --git a/python/libcudf/pyproject.toml b/python/libcudf/pyproject.toml
index 62726bb0df4..8c650eb2144 100644
--- a/python/libcudf/pyproject.toml
+++ b/python/libcudf/pyproject.toml
@@ -48,6 +48,14 @@ Homepage = "https://github.com/rapidsai/cudf"
 [project.entry-points."cmake.prefix"]
 libcudf = "libcudf"
 
+[tool.pydistcheck]
+select = [
+    "distro-too-large-compressed",
+]
+
+# PyPI limit is 600 MiB, fail CI before we get too close to that
+max_allowed_size_compressed = '525M'
+
 [tool.scikit-build]
 build-dir = "build/{wheel_tag}"
 cmake.build-type = "Release"
diff --git a/python/pylibcudf/pyproject.toml b/python/pylibcudf/pyproject.toml
index 83ed95823da..e83db47830c 100644
--- a/python/pylibcudf/pyproject.toml
+++ b/python/pylibcudf/pyproject.toml
@@ -85,6 +85,14 @@ rapids = ["rmm"]
 [tool.ruff.lint.per-file-ignores]
 "__init__.py" = ["E402", "F401"]
 
+[tool.pydistcheck]
+select = [
+    "distro-too-large-compressed",
+]
+
+# PyPI limit is 100 MiB, fail CI before we get too close to that
+max_allowed_size_compressed = '75M'
+
 [tool.pytest.ini_options]
 # --import-mode=importlib because two test_json.py exists and tests directory is not a structured module
 addopts = "--tb=native --strict-config --strict-markers --import-mode=importlib"

From bbaa1ab1eab41d26ca2b280b3b48a73ed3f411b9 Mon Sep 17 00:00:00 2001
From: Lawrence Mitchell <lmitchell@nvidia.com>
Date: Tue, 12 Nov 2024 22:57:21 +0000
Subject: [PATCH 18/19] Support polars 1.13 (#17299)

Polars 1.13 is out, so add support for that.

I needed to change some of the logic in the callback raising after @Matt711's changes; I am not sure why tests were passing previously.

Authors:
  - Lawrence Mitchell (https://github.com/wence-)

Approvers:
  - Matthew Murray (https://github.com/Matt711)
  - GALI PREM SAGAR (https://github.com/galipremsagar)
  - Vyas Ramasubramani (https://github.com/vyasr)

URL: https://github.com/rapidsai/cudf/pull/17299
---
 ci/test_cudf_polars_polars_tests.sh           | 23 +-----
 ci/test_wheel_cudf_polars.sh                  | 23 +-----
 .../all_cuda-118_arch-x86_64.yaml             |  2 +-
 .../all_cuda-125_arch-x86_64.yaml             |  2 +-
 conda/recipes/cudf-polars/meta.yaml           |  2 +-
 dependencies.yaml                             |  2 +-
 python/cudf_polars/cudf_polars/callback.py    | 75 ++++++++-----------
 python/cudf_polars/cudf_polars/dsl/ir.py      |  3 +-
 .../cudf_polars/cudf_polars/dsl/nodebase.py   |  4 +-
 .../cudf_polars/cudf_polars/testing/plugin.py |  2 +-
 python/cudf_polars/pyproject.toml             |  2 +-
 python/cudf_polars/tests/test_config.py       |  2 +-
 12 files changed, 44 insertions(+), 98 deletions(-)

diff --git a/ci/test_cudf_polars_polars_tests.sh b/ci/test_cudf_polars_polars_tests.sh
index f5bcdc62604..fefe26984cb 100755
--- a/ci/test_cudf_polars_polars_tests.sh
+++ b/ci/test_cudf_polars_polars_tests.sh
@@ -3,22 +3,6 @@
 
 set -eou pipefail
 
-# We will only fail these tests if the PR touches code in pylibcudf
-# or cudf_polars itself.
-# Note, the three dots mean we are doing diff between the merge-base
-# of upstream and HEAD. So this is asking, "does _this branch_ touch
-# files in cudf_polars/pylibcudf", rather than "are there changes
-# between upstream and this branch which touch cudf_polars/pylibcudf"
-# TODO: is the target branch exposed anywhere in an environment variable?
-if [ -n "$(git diff --name-only origin/branch-24.12...HEAD -- python/cudf_polars/ python/cudf/cudf/_lib/pylibcudf/)" ];
-then
-    HAS_CHANGES=1
-    rapids-logger "PR has changes in cudf-polars/pylibcudf, test fails treated as failure"
-else
-    HAS_CHANGES=0
-    rapids-logger "PR does not have changes in cudf-polars/pylibcudf, test fails NOT treated as failure"
-fi
-
 rapids-logger "Download wheels"
 
 RAPIDS_PY_CUDA_SUFFIX="$(rapids-wheel-ctk-name-gen ${RAPIDS_CUDA_VERSION})"
@@ -63,9 +47,4 @@ if [ ${EXITCODE} != 0 ]; then
 else
     rapids-logger "Running polars test suite PASSED"
 fi
-
-if [ ${HAS_CHANGES} == 1 ]; then
-    exit ${EXITCODE}
-else
-    exit 0
-fi
+exit ${EXITCODE}
diff --git a/ci/test_wheel_cudf_polars.sh b/ci/test_wheel_cudf_polars.sh
index 2884757e46b..6c827406f78 100755
--- a/ci/test_wheel_cudf_polars.sh
+++ b/ci/test_wheel_cudf_polars.sh
@@ -3,22 +3,6 @@
 
 set -eou pipefail
 
-# We will only fail these tests if the PR touches code in pylibcudf
-# or cudf_polars itself.
-# Note, the three dots mean we are doing diff between the merge-base
-# of upstream and HEAD. So this is asking, "does _this branch_ touch
-# files in cudf_polars/pylibcudf", rather than "are there changes
-# between upstream and this branch which touch cudf_polars/pylibcudf"
-# TODO: is the target branch exposed anywhere in an environment variable?
-if [ -n "$(git diff --name-only origin/branch-24.12...HEAD -- python/cudf_polars/ python/pylibcudf/)" ];
-then
-    HAS_CHANGES=1
-    rapids-logger "PR has changes in cudf-polars/pylibcudf, test fails treated as failure"
-else
-    HAS_CHANGES=0
-    rapids-logger "PR does not have changes in cudf-polars/pylibcudf, test fails NOT treated as failure"
-fi
-
 rapids-logger "Download wheels"
 
 RAPIDS_PY_CUDA_SUFFIX="$(rapids-wheel-ctk-name-gen ${RAPIDS_CUDA_VERSION})"
@@ -65,9 +49,4 @@ if [ ${EXITCODE} != 0 ]; then
 else
     rapids-logger "Testing PASSED"
 fi
-
-if [ ${HAS_CHANGES} == 1 ]; then
-    exit ${EXITCODE}
-else
-    exit 0
-fi
+exit ${EXITCODE}
diff --git a/conda/environments/all_cuda-118_arch-x86_64.yaml b/conda/environments/all_cuda-118_arch-x86_64.yaml
index 01764411346..e91443ddba8 100644
--- a/conda/environments/all_cuda-118_arch-x86_64.yaml
+++ b/conda/environments/all_cuda-118_arch-x86_64.yaml
@@ -66,7 +66,7 @@ dependencies:
 - pandas
 - pandas>=2.0,<2.2.4dev0
 - pandoc
-- polars>=1.11,<1.13
+- polars>=1.11,<1.14
 - pre-commit
 - ptxcompiler
 - pyarrow>=14.0.0,<19.0.0a0
diff --git a/conda/environments/all_cuda-125_arch-x86_64.yaml b/conda/environments/all_cuda-125_arch-x86_64.yaml
index 9074e6332d9..2dccb595e59 100644
--- a/conda/environments/all_cuda-125_arch-x86_64.yaml
+++ b/conda/environments/all_cuda-125_arch-x86_64.yaml
@@ -64,7 +64,7 @@ dependencies:
 - pandas
 - pandas>=2.0,<2.2.4dev0
 - pandoc
-- polars>=1.11,<1.13
+- polars>=1.11,<1.14
 - pre-commit
 - pyarrow>=14.0.0,<19.0.0a0
 - pydata-sphinx-theme!=0.14.2
diff --git a/conda/recipes/cudf-polars/meta.yaml b/conda/recipes/cudf-polars/meta.yaml
index edf92b930d9..7a477291e7a 100644
--- a/conda/recipes/cudf-polars/meta.yaml
+++ b/conda/recipes/cudf-polars/meta.yaml
@@ -43,7 +43,7 @@ requirements:
   run:
     - python
     - pylibcudf ={{ version }}
-    - polars >=1.11,<1.12
+    - polars >=1.11,<1.14
     - {{ pin_compatible('cuda-version', max_pin='x', min_pin='x') }}
 
 test:
diff --git a/dependencies.yaml b/dependencies.yaml
index e47e0c7523c..b5165f82d5f 100644
--- a/dependencies.yaml
+++ b/dependencies.yaml
@@ -734,7 +734,7 @@ dependencies:
     common:
       - output_types: [conda, requirements, pyproject]
         packages:
-          - polars>=1.11,<1.13
+          - polars>=1.11,<1.14
   run_dask_cudf:
     common:
       - output_types: [conda, requirements, pyproject]
diff --git a/python/cudf_polars/cudf_polars/callback.py b/python/cudf_polars/cudf_polars/callback.py
index ff4933c7564..d085f21e0ad 100644
--- a/python/cudf_polars/cudf_polars/callback.py
+++ b/python/cudf_polars/cudf_polars/callback.py
@@ -148,12 +148,7 @@ def _callback(
         return ir.evaluate(cache={}).to_polars()
 
 
-def execute_with_cudf(
-    nt: NodeTraverser,
-    *,
-    config: GPUEngine,
-    exception: type[Exception] | tuple[type[Exception], ...] = Exception,
-) -> None:
+def execute_with_cudf(nt: NodeTraverser, *, config: GPUEngine) -> None:
     """
     A post optimization callback that attempts to execute the plan with cudf.
 
@@ -165,10 +160,15 @@ def execute_with_cudf(
     config
         GPUEngine configuration object
 
-    exception
-        Optional exception, or tuple of exceptions, to catch during
-        translation. Defaults to ``Exception``.
+    Raises
+    ------
+    ValueError
+        If the config contains unsupported keys.
+    NotImplementedError
+        If translation of the plan is unsupported.
 
+    Notes
+    -----
     The NodeTraverser is mutated if the libcudf executor can handle the plan.
     """
     device = config.device
@@ -178,38 +178,27 @@ def execute_with_cudf(
         raise ValueError(
             f"Engine configuration contains unsupported settings {unsupported}"
         )
-    try:
-        with nvtx.annotate(message="ConvertIR", domain="cudf_polars"):
-            translator = Translator(nt)
-            ir = translator.translate_ir()
-            ir_translation_errors = translator.errors
-            if len(ir_translation_errors):
-                # TODO: Display these errors in user-friendly way.
-                # tracked in https://github.com/rapidsai/cudf/issues/17051
-                unique_errors = sorted(set(ir_translation_errors), key=str)
-                error_message = "Query contained unsupported operations"
-                verbose_error_message = (
-                    f"{error_message}\nThe errors were:\n{unique_errors}"
-                )
-                unsupported_ops_exception = NotImplementedError(
-                    error_message, unique_errors
-                )
-                if bool(int(os.environ.get("POLARS_VERBOSE", 0))):
-                    warnings.warn(verbose_error_message, UserWarning, stacklevel=2)
-                if raise_on_fail:
-                    raise unsupported_ops_exception
-            else:
-                nt.set_udf(
-                    partial(
-                        _callback, ir, device=device, memory_resource=memory_resource
-                    )
-                )
-    except exception as e:
-        if bool(int(os.environ.get("POLARS_VERBOSE", 0))):
-            warnings.warn(
-                f"Query execution with GPU not supported, reason: {type(e)}: {e}",
-                PerformanceWarning,
-                stacklevel=2,
+    with nvtx.annotate(message="ConvertIR", domain="cudf_polars"):
+        translator = Translator(nt)
+        ir = translator.translate_ir()
+        ir_translation_errors = translator.errors
+        if len(ir_translation_errors):
+            # TODO: Display these errors in user-friendly way.
+            # tracked in https://github.com/rapidsai/cudf/issues/17051
+            unique_errors = sorted(set(ir_translation_errors), key=str)
+            formatted_errors = "\n".join(
+                f"- {e.__class__.__name__}: {e}" for e in unique_errors
+            )
+            error_message = (
+                "Query execution with GPU not possible: unsupported operations."
+                f"\nThe errors were:\n{formatted_errors}"
+            )
+            exception = NotImplementedError(error_message, unique_errors)
+            if bool(int(os.environ.get("POLARS_VERBOSE", 0))):
+                warnings.warn(error_message, PerformanceWarning, stacklevel=2)
+            if raise_on_fail:
+                raise exception
+        else:
+            nt.set_udf(
+                partial(_callback, ir, device=device, memory_resource=memory_resource)
             )
-        if raise_on_fail:
-            raise
diff --git a/python/cudf_polars/cudf_polars/dsl/ir.py b/python/cudf_polars/cudf_polars/dsl/ir.py
index 1f935190f28..98e8a83b04e 100644
--- a/python/cudf_polars/cudf_polars/dsl/ir.py
+++ b/python/cudf_polars/cudf_polars/dsl/ir.py
@@ -227,6 +227,7 @@ class ErrorNode(IR):
     def __init__(self, schema: Schema, error: str):
         self.schema = schema
         self.error = error
+        self.children = ()
 
 
 class PythonScan(IR):
@@ -546,7 +547,7 @@ def do_evaluate(
                 # shifts the row index.
                 # But prior to 1.13, polars had this wrong, so we match behaviour
                 # https://github.com/pola-rs/polars/issues/19607
-                offset += skip_rows  # pragma: no cover; polars 1.13 not yet released
+                offset += skip_rows
             dtype = schema[name]
             step = plc.interop.from_arrow(
                 pa.scalar(1, type=plc.interop.to_arrow(dtype))
diff --git a/python/cudf_polars/cudf_polars/dsl/nodebase.py b/python/cudf_polars/cudf_polars/dsl/nodebase.py
index 228d300f467..dd5c40a00be 100644
--- a/python/cudf_polars/cudf_polars/dsl/nodebase.py
+++ b/python/cudf_polars/cudf_polars/dsl/nodebase.py
@@ -43,9 +43,7 @@ class Node(Generic[T]):
     def _ctor_arguments(self, children: Sequence[T]) -> Sequence[Any | T]:
         return (*(getattr(self, attr) for attr in self._non_child), *children)
 
-    def reconstruct(
-        self, children: Sequence[T]
-    ) -> Self:  # pragma: no cover; not yet used
+    def reconstruct(self, children: Sequence[T]) -> Self:
         """
         Rebuild this node with new children.
 
diff --git a/python/cudf_polars/cudf_polars/testing/plugin.py b/python/cudf_polars/cudf_polars/testing/plugin.py
index 2f95cd38c57..080a1af6e19 100644
--- a/python/cudf_polars/cudf_polars/testing/plugin.py
+++ b/python/cudf_polars/cudf_polars/testing/plugin.py
@@ -40,7 +40,7 @@ def pytest_configure(config: pytest.Config) -> None:
     )
     config.addinivalue_line(
         "filterwarnings",
-        "ignore:.*Query execution with GPU not supported",
+        "ignore:.*Query execution with GPU not possible",
     )
 
 
diff --git a/python/cudf_polars/pyproject.toml b/python/cudf_polars/pyproject.toml
index 32ea142a96c..785e87391e7 100644
--- a/python/cudf_polars/pyproject.toml
+++ b/python/cudf_polars/pyproject.toml
@@ -19,7 +19,7 @@ authors = [
 license = { text = "Apache 2.0" }
 requires-python = ">=3.10"
 dependencies = [
-    "polars>=1.11,<1.13",
+    "polars>=1.11,<1.14",
     "pylibcudf==24.12.*,>=0.0.0a0",
 ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`.
 classifiers = [
diff --git a/python/cudf_polars/tests/test_config.py b/python/cudf_polars/tests/test_config.py
index 9900f598e5f..25b71716eed 100644
--- a/python/cudf_polars/tests/test_config.py
+++ b/python/cudf_polars/tests/test_config.py
@@ -30,7 +30,7 @@ def raise_unimplemented(self, *args):
         pytest.raises(pl.exceptions.ComputeError),
         pytest.warns(
             pl.exceptions.PerformanceWarning,
-            match="Query execution with GPU not supported",
+            match="Query execution with GPU not possible",
         ),
     ):
         # And ensure that collecting issues the correct warning.

From 487f97c036ae7919e98ddc8bf5412a8002a493c5 Mon Sep 17 00:00:00 2001
From: Vukasin Milovanovic <vmilovanovic@nvidia.com>
Date: Tue, 12 Nov 2024 15:20:58 -0800
Subject: [PATCH 19/19] Always prefer `device_read`s and `device_write`s when
 kvikIO is enabled (#17260)

Issue #17259

Avoid checking `_gds_read_preferred_threshold` threshold when deciding whether `device_read`/`device_write` is preferred to host IO + copy. The reasons are twofold:
1. KvikIO already has an internal threshold for GDS use so we don't need to check on our end as well.
2. Without actual GDS use, kvikIO uses a pinned bounce buffer to efficiently copy to/from the device.

Authors:
  - Vukasin Milovanovic (https://github.com/vuule)

Approvers:
  - Tianyu Liu (https://github.com/kingcrimsontianyu)
  - Basit Ayantunde (https://github.com/lamarrr)

URL: https://github.com/rapidsai/cudf/pull/17260
---
 cpp/src/io/utilities/data_sink.cpp  | 8 ++++++--
 cpp/src/io/utilities/datasource.cpp | 8 ++++++--
 2 files changed, 12 insertions(+), 4 deletions(-)

diff --git a/cpp/src/io/utilities/data_sink.cpp b/cpp/src/io/utilities/data_sink.cpp
index 15de5d85614..68377ad6d5f 100644
--- a/cpp/src/io/utilities/data_sink.cpp
+++ b/cpp/src/io/utilities/data_sink.cpp
@@ -72,8 +72,12 @@ class file_sink : public data_sink {
 
   [[nodiscard]] bool is_device_write_preferred(size_t size) const override
   {
-    if (size < _gds_write_preferred_threshold) { return false; }
-    return supports_device_write();
+    if (!supports_device_write()) { return false; }
+
+    // Always prefer device writes if kvikio is enabled
+    if (!_kvikio_file.closed()) { return true; }
+
+    return size >= _gds_write_preferred_threshold;
   }
 
   std::future<void> device_write_async(void const* gpu_data,
diff --git a/cpp/src/io/utilities/datasource.cpp b/cpp/src/io/utilities/datasource.cpp
index 5ccc91e4220..0870e4a84a7 100644
--- a/cpp/src/io/utilities/datasource.cpp
+++ b/cpp/src/io/utilities/datasource.cpp
@@ -95,8 +95,12 @@ class file_source : public datasource {
 
   [[nodiscard]] bool is_device_read_preferred(size_t size) const override
   {
-    if (size < _gds_read_preferred_threshold) { return false; }
-    return supports_device_read();
+    if (!supports_device_read()) { return false; }
+
+    // Always prefer device reads if kvikio is enabled
+    if (!_kvikio_file.closed()) { return true; }
+
+    return size >= _gds_read_preferred_threshold;
   }
 
   std::future<size_t> device_read_async(size_t offset,