diff --git a/.github/ops-bot.yaml b/.github/ops-bot.yaml
index 9a0b4155035..d2ca78924e1 100644
--- a/.github/ops-bot.yaml
+++ b/.github/ops-bot.yaml
@@ -5,4 +5,3 @@ auto_merger: true
 branch_checker: true
 label_checker: true
 release_drafter: true
-recently_updated: true
diff --git a/ci/build_docs.sh b/ci/build_docs.sh
index 11e7a96b751..d5b0c9a5edb 100755
--- a/ci/build_docs.sh
+++ b/ci/build_docs.sh
@@ -30,7 +30,7 @@ export RAPIDS_DOCS_DIR="$(mktemp -d)"
 
 rapids-logger "Build CPP docs"
 pushd cpp/doxygen
-aws s3 cp s3://rapidsai-docs/librmm/${RAPIDS_VERSION_NUMBER}/html/rmm.tag . || echo "Failed to download rmm Doxygen tag"
+aws s3 cp s3://rapidsai-docs/librmm/html/${RAPIDS_VERSION_NUMBER}/rmm.tag . || echo "Failed to download rmm Doxygen tag"
 doxygen Doxyfile
 mkdir -p "${RAPIDS_DOCS_DIR}/libcudf/html"
 mv html/* "${RAPIDS_DOCS_DIR}/libcudf/html"
diff --git a/conda/environments/all_cuda-118_arch-x86_64.yaml b/conda/environments/all_cuda-118_arch-x86_64.yaml
index 151d250d7e9..409405a8a73 100644
--- a/conda/environments/all_cuda-118_arch-x86_64.yaml
+++ b/conda/environments/all_cuda-118_arch-x86_64.yaml
@@ -9,6 +9,7 @@ channels:
 - nvidia
 dependencies:
 - aiobotocore>=2.2.0
+- aws-sdk-cpp<1.11
 - benchmark==1.8.0
 - boto3>=1.21.21
 - botocore>=1.24.21
diff --git a/conda/environments/all_cuda-120_arch-x86_64.yaml b/conda/environments/all_cuda-120_arch-x86_64.yaml
index ad52b8f8b97..2abc1769962 100644
--- a/conda/environments/all_cuda-120_arch-x86_64.yaml
+++ b/conda/environments/all_cuda-120_arch-x86_64.yaml
@@ -9,6 +9,7 @@ channels:
 - nvidia
 dependencies:
 - aiobotocore>=2.2.0
+- aws-sdk-cpp<1.11
 - benchmark==1.8.0
 - boto3>=1.21.21
 - botocore>=1.24.21
diff --git a/conda/recipes/libcudf/conda_build_config.yaml b/conda/recipes/libcudf/conda_build_config.yaml
index 25b3f19de77..b1f5b083e06 100644
--- a/conda/recipes/libcudf/conda_build_config.yaml
+++ b/conda/recipes/libcudf/conda_build_config.yaml
@@ -22,6 +22,9 @@ gbench_version:
 gtest_version:
   - ">=1.13.0"
 
+aws_sdk_cpp_version:
+  - "<1.11"
+
 libarrow_version:
   - "=12"
 
diff --git a/conda/recipes/libcudf/meta.yaml b/conda/recipes/libcudf/meta.yaml
index 627065817ba..28357f0d96d 100644
--- a/conda/recipes/libcudf/meta.yaml
+++ b/conda/recipes/libcudf/meta.yaml
@@ -74,6 +74,7 @@ requirements:
     - gtest {{ gtest_version }}
     - gmock {{ gtest_version }}
     - zlib {{ zlib_version }}
+    - aws-sdk-cpp {{ aws_sdk_cpp_version }}
 
 outputs:
   - name: libcudf
@@ -107,6 +108,7 @@ outputs:
         - dlpack {{ dlpack_version }}
         - gtest {{ gtest_version }}
         - gmock {{ gtest_version }}
+        - aws-sdk-cpp {{ aws_sdk_cpp_version }}
     test:
       commands:
         - test -f $PREFIX/lib/libcudf.so
diff --git a/cpp/benchmarks/CMakeLists.txt b/cpp/benchmarks/CMakeLists.txt
index 5e7862f4b3b..cd6b3cfdc03 100644
--- a/cpp/benchmarks/CMakeLists.txt
+++ b/cpp/benchmarks/CMakeLists.txt
@@ -173,7 +173,7 @@ ConfigureBench(ITERATOR_BENCH iterator/iterator.cu)
 # ##################################################################################################
 # * search benchmark ------------------------------------------------------------------------------
 ConfigureBench(SEARCH_BENCH search/search.cpp)
-ConfigureNVBench(SEARCH_NVBENCH search/contains.cpp)
+ConfigureNVBench(SEARCH_NVBENCH search/contains_scalar.cpp search/contains_table.cpp)
 
 # ##################################################################################################
 # * sort benchmark --------------------------------------------------------------------------------
diff --git a/cpp/benchmarks/io/cuio_common.cpp b/cpp/benchmarks/io/cuio_common.cpp
index 6b8af91b842..b1aaef41340 100644
--- a/cpp/benchmarks/io/cuio_common.cpp
+++ b/cpp/benchmarks/io/cuio_common.cpp
@@ -15,6 +15,7 @@
  */
 
 #include <benchmarks/io/cuio_common.hpp>
+#include <cudf/detail/utilities/integer_utils.hpp>
 #include <cudf/detail/utilities/logger.hpp>
 
 #include <cstdio>
@@ -141,17 +142,18 @@ std::vector<std::string> select_column_names(std::vector<std::string> const& col
   return col_names_to_read;
 }
 
-std::vector<cudf::size_type> segments_in_chunk(int num_segments, int num_chunks, int chunk)
+std::vector<cudf::size_type> segments_in_chunk(int num_segments, int num_chunks, int chunk_idx)
 {
   CUDF_EXPECTS(num_segments >= num_chunks,
                "Number of chunks cannot be greater than the number of segments in the file");
-  auto start_segment = [num_segments, num_chunks](int chunk) {
-    return num_segments * chunk / num_chunks;
-  };
-  std::vector<cudf::size_type> selected_segments;
-  for (auto segment = start_segment(chunk); segment < start_segment(chunk + 1); ++segment) {
-    selected_segments.push_back(segment);
-  }
+  CUDF_EXPECTS(chunk_idx < num_chunks,
+               "Chunk index must be smaller than the number of chunks in the file");
+
+  auto const segments_in_chunk = cudf::util::div_rounding_up_unsafe(num_segments, num_chunks);
+  auto const begin_segment     = std::min(chunk_idx * segments_in_chunk, num_segments);
+  auto const end_segment       = std::min(begin_segment + segments_in_chunk, num_segments);
+  std::vector<cudf::size_type> selected_segments(end_segment - begin_segment);
+  std::iota(selected_segments.begin(), selected_segments.end(), begin_segment);
 
   return selected_segments;
 }
diff --git a/cpp/benchmarks/io/orc/orc_reader_options.cpp b/cpp/benchmarks/io/orc/orc_reader_options.cpp
index 647a411c89d..1f656f7ea70 100644
--- a/cpp/benchmarks/io/orc/orc_reader_options.cpp
+++ b/cpp/benchmarks/io/orc/orc_reader_options.cpp
@@ -19,6 +19,7 @@
 #include <benchmarks/io/cuio_common.hpp>
 #include <benchmarks/io/nvbench_helpers.hpp>
 
+#include <cudf/detail/utilities/integer_utils.hpp>
 #include <cudf/io/orc.hpp>
 #include <cudf/io/orc_metadata.hpp>
 #include <cudf/utilities/default_stream.hpp>
@@ -30,7 +31,7 @@
 constexpr int64_t data_size = 512 << 20;
 // The number of separate read calls to use when reading files in multiple chunks
 // Each call reads roughly equal amounts of data
-constexpr int32_t chunked_read_num_chunks = 8;
+constexpr int32_t chunked_read_num_chunks = 4;
 
 std::vector<std::string> get_top_level_col_names(cudf::io::source_info const& source)
 {
@@ -88,7 +89,7 @@ void BM_orc_read_varying_options(nvbench::state& state,
 
   auto const num_stripes =
     cudf::io::read_orc_metadata(source_sink.make_source_info()).num_stripes();
-  cudf::size_type const chunk_row_cnt = view.num_rows() / num_chunks;
+  auto const chunk_row_cnt = cudf::util::div_rounding_up_unsafe(view.num_rows(), num_chunks);
 
   auto mem_stats_logger = cudf::memory_stats_logger();
   state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value()));
@@ -99,7 +100,6 @@ void BM_orc_read_varying_options(nvbench::state& state,
       timer.start();
       cudf::size_type rows_read = 0;
       for (int32_t chunk = 0; chunk < num_chunks; ++chunk) {
-        auto const is_last_chunk = chunk == (num_chunks - 1);
         switch (RowSelection) {
           case row_selection::ALL: break;
           case row_selection::STRIPES:
@@ -108,7 +108,6 @@ void BM_orc_read_varying_options(nvbench::state& state,
           case row_selection::NROWS:
             read_options.set_skip_rows(chunk * chunk_row_cnt);
             read_options.set_num_rows(chunk_row_cnt);
-            if (is_last_chunk) read_options.set_num_rows(-1);
             break;
           default: CUDF_FAIL("Unsupported row selection method");
         }
@@ -132,9 +131,6 @@ using col_selections = nvbench::enum_type_list<column_selection::ALL,
                                                column_selection::ALTERNATE,
                                                column_selection::FIRST_HALF,
                                                column_selection::SECOND_HALF>;
-using row_selections =
-  nvbench::enum_type_list<row_selection::ALL, row_selection::STRIPES, row_selection::NROWS>;
-
 NVBENCH_BENCH_TYPES(BM_orc_read_varying_options,
                     NVBENCH_TYPE_AXES(col_selections,
                                       nvbench::enum_type_list<row_selection::ALL>,
@@ -146,6 +142,8 @@ NVBENCH_BENCH_TYPES(BM_orc_read_varying_options,
     {"column_selection", "row_selection", "uses_index", "uses_numpy_dtype", "timestamp_type"})
   .set_min_samples(4);
 
+using row_selections =
+  nvbench::enum_type_list<row_selection::ALL, row_selection::NROWS, row_selection::STRIPES>;
 NVBENCH_BENCH_TYPES(BM_orc_read_varying_options,
                     NVBENCH_TYPE_AXES(nvbench::enum_type_list<column_selection::ALL>,
                                       row_selections,
diff --git a/cpp/benchmarks/io/parquet/parquet_reader_options.cpp b/cpp/benchmarks/io/parquet/parquet_reader_options.cpp
index 4105f2182d7..9f221de7da2 100644
--- a/cpp/benchmarks/io/parquet/parquet_reader_options.cpp
+++ b/cpp/benchmarks/io/parquet/parquet_reader_options.cpp
@@ -19,6 +19,7 @@
 #include <benchmarks/io/cuio_common.hpp>
 #include <benchmarks/io/nvbench_helpers.hpp>
 
+#include <cudf/detail/utilities/integer_utils.hpp>
 #include <cudf/io/parquet.hpp>
 #include <cudf/utilities/default_stream.hpp>
 
@@ -26,21 +27,21 @@
 
 // Size of the data in the benchmark dataframe; chosen to be low enough to allow benchmarks to
 // run on most GPUs, but large enough to allow highest throughput
-constexpr std::size_t data_size      = 512 << 20;
-constexpr std::size_t row_group_size = 128 << 20;
+constexpr std::size_t data_size = 512 << 20;
+// The number of separate read calls to use when reading files in multiple chunks
+// Each call reads roughly equal amounts of data
+constexpr int32_t chunked_read_num_chunks = 4;
 
 std::vector<std::string> get_top_level_col_names(cudf::io::source_info const& source)
 {
-  cudf::io::parquet_reader_options const read_options =
-    cudf::io::parquet_reader_options::builder(source);
-  auto const schema = cudf::io::read_parquet(read_options).metadata.schema_info;
-
-  std::vector<std::string> names;
-  names.reserve(schema.size());
-  std::transform(schema.cbegin(), schema.cend(), std::back_inserter(names), [](auto const& c) {
-    return c.name;
-  });
-  return names;
+  auto const top_lvl_cols = cudf::io::read_parquet_metadata(source).schema().root().children();
+  std::vector<std::string> col_names;
+  std::transform(top_lvl_cols.cbegin(),
+                 top_lvl_cols.cend(),
+                 std::back_inserter(col_names),
+                 [](auto const& col_meta) { return col_meta.name(); });
+
+  return col_names;
 }
 
 template <column_selection ColSelection,
@@ -55,6 +56,8 @@ void BM_parquet_read_options(nvbench::state& state,
                                                 nvbench::enum_type<UsesPandasMetadata>,
                                                 nvbench::enum_type<Timestamp>>)
 {
+  auto const num_chunks = RowSelection == row_selection::ALL ? 1 : chunked_read_num_chunks;
+
   auto constexpr str_to_categories = ConvertsStrings == converts_strings::YES;
   auto constexpr uses_pd_metadata  = UsesPandasMetadata == uses_pandas_metadata::YES;
 
@@ -87,9 +90,8 @@ void BM_parquet_read_options(nvbench::state& state,
       .use_pandas_metadata(uses_pd_metadata)
       .timestamp_type(ts_type);
 
-  // TODO: add read_parquet_metadata to properly calculate #row_groups
-  auto constexpr num_row_groups = data_size / row_group_size;
-  auto constexpr num_chunks     = 1;
+  auto const num_row_groups = read_parquet_metadata(source_sink.make_source_info()).num_rowgroups();
+  auto const chunk_row_cnt  = cudf::util::div_rounding_up_unsafe(view.num_rows(), num_chunks);
 
   auto mem_stats_logger = cudf::memory_stats_logger();
   state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value()));
@@ -100,18 +102,15 @@ void BM_parquet_read_options(nvbench::state& state,
       timer.start();
       cudf::size_type rows_read = 0;
       for (int32_t chunk = 0; chunk < num_chunks; ++chunk) {
-        auto const is_last_chunk = chunk == (num_chunks - 1);
         switch (RowSelection) {
           case row_selection::ALL: break;
           case row_selection::ROW_GROUPS: {
-            auto row_groups_to_read = segments_in_chunk(num_row_groups, num_chunks, chunk);
-            if (is_last_chunk) {
-              // Need to assume that an additional "overflow" row group is present
-              row_groups_to_read.push_back(num_row_groups);
-            }
-            read_options.set_row_groups({row_groups_to_read});
+            read_options.set_row_groups({segments_in_chunk(num_row_groups, num_chunks, chunk)});
           } break;
-          case row_selection::NROWS: [[fallthrough]];
+          case row_selection::NROWS:
+            read_options.set_skip_rows(chunk * chunk_row_cnt);
+            read_options.set_num_rows(chunk_row_cnt);
+            break;
           default: CUDF_FAIL("Unsupported row selection method");
         }
 
@@ -130,14 +129,26 @@ void BM_parquet_read_options(nvbench::state& state,
   state.add_buffer_size(source_sink.size(), "encoded_file_size", "encoded_file_size");
 }
 
+using row_selections =
+  nvbench::enum_type_list<row_selection::ALL, row_selection::NROWS, row_selection::ROW_GROUPS>;
+NVBENCH_BENCH_TYPES(BM_parquet_read_options,
+                    NVBENCH_TYPE_AXES(nvbench::enum_type_list<column_selection::ALL>,
+                                      row_selections,
+                                      nvbench::enum_type_list<converts_strings::YES>,
+                                      nvbench::enum_type_list<uses_pandas_metadata::YES>,
+                                      nvbench::enum_type_list<cudf::type_id::EMPTY>))
+  .set_name("parquet_read_row_selection")
+  .set_type_axes_names({"column_selection",
+                        "row_selection",
+                        "str_to_categories",
+                        "uses_pandas_metadata",
+                        "timestamp_type"})
+  .set_min_samples(4);
+
 using col_selections = nvbench::enum_type_list<column_selection::ALL,
                                                column_selection::ALTERNATE,
                                                column_selection::FIRST_HALF,
                                                column_selection::SECOND_HALF>;
-
-// TODO: row_selection::ROW_GROUPS disabled until we add an API to read metadata from a parquet file
-// and determine num row groups. https://github.com/rapidsai/cudf/pull/9963#issuecomment-1004832863
-
 NVBENCH_BENCH_TYPES(BM_parquet_read_options,
                     NVBENCH_TYPE_AXES(col_selections,
                                       nvbench::enum_type_list<row_selection::ALL>,
diff --git a/cpp/benchmarks/search/contains.cpp b/cpp/benchmarks/search/contains_scalar.cpp
similarity index 100%
rename from cpp/benchmarks/search/contains.cpp
rename to cpp/benchmarks/search/contains_scalar.cpp
diff --git a/cpp/benchmarks/search/contains_table.cpp b/cpp/benchmarks/search/contains_table.cpp
new file mode 100644
index 00000000000..17702d0741c
--- /dev/null
+++ b/cpp/benchmarks/search/contains_table.cpp
@@ -0,0 +1,73 @@
+/*
+ * Copyright (c) 2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <benchmarks/common/generate_input.hpp>
+#include <benchmarks/fixture/benchmark_fixture.hpp>
+
+#include <cudf/detail/search.hpp>
+#include <cudf/lists/list_view.hpp>
+#include <cudf/types.hpp>
+
+#include <rmm/mr/device/per_device_resource.hpp>
+
+#include <nvbench/nvbench.cuh>
+
+auto constexpr num_unique_elements = 1000;
+
+template <typename Type>
+static void nvbench_contains_table(nvbench::state& state, nvbench::type_list<Type>)
+{
+  auto const size               = state.get_int64("table_size");
+  auto const dtype              = cudf::type_to_id<Type>();
+  double const null_probability = state.get_float64("null_probability");
+
+  auto builder = data_profile_builder().null_probability(null_probability);
+  if (dtype == cudf::type_id::LIST) {
+    builder.distribution(dtype, distribution_id::UNIFORM, 0, num_unique_elements)
+      .distribution(cudf::type_id::INT32, distribution_id::UNIFORM, 0, num_unique_elements)
+      .list_depth(1);
+  } else {
+    builder.distribution(dtype, distribution_id::UNIFORM, 0, num_unique_elements);
+  }
+
+  auto const haystack = create_random_table(
+    {dtype}, table_size_bytes{static_cast<size_t>(size)}, data_profile{builder}, 0);
+  auto const needles = create_random_table(
+    {dtype}, table_size_bytes{static_cast<size_t>(size)}, data_profile{builder}, 1);
+
+  auto mem_stats_logger = cudf::memory_stats_logger();
+
+  state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) {
+    auto const stream_view = rmm::cuda_stream_view{launch.get_stream()};
+    [[maybe_unused]] auto const result =
+      cudf::detail::contains(haystack->view(),
+                             needles->view(),
+                             cudf::null_equality::EQUAL,
+                             cudf::nan_equality::ALL_EQUAL,
+                             stream_view,
+                             rmm::mr::get_current_device_resource());
+  });
+
+  state.add_buffer_size(
+    mem_stats_logger.peak_memory_usage(), "peak_memory_usage", "peak_memory_usage");
+}
+
+NVBENCH_BENCH_TYPES(nvbench_contains_table,
+                    NVBENCH_TYPE_AXES(nvbench::type_list<int32_t, cudf::list_view>))
+  .set_name("contains_table")
+  .set_type_axes_names({"type"})
+  .add_float64_axis("null_probability", {0.0, 0.1})
+  .add_int64_axis("table_size", {10'000, 100'000, 1'000'000, 10'000'000});
diff --git a/cpp/benchmarks/text/ngrams.cpp b/cpp/benchmarks/text/ngrams.cpp
index 0319577f6b9..f3fd5cc5729 100644
--- a/cpp/benchmarks/text/ngrams.cpp
+++ b/cpp/benchmarks/text/ngrams.cpp
@@ -36,11 +36,12 @@ static void BM_ngrams(benchmark::State& state, ngrams_type nt)
     cudf::type_id::STRING, distribution_id::NORMAL, 0, max_str_length);
   auto const column = create_random_column(cudf::type_id::STRING, row_count{n_rows}, profile);
   cudf::strings_column_view input(column->view());
+  auto const separator = cudf::string_scalar("_");
 
   for (auto _ : state) {
     cuda_event_timer raii(state, true);
     switch (nt) {
-      case ngrams_type::tokens: nvtext::generate_ngrams(input); break;
+      case ngrams_type::tokens: nvtext::generate_ngrams(input, 2, separator); break;
       case ngrams_type::characters: nvtext::generate_character_ngrams(input); break;
     }
   }
diff --git a/cpp/benchmarks/text/tokenize.cpp b/cpp/benchmarks/text/tokenize.cpp
index 423fe667b05..b556a84c541 100644
--- a/cpp/benchmarks/text/tokenize.cpp
+++ b/cpp/benchmarks/text/tokenize.cpp
@@ -67,8 +67,11 @@ static void bench_tokenize(nvbench::state& state)
       auto result = nvtext::count_tokens(input, cudf::strings_column_view(delimiters));
     });
   } else if (tokenize_type == "ngrams") {
-    state.exec(nvbench::exec_tag::sync,
-               [&](nvbench::launch& launch) { auto result = nvtext::ngrams_tokenize(input); });
+    auto const delimiter = cudf::string_scalar("");
+    auto const separator = cudf::string_scalar("_");
+    state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) {
+      auto result = nvtext::ngrams_tokenize(input, 2, delimiter, separator);
+    });
   } else if (tokenize_type == "characters") {
     state.exec(nvbench::exec_tag::sync,
                [&](nvbench::launch& launch) { auto result = nvtext::character_tokenize(input); });
diff --git a/cpp/include/cudf/detail/indexalator.cuh b/cpp/include/cudf/detail/indexalator.cuh
index 4731c4919e3..6532dae3695 100644
--- a/cpp/include/cudf/detail/indexalator.cuh
+++ b/cpp/include/cudf/detail/indexalator.cuh
@@ -16,14 +16,13 @@
 
 #pragma once
 
+#include <cudf/detail/normalizing_iterator.cuh>
+
 #include <cudf/column/column_view.hpp>
 #include <cudf/detail/iterator.cuh>
 #include <cudf/scalar/scalar.hpp>
 #include <cudf/utilities/traits.hpp>
 
-#include <thrust/binary_search.h>
-#include <thrust/execution_policy.h>
-#include <thrust/functional.h>
 #include <thrust/iterator/constant_iterator.h>
 #include <thrust/iterator/transform_iterator.h>
 #include <thrust/optional.h>
@@ -32,193 +31,6 @@
 namespace cudf {
 namespace detail {
 
-/**
- * @brief The base class for the input or output index normalizing iterator.
- *
- * This implementation uses CRTP to define the `input_indexalator` and the
- * `output_indexalator` classes. This is so this class can manipulate the
- * uniquely typed subclass member variable `p_` directly without requiring
- * virtual functions since iterator instances will be copied to device memory.
- *
- * The base class mainly manages updating the `p_` member variable while the
- * subclasses handle accessing individual elements in device memory.
- *
- * @tparam T The derived class type for the iterator.
- */
-template <class T>
-struct base_indexalator {
-  using difference_type   = ptrdiff_t;
-  using value_type        = size_type;
-  using pointer           = size_type*;
-  using iterator_category = std::random_access_iterator_tag;
-
-  base_indexalator()                                   = default;
-  base_indexalator(base_indexalator const&)            = default;
-  base_indexalator(base_indexalator&&)                 = default;
-  base_indexalator& operator=(base_indexalator const&) = default;
-  base_indexalator& operator=(base_indexalator&&)      = default;
-
-  /**
-   * @brief Prefix increment operator.
-   */
-  CUDF_HOST_DEVICE inline T& operator++()
-  {
-    T& derived = static_cast<T&>(*this);
-    derived.p_ += width_;
-    return derived;
-  }
-
-  /**
-   * @brief Postfix increment operator.
-   */
-  CUDF_HOST_DEVICE inline T operator++(int)
-  {
-    T tmp{static_cast<T&>(*this)};
-    operator++();
-    return tmp;
-  }
-
-  /**
-   * @brief Prefix decrement operator.
-   */
-  CUDF_HOST_DEVICE inline T& operator--()
-  {
-    T& derived = static_cast<T&>(*this);
-    derived.p_ -= width_;
-    return derived;
-  }
-
-  /**
-   * @brief Postfix decrement operator.
-   */
-  CUDF_HOST_DEVICE inline T operator--(int)
-  {
-    T tmp{static_cast<T&>(*this)};
-    operator--();
-    return tmp;
-  }
-
-  /**
-   * @brief Compound assignment by sum operator.
-   */
-  CUDF_HOST_DEVICE inline T& operator+=(difference_type offset)
-  {
-    T& derived = static_cast<T&>(*this);
-    derived.p_ += offset * width_;
-    return derived;
-  }
-
-  /**
-   * @brief Increment by offset operator.
-   */
-  CUDF_HOST_DEVICE inline T operator+(difference_type offset) const
-  {
-    auto tmp = T{static_cast<T const&>(*this)};
-    tmp.p_ += (offset * width_);
-    return tmp;
-  }
-
-  /**
-   * @brief Addition assignment operator.
-   */
-  CUDF_HOST_DEVICE inline friend T operator+(difference_type offset, T const& rhs)
-  {
-    T tmp{rhs};
-    tmp.p_ += (offset * rhs.width_);
-    return tmp;
-  }
-
-  /**
-   * @brief Compound assignment by difference operator.
-   */
-  CUDF_HOST_DEVICE inline T& operator-=(difference_type offset)
-  {
-    T& derived = static_cast<T&>(*this);
-    derived.p_ -= offset * width_;
-    return derived;
-  }
-
-  /**
-   * @brief Decrement by offset operator.
-   */
-  CUDF_HOST_DEVICE inline T operator-(difference_type offset) const
-  {
-    auto tmp = T{static_cast<T const&>(*this)};
-    tmp.p_ -= (offset * width_);
-    return tmp;
-  }
-
-  /**
-   * @brief Subtraction assignment operator.
-   */
-  CUDF_HOST_DEVICE inline friend T operator-(difference_type offset, T const& rhs)
-  {
-    T tmp{rhs};
-    tmp.p_ -= (offset * rhs.width_);
-    return tmp;
-  }
-
-  /**
-   * @brief Compute offset from iterator difference operator.
-   */
-  CUDF_HOST_DEVICE inline difference_type operator-(T const& rhs) const
-  {
-    return (static_cast<T const&>(*this).p_ - rhs.p_) / width_;
-  }
-
-  /**
-   * @brief Equals to operator.
-   */
-  CUDF_HOST_DEVICE inline bool operator==(T const& rhs) const
-  {
-    return rhs.p_ == static_cast<T const&>(*this).p_;
-  }
-  /**
-   * @brief Not equals to operator.
-   */
-  CUDF_HOST_DEVICE inline bool operator!=(T const& rhs) const
-  {
-    return rhs.p_ != static_cast<T const&>(*this).p_;
-  }
-  /**
-   * @brief Less than operator.
-   */
-  CUDF_HOST_DEVICE inline bool operator<(T const& rhs) const
-  {
-    return static_cast<T const&>(*this).p_ < rhs.p_;
-  }
-  /**
-   * @brief Greater than operator.
-   */
-  CUDF_HOST_DEVICE inline bool operator>(T const& rhs) const
-  {
-    return static_cast<T const&>(*this).p_ > rhs.p_;
-  }
-  /**
-   * @brief Less than or equals to operator.
-   */
-  CUDF_HOST_DEVICE inline bool operator<=(T const& rhs) const
-  {
-    return static_cast<T const&>(*this).p_ <= rhs.p_;
-  }
-  /**
-   * @brief Greater than or equals to operator.
-   */
-  CUDF_HOST_DEVICE inline bool operator>=(T const& rhs) const
-  {
-    return static_cast<T const&>(*this).p_ >= rhs.p_;
-  }
-
- protected:
-  /**
-   * @brief Constructor assigns width and type member variables for base class.
-   */
-  base_indexalator(int32_t width, data_type dtype) : width_(width), dtype_(dtype) {}
-
-  int width_;        /// integer type width = 1,2,4, or 8
-  data_type dtype_;  /// for type-dispatcher calls
-};
-
 /**
  * @brief The index normalizing input iterator.
  *
@@ -244,65 +56,7 @@ struct base_indexalator {
  *  auto result = thrust::find(thrust::device, begin, end, size_type{12} );
  * @endcode
  */
-struct input_indexalator : base_indexalator<input_indexalator> {
-  friend struct indexalator_factory;
-  friend struct base_indexalator<input_indexalator>;  // for CRTP
-
-  using reference = size_type const;  // this keeps STL and thrust happy
-
-  input_indexalator()                                    = default;
-  input_indexalator(input_indexalator const&)            = default;
-  input_indexalator(input_indexalator&&)                 = default;
-  input_indexalator& operator=(input_indexalator const&) = default;
-  input_indexalator& operator=(input_indexalator&&)      = default;
-
-  /**
-   * @brief Indirection operator returns the value at the current iterator position.
-   */
-  __device__ inline size_type operator*() const { return operator[](0); }
-
-  /**
-   * @brief Dispatch functor for resolving a size_type value from any index type.
-   */
-  struct index_as_size_type {
-    template <typename T, std::enable_if_t<is_index_type<T>()>* = nullptr>
-    __device__ size_type operator()(void const* tp)
-    {
-      return static_cast<size_type>(*static_cast<T const*>(tp));
-    }
-    template <typename T, std::enable_if_t<not is_index_type<T>()>* = nullptr>
-    __device__ size_type operator()(void const* tp)
-    {
-      CUDF_UNREACHABLE("only index types are supported");
-    }
-  };
-  /**
-   * @brief Array subscript operator returns a value at the input
-   * `idx` position as a `size_type` value.
-   */
-  __device__ inline size_type operator[](size_type idx) const
-  {
-    void const* tp = p_ + (idx * width_);
-    return type_dispatcher(dtype_, index_as_size_type{}, tp);
-  }
-
- protected:
-  /**
-   * @brief Create an input index normalizing iterator.
-   *
-   * Use the indexalator_factory to create an iterator instance.
-   *
-   * @param data      Pointer to an integer array in device memory.
-   * @param width     The width of the integer type (1, 2, 4, or 8)
-   * @param data_type Index integer type of width `width`
-   */
-  input_indexalator(void const* data, int width, data_type dtype)
-    : base_indexalator<input_indexalator>(width, dtype), p_{static_cast<char const*>(data)}
-  {
-  }
-
-  char const* p_;  /// pointer to the integer data in device memory
-};
+using input_indexalator = input_normalator<cudf::size_type>;
 
 /**
  * @brief The index normalizing output iterator.
@@ -328,79 +82,7 @@ struct input_indexalator : base_indexalator<input_indexalator> {
  *                      thrust::less<Element>());
  * @endcode
  */
-struct output_indexalator : base_indexalator<output_indexalator> {
-  friend struct indexalator_factory;
-  friend struct base_indexalator<output_indexalator>;  // for CRTP
-
-  using reference = output_indexalator const&;  // required for output iterators
-
-  output_indexalator()                                     = default;
-  output_indexalator(output_indexalator const&)            = default;
-  output_indexalator(output_indexalator&&)                 = default;
-  output_indexalator& operator=(output_indexalator const&) = default;
-  output_indexalator& operator=(output_indexalator&&)      = default;
-
-  /**
-   * @brief Indirection operator returns this iterator instance in order
-   * to capture the `operator=(size_type)` calls.
-   */
-  __device__ inline output_indexalator const& operator*() const { return *this; }
-
-  /**
-   * @brief Array subscript operator returns an iterator instance at the specified `idx` position.
-   *
-   * This allows capturing the subsequent `operator=(size_type)` call in this class.
-   */
-  __device__ inline output_indexalator const operator[](size_type idx) const
-  {
-    output_indexalator tmp{*this};
-    tmp.p_ += (idx * width_);
-    return tmp;
-  }
-
-  /**
-   * @brief Dispatch functor for setting the index value from a size_type value.
-   */
-  struct size_type_to_index {
-    template <typename T, std::enable_if_t<is_index_type<T>()>* = nullptr>
-    __device__ void operator()(void* tp, size_type const value)
-    {
-      (*static_cast<T*>(tp)) = static_cast<T>(value);
-    }
-    template <typename T, std::enable_if_t<not is_index_type<T>()>* = nullptr>
-    __device__ void operator()(void* tp, size_type const value)
-    {
-      CUDF_UNREACHABLE("only index types are supported");
-    }
-  };
-
-  /**
-   * @brief Assign a size_type value to the current iterator position.
-   */
-  __device__ inline output_indexalator const& operator=(size_type const value) const
-  {
-    void* tp = p_;
-    type_dispatcher(dtype_, size_type_to_index{}, tp, value);
-    return *this;
-  }
-
- protected:
-  /**
-   * @brief Create an output index normalizing iterator.
-   *
-   * Use the indexalator_factory to create an iterator instance.
-   *
-   * @param data      Pointer to an integer array in device memory.
-   * @param width     The width of the integer type (1, 2, 4, or 8)
-   * @param data_type Index integer type of width `width`
-   */
-  output_indexalator(void* data, int width, data_type dtype)
-    : base_indexalator<output_indexalator>(width, dtype), p_{static_cast<char*>(data)}
-  {
-  }
-
-  char* p_;  /// pointer to the integer data in device memory
-};
+using output_indexalator = output_normalator<cudf::size_type>;
 
 /**
  * @brief Use this class to create an indexalator instance.
@@ -413,7 +95,7 @@ struct indexalator_factory {
     template <typename IndexType, std::enable_if_t<is_index_type<IndexType>()>* = nullptr>
     input_indexalator operator()(column_view const& indices)
     {
-      return input_indexalator(indices.data<IndexType>(), sizeof(IndexType), indices.type());
+      return input_indexalator(indices.data<IndexType>(), indices.type());
     }
     template <typename IndexType,
               typename... Args,
@@ -433,7 +115,7 @@ struct indexalator_factory {
     {
       // note: using static_cast<scalar_type_t<IndexType> const&>(index) creates a copy
       auto const scalar_impl = static_cast<scalar_type_t<IndexType> const*>(&index);
-      return input_indexalator(scalar_impl->data(), sizeof(IndexType), index.type());
+      return input_indexalator(scalar_impl->data(), index.type());
     }
     template <typename IndexType,
               typename... Args,
@@ -451,7 +133,7 @@ struct indexalator_factory {
     template <typename IndexType, std::enable_if_t<is_index_type<IndexType>()>* = nullptr>
     output_indexalator operator()(mutable_column_view const& indices)
     {
-      return output_indexalator(indices.data<IndexType>(), sizeof(IndexType), indices.type());
+      return output_indexalator(indices.data<IndexType>(), indices.type());
     }
     template <typename IndexType,
               typename... Args,
diff --git a/cpp/include/cudf/detail/interop.hpp b/cpp/include/cudf/detail/interop.hpp
index 3d4832c8d17..44024333239 100644
--- a/cpp/include/cudf/detail/interop.hpp
+++ b/cpp/include/cudf/detail/interop.hpp
@@ -104,13 +104,67 @@ std::shared_ptr<arrow::Array> to_arrow_array(cudf::type_id id, Ts&&... args)
   }
 }
 
+/**
+ * @brief Invokes an `operator()` template with the type instantiation based on
+ * the specified `arrow::DataType`'s `id()`.
+ *
+ * This function is analogous to libcudf's type_dispatcher, but instead applies
+ * to Arrow functions. Its primary use case is to leverage Arrow's
+ * metaprogramming facilities like arrow::TypeTraits that require translating
+ * the runtime dtype information into compile-time types.
+ */
+template <typename Functor, typename... Ts>
+constexpr decltype(auto) arrow_type_dispatcher(arrow::DataType const& dtype,
+                                               Functor f,
+                                               Ts&&... args)
+{
+  switch (dtype.id()) {
+    case arrow::Type::INT8:
+      return f.template operator()<arrow::Int8Type>(std::forward<Ts>(args)...);
+    case arrow::Type::INT16:
+      return f.template operator()<arrow::Int16Type>(std::forward<Ts>(args)...);
+    case arrow::Type::INT32:
+      return f.template operator()<arrow::Int32Type>(std::forward<Ts>(args)...);
+    case arrow::Type::INT64:
+      return f.template operator()<arrow::Int64Type>(std::forward<Ts>(args)...);
+    case arrow::Type::UINT8:
+      return f.template operator()<arrow::UInt8Type>(std::forward<Ts>(args)...);
+    case arrow::Type::UINT16:
+      return f.template operator()<arrow::UInt16Type>(std::forward<Ts>(args)...);
+    case arrow::Type::UINT32:
+      return f.template operator()<arrow::UInt32Type>(std::forward<Ts>(args)...);
+    case arrow::Type::UINT64:
+      return f.template operator()<arrow::UInt64Type>(std::forward<Ts>(args)...);
+    case arrow::Type::FLOAT:
+      return f.template operator()<arrow::FloatType>(std::forward<Ts>(args)...);
+    case arrow::Type::DOUBLE:
+      return f.template operator()<arrow::DoubleType>(std::forward<Ts>(args)...);
+    case arrow::Type::BOOL:
+      return f.template operator()<arrow::BooleanType>(std::forward<Ts>(args)...);
+    case arrow::Type::TIMESTAMP:
+      return f.template operator()<arrow::TimestampType>(std::forward<Ts>(args)...);
+    case arrow::Type::DURATION:
+      return f.template operator()<arrow::DurationType>(std::forward<Ts>(args)...);
+    case arrow::Type::STRING:
+      return f.template operator()<arrow::StringType>(std::forward<Ts>(args)...);
+    case arrow::Type::LIST:
+      return f.template operator()<arrow::ListType>(std::forward<Ts>(args)...);
+    case arrow::Type::DECIMAL128:
+      return f.template operator()<arrow::Decimal128Type>(std::forward<Ts>(args)...);
+    case arrow::Type::STRUCT:
+      return f.template operator()<arrow::StructType>(std::forward<Ts>(args)...);
+    default: {
+      CUDF_FAIL("Invalid type.");
+    }
+  }
+}
+
 // Converting arrow type to cudf type
 data_type arrow_to_cudf_type(arrow::DataType const& arrow_type);
 
 /**
- * @copydoc cudf::to_arrow
- *
- * @param stream CUDA stream used for device memory operations and kernel launches.
+ * @copydoc cudf::to_arrow(table_view input, std::vector<column_metadata> const& metadata,
+ * rmm::cuda_stream_view stream, arrow::MemoryPool* ar_mr)
  */
 std::shared_ptr<arrow::Table> to_arrow(table_view input,
                                        std::vector<column_metadata> const& metadata,
@@ -118,13 +172,27 @@ std::shared_ptr<arrow::Table> to_arrow(table_view input,
                                        arrow::MemoryPool* ar_mr);
 
 /**
- * @copydoc cudf::arrow_to_cudf
- *
- * @param stream CUDA stream used for device memory operations and kernel launches.
+ * @copydoc cudf::to_arrow(cudf::scalar const& input, column_metadata const& metadata,
+ * rmm::cuda_stream_view stream, arrow::MemoryPool* ar_mr)
+ */
+std::shared_ptr<arrow::Scalar> to_arrow(cudf::scalar const& input,
+                                        column_metadata const& metadata,
+                                        rmm::cuda_stream_view stream,
+                                        arrow::MemoryPool* ar_mr);
+/**
+ * @copydoc cudf::from_arrow(arrow::Table const& input_table, rmm::cuda_stream_view stream,
+ * rmm::mr::device_memory_resource* mr)
  */
 std::unique_ptr<table> from_arrow(arrow::Table const& input_table,
                                   rmm::cuda_stream_view stream,
                                   rmm::mr::device_memory_resource* mr);
 
+/**
+ * @copydoc cudf::from_arrow(arrow::Scalar const& input, rmm::cuda_stream_view stream,
+ * rmm::mr::device_memory_resource* mr)
+ */
+std::unique_ptr<cudf::scalar> from_arrow(arrow::Scalar const& input,
+                                         rmm::cuda_stream_view stream,
+                                         rmm::mr::device_memory_resource* mr);
 }  // namespace detail
 }  // namespace cudf
diff --git a/cpp/include/cudf/detail/normalizing_iterator.cuh b/cpp/include/cudf/detail/normalizing_iterator.cuh
new file mode 100644
index 00000000000..51b3133f84f
--- /dev/null
+++ b/cpp/include/cudf/detail/normalizing_iterator.cuh
@@ -0,0 +1,367 @@
+/*
+ * Copyright (c) 2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <cudf/utilities/type_dispatcher.hpp>
+
+#include <type_traits>
+
+namespace cudf {
+namespace detail {
+
+/**
+ * @brief The base class for the input or output normalizing iterator
+ *
+ * The base class mainly manages updating the `p_` member variable while the
+ * subclasses handle accessing individual elements in device memory.
+ *
+ * @tparam Derived The derived class type for the iterator
+ * @tparam Integer The type the iterator normalizes to
+ */
+template <class Derived, typename Integer>
+struct base_normalator {
+  static_assert(std::is_integral_v<Integer>);
+  using difference_type   = std::ptrdiff_t;
+  using value_type        = Integer;
+  using pointer           = Integer*;
+  using iterator_category = std::random_access_iterator_tag;
+
+  base_normalator()                                  = default;
+  base_normalator(base_normalator const&)            = default;
+  base_normalator(base_normalator&&)                 = default;
+  base_normalator& operator=(base_normalator const&) = default;
+  base_normalator& operator=(base_normalator&&)      = default;
+
+  /**
+   * @brief Prefix increment operator.
+   */
+  CUDF_HOST_DEVICE inline Derived& operator++()
+  {
+    Derived& derived = static_cast<Derived&>(*this);
+    derived.p_ += width_;
+    return derived;
+  }
+
+  /**
+   * @brief Postfix increment operator.
+   */
+  CUDF_HOST_DEVICE inline Derived operator++(int)
+  {
+    Derived tmp{static_cast<Derived&>(*this)};
+    operator++();
+    return tmp;
+  }
+
+  /**
+   * @brief Prefix decrement operator.
+   */
+  CUDF_HOST_DEVICE inline Derived& operator--()
+  {
+    Derived& derived = static_cast<Derived&>(*this);
+    derived.p_ -= width_;
+    return derived;
+  }
+
+  /**
+   * @brief Postfix decrement operator.
+   */
+  CUDF_HOST_DEVICE inline Derived operator--(int)
+  {
+    Derived tmp{static_cast<Derived&>(*this)};
+    operator--();
+    return tmp;
+  }
+
+  /**
+   * @brief Compound assignment by sum operator.
+   */
+  CUDF_HOST_DEVICE inline Derived& operator+=(difference_type offset)
+  {
+    Derived& derived = static_cast<Derived&>(*this);
+    derived.p_ += offset * width_;
+    return derived;
+  }
+
+  /**
+   * @brief Increment by offset operator.
+   */
+  CUDF_HOST_DEVICE inline Derived operator+(difference_type offset) const
+  {
+    auto tmp = Derived{static_cast<Derived const&>(*this)};
+    tmp.p_ += (offset * width_);
+    return tmp;
+  }
+
+  /**
+   * @brief Addition assignment operator.
+   */
+  CUDF_HOST_DEVICE inline friend Derived operator+(difference_type offset, Derived const& rhs)
+  {
+    Derived tmp{rhs};
+    tmp.p_ += (offset * rhs.width_);
+    return tmp;
+  }
+
+  /**
+   * @brief Compound assignment by difference operator.
+   */
+  CUDF_HOST_DEVICE inline Derived& operator-=(difference_type offset)
+  {
+    Derived& derived = static_cast<Derived&>(*this);
+    derived.p_ -= offset * width_;
+    return derived;
+  }
+
+  /**
+   * @brief Decrement by offset operator.
+   */
+  CUDF_HOST_DEVICE inline Derived operator-(difference_type offset) const
+  {
+    auto tmp = Derived{static_cast<Derived const&>(*this)};
+    tmp.p_ -= (offset * width_);
+    return tmp;
+  }
+
+  /**
+   * @brief Subtraction assignment operator.
+   */
+  CUDF_HOST_DEVICE inline friend Derived operator-(difference_type offset, Derived const& rhs)
+  {
+    Derived tmp{rhs};
+    tmp.p_ -= (offset * rhs.width_);
+    return tmp;
+  }
+
+  /**
+   * @brief Compute offset from iterator difference operator.
+   */
+  CUDF_HOST_DEVICE inline difference_type operator-(Derived const& rhs) const
+  {
+    return (static_cast<Derived const&>(*this).p_ - rhs.p_) / width_;
+  }
+
+  /**
+   * @brief Equals to operator.
+   */
+  CUDF_HOST_DEVICE inline bool operator==(Derived const& rhs) const
+  {
+    return rhs.p_ == static_cast<Derived const&>(*this).p_;
+  }
+
+  /**
+   * @brief Not equals to operator.
+   */
+  CUDF_HOST_DEVICE inline bool operator!=(Derived const& rhs) const
+  {
+    return rhs.p_ != static_cast<Derived const&>(*this).p_;
+  }
+
+  /**
+   * @brief Less than operator.
+   */
+  CUDF_HOST_DEVICE inline bool operator<(Derived const& rhs) const
+  {
+    return static_cast<Derived const&>(*this).p_ < rhs.p_;
+  }
+
+  /**
+   * @brief Greater than operator.
+   */
+  CUDF_HOST_DEVICE inline bool operator>(Derived const& rhs) const
+  {
+    return static_cast<Derived const&>(*this).p_ > rhs.p_;
+  }
+
+  /**
+   * @brief Less than or equals to operator.
+   */
+  CUDF_HOST_DEVICE inline bool operator<=(Derived const& rhs) const
+  {
+    return static_cast<Derived const&>(*this).p_ <= rhs.p_;
+  }
+
+  /**
+   * @brief Greater than or equals to operator.
+   */
+  CUDF_HOST_DEVICE inline bool operator>=(Derived const& rhs) const
+  {
+    return static_cast<Derived const&>(*this).p_ >= rhs.p_;
+  }
+
+ protected:
+  /**
+   * @brief Constructor assigns width and type member variables for base class.
+   */
+  explicit base_normalator(data_type dtype) : width_(size_of(dtype)), dtype_(dtype) {}
+
+  int width_;        /// integer type width = 1,2,4, or 8
+  data_type dtype_;  /// for type-dispatcher calls
+};
+
+/**
+ * @brief The integer normalizing input iterator
+ *
+ * This is an iterator that can be used for index types (integers) without
+ * requiring a type-specific instance. It can be used for any iterator
+ * interface for reading an array of integer values of type
+ * int8, int16, int32, int64, uint8, uint16, uint32, or uint64.
+ * Reading specific elements always return a type of `Integer`
+ *
+ * @tparam Integer Type returned by all read functions
+ */
+template <typename Integer>
+struct input_normalator : base_normalator<input_normalator<Integer>, Integer> {
+  friend struct base_normalator<input_normalator<Integer>, Integer>;  // for CRTP
+
+  using reference = Integer const;  // this keeps STL and thrust happy
+
+  input_normalator()                                   = default;
+  input_normalator(input_normalator const&)            = default;
+  input_normalator(input_normalator&&)                 = default;
+  input_normalator& operator=(input_normalator const&) = default;
+  input_normalator& operator=(input_normalator&&)      = default;
+
+  /**
+   * @brief Indirection operator returns the value at the current iterator position
+   */
+  __device__ inline Integer operator*() const { return operator[](0); }
+
+  /**
+   * @brief Dispatch functor for resolving a Integer value from any integer type
+   */
+  struct normalize_type {
+    template <typename T, std::enable_if_t<cuda::std::is_integral_v<T>>* = nullptr>
+    __device__ Integer operator()(void const* tp)
+    {
+      return static_cast<Integer>(*static_cast<T const*>(tp));
+    }
+    template <typename T, std::enable_if_t<not cuda::std::is_integral_v<T>>* = nullptr>
+    __device__ Integer operator()(void const*)
+    {
+      CUDF_UNREACHABLE("only integral types are supported");
+    }
+  };
+
+  /**
+   * @brief Array subscript operator returns a value at the input
+   * `idx` position as a `Integer` value.
+   */
+  __device__ inline Integer operator[](size_type idx) const
+  {
+    void const* tp = p_ + (idx * this->width_);
+    return type_dispatcher(this->dtype_, normalize_type{}, tp);
+  }
+
+  /**
+   * @brief Create an input index normalizing iterator.
+   *
+   * Use the indexalator_factory to create an iterator instance.
+   *
+   * @param data      Pointer to an integer array in device memory.
+   * @param data_type Type of data in data
+   */
+  input_normalator(void const* data, data_type dtype)
+    : base_normalator<input_normalator<Integer>, Integer>(dtype), p_{static_cast<char const*>(data)}
+  {
+  }
+
+  char const* p_;  /// pointer to the integer data in device memory
+};
+
+/**
+ * @brief The integer normalizing output iterator
+ *
+ * This is an iterator that can be used for index types (integers) without
+ * requiring a type-specific instance. It can be used for any iterator
+ * interface for writing an array of integer values of type
+ * int8, int16, int32, int64, uint8, uint16, uint32, or uint64.
+ * Setting specific elements always accept the `Integer` type values.
+ *
+ * @tparam Integer The type used for all write functions
+ */
+template <typename Integer>
+struct output_normalator : base_normalator<output_normalator<Integer>, Integer> {
+  friend struct base_normalator<output_normalator<Integer>, Integer>;  // for CRTP
+
+  using reference = output_normalator const&;  // required for output iterators
+
+  output_normalator()                                    = default;
+  output_normalator(output_normalator const&)            = default;
+  output_normalator(output_normalator&&)                 = default;
+  output_normalator& operator=(output_normalator const&) = default;
+  output_normalator& operator=(output_normalator&&)      = default;
+
+  /**
+   * @brief Indirection operator returns this iterator instance in order
+   * to capture the `operator=(Integer)` calls.
+   */
+  __device__ inline output_normalator const& operator*() const { return *this; }
+
+  /**
+   * @brief Array subscript operator returns an iterator instance at the specified `idx` position.
+   *
+   * This allows capturing the subsequent `operator=(Integer)` call in this class.
+   */
+  __device__ inline output_normalator const operator[](size_type idx) const
+  {
+    output_normalator tmp{*this};
+    tmp.p_ += (idx * this->width_);
+    return tmp;
+  }
+
+  /**
+   * @brief Dispatch functor for setting the index value from a size_type value.
+   */
+  struct normalize_type {
+    template <typename T, std::enable_if_t<std::is_integral_v<T>>* = nullptr>
+    __device__ void operator()(void* tp, Integer const value)
+    {
+      (*static_cast<T*>(tp)) = static_cast<T>(value);
+    }
+    template <typename T, std::enable_if_t<not std::is_integral_v<T>>* = nullptr>
+    __device__ void operator()(void*, Integer const)
+    {
+      CUDF_UNREACHABLE("only index types are supported");
+    }
+  };
+
+  /**
+   * @brief Assign an Integer value to the current iterator position
+   */
+  __device__ inline output_normalator const& operator=(Integer const value) const
+  {
+    void* tp = p_;
+    type_dispatcher(this->dtype_, normalize_type{}, tp, value);
+    return *this;
+  }
+
+  /**
+   * @brief Create an output normalizing iterator
+   *
+   * @param data      Pointer to an integer array in device memory.
+   * @param data_type Type of data in data
+   */
+  output_normalator(void* data, data_type dtype)
+    : base_normalator<output_normalator<Integer>, Integer>(dtype), p_{static_cast<char*>(data)}
+  {
+  }
+
+  char* p_;  /// pointer to the integer data in device memory
+};
+
+}  // namespace detail
+}  // namespace cudf
diff --git a/cpp/include/cudf/detail/search.hpp b/cpp/include/cudf/detail/search.hpp
index 4c4ad7834f4..4277baf3edd 100644
--- a/cpp/include/cudf/detail/search.hpp
+++ b/cpp/include/cudf/detail/search.hpp
@@ -81,6 +81,8 @@ std::unique_ptr<column> contains(column_view const& haystack,
  * output   = { false, true, true }
  * @endcode
  *
+ * @throws cudf::logic_error If column types of haystack and needles don't match
+ *
  * @param haystack The table containing the search space
  * @param needles A table of rows whose existence to check in the search space
  * @param compare_nulls Control whether nulls should be compared as equal or not
diff --git a/cpp/include/cudf/dictionary/encode.hpp b/cpp/include/cudf/dictionary/encode.hpp
index fb13eabe11a..959b785bf87 100644
--- a/cpp/include/cudf/dictionary/encode.hpp
+++ b/cpp/include/cudf/dictionary/encode.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -53,12 +53,14 @@ namespace dictionary {
  *
  * @param column The column to dictionary encode
  * @param indices_type The integer type to use for the indices
+ * @param stream CUDA stream used for device memory operations and kernel launches
  * @param mr Device memory resource used to allocate the returned column's device memory
  * @return Returns a dictionary column
  */
 std::unique_ptr<column> encode(
   column_view const& column,
   data_type indices_type              = data_type{type_id::UINT32},
+  rmm::cuda_stream_view stream        = cudf::get_default_stream(),
   rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
 
 /**
@@ -72,11 +74,13 @@ std::unique_ptr<column> encode(
  * @endcode
  *
  * @param dictionary_column Existing dictionary column
+ * @param stream CUDA stream used for device memory operations and kernel launches
  * @param mr Device memory resource used to allocate the returned column's device memory
  * @return New column with type matching the dictionary_column's keys
  */
 std::unique_ptr<column> decode(
   dictionary_column_view const& dictionary_column,
+  rmm::cuda_stream_view stream        = cudf::get_default_stream(),
   rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
 
 /** @} */  // end of group
diff --git a/cpp/include/cudf/dictionary/search.hpp b/cpp/include/cudf/dictionary/search.hpp
index ed7a9c84693..1b72cf42acd 100644
--- a/cpp/include/cudf/dictionary/search.hpp
+++ b/cpp/include/cudf/dictionary/search.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -37,12 +37,14 @@ namespace dictionary {
  *
  * @param dictionary The dictionary to search for the key.
  * @param key The value to search for in the dictionary keyset.
+ * @param stream CUDA stream used for device memory operations and kernel launches.
  * @param mr Device memory resource used to allocate the returned scalar's device memory.
- * @return Numeric scalar index value of the key within the dictionary
+ * @return Numeric scalar index value of the key within the dictionary.
  */
 std::unique_ptr<scalar> get_index(
   dictionary_column_view const& dictionary,
   scalar const& key,
+  rmm::cuda_stream_view stream        = cudf::get_default_stream(),
   rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
 
 /** @} */  // end of group
diff --git a/cpp/include/cudf/dictionary/update_keys.hpp b/cpp/include/cudf/dictionary/update_keys.hpp
index 2fcfb5e1f7c..81728e1ff73 100644
--- a/cpp/include/cudf/dictionary/update_keys.hpp
+++ b/cpp/include/cudf/dictionary/update_keys.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -51,13 +51,15 @@ namespace dictionary {
  * @throw cudf_logic_error if the new_keys contain nulls.
  *
  * @param dictionary_column Existing dictionary column.
- * @param new_keys New keys to incorporate into the dictionary_column
+ * @param new_keys New keys to incorporate into the dictionary_column.
+ * @param stream CUDA stream used for device memory operations and kernel launches.
  * @param mr Device memory resource used to allocate the returned column's device memory.
  * @return New dictionary column.
  */
 std::unique_ptr<column> add_keys(
   dictionary_column_view const& dictionary_column,
   column_view const& new_keys,
+  rmm::cuda_stream_view stream        = cudf::get_default_stream(),
   rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
 
 /**
@@ -81,13 +83,15 @@ std::unique_ptr<column> add_keys(
  * @throw cudf_logic_error if the keys_to_remove contain nulls.
  *
  * @param dictionary_column Existing dictionary column.
- * @param keys_to_remove The keys to remove from the dictionary_column
+ * @param keys_to_remove The keys to remove from the dictionary_column.
+ * @param stream CUDA stream used for device memory operations and kernel launches.
  * @param mr Device memory resource used to allocate the returned column's device memory.
  * @return New dictionary column.
  */
 std::unique_ptr<column> remove_keys(
   dictionary_column_view const& dictionary_column,
   column_view const& keys_to_remove,
+  rmm::cuda_stream_view stream        = cudf::get_default_stream(),
   rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
 
 /**
@@ -103,11 +107,13 @@ std::unique_ptr<column> remove_keys(
  * @endcode
  *
  * @param dictionary_column Existing dictionary column.
+ * @param stream CUDA stream used for device memory operations and kernel launches.
  * @param mr Device memory resource used to allocate the returned column's device memory.
  * @return New dictionary column.
  */
 std::unique_ptr<column> remove_unused_keys(
   dictionary_column_view const& dictionary_column,
+  rmm::cuda_stream_view stream        = cudf::get_default_stream(),
   rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
 
 /**
@@ -134,12 +140,14 @@ std::unique_ptr<column> remove_unused_keys(
  *
  * @param dictionary_column Existing dictionary column.
  * @param keys New keys to use for the output column. Must not contain nulls.
+ * @param stream CUDA stream used for device memory operations and kernel launches.
  * @param mr Device memory resource used to allocate the returned column's device memory.
  * @return New dictionary column.
  */
 std::unique_ptr<column> set_keys(
   dictionary_column_view const& dictionary_column,
   column_view const& keys,
+  rmm::cuda_stream_view stream        = cudf::get_default_stream(),
   rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
 
 /**
@@ -149,11 +157,13 @@ std::unique_ptr<column> set_keys(
  * The result is a vector of new dictionaries with a common set of keys.
  *
  * @param input Dictionary columns to match keys.
+ * @param stream CUDA stream used for device memory operations and kernel launches.
  * @param mr Device memory resource used to allocate the returned column's device memory.
  * @return New dictionary columns.
  */
 std::vector<std::unique_ptr<column>> match_dictionaries(
   cudf::host_span<dictionary_column_view const> input,
+  rmm::cuda_stream_view stream        = cudf::get_default_stream(),
   rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
 
 /** @} */  // end of group
diff --git a/cpp/src/hash/hash_allocator.cuh b/cpp/include/cudf/hashing/detail/hash_allocator.cuh
similarity index 100%
rename from cpp/src/hash/hash_allocator.cuh
rename to cpp/include/cudf/hashing/detail/hash_allocator.cuh
diff --git a/cpp/src/hash/helper_functions.cuh b/cpp/include/cudf/hashing/detail/helper_functions.cuh
similarity index 100%
rename from cpp/src/hash/helper_functions.cuh
rename to cpp/include/cudf/hashing/detail/helper_functions.cuh
diff --git a/cpp/include/cudf/interop.hpp b/cpp/include/cudf/interop.hpp
index e210179b147..865cc004107 100644
--- a/cpp/include/cudf/interop.hpp
+++ b/cpp/include/cudf/interop.hpp
@@ -126,23 +126,56 @@ struct column_metadata {
  *
  * @param input table_view that needs to be converted to arrow Table
  * @param metadata Contains hierarchy of names of columns and children
+ * @param stream CUDA stream used for device memory operations and kernel launches
  * @param ar_mr arrow memory pool to allocate memory for arrow Table
  * @return arrow Table generated from `input`
  */
 std::shared_ptr<arrow::Table> to_arrow(table_view input,
                                        std::vector<column_metadata> const& metadata = {},
-                                       arrow::MemoryPool* ar_mr = arrow::default_memory_pool());
+                                       rmm::cuda_stream_view stream = cudf::get_default_stream(),
+                                       arrow::MemoryPool* ar_mr     = arrow::default_memory_pool());
 
+/**
+ * @brief Create `arrow::Scalar` from cudf scalar `input`
+ *
+ * Converts the `cudf::scalar` to `arrow::Scalar`.
+ *
+ * @param input scalar that needs to be converted to arrow Scalar
+ * @param metadata Contains hierarchy of names of columns and children
+ * @param stream CUDA stream used for device memory operations and kernel launches
+ * @param ar_mr arrow memory pool to allocate memory for arrow Scalar
+ * @return arrow Scalar generated from `input`
+ */
+std::shared_ptr<arrow::Scalar> to_arrow(cudf::scalar const& input,
+                                        column_metadata const& metadata = {},
+                                        rmm::cuda_stream_view stream = cudf::get_default_stream(),
+                                        arrow::MemoryPool* ar_mr = arrow::default_memory_pool());
 /**
  * @brief Create `cudf::table` from given arrow Table input
  *
  * @param input arrow:Table that needs to be converted to `cudf::table`
+ * @param stream CUDA stream used for device memory operations and kernel launches
  * @param mr    Device memory resource used to allocate `cudf::table`
  * @return cudf table generated from given arrow Table
  */
 
 std::unique_ptr<table> from_arrow(
   arrow::Table const& input,
+  rmm::cuda_stream_view stream        = cudf::get_default_stream(),
+  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+
+/**
+ * @brief Create `cudf::scalar` from given arrow Scalar input
+ *
+ * @param input `arrow::Scalar` that needs to be converted to `cudf::scalar`
+ * @param stream CUDA stream used for device memory operations and kernel launches
+ * @param mr    Device memory resource used to allocate `cudf::scalar`
+ * @return cudf scalar generated from given arrow Scalar
+ */
+
+std::unique_ptr<cudf::scalar> from_arrow(
+  arrow::Scalar const& input,
+  rmm::cuda_stream_view stream        = cudf::get_default_stream(),
   rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
 
 /** @} */  // end of group
diff --git a/cpp/include/cudf/sorting.hpp b/cpp/include/cudf/sorting.hpp
index 6924e77ae9b..e4e803b2d3c 100644
--- a/cpp/include/cudf/sorting.hpp
+++ b/cpp/include/cudf/sorting.hpp
@@ -18,6 +18,7 @@
 
 #include <cudf/aggregation.hpp>
 #include <cudf/types.hpp>
+#include <cudf/utilities/default_stream.hpp>
 
 #include <rmm/mr/device/per_device_resource.hpp>
 
@@ -43,6 +44,7 @@ namespace cudf {
  * @param null_precedence The desired order of null compared to other elements
  * for each column. Size must be equal to `input.num_columns()` or empty.
  * If empty, all columns will be sorted in `null_order::BEFORE`.
+ * @param stream CUDA stream used for device memory operations and kernel launches
  * @param mr Device memory resource used to allocate the returned column's device memory
  * @return A non-nullable column of elements containing the permuted row indices of
  * `input` if it were sorted
@@ -51,6 +53,7 @@ std::unique_ptr<column> sorted_order(
   table_view const& input,
   std::vector<order> const& column_order         = {},
   std::vector<null_order> const& null_precedence = {},
+  rmm::cuda_stream_view stream                   = cudf::get_default_stream(),
   rmm::mr::device_memory_resource* mr            = rmm::mr::get_current_device_resource());
 
 /**
@@ -65,27 +68,30 @@ std::unique_ptr<column> stable_sorted_order(
   table_view const& input,
   std::vector<order> const& column_order         = {},
   std::vector<null_order> const& null_precedence = {},
+  rmm::cuda_stream_view stream                   = cudf::get_default_stream(),
   rmm::mr::device_memory_resource* mr            = rmm::mr::get_current_device_resource());
 
 /**
  * @brief Checks whether the rows of a `table` are sorted in a lexicographical
  *        order.
  *
- * @param[in] table             Table whose rows need to be compared for ordering
- * @param[in] column_order      The expected sort order for each column. Size
- *                              must be equal to `in.num_columns()` or empty. If
- *                              empty, it is expected all columns are in
- *                              ascending order.
- * @param[in] null_precedence   The desired order of null compared to other
- *                              elements for each column. Size must be equal to
- *                              `input.num_columns()` or empty. If empty,
- *                              `null_order::BEFORE` is assumed for all columns.
- *
- * @returns bool                true if sorted as expected, false if not
+ * @param table             Table whose rows need to be compared for ordering
+ * @param column_order      The expected sort order for each column. Size
+ *                          must be equal to `in.num_columns()` or empty. If
+ *                          empty, it is expected all columns are in
+ *                          ascending order.
+ * @param null_precedence   The desired order of null compared to other
+ *                          elements for each column. Size must be equal to
+ *                          `input.num_columns()` or empty. If empty,
+ *                          `null_order::BEFORE` is assumed for all columns.
+ *
+ * @param stream            CUDA stream used for device memory operations and kernel launches
+ * @returns                 true if sorted as expected, false if not
  */
 bool is_sorted(cudf::table_view const& table,
                std::vector<order> const& column_order,
-               std::vector<null_order> const& null_precedence);
+               std::vector<null_order> const& null_precedence,
+               rmm::cuda_stream_view stream = cudf::get_default_stream());
 
 /**
  * @brief Performs a lexicographic sort of the rows of a table
@@ -98,6 +104,7 @@ bool is_sorted(cudf::table_view const& table,
  * elements for each column in `input`. Size must be equal to
  * `input.num_columns()` or empty. If empty, all columns will be sorted with
  * `null_order::BEFORE`.
+ * @param stream CUDA stream used for device memory operations and kernel launches
  * @param mr Device memory resource used to allocate the returned table's device memory
  * @return New table containing the desired sorted order of `input`
  */
@@ -105,6 +112,7 @@ std::unique_ptr<table> sort(
   table_view const& input,
   std::vector<order> const& column_order         = {},
   std::vector<null_order> const& null_precedence = {},
+  rmm::cuda_stream_view stream                   = cudf::get_default_stream(),
   rmm::mr::device_memory_resource* mr            = rmm::mr::get_current_device_resource());
 
 /**
@@ -124,6 +132,7 @@ std::unique_ptr<table> sort(
  * elements for each column in `keys`. Size must be equal to
  * `keys.num_columns()` or empty. If empty, all columns will be sorted with
  * `null_order::BEFORE`.
+ * @param stream CUDA stream used for device memory operations and kernel launches
  * @param mr Device memory resource used to allocate the returned table's device memory
  * @return The reordering of `values` determined by the lexicographic order of
  * the rows of `keys`.
@@ -133,6 +142,7 @@ std::unique_ptr<table> sort_by_key(
   table_view const& keys,
   std::vector<order> const& column_order         = {},
   std::vector<null_order> const& null_precedence = {},
+  rmm::cuda_stream_view stream                   = cudf::get_default_stream(),
   rmm::mr::device_memory_resource* mr            = rmm::mr::get_current_device_resource());
 
 /**
@@ -154,6 +164,7 @@ std::unique_ptr<table> sort_by_key(
  * elements for each column in `keys`. Size must be equal to
  * `keys.num_columns()` or empty. If empty, all columns will be sorted with
  * `null_order::BEFORE`.
+ * @param stream CUDA stream used for device memory operations and kernel launches
  * @param mr Device memory resource used to allocate the returned table's device memory
  * @return The reordering of `values` determined by the lexicographic order of
  * the rows of `keys`.
@@ -163,6 +174,7 @@ std::unique_ptr<table> stable_sort_by_key(
   table_view const& keys,
   std::vector<order> const& column_order         = {},
   std::vector<null_order> const& null_precedence = {},
+  rmm::cuda_stream_view stream                   = cudf::get_default_stream(),
   rmm::mr::device_memory_resource* mr            = rmm::mr::get_current_device_resource());
 
 /**
@@ -189,6 +201,7 @@ std::unique_ptr<table> stable_sort_by_key(
  * @param null_precedence The desired order of null compared to other elements
  * for column
  * @param percentage flag to convert ranks to percentage in range (0,1]
+ * @param stream CUDA stream used for device memory operations and kernel launches
  * @param mr Device memory resource used to allocate the returned column's device memory
  * @return A column of containing the rank of the each element of the column of `input`. The output
  * column type will be `size_type`column by default or else `double` when
@@ -201,6 +214,7 @@ std::unique_ptr<column> rank(
   null_policy null_handling,
   null_order null_precedence,
   bool percentage,
+  rmm::cuda_stream_view stream        = cudf::get_default_stream(),
   rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
 
 /**
@@ -241,6 +255,7 @@ std::unique_ptr<column> rank(
  * elements for each column in `keys`. Size must be equal to
  * `keys.num_columns()` or empty. If empty, all columns will be sorted with
  * `null_order::BEFORE`.
+ * @param stream CUDA stream used for device memory operations and kernel launches
  * @param mr Device memory resource to allocate any returned objects
  * @return sorted order of the segment sorted table
  *
@@ -250,6 +265,7 @@ std::unique_ptr<column> segmented_sorted_order(
   column_view const& segment_offsets,
   std::vector<order> const& column_order         = {},
   std::vector<null_order> const& null_precedence = {},
+  rmm::cuda_stream_view stream                   = cudf::get_default_stream(),
   rmm::mr::device_memory_resource* mr            = rmm::mr::get_current_device_resource());
 
 /**
@@ -262,6 +278,7 @@ std::unique_ptr<column> stable_segmented_sorted_order(
   column_view const& segment_offsets,
   std::vector<order> const& column_order         = {},
   std::vector<null_order> const& null_precedence = {},
+  rmm::cuda_stream_view stream                   = cudf::get_default_stream(),
   rmm::mr::device_memory_resource* mr            = rmm::mr::get_current_device_resource());
 
 /**
@@ -306,6 +323,7 @@ std::unique_ptr<column> stable_segmented_sorted_order(
  * elements for each column in `keys`. Size must be equal to
  * `keys.num_columns()` or empty. If empty, all columns will be sorted with
  * `null_order::BEFORE`.
+ * @param stream CUDA stream used for device memory operations and kernel launches
  * @param mr Device memory resource to allocate any returned objects
  * @return table with elements in each segment sorted
  *
@@ -316,6 +334,7 @@ std::unique_ptr<table> segmented_sort_by_key(
   column_view const& segment_offsets,
   std::vector<order> const& column_order         = {},
   std::vector<null_order> const& null_precedence = {},
+  rmm::cuda_stream_view stream                   = cudf::get_default_stream(),
   rmm::mr::device_memory_resource* mr            = rmm::mr::get_current_device_resource());
 
 /**
@@ -329,6 +348,7 @@ std::unique_ptr<table> stable_segmented_sort_by_key(
   column_view const& segment_offsets,
   std::vector<order> const& column_order         = {},
   std::vector<null_order> const& null_precedence = {},
+  rmm::cuda_stream_view stream                   = cudf::get_default_stream(),
   rmm::mr::device_memory_resource* mr            = rmm::mr::get_current_device_resource());
 
 /** @} */  // end of group
diff --git a/cpp/include/cudf_test/column_wrapper.hpp b/cpp/include/cudf_test/column_wrapper.hpp
index cc8cac35ef4..c0932b81dc3 100644
--- a/cpp/include/cudf_test/column_wrapper.hpp
+++ b/cpp/include/cudf_test/column_wrapper.hpp
@@ -944,8 +944,10 @@ class dictionary_column_wrapper : public detail::column_wrapper {
   template <typename InputIterator>
   dictionary_column_wrapper(InputIterator begin, InputIterator end) : column_wrapper{}
   {
-    wrapped = cudf::dictionary::encode(
-      fixed_width_column_wrapper<KeyElementTo, SourceElementT>(begin, end));
+    wrapped =
+      cudf::dictionary::encode(fixed_width_column_wrapper<KeyElementTo, SourceElementT>(begin, end),
+                               cudf::data_type{type_id::UINT32},
+                               cudf::test::get_default_stream());
   }
 
   /**
@@ -978,7 +980,9 @@ class dictionary_column_wrapper : public detail::column_wrapper {
     : column_wrapper{}
   {
     wrapped = cudf::dictionary::encode(
-      fixed_width_column_wrapper<KeyElementTo, SourceElementT>(begin, end, v));
+      fixed_width_column_wrapper<KeyElementTo, SourceElementT>(begin, end, v),
+      cudf::data_type{type_id::UINT32},
+      cudf::test::get_default_stream());
   }
 
   /**
@@ -1134,7 +1138,9 @@ class dictionary_column_wrapper<std::string> : public detail::column_wrapper {
   template <typename StringsIterator>
   dictionary_column_wrapper(StringsIterator begin, StringsIterator end) : column_wrapper{}
   {
-    wrapped = cudf::dictionary::encode(strings_column_wrapper(begin, end));
+    wrapped = cudf::dictionary::encode(strings_column_wrapper(begin, end),
+                                       cudf::data_type{type_id::UINT32},
+                                       cudf::test::get_default_stream());
   }
 
   /**
@@ -1169,7 +1175,9 @@ class dictionary_column_wrapper<std::string> : public detail::column_wrapper {
   dictionary_column_wrapper(StringsIterator begin, StringsIterator end, ValidityIterator v)
     : column_wrapper{}
   {
-    wrapped = cudf::dictionary::encode(strings_column_wrapper(begin, end, v));
+    wrapped = cudf::dictionary::encode(strings_column_wrapper(begin, end, v),
+                                       cudf::data_type{type_id::UINT32},
+                                       cudf::test::get_default_stream());
   }
 
   /**
diff --git a/cpp/include/nvtext/generate_ngrams.hpp b/cpp/include/nvtext/generate_ngrams.hpp
index 5d66401df9d..46f2c0e7bc9 100644
--- a/cpp/include/nvtext/generate_ngrams.hpp
+++ b/cpp/include/nvtext/generate_ngrams.hpp
@@ -47,19 +47,19 @@ namespace nvtext {
  * @throw cudf::logic_error if `separator` is invalid
  * @throw cudf::logic_error if there are not enough strings to generate any ngrams
  *
- * @param strings Strings column to tokenize and produce ngrams from.
- * @param ngrams The ngram number to generate.
- *               Default is 2 = bigram.
- * @param separator The string to use for separating ngram tokens.
- *                  Default is "_" character.
- * @param mr Device memory resource used to allocate the returned column's device memory.
- * @return New strings columns of tokens.
+ * @param input Strings column to tokenize and produce ngrams from
+ * @param ngrams The ngram number to generate
+ * @param separator The string to use for separating ngram tokens
+ * @param stream CUDA stream used for device memory operations and kernel launches
+ * @param mr Device memory resource used to allocate the returned column's device memory
+ * @return New strings columns of tokens
  */
 std::unique_ptr<cudf::column> generate_ngrams(
-  cudf::strings_column_view const& strings,
-  cudf::size_type ngrams               = 2,
-  cudf::string_scalar const& separator = cudf::string_scalar{"_"},
-  rmm::mr::device_memory_resource* mr  = rmm::mr::get_current_device_resource());
+  cudf::strings_column_view const& input,
+  cudf::size_type ngrams,
+  cudf::string_scalar const& separator,
+  rmm::cuda_stream_view stream        = cudf::get_default_stream(),
+  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
 
 /**
  * @brief Generates ngrams of characters within each string.
@@ -79,15 +79,17 @@ std::unique_ptr<cudf::column> generate_ngrams(
  * @throw cudf::logic_error if `ngrams < 2`
  * @throw cudf::logic_error if there are not enough characters to generate any ngrams
  *
- * @param strings Strings column to produce ngrams from.
+ * @param input Strings column to produce ngrams from
  * @param ngrams The ngram number to generate.
  *               Default is 2 = bigram.
- * @param mr Device memory resource used to allocate the returned column's device memory.
- * @return New strings columns of tokens.
+ * @param stream CUDA stream used for device memory operations and kernel launches
+ * @param mr Device memory resource used to allocate the returned column's device memory
+ * @return New strings columns of tokens
  */
 std::unique_ptr<cudf::column> generate_character_ngrams(
-  cudf::strings_column_view const& strings,
+  cudf::strings_column_view const& input,
   cudf::size_type ngrams              = 2,
+  rmm::cuda_stream_view stream        = cudf::get_default_stream(),
   rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
 
 /**
@@ -113,14 +115,16 @@ std::unique_ptr<cudf::column> generate_character_ngrams(
  * @throw cudf::logic_error if `ngrams < 2`
  * @throw cudf::logic_error if there are not enough characters to generate any ngrams
  *
- * @param strings Strings column to produce ngrams from.
+ * @param input Strings column to produce ngrams from
  * @param ngrams The ngram number to generate. Default is 5.
+ * @param stream CUDA stream used for device memory operations and kernel launches
  * @param mr Device memory resource used to allocate the returned column's device memory.
  * @return A lists column of hash values
  */
 std::unique_ptr<cudf::column> hash_character_ngrams(
-  cudf::strings_column_view const& strings,
+  cudf::strings_column_view const& input,
   cudf::size_type ngrams              = 5,
+  rmm::cuda_stream_view stream        = cudf::get_default_stream(),
   rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
 
 /** @} */  // end of group
diff --git a/cpp/include/nvtext/ngrams_tokenize.hpp b/cpp/include/nvtext/ngrams_tokenize.hpp
index 17f20f7ea4c..9d76ef8689f 100644
--- a/cpp/include/nvtext/ngrams_tokenize.hpp
+++ b/cpp/include/nvtext/ngrams_tokenize.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -66,22 +66,22 @@ namespace nvtext {
  *
  * All null row entries are ignored and the output contains all valid rows.
  *
- * @param strings Strings column to tokenize and produce ngrams from.
- * @param ngrams The ngram number to generate.
- *               Default is 2 = bigram.
+ * @param input Strings column to tokenize and produce ngrams from
+ * @param ngrams The ngram number to generate
  * @param delimiter UTF-8 characters used to separate each string into tokens.
- *                  The default of empty string will separate tokens using whitespace.
- * @param separator The string to use for separating ngram tokens.
- *                  Default is "_" character.
- * @param mr Device memory resource used to allocate the returned column's device memory.
- * @return New strings columns of tokens.
+ *                  An empty string will separate tokens using whitespace.
+ * @param separator The string to use for separating ngram tokens
+ * @param stream CUDA stream used for device memory operations and kernel launches
+ * @param mr Device memory resource used to allocate the returned column's device memory
+ * @return New strings columns of tokens
  */
 std::unique_ptr<cudf::column> ngrams_tokenize(
-  cudf::strings_column_view const& strings,
-  cudf::size_type ngrams               = 2,
-  cudf::string_scalar const& delimiter = cudf::string_scalar{""},
-  cudf::string_scalar const& separator = cudf::string_scalar{"_"},
-  rmm::mr::device_memory_resource* mr  = rmm::mr::get_current_device_resource());
+  cudf::strings_column_view const& input,
+  cudf::size_type ngrams,
+  cudf::string_scalar const& delimiter,
+  cudf::string_scalar const& separator,
+  rmm::cuda_stream_view stream        = cudf::get_default_stream(),
+  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
 
 /** @} */  // end of group
 }  // namespace nvtext
diff --git a/cpp/src/dictionary/add_keys.cu b/cpp/src/dictionary/add_keys.cu
index ab22c07e4d5..3973100aced 100644
--- a/cpp/src/dictionary/add_keys.cu
+++ b/cpp/src/dictionary/add_keys.cu
@@ -130,10 +130,11 @@ std::unique_ptr<column> add_keys(dictionary_column_view const& dictionary_column
 
 std::unique_ptr<column> add_keys(dictionary_column_view const& dictionary_column,
                                  column_view const& keys,
+                                 rmm::cuda_stream_view stream,
                                  rmm::mr::device_memory_resource* mr)
 {
   CUDF_FUNC_RANGE();
-  return detail::add_keys(dictionary_column, keys, cudf::get_default_stream(), mr);
+  return detail::add_keys(dictionary_column, keys, stream, mr);
 }
 
 }  // namespace dictionary
diff --git a/cpp/src/dictionary/decode.cu b/cpp/src/dictionary/decode.cu
index 01411d06b62..fdf546b5875 100644
--- a/cpp/src/dictionary/decode.cu
+++ b/cpp/src/dictionary/decode.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -65,10 +65,11 @@ std::unique_ptr<column> decode(dictionary_column_view const& source,
 }  // namespace detail
 
 std::unique_ptr<column> decode(dictionary_column_view const& source,
+                               rmm::cuda_stream_view stream,
                                rmm::mr::device_memory_resource* mr)
 {
   CUDF_FUNC_RANGE();
-  return detail::decode(source, cudf::get_default_stream(), mr);
+  return detail::decode(source, stream, mr);
 }
 
 }  // namespace dictionary
diff --git a/cpp/src/dictionary/encode.cu b/cpp/src/dictionary/encode.cu
index fe8e777b694..c92b57f0cac 100644
--- a/cpp/src/dictionary/encode.cu
+++ b/cpp/src/dictionary/encode.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -89,10 +89,11 @@ data_type get_indices_type_for_size(size_type keys_size)
 
 std::unique_ptr<column> encode(column_view const& input_column,
                                data_type indices_type,
+                               rmm::cuda_stream_view stream,
                                rmm::mr::device_memory_resource* mr)
 {
   CUDF_FUNC_RANGE();
-  return detail::encode(input_column, indices_type, cudf::get_default_stream(), mr);
+  return detail::encode(input_column, indices_type, stream, mr);
 }
 
 }  // namespace dictionary
diff --git a/cpp/src/dictionary/remove_keys.cu b/cpp/src/dictionary/remove_keys.cu
index 9fe4a63373b..86b70f1119b 100644
--- a/cpp/src/dictionary/remove_keys.cu
+++ b/cpp/src/dictionary/remove_keys.cu
@@ -195,17 +195,19 @@ std::unique_ptr<column> remove_unused_keys(dictionary_column_view const& diction
 
 std::unique_ptr<column> remove_keys(dictionary_column_view const& dictionary_column,
                                     column_view const& keys_to_remove,
+                                    rmm::cuda_stream_view stream,
                                     rmm::mr::device_memory_resource* mr)
 {
   CUDF_FUNC_RANGE();
-  return detail::remove_keys(dictionary_column, keys_to_remove, cudf::get_default_stream(), mr);
+  return detail::remove_keys(dictionary_column, keys_to_remove, stream, mr);
 }
 
 std::unique_ptr<column> remove_unused_keys(dictionary_column_view const& dictionary_column,
+                                           rmm::cuda_stream_view stream,
                                            rmm::mr::device_memory_resource* mr)
 {
   CUDF_FUNC_RANGE();
-  return detail::remove_unused_keys(dictionary_column, cudf::get_default_stream(), mr);
+  return detail::remove_unused_keys(dictionary_column, stream, mr);
 }
 
 }  // namespace dictionary
diff --git a/cpp/src/dictionary/search.cu b/cpp/src/dictionary/search.cu
index 8e97a387780..e35aded1984 100644
--- a/cpp/src/dictionary/search.cu
+++ b/cpp/src/dictionary/search.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -79,10 +79,8 @@ struct find_index_fn {
     using ScalarType = cudf::scalar_type_t<Element>;
     auto find_key    = static_cast<ScalarType const&>(key).value(stream);
     auto keys_view   = column_device_view::create(input.keys(), stream);
-    auto iter        = thrust::equal_range(rmm::exec_policy(cudf::get_default_stream()),
-                                    keys_view->begin<Element>(),
-                                    keys_view->end<Element>(),
-                                    find_key);
+    auto iter        = thrust::equal_range(
+      rmm::exec_policy(stream), keys_view->begin<Element>(), keys_view->end<Element>(), find_key);
     return type_dispatcher(input.indices().type(),
                            dispatch_scalar_index{},
                            thrust::distance(keys_view->begin<Element>(), iter.first),
@@ -176,10 +174,11 @@ std::unique_ptr<scalar> get_insert_index(dictionary_column_view const& dictionar
 
 std::unique_ptr<scalar> get_index(dictionary_column_view const& dictionary,
                                   scalar const& key,
+                                  rmm::cuda_stream_view stream,
                                   rmm::mr::device_memory_resource* mr)
 {
   CUDF_FUNC_RANGE();
-  return detail::get_index(dictionary, key, cudf::get_default_stream(), mr);
+  return detail::get_index(dictionary, key, stream, mr);
 }
 
 }  // namespace dictionary
diff --git a/cpp/src/dictionary/set_keys.cu b/cpp/src/dictionary/set_keys.cu
index 36f5021d305..b49cf7850b1 100644
--- a/cpp/src/dictionary/set_keys.cu
+++ b/cpp/src/dictionary/set_keys.cu
@@ -241,17 +241,20 @@ std::pair<std::vector<std::unique_ptr<column>>, std::vector<table_view>> match_d
 
 std::unique_ptr<column> set_keys(dictionary_column_view const& dictionary_column,
                                  column_view const& keys,
+                                 rmm::cuda_stream_view stream,
                                  rmm::mr::device_memory_resource* mr)
 {
   CUDF_FUNC_RANGE();
-  return detail::set_keys(dictionary_column, keys, cudf::get_default_stream(), mr);
+  return detail::set_keys(dictionary_column, keys, stream, mr);
 }
 
 std::vector<std::unique_ptr<column>> match_dictionaries(
-  cudf::host_span<dictionary_column_view const> input, rmm::mr::device_memory_resource* mr)
+  cudf::host_span<dictionary_column_view const> input,
+  rmm::cuda_stream_view stream,
+  rmm::mr::device_memory_resource* mr)
 {
   CUDF_FUNC_RANGE();
-  return detail::match_dictionaries(input, cudf::get_default_stream(), mr);
+  return detail::match_dictionaries(input, stream, mr);
 }
 
 }  // namespace dictionary
diff --git a/cpp/src/hash/concurrent_unordered_map.cuh b/cpp/src/hash/concurrent_unordered_map.cuh
index 439b1c2d066..d773c2763df 100644
--- a/cpp/src/hash/concurrent_unordered_map.cuh
+++ b/cpp/src/hash/concurrent_unordered_map.cuh
@@ -16,12 +16,12 @@
 
 #pragma once
 
-#include <hash/hash_allocator.cuh>
-#include <hash/helper_functions.cuh>
 #include <hash/managed.cuh>
 
 #include <cudf/detail/nvtx/ranges.hpp>
 #include <cudf/hashing/detail/default_hash.cuh>
+#include <cudf/hashing/detail/hash_allocator.cuh>
+#include <cudf/hashing/detail/helper_functions.cuh>
 #include <cudf/utilities/default_stream.hpp>
 #include <cudf/utilities/error.hpp>
 
diff --git a/cpp/src/hash/unordered_multiset.cuh b/cpp/src/hash/unordered_multiset.cuh
index 87075a39ea3..183042fc0f4 100644
--- a/cpp/src/hash/unordered_multiset.cuh
+++ b/cpp/src/hash/unordered_multiset.cuh
@@ -16,11 +16,10 @@
 
 #pragma once
 
-#include <hash/helper_functions.cuh>
-
 #include <cudf/column/column_device_view.cuh>
 #include <cudf/detail/utilities/vector_factories.hpp>
 #include <cudf/hashing/detail/default_hash.cuh>
+#include <cudf/hashing/detail/helper_functions.cuh>
 #include <cudf/utilities/span.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
diff --git a/cpp/src/interop/from_arrow.cu b/cpp/src/interop/from_arrow.cu
index 30cfee97fd8..e39625c92e7 100644
--- a/cpp/src/interop/from_arrow.cu
+++ b/cpp/src/interop/from_arrow.cu
@@ -419,6 +419,52 @@ std::unique_ptr<column> get_column(arrow::Array const& array,
            : get_empty_type_column(array.length());
 }
 
+struct BuilderGenerator {
+  template <typename T,
+            CUDF_ENABLE_IF(!std::is_same_v<T, arrow::ListType> &&
+                           !std::is_same_v<T, arrow::StructType>)>
+  std::shared_ptr<arrow::ArrayBuilder> operator()(std::shared_ptr<arrow::DataType> const& type)
+  {
+    return std::make_shared<typename arrow::TypeTraits<T>::BuilderType>(
+      type, arrow::default_memory_pool());
+  }
+
+  template <typename T,
+            CUDF_ENABLE_IF(std::is_same_v<T, arrow::ListType> ||
+                           std::is_same_v<T, arrow::StructType>)>
+  std::shared_ptr<arrow::ArrayBuilder> operator()(std::shared_ptr<arrow::DataType> const& type)
+  {
+    CUDF_FAIL("Type not supported by BuilderGenerator");
+  }
+};
+
+std::shared_ptr<arrow::ArrayBuilder> make_builder(std::shared_ptr<arrow::DataType> const& type)
+{
+  switch (type->id()) {
+    case arrow::Type::STRUCT: {
+      std::vector<std::shared_ptr<arrow::ArrayBuilder>> field_builders;
+
+      for (auto field : type->fields()) {
+        auto const vt = field->type();
+        if (vt->id() == arrow::Type::STRUCT || vt->id() == arrow::Type::LIST) {
+          field_builders.push_back(make_builder(vt));
+        } else {
+          field_builders.push_back(arrow_type_dispatcher(*vt, BuilderGenerator{}, vt));
+        }
+      }
+      return std::make_shared<arrow::StructBuilder>(
+        type, arrow::default_memory_pool(), field_builders);
+    }
+    case arrow::Type::LIST: {
+      return std::make_shared<arrow::ListBuilder>(arrow::default_memory_pool(),
+                                                  make_builder(type->field(0)->type()));
+    }
+    default: {
+      return arrow_type_dispatcher(*type, BuilderGenerator{}, type);
+    }
+  }
+}
+
 }  // namespace
 
 std::unique_ptr<table> from_arrow(arrow::Table const& input_table,
@@ -462,14 +508,54 @@ std::unique_ptr<table> from_arrow(arrow::Table const& input_table,
   return std::make_unique<table>(std::move(columns));
 }
 
+std::unique_ptr<cudf::scalar> from_arrow(arrow::Scalar const& input,
+                                         rmm::cuda_stream_view stream,
+                                         rmm::mr::device_memory_resource* mr)
+{
+  // Get a builder for the scalar type
+  auto builder = detail::make_builder(input.type);
+
+  auto status = builder->AppendScalar(input);
+  if (status != arrow::Status::OK()) {
+    if (status.IsNotImplemented()) {
+      // The only known failure case here is for nulls
+      CUDF_FAIL("Cannot create untyped null scalars or nested types with untyped null leaf nodes",
+                std::invalid_argument);
+    }
+    CUDF_FAIL("Arrow ArrayBuilder::AppendScalar failed");
+  }
+
+  auto maybe_array = builder->Finish();
+  if (!maybe_array.ok()) { CUDF_FAIL("Arrow ArrayBuilder::Finish failed"); }
+  auto array = *maybe_array;
+
+  auto field = arrow::field("", input.type);
+
+  auto table = arrow::Table::Make(arrow::schema({field}), {array});
+
+  auto cudf_table = detail::from_arrow(*table, stream, mr);
+
+  auto cv = cudf_table->view().column(0);
+  return get_element(cv, 0, stream);
+}
+
 }  // namespace detail
 
 std::unique_ptr<table> from_arrow(arrow::Table const& input_table,
+                                  rmm::cuda_stream_view stream,
                                   rmm::mr::device_memory_resource* mr)
 {
   CUDF_FUNC_RANGE();
 
-  return detail::from_arrow(input_table, cudf::get_default_stream(), mr);
+  return detail::from_arrow(input_table, stream, mr);
 }
 
+std::unique_ptr<cudf::scalar> from_arrow(arrow::Scalar const& input,
+                                         rmm::cuda_stream_view stream,
+                                         rmm::mr::device_memory_resource* mr)
+{
+  CUDF_FUNC_RANGE();
+
+  return detail::from_arrow(input, stream, mr);
+}
 }  // namespace cudf
diff --git a/cpp/src/interop/to_arrow.cu b/cpp/src/interop/to_arrow.cu
index 958a2fcb95f..0cd750bc947 100644
--- a/cpp/src/interop/to_arrow.cu
+++ b/cpp/src/interop/to_arrow.cu
@@ -15,14 +15,16 @@
  */
 
 #include <cudf/column/column.hpp>
+#include <cudf/column/column_factories.hpp>
 #include <cudf/column/column_view.hpp>
+#include <cudf/copying.hpp>
 #include <cudf/detail/interop.hpp>
 #include <cudf/detail/iterator.cuh>
+#include <cudf/detail/null_mask.hpp>
 #include <cudf/detail/nvtx/ranges.hpp>
 #include <cudf/detail/unary.hpp>
 #include <cudf/dictionary/dictionary_column_view.hpp>
 #include <cudf/interop.hpp>
-#include <cudf/null_mask.hpp>
 #include <cudf/table/table_view.hpp>
 #include <cudf/types.hpp>
 #include <cudf/utilities/default_stream.hpp>
@@ -77,7 +79,10 @@ std::shared_ptr<arrow::Buffer> fetch_mask_buffer(column_view input_view,
     auto mask_buffer = allocate_arrow_bitmap(static_cast<int64_t>(input_view.size()), ar_mr);
     CUDF_CUDA_TRY(cudaMemcpyAsync(
       mask_buffer->mutable_data(),
-      (input_view.offset() > 0) ? cudf::copy_bitmask(input_view).data() : input_view.null_mask(),
+      (input_view.offset() > 0)
+        ? cudf::detail::copy_bitmask(input_view, stream, rmm::mr::get_current_device_resource())
+            .data()
+        : input_view.null_mask(),
       mask_size_in_bytes,
       cudaMemcpyDefault,
       stream.value()));
@@ -139,29 +144,36 @@ struct dispatch_to_arrow {
   }
 };
 
-template <>
-std::shared_ptr<arrow::Array> dispatch_to_arrow::operator()<numeric::decimal64>(
-  column_view input,
-  cudf::type_id,
-  column_metadata const&,
-  arrow::MemoryPool* ar_mr,
-  rmm::cuda_stream_view stream)
+// Convert decimal types from libcudf to arrow where those types are not
+// directly supported by Arrow. These types must be fit into 128 bits, the
+// smallest decimal resolution supported by Arrow.
+template <typename DeviceType>
+std::shared_ptr<arrow::Array> unsupported_decimals_to_arrow(column_view input,
+                                                            int32_t precision,
+                                                            arrow::MemoryPool* ar_mr,
+                                                            rmm::cuda_stream_view stream)
 {
-  using DeviceType                = int64_t;
-  size_type const BIT_WIDTH_RATIO = 2;  // Array::Type:type::DECIMAL (128) / int64_t
+  constexpr size_type BIT_WIDTH_RATIO = sizeof(__int128_t) / sizeof(DeviceType);
 
   rmm::device_uvector<DeviceType> buf(input.size() * BIT_WIDTH_RATIO, stream);
 
   auto count = thrust::make_counting_iterator(0);
 
-  thrust::for_each(rmm::exec_policy(cudf::get_default_stream()),
-                   count,
-                   count + input.size(),
-                   [in = input.begin<DeviceType>(), out = buf.data()] __device__(auto in_idx) {
-                     auto const out_idx = in_idx * 2;
-                     out[out_idx]       = in[in_idx];
-                     out[out_idx + 1]   = in[in_idx] < 0 ? -1 : 0;
-                   });
+  thrust::for_each(
+    rmm::exec_policy(cudf::get_default_stream()),
+    count,
+    count + input.size(),
+    [in = input.begin<DeviceType>(), out = buf.data(), BIT_WIDTH_RATIO] __device__(auto in_idx) {
+      auto const out_idx = in_idx * BIT_WIDTH_RATIO;
+      // The lowest order bits are the value, the remainder
+      // simply matches the sign bit to satisfy the two's
+      // complement integer representation of negative numbers.
+      out[out_idx] = in[in_idx];
+#pragma unroll BIT_WIDTH_RATIO - 1
+      for (auto i = 1; i < BIT_WIDTH_RATIO; ++i) {
+        out[out_idx + i] = in[in_idx] < 0 ? -1 : 0;
+      }
+    });
 
   auto const buf_size_in_bytes = buf.size() * sizeof(DeviceType);
   auto data_buffer             = allocate_arrow_buffer(buf_size_in_bytes, ar_mr);
@@ -169,7 +181,7 @@ std::shared_ptr<arrow::Array> dispatch_to_arrow::operator()<numeric::decimal64>(
   CUDF_CUDA_TRY(cudaMemcpyAsync(
     data_buffer->mutable_data(), buf.data(), buf_size_in_bytes, cudaMemcpyDefault, stream.value()));
 
-  auto type    = arrow::decimal(18, -input.type().scale());
+  auto type    = arrow::decimal(precision, -input.type().scale());
   auto mask    = fetch_mask_buffer(input, ar_mr, stream);
   auto buffers = std::vector<std::shared_ptr<arrow::Buffer>>{mask, std::move(data_buffer)};
   auto data    = std::make_shared<arrow::ArrayData>(type, input.size(), buffers);
@@ -177,6 +189,28 @@ std::shared_ptr<arrow::Array> dispatch_to_arrow::operator()<numeric::decimal64>(
   return std::make_shared<arrow::Decimal128Array>(data);
 }
 
+template <>
+std::shared_ptr<arrow::Array> dispatch_to_arrow::operator()<numeric::decimal32>(
+  column_view input,
+  cudf::type_id,
+  column_metadata const&,
+  arrow::MemoryPool* ar_mr,
+  rmm::cuda_stream_view stream)
+{
+  return unsupported_decimals_to_arrow<int32_t>(input, 9, ar_mr, stream);
+}
+
+template <>
+std::shared_ptr<arrow::Array> dispatch_to_arrow::operator()<numeric::decimal64>(
+  column_view input,
+  cudf::type_id,
+  column_metadata const&,
+  arrow::MemoryPool* ar_mr,
+  rmm::cuda_stream_view stream)
+{
+  return unsupported_decimals_to_arrow<int64_t>(input, 18, ar_mr, stream);
+}
+
 template <>
 std::shared_ptr<arrow::Array> dispatch_to_arrow::operator()<numeric::decimal128>(
   column_view input,
@@ -403,14 +437,37 @@ std::shared_ptr<arrow::Table> to_arrow(table_view input,
 
   return result;
 }
+
+std::shared_ptr<arrow::Scalar> to_arrow(cudf::scalar const& input,
+                                        column_metadata const& metadata,
+                                        rmm::cuda_stream_view stream,
+                                        arrow::MemoryPool* ar_mr)
+{
+  auto const column = cudf::make_column_from_scalar(input, 1, stream);
+  cudf::table_view const tv{{column->view()}};
+  auto const arrow_table  = cudf::to_arrow(tv, {metadata}, stream);
+  auto const ac           = arrow_table->column(0);
+  auto const maybe_scalar = ac->GetScalar(0);
+  if (!maybe_scalar.ok()) { CUDF_FAIL("Failed to produce a scalar"); }
+  return maybe_scalar.ValueOrDie();
+}
 }  // namespace detail
 
 std::shared_ptr<arrow::Table> to_arrow(table_view input,
                                        std::vector<column_metadata> const& metadata,
+                                       rmm::cuda_stream_view stream,
                                        arrow::MemoryPool* ar_mr)
 {
   CUDF_FUNC_RANGE();
-  return detail::to_arrow(input, metadata, cudf::get_default_stream(), ar_mr);
+  return detail::to_arrow(input, metadata, stream, ar_mr);
 }
 
+std::shared_ptr<arrow::Scalar> to_arrow(cudf::scalar const& input,
+                                        column_metadata const& metadata,
+                                        rmm::cuda_stream_view stream,
+                                        arrow::MemoryPool* ar_mr)
+{
+  CUDF_FUNC_RANGE();
+  return detail::to_arrow(input, metadata, stream, ar_mr);
+}
 }  // namespace cudf
diff --git a/cpp/src/io/json/json_tree.cu b/cpp/src/io/json/json_tree.cu
index 9231040eb70..da5b0eedfbd 100644
--- a/cpp/src/io/json/json_tree.cu
+++ b/cpp/src/io/json/json_tree.cu
@@ -15,8 +15,6 @@
  */
 
 #include "nested_json.hpp"
-#include <hash/hash_allocator.cuh>
-#include <hash/helper_functions.cuh>
 #include <io/utilities/hostdevice_vector.hpp>
 
 #include <cudf/detail/nvtx/ranges.hpp>
@@ -24,7 +22,9 @@
 #include <cudf/detail/utilities/algorithm.cuh>
 #include <cudf/detail/utilities/vector_factories.hpp>
 #include <cudf/hashing/detail/default_hash.cuh>
+#include <cudf/hashing/detail/hash_allocator.cuh>
 #include <cudf/hashing/detail/hashing.hpp>
+#include <cudf/hashing/detail/helper_functions.cuh>
 #include <cudf/utilities/error.hpp>
 #include <cudf/utilities/span.hpp>
 
diff --git a/cpp/src/io/utilities/data_casting.cu b/cpp/src/io/utilities/data_casting.cu
index 1772e5e43fa..d16237d7afe 100644
--- a/cpp/src/io/utilities/data_casting.cu
+++ b/cpp/src/io/utilities/data_casting.cu
@@ -924,6 +924,9 @@ std::unique_ptr<column> parse_data(
   if (col_size == 0) { return make_empty_column(col_type); }
   auto d_null_count    = rmm::device_scalar<size_type>(null_count, stream);
   auto null_count_data = d_null_count.data();
+  if (null_mask.is_empty()) {
+    null_mask = cudf::detail::create_null_mask(col_size, mask_state::ALL_VALID, stream, mr);
+  }
 
   // Prepare iterator that returns (string_ptr, string_length)-pairs needed by type conversion
   auto str_tuples = thrust::make_transform_iterator(offset_length_begin, to_string_view_pair{data});
diff --git a/cpp/src/join/join_common_utils.hpp b/cpp/src/join/join_common_utils.hpp
index 4c1b1ed98b1..e96505e5ed6 100644
--- a/cpp/src/join/join_common_utils.hpp
+++ b/cpp/src/join/join_common_utils.hpp
@@ -17,13 +17,12 @@
 
 #include <cudf/detail/join.hpp>
 #include <cudf/hashing/detail/default_hash.cuh>
+#include <cudf/hashing/detail/hash_allocator.cuh>
+#include <cudf/hashing/detail/helper_functions.cuh>
 #include <cudf/join.hpp>
 #include <cudf/table/row_operators.cuh>
 #include <cudf/table/table_view.hpp>
 
-#include <hash/hash_allocator.cuh>
-#include <hash/helper_functions.cuh>
-
 #include <rmm/mr/device/polymorphic_allocator.hpp>
 
 #include <cuco/static_map.cuh>
diff --git a/cpp/src/lists/count_elements.cu b/cpp/src/lists/count_elements.cu
index f8e7b4c6126..40a14d805e1 100644
--- a/cpp/src/lists/count_elements.cu
+++ b/cpp/src/lists/count_elements.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2021-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -36,12 +36,12 @@ namespace cudf {
 namespace lists {
 namespace detail {
 /**
- * @brief Returns a numeric column containing lengths of each element.
+ * @brief Returns a numeric column containing lengths of each element
  *
- * @param input Input lists column.
- * @param stream CUDA stream used for device memory operations and kernel launches.
+ * @param input Input lists column
+ * @param stream CUDA stream used for device memory operations and kernel launches
  * @param mr Device memory resource used to allocate the returned column's device memory
- * @return New INT32 column with lengths.
+ * @return New size_type column with lengths
  */
 std::unique_ptr<column> count_elements(lists_column_view const& input,
                                        rmm::cuda_stream_view stream,
@@ -52,7 +52,7 @@ std::unique_ptr<column> count_elements(lists_column_view const& input,
   // create output column
   auto output = make_fixed_width_column(data_type{type_to_id<size_type>()},
                                         input.size(),
-                                        copy_bitmask(input.parent()),
+                                        cudf::detail::copy_bitmask(input.parent(), stream, mr),
                                         input.null_count(),
                                         stream,
                                         mr);
diff --git a/cpp/src/lists/segmented_sort.cu b/cpp/src/lists/segmented_sort.cu
index 260636a61cf..49054ebb046 100644
--- a/cpp/src/lists/segmented_sort.cu
+++ b/cpp/src/lists/segmented_sort.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2021-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -70,13 +70,13 @@ std::unique_ptr<column> sort_lists(lists_column_view const& input,
   auto output_offset = build_output_offsets(input, stream, mr);
   auto const child   = input.get_sliced_child(stream);
 
-  auto const sorted_child_table = segmented_sort_by_key(table_view{{child}},
-                                                        table_view{{child}},
-                                                        output_offset->view(),
-                                                        {column_order},
-                                                        {null_precedence},
-                                                        stream,
-                                                        mr);
+  auto const sorted_child_table = cudf::detail::segmented_sort_by_key(table_view{{child}},
+                                                                      table_view{{child}},
+                                                                      output_offset->view(),
+                                                                      {column_order},
+                                                                      {null_precedence},
+                                                                      stream,
+                                                                      mr);
 
   return make_lists_column(input.size(),
                            std::move(output_offset),
@@ -98,13 +98,13 @@ std::unique_ptr<column> stable_sort_lists(lists_column_view const& input,
   auto output_offset = build_output_offsets(input, stream, mr);
   auto const child   = input.get_sliced_child(stream);
 
-  auto const sorted_child_table = stable_segmented_sort_by_key(table_view{{child}},
-                                                               table_view{{child}},
-                                                               output_offset->view(),
-                                                               {column_order},
-                                                               {null_precedence},
-                                                               stream,
-                                                               mr);
+  auto const sorted_child_table = cudf::detail::stable_segmented_sort_by_key(table_view{{child}},
+                                                                             table_view{{child}},
+                                                                             output_offset->view(),
+                                                                             {column_order},
+                                                                             {null_precedence},
+                                                                             stream,
+                                                                             mr);
 
   return make_lists_column(input.size(),
                            std::move(output_offset),
diff --git a/cpp/src/replace/clamp.cu b/cpp/src/replace/clamp.cu
index 2b48aed2d29..950cb484ddf 100644
--- a/cpp/src/replace/clamp.cu
+++ b/cpp/src/replace/clamp.cu
@@ -163,7 +163,9 @@ std::enable_if_t<cudf::is_fixed_width<T>(), std::unique_ptr<cudf::column>> clamp
   auto output =
     detail::allocate_like(input, input.size(), mask_allocation_policy::NEVER, stream, mr);
   // mask will not change
-  if (input.nullable()) { output->set_null_mask(copy_bitmask(input), input.null_count()); }
+  if (input.nullable()) {
+    output->set_null_mask(cudf::detail::copy_bitmask(input, stream, mr), input.null_count());
+  }
 
   auto output_device_view =
     cudf::mutable_column_device_view::create(output->mutable_view(), stream);
diff --git a/cpp/src/rolling/grouped_rolling.cu b/cpp/src/rolling/grouped_rolling.cu
index 6e69b5157c2..7ac784bef43 100644
--- a/cpp/src/rolling/grouped_rolling.cu
+++ b/cpp/src/rolling/grouped_rolling.cu
@@ -357,6 +357,16 @@ template <typename T>
 struct device_value_accessor {
   column_device_view const col;  ///< column view of column in device
 
+  /// Checks that the type used to access device values matches the rep-type
+  /// of the order-by column.
+  struct is_correct_range_rep {
+    template <typename U>  /// Order-by type.
+    constexpr bool operator()() const
+    {
+      return std::is_same_v<T, cudf::detail::range_rep_type<U>>;
+    }
+  };
+
   /**
    * @brief constructor
    *
@@ -364,8 +374,11 @@ struct device_value_accessor {
    */
   explicit __device__ device_value_accessor(column_device_view const& col_) : col{col_}
   {
-    cudf_assert(type_id_matches_device_storage_type<T>(col.type().id()) &&
-                "the data type mismatch");
+    // For non-timestamp types, T must match the order-by column's type.
+    // For timestamp types, T must match the range rep type for the order-by column.
+    cudf_assert((type_id_matches_device_storage_type<T>(col.type().id()) or
+                 cudf::type_dispatcher(col.type(), is_correct_range_rep{})) &&
+                "data type mismatch when accessing the order-by column");
   }
 
   /**
diff --git a/cpp/src/search/contains_table.cu b/cpp/src/search/contains_table.cu
index e37f0686ac3..43624ba691d 100644
--- a/cpp/src/search/contains_table.cu
+++ b/cpp/src/search/contains_table.cu
@@ -26,7 +26,7 @@
 
 #include <thrust/iterator/counting_iterator.h>
 
-#include <cuco/static_map.cuh>
+#include <cuco/static_set.cuh>
 
 #include <type_traits>
 
@@ -37,69 +37,59 @@ namespace {
 using cudf::experimental::row::lhs_index_type;
 using cudf::experimental::row::rhs_index_type;
 
-using static_map = cuco::static_map<lhs_index_type,
-                                    size_type,
-                                    cuda::thread_scope_device,
-                                    rmm::mr::stream_allocator_adaptor<default_allocator<char>>>;
-
 /**
- * @brief Check if the given type `T` is a strong index type (i.e., `lhs_index_type` or
- * `rhs_index_type`).
- *
- * @return A boolean value indicating if `T` is a strong index type
+ * @brief An hasher adapter wrapping both haystack hasher and needles hasher
  */
-template <typename T>
-constexpr auto is_strong_index_type()
-{
-  return std::is_same_v<T, lhs_index_type> || std::is_same_v<T, rhs_index_type>;
-}
+template <typename HaystackHasher, typename NeedleHasher>
+struct hasher_adapter {
+  hasher_adapter(HaystackHasher const& haystack_hasher, NeedleHasher const& needle_hasher)
+    : _haystack_hasher{haystack_hasher}, _needle_hasher{needle_hasher}
+  {
+  }
 
-/**
- * @brief An adapter functor to support strong index types for row hasher that must be operating on
- * `cudf::size_type`.
- */
-template <typename Hasher>
-struct strong_index_hasher_adapter {
-  strong_index_hasher_adapter(Hasher const& hasher) : _hasher{hasher} {}
+  __device__ constexpr auto operator()(lhs_index_type idx) const noexcept
+  {
+    return _haystack_hasher(static_cast<size_type>(idx));
+  }
 
-  template <typename T, CUDF_ENABLE_IF(is_strong_index_type<T>())>
-  __device__ constexpr auto operator()(T const idx) const noexcept
+  __device__ constexpr auto operator()(rhs_index_type idx) const noexcept
   {
-    return _hasher(static_cast<size_type>(idx));
+    return _needle_hasher(static_cast<size_type>(idx));
   }
 
  private:
-  Hasher const _hasher;
+  HaystackHasher const _haystack_hasher;
+  NeedleHasher const _needle_hasher;
 };
 
 /**
- * @brief An adapter functor to support strong index type for table row comparator that must be
- * operating on `cudf::size_type`.
+ * @brief An comparator adapter wrapping both self comparator and two table comparator
  */
-template <typename Comparator>
-struct strong_index_comparator_adapter {
-  strong_index_comparator_adapter(Comparator const& comparator) : _comparator{comparator} {}
-
-  template <typename T,
-            typename U,
-            CUDF_ENABLE_IF(is_strong_index_type<T>() && is_strong_index_type<U>())>
-  __device__ constexpr auto operator()(T const lhs_index, U const rhs_index) const noexcept
+template <typename SelfEqual, typename TwoTableEqual>
+struct comparator_adapter {
+  comparator_adapter(SelfEqual const& self_equal, TwoTableEqual const& two_table_equal)
+    : _self_equal{self_equal}, _two_table_equal{two_table_equal}
+  {
+  }
+
+  __device__ constexpr auto operator()(lhs_index_type lhs_index,
+                                       lhs_index_type rhs_index) const noexcept
   {
     auto const lhs = static_cast<size_type>(lhs_index);
     auto const rhs = static_cast<size_type>(rhs_index);
 
-    if constexpr (std::is_same_v<T, U> || std::is_same_v<T, lhs_index_type>) {
-      return _comparator(lhs, rhs);
-    } else {
-      // Here we have T == rhs_index_type.
-      // This is when the indices are provided in wrong order for two table comparator, so we need
-      // to switch them back to the right order before calling the underlying comparator.
-      return _comparator(rhs, lhs);
-    }
+    return _self_equal(lhs, rhs);
+  }
+
+  __device__ constexpr auto operator()(lhs_index_type lhs_index,
+                                       rhs_index_type rhs_index) const noexcept
+  {
+    return _two_table_equal(lhs_index, rhs_index);
   }
 
  private:
-  Comparator const _comparator;
+  SelfEqual const _self_equal;
+  TwoTableEqual const _two_table_equal;
 };
 
 /**
@@ -134,38 +124,62 @@ std::pair<rmm::device_buffer, bitmask_type const*> build_row_bitmask(table_view
 }
 
 /**
- * @brief Invoke an `operator()` template with a row equality comparator based on the specified
- * `compare_nans` parameter.
+ * @brief Invokes the given `func` with desired comparators based on the specified `compare_nans`
+ * parameter
+ *
+ * @tparam HasNested Flag indicating whether there are nested columns in haystack or needles
+ * @tparam Hasher Type of device hash function
+ * @tparam Func Type of the helper function doing `contains` check
  *
- * @param compare_nans The flag to specify whether NaNs should be compared equal or not
+ * @param compare_nulls Control whether nulls should be compared as equal or not
+ * @param compare_nans Control whether floating-point NaNs values should be compared as equal or not
+ * @param haystack_has_nulls Flag indicating whether haystack has nulls or not
+ * @param has_any_nulls Flag indicating whether there are nested nulls is either haystack or needles
+ * @param self_equal Self table comparator
+ * @param two_table_equal Two table comparator
+ * @param d_hasher Device hash functor
  * @param func The input functor to invoke
  */
-template <typename Func>
-void dispatch_nan_comparator(nan_equality compare_nans, Func&& func)
+template <bool HasNested, typename Hasher, typename Func>
+void dispatch_nan_comparator(
+  null_equality compare_nulls,
+  nan_equality compare_nans,
+  bool haystack_has_nulls,
+  bool has_any_nulls,
+  cudf::experimental::row::equality::self_comparator self_equal,
+  cudf::experimental::row::equality::two_table_comparator two_table_equal,
+  Hasher const& d_hasher,
+  Func&& func)
 {
+  // Distinguish probing scheme CG sizes between nested and flat types for better performance
+  auto const probing_scheme = [&]() {
+    if constexpr (HasNested) {
+      return cuco::experimental::linear_probing<4, Hasher>{d_hasher};
+    } else {
+      return cuco::experimental::linear_probing<1, Hasher>{d_hasher};
+    }
+  }();
+
   if (compare_nans == nan_equality::ALL_EQUAL) {
     using nan_equal_comparator =
       cudf::experimental::row::equality::nan_equal_physical_equality_comparator;
-    func(nan_equal_comparator{});
+    auto const d_self_equal = self_equal.equal_to<HasNested>(
+      nullate::DYNAMIC{haystack_has_nulls}, compare_nulls, nan_equal_comparator{});
+    auto const d_two_table_equal = two_table_equal.equal_to<HasNested>(
+      nullate::DYNAMIC{has_any_nulls}, compare_nulls, nan_equal_comparator{});
+    func(d_self_equal, d_two_table_equal, probing_scheme);
   } else {
     using nan_unequal_comparator = cudf::experimental::row::equality::physical_equality_comparator;
-    func(nan_unequal_comparator{});
+    auto const d_self_equal      = self_equal.equal_to<HasNested>(
+      nullate::DYNAMIC{haystack_has_nulls}, compare_nulls, nan_unequal_comparator{});
+    auto const d_two_table_equal = two_table_equal.equal_to<HasNested>(
+      nullate::DYNAMIC{has_any_nulls}, compare_nulls, nan_unequal_comparator{});
+    func(d_self_equal, d_two_table_equal, probing_scheme);
   }
 }
 
 }  // namespace
 
-/**
- * @brief Check if rows in the given `needles` table exist in the `haystack` table.
- *
- * @param haystack The table containing the search space
- * @param needles A table of rows whose existence to check in the search space
- * @param compare_nulls Control whether nulls should be compared as equal or not
- * @param compare_nans Control whether floating-point NaNs values should be compared as equal or not
- * @param stream CUDA stream used for device memory operations and kernel launches
- * @param mr Device memory resource used to allocate the returned vector
- * @return A vector of bools indicating if each row in `needles` has matching rows in `haystack`
- */
 rmm::device_uvector<bool> contains(table_view const& haystack,
                                    table_view const& needles,
                                    null_equality compare_nulls,
@@ -173,124 +187,97 @@ rmm::device_uvector<bool> contains(table_view const& haystack,
                                    rmm::cuda_stream_view stream,
                                    rmm::mr::device_memory_resource* mr)
 {
-  auto map = static_map(compute_hash_table_size(haystack.num_rows()),
-                        cuco::empty_key{lhs_index_type{std::numeric_limits<size_type>::max()}},
-                        cuco::empty_value{detail::JoinNoneValue},
-                        detail::hash_table_allocator_type{default_allocator<char>{}, stream},
-                        stream.value());
+  CUDF_EXPECTS(cudf::have_same_types(haystack, needles), "Column types mismatch");
 
   auto const haystack_has_nulls = has_nested_nulls(haystack);
   auto const needles_has_nulls  = has_nested_nulls(needles);
   auto const has_any_nulls      = haystack_has_nulls || needles_has_nulls;
 
+  auto const preprocessed_needles =
+    cudf::experimental::row::equality::preprocessed_table::create(needles, stream);
   auto const preprocessed_haystack =
     cudf::experimental::row::equality::preprocessed_table::create(haystack, stream);
-  // Insert row indices of the haystack table as map keys.
-  {
-    auto const haystack_it = cudf::detail::make_counting_transform_iterator(
-      size_type{0},
-      [] __device__(auto const idx) { return cuco::make_pair(lhs_index_type{idx}, 0); });
-
-    auto const hasher = cudf::experimental::row::hash::row_hasher(preprocessed_haystack);
-    auto const d_hasher =
-      strong_index_hasher_adapter{hasher.device_hasher(nullate::DYNAMIC{has_any_nulls})};
-
-    auto const comparator =
-      cudf::experimental::row::equality::self_comparator(preprocessed_haystack);
-
-    // If the haystack table has nulls but they are compared unequal, don't insert them.
-    // Otherwise, it was known to cause performance issue:
-    // - https://github.com/rapidsai/cudf/pull/6943
-    // - https://github.com/rapidsai/cudf/pull/8277
-    if (haystack_has_nulls && compare_nulls == null_equality::UNEQUAL) {
-      auto const bitmask_buffer_and_ptr = build_row_bitmask(haystack, stream);
-      auto const row_bitmask_ptr        = bitmask_buffer_and_ptr.second;
-
-      auto const insert_map = [&](auto const value_comp) {
-        if (cudf::detail::has_nested_columns(haystack)) {
-          auto const d_eqcomp = strong_index_comparator_adapter{comparator.equal_to<true>(
-            nullate::DYNAMIC{haystack_has_nulls}, compare_nulls, value_comp)};
-          map.insert_if(haystack_it,
-                        haystack_it + haystack.num_rows(),
-                        thrust::counting_iterator<size_type>(0),  // stencil
-                        row_is_valid{row_bitmask_ptr},
-                        d_hasher,
-                        d_eqcomp,
-                        stream.value());
-        } else {
-          auto const d_eqcomp = strong_index_comparator_adapter{comparator.equal_to<false>(
-            nullate::DYNAMIC{haystack_has_nulls}, compare_nulls, value_comp)};
-          map.insert_if(haystack_it,
-                        haystack_it + haystack.num_rows(),
-                        thrust::counting_iterator<size_type>(0),  // stencil
-                        row_is_valid{row_bitmask_ptr},
-                        d_hasher,
-                        d_eqcomp,
-                        stream.value());
-        }
-      };
-
-      // Insert only rows that do not have any null at any level.
-      dispatch_nan_comparator(compare_nans, insert_map);
-    } else {  // haystack_doesn't_have_nulls || compare_nulls == null_equality::EQUAL
-      auto const insert_map = [&](auto const value_comp) {
-        if (cudf::detail::has_nested_columns(haystack)) {
-          auto const d_eqcomp = strong_index_comparator_adapter{comparator.equal_to<true>(
-            nullate::DYNAMIC{haystack_has_nulls}, compare_nulls, value_comp)};
-          map.insert(
-            haystack_it, haystack_it + haystack.num_rows(), d_hasher, d_eqcomp, stream.value());
-        } else {
-          auto const d_eqcomp = strong_index_comparator_adapter{comparator.equal_to<false>(
-            nullate::DYNAMIC{haystack_has_nulls}, compare_nulls, value_comp)};
-          map.insert(
-            haystack_it, haystack_it + haystack.num_rows(), d_hasher, d_eqcomp, stream.value());
-        }
-      };
-
-      dispatch_nan_comparator(compare_nans, insert_map);
-    }
-  }
+
+  auto const haystack_hasher   = cudf::experimental::row::hash::row_hasher(preprocessed_haystack);
+  auto const d_haystack_hasher = haystack_hasher.device_hasher(nullate::DYNAMIC{has_any_nulls});
+  auto const needle_hasher     = cudf::experimental::row::hash::row_hasher(preprocessed_needles);
+  auto const d_needle_hasher   = needle_hasher.device_hasher(nullate::DYNAMIC{has_any_nulls});
+  auto const d_hasher          = hasher_adapter{d_haystack_hasher, d_needle_hasher};
+
+  auto const self_equal = cudf::experimental::row::equality::self_comparator(preprocessed_haystack);
+  auto const two_table_equal = cudf::experimental::row::equality::two_table_comparator(
+    preprocessed_haystack, preprocessed_needles);
 
   // The output vector.
   auto contained = rmm::device_uvector<bool>(needles.num_rows(), stream, mr);
 
-  auto const preprocessed_needles =
-    cudf::experimental::row::equality::preprocessed_table::create(needles, stream);
-  // Check existence for each row of the needles table in the haystack table.
-  {
-    auto const needles_it = cudf::detail::make_counting_transform_iterator(
-      size_type{0}, [] __device__(auto const idx) { return rhs_index_type{idx}; });
-
-    auto const hasher = cudf::experimental::row::hash::row_hasher(preprocessed_needles);
-    auto const d_hasher =
-      strong_index_hasher_adapter{hasher.device_hasher(nullate::DYNAMIC{has_any_nulls})};
-
-    auto const comparator = cudf::experimental::row::equality::two_table_comparator(
-      preprocessed_haystack, preprocessed_needles);
-
-    auto const check_contains = [&](auto const value_comp) {
-      if (cudf::detail::has_nested_columns(haystack) or cudf::detail::has_nested_columns(needles)) {
-        auto const d_eqcomp =
-          comparator.equal_to<true>(nullate::DYNAMIC{has_any_nulls}, compare_nulls, value_comp);
-        map.contains(needles_it,
-                     needles_it + needles.num_rows(),
-                     contained.begin(),
-                     d_hasher,
-                     d_eqcomp,
-                     stream.value());
+  auto const haystack_iter = cudf::detail::make_counting_transform_iterator(
+    size_type{0}, [] __device__(auto idx) { return lhs_index_type{idx}; });
+  auto const needles_iter = cudf::detail::make_counting_transform_iterator(
+    size_type{0}, [] __device__(auto idx) { return rhs_index_type{idx}; });
+
+  auto const helper_func =
+    [&](auto const& d_self_equal, auto const& d_two_table_equal, auto const& probing_scheme) {
+      auto const d_equal = comparator_adapter{d_self_equal, d_two_table_equal};
+
+      auto set = cuco::experimental::static_set{
+        cuco::experimental::extent{compute_hash_table_size(haystack.num_rows())},
+        cuco::empty_key{lhs_index_type{-1}},
+        d_equal,
+        probing_scheme,
+        detail::hash_table_allocator_type{default_allocator<lhs_index_type>{}, stream},
+        stream.value()};
+
+      if (haystack_has_nulls && compare_nulls == null_equality::UNEQUAL) {
+        auto const bitmask_buffer_and_ptr = build_row_bitmask(haystack, stream);
+        auto const row_bitmask_ptr        = bitmask_buffer_and_ptr.second;
+
+        // If the haystack table has nulls but they are compared unequal, don't insert them.
+        // Otherwise, it was known to cause performance issue:
+        // - https://github.com/rapidsai/cudf/pull/6943
+        // - https://github.com/rapidsai/cudf/pull/8277
+        set.insert_if_async(haystack_iter,
+                            haystack_iter + haystack.num_rows(),
+                            thrust::counting_iterator<size_type>(0),  // stencil
+                            row_is_valid{row_bitmask_ptr},
+                            stream.value());
       } else {
-        auto const d_eqcomp =
-          comparator.equal_to<false>(nullate::DYNAMIC{has_any_nulls}, compare_nulls, value_comp);
-        map.contains(needles_it,
-                     needles_it + needles.num_rows(),
-                     contained.begin(),
-                     d_hasher,
-                     d_eqcomp,
-                     stream.value());
+        set.insert_async(haystack_iter, haystack_iter + haystack.num_rows(), stream.value());
+      }
+
+      if (needles_has_nulls && compare_nulls == null_equality::UNEQUAL) {
+        auto const bitmask_buffer_and_ptr = build_row_bitmask(needles, stream);
+        auto const row_bitmask_ptr        = bitmask_buffer_and_ptr.second;
+        set.contains_if_async(needles_iter,
+                              needles_iter + needles.num_rows(),
+                              thrust::counting_iterator<size_type>(0),  // stencil
+                              row_is_valid{row_bitmask_ptr},
+                              contained.begin(),
+                              stream.value());
+      } else {
+        set.contains_async(
+          needles_iter, needles_iter + needles.num_rows(), contained.begin(), stream.value());
       }
     };
 
-    dispatch_nan_comparator(compare_nans, check_contains);
+  if (cudf::detail::has_nested_columns(haystack)) {
+    dispatch_nan_comparator<true>(compare_nulls,
+                                  compare_nans,
+                                  haystack_has_nulls,
+                                  has_any_nulls,
+                                  self_equal,
+                                  two_table_equal,
+                                  d_hasher,
+                                  helper_func);
+  } else {
+    dispatch_nan_comparator<false>(compare_nulls,
+                                   compare_nans,
+                                   haystack_has_nulls,
+                                   has_any_nulls,
+                                   self_equal,
+                                   two_table_equal,
+                                   d_hasher,
+                                   helper_func);
   }
 
   return contained;
diff --git a/cpp/src/sort/is_sorted.cu b/cpp/src/sort/is_sorted.cu
index 25c594e9e74..39476a2f534 100644
--- a/cpp/src/sort/is_sorted.cu
+++ b/cpp/src/sort/is_sorted.cu
@@ -73,7 +73,8 @@ bool is_sorted(cudf::table_view const& in,
 
 bool is_sorted(cudf::table_view const& in,
                std::vector<order> const& column_order,
-               std::vector<null_order> const& null_precedence)
+               std::vector<null_order> const& null_precedence,
+               rmm::cuda_stream_view stream)
 {
   CUDF_FUNC_RANGE();
   if (in.num_columns() == 0 || in.num_rows() == 0) { return true; }
@@ -89,7 +90,7 @@ bool is_sorted(cudf::table_view const& in,
       "Number of columns in the table doesn't match the vector null_precedence's size .\n");
   }
 
-  return detail::is_sorted(in, column_order, null_precedence, cudf::get_default_stream());
+  return detail::is_sorted(in, column_order, null_precedence, stream);
 }
 
 }  // namespace cudf
diff --git a/cpp/src/sort/rank.cu b/cpp/src/sort/rank.cu
index fd65e38d467..3ead8cfcbaa 100644
--- a/cpp/src/sort/rank.cu
+++ b/cpp/src/sort/rank.cu
@@ -366,16 +366,11 @@ std::unique_ptr<column> rank(column_view const& input,
                              null_policy null_handling,
                              null_order null_precedence,
                              bool percentage,
+                             rmm::cuda_stream_view stream,
                              rmm::mr::device_memory_resource* mr)
 {
   CUDF_FUNC_RANGE();
-  return detail::rank(input,
-                      method,
-                      column_order,
-                      null_handling,
-                      null_precedence,
-                      percentage,
-                      cudf::get_default_stream(),
-                      mr);
+  return detail::rank(
+    input, method, column_order, null_handling, null_precedence, percentage, stream, mr);
 }
 }  // namespace cudf
diff --git a/cpp/src/sort/segmented_sort.cu b/cpp/src/sort/segmented_sort.cu
index 38d008c120c..d9457341bd2 100644
--- a/cpp/src/sort/segmented_sort.cu
+++ b/cpp/src/sort/segmented_sort.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2021-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -81,11 +81,12 @@ std::unique_ptr<column> segmented_sorted_order(table_view const& keys,
                                                column_view const& segment_offsets,
                                                std::vector<order> const& column_order,
                                                std::vector<null_order> const& null_precedence,
+                                               rmm::cuda_stream_view stream,
                                                rmm::mr::device_memory_resource* mr)
 {
   CUDF_FUNC_RANGE();
   return detail::segmented_sorted_order(
-    keys, segment_offsets, column_order, null_precedence, cudf::get_default_stream(), mr);
+    keys, segment_offsets, column_order, null_precedence, stream, mr);
 }
 
 std::unique_ptr<table> segmented_sort_by_key(table_view const& values,
@@ -93,11 +94,12 @@ std::unique_ptr<table> segmented_sort_by_key(table_view const& values,
                                              column_view const& segment_offsets,
                                              std::vector<order> const& column_order,
                                              std::vector<null_order> const& null_precedence,
+                                             rmm::cuda_stream_view stream,
                                              rmm::mr::device_memory_resource* mr)
 {
   CUDF_FUNC_RANGE();
   return detail::segmented_sort_by_key(
-    values, keys, segment_offsets, column_order, null_precedence, cudf::get_default_stream(), mr);
+    values, keys, segment_offsets, column_order, null_precedence, stream, mr);
 }
 
 }  // namespace cudf
diff --git a/cpp/src/sort/segmented_sort_impl.cuh b/cpp/src/sort/segmented_sort_impl.cuh
index 37664f33762..5d11bf055f1 100644
--- a/cpp/src/sort/segmented_sort_impl.cuh
+++ b/cpp/src/sort/segmented_sort_impl.cuh
@@ -166,7 +166,7 @@ std::unique_ptr<column> fast_segmented_sorted_order(column_view const& input,
   // Unfortunately, CUB's segmented sort functions cannot accept iterators.
   // We have to build a pre-filled sequence of indices as input.
   auto sorted_indices =
-    cudf::detail::sequence(input.size(), numeric_scalar<size_type>{0}, stream, mr);
+    cudf::detail::sequence(input.size(), numeric_scalar<size_type>{0, true, stream}, stream, mr);
   auto indices_view = sorted_indices->mutable_view();
 
   cudf::type_dispatcher<dispatch_storage_type>(input.type(),
diff --git a/cpp/src/sort/sort.cu b/cpp/src/sort/sort.cu
index 25b95af4f83..46edae798d4 100644
--- a/cpp/src/sort/sort.cu
+++ b/cpp/src/sort/sort.cu
@@ -109,30 +109,32 @@ std::unique_ptr<table> sort(table_view const& input,
 std::unique_ptr<column> sorted_order(table_view const& input,
                                      std::vector<order> const& column_order,
                                      std::vector<null_order> const& null_precedence,
+                                     rmm::cuda_stream_view stream,
                                      rmm::mr::device_memory_resource* mr)
 {
   CUDF_FUNC_RANGE();
-  return detail::sorted_order(input, column_order, null_precedence, cudf::get_default_stream(), mr);
+  return detail::sorted_order(input, column_order, null_precedence, stream, mr);
 }
 
 std::unique_ptr<table> sort(table_view const& input,
                             std::vector<order> const& column_order,
                             std::vector<null_order> const& null_precedence,
+                            rmm::cuda_stream_view stream,
                             rmm::mr::device_memory_resource* mr)
 {
   CUDF_FUNC_RANGE();
-  return detail::sort(input, column_order, null_precedence, cudf::get_default_stream(), mr);
+  return detail::sort(input, column_order, null_precedence, stream, mr);
 }
 
 std::unique_ptr<table> sort_by_key(table_view const& values,
                                    table_view const& keys,
                                    std::vector<order> const& column_order,
                                    std::vector<null_order> const& null_precedence,
+                                   rmm::cuda_stream_view stream,
                                    rmm::mr::device_memory_resource* mr)
 {
   CUDF_FUNC_RANGE();
-  return detail::sort_by_key(
-    values, keys, column_order, null_precedence, cudf::get_default_stream(), mr);
+  return detail::sort_by_key(values, keys, column_order, null_precedence, stream, mr);
 }
 
 }  // namespace cudf
diff --git a/cpp/src/sort/stable_segmented_sort.cu b/cpp/src/sort/stable_segmented_sort.cu
index 40df1b50279..4725d65e05d 100644
--- a/cpp/src/sort/stable_segmented_sort.cu
+++ b/cpp/src/sort/stable_segmented_sort.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2021-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -55,11 +55,12 @@ std::unique_ptr<column> stable_segmented_sorted_order(
   column_view const& segment_offsets,
   std::vector<order> const& column_order,
   std::vector<null_order> const& null_precedence,
+  rmm::cuda_stream_view stream,
   rmm::mr::device_memory_resource* mr)
 {
   CUDF_FUNC_RANGE();
   return detail::stable_segmented_sorted_order(
-    keys, segment_offsets, column_order, null_precedence, cudf::get_default_stream(), mr);
+    keys, segment_offsets, column_order, null_precedence, stream, mr);
 }
 
 std::unique_ptr<table> stable_segmented_sort_by_key(table_view const& values,
@@ -67,11 +68,12 @@ std::unique_ptr<table> stable_segmented_sort_by_key(table_view const& values,
                                                     column_view const& segment_offsets,
                                                     std::vector<order> const& column_order,
                                                     std::vector<null_order> const& null_precedence,
+                                                    rmm::cuda_stream_view stream,
                                                     rmm::mr::device_memory_resource* mr)
 {
   CUDF_FUNC_RANGE();
   return detail::stable_segmented_sort_by_key(
-    values, keys, segment_offsets, column_order, null_precedence, cudf::get_default_stream(), mr);
+    values, keys, segment_offsets, column_order, null_precedence, stream, mr);
 }
 
 }  // namespace cudf
diff --git a/cpp/src/sort/stable_sort.cu b/cpp/src/sort/stable_sort.cu
index 6f5678c4168..cf602dcf1a9 100644
--- a/cpp/src/sort/stable_sort.cu
+++ b/cpp/src/sort/stable_sort.cu
@@ -62,22 +62,22 @@ std::unique_ptr<table> stable_sort_by_key(table_view const& values,
 std::unique_ptr<column> stable_sorted_order(table_view const& input,
                                             std::vector<order> const& column_order,
                                             std::vector<null_order> const& null_precedence,
+                                            rmm::cuda_stream_view stream,
                                             rmm::mr::device_memory_resource* mr)
 {
   CUDF_FUNC_RANGE();
-  return detail::stable_sorted_order(
-    input, column_order, null_precedence, cudf::get_default_stream(), mr);
+  return detail::stable_sorted_order(input, column_order, null_precedence, stream, mr);
 }
 
 std::unique_ptr<table> stable_sort_by_key(table_view const& values,
                                           table_view const& keys,
                                           std::vector<order> const& column_order,
                                           std::vector<null_order> const& null_precedence,
+                                          rmm::cuda_stream_view stream,
                                           rmm::mr::device_memory_resource* mr)
 {
   CUDF_FUNC_RANGE();
-  return detail::stable_sort_by_key(
-    values, keys, column_order, null_precedence, cudf::get_default_stream(), mr);
+  return detail::stable_sort_by_key(values, keys, column_order, null_precedence, stream, mr);
 }
 
 }  // namespace cudf
diff --git a/cpp/src/stream_compaction/stream_compaction_common.hpp b/cpp/src/stream_compaction/stream_compaction_common.hpp
index 58d958d2ff4..18c531e3e69 100644
--- a/cpp/src/stream_compaction/stream_compaction_common.hpp
+++ b/cpp/src/stream_compaction/stream_compaction_common.hpp
@@ -15,12 +15,11 @@
  */
 #pragma once
 
+#include <cudf/hashing/detail/hash_allocator.cuh>
+#include <cudf/hashing/detail/helper_functions.cuh>
 #include <cudf/table/row_operators.cuh>
 #include <cudf/table/table_device_view.cuh>
 
-#include <hash/hash_allocator.cuh>
-#include <hash/helper_functions.cuh>
-
 #include <rmm/mr/device/polymorphic_allocator.hpp>
 
 #include <cuco/static_map.cuh>
diff --git a/cpp/src/text/generate_ngrams.cu b/cpp/src/text/generate_ngrams.cu
index 938fd45246d..5f2f4d021a4 100644
--- a/cpp/src/text/generate_ngrams.cu
+++ b/cpp/src/text/generate_ngrams.cu
@@ -150,10 +150,11 @@ std::unique_ptr<cudf::column> generate_ngrams(cudf::strings_column_view const& s
 std::unique_ptr<cudf::column> generate_ngrams(cudf::strings_column_view const& strings,
                                               cudf::size_type ngrams,
                                               cudf::string_scalar const& separator,
+                                              rmm::cuda_stream_view stream,
                                               rmm::mr::device_memory_resource* mr)
 {
   CUDF_FUNC_RANGE();
-  return detail::generate_ngrams(strings, ngrams, separator, cudf::get_default_stream(), mr);
+  return detail::generate_ngrams(strings, ngrams, separator, stream, mr);
 }
 
 namespace detail {
@@ -317,18 +318,20 @@ std::unique_ptr<cudf::column> hash_character_ngrams(cudf::strings_column_view co
 
 std::unique_ptr<cudf::column> generate_character_ngrams(cudf::strings_column_view const& strings,
                                                         cudf::size_type ngrams,
+                                                        rmm::cuda_stream_view stream,
                                                         rmm::mr::device_memory_resource* mr)
 {
   CUDF_FUNC_RANGE();
-  return detail::generate_character_ngrams(strings, ngrams, cudf::get_default_stream(), mr);
+  return detail::generate_character_ngrams(strings, ngrams, stream, mr);
 }
 
 std::unique_ptr<cudf::column> hash_character_ngrams(cudf::strings_column_view const& strings,
                                                     cudf::size_type ngrams,
+                                                    rmm::cuda_stream_view stream,
                                                     rmm::mr::device_memory_resource* mr)
 {
   CUDF_FUNC_RANGE();
-  return detail::hash_character_ngrams(strings, ngrams, cudf::get_default_stream(), mr);
+  return detail::hash_character_ngrams(strings, ngrams, stream, mr);
 }
 
 }  // namespace nvtext
diff --git a/cpp/src/text/jaccard.cu b/cpp/src/text/jaccard.cu
index 5b55745c2c7..95324847ea0 100644
--- a/cpp/src/text/jaccard.cu
+++ b/cpp/src/text/jaccard.cu
@@ -107,7 +107,7 @@ rmm::device_uvector<cudf::size_type> compute_unique_counts(cudf::column_view con
  *
  * This is called with a warp per row
  */
-struct sorted_interset_fn {
+struct sorted_intersect_fn {
   cudf::column_device_view const d_input1;
   cudf::column_device_view const d_input2;
   cudf::size_type* d_results;
@@ -151,7 +151,7 @@ rmm::device_uvector<cudf::size_type> compute_intersect_counts(cudf::column_view
   auto const d_input1 = cudf::column_device_view::create(input1, stream);
   auto const d_input2 = cudf::column_device_view::create(input2, stream);
   auto d_results      = rmm::device_uvector<cudf::size_type>(input1.size(), stream);
-  sorted_interset_fn fn{*d_input1, *d_input2, d_results.data()};
+  sorted_intersect_fn fn{*d_input1, *d_input2, d_results.data()};
   thrust::for_each_n(rmm::exec_policy(stream),
                      thrust::counting_iterator<cudf::size_type>(0),
                      input1.size() * cudf::detail::warp_size,
diff --git a/cpp/src/text/ngrams_tokenize.cu b/cpp/src/text/ngrams_tokenize.cu
index fd1cbf99221..73d85513e95 100644
--- a/cpp/src/text/ngrams_tokenize.cu
+++ b/cpp/src/text/ngrams_tokenize.cu
@@ -265,11 +265,11 @@ std::unique_ptr<cudf::column> ngrams_tokenize(cudf::strings_column_view const& s
                                               cudf::size_type ngrams,
                                               cudf::string_scalar const& delimiter,
                                               cudf::string_scalar const& separator,
+                                              rmm::cuda_stream_view stream,
                                               rmm::mr::device_memory_resource* mr)
 {
   CUDF_FUNC_RANGE();
-  return detail::ngrams_tokenize(
-    strings, ngrams, delimiter, separator, cudf::get_default_stream(), mr);
+  return detail::ngrams_tokenize(strings, ngrams, delimiter, separator, stream, mr);
 }
 
 }  // namespace nvtext
diff --git a/cpp/src/text/subword/bpe_tokenizer.cuh b/cpp/src/text/subword/bpe_tokenizer.cuh
index 83aa22aaae9..2fa879ea734 100644
--- a/cpp/src/text/subword/bpe_tokenizer.cuh
+++ b/cpp/src/text/subword/bpe_tokenizer.cuh
@@ -18,10 +18,9 @@
 
 #include <nvtext/bpe_tokenize.hpp>
 
-#include <hash/hash_allocator.cuh>
-
 #include <cudf/column/column.hpp>
 #include <cudf/column/column_device_view.cuh>
+#include <cudf/hashing/detail/hash_allocator.cuh>
 #include <cudf/hashing/detail/murmurhash3_x86_32.cuh>
 #include <cudf/strings/string_view.cuh>
 
diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt
index d1e50442058..68ff6c54c99 100644
--- a/cpp/tests/CMakeLists.txt
+++ b/cpp/tests/CMakeLists.txt
@@ -621,17 +621,21 @@ ConfigureTest(
   STREAM_IDENTIFICATION_TEST identify_stream_usage/test_default_stream_identification.cu
 )
 
-ConfigureTest(STREAM_HASHING_TEST streams/hash_test.cpp STREAM_MODE testing)
-ConfigureTest(STREAM_COPYING_TEST streams/copying_test.cpp STREAM_MODE testing)
-ConfigureTest(STREAM_GROUPBY_TEST streams/groupby_test.cpp STREAM_MODE testing)
 ConfigureTest(STREAM_CONCATENATE_TEST streams/concatenate_test.cpp STREAM_MODE testing)
+ConfigureTest(STREAM_COPYING_TEST streams/copying_test.cpp STREAM_MODE testing)
 ConfigureTest(STREAM_FILLING_TEST streams/filling_test.cpp STREAM_MODE testing)
+ConfigureTest(STREAM_GROUPBY_TEST streams/groupby_test.cpp STREAM_MODE testing)
+ConfigureTest(STREAM_HASHING_TEST streams/hash_test.cpp STREAM_MODE testing)
+ConfigureTest(STREAM_INTEROP_TEST streams/interop_test.cpp STREAM_MODE testing)
 ConfigureTest(STREAM_REPLACE_TEST streams/replace_test.cpp STREAM_MODE testing)
 ConfigureTest(STREAM_SEARCH_TEST streams/search_test.cpp STREAM_MODE testing)
+ConfigureTest(STREAM_DICTIONARY_TEST streams/dictionary_test.cpp STREAM_MODE testing)
 ConfigureTest(
   STREAM_STRINGS_TEST streams/strings/case_test.cpp streams/strings/find_test.cpp STREAM_MODE
   testing
 )
+ConfigureTest(STREAM_SORTING_TEST streams/sorting_test.cpp STREAM_MODE testing)
+ConfigureTest(STREAM_TEXT_TEST streams/text/ngrams_test.cpp STREAM_MODE testing)
 
 # ##################################################################################################
 # Install tests ####################################################################################
diff --git a/cpp/tests/interop/from_arrow_test.cpp b/cpp/tests/interop/from_arrow_test.cpp
index 9a5cc3733af..a898106a5b2 100644
--- a/cpp/tests/interop/from_arrow_test.cpp
+++ b/cpp/tests/interop/from_arrow_test.cpp
@@ -456,3 +456,98 @@ INSTANTIATE_TEST_CASE_P(FromArrowTest,
                                           std::make_tuple(0, 0),
                                           std::make_tuple(0, 3000),
                                           std::make_tuple(10000, 10000)));
+
+template <typename T>
+struct FromArrowNumericScalarTest : public cudf::test::BaseFixture {};
+
+using NumericTypesNotBool =
+  cudf::test::Concat<cudf::test::IntegralTypesNotBool, cudf::test::FloatingPointTypes>;
+TYPED_TEST_SUITE(FromArrowNumericScalarTest, NumericTypesNotBool);
+
+TYPED_TEST(FromArrowNumericScalarTest, Basic)
+{
+  TypeParam const value{42};
+  auto const arrow_scalar = arrow::MakeScalar(value);
+  auto const cudf_scalar  = cudf::from_arrow(*arrow_scalar);
+  auto const cudf_numeric_scalar =
+    dynamic_cast<cudf::numeric_scalar<TypeParam>*>(cudf_scalar.get());
+  if (cudf_numeric_scalar == nullptr) { CUDF_FAIL("Attempted to test with a non-numeric type."); }
+  EXPECT_EQ(cudf_numeric_scalar->type(), cudf::data_type(cudf::type_to_id<TypeParam>()));
+  EXPECT_EQ(cudf_numeric_scalar->value(), value);
+}
+
+struct FromArrowDecimalScalarTest : public cudf::test::BaseFixture {};
+
+// Only testing Decimal128 because that's the only size cudf and arrow have in common.
+TEST_F(FromArrowDecimalScalarTest, Basic)
+{
+  auto const value{42};
+  auto const precision{8};
+  auto const scale{4};
+  auto arrow_scalar = arrow::Decimal128Scalar(value, arrow::decimal128(precision, -scale));
+  auto cudf_scalar  = cudf::from_arrow(arrow_scalar);
+
+  // Arrow offers a minimum of 128 bits for the Decimal type.
+  auto const cudf_decimal_scalar =
+    dynamic_cast<cudf::fixed_point_scalar<numeric::decimal128>*>(cudf_scalar.get());
+  EXPECT_EQ(cudf_decimal_scalar->type(),
+            cudf::data_type(cudf::type_to_id<numeric::decimal128>(), scale));
+  EXPECT_EQ(cudf_decimal_scalar->value(), value);
+}
+
+struct FromArrowStringScalarTest : public cudf::test::BaseFixture {};
+
+TEST_F(FromArrowStringScalarTest, Basic)
+{
+  auto const value        = std::string("hello world");
+  auto const arrow_scalar = arrow::StringScalar(value);
+  auto const cudf_scalar  = cudf::from_arrow(arrow_scalar);
+
+  auto const cudf_string_scalar = dynamic_cast<cudf::string_scalar*>(cudf_scalar.get());
+  EXPECT_EQ(cudf_string_scalar->type(), cudf::data_type(cudf::type_id::STRING));
+  EXPECT_EQ(cudf_string_scalar->to_string(), value);
+}
+
+struct FromArrowListScalarTest : public cudf::test::BaseFixture {};
+
+TEST_F(FromArrowListScalarTest, Basic)
+{
+  std::vector<int64_t> host_values = {1, 2, 3, 5, 6, 7, 8};
+  std::vector<bool> host_validity  = {true, true, true, false, true, true, true};
+
+  arrow::Int64Builder builder;
+  auto const status      = builder.AppendValues(host_values, host_validity);
+  auto const maybe_array = builder.Finish();
+  auto const array       = *maybe_array;
+
+  auto const arrow_scalar = arrow::ListScalar(array);
+  auto const cudf_scalar  = cudf::from_arrow(arrow_scalar);
+
+  auto const cudf_list_scalar = dynamic_cast<cudf::list_scalar*>(cudf_scalar.get());
+  EXPECT_EQ(cudf_list_scalar->type(), cudf::data_type(cudf::type_id::LIST));
+
+  cudf::test::fixed_width_column_wrapper<int64_t> const lhs(
+    host_values.begin(), host_values.end(), host_validity.begin());
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(lhs, cudf_list_scalar->view());
+}
+
+struct FromArrowStructScalarTest : public cudf::test::BaseFixture {};
+
+TEST_F(FromArrowStructScalarTest, Basic)
+{
+  int64_t const value{42};
+  auto const underlying_arrow_scalar = arrow::MakeScalar(value);
+
+  auto const field        = arrow::field("", underlying_arrow_scalar->type);
+  auto const arrow_type   = arrow::struct_({field});
+  auto const arrow_scalar = arrow::StructScalar({underlying_arrow_scalar}, arrow_type);
+  auto const cudf_scalar  = cudf::from_arrow(arrow_scalar);
+
+  auto const cudf_struct_scalar = dynamic_cast<cudf::struct_scalar*>(cudf_scalar.get());
+  EXPECT_EQ(cudf_struct_scalar->type(), cudf::data_type(cudf::type_id::STRUCT));
+
+  cudf::test::fixed_width_column_wrapper<int64_t> const col({value});
+  cudf::table_view const lhs({col});
+
+  CUDF_TEST_EXPECT_TABLES_EQUAL(lhs, cudf_struct_scalar->view());
+}
diff --git a/cpp/tests/interop/to_arrow_test.cpp b/cpp/tests/interop/to_arrow_test.cpp
index 97d80984272..6bb4cdfd747 100644
--- a/cpp/tests/interop/to_arrow_test.cpp
+++ b/cpp/tests/interop/to_arrow_test.cpp
@@ -22,6 +22,7 @@
 #include <cudf/dictionary/dictionary_column_view.hpp>
 #include <cudf/dictionary/encode.hpp>
 #include <cudf/interop.hpp>
+#include <cudf/scalar/scalar_factories.hpp>
 #include <cudf/table/table.hpp>
 #include <cudf/table/table_view.hpp>
 #include <cudf/types.hpp>
@@ -578,4 +579,106 @@ INSTANTIATE_TEST_CASE_P(ToArrowTest,
                                           std::make_tuple(0, 0),
                                           std::make_tuple(0, 3000)));
 
+template <typename T>
+struct ToArrowNumericScalarTest : public cudf::test::BaseFixture {};
+
+using NumericTypesNotBool =
+  cudf::test::Concat<cudf::test::IntegralTypesNotBool, cudf::test::FloatingPointTypes>;
+TYPED_TEST_SUITE(ToArrowNumericScalarTest, NumericTypesNotBool);
+
+TYPED_TEST(ToArrowNumericScalarTest, Basic)
+{
+  TypeParam const value{42};
+  auto const cudf_scalar = cudf::make_fixed_width_scalar<TypeParam>(value);
+
+  cudf::column_metadata const metadata{""};
+  auto const arrow_scalar = cudf::to_arrow(*cudf_scalar, metadata);
+
+  auto const ref_arrow_scalar = arrow::MakeScalar(value);
+  EXPECT_TRUE(arrow_scalar->Equals(*ref_arrow_scalar));
+}
+
+struct ToArrowDecimalScalarTest : public cudf::test::BaseFixture {};
+
+// Only testing Decimal128 because that's the only size cudf and arrow have in common.
+TEST_F(ToArrowDecimalScalarTest, Basic)
+{
+  auto const value{42};
+  auto const precision{18};  // cudf will convert to the widest-precision Arrow scalar of the type
+  int32_t const scale{4};
+
+  auto const cudf_scalar =
+    cudf::make_fixed_point_scalar<numeric::decimal128>(value, numeric::scale_type{scale});
+
+  cudf::column_metadata const metadata{""};
+  auto const arrow_scalar = cudf::to_arrow(*cudf_scalar, metadata);
+
+  auto const maybe_ref_arrow_scalar =
+    arrow::MakeScalar(arrow::decimal128(precision, -scale), value);
+  if (!maybe_ref_arrow_scalar.ok()) { CUDF_FAIL("Failed to construct reference scalar"); }
+  auto const ref_arrow_scalar = *maybe_ref_arrow_scalar;
+  EXPECT_TRUE(arrow_scalar->Equals(*ref_arrow_scalar));
+}
+
+struct ToArrowStringScalarTest : public cudf::test::BaseFixture {};
+
+TEST_F(ToArrowStringScalarTest, Basic)
+{
+  std::string const value{"hello world"};
+  auto const cudf_scalar = cudf::make_string_scalar(value);
+  cudf::column_metadata const metadata{""};
+  auto const arrow_scalar = cudf::to_arrow(*cudf_scalar, metadata);
+
+  auto const ref_arrow_scalar = arrow::MakeScalar(value);
+  EXPECT_TRUE(arrow_scalar->Equals(*ref_arrow_scalar));
+}
+
+struct ToArrowListScalarTest : public cudf::test::BaseFixture {};
+
+TEST_F(ToArrowListScalarTest, Basic)
+{
+  std::vector<int64_t> const host_values = {1, 2, 3, 5, 6, 7, 8};
+  std::vector<bool> const host_validity  = {true, true, true, false, true, true, true};
+
+  cudf::test::fixed_width_column_wrapper<int64_t> const col(
+    host_values.begin(), host_values.end(), host_validity.begin());
+
+  auto const cudf_scalar = cudf::make_list_scalar(col);
+
+  cudf::column_metadata const metadata{""};
+  auto const arrow_scalar = cudf::to_arrow(*cudf_scalar, metadata);
+
+  arrow::Int64Builder builder;
+  auto const status      = builder.AppendValues(host_values, host_validity);
+  auto const maybe_array = builder.Finish();
+  auto const array       = *maybe_array;
+
+  auto const ref_arrow_scalar = arrow::ListScalar(array);
+
+  EXPECT_TRUE(arrow_scalar->Equals(ref_arrow_scalar));
+}
+
+struct ToArrowStructScalarTest : public cudf::test::BaseFixture {};
+
+TEST_F(ToArrowStructScalarTest, Basic)
+{
+  int64_t const value{42};
+  auto const field_name{"a"};
+
+  cudf::test::fixed_width_column_wrapper<int64_t> const col{value};
+  cudf::table_view const tbl({col});
+  auto const cudf_scalar = cudf::make_struct_scalar(tbl);
+
+  cudf::column_metadata metadata{""};
+  metadata.children_meta.emplace_back(field_name);
+  auto const arrow_scalar = cudf::to_arrow(*cudf_scalar, metadata);
+
+  auto const underlying_arrow_scalar = arrow::MakeScalar(value);
+  auto const field            = arrow::field(field_name, underlying_arrow_scalar->type, false);
+  auto const arrow_type       = arrow::struct_({field});
+  auto const ref_arrow_scalar = arrow::StructScalar({underlying_arrow_scalar}, arrow_type);
+
+  EXPECT_TRUE(arrow_scalar->Equals(ref_arrow_scalar));
+}
+
 CUDF_TEST_PROGRAM_MAIN()
diff --git a/cpp/tests/streams/dictionary_test.cpp b/cpp/tests/streams/dictionary_test.cpp
new file mode 100644
index 00000000000..f48e64c078e
--- /dev/null
+++ b/cpp/tests/streams/dictionary_test.cpp
@@ -0,0 +1,105 @@
+/*
+ * Copyright (c) 2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <cudf/dictionary/dictionary_column_view.hpp>
+#include <cudf/dictionary/dictionary_factories.hpp>
+#include <cudf/dictionary/encode.hpp>
+#include <cudf/dictionary/search.hpp>
+#include <cudf/dictionary/update_keys.hpp>
+
+#include <cudf_test/base_fixture.hpp>
+#include <cudf_test/column_wrapper.hpp>
+#include <cudf_test/default_stream.hpp>
+
+class DictionaryTest : public cudf::test::BaseFixture {};
+
+TEST_F(DictionaryTest, Encode)
+{
+  cudf::test::fixed_width_column_wrapper<int> col({1, 2, 3, 4, 5});
+  cudf::data_type int32_type(cudf::type_id::UINT32);
+  cudf::column_view col_view = col;
+  cudf::dictionary::encode(col_view, int32_type, cudf::test::get_default_stream());
+}
+
+TEST_F(DictionaryTest, Decode)
+{
+  // keys = {0, 2, 6}, indices = {0, 1, 1, 2, 2}
+  std::vector<int32_t> elements{0, 2, 2, 6, 6};
+  cudf::test::dictionary_column_wrapper<int32_t> dict_col(elements.begin(), elements.end());
+  cudf::dictionary_column_view dict_col_view = dict_col;
+  cudf::dictionary::decode(dict_col_view, cudf::test::get_default_stream());
+}
+
+TEST_F(DictionaryTest, GetIndex)
+{
+  std::vector<int32_t> elements{0, 2, 2, 6, 6};
+  cudf::test::dictionary_column_wrapper<int32_t> dict_col(elements.begin(), elements.end());
+  cudf::dictionary_column_view dict_col_view = dict_col;
+  cudf::numeric_scalar<int32_t> key_scalar(2, true, cudf::test::get_default_stream());
+  cudf::dictionary::get_index(dict_col_view, key_scalar, cudf::test::get_default_stream());
+}
+
+TEST_F(DictionaryTest, AddKeys)
+{
+  std::vector<int32_t> elements{0, 2, 2, 6, 6};
+  cudf::test::dictionary_column_wrapper<int32_t> dict_col(elements.begin(), elements.end());
+  cudf::dictionary_column_view dict_col_view = dict_col;
+  cudf::test::fixed_width_column_wrapper<int> new_keys_col({8, 9});
+  cudf::dictionary::add_keys(dict_col_view, new_keys_col, cudf::test::get_default_stream());
+}
+
+TEST_F(DictionaryTest, RemoveKeys)
+{
+  std::vector<int32_t> elements{0, 2, 2, 6, 6};
+  cudf::test::dictionary_column_wrapper<int32_t> dict_col(elements.begin(), elements.end());
+  cudf::dictionary_column_view dict_col_view = dict_col;
+  cudf::test::fixed_width_column_wrapper<int> keys_to_remove_col({2});
+  cudf::dictionary::remove_keys(
+    dict_col_view, keys_to_remove_col, cudf::test::get_default_stream());
+}
+
+TEST_F(DictionaryTest, RemoveUnsedKeys)
+{
+  std::vector<int32_t> elements{0, 2, 2, 6, 6};
+  cudf::test::dictionary_column_wrapper<int32_t> dict_col(elements.begin(), elements.end());
+  cudf::dictionary_column_view dict_col_view = dict_col;
+  cudf::dictionary::remove_unused_keys(dict_col_view, cudf::test::get_default_stream());
+}
+
+TEST_F(DictionaryTest, SetKeys)
+{
+  std::vector<int32_t> elements{0, 2, 2, 6, 6};
+  cudf::test::dictionary_column_wrapper<int32_t> dict_col(elements.begin(), elements.end());
+  cudf::dictionary_column_view dict_col_view = dict_col;
+  cudf::test::fixed_width_column_wrapper<int> keys_col({2, 6});
+  cudf::dictionary::set_keys(dict_col_view, keys_col, cudf::test::get_default_stream());
+}
+
+TEST_F(DictionaryTest, MatchDictionaries)
+{
+  std::vector<int32_t> elements_a{0, 2, 2, 6, 6};
+  cudf::test::dictionary_column_wrapper<int32_t> dict_col_a(elements_a.begin(), elements_a.end());
+  cudf::dictionary_column_view dict_col_view_a = dict_col_a;
+
+  std::vector<int32_t> elements_b{1, 3, 4, 5, 5};
+  cudf::test::dictionary_column_wrapper<int32_t> dict_col_b(elements_b.begin(), elements_b.end());
+  cudf::dictionary_column_view dict_col_view_b = dict_col_b;
+
+  std::vector<cudf::dictionary_column_view> dicts = {dict_col_view_a, dict_col_view_b};
+
+  cudf::test::fixed_width_column_wrapper<int> keys_col({2, 6});
+  cudf::dictionary::match_dictionaries(dicts, cudf::test::get_default_stream());
+}
diff --git a/cpp/tests/streams/interop_test.cpp b/cpp/tests/streams/interop_test.cpp
new file mode 100644
index 00000000000..7eac9e016eb
--- /dev/null
+++ b/cpp/tests/streams/interop_test.cpp
@@ -0,0 +1,68 @@
+/*
+ * Copyright (c) 2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <cudf/interop.hpp>
+#include <cudf/scalar/scalar.hpp>
+#include <cudf/scalar/scalar_factories.hpp>
+
+#include <cudf_test/base_fixture.hpp>
+#include <cudf_test/column_wrapper.hpp>
+#include <cudf_test/default_stream.hpp>
+
+struct ArrowTest : public cudf::test::BaseFixture {};
+
+TEST_F(ArrowTest, ToArrow)
+{
+  int32_t const value{42};
+  auto col = cudf::test::fixed_width_column_wrapper<int32_t>{{value}};
+  cudf::table_view tbl{{col}};
+
+  std::vector<cudf::column_metadata> metadata{{""}};
+  cudf::to_arrow(tbl, metadata, cudf::test::get_default_stream());
+}
+
+TEST_F(ArrowTest, FromArrow)
+{
+  std::vector<int64_t> host_values = {1, 2, 3, 5, 6, 7, 8};
+  std::vector<bool> host_validity  = {true, true, true, false, true, true, true};
+
+  arrow::Int64Builder builder;
+  auto status      = builder.AppendValues(host_values, host_validity);
+  auto maybe_array = builder.Finish();
+  auto array       = *maybe_array;
+
+  auto field  = arrow::field("", arrow::int32());
+  auto schema = arrow::schema({field});
+  auto table  = arrow::Table::Make(schema, {array});
+  cudf::from_arrow(*table, cudf::test::get_default_stream());
+}
+
+TEST_F(ArrowTest, ToArrowScalar)
+{
+  int32_t const value{42};
+  auto cudf_scalar =
+    cudf::make_fixed_width_scalar<int32_t>(value, cudf::test::get_default_stream());
+
+  cudf::column_metadata metadata{""};
+  cudf::to_arrow(*cudf_scalar, metadata, cudf::test::get_default_stream());
+}
+
+TEST_F(ArrowTest, FromArrowScalar)
+{
+  int32_t const value{42};
+  auto arrow_scalar = arrow::MakeScalar(value);
+  cudf::from_arrow(*arrow_scalar, cudf::test::get_default_stream());
+}
diff --git a/cpp/tests/streams/sorting_test.cpp b/cpp/tests/streams/sorting_test.cpp
new file mode 100644
index 00000000000..e481f95bded
--- /dev/null
+++ b/cpp/tests/streams/sorting_test.cpp
@@ -0,0 +1,132 @@
+/*
+ * Copyright (c) 2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <cudf/column/column_view.hpp>
+#include <cudf/sorting.hpp>
+
+#include <cudf_test/base_fixture.hpp>
+#include <cudf_test/column_wrapper.hpp>
+#include <cudf_test/default_stream.hpp>
+
+class SortingTest : public cudf::test::BaseFixture {};
+
+TEST_F(SortingTest, SortedOrder)
+{
+  cudf::test::fixed_width_column_wrapper<int32_t> const column{10, 20, 30, 40, 50};
+  cudf::table_view const tbl{{column}};
+
+  cudf::sorted_order(tbl, {}, {}, cudf::test::get_default_stream());
+}
+
+TEST_F(SortingTest, StableSortedOrder)
+{
+  cudf::test::fixed_width_column_wrapper<int32_t> const column{10, 20, 30, 40, 50};
+  cudf::table_view const tbl{{column}};
+
+  cudf::stable_sorted_order(tbl, {}, {}, cudf::test::get_default_stream());
+}
+
+TEST_F(SortingTest, IsSorted)
+{
+  cudf::test::fixed_width_column_wrapper<int32_t> const column{10, 20, 30, 40, 50};
+  cudf::table_view const tbl{{column}};
+
+  cudf::is_sorted(tbl, {}, {}, cudf::test::get_default_stream());
+}
+
+TEST_F(SortingTest, Sort)
+{
+  cudf::test::fixed_width_column_wrapper<int32_t> const column{10, 20, 30, 40, 50};
+  cudf::table_view const tbl{{column}};
+
+  cudf::sort(tbl, {}, {}, cudf::test::get_default_stream());
+}
+
+TEST_F(SortingTest, SortByKey)
+{
+  cudf::test::fixed_width_column_wrapper<int32_t> const values_col{10, 20, 30, 40, 50};
+  cudf::table_view const values{{values_col}};
+  cudf::test::fixed_width_column_wrapper<int32_t> const keys_col{10, 20, 30, 40, 50};
+  cudf::table_view const keys{{keys_col}};
+
+  cudf::sort_by_key(values, keys, {}, {}, cudf::test::get_default_stream());
+}
+
+TEST_F(SortingTest, StableSortByKey)
+{
+  cudf::test::fixed_width_column_wrapper<int32_t> const values_col{10, 20, 30, 40, 50};
+  cudf::table_view const values{{values_col}};
+  cudf::test::fixed_width_column_wrapper<int32_t> const keys_col{10, 20, 30, 40, 50};
+  cudf::table_view const keys{{keys_col}};
+
+  cudf::stable_sort_by_key(values, keys, {}, {}, cudf::test::get_default_stream());
+}
+
+TEST_F(SortingTest, Rank)
+{
+  cudf::test::fixed_width_column_wrapper<int32_t> const column{10, 20, 30, 40, 50};
+
+  cudf::rank(column,
+             cudf::rank_method::AVERAGE,
+             cudf::order::ASCENDING,
+             cudf::null_policy::EXCLUDE,
+             cudf::null_order::AFTER,
+             false,
+             cudf::test::get_default_stream());
+}
+
+TEST_F(SortingTest, SegmentedSortedOrder)
+{
+  cudf::test::fixed_width_column_wrapper<int32_t> const keys_col{9, 8, 7, 6, 5, 4, 3, 2, 1, 0};
+  cudf::table_view const keys{{keys_col}};
+  cudf::test::fixed_width_column_wrapper<int32_t> const segment_offsets{3, 7};
+
+  cudf::segmented_sorted_order(keys, segment_offsets, {}, {}, cudf::test::get_default_stream());
+}
+
+TEST_F(SortingTest, StableSegmentedSortedOrder)
+{
+  cudf::test::fixed_width_column_wrapper<int32_t> const keys_col{9, 8, 7, 6, 5, 4, 3, 2, 1, 0};
+  cudf::table_view const keys{{keys_col}};
+  cudf::test::fixed_width_column_wrapper<int32_t> const segment_offsets{3, 7};
+
+  cudf::stable_segmented_sorted_order(
+    keys, segment_offsets, {}, {}, cudf::test::get_default_stream());
+}
+
+TEST_F(SortingTest, SegmentedSortByKey)
+{
+  cudf::test::fixed_width_column_wrapper<int32_t> const keys_col{9, 8, 7, 6, 5, 4, 3, 2, 1, 0};
+  cudf::table_view const keys{{keys_col}};
+  cudf::test::fixed_width_column_wrapper<int32_t> const values_col{7, 6, 9, 3, 4, 5, 1, 2, 0, 4};
+  cudf::table_view const values{{values_col}};
+  cudf::test::fixed_width_column_wrapper<int32_t> const segment_offsets{0, 3, 7, 10};
+
+  cudf::segmented_sort_by_key(
+    values, keys, segment_offsets, {}, {}, cudf::test::get_default_stream());
+}
+
+TEST_F(SortingTest, StableSegmentedSortByKey)
+{
+  cudf::test::fixed_width_column_wrapper<int32_t> const keys_col{9, 8, 7, 6, 5, 4, 3, 2, 1, 0};
+  cudf::table_view const keys{{keys_col}};
+  cudf::test::fixed_width_column_wrapper<int32_t> const values_col{7, 6, 9, 3, 4, 5, 1, 2, 0, 4};
+  cudf::table_view const values{{values_col}};
+  cudf::test::fixed_width_column_wrapper<int32_t> const segment_offsets{0, 3, 7, 10};
+
+  cudf::stable_segmented_sort_by_key(
+    values, keys, segment_offsets, {}, {}, cudf::test::get_default_stream());
+}
diff --git a/cpp/tests/streams/text/ngrams_test.cpp b/cpp/tests/streams/text/ngrams_test.cpp
new file mode 100644
index 00000000000..bce0d2b680b
--- /dev/null
+++ b/cpp/tests/streams/text/ngrams_test.cpp
@@ -0,0 +1,59 @@
+/*
+ * Copyright (c) 2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <nvtext/generate_ngrams.hpp>
+#include <nvtext/ngrams_tokenize.hpp>
+
+#include <cudf_test/base_fixture.hpp>
+#include <cudf_test/column_wrapper.hpp>
+#include <cudf_test/default_stream.hpp>
+
+class TextNGramsTest : public cudf::test::BaseFixture {};
+
+TEST_F(TextNGramsTest, GenerateNgrams)
+{
+  auto const input =
+    cudf::test::strings_column_wrapper({"the", "fox", "jumped", "over", "thé", "dog"});
+  auto const separator = cudf::string_scalar{"_", true, cudf::test::get_default_stream()};
+  nvtext::generate_ngrams(
+    cudf::strings_column_view(input), 3, separator, cudf::test::get_default_stream());
+}
+
+TEST_F(TextNGramsTest, GenerateCharacterNgrams)
+{
+  auto const input =
+    cudf::test::strings_column_wrapper({"the", "fox", "jumped", "over", "thé", "dog"});
+  nvtext::generate_character_ngrams(
+    cudf::strings_column_view(input), 3, cudf::test::get_default_stream());
+}
+
+TEST_F(TextNGramsTest, HashCharacterNgrams)
+{
+  auto input =
+    cudf::test::strings_column_wrapper({"the quick brown fox", "jumped over the lazy dog."});
+  nvtext::hash_character_ngrams(
+    cudf::strings_column_view(input), 5, cudf::test::get_default_stream());
+}
+
+TEST_F(TextNGramsTest, NgramsTokenize)
+{
+  auto input =
+    cudf::test::strings_column_wrapper({"the quick brown fox", "jumped over the lazy dog."});
+  auto const delimiter = cudf::string_scalar{" ", true, cudf::test::get_default_stream()};
+  auto const separator = cudf::string_scalar{"_", true, cudf::test::get_default_stream()};
+  nvtext::ngrams_tokenize(
+    cudf::strings_column_view(input), 2, delimiter, separator, cudf::test::get_default_stream());
+}
diff --git a/cpp/tests/text/ngrams_tests.cpp b/cpp/tests/text/ngrams_tests.cpp
index 323b3eed3e2..7b179588385 100644
--- a/cpp/tests/text/ngrams_tests.cpp
+++ b/cpp/tests/text/ngrams_tests.cpp
@@ -34,18 +34,19 @@ TEST_F(TextGenerateNgramsTest, Ngrams)
 {
   cudf::test::strings_column_wrapper strings{"the", "fox", "jumped", "over", "thé", "dog"};
   cudf::strings_column_view strings_view(strings);
+  auto const separator = cudf::string_scalar("_");
 
   {
     cudf::test::strings_column_wrapper expected{
       "the_fox", "fox_jumped", "jumped_over", "over_thé", "thé_dog"};
-    auto const results = nvtext::generate_ngrams(strings_view);
+    auto const results = nvtext::generate_ngrams(strings_view, 2, separator);
     CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected);
   }
 
   {
     cudf::test::strings_column_wrapper expected{
       "the_fox_jumped", "fox_jumped_over", "jumped_over_thé", "over_thé_dog"};
-    auto const results = nvtext::generate_ngrams(strings_view, 3);
+    auto const results = nvtext::generate_ngrams(strings_view, 3, separator);
     CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected);
   }
   {
@@ -83,10 +84,11 @@ TEST_F(TextGenerateNgramsTest, NgramsWithNulls)
     h_strings.begin(),
     h_strings.end(),
     thrust::make_transform_iterator(h_strings.begin(), [](auto str) { return str != nullptr; }));
+  auto const separator = cudf::string_scalar("_");
 
   cudf::strings_column_view strings_view(strings);
   {
-    auto const results = nvtext::generate_ngrams(strings_view, 3);
+    auto const results = nvtext::generate_ngrams(strings_view, 3, separator);
     cudf::test::strings_column_wrapper expected{
       "the_fox_jumped", "fox_jumped_over", "jumped_over_the", "over_the_dog"};
     CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected);
@@ -103,7 +105,10 @@ TEST_F(TextGenerateNgramsTest, Empty)
 {
   auto const zero_size_strings_column = cudf::make_empty_column(cudf::type_id::STRING)->view();
 
-  auto results = nvtext::generate_ngrams(cudf::strings_column_view(zero_size_strings_column));
+  auto const separator = cudf::string_scalar("_");
+
+  auto results =
+    nvtext::generate_ngrams(cudf::strings_column_view(zero_size_strings_column), 2, separator);
   cudf::test::expect_column_empty(results->view());
   results = nvtext::generate_character_ngrams(cudf::strings_column_view(zero_size_strings_column));
   cudf::test::expect_column_empty(results->view());
@@ -112,21 +117,20 @@ TEST_F(TextGenerateNgramsTest, Empty)
 TEST_F(TextGenerateNgramsTest, Errors)
 {
   cudf::test::strings_column_wrapper strings{""};
+  auto const separator = cudf::string_scalar("_");
   // invalid parameter value
-  EXPECT_THROW(nvtext::generate_ngrams(cudf::strings_column_view(strings), 1), cudf::logic_error);
+  EXPECT_THROW(nvtext::generate_ngrams(cudf::strings_column_view(strings), 1, separator),
+               cudf::logic_error);
   EXPECT_THROW(nvtext::generate_character_ngrams(cudf::strings_column_view(strings), 1),
                cudf::logic_error);
   // not enough strings to generate ngrams
-  EXPECT_THROW(nvtext::generate_ngrams(cudf::strings_column_view(strings), 3), cudf::logic_error);
+  EXPECT_THROW(nvtext::generate_ngrams(cudf::strings_column_view(strings), 3, separator),
+               cudf::logic_error);
   EXPECT_THROW(nvtext::generate_character_ngrams(cudf::strings_column_view(strings), 3),
                cudf::logic_error);
 
-  std::vector<char const*> h_strings{"", nullptr, "", nullptr};
-  cudf::test::strings_column_wrapper strings_no_tokens(
-    h_strings.begin(),
-    h_strings.end(),
-    thrust::make_transform_iterator(h_strings.begin(), [](auto str) { return str != nullptr; }));
-  EXPECT_THROW(nvtext::generate_ngrams(cudf::strings_column_view(strings_no_tokens)),
+  cudf::test::strings_column_wrapper strings_no_tokens({"", "", "", ""}, {1, 0, 1, 0});
+  EXPECT_THROW(nvtext::generate_ngrams(cudf::strings_column_view(strings_no_tokens), 2, separator),
                cudf::logic_error);
   EXPECT_THROW(nvtext::generate_character_ngrams(cudf::strings_column_view(strings_no_tokens)),
                cudf::logic_error);
diff --git a/cpp/tests/text/ngrams_tokenize_tests.cpp b/cpp/tests/text/ngrams_tokenize_tests.cpp
index 5879bec3e64..c6fb886f7e5 100644
--- a/cpp/tests/text/ngrams_tokenize_tests.cpp
+++ b/cpp/tests/text/ngrams_tokenize_tests.cpp
@@ -62,7 +62,7 @@ TEST_F(TextNgramsTokenizeTest, Tokenize)
                                                 "mousé_ate",
                                                 "ate_the",
                                                 "the_cheese"};
-    auto results = nvtext::ngrams_tokenize(strings_view);
+    auto results = nvtext::ngrams_tokenize(strings_view, 2, std::string(), std::string("_"));
     CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected);
   }
   {
@@ -101,9 +101,10 @@ TEST_F(TextNgramsTokenizeTest, TokenizeOneGram)
 {
   cudf::test::strings_column_wrapper strings{"aaa bbb", "  ccc  ddd  ", "eee"};
   cudf::strings_column_view strings_view(strings);
+  auto const empty = cudf::string_scalar("");
 
   cudf::test::strings_column_wrapper expected{"aaa", "bbb", "ccc", "ddd", "eee"};
-  auto results = nvtext::ngrams_tokenize(strings_view, 1);
+  auto results = nvtext::ngrams_tokenize(strings_view, 1, empty, empty);
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected);
 }
 
@@ -111,7 +112,8 @@ TEST_F(TextNgramsTokenizeTest, TokenizeEmptyTest)
 {
   auto strings = cudf::make_empty_column(cudf::data_type{cudf::type_id::STRING});
   cudf::strings_column_view strings_view(strings->view());
-  auto results = nvtext::ngrams_tokenize(strings_view);
+  auto const empty = cudf::string_scalar("");
+  auto results     = nvtext::ngrams_tokenize(strings_view, 2, empty, empty);
   EXPECT_EQ(results->size(), 0);
   EXPECT_EQ(results->has_nulls(), false);
 }
@@ -120,5 +122,6 @@ TEST_F(TextNgramsTokenizeTest, TokenizeErrorTest)
 {
   cudf::test::strings_column_wrapper strings{"this column intentionally left blank"};
   cudf::strings_column_view strings_view(strings);
-  EXPECT_THROW(nvtext::ngrams_tokenize(strings_view, 0), cudf::logic_error);
+  auto const empty = cudf::string_scalar("");
+  EXPECT_THROW(nvtext::ngrams_tokenize(strings_view, 0, empty, empty), cudf::logic_error);
 }
diff --git a/dependencies.yaml b/dependencies.yaml
index 66417b214ff..56e625336ae 100644
--- a/dependencies.yaml
+++ b/dependencies.yaml
@@ -218,6 +218,7 @@ dependencies:
           - libkvikio==23.12.*
       - output_types: conda
         packages:
+          - aws-sdk-cpp<1.11
           - fmt>=9.1.0,<10
           - &gbench benchmark==1.8.0
           - &gtest gtest>=1.13.0
diff --git a/python/cudf/cudf/_lib/cpp/interop.pxd b/python/cudf/cudf/_lib/cpp/interop.pxd
index e81f0d617fb..88e9d83ee98 100644
--- a/python/cudf/cudf/_lib/cpp/interop.pxd
+++ b/python/cudf/cudf/_lib/cpp/interop.pxd
@@ -1,12 +1,13 @@
-# Copyright (c) 2020, NVIDIA CORPORATION.
+# Copyright (c) 2020-2023, NVIDIA CORPORATION.
 
 from libcpp.memory cimport shared_ptr, unique_ptr
 from libcpp.string cimport string
 from libcpp.vector cimport vector
-from pyarrow.lib cimport CTable
+from pyarrow.lib cimport CScalar, CTable
 
 from cudf._lib.types import cudf_to_np_types, np_to_cudf_types
 
+from cudf._lib.cpp.scalar.scalar cimport scalar
 from cudf._lib.cpp.table.table cimport table
 from cudf._lib.cpp.table.table_view cimport table_view
 
@@ -24,6 +25,7 @@ cdef extern from "cudf/interop.hpp" namespace "cudf" \
                                ) except +
 
     cdef unique_ptr[table] from_arrow(CTable input) except +
+    cdef unique_ptr[scalar] from_arrow(CScalar input) except +
 
     cdef cppclass column_metadata:
         column_metadata() except +
@@ -35,3 +37,8 @@ cdef extern from "cudf/interop.hpp" namespace "cudf" \
         table_view input,
         vector[column_metadata] metadata,
     ) except +
+
+    cdef shared_ptr[CScalar] to_arrow(
+        const scalar& input,
+        column_metadata metadata,
+    ) except +
diff --git a/python/cudf/cudf/_lib/interop.pyx b/python/cudf/cudf/_lib/interop.pyx
index 8fd2a409d90..639754fc54f 100644
--- a/python/cudf/cudf/_lib/interop.pyx
+++ b/python/cudf/cudf/_lib/interop.pyx
@@ -4,7 +4,14 @@ from cpython cimport pycapsule
 from libcpp.memory cimport shared_ptr, unique_ptr
 from libcpp.utility cimport move
 from libcpp.vector cimport vector
-from pyarrow.lib cimport CTable, pyarrow_unwrap_table, pyarrow_wrap_table
+from pyarrow.lib cimport (
+    CScalar,
+    CTable,
+    pyarrow_unwrap_scalar,
+    pyarrow_unwrap_table,
+    pyarrow_wrap_scalar,
+    pyarrow_wrap_table,
+)
 
 from cudf._lib.cpp.interop cimport (
     DLManagedTensor,
@@ -14,12 +21,22 @@ from cudf._lib.cpp.interop cimport (
     to_arrow as cpp_to_arrow,
     to_dlpack as cpp_to_dlpack,
 )
+from cudf._lib.cpp.scalar.scalar cimport fixed_point_scalar, scalar
 from cudf._lib.cpp.table.table cimport table
 from cudf._lib.cpp.table.table_view cimport table_view
+from cudf._lib.cpp.types cimport type_id
+from cudf._lib.cpp.wrappers.decimals cimport (
+    decimal32,
+    decimal64,
+    decimal128,
+    scale_type,
+)
+from cudf._lib.scalar cimport DeviceScalar
 from cudf._lib.utils cimport columns_from_unique_ptr, table_view_from_columns
 
 from cudf.api.types import is_list_dtype, is_struct_dtype
 from cudf.core.buffer import acquire_spill_lock
+from cudf.core.dtypes import Decimal32Dtype, Decimal64Dtype
 
 
 def from_dlpack(dlpack_capsule):
@@ -182,3 +199,79 @@ def from_arrow(object input_table):
         c_result = move(cpp_from_arrow(cpp_arrow_table.get()[0]))
 
     return columns_from_unique_ptr(move(c_result))
+
+
+@acquire_spill_lock()
+def to_arrow_scalar(DeviceScalar source_scalar):
+    """Convert a scalar to a PyArrow scalar.
+
+    Parameters
+    ----------
+    source_scalar : the scalar to convert
+
+    Returns
+    -------
+    pyarrow.lib.Scalar
+    """
+    cdef vector[column_metadata] cpp_metadata = gather_metadata(
+        [("", source_scalar.dtype)]
+    )
+    cdef const scalar* source_scalar_ptr = source_scalar.get_raw_ptr()
+
+    cdef shared_ptr[CScalar] cpp_arrow_scalar
+    with nogil:
+        cpp_arrow_scalar = cpp_to_arrow(
+            source_scalar_ptr[0], cpp_metadata[0]
+        )
+
+    return pyarrow_wrap_scalar(cpp_arrow_scalar)
+
+
+@acquire_spill_lock()
+def from_arrow_scalar(object input_scalar, output_dtype=None):
+    """Convert from PyArrow scalar to a cudf scalar.
+
+    Parameters
+    ----------
+    input_scalar : PyArrow scalar
+    output_dtype : output type to cast to, ignored except for decimals
+
+    Returns
+    -------
+    cudf._lib.DeviceScalar
+    """
+    cdef shared_ptr[CScalar] cpp_arrow_scalar = (
+        pyarrow_unwrap_scalar(input_scalar)
+    )
+    cdef unique_ptr[scalar] c_result
+
+    with nogil:
+        c_result = move(cpp_from_arrow(cpp_arrow_scalar.get()[0]))
+
+    cdef type_id ctype = c_result.get().type().id()
+    if ctype == type_id.DECIMAL128:
+        if output_dtype is None:
+            # Decimals must be cast to the cudf dtype of the right width
+            raise ValueError(
+                "Decimal scalars must be constructed with a dtype"
+            )
+
+        if isinstance(output_dtype, Decimal32Dtype):
+            c_result.reset(
+                new fixed_point_scalar[decimal32](
+                    (<fixed_point_scalar[decimal128]*> c_result.get()).value(),
+                    scale_type(-input_scalar.type.scale),
+                    c_result.get().is_valid()
+                )
+            )
+        elif isinstance(output_dtype, Decimal64Dtype):
+            c_result.reset(
+                new fixed_point_scalar[decimal64](
+                    (<fixed_point_scalar[decimal128]*> c_result.get()).value(),
+                    scale_type(-input_scalar.type.scale),
+                    c_result.get().is_valid()
+                )
+            )
+        # Decimal128Dtype is a no-op, no conversion needed.
+
+    return DeviceScalar.from_unique_ptr(move(c_result), output_dtype)
diff --git a/python/cudf/cudf/_lib/scalar.pyx b/python/cudf/cudf/_lib/scalar.pyx
index 0407785b2d8..5ab286c5701 100644
--- a/python/cudf/cudf/_lib/scalar.pyx
+++ b/python/cudf/cudf/_lib/scalar.pyx
@@ -2,22 +2,13 @@
 
 cimport cython
 
-import decimal
+import copy
 
 import numpy as np
 import pandas as pd
 import pyarrow as pa
 
-from libc.stdint cimport (
-    int8_t,
-    int16_t,
-    int32_t,
-    int64_t,
-    uint8_t,
-    uint16_t,
-    uint32_t,
-    uint64_t,
-)
+from libc.stdint cimport int64_t
 from libcpp cimport bool
 from libcpp.memory cimport unique_ptr
 from libcpp.utility cimport move
@@ -25,38 +16,22 @@ from libcpp.utility cimport move
 from rmm._lib.memory_resource cimport get_current_device_resource
 
 import cudf
-from cudf._lib.types import (
-    LIBCUDF_TO_SUPPORTED_NUMPY_TYPES,
-    datetime_unit_map,
-    duration_unit_map,
-)
+from cudf._lib.types import LIBCUDF_TO_SUPPORTED_NUMPY_TYPES
 from cudf.core.dtypes import ListDtype, StructDtype
 from cudf.core.missing import NA, NaT
 
-from cudf._lib.column cimport Column
-from cudf._lib.cpp.column.column_view cimport column_view
-from cudf._lib.cpp.table.table_view cimport table_view
 from cudf._lib.types cimport dtype_from_column_view, underlying_type_t_type_id
 
-from cudf._lib.interop import from_arrow, to_arrow
+from cudf._lib.interop import from_arrow_scalar, to_arrow_scalar
 
 cimport cudf._lib.cpp.types as libcudf_types
 from cudf._lib.cpp.scalar.scalar cimport (
     duration_scalar,
-    fixed_point_scalar,
     list_scalar,
-    numeric_scalar,
     scalar,
-    string_scalar,
     struct_scalar,
     timestamp_scalar,
 )
-from cudf._lib.cpp.wrappers.decimals cimport (
-    decimal32,
-    decimal64,
-    decimal128,
-    scale_type,
-)
 from cudf._lib.cpp.wrappers.durations cimport (
     duration_ms,
     duration_ns,
@@ -69,7 +44,21 @@ from cudf._lib.cpp.wrappers.timestamps cimport (
     timestamp_s,
     timestamp_us,
 )
-from cudf._lib.utils cimport columns_from_table_view, table_view_from_columns
+
+
+def _replace_nested(obj, check, replacement):
+    if isinstance(obj, list):
+        for i, item in enumerate(obj):
+            if check(item):
+                obj[i] = replacement
+            elif isinstance(item, (dict, list)):
+                _replace_nested(item, check, replacement)
+    elif isinstance(obj, dict):
+        for k, v in obj.items():
+            if check(v):
+                obj[k] = replacement
+            elif isinstance(v, (dict, list)):
+                _replace_nested(v, check, replacement)
 
 
 # The DeviceMemoryResource attribute could be released prematurely
@@ -97,61 +86,61 @@ cdef class DeviceScalar:
             A NumPy dtype.
         """
         self._dtype = dtype if dtype.kind != 'U' else cudf.dtype('object')
-        self._set_value(value, self._dtype)
-
-    def _set_value(self, value, dtype):
-        # IMPORTANT: this should only ever be called from __init__
-        valid = not _is_null_host_scalar(value)
-
-        if isinstance(dtype, cudf.core.dtypes.DecimalDtype):
-            _set_decimal_from_scalar(
-                self.c_value, value, dtype, valid)
-        elif isinstance(dtype, cudf.ListDtype):
-            _set_list_from_pylist(
-                self.c_value, value, dtype, valid)
-        elif isinstance(dtype, cudf.StructDtype):
-            _set_struct_from_pydict(self.c_value, value, dtype, valid)
+
+        if cudf.utils.utils.is_na_like(value):
+            value = None
+        else:
+            # TODO: For now we always deepcopy the input value to avoid
+            # overwriting the input values when replacing nulls. Since it's
+            # just host values it's not that expensive, but we could consider
+            # alternatives.
+            value = copy.deepcopy(value)
+            _replace_nested(value, cudf.utils.utils.is_na_like, None)
+
+        if isinstance(dtype, cudf.core.dtypes._BaseDtype):
+            pa_type = dtype.to_arrow()
         elif pd.api.types.is_string_dtype(dtype):
-            _set_string_from_np_string(self.c_value, value, valid)
-        elif pd.api.types.is_numeric_dtype(dtype):
-            _set_numeric_from_np_scalar(self.c_value,
-                                        value,
-                                        dtype,
-                                        valid)
-        elif pd.api.types.is_datetime64_dtype(dtype):
-            _set_datetime64_from_np_scalar(
-                self.c_value, value, dtype, valid
-            )
-        elif pd.api.types.is_timedelta64_dtype(dtype):
-            _set_timedelta64_from_np_scalar(
-                self.c_value, value, dtype, valid
-            )
+            # Have to manually convert object types, which we use internally
+            # for strings but pyarrow only supports as unicode 'U'
+            pa_type = pa.string()
         else:
-            raise ValueError(
-                f"Cannot convert value of type "
-                f"{type(value).__name__} to cudf scalar"
-            )
+            pa_type = pa.from_numpy_dtype(dtype)
+
+        pa_scalar = pa.scalar(value, type=pa_type)
+
+        # Note: This factory-like behavior in __init__ will be removed when
+        # migrating to pylibcudf.
+        cdef DeviceScalar obj = from_arrow_scalar(pa_scalar, self._dtype)
+        self.c_value.swap(obj.c_value)
 
     def _to_host_scalar(self):
-        if isinstance(self.dtype, cudf.core.dtypes.DecimalDtype):
-            result = _get_py_decimal_from_fixed_point(self.c_value)
-        elif cudf.api.types.is_struct_dtype(self.dtype):
-            result = _get_py_dict_from_struct(self.c_value, self.dtype)
-        elif cudf.api.types.is_list_dtype(self.dtype):
-            result = _get_py_list_from_list(self.c_value, self.dtype)
-        elif pd.api.types.is_string_dtype(self.dtype):
-            result = _get_py_string_from_string(self.c_value)
-        elif pd.api.types.is_numeric_dtype(self.dtype):
-            result = _get_np_scalar_from_numeric(self.c_value)
-        elif pd.api.types.is_datetime64_dtype(self.dtype):
-            result = _get_np_scalar_from_timestamp64(self.c_value)
-        elif pd.api.types.is_timedelta64_dtype(self.dtype):
-            result = _get_np_scalar_from_timedelta64(self.c_value)
+        is_datetime = self.dtype.kind == "M"
+        is_timedelta = self.dtype.kind == "m"
+
+        null_type = NaT if is_datetime or is_timedelta else NA
+
+        ps = to_arrow_scalar(self)
+        if not ps.is_valid:
+            return null_type
+
+        # TODO: The special handling of specific types below does not currently
+        # extend to nested types containing those types (e.g. List[timedelta]
+        # where the timedelta would overflow). We should eventually account for
+        # those cases, but that will require more careful consideration of how
+        # to traverse the contents of the nested data.
+        if is_datetime or is_timedelta:
+            time_unit, _ = np.datetime_data(self.dtype)
+            # Cast to int64 to avoid overflow
+            ps_cast = ps.cast('int64').as_py()
+            out_type = np.datetime64 if is_datetime else np.timedelta64
+            ret = out_type(ps_cast, time_unit)
+        elif cudf.api.types.is_numeric_dtype(self.dtype):
+            ret = ps.type.to_pandas_dtype()(ps.as_py())
         else:
-            raise ValueError(
-                "Could not convert cudf::scalar to a Python value"
-            )
-        return result
+            ret = ps.as_py()
+
+        _replace_nested(ret, lambda item: item is None, NA)
+        return ret
 
     @property
     def dtype(self):
@@ -236,42 +225,9 @@ cdef class DeviceScalar:
         return s
 
 
-cdef _set_string_from_np_string(unique_ptr[scalar]& s, value, bool valid=True):
-    value = value if valid else ""
-    s.reset(new string_scalar(value.encode(), valid))
-
-
-cdef _set_numeric_from_np_scalar(unique_ptr[scalar]& s,
-                                 object value,
-                                 object dtype,
-                                 bool valid=True):
-    value = value if valid else 0
-    if dtype == "int8":
-        s.reset(new numeric_scalar[int8_t](value, valid))
-    elif dtype == "int16":
-        s.reset(new numeric_scalar[int16_t](value, valid))
-    elif dtype == "int32":
-        s.reset(new numeric_scalar[int32_t](value, valid))
-    elif dtype == "int64":
-        s.reset(new numeric_scalar[int64_t](value, valid))
-    elif dtype == "uint8":
-        s.reset(new numeric_scalar[uint8_t](value, valid))
-    elif dtype == "uint16":
-        s.reset(new numeric_scalar[uint16_t](value, valid))
-    elif dtype == "uint32":
-        s.reset(new numeric_scalar[uint32_t](value, valid))
-    elif dtype == "uint64":
-        s.reset(new numeric_scalar[uint64_t](value, valid))
-    elif dtype == "float32":
-        s.reset(new numeric_scalar[float](value, valid))
-    elif dtype == "float64":
-        s.reset(new numeric_scalar[double](value, valid))
-    elif dtype == "bool":
-        s.reset(new numeric_scalar[bool](<bool>value, valid))
-    else:
-        raise ValueError(f"dtype not supported: {dtype}")
-
-
+# TODO: Currently the only uses of this function and the one below are in
+# _create_proxy_nat_scalar. See if that code path can be simplified to excise
+# or at least simplify these implementations.
 cdef _set_datetime64_from_np_scalar(unique_ptr[scalar]& s,
                                     object value,
                                     object dtype,
@@ -324,253 +280,6 @@ cdef _set_timedelta64_from_np_scalar(unique_ptr[scalar]& s,
     else:
         raise ValueError(f"dtype not supported: {dtype}")
 
-cdef _set_decimal_from_scalar(unique_ptr[scalar]& s,
-                              object value,
-                              object dtype,
-                              bool valid=True):
-    value = cudf.utils.dtypes._decimal_to_int64(value) if valid else 0
-    if isinstance(dtype, cudf.Decimal64Dtype):
-        s.reset(
-            new fixed_point_scalar[decimal64](
-                <int64_t>np.int64(value), scale_type(-dtype.scale), valid
-            )
-        )
-    elif isinstance(dtype, cudf.Decimal32Dtype):
-        s.reset(
-            new fixed_point_scalar[decimal32](
-                <int32_t>np.int32(value), scale_type(-dtype.scale), valid
-            )
-        )
-    elif isinstance(dtype, cudf.Decimal128Dtype):
-        s.reset(
-            new fixed_point_scalar[decimal128](
-                <libcudf_types.int128>value, scale_type(-dtype.scale), valid
-            )
-        )
-    else:
-        raise ValueError(f"dtype not supported: {dtype}")
-
-cdef _set_struct_from_pydict(unique_ptr[scalar]& s,
-                             object value,
-                             object dtype,
-                             bool valid=True):
-    arrow_schema = dtype.to_arrow()
-    columns = [str(i) for i in range(len(arrow_schema))]
-    if valid:
-        pyarrow_table = pa.Table.from_arrays(
-            [
-                pa.array([value[f.name]], from_pandas=True, type=f.type)
-                for f in arrow_schema
-            ],
-            names=columns
-        )
-    else:
-        pyarrow_table = pa.Table.from_arrays(
-            [
-                pa.array([NA], from_pandas=True, type=f.type)
-                for f in arrow_schema
-            ],
-            names=columns
-        )
-
-    data = from_arrow(pyarrow_table)
-    cdef table_view struct_view = table_view_from_columns(data)
-
-    s.reset(
-        new struct_scalar(struct_view, valid)
-    )
-
-cdef _get_py_dict_from_struct(unique_ptr[scalar]& s, dtype):
-    if not s.get()[0].is_valid():
-        return NA
-
-    cdef table_view struct_table_view = (<struct_scalar*>s.get()).view()
-    columns = columns_from_table_view(struct_table_view, None)
-    struct_col = cudf.core.column.build_struct_column(
-        names=dtype.fields.keys(),
-        children=tuple(columns),
-        size=1,
-    )
-    table = to_arrow([struct_col], [("None", dtype)])
-    python_dict = table.to_pydict()["None"][0]
-    return {k: _nested_na_replace([python_dict[k]])[0] for k in python_dict}
-
-cdef _set_list_from_pylist(unique_ptr[scalar]& s,
-                           object value,
-                           object dtype,
-                           bool valid=True):
-
-    value = value if valid else [NA]
-    cdef Column col
-    if isinstance(dtype.element_type, ListDtype):
-        pa_type = dtype.element_type.to_arrow()
-    else:
-        pa_type = dtype.to_arrow().value_type
-    col = cudf.core.column.as_column(
-        pa.array(value, from_pandas=True, type=pa_type)
-    )
-    cdef column_view col_view = col.view()
-    s.reset(
-        new list_scalar(col_view, valid)
-    )
-
-
-cdef _get_py_list_from_list(unique_ptr[scalar]& s, dtype):
-
-    if not s.get()[0].is_valid():
-        return NA
-
-    cdef column_view list_col_view = (<list_scalar*>s.get()).view()
-    cdef Column element_col = Column.from_column_view(list_col_view, None)
-
-    arrow_obj = to_arrow([element_col], [("None", dtype.element_type)])["None"]
-
-    result = arrow_obj.to_pylist()
-    return _nested_na_replace(result)
-
-
-cdef _get_py_string_from_string(unique_ptr[scalar]& s):
-    if not s.get()[0].is_valid():
-        return NA
-    return (<string_scalar*>s.get())[0].to_string().decode()
-
-
-cdef _get_np_scalar_from_numeric(unique_ptr[scalar]& s):
-    cdef scalar* s_ptr = s.get()
-    if not s_ptr[0].is_valid():
-        return NA
-
-    cdef libcudf_types.data_type cdtype = s_ptr[0].type()
-
-    if cdtype.id() == libcudf_types.type_id.INT8:
-        return np.int8((<numeric_scalar[int8_t]*>s_ptr)[0].value())
-    elif cdtype.id() == libcudf_types.type_id.INT16:
-        return np.int16((<numeric_scalar[int16_t]*>s_ptr)[0].value())
-    elif cdtype.id() == libcudf_types.type_id.INT32:
-        return np.int32((<numeric_scalar[int32_t]*>s_ptr)[0].value())
-    elif cdtype.id() == libcudf_types.type_id.INT64:
-        return np.int64((<numeric_scalar[int64_t]*>s_ptr)[0].value())
-    elif cdtype.id() == libcudf_types.type_id.UINT8:
-        return np.uint8((<numeric_scalar[uint8_t]*>s_ptr)[0].value())
-    elif cdtype.id() == libcudf_types.type_id.UINT16:
-        return np.uint16((<numeric_scalar[uint16_t]*>s_ptr)[0].value())
-    elif cdtype.id() == libcudf_types.type_id.UINT32:
-        return np.uint32((<numeric_scalar[uint32_t]*>s_ptr)[0].value())
-    elif cdtype.id() == libcudf_types.type_id.UINT64:
-        return np.uint64((<numeric_scalar[uint64_t]*>s_ptr)[0].value())
-    elif cdtype.id() == libcudf_types.type_id.FLOAT32:
-        return np.float32((<numeric_scalar[float]*>s_ptr)[0].value())
-    elif cdtype.id() == libcudf_types.type_id.FLOAT64:
-        return np.float64((<numeric_scalar[double]*>s_ptr)[0].value())
-    elif cdtype.id() == libcudf_types.type_id.BOOL8:
-        return np.bool_((<numeric_scalar[bool]*>s_ptr)[0].value())
-    else:
-        raise ValueError("Could not convert cudf::scalar to numpy scalar")
-
-
-cdef _get_py_decimal_from_fixed_point(unique_ptr[scalar]& s):
-    cdef scalar* s_ptr = s.get()
-    if not s_ptr[0].is_valid():
-        return NA
-
-    cdef libcudf_types.data_type cdtype = s_ptr[0].type()
-
-    if cdtype.id() == libcudf_types.type_id.DECIMAL64:
-        rep_val = int((<fixed_point_scalar[decimal64]*>s_ptr)[0].value())
-        scale = int((<fixed_point_scalar[decimal64]*>s_ptr)[0].type().scale())
-        return decimal.Decimal(rep_val).scaleb(scale)
-    elif cdtype.id() == libcudf_types.type_id.DECIMAL32:
-        rep_val = int((<fixed_point_scalar[decimal32]*>s_ptr)[0].value())
-        scale = int((<fixed_point_scalar[decimal32]*>s_ptr)[0].type().scale())
-        return decimal.Decimal(rep_val).scaleb(scale)
-    elif cdtype.id() == libcudf_types.type_id.DECIMAL128:
-        rep_val = int((<fixed_point_scalar[decimal128]*>s_ptr)[0].value())
-        scale = int((<fixed_point_scalar[decimal128]*>s_ptr)[0].type().scale())
-        return decimal.Decimal(rep_val).scaleb(scale)
-    else:
-        raise ValueError("Could not convert cudf::scalar to numpy scalar")
-
-cdef _get_np_scalar_from_timestamp64(unique_ptr[scalar]& s):
-
-    cdef scalar* s_ptr = s.get()
-
-    if not s_ptr[0].is_valid():
-        return NaT
-
-    cdef libcudf_types.data_type cdtype = s_ptr[0].type()
-
-    if cdtype.id() == libcudf_types.type_id.TIMESTAMP_SECONDS:
-        return np.datetime64(
-            (
-                <timestamp_scalar[timestamp_ms]*> s_ptr
-            )[0].ticks_since_epoch_64(),
-            datetime_unit_map[<underlying_type_t_type_id>(cdtype.id())]
-        )
-    elif cdtype.id() == libcudf_types.type_id.TIMESTAMP_MILLISECONDS:
-        return np.datetime64(
-            (
-                <timestamp_scalar[timestamp_ms]*> s_ptr
-            )[0].ticks_since_epoch_64(),
-            datetime_unit_map[<underlying_type_t_type_id>(cdtype.id())]
-        )
-    elif cdtype.id() == libcudf_types.type_id.TIMESTAMP_MICROSECONDS:
-        return np.datetime64(
-            (
-                <timestamp_scalar[timestamp_ms]*> s_ptr
-            )[0].ticks_since_epoch_64(),
-            datetime_unit_map[<underlying_type_t_type_id>(cdtype.id())]
-        )
-    elif cdtype.id() == libcudf_types.type_id.TIMESTAMP_NANOSECONDS:
-        return np.datetime64(
-            (
-                <timestamp_scalar[timestamp_ms]*> s_ptr
-            )[0].ticks_since_epoch_64(),
-            datetime_unit_map[<underlying_type_t_type_id>(cdtype.id())]
-        )
-    else:
-        raise ValueError("Could not convert cudf::scalar to numpy scalar")
-
-
-cdef _get_np_scalar_from_timedelta64(unique_ptr[scalar]& s):
-
-    cdef scalar* s_ptr = s.get()
-
-    if not s_ptr[0].is_valid():
-        return NaT
-
-    cdef libcudf_types.data_type cdtype = s_ptr[0].type()
-
-    if cdtype.id() == libcudf_types.type_id.DURATION_SECONDS:
-        return np.timedelta64(
-            (
-                <duration_scalar[duration_s]*> s_ptr
-            )[0].ticks(),
-            duration_unit_map[<underlying_type_t_type_id>(cdtype.id())]
-        )
-    elif cdtype.id() == libcudf_types.type_id.DURATION_MILLISECONDS:
-        return np.timedelta64(
-            (
-                <duration_scalar[duration_ms]*> s_ptr
-            )[0].ticks(),
-            duration_unit_map[<underlying_type_t_type_id>(cdtype.id())]
-        )
-    elif cdtype.id() == libcudf_types.type_id.DURATION_MICROSECONDS:
-        return np.timedelta64(
-            (
-                <duration_scalar[duration_us]*> s_ptr
-            )[0].ticks(),
-            duration_unit_map[<underlying_type_t_type_id>(cdtype.id())]
-        )
-    elif cdtype.id() == libcudf_types.type_id.DURATION_NANOSECONDS:
-        return np.timedelta64(
-            (
-                <duration_scalar[duration_ns]*> s_ptr
-            )[0].ticks(),
-            duration_unit_map[<underlying_type_t_type_id>(cdtype.id())]
-        )
-    else:
-        raise ValueError("Could not convert cudf::scalar to numpy scalar")
-
 
 def as_device_scalar(val, dtype=None):
     if isinstance(val, (cudf.Scalar, DeviceScalar)):
@@ -607,16 +316,3 @@ def _create_proxy_nat_scalar(dtype):
         return result
     else:
         raise TypeError('NAT only valid for datetime and timedelta')
-
-
-def _nested_na_replace(input_list):
-    '''
-    Replace `None` with `cudf.NA` in the result of
-    `__getitem__` calls to list type columns
-    '''
-    for idx, value in enumerate(input_list):
-        if isinstance(value, list):
-            _nested_na_replace(value)
-        elif value is None:
-            input_list[idx] = NA
-    return input_list
diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py
index 1a780cc9e9f..8a3dbe77787 100644
--- a/python/cudf/cudf/core/dataframe.py
+++ b/python/cudf/cudf/core/dataframe.py
@@ -1390,10 +1390,21 @@ def _get_numeric_data(self):
         return self[columns]
 
     @_cudf_nvtx_annotate
-    def assign(self, **kwargs):
+    def assign(self, **kwargs: Union[Callable[[Self], Any], Any]):
         """
         Assign columns to DataFrame from keyword arguments.
 
+        Parameters
+        ----------
+        **kwargs: dict mapping string column names to values
+            The value for each key can either be a literal column (or
+            something that can be converted to a column), or
+            a callable of one argument that will be given the
+            dataframe as an argument and should return the new column
+            (without modifying the input argument).
+            Columns are added in-order, so callables can refer to
+            column names constructed in the assignment.
+
         Examples
         --------
         >>> import cudf
@@ -1405,15 +1416,9 @@ def assign(self, **kwargs):
         1  1  4
         2  2  5
         """
-        new_df = cudf.DataFrame(index=self.index.copy())
-        for name, col in self._data.items():
-            if name in kwargs:
-                new_df[name] = kwargs.pop(name)
-            else:
-                new_df._data[name] = col.copy()
-
+        new_df = self.copy(deep=False)
         for k, v in kwargs.items():
-            new_df[k] = v
+            new_df[k] = v(new_df) if callable(v) else v
         return new_df
 
     @classmethod
diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py
index 62e091b29b5..aacf1fa8dae 100644
--- a/python/cudf/cudf/core/indexed_frame.py
+++ b/python/cudf/cudf/core/indexed_frame.py
@@ -5438,6 +5438,13 @@ def _is_same_dtype(lhs_dtype, rhs_dtype):
     # for matching column dtype.
     if lhs_dtype == rhs_dtype:
         return True
+    elif (
+        is_categorical_dtype(lhs_dtype)
+        and is_categorical_dtype(rhs_dtype)
+        and lhs_dtype.categories.dtype == rhs_dtype.categories.dtype
+    ):
+        # OK if categories are not all the same
+        return True
     elif (
         is_categorical_dtype(lhs_dtype)
         and not is_categorical_dtype(rhs_dtype)
diff --git a/python/cudf/cudf/tests/test_cuda_array_interface.py b/python/cudf/cudf/tests/test_cuda_array_interface.py
index e81f4ec795a..848c77206b2 100644
--- a/python/cudf/cudf/tests/test_cuda_array_interface.py
+++ b/python/cudf/cudf/tests/test_cuda_array_interface.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2019-2022, NVIDIA CORPORATION.
+# Copyright (c) 2019-2023, NVIDIA CORPORATION.
 
 import types
 from contextlib import ExitStack as does_not_raise
@@ -193,7 +193,7 @@ def test_cuda_array_interface_pytorch():
 
     assert_eq(got, cudf.Series(buffer, dtype=np.bool_))
 
-    index = cudf.Index([])
+    index = cudf.Index([], dtype="float64")
     tensor = torch.tensor(index)
     got = cudf.Index(tensor)
     assert_eq(got, index)
diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py
index 6180162ecdd..67b63028fab 100644
--- a/python/cudf/cudf/tests/test_dataframe.py
+++ b/python/cudf/cudf/tests/test_dataframe.py
@@ -1327,6 +1327,25 @@ def test_assign():
     np.testing.assert_equal(gdf2.y.to_numpy(), [2, 3, 4])
 
 
+@pytest.mark.parametrize(
+    "mapping",
+    [
+        {"y": 1, "z": lambda df: df["x"] + df["y"]},
+        {
+            "x": lambda df: df["x"] * 2,
+            "y": lambda df: 2,
+            "z": lambda df: df["x"] / df["y"],
+        },
+    ],
+)
+def test_assign_callable(mapping):
+    df = pd.DataFrame({"x": [1, 2, 3]})
+    cdf = cudf.from_pandas(df)
+    expect = df.assign(**mapping)
+    actual = cdf.assign(**mapping)
+    assert_eq(expect, actual)
+
+
 @pytest.mark.parametrize("nrows", [1, 8, 100, 1000])
 @pytest.mark.parametrize("method", ["murmur3", "md5"])
 @pytest.mark.parametrize("seed", [None, 42])
@@ -10389,6 +10408,19 @@ def test_dataframe_init_from_nested_dict():
     assert_eq(pdf, gdf)
 
 
+def test_init_from_2_categoricalindex_series_diff_categories():
+    s1 = cudf.Series(
+        [39, 6, 4], index=cudf.CategoricalIndex(["female", "male", "unknown"])
+    )
+    s2 = cudf.Series(
+        [2, 152, 2, 242, 150],
+        index=cudf.CategoricalIndex(["f", "female", "m", "male", "unknown"]),
+    )
+    result = cudf.DataFrame([s1, s2])
+    expected = pd.DataFrame([s1.to_pandas(), s2.to_pandas()])
+    assert_eq(result, expected, check_dtype=False)
+
+
 def test_data_frame_values_no_cols_but_index():
     result = cudf.DataFrame(index=range(5)).values
     expected = pd.DataFrame(index=range(5)).values
diff --git a/python/cudf/cudf/tests/test_list.py b/python/cudf/cudf/tests/test_list.py
index 5dd58d8a875..ac10dd97c56 100644
--- a/python/cudf/cudf/tests/test_list.py
+++ b/python/cudf/cudf/tests/test_list.py
@@ -895,14 +895,14 @@ def test_memory_usage():
     "data, idx",
     [
         (
-            [[{"f2": {"a": 100}, "f1": "a"}, {"f1": "sf12", "f2": None}]],
+            [[{"f2": {"a": 100}, "f1": "a"}, {"f1": "sf12", "f2": NA}]],
             0,
         ),
         (
             [
                 [
                     {"f2": {"a": 100, "c": 90, "f2": 10}, "f1": "a"},
-                    {"f1": "sf12", "f2": None},
+                    {"f1": "sf12", "f2": NA},
                 ]
             ],
             0,
diff --git a/python/cudf/cudf/tests/test_struct.py b/python/cudf/cudf/tests/test_struct.py
index a3593e55b97..ce6dc587320 100644
--- a/python/cudf/cudf/tests/test_struct.py
+++ b/python/cudf/cudf/tests/test_struct.py
@@ -150,9 +150,7 @@ def test_struct_setitem(data, item):
     "data",
     [
         {"a": 1, "b": "rapids", "c": [1, 2, 3, 4]},
-        {"a": 1, "b": "rapids", "c": [1, 2, 3, 4], "d": cudf.NA},
         {"a": "Hello"},
-        {"b": [], "c": [1, 2, 3]},
     ],
 )
 def test_struct_scalar_host_construction(data):
@@ -161,6 +159,39 @@ def test_struct_scalar_host_construction(data):
     assert list(slr.device_value.value.values()) == list(data.values())
 
 
+@pytest.mark.parametrize(
+    ("data", "dtype"),
+    [
+        (
+            {"a": 1, "b": "rapids", "c": [1, 2, 3, 4], "d": cudf.NA},
+            cudf.StructDtype(
+                {
+                    "a": np.dtype(np.int64),
+                    "b": np.dtype(np.str_),
+                    "c": cudf.ListDtype(np.dtype(np.int64)),
+                    "d": np.dtype(np.int64),
+                }
+            ),
+        ),
+        (
+            {"b": [], "c": [1, 2, 3]},
+            cudf.StructDtype(
+                {
+                    "b": cudf.ListDtype(np.dtype(np.int64)),
+                    "c": cudf.ListDtype(np.dtype(np.int64)),
+                }
+            ),
+        ),
+    ],
+)
+def test_struct_scalar_host_construction_no_dtype_inference(data, dtype):
+    # cudf cannot infer the dtype of the scalar when it contains only nulls or
+    # is empty.
+    slr = cudf.Scalar(data, dtype=dtype)
+    assert slr.value == data
+    assert list(slr.device_value.value.values()) == list(data.values())
+
+
 def test_struct_scalar_null():
     slr = cudf.Scalar(cudf.NA, dtype=StructDtype)
     assert slr.device_value.value is cudf.NA
diff --git a/python/cudf/cudf/utils/dtypes.py b/python/cudf/cudf/utils/dtypes.py
index 1b94db75340..73ea8e2cfc4 100644
--- a/python/cudf/cudf/utils/dtypes.py
+++ b/python/cudf/cudf/utils/dtypes.py
@@ -463,24 +463,6 @@ def _get_nan_for_dtype(dtype):
         return np.float64("nan")
 
 
-def _decimal_to_int64(decimal: Decimal) -> int:
-    """
-    Scale a Decimal such that the result is the integer
-    that would result from removing the decimal point.
-
-    Examples
-    --------
-    >>> _decimal_to_int64(Decimal('1.42'))
-    142
-    >>> _decimal_to_int64(Decimal('0.0042'))
-    42
-    >>> _decimal_to_int64(Decimal('-1.004201'))
-    -1004201
-
-    """
-    return int(f"{decimal:0f}".replace(".", ""))
-
-
 def get_allowed_combinations_for_operator(dtype_l, dtype_r, op):
     error = TypeError(
         f"{op} not supported between {dtype_l} and {dtype_r} scalars"
diff --git a/python/dask_cudf/dask_cudf/backends.py b/python/dask_cudf/dask_cudf/backends.py
index e3f4f04eb85..344b03c631d 100644
--- a/python/dask_cudf/dask_cudf/backends.py
+++ b/python/dask_cudf/dask_cudf/backends.py
@@ -373,22 +373,37 @@ def percentile_cudf(a, q, interpolation="linear"):
 
 
 @pyarrow_schema_dispatch.register((cudf.DataFrame,))
-def _get_pyarrow_schema_cudf(obj, preserve_index=True, **kwargs):
+def _get_pyarrow_schema_cudf(obj, preserve_index=None, **kwargs):
     if kwargs:
         warnings.warn(
             "Ignoring the following arguments to "
             f"`pyarrow_schema_dispatch`: {list(kwargs)}"
         )
-    return meta_nonempty(obj).to_arrow(preserve_index=preserve_index).schema
+
+    return _cudf_to_table(
+        meta_nonempty(obj), preserve_index=preserve_index
+    ).schema
 
 
 @to_pyarrow_table_dispatch.register(cudf.DataFrame)
-def _cudf_to_table(obj, preserve_index=True, **kwargs):
+def _cudf_to_table(obj, preserve_index=None, **kwargs):
     if kwargs:
         warnings.warn(
             "Ignoring the following arguments to "
             f"`to_pyarrow_table_dispatch`: {list(kwargs)}"
         )
+
+    # TODO: Remove this logic when cudf#14159 is resolved
+    # (see: https://github.com/rapidsai/cudf/issues/14159)
+    if preserve_index and isinstance(obj.index, cudf.RangeIndex):
+        obj = obj.copy()
+        obj.index.name = (
+            obj.index.name
+            if obj.index.name is not None
+            else "__index_level_0__"
+        )
+        obj.index = obj.index._as_int_index()
+
     return obj.to_arrow(preserve_index=preserve_index)
 
 
@@ -401,7 +416,15 @@ def _table_to_cudf(obj, table, self_destruct=None, **kwargs):
             f"Ignoring the following arguments to "
             f"`from_pyarrow_table_dispatch`: {list(kwargs)}"
         )
-    return obj.from_arrow(table)
+    result = obj.from_arrow(table)
+
+    # TODO: Remove this logic when cudf#14159 is resolved
+    # (see: https://github.com/rapidsai/cudf/issues/14159)
+    if "__index_level_0__" in result.index.names:
+        assert len(result.index.names) == 1
+        result.index.name = None
+
+    return result
 
 
 @union_categoricals_dispatch.register((cudf.Series, cudf.BaseIndex))
diff --git a/python/dask_cudf/dask_cudf/sorting.py b/python/dask_cudf/dask_cudf/sorting.py
index e841f2d8830..d6c9c1be73c 100644
--- a/python/dask_cudf/dask_cudf/sorting.py
+++ b/python/dask_cudf/dask_cudf/sorting.py
@@ -6,7 +6,7 @@
 import numpy as np
 import tlz as toolz
 
-import dask
+from dask import config
 from dask.base import tokenize
 from dask.dataframe import methods
 from dask.dataframe.core import DataFrame, Index, Series
@@ -18,6 +18,8 @@
 from cudf.api.types import is_categorical_dtype
 from cudf.utils.utils import _dask_cudf_nvtx_annotate
 
+_SHUFFLE_SUPPORT = ("tasks", "p2p")  # "disk" not supported
+
 
 @_dask_cudf_nvtx_annotate
 def set_index_post(df, index_name, drop, column_dtype):
@@ -307,15 +309,25 @@ def sort_values(
     return df4
 
 
+def get_default_shuffle_method():
+    # Note that `dask.utils.get_default_shuffle_method`
+    # will return "p2p" by default when a distributed
+    # client is present. Dask-cudf supports "p2p", but
+    # will not use it by default (yet)
+    default = config.get("dataframe.shuffle.method", "tasks")
+    if default not in _SHUFFLE_SUPPORT:
+        default = "tasks"
+    return default
+
+
 def _get_shuffle_type(shuffle):
     # Utility to set the shuffle-kwarg default
-    # and to validate user-specified options.
-    # The only supported options is currently "tasks"
-    shuffle = shuffle or dask.config.get("shuffle", "tasks")
-    if shuffle != "tasks":
+    # and to validate user-specified options
+    shuffle = shuffle or get_default_shuffle_method()
+    if shuffle not in _SHUFFLE_SUPPORT:
         raise ValueError(
-            f"Dask-cudf only supports in-memory shuffling with "
-            f"'tasks'. Got shuffle={shuffle}"
+            "Dask-cudf only supports the following shuffle "
+            f"methods: {_SHUFFLE_SUPPORT}. Got shuffle={shuffle}"
         )
 
     return shuffle
diff --git a/python/dask_cudf/dask_cudf/tests/test_dispatch.py b/python/dask_cudf/dask_cudf/tests/test_dispatch.py
index cf49b1df4f4..c64e25fd437 100644
--- a/python/dask_cudf/dask_cudf/tests/test_dispatch.py
+++ b/python/dask_cudf/dask_cudf/tests/test_dispatch.py
@@ -22,18 +22,25 @@ def test_is_categorical_dispatch():
     assert is_categorical_dtype(cudf.Index([1, 2, 3], dtype="category"))
 
 
-def test_pyarrow_conversion_dispatch():
+@pytest.mark.parametrize("preserve_index", [True, False])
+def test_pyarrow_conversion_dispatch(preserve_index):
     from dask.dataframe.dispatch import (
         from_pyarrow_table_dispatch,
         to_pyarrow_table_dispatch,
     )
 
     df1 = cudf.DataFrame(np.random.randn(10, 3), columns=list("abc"))
-    df2 = from_pyarrow_table_dispatch(df1, to_pyarrow_table_dispatch(df1))
+    df2 = from_pyarrow_table_dispatch(
+        df1, to_pyarrow_table_dispatch(df1, preserve_index=preserve_index)
+    )
 
     assert type(df1) == type(df2)
     assert_eq(df1, df2)
 
+    # Check that preserve_index does not produce a RangeIndex
+    if preserve_index:
+        assert not isinstance(df2.index, cudf.RangeIndex)
+
 
 @pytest.mark.parametrize("index", [None, [1, 2] * 5])
 def test_deterministic_tokenize(index):
diff --git a/python/dask_cudf/dask_cudf/tests/test_distributed.py b/python/dask_cudf/dask_cudf/tests/test_distributed.py
index e24feaa2ea4..db3f3695648 100644
--- a/python/dask_cudf/dask_cudf/tests/test_distributed.py
+++ b/python/dask_cudf/dask_cudf/tests/test_distributed.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2020-2022, NVIDIA CORPORATION.
+# Copyright (c) 2020-2023, NVIDIA CORPORATION.
 
 import numba.cuda
 import pytest
@@ -77,3 +77,23 @@ def test_str_series_roundtrip():
 
             actual = dask_series.compute()
             assert_eq(actual, expected)
+
+
+def test_p2p_shuffle():
+    # Check that we can use `shuffle="p2p"`
+    with dask_cuda.LocalCUDACluster(n_workers=1) as cluster:
+        with Client(cluster):
+            ddf = (
+                dask.datasets.timeseries(
+                    start="2000-01-01",
+                    end="2000-01-08",
+                    dtypes={"x": int},
+                )
+                .reset_index(drop=True)
+                .to_backend("cudf")
+            )
+            dd.assert_eq(
+                ddf.sort_values("x", shuffle="p2p").compute(),
+                ddf.compute().sort_values("x"),
+                check_index=False,
+            )