From 2e0d2d6a0859b2cad34a36513b6977cf2bbe172f Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Fri, 8 Nov 2024 13:15:26 -0800 Subject: [PATCH 01/19] Improve the performance of low cardinality groupby (#16619) This PR enhances groupby performance for low-cardinality input cases. When applicable, it leverages shared memory for initial aggregation, followed by global memory aggregation to reduce atomic contention and improve performance. Authors: - Yunsong Wang (https://github.com/PointKernel) - Mike Wilson (https://github.com/hyperbolic2346) Approvers: - David Wendt (https://github.com/davidwendt) - Mike Wilson (https://github.com/hyperbolic2346) - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/cudf/pull/16619 --- cpp/CMakeLists.txt | 5 +- cpp/src/groupby/groupby.cu | 1 - cpp/src/groupby/hash/compute_aggregations.cu | 29 +++ cpp/src/groupby/hash/compute_aggregations.cuh | 185 ++++++++++++++++++ ...pass_aggs.hpp => compute_aggregations.hpp} | 16 +- .../groupby/hash/compute_aggregations_null.cu | 29 +++ .../hash/compute_global_memory_aggs.cu | 32 +++ .../hash/compute_global_memory_aggs.cuh | 89 +++++++++ .../hash/compute_global_memory_aggs.hpp | 42 ++++ .../hash/compute_global_memory_aggs_null.cu | 32 +++ cpp/src/groupby/hash/compute_groupby.cu | 43 +--- cpp/src/groupby/hash/compute_groupby.hpp | 17 -- .../hash/compute_shared_memory_aggs.cu | 19 +- .../hash/compute_shared_memory_aggs.hpp | 7 +- .../groupby/hash/compute_single_pass_aggs.cu | 99 ---------- .../hash/create_sparse_results_table.cu | 115 ++++++++--- .../hash/create_sparse_results_table.hpp | 27 ++- cpp/src/groupby/hash/helpers.cuh | 2 - cpp/src/groupby/hash/single_pass_functors.cuh | 118 ++++++++++- 19 files changed, 699 insertions(+), 208 deletions(-) create mode 100644 cpp/src/groupby/hash/compute_aggregations.cu create mode 100644 cpp/src/groupby/hash/compute_aggregations.cuh rename cpp/src/groupby/hash/{compute_single_pass_aggs.hpp => compute_aggregations.hpp} (70%) 
create mode 100644 cpp/src/groupby/hash/compute_aggregations_null.cu create mode 100644 cpp/src/groupby/hash/compute_global_memory_aggs.cu create mode 100644 cpp/src/groupby/hash/compute_global_memory_aggs.cuh create mode 100644 cpp/src/groupby/hash/compute_global_memory_aggs.hpp create mode 100644 cpp/src/groupby/hash/compute_global_memory_aggs_null.cu delete mode 100644 cpp/src/groupby/hash/compute_single_pass_aggs.cu diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index d3bf7019e35..559826ac232 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -394,11 +394,14 @@ add_library( src/filling/repeat.cu src/filling/sequence.cu src/groupby/groupby.cu + src/groupby/hash/compute_aggregations.cu + src/groupby/hash/compute_aggregations_null.cu + src/groupby/hash/compute_global_memory_aggs.cu + src/groupby/hash/compute_global_memory_aggs_null.cu src/groupby/hash/compute_groupby.cu src/groupby/hash/compute_mapping_indices.cu src/groupby/hash/compute_mapping_indices_null.cu src/groupby/hash/compute_shared_memory_aggs.cu - src/groupby/hash/compute_single_pass_aggs.cu src/groupby/hash/create_sparse_results_table.cu src/groupby/hash/flatten_single_pass_aggs.cpp src/groupby/hash/groupby.cu diff --git a/cpp/src/groupby/groupby.cu b/cpp/src/groupby/groupby.cu index cc0682b68b9..6eb82618e2a 100644 --- a/cpp/src/groupby/groupby.cu +++ b/cpp/src/groupby/groupby.cu @@ -29,7 +29,6 @@ #include #include #include -#include #include #include #include diff --git a/cpp/src/groupby/hash/compute_aggregations.cu b/cpp/src/groupby/hash/compute_aggregations.cu new file mode 100644 index 00000000000..cac6c2224f0 --- /dev/null +++ b/cpp/src/groupby/hash/compute_aggregations.cu @@ -0,0 +1,29 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "compute_aggregations.cuh" +#include "compute_aggregations.hpp" + +namespace cudf::groupby::detail::hash { +template rmm::device_uvector compute_aggregations( + int64_t num_rows, + bool skip_rows_with_nulls, + bitmask_type const* row_bitmask, + global_set_t& global_set, + cudf::host_span requests, + cudf::detail::result_cache* sparse_results, + rmm::cuda_stream_view stream); +} // namespace cudf::groupby::detail::hash diff --git a/cpp/src/groupby/hash/compute_aggregations.cuh b/cpp/src/groupby/hash/compute_aggregations.cuh new file mode 100644 index 00000000000..e8b29a0e7a8 --- /dev/null +++ b/cpp/src/groupby/hash/compute_aggregations.cuh @@ -0,0 +1,185 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#pragma once + +#include "compute_aggregations.hpp" +#include "compute_global_memory_aggs.hpp" +#include "compute_mapping_indices.hpp" +#include "compute_shared_memory_aggs.hpp" +#include "create_sparse_results_table.hpp" +#include "flatten_single_pass_aggs.hpp" +#include "helpers.cuh" +#include "single_pass_functors.cuh" + +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#include +#include +#include + +#include +#include +#include + +namespace cudf::groupby::detail::hash { +/** + * @brief Computes all aggregations from `requests` that require a single pass + * over the data and stores the results in `sparse_results` + */ +template +rmm::device_uvector compute_aggregations( + int64_t num_rows, + bool skip_rows_with_nulls, + bitmask_type const* row_bitmask, + SetType& global_set, + cudf::host_span requests, + cudf::detail::result_cache* sparse_results, + rmm::cuda_stream_view stream) +{ + // flatten the aggs to a table that can be operated on by aggregate_row + auto [flattened_values, agg_kinds, aggs] = flatten_single_pass_aggs(requests); + auto const d_agg_kinds = cudf::detail::make_device_uvector_async( + agg_kinds, stream, rmm::mr::get_current_device_resource()); + + auto const grid_size = + max_occupancy_grid_size>(num_rows); + auto const available_shmem_size = get_available_shared_memory_size(grid_size); + auto const has_sufficient_shmem = + available_shmem_size > (compute_shmem_offsets_size(flattened_values.num_columns()) * 2); + auto const has_dictionary_request = std::any_of( + requests.begin(), requests.end(), [](cudf::groupby::aggregation_request const& request) { + return cudf::is_dictionary(request.values.type()); + }); + auto const is_shared_memory_compatible = !has_dictionary_request and has_sufficient_shmem; + + // Performs naive global memory aggregations when the workload is not compatible with shared + // memory, such as when aggregating dictionary columns or when there is insufficient 
dynamic + // shared memory for shared memory aggregations. + if (!is_shared_memory_compatible) { + return compute_global_memory_aggs(num_rows, + skip_rows_with_nulls, + row_bitmask, + flattened_values, + d_agg_kinds.data(), + agg_kinds, + global_set, + aggs, + sparse_results, + stream); + } + + // 'populated_keys' contains inserted row_indices (keys) of global hash set + rmm::device_uvector populated_keys(num_rows, stream); + // 'local_mapping_index' maps from the global row index of the input table to its block-wise rank + rmm::device_uvector local_mapping_index(num_rows, stream); + // 'global_mapping_index' maps from the block-wise rank to the row index of global aggregate table + rmm::device_uvector global_mapping_index(grid_size * GROUPBY_SHM_MAX_ELEMENTS, + stream); + rmm::device_uvector block_cardinality(grid_size, stream); + + // Flag indicating whether a global memory aggregation fallback is required or not + rmm::device_scalar needs_global_memory_fallback(stream); + + auto global_set_ref = global_set.ref(cuco::op::insert_and_find); + + compute_mapping_indices(grid_size, + num_rows, + global_set_ref, + row_bitmask, + skip_rows_with_nulls, + local_mapping_index.data(), + global_mapping_index.data(), + block_cardinality.data(), + needs_global_memory_fallback.data(), + stream); + + cuda::std::atomic_flag h_needs_fallback; + // Cannot use `device_scalar::value` as it requires a copy constructor, which + // `atomic_flag` doesn't have. 
+ CUDF_CUDA_TRY(cudaMemcpyAsync(&h_needs_fallback, + needs_global_memory_fallback.data(), + sizeof(cuda::std::atomic_flag), + cudaMemcpyDefault, + stream.value())); + stream.synchronize(); + auto const needs_fallback = h_needs_fallback.test(); + + // make table that will hold sparse results + cudf::table sparse_table = create_sparse_results_table(flattened_values, + d_agg_kinds.data(), + agg_kinds, + needs_fallback, + global_set, + populated_keys, + stream); + // prepare to launch kernel to do the actual aggregation + auto d_values = table_device_view::create(flattened_values, stream); + auto d_sparse_table = mutable_table_device_view::create(sparse_table, stream); + + compute_shared_memory_aggs(grid_size, + available_shmem_size, + num_rows, + row_bitmask, + skip_rows_with_nulls, + local_mapping_index.data(), + global_mapping_index.data(), + block_cardinality.data(), + *d_values, + *d_sparse_table, + d_agg_kinds.data(), + stream); + + // The shared memory groupby is designed so that each thread block can handle up to 128 unique + // keys. When a block reaches this cardinality limit, shared memory becomes insufficient to store + // the temporary aggregation results. In these situations, we must fall back to a global memory + // aggregator to process the remaining aggregation requests. 
+ if (needs_fallback) { + auto const stride = GROUPBY_BLOCK_SIZE * grid_size; + thrust::for_each_n(rmm::exec_policy_nosync(stream), + thrust::counting_iterator{0}, + num_rows, + global_memory_fallback_fn{global_set_ref, + *d_values, + *d_sparse_table, + d_agg_kinds.data(), + block_cardinality.data(), + stride, + row_bitmask, + skip_rows_with_nulls}); + extract_populated_keys(global_set, populated_keys, stream); + } + + // Add results back to sparse_results cache + auto sparse_result_cols = sparse_table.release(); + for (size_t i = 0; i < aggs.size(); i++) { + // Note that the cache will make a copy of this temporary aggregation + sparse_results->add_result( + flattened_values.column(i), *aggs[i], std::move(sparse_result_cols[i])); + } + + return populated_keys; +} +} // namespace cudf::groupby::detail::hash diff --git a/cpp/src/groupby/hash/compute_single_pass_aggs.hpp b/cpp/src/groupby/hash/compute_aggregations.hpp similarity index 70% rename from cpp/src/groupby/hash/compute_single_pass_aggs.hpp rename to cpp/src/groupby/hash/compute_aggregations.hpp index a7434bdf61a..829c3c808b0 100644 --- a/cpp/src/groupby/hash/compute_single_pass_aggs.hpp +++ b/cpp/src/groupby/hash/compute_aggregations.hpp @@ -21,6 +21,7 @@ #include #include +#include namespace cudf::groupby::detail::hash { /** @@ -28,11 +29,12 @@ namespace cudf::groupby::detail::hash { * over the data and stores the results in `sparse_results` */ template -void compute_single_pass_aggs(int64_t num_keys, - bool skip_rows_with_nulls, - bitmask_type const* row_bitmask, - SetType set, - cudf::host_span requests, - cudf::detail::result_cache* sparse_results, - rmm::cuda_stream_view stream); +rmm::device_uvector compute_aggregations( + int64_t num_rows, + bool skip_rows_with_nulls, + bitmask_type const* row_bitmask, + SetType& global_set, + cudf::host_span requests, + cudf::detail::result_cache* sparse_results, + rmm::cuda_stream_view stream); } // namespace cudf::groupby::detail::hash diff --git 
a/cpp/src/groupby/hash/compute_aggregations_null.cu b/cpp/src/groupby/hash/compute_aggregations_null.cu new file mode 100644 index 00000000000..1d7184227ea --- /dev/null +++ b/cpp/src/groupby/hash/compute_aggregations_null.cu @@ -0,0 +1,29 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "compute_aggregations.cuh" +#include "compute_aggregations.hpp" + +namespace cudf::groupby::detail::hash { +template rmm::device_uvector compute_aggregations( + int64_t num_rows, + bool skip_rows_with_nulls, + bitmask_type const* row_bitmask, + nullable_global_set_t& global_set, + cudf::host_span requests, + cudf::detail::result_cache* sparse_results, + rmm::cuda_stream_view stream); +} // namespace cudf::groupby::detail::hash diff --git a/cpp/src/groupby/hash/compute_global_memory_aggs.cu b/cpp/src/groupby/hash/compute_global_memory_aggs.cu new file mode 100644 index 00000000000..6025686953e --- /dev/null +++ b/cpp/src/groupby/hash/compute_global_memory_aggs.cu @@ -0,0 +1,32 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "compute_global_memory_aggs.cuh" +#include "compute_global_memory_aggs.hpp" + +namespace cudf::groupby::detail::hash { +template rmm::device_uvector compute_global_memory_aggs( + cudf::size_type num_rows, + bool skip_rows_with_nulls, + bitmask_type const* row_bitmask, + cudf::table_view const& flattened_values, + cudf::aggregation::Kind const* d_agg_kinds, + std::vector const& agg_kinds, + global_set_t& global_set, + std::vector>& aggregations, + cudf::detail::result_cache* sparse_results, + rmm::cuda_stream_view stream); +} // namespace cudf::groupby::detail::hash diff --git a/cpp/src/groupby/hash/compute_global_memory_aggs.cuh b/cpp/src/groupby/hash/compute_global_memory_aggs.cuh new file mode 100644 index 00000000000..00db149c6d9 --- /dev/null +++ b/cpp/src/groupby/hash/compute_global_memory_aggs.cuh @@ -0,0 +1,89 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#pragma once + +#include "compute_global_memory_aggs.hpp" +#include "create_sparse_results_table.hpp" +#include "flatten_single_pass_aggs.hpp" +#include "helpers.cuh" +#include "single_pass_functors.cuh" + +#include +#include +#include +#include + +#include +#include +#include + +#include +#include + +#include +#include + +namespace cudf::groupby::detail::hash { +template +rmm::device_uvector compute_global_memory_aggs( + cudf::size_type num_rows, + bool skip_rows_with_nulls, + bitmask_type const* row_bitmask, + cudf::table_view const& flattened_values, + cudf::aggregation::Kind const* d_agg_kinds, + std::vector const& agg_kinds, + SetType& global_set, + std::vector>& aggregations, + cudf::detail::result_cache* sparse_results, + rmm::cuda_stream_view stream) +{ + auto constexpr uses_global_memory_aggs = true; + // 'populated_keys' contains inserted row_indices (keys) of global hash set + rmm::device_uvector populated_keys(num_rows, stream); + + // make table that will hold sparse results + cudf::table sparse_table = create_sparse_results_table(flattened_values, + d_agg_kinds, + agg_kinds, + uses_global_memory_aggs, + global_set, + populated_keys, + stream); + + // prepare to launch kernel to do the actual aggregation + auto d_values = table_device_view::create(flattened_values, stream); + auto d_sparse_table = mutable_table_device_view::create(sparse_table, stream); + auto global_set_ref = global_set.ref(cuco::op::insert_and_find); + + thrust::for_each_n( + rmm::exec_policy_nosync(stream), + thrust::counting_iterator{0}, + num_rows, + hash::compute_single_pass_aggs_fn{ + global_set_ref, *d_values, *d_sparse_table, d_agg_kinds, row_bitmask, skip_rows_with_nulls}); + extract_populated_keys(global_set, populated_keys, stream); + + // Add results back to sparse_results cache + auto sparse_result_cols = sparse_table.release(); + for (size_t i = 0; i < aggregations.size(); i++) { + // Note that the cache will make a copy of this temporary aggregation + 
sparse_results->add_result( + flattened_values.column(i), *aggregations[i], std::move(sparse_result_cols[i])); + } + + return populated_keys; +} +} // namespace cudf::groupby::detail::hash diff --git a/cpp/src/groupby/hash/compute_global_memory_aggs.hpp b/cpp/src/groupby/hash/compute_global_memory_aggs.hpp new file mode 100644 index 00000000000..0777b9ffd93 --- /dev/null +++ b/cpp/src/groupby/hash/compute_global_memory_aggs.hpp @@ -0,0 +1,42 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include +#include +#include +#include + +#include +#include + +#include +#include + +namespace cudf::groupby::detail::hash { +template +rmm::device_uvector compute_global_memory_aggs( + cudf::size_type num_rows, + bool skip_rows_with_nulls, + bitmask_type const* row_bitmask, + cudf::table_view const& flattened_values, + cudf::aggregation::Kind const* d_agg_kinds, + std::vector const& agg_kinds, + SetType& global_set, + std::vector>& aggregations, + cudf::detail::result_cache* sparse_results, + rmm::cuda_stream_view stream); +} // namespace cudf::groupby::detail::hash diff --git a/cpp/src/groupby/hash/compute_global_memory_aggs_null.cu b/cpp/src/groupby/hash/compute_global_memory_aggs_null.cu new file mode 100644 index 00000000000..209e2b7f20a --- /dev/null +++ b/cpp/src/groupby/hash/compute_global_memory_aggs_null.cu @@ -0,0 +1,32 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. 
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "compute_global_memory_aggs.cuh" +#include "compute_global_memory_aggs.hpp" + +namespace cudf::groupby::detail::hash { +template rmm::device_uvector compute_global_memory_aggs( + cudf::size_type num_rows, + bool skip_rows_with_nulls, + bitmask_type const* row_bitmask, + cudf::table_view const& flattened_values, + cudf::aggregation::Kind const* d_agg_kinds, + std::vector const& agg_kinds, + nullable_global_set_t& global_set, + std::vector>& aggregations, + cudf::detail::result_cache* sparse_results, + rmm::cuda_stream_view stream); +} // namespace cudf::groupby::detail::hash diff --git a/cpp/src/groupby/hash/compute_groupby.cu b/cpp/src/groupby/hash/compute_groupby.cu index 59457bea694..e1dbf2a3d9e 100644 --- a/cpp/src/groupby/hash/compute_groupby.cu +++ b/cpp/src/groupby/hash/compute_groupby.cu @@ -14,8 +14,8 @@ * limitations under the License. 
*/ +#include "compute_aggregations.hpp" #include "compute_groupby.hpp" -#include "compute_single_pass_aggs.hpp" #include "helpers.cuh" #include "sparse_to_dense_results.hpp" @@ -29,7 +29,6 @@ #include #include -#include #include #include @@ -38,18 +37,6 @@ #include namespace cudf::groupby::detail::hash { -template -rmm::device_uvector extract_populated_keys(SetType const& key_set, - size_type num_keys, - rmm::cuda_stream_view stream) -{ - rmm::device_uvector populated_keys(num_keys, stream); - auto const keys_end = key_set.retrieve_all(populated_keys.begin(), stream.value()); - - populated_keys.resize(std::distance(populated_keys.begin(), keys_end), stream); - return populated_keys; -} - template std::unique_ptr compute_groupby(table_view const& keys, host_span requests, @@ -67,8 +54,8 @@ std::unique_ptr
compute_groupby(table_view const& keys, // column is indexed by the hash set cudf::detail::result_cache sparse_results(requests.size()); - auto const set = cuco::static_set{ - num_keys, + auto set = cuco::static_set{ + cuco::extent{num_keys}, cudf::detail::CUCO_DESIRED_LOAD_FACTOR, // 50% load factor cuco::empty_key{cudf::detail::CUDF_SIZE_TYPE_SENTINEL}, d_row_equal, @@ -84,17 +71,13 @@ std::unique_ptr
compute_groupby(table_view const& keys, : rmm::device_buffer{}; // Compute all single pass aggs first - compute_single_pass_aggs(num_keys, - skip_rows_with_nulls, - static_cast(row_bitmask.data()), - set.ref(cuco::insert_and_find), - requests, - &sparse_results, - stream); - - // Extract the populated indices from the hash set and create a gather map. - // Gathering using this map from sparse results will give dense results. - auto gather_map = extract_populated_keys(set, keys.num_rows(), stream); + auto gather_map = compute_aggregations(num_keys, + skip_rows_with_nulls, + static_cast(row_bitmask.data()), + set, + requests, + &sparse_results, + stream); // Compact all results from sparse_results and insert into cache sparse_to_dense_results(requests, @@ -114,12 +97,6 @@ std::unique_ptr
compute_groupby(table_view const& keys, mr); } -template rmm::device_uvector extract_populated_keys( - global_set_t const& key_set, size_type num_keys, rmm::cuda_stream_view stream); - -template rmm::device_uvector extract_populated_keys( - nullable_global_set_t const& key_set, size_type num_keys, rmm::cuda_stream_view stream); - template std::unique_ptr
compute_groupby( table_view const& keys, host_span requests, diff --git a/cpp/src/groupby/hash/compute_groupby.hpp b/cpp/src/groupby/hash/compute_groupby.hpp index 7bb3a60ff07..77243dc0a4f 100644 --- a/cpp/src/groupby/hash/compute_groupby.hpp +++ b/cpp/src/groupby/hash/compute_groupby.hpp @@ -22,28 +22,11 @@ #include #include -#include #include #include namespace cudf::groupby::detail::hash { -/** - * @brief Computes and returns a device vector containing all populated keys in - * `key_set`. - * - * @tparam SetType Type of key hash set - * - * @param key_set Key hash set - * @param num_keys Number of input keys - * @param stream CUDA stream used for device memory operations and kernel launches - * @return An array of unique keys contained in `key_set` - */ -template -rmm::device_uvector extract_populated_keys(SetType const& key_set, - size_type num_keys, - rmm::cuda_stream_view stream); - /** * @brief Computes groupby using hash table. * diff --git a/cpp/src/groupby/hash/compute_shared_memory_aggs.cu b/cpp/src/groupby/hash/compute_shared_memory_aggs.cu index 12c02a1865e..f0361ccced2 100644 --- a/cpp/src/groupby/hash/compute_shared_memory_aggs.cu +++ b/cpp/src/groupby/hash/compute_shared_memory_aggs.cu @@ -47,9 +47,8 @@ struct size_of_functor { /// Shared memory data alignment CUDF_HOST_DEVICE cudf::size_type constexpr ALIGNMENT = 8; -// Prepares shared memory data required by each output column, exits if -// no enough memory space to perform the shared memory aggregation for the -// current output column +// Allocates shared memory required for output columns. Exits if there is insufficient memory to +// perform shared memory aggregation for the current output column. 
__device__ void calculate_columns_to_aggregate(cudf::size_type& col_start, cudf::size_type& col_end, cudf::mutable_table_device_view output_values, @@ -74,9 +73,7 @@ __device__ void calculate_columns_to_aggregate(cudf::size_type& col_start, ALIGNMENT); auto const next_col_total_size = next_col_size + valid_col_size; - if (bytes_allocated + next_col_total_size > total_agg_size) { - CUDF_UNREACHABLE("Not enough memory for shared memory aggregations"); - } + if (bytes_allocated + next_col_total_size > total_agg_size) { break; } shmem_agg_res_offsets[col_end] = bytes_allocated; shmem_agg_mask_offsets[col_end] = bytes_allocated + next_col_size; @@ -275,7 +272,7 @@ CUDF_KERNEL void single_pass_shmem_aggs_kernel(cudf::size_type num_rows, } } // namespace -std::size_t available_shared_memory_size(cudf::size_type grid_size) +std::size_t get_available_shared_memory_size(cudf::size_type grid_size) { auto const active_blocks_per_sm = cudf::util::div_rounding_up_safe(grid_size, cudf::detail::num_multiprocessors()); @@ -302,11 +299,11 @@ void compute_shared_memory_aggs(cudf::size_type grid_size, { // For each aggregation, need one offset determining where the aggregation is // performed, another indicating the validity of the aggregation - auto const shmem_offsets_size = output_values.num_columns() * sizeof(cudf::size_type); + auto const offsets_size = compute_shmem_offsets_size(output_values.num_columns()); // The rest of shmem is utilized for the actual arrays in shmem - CUDF_EXPECTS(available_shmem_size > shmem_offsets_size * 2, + CUDF_EXPECTS(available_shmem_size > offsets_size * 2, "No enough space for shared memory aggregations"); - auto const shmem_agg_size = available_shmem_size - shmem_offsets_size * 2; + auto const shmem_agg_size = available_shmem_size - offsets_size * 2; single_pass_shmem_aggs_kernel<<>>( num_input_rows, row_bitmask, @@ -318,6 +315,6 @@ void compute_shared_memory_aggs(cudf::size_type grid_size, output_values, d_agg_kinds, shmem_agg_size, - 
shmem_offsets_size); + offsets_size); } } // namespace cudf::groupby::detail::hash diff --git a/cpp/src/groupby/hash/compute_shared_memory_aggs.hpp b/cpp/src/groupby/hash/compute_shared_memory_aggs.hpp index 653821fd53b..346956cdab0 100644 --- a/cpp/src/groupby/hash/compute_shared_memory_aggs.hpp +++ b/cpp/src/groupby/hash/compute_shared_memory_aggs.hpp @@ -22,8 +22,12 @@ #include namespace cudf::groupby::detail::hash { +std::size_t get_available_shared_memory_size(cudf::size_type grid_size); -std::size_t available_shared_memory_size(cudf::size_type grid_size); +std::size_t constexpr compute_shmem_offsets_size(cudf::size_type num_cols) +{ + return sizeof(cudf::size_type) * num_cols; +} void compute_shared_memory_aggs(cudf::size_type grid_size, std::size_t available_shmem_size, @@ -37,5 +41,4 @@ void compute_shared_memory_aggs(cudf::size_type grid_size, cudf::mutable_table_device_view output_values, cudf::aggregation::Kind const* d_agg_kinds, rmm::cuda_stream_view stream); - } // namespace cudf::groupby::detail::hash diff --git a/cpp/src/groupby/hash/compute_single_pass_aggs.cu b/cpp/src/groupby/hash/compute_single_pass_aggs.cu deleted file mode 100644 index e292543e6e9..00000000000 --- a/cpp/src/groupby/hash/compute_single_pass_aggs.cu +++ /dev/null @@ -1,99 +0,0 @@ -/* - * Copyright (c) 2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include "compute_single_pass_aggs.hpp" -#include "create_sparse_results_table.hpp" -#include "flatten_single_pass_aggs.hpp" -#include "helpers.cuh" -#include "single_pass_functors.cuh" -#include "var_hash_functor.cuh" - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include - -#include -#include -#include - -namespace cudf::groupby::detail::hash { -/** - * @brief Computes all aggregations from `requests` that require a single pass - * over the data and stores the results in `sparse_results` - */ -template -void compute_single_pass_aggs(int64_t num_keys, - bool skip_rows_with_nulls, - bitmask_type const* row_bitmask, - SetType set, - host_span requests, - cudf::detail::result_cache* sparse_results, - rmm::cuda_stream_view stream) -{ - // flatten the aggs to a table that can be operated on by aggregate_row - auto const [flattened_values, agg_kinds, aggs] = flatten_single_pass_aggs(requests); - - // make table that will hold sparse results - table sparse_table = create_sparse_results_table(flattened_values, agg_kinds, stream); - // prepare to launch kernel to do the actual aggregation - auto d_sparse_table = mutable_table_device_view::create(sparse_table, stream); - auto d_values = table_device_view::create(flattened_values, stream); - auto const d_aggs = cudf::detail::make_device_uvector_async( - agg_kinds, stream, cudf::get_current_device_resource_ref()); - - thrust::for_each_n( - rmm::exec_policy_nosync(stream), - thrust::make_counting_iterator(0), - num_keys, - hash::compute_single_pass_aggs_fn{ - set, *d_values, *d_sparse_table, d_aggs.data(), row_bitmask, skip_rows_with_nulls}); - // Add results back to sparse_results cache - auto sparse_result_cols = sparse_table.release(); - for (size_t i = 0; i < aggs.size(); i++) { - // Note that the cache will make a copy of this temporary aggregation - sparse_results->add_result( - flattened_values.column(i), *aggs[i], 
std::move(sparse_result_cols[i])); - } -} - -template void compute_single_pass_aggs>( - int64_t num_keys, - bool skip_rows_with_nulls, - bitmask_type const* row_bitmask, - hash_set_ref_t set, - host_span requests, - cudf::detail::result_cache* sparse_results, - rmm::cuda_stream_view stream); - -template void compute_single_pass_aggs>( - int64_t num_keys, - bool skip_rows_with_nulls, - bitmask_type const* row_bitmask, - nullable_hash_set_ref_t set, - host_span requests, - cudf::detail::result_cache* sparse_results, - rmm::cuda_stream_view stream); -} // namespace cudf::groupby::detail::hash diff --git a/cpp/src/groupby/hash/create_sparse_results_table.cu b/cpp/src/groupby/hash/create_sparse_results_table.cu index 22fa4fc584c..bc32e306b3f 100644 --- a/cpp/src/groupby/hash/create_sparse_results_table.cu +++ b/cpp/src/groupby/hash/create_sparse_results_table.cu @@ -15,53 +15,110 @@ */ #include "create_sparse_results_table.hpp" +#include "helpers.cuh" +#include "single_pass_functors.cuh" +#include #include #include -#include -#include -#include #include #include #include +#include + +#include #include #include #include namespace cudf::groupby::detail::hash { +template +void extract_populated_keys(SetType const& key_set, + rmm::device_uvector& populated_keys, + rmm::cuda_stream_view stream) +{ + auto const keys_end = key_set.retrieve_all(populated_keys.begin(), stream.value()); + + populated_keys.resize(std::distance(populated_keys.begin(), keys_end), stream); +} + // make table that will hold sparse results -cudf::table create_sparse_results_table(table_view const& flattened_values, - std::vector aggs, +template +cudf::table create_sparse_results_table(cudf::table_view const& flattened_values, + cudf::aggregation::Kind const* d_agg_kinds, + std::vector agg_kinds, + bool direct_aggregations, + GlobalSetType const& global_set, + rmm::device_uvector& populated_keys, rmm::cuda_stream_view stream) { // TODO single allocation - room for performance improvement - std::vector> 
sparse_columns; - sparse_columns.reserve(flattened_values.num_columns()); - std::transform( - flattened_values.begin(), - flattened_values.end(), - aggs.begin(), - std::back_inserter(sparse_columns), - [stream](auto const& col, auto const& agg) { - bool nullable = - (agg == aggregation::COUNT_VALID or agg == aggregation::COUNT_ALL) - ? false - : (col.has_nulls() or agg == aggregation::VARIANCE or agg == aggregation::STD); - auto mask_flag = (nullable) ? mask_state::ALL_NULL : mask_state::UNALLOCATED; + std::vector> sparse_columns; + std::transform(flattened_values.begin(), + flattened_values.end(), + agg_kinds.begin(), + std::back_inserter(sparse_columns), + [stream](auto const& col, auto const& agg) { + auto const nullable = + (agg == cudf::aggregation::COUNT_VALID or agg == cudf::aggregation::COUNT_ALL) + ? false + : (col.has_nulls() or agg == cudf::aggregation::VARIANCE or + agg == cudf::aggregation::STD); + auto const mask_flag = + (nullable) ? cudf::mask_state::ALL_NULL : cudf::mask_state::UNALLOCATED; + auto const col_type = cudf::is_dictionary(col.type()) + ? 
cudf::dictionary_column_view(col).keys().type() + : col.type(); + return make_fixed_width_column( + cudf::detail::target_type(col_type, agg), col.size(), mask_flag, stream); + }); + cudf::table sparse_table(std::move(sparse_columns)); + // If no direct aggregations, initialize the sparse table + // only for the keys inserted in global hash set + if (!direct_aggregations) { + auto d_sparse_table = cudf::mutable_table_device_view::create(sparse_table, stream); + extract_populated_keys(global_set, populated_keys, stream); + thrust::for_each_n( + rmm::exec_policy(stream), + thrust::make_counting_iterator(0), + populated_keys.size(), + initialize_sparse_table{populated_keys.data(), *d_sparse_table, d_agg_kinds}); + } + // Else initialize the whole table + else { + cudf::mutable_table_view sparse_table_view = sparse_table.mutable_view(); + cudf::detail::initialize_with_identity(sparse_table_view, agg_kinds, stream); + } + return sparse_table; +} - auto col_type = cudf::is_dictionary(col.type()) - ? 
cudf::dictionary_column_view(col).keys().type() - : col.type(); +template void extract_populated_keys( + global_set_t const& key_set, + rmm::device_uvector& populated_keys, + rmm::cuda_stream_view stream); - return make_fixed_width_column( - cudf::detail::target_type(col_type, agg), col.size(), mask_flag, stream); - }); +template void extract_populated_keys( + nullable_global_set_t const& key_set, + rmm::device_uvector& populated_keys, + rmm::cuda_stream_view stream); - table sparse_table(std::move(sparse_columns)); - mutable_table_view table_view = sparse_table.mutable_view(); - cudf::detail::initialize_with_identity(table_view, aggs, stream); - return sparse_table; -} +template cudf::table create_sparse_results_table( + cudf::table_view const& flattened_values, + cudf::aggregation::Kind const* d_agg_kinds, + std::vector agg_kinds, + bool direct_aggregations, + global_set_t const& global_set, + rmm::device_uvector& populated_keys, + rmm::cuda_stream_view stream); + +template cudf::table create_sparse_results_table( + cudf::table_view const& flattened_values, + cudf::aggregation::Kind const* d_agg_kinds, + std::vector agg_kinds, + bool direct_aggregations, + nullable_global_set_t const& global_set, + rmm::device_uvector& populated_keys, + rmm::cuda_stream_view stream); } // namespace cudf::groupby::detail::hash diff --git a/cpp/src/groupby/hash/create_sparse_results_table.hpp b/cpp/src/groupby/hash/create_sparse_results_table.hpp index c1d4e0d3f20..8155ce852e0 100644 --- a/cpp/src/groupby/hash/create_sparse_results_table.hpp +++ b/cpp/src/groupby/hash/create_sparse_results_table.hpp @@ -15,18 +15,41 @@ */ #pragma once +#include #include #include #include #include #include +#include #include namespace cudf::groupby::detail::hash { +/** + * @brief Computes and returns a device vector containing all populated keys in + * `key_set`. 
+ * + * @tparam SetType Type of the key hash set + * + * @param key_set Key hash set + * @param populated_keys Array of unique keys + * @param stream CUDA stream used for device memory operations and kernel launches + * @return An array of unique keys contained in `key_set` + */ +template +void extract_populated_keys(SetType const& key_set, + rmm::device_uvector& populated_keys, + rmm::cuda_stream_view stream); + // make table that will hold sparse results -cudf::table create_sparse_results_table(table_view const& flattened_values, - std::vector aggs_kinds, +template +cudf::table create_sparse_results_table(cudf::table_view const& flattened_values, + cudf::aggregation::Kind const* d_agg_kinds, + std::vector agg_kinds, + bool direct_aggregations, + GlobalSetType const& global_set, + rmm::device_uvector& populated_keys, rmm::cuda_stream_view stream); } // namespace cudf::groupby::detail::hash diff --git a/cpp/src/groupby/hash/helpers.cuh b/cpp/src/groupby/hash/helpers.cuh index 00836567b4f..f950e03e0fb 100644 --- a/cpp/src/groupby/hash/helpers.cuh +++ b/cpp/src/groupby/hash/helpers.cuh @@ -23,8 +23,6 @@ #include namespace cudf::groupby::detail::hash { -// TODO: similar to `contains_table`, using larger CG size like 2 or 4 for nested -// types and `cg_size = 1`for flat data to improve performance /// Number of threads to handle each input element CUDF_HOST_DEVICE auto constexpr GROUPBY_CG_SIZE = 1; diff --git a/cpp/src/groupby/hash/single_pass_functors.cuh b/cpp/src/groupby/hash/single_pass_functors.cuh index 28a5b578e00..048c9252773 100644 --- a/cpp/src/groupby/hash/single_pass_functors.cuh +++ b/cpp/src/groupby/hash/single_pass_functors.cuh @@ -15,12 +15,14 @@ */ #pragma once -#include +#include "helpers.cuh" + #include -#include -#include +#include +#include +#include -#include +#include namespace cudf::groupby::detail::hash { // TODO: TO BE REMOVED issue tracked via #17171 @@ -104,6 +106,114 @@ struct initialize_shmem { } }; +template +struct 
initialize_target_element_gmem { + __device__ void operator()(cudf::mutable_column_device_view, cudf::size_type) const noexcept + { + CUDF_UNREACHABLE("Invalid source type and aggregation combination."); + } +}; + +template +struct initialize_target_element_gmem< + Target, + k, + std::enable_if_t() && cudf::is_fixed_width() && + !cudf::is_fixed_point()>> { + __device__ void operator()(cudf::mutable_column_device_view target, + cudf::size_type target_index) const noexcept + { + using DeviceType = cudf::device_storage_type_t; + target.element(target_index) = get_identity(); + } +}; + +template +struct initialize_target_element_gmem< + Target, + k, + std::enable_if_t() && cudf::is_fixed_point()>> { + __device__ void operator()(cudf::mutable_column_device_view target, + cudf::size_type target_index) const noexcept + { + using DeviceType = cudf::device_storage_type_t; + target.element(target_index) = get_identity(); + } +}; + +struct initialize_gmem { + template + __device__ void operator()(cudf::mutable_column_device_view target, + cudf::size_type target_index) const noexcept + { + initialize_target_element_gmem{}(target, target_index); + } +}; + +struct initialize_sparse_table { + cudf::size_type const* row_indices; + cudf::mutable_table_device_view sparse_table; + cudf::aggregation::Kind const* __restrict__ aggs; + initialize_sparse_table(cudf::size_type const* row_indices, + cudf::mutable_table_device_view sparse_table, + cudf::aggregation::Kind const* aggs) + : row_indices(row_indices), sparse_table(sparse_table), aggs(aggs) + { + } + __device__ void operator()(cudf::size_type i) + { + auto key_idx = row_indices[i]; + for (auto col_idx = 0; col_idx < sparse_table.num_columns(); col_idx++) { + cudf::detail::dispatch_type_and_aggregation(sparse_table.column(col_idx).type(), + aggs[col_idx], + initialize_gmem{}, + sparse_table.column(col_idx), + key_idx); + } + } +}; + +template +struct global_memory_fallback_fn { + SetType set; + cudf::table_device_view input_values; 
+ cudf::mutable_table_device_view output_values; + cudf::aggregation::Kind const* __restrict__ aggs; + cudf::size_type* block_cardinality; + cudf::size_type stride; + bitmask_type const* __restrict__ row_bitmask; + bool skip_rows_with_nulls; + + global_memory_fallback_fn(SetType set, + cudf::table_device_view input_values, + cudf::mutable_table_device_view output_values, + cudf::aggregation::Kind const* aggs, + cudf::size_type* block_cardinality, + cudf::size_type stride, + bitmask_type const* row_bitmask, + bool skip_rows_with_nulls) + : set(set), + input_values(input_values), + output_values(output_values), + aggs(aggs), + block_cardinality(block_cardinality), + stride(stride), + row_bitmask(row_bitmask), + skip_rows_with_nulls(skip_rows_with_nulls) + { + } + + __device__ void operator()(cudf::size_type i) + { + auto const block_id = (i % stride) / GROUPBY_BLOCK_SIZE; + if (block_cardinality[block_id] >= GROUPBY_CARDINALITY_THRESHOLD and + (not skip_rows_with_nulls or cudf::bit_is_set(row_bitmask, i))) { + auto const result = set.insert_and_find(i); + cudf::detail::aggregate_row(output_values, *result.first, input_values, i, aggs); + } + } +}; + /** * @brief Computes single-pass aggregations and store results into a sparse `output_values` table, * and populate `set` with indices of unique keys From d295f17f4468004367fe60088854ac5513519d32 Mon Sep 17 00:00:00 2001 From: Matthew Murray <41342305+Matt711@users.noreply.github.com> Date: Fri, 8 Nov 2024 16:22:08 -0500 Subject: [PATCH 02/19] Add `cudf::calendrical_month_sequence` to pylibcudf (#17277) Apart of #15162. Also adds tests for `pylibcudf.filling`. 
Authors: - Matthew Murray (https://github.com/Matt711) Approvers: - Matthew Roeschke (https://github.com/mroeschke) URL: https://github.com/rapidsai/cudf/pull/17277 --- python/cudf/cudf/_lib/datetime.pyx | 21 ++--- python/pylibcudf/pylibcudf/filling.pxd | 6 ++ python/pylibcudf/pylibcudf/filling.pyx | 37 ++++++++ .../pylibcudf/pylibcudf/tests/test_filling.py | 91 +++++++++++++++++++ 4 files changed, 140 insertions(+), 15 deletions(-) create mode 100644 python/pylibcudf/pylibcudf/tests/test_filling.py diff --git a/python/cudf/cudf/_lib/datetime.pyx b/python/cudf/cudf/_lib/datetime.pyx index 2c7a585f4b1..7e8f29dac93 100644 --- a/python/cudf/cudf/_lib/datetime.pyx +++ b/python/cudf/cudf/_lib/datetime.pyx @@ -4,13 +4,7 @@ import warnings from cudf.core.buffer import acquire_spill_lock -from libcpp.memory cimport unique_ptr -from libcpp.utility cimport move - cimport pylibcudf.libcudf.datetime as libcudf_datetime -from pylibcudf.libcudf.column.column cimport column -from pylibcudf.libcudf.filling cimport calendrical_month_sequence -from pylibcudf.libcudf.scalar.scalar cimport scalar from pylibcudf.libcudf.types cimport size_type from pylibcudf.datetime import DatetimeComponent, RoundingFrequency @@ -143,20 +137,17 @@ def is_leap_year(Column col): @acquire_spill_lock() def date_range(DeviceScalar start, size_type n, offset): - cdef unique_ptr[column] c_result cdef size_type months = ( offset.kwds.get("years", 0) * 12 + offset.kwds.get("months", 0) ) - - cdef const scalar* c_start = start.get_raw_ptr() - with nogil: - c_result = move(calendrical_month_sequence( + return Column.from_pylibcudf( + plc.filling.calendrical_month_sequence( n, - c_start[0], - months - )) - return Column.from_unique_ptr(move(c_result)) + start.c_value, + months, + ) + ) @acquire_spill_lock() diff --git a/python/pylibcudf/pylibcudf/filling.pxd b/python/pylibcudf/pylibcudf/filling.pxd index b9345f8cd42..56aef086e1b 100644 --- a/python/pylibcudf/pylibcudf/filling.pxd +++ 
b/python/pylibcudf/pylibcudf/filling.pxd @@ -33,3 +33,9 @@ cpdef Table repeat( Table input_table, ColumnOrSize count ) + +cpdef Column calendrical_month_sequence( + size_type n, + Scalar init, + size_type months, +) diff --git a/python/pylibcudf/pylibcudf/filling.pyx b/python/pylibcudf/pylibcudf/filling.pyx index a47004a1e42..313605ead16 100644 --- a/python/pylibcudf/pylibcudf/filling.pyx +++ b/python/pylibcudf/pylibcudf/filling.pyx @@ -9,6 +9,7 @@ from pylibcudf.libcudf.filling cimport ( fill_in_place as cpp_fill_in_place, repeat as cpp_repeat, sequence as cpp_sequence, + calendrical_month_sequence as cpp_calendrical_month_sequence ) from pylibcudf.libcudf.table.table cimport table from pylibcudf.libcudf.types cimport size_type @@ -164,3 +165,39 @@ cpdef Table repeat( count ) return Table.from_libcudf(move(result)) + + +cpdef Column calendrical_month_sequence( + size_type n, + Scalar init, + size_type months, +): + + """Fill destination column from begin to end with value. + + For details, see :cpp:func:`calendrical_month_sequence`. + + Parameters + ---------- + n : size_type + Number of timestamps to generate + init : Scalar + The initial timestamp + months : size_type + Months to increment + + Returns + ------- + pylibcudf.Column + Timestamps column with sequences of months + """ + + cdef unique_ptr[column] c_result + + with nogil: + c_result = cpp_calendrical_month_sequence( + n, + dereference(init.c_obj), + months + ) + return Column.from_libcudf(move(c_result)) diff --git a/python/pylibcudf/pylibcudf/tests/test_filling.py b/python/pylibcudf/pylibcudf/tests/test_filling.py new file mode 100644 index 00000000000..91c7e42a0a0 --- /dev/null +++ b/python/pylibcudf/pylibcudf/tests/test_filling.py @@ -0,0 +1,91 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. 
+ +from datetime import datetime + +import pyarrow as pa +import pytest +from utils import assert_column_eq, assert_table_eq + +import pylibcudf as plc + + +@pytest.fixture +def pa_col(): + return pa.array([2, 3, 5, 7, 11]) + + +@pytest.fixture +def pa_table(): + pa_col = pa.array([1, 2, 3]) + return pa.table([pa_col], names=["a"]) + + +def test_fill(pa_col): + result = plc.filling.fill( + plc.interop.from_arrow(pa_col), + 1, + 3, + plc.interop.from_arrow(pa.scalar(5)), + ) + expect = pa.array([2, 5, 5, 7, 11]) + assert_column_eq(result, expect) + + +def test_fill_in_place(pa_col): + result = plc.interop.from_arrow(pa_col) + plc.filling.fill_in_place( + result, + 1, + 3, + plc.interop.from_arrow(pa.scalar(5)), + ) + expect = pa.array([2, 5, 5, 7, 11]) + assert_column_eq(result, expect) + + +def test_sequence(): + size = 5 + init_scalar = plc.interop.from_arrow(pa.scalar(10)) + step_scalar = plc.interop.from_arrow(pa.scalar(2)) + result = plc.filling.sequence( + size, + init_scalar, + step_scalar, + ) + expect = pa.array([10, 12, 14, 16, 18]) + assert_column_eq(result, expect) + + +def test_repeat_with_count_int(pa_table): + input_table = plc.interop.from_arrow(pa_table) + count = 2 + result = plc.filling.repeat(input_table, count) + expect = pa.table([[1, 1, 2, 2, 3, 3]], names=["a"]) + assert_table_eq(expect, result) + + +def test_repeat_with_count_column(pa_table): + input_table = plc.interop.from_arrow(pa_table) + count = plc.interop.from_arrow(pa.array([1, 2, 3])) + result = plc.filling.repeat(input_table, count) + expect = pa.table([[1] + [2] * 2 + [3] * 3], names=["a"]) + assert_table_eq(expect, result) + + +def test_calendrical_month_sequence(): + n = 5 + init_date = datetime(2020, 1, 31) + init = plc.interop.from_arrow( + pa.scalar(init_date, type=pa.timestamp("ms")) + ) + months = 1 + result = plc.filling.calendrical_month_sequence(n, init, months) + expected_dates = [ + datetime(2020, 1, 31), + datetime(2020, 2, 29), + datetime(2020, 3, 31), + 
datetime(2020, 4, 30), + datetime(2020, 5, 31), + ] + expect = pa.array(expected_dates, type=pa.timestamp("ms")) + assert_column_eq(result, expect) From fea46cd869bac0e312a898ca959783aa8db2ad5f Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Fri, 8 Nov 2024 14:14:55 -0800 Subject: [PATCH 03/19] Add read_parquet_metadata to pylibcudf (#17245) Contributes to https://github.com/rapidsai/cudf/issues/15162 Authors: - Matthew Roeschke (https://github.com/mroeschke) - Vyas Ramasubramani (https://github.com/vyasr) Approvers: - Lawrence Mitchell (https://github.com/wence-) URL: https://github.com/rapidsai/cudf/pull/17245 --- docs/cudf/source/conf.py | 2 + .../api_docs/pylibcudf/io/index.rst | 1 + .../pylibcudf/io/parquet_metadata.rst | 6 + python/cudf/cudf/_lib/io/utils.pxd | 1 - python/cudf/cudf/_lib/io/utils.pyx | 56 ----- python/cudf/cudf/_lib/parquet.pyx | 67 ++---- python/cudf/cudf/tests/test_parquet.py | 4 +- python/pylibcudf/pylibcudf/io/CMakeLists.txt | 4 +- python/pylibcudf/pylibcudf/io/__init__.pxd | 12 +- python/pylibcudf/pylibcudf/io/__init__.py | 13 +- .../pylibcudf/io/parquet_metadata.pxd | 51 +++++ .../pylibcudf/io/parquet_metadata.pyx | 207 ++++++++++++++++++ .../pylibcudf/libcudf/io/parquet_metadata.pxd | 4 +- 13 files changed, 318 insertions(+), 110 deletions(-) create mode 100644 docs/cudf/source/user_guide/api_docs/pylibcudf/io/parquet_metadata.rst create mode 100644 python/pylibcudf/pylibcudf/io/parquet_metadata.pxd create mode 100644 python/pylibcudf/pylibcudf/io/parquet_metadata.pyx diff --git a/docs/cudf/source/conf.py b/docs/cudf/source/conf.py index 5942cc16850..0d463b918d3 100644 --- a/docs/cudf/source/conf.py +++ b/docs/cudf/source/conf.py @@ -554,6 +554,8 @@ def on_missing_reference(app, env, node, contnode): nitpick_ignore = [ + # Erroneously warned in ParquetColumnSchema.name + ("py:class", "unicode"), ("py:class", "SeriesOrIndex"), ("py:class", "Dtype"), # The following are erroneously 
warned due to diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/io/index.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/io/index.rst index cd5c5a5f77e..1c1c8040972 100644 --- a/docs/cudf/source/user_guide/api_docs/pylibcudf/io/index.rst +++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/io/index.rst @@ -19,5 +19,6 @@ I/O Functions csv json parquet + parquet_metadata text timezone diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/io/parquet_metadata.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/io/parquet_metadata.rst new file mode 100644 index 00000000000..fce964f9714 --- /dev/null +++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/io/parquet_metadata.rst @@ -0,0 +1,6 @@ +================ +Parquet Metadata +================ + +.. automodule:: pylibcudf.io.parquet_metadata + :members: diff --git a/python/cudf/cudf/_lib/io/utils.pxd b/python/cudf/cudf/_lib/io/utils.pxd index 76a6e32fde0..96504ebdd66 100644 --- a/python/cudf/cudf/_lib/io/utils.pxd +++ b/python/cudf/cudf/_lib/io/utils.pxd @@ -13,7 +13,6 @@ from pylibcudf.libcudf.io.types cimport ( from cudf._lib.column cimport Column -cdef source_info make_source_info(list src) except* cdef sink_info make_sinks_info( list src, vector[unique_ptr[data_sink]] & data) except* cdef sink_info make_sink_info(src, unique_ptr[data_sink] & data) except* diff --git a/python/cudf/cudf/_lib/io/utils.pyx b/python/cudf/cudf/_lib/io/utils.pyx index 564daefbae2..f23980b387a 100644 --- a/python/cudf/cudf/_lib/io/utils.pyx +++ b/python/cudf/cudf/_lib/io/utils.pyx @@ -7,76 +7,20 @@ from libcpp.string cimport string from libcpp.utility cimport move from libcpp.vector cimport vector -from pylibcudf.io.datasource cimport Datasource from pylibcudf.libcudf.io.data_sink cimport data_sink -from pylibcudf.libcudf.io.datasource cimport datasource from pylibcudf.libcudf.io.types cimport ( column_name_info, - host_buffer, sink_info, - source_info, ) from cudf._lib.column cimport Column import codecs -import errno 
import io import os from cudf.core.dtypes import StructDtype - -# Converts the Python source input to libcudf IO source_info -# with the appropriate type and source values -cdef source_info make_source_info(list src) except*: - if not src: - raise ValueError("Need to pass at least one source") - - cdef const unsigned char[::1] c_buffer - cdef vector[host_buffer] c_host_buffers - cdef vector[string] c_files - cdef Datasource csrc - cdef vector[datasource*] c_datasources - empty_buffer = False - if isinstance(src[0], bytes): - empty_buffer = True - for buffer in src: - if (len(buffer) > 0): - c_buffer = buffer - c_host_buffers.push_back(host_buffer(&c_buffer[0], - c_buffer.shape[0])) - empty_buffer = False - elif isinstance(src[0], io.BytesIO): - for bio in src: - c_buffer = bio.getbuffer() # check if empty? - c_host_buffers.push_back(host_buffer(&c_buffer[0], - c_buffer.shape[0])) - # Otherwise src is expected to be a numeric fd, string path, or PathLike. - # TODO (ptaylor): Might need to update this check if accepted input types - # change when UCX and/or cuStreamz support is added. - elif isinstance(src[0], Datasource): - for csrc in src: - c_datasources.push_back(csrc.get_datasource()) - return source_info(c_datasources) - elif isinstance(src[0], (int, float, complex, basestring, os.PathLike)): - # If source is a file, return source_info where type=FILEPATH - if not all(os.path.isfile(file) for file in src): - raise FileNotFoundError(errno.ENOENT, - os.strerror(errno.ENOENT), - src) - - files = [ str(elem).encode() for elem in src] - c_files = files - return source_info(c_files) - else: - raise TypeError("Unrecognized input type: {}".format(type(src[0]))) - - if empty_buffer is True: - c_host_buffers.push_back(host_buffer(NULL, 0)) - - return source_info(c_host_buffers) - # Converts the Python sink input to libcudf IO sink_info. 
cdef sink_info make_sinks_info( list src, vector[unique_ptr[data_sink]] & sink diff --git a/python/cudf/cudf/_lib/parquet.pyx b/python/cudf/cudf/_lib/parquet.pyx index 1212637d330..d4bd0cd306c 100644 --- a/python/cudf/cudf/_lib/parquet.pyx +++ b/python/cudf/cudf/_lib/parquet.pyx @@ -27,7 +27,6 @@ from libcpp cimport bool from libcpp.map cimport map from libcpp.memory cimport make_unique, unique_ptr from libcpp.string cimport string -from libcpp.unordered_map cimport unordered_map from libcpp.utility cimport move from libcpp.vector cimport vector @@ -41,12 +40,7 @@ from pylibcudf.libcudf.io.parquet cimport ( parquet_writer_options, write_parquet as parquet_writer, ) -from pylibcudf.libcudf.io.parquet_metadata cimport ( - parquet_metadata, - read_parquet_metadata as parquet_metadata_reader, -) from pylibcudf.libcudf.io.types cimport ( - source_info, sink_info, column_in_metadata, table_input_metadata, @@ -62,7 +56,6 @@ from cudf._lib.column cimport Column from cudf._lib.io.utils cimport ( add_df_col_struct_names, make_sinks_info, - make_source_info, ) from cudf._lib.utils cimport table_view_from_table @@ -373,7 +366,7 @@ cpdef read_parquet(filepaths_or_buffers, columns=None, row_groups=None, nrows=nrows, skip_rows=skip_rows) return df -cpdef read_parquet_metadata(filepaths_or_buffers): +cpdef read_parquet_metadata(list filepaths_or_buffers): """ Cython function to call into libcudf API, see `read_parquet_metadata`. 
@@ -382,56 +375,40 @@ cpdef read_parquet_metadata(filepaths_or_buffers): cudf.io.parquet.read_parquet cudf.io.parquet.to_parquet """ - cdef source_info source = make_source_info(filepaths_or_buffers) - - args = move(source) - - cdef parquet_metadata c_result - - # Read Parquet metadata - with nogil: - c_result = move(parquet_metadata_reader(args)) - - # access and return results - num_rows = c_result.num_rows() - num_rowgroups = c_result.num_rowgroups() - - # extract row group metadata and sanitize keys - row_group_metadata = [{k.decode(): v for k, v in metadata} - for metadata in c_result.rowgroup_metadata()] + parquet_metadata = plc.io.parquet_metadata.read_parquet_metadata( + plc.io.SourceInfo(filepaths_or_buffers) + ) # read all column names including index column, if any - col_names = [info.name().decode() for info in c_result.schema().root().children()] - - # access the Parquet file_footer to find the index - index_col = None - cdef unordered_map[string, string] file_footer = c_result.metadata() + col_names = [info.name() for info in parquet_metadata.schema().root().children()] - # get index column name(s) - index_col_names = None - json_str = file_footer[b'pandas'].decode('utf-8') - meta = None + index_col_names = set() + json_str = parquet_metadata.metadata()['pandas'] if json_str != "": meta = json.loads(json_str) file_is_range_index, index_col, _ = _parse_metadata(meta) - if not file_is_range_index and index_col is not None \ - and index_col_names is None: - index_col_names = {} + if ( + not file_is_range_index + and index_col is not None + ): + columns = meta['columns'] for idx_col in index_col: - for c in meta['columns']: + for c in columns: if c['field_name'] == idx_col: - index_col_names[idx_col] = c['name'] + index_col_names.add(idx_col) # remove the index column from the list of column names # only if index_col_names is not None - if index_col_names is not None: + if len(index_col_names) >= 0: col_names = [name for name in col_names if name not in 
index_col_names] - # num_columns = length of list(col_names) - num_columns = len(col_names) - - # return the metadata - return num_rows, num_rowgroups, col_names, num_columns, row_group_metadata + return ( + parquet_metadata.num_rows(), + parquet_metadata.num_rowgroups(), + col_names, + len(col_names), + parquet_metadata.rowgroup_metadata() + ) @acquire_spill_lock() diff --git a/python/cudf/cudf/tests/test_parquet.py b/python/cudf/cudf/tests/test_parquet.py index c9ce24d2a5b..3c4398a87de 100644 --- a/python/cudf/cudf/tests/test_parquet.py +++ b/python/cudf/cudf/tests/test_parquet.py @@ -405,14 +405,14 @@ def test_parquet_range_index_pandas_metadata(tmpdir, pandas_compat, as_bytes): assert_eq(expect, got) -def test_parquet_read_metadata(tmpdir, pdf): +def test_parquet_read_metadata(tmp_path, pdf): if len(pdf) > 100: pytest.skip("Skipping long setup test") def num_row_groups(rows, group_size): return max(1, (rows + (group_size - 1)) // group_size) - fname = tmpdir.join("metadata.parquet") + fname = tmp_path / "metadata.parquet" row_group_size = 5 pdf.to_parquet(fname, compression="snappy", row_group_size=row_group_size) diff --git a/python/pylibcudf/pylibcudf/io/CMakeLists.txt b/python/pylibcudf/pylibcudf/io/CMakeLists.txt index f78d97ef4d1..664faef718f 100644 --- a/python/pylibcudf/pylibcudf/io/CMakeLists.txt +++ b/python/pylibcudf/pylibcudf/io/CMakeLists.txt @@ -12,8 +12,8 @@ # the License. 
# ============================================================================= -set(cython_sources avro.pyx csv.pyx datasource.pyx json.pyx orc.pyx parquet.pyx timezone.pyx - text.pyx types.pyx +set(cython_sources avro.pyx csv.pyx datasource.pyx json.pyx orc.pyx parquet.pyx + parquet_metadata.pyx text.pyx timezone.pyx types.pyx ) set(linked_libraries cudf::cudf) diff --git a/python/pylibcudf/pylibcudf/io/__init__.pxd b/python/pylibcudf/pylibcudf/io/__init__.pxd index 6ba7f78a013..663804e714d 100644 --- a/python/pylibcudf/pylibcudf/io/__init__.pxd +++ b/python/pylibcudf/pylibcudf/io/__init__.pxd @@ -1,5 +1,15 @@ # Copyright (c) 2024, NVIDIA CORPORATION. # CSV is removed since it is def not cpdef (to force kw-only arguments) -from . cimport avro, datasource, json, orc, parquet, timezone, text, types +from . cimport ( + avro, + datasource, + json, + orc, + parquet, + parquet_metadata, + text, + timezone, + types, +) from .types cimport SourceInfo, TableWithMetadata diff --git a/python/pylibcudf/pylibcudf/io/__init__.py b/python/pylibcudf/pylibcudf/io/__init__.py index 0fc77dd0f57..9e8e0f6e080 100644 --- a/python/pylibcudf/pylibcudf/io/__init__.py +++ b/python/pylibcudf/pylibcudf/io/__init__.py @@ -1,4 +1,15 @@ # Copyright (c) 2024, NVIDIA CORPORATION. -from . import avro, csv, datasource, json, orc, parquet, timezone, text, types +from . import ( + avro, + csv, + datasource, + json, + orc, + parquet, + parquet_metadata, + text, + timezone, + types, +) from .types import SinkInfo, SourceInfo, TableWithMetadata diff --git a/python/pylibcudf/pylibcudf/io/parquet_metadata.pxd b/python/pylibcudf/pylibcudf/io/parquet_metadata.pxd new file mode 100644 index 00000000000..e421a64adc8 --- /dev/null +++ b/python/pylibcudf/pylibcudf/io/parquet_metadata.pxd @@ -0,0 +1,51 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. 
+ +from pylibcudf.io.types cimport SourceInfo +from pylibcudf.libcudf.io.parquet_metadata cimport( + parquet_metadata, + parquet_schema, + parquet_column_schema, +) + +cdef class ParquetColumnSchema: + cdef parquet_column_schema column_schema + + @staticmethod + cdef from_column_schema(parquet_column_schema column_schema) + + cpdef str name(self) + + cpdef int num_children(self) + + cpdef ParquetColumnSchema child(self, int idx) + + cpdef list children(self) + + +cdef class ParquetSchema: + cdef parquet_schema schema + + @staticmethod + cdef from_schema(parquet_schema schema) + + cpdef ParquetColumnSchema root(self) + + +cdef class ParquetMetadata: + cdef parquet_metadata meta + + @staticmethod + cdef from_metadata(parquet_metadata meta) + + cpdef ParquetSchema schema(self) + + cpdef int num_rows(self) + + cpdef int num_rowgroups(self) + + cpdef dict metadata(self) + + cpdef list rowgroup_metadata(self) + + +cpdef ParquetMetadata read_parquet_metadata(SourceInfo src_info) diff --git a/python/pylibcudf/pylibcudf/io/parquet_metadata.pyx b/python/pylibcudf/pylibcudf/io/parquet_metadata.pyx new file mode 100644 index 00000000000..352905ff0f8 --- /dev/null +++ b/python/pylibcudf/pylibcudf/io/parquet_metadata.pyx @@ -0,0 +1,207 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from pylibcudf.io.types cimport SourceInfo +from pylibcudf.libcudf.io cimport parquet_metadata as cpp_parquet_metadata + + +cdef class ParquetColumnSchema: + """ + Schema of a parquet column, including the nested columns. + + Parameters + ---------- + parquet_column_schema + """ + def __init__(self): + raise ValueError("Construct ParquetColumnSchema with from_column_schema.") + + @staticmethod + cdef from_column_schema(cpp_parquet_metadata.parquet_column_schema column_schema): + cdef ParquetColumnSchema result = ParquetColumnSchema.__new__( + ParquetColumnSchema + ) + result.column_schema = column_schema + return result + + cpdef str name(self): + """ + Returns parquet column name; can be empty. 
+ + Returns + ------- + str + Column name + """ + return self.column_schema.name().decode() + + cpdef int num_children(self): + """ + Returns the number of child columns. + + Returns + ------- + int + Children count + """ + return self.column_schema.num_children() + + cpdef ParquetColumnSchema child(self, int idx): + """ + Returns schema of the child with the given index. + + Parameters + ---------- + idx : int + Child Index + + Returns + ------- + ParquetColumnSchema + Child schema + """ + return ParquetColumnSchema.from_column_schema(self.column_schema.child(idx)) + + cpdef list children(self): + """ + Returns schemas of all child columns. + + Returns + ------- + list[ParquetColumnSchema] + Child schemas. + """ + cdef cpp_parquet_metadata.parquet_column_schema child + return [ + ParquetColumnSchema.from_column_schema(child) + for child in self.column_schema.children() + ] + + +cdef class ParquetSchema: + """ + Schema of a parquet file. + + Parameters + ---------- + parquet_schema + """ + + def __init__(self): + raise ValueError("Construct ParquetSchema with from_schema.") + + @staticmethod + cdef from_schema(cpp_parquet_metadata.parquet_schema schema): + cdef ParquetSchema result = ParquetSchema.__new__(ParquetSchema) + result.schema = schema + return result + + cpdef ParquetColumnSchema root(self): + """ + Returns the schema of the struct column that contains all columns as fields. + + Returns + ------- + ParquetColumnSchema + Root column schema + """ + return ParquetColumnSchema.from_column_schema(self.schema.root()) + + +cdef class ParquetMetadata: + """ + Information about content of a parquet file. 
+ + Parameters + ---------- + parquet_metadata + """ + + def __init__(self): + raise ValueError("Construct ParquetMetadata with from_metadata.") + + @staticmethod + cdef from_metadata(cpp_parquet_metadata.parquet_metadata meta): + cdef ParquetMetadata result = ParquetMetadata.__new__(ParquetMetadata) + result.meta = meta + return result + + cpdef ParquetSchema schema(self): + """ + Returns the parquet schema. + + Returns + ------- + ParquetSchema + Parquet schema + """ + return ParquetSchema.from_schema(self.meta.schema()) + + cpdef int num_rows(self): + """ + Returns the number of rows of the root column. + + Returns + ------- + int + Number of rows + """ + return self.meta.num_rows() + + cpdef int num_rowgroups(self): + """ + Returns the number of rowgroups in the file. + + Returns + ------- + int + Number of row groups. + """ + return self.meta.num_rowgroups() + + cpdef dict metadata(self): + """ + Returns the key-value metadata in the file footer. + + Returns + ------- + dict[bytes, bytes] + Key value metadata as a map. + """ + return {key.decode(): val.decode() for key, val in self.meta.metadata()} + + cpdef list rowgroup_metadata(self): + """ + Returns the row group metadata in the file footer. + + Returns + ------- + list[dict[str, int]] + Vector of row group metadata as maps. + """ + return [ + {key.decode(): val for key, val in metadata} + for metadata in self.meta.rowgroup_metadata() + ] + + +cpdef ParquetMetadata read_parquet_metadata(SourceInfo src_info): + """ + Reads metadata of parquet dataset. + + Parameters + ---------- + src_info : SourceInfo + Dataset source. + + Returns + ------- + ParquetMetadata + Parquet_metadata with parquet schema, number of rows, + number of row groups and key-value metadata. 
+ """ + cdef cpp_parquet_metadata.parquet_metadata c_result + + with nogil: + c_result = cpp_parquet_metadata.read_parquet_metadata(src_info.c_obj) + + return ParquetMetadata.from_metadata(c_result) diff --git a/python/pylibcudf/pylibcudf/libcudf/io/parquet_metadata.pxd b/python/pylibcudf/pylibcudf/libcudf/io/parquet_metadata.pxd index 8e6da56c9a6..b0ce13e4492 100644 --- a/python/pylibcudf/pylibcudf/libcudf/io/parquet_metadata.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/io/parquet_metadata.pxd @@ -1,11 +1,11 @@ # Copyright (c) 2024, NVIDIA CORPORATION. -cimport pylibcudf.libcudf.io.types as cudf_io_types from libc.stdint cimport int64_t from libcpp.string cimport string from libcpp.unordered_map cimport unordered_map from libcpp.vector cimport vector from pylibcudf.libcudf.types cimport size_type +from pylibcudf.libcudf.io.types cimport source_info cdef extern from "cudf/io/parquet_metadata.hpp" namespace "cudf::io" nogil: @@ -28,4 +28,4 @@ cdef extern from "cudf/io/parquet_metadata.hpp" namespace "cudf::io" nogil: unordered_map[string, string] metadata() except+ vector[unordered_map[string, int64_t]] rowgroup_metadata() except+ - cdef parquet_metadata read_parquet_metadata(cudf_io_types.source_info src) except+ + cdef parquet_metadata read_parquet_metadata(source_info src_info) except+ From db69c52d9140d909aeb4af3a5b3db1e7c44c92bc Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Fri, 8 Nov 2024 14:46:27 -0800 Subject: [PATCH 04/19] Follow up making Python tests more deterministic (#17272) Addressing comments in https://github.com/rapidsai/cudf/pull/17008/files#r1823318321 and https://github.com/rapidsai/cudf/pull/17008/files#r1823318898 Didn't touch the `_fuzz_testing` directory because maybe we don't want that to be deterministic? 
Authors: - Matthew Roeschke (https://github.com/mroeschke) Approvers: - Matthew Murray (https://github.com/Matt711) - GALI PREM SAGAR (https://github.com/galipremsagar) - James Lamb (https://github.com/jameslamb) - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/cudf/pull/17272 --- .pre-commit-config.yaml | 4 ++-- python/cudf/cudf/tests/test_parquet.py | 11 +++-------- .../dask_cudf/tests/test_reductions.py | 17 +---------------- python/dask_cudf/dask_cudf/tests/utils.py | 2 +- 4 files changed, 7 insertions(+), 27 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index f5234f58efe..6d070a8a14c 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -90,8 +90,8 @@ repos: entry: | # Check for usage of default_rng without seeding default_rng\(\)| - # Check for usage of np.random.seed - np.random.seed\( + # Check for usage of np.random.seed (NPY002 only disallows this being called) + np.random.seed language: pygrep types: [python] - id: cmake-format diff --git a/python/cudf/cudf/tests/test_parquet.py b/python/cudf/cudf/tests/test_parquet.py index 3c4398a87de..96512dacb69 100644 --- a/python/cudf/cudf/tests/test_parquet.py +++ b/python/cudf/cudf/tests/test_parquet.py @@ -193,11 +193,6 @@ def parquet_file(request, tmp_path_factory, pdf): return fname -@pytest.fixture(scope="module") -def rdg_seed(): - return int(os.environ.get("TEST_CUDF_RDG_SEED", "42")) - - def make_pdf(nrows, ncolumns=1, nvalids=0, dtype=np.int64): test_pdf = pd.DataFrame( [list(range(ncolumns * i, ncolumns * (i + 1))) for i in range(nrows)], @@ -431,7 +426,7 @@ def num_row_groups(rows, group_size): assert a == b -def test_parquet_read_filtered(tmpdir, rdg_seed): +def test_parquet_read_filtered(tmpdir): # Generate data fname = tmpdir.join("filtered.parquet") dg.generate( @@ -455,13 +450,13 @@ def test_parquet_read_filtered(tmpdir, rdg_seed): dg.ColumnParameters( 40, 0.2, - lambda: np.random.default_rng(seed=None).integers( + 
lambda: np.random.default_rng(seed=0).integers( 0, 100, size=40 ), True, ), ], - seed=rdg_seed, + seed=42, ), format={"name": "parquet", "row_group_size": 64}, ) diff --git a/python/dask_cudf/dask_cudf/tests/test_reductions.py b/python/dask_cudf/dask_cudf/tests/test_reductions.py index 4351b672151..f11a5252080 100644 --- a/python/dask_cudf/dask_cudf/tests/test_reductions.py +++ b/python/dask_cudf/dask_cudf/tests/test_reductions.py @@ -1,7 +1,5 @@ # Copyright (c) 2021-2024, NVIDIA CORPORATION. -import numpy as np -import pandas as pd import pytest import dask @@ -10,20 +8,7 @@ import cudf import dask_cudf - - -def _make_random_frame(nelem, npartitions=2): - rng = np.random.default_rng(seed=0) - df = pd.DataFrame( - { - "x": rng.integers(0, 5, size=nelem), - "y": rng.normal(loc=1.0, scale=1.0, size=nelem), - } - ) - gdf = cudf.DataFrame.from_pandas(df) - dgf = dask_cudf.from_cudf(gdf, npartitions=npartitions) - return df, dgf - +from dask_cudf.tests.utils import _make_random_frame _reducers = ["sum", "count", "mean", "var", "std", "min", "max"] diff --git a/python/dask_cudf/dask_cudf/tests/utils.py b/python/dask_cudf/dask_cudf/tests/utils.py index a9f61f75762..b44b3f939e7 100644 --- a/python/dask_cudf/dask_cudf/tests/utils.py +++ b/python/dask_cudf/dask_cudf/tests/utils.py @@ -19,7 +19,7 @@ def _make_random_frame(nelem, npartitions=2, include_na=False): - rng = np.random.default_rng(seed=None) + rng = np.random.default_rng(seed=0) df = pd.DataFrame( {"x": rng.random(size=nelem), "y": rng.random(size=nelem)} ) From 0fc5fab825ece5b605d84a3d5ef04d7dde31b39f Mon Sep 17 00:00:00 2001 From: Graham Markall <535640+gmarkall@users.noreply.github.com> Date: Sat, 9 Nov 2024 00:01:26 +0000 Subject: [PATCH 05/19] Use numba-cuda<0.0.18 (#17280) Numba-cuda 0.0.18 (not yet released) contains some changes that might break pynvjitlink patching. 
In order to avoid breaking RAPIDS CI whilst working through that after releasing numba-cuda 0.0.18 but before the next pynvjitlink, this PR makes use of numba-cuda 0.0.17 or less a requirement. Authors: - Graham Markall (https://github.com/gmarkall) - Vyas Ramasubramani (https://github.com/vyasr) Approvers: - https://github.com/brandon-b-miller - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/cudf/pull/17280 --- conda/environments/all_cuda-118_arch-x86_64.yaml | 2 +- conda/environments/all_cuda-125_arch-x86_64.yaml | 2 +- conda/recipes/cudf/meta.yaml | 2 +- dependencies.yaml | 2 +- python/cudf/pyproject.toml | 2 +- python/dask_cudf/pyproject.toml | 2 +- 6 files changed, 6 insertions(+), 6 deletions(-) diff --git a/conda/environments/all_cuda-118_arch-x86_64.yaml b/conda/environments/all_cuda-118_arch-x86_64.yaml index 6fbdd4ba568..01764411346 100644 --- a/conda/environments/all_cuda-118_arch-x86_64.yaml +++ b/conda/environments/all_cuda-118_arch-x86_64.yaml @@ -55,7 +55,7 @@ dependencies: - nbsphinx - ninja - notebook -- numba-cuda>=0.0.13 +- numba-cuda>=0.0.13,<0.0.18 - numpy>=1.23,<3.0a0 - numpydoc - nvcc_linux-64=11.8 diff --git a/conda/environments/all_cuda-125_arch-x86_64.yaml b/conda/environments/all_cuda-125_arch-x86_64.yaml index 4aafa12fdae..9074e6332d9 100644 --- a/conda/environments/all_cuda-125_arch-x86_64.yaml +++ b/conda/environments/all_cuda-125_arch-x86_64.yaml @@ -54,7 +54,7 @@ dependencies: - nbsphinx - ninja - notebook -- numba-cuda>=0.0.13 +- numba-cuda>=0.0.13,<0.0.18 - numpy>=1.23,<3.0a0 - numpydoc - nvcomp==4.1.0.6 diff --git a/conda/recipes/cudf/meta.yaml b/conda/recipes/cudf/meta.yaml index 2aafcae072d..04904e95630 100644 --- a/conda/recipes/cudf/meta.yaml +++ b/conda/recipes/cudf/meta.yaml @@ -80,7 +80,7 @@ requirements: - typing_extensions >=4.0.0 - pandas >=2.0,<2.2.4dev0 - cupy >=12.0.0 - - numba-cuda >=0.0.13 + - numba-cuda >=0.0.13,<0.0.18 - numpy >=1.23,<3.0a0 - pyarrow>=14.0.0,<18.0.0a0 - libcudf ={{ 
version }} diff --git a/dependencies.yaml b/dependencies.yaml index 59f8f2fda49..e47e0c7523c 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -675,7 +675,7 @@ dependencies: - output_types: [conda, requirements, pyproject] packages: - cachetools - - &numba-cuda-dep numba-cuda>=0.0.13 + - &numba-cuda-dep numba-cuda>=0.0.13,<0.0.18 - nvtx>=0.2.1 - packaging - rich diff --git a/python/cudf/pyproject.toml b/python/cudf/pyproject.toml index 41dedc4ff20..ca6dbddfecc 100644 --- a/python/cudf/pyproject.toml +++ b/python/cudf/pyproject.toml @@ -24,7 +24,7 @@ dependencies = [ "cupy-cuda11x>=12.0.0", "fsspec>=0.6.0", "libcudf==24.12.*,>=0.0.0a0", - "numba-cuda>=0.0.13", + "numba-cuda>=0.0.13,<0.0.18", "numpy>=1.23,<3.0a0", "nvtx>=0.2.1", "packaging", diff --git a/python/dask_cudf/pyproject.toml b/python/dask_cudf/pyproject.toml index c7e4cbc45ea..c4bfc3054bc 100644 --- a/python/dask_cudf/pyproject.toml +++ b/python/dask_cudf/pyproject.toml @@ -46,7 +46,7 @@ cudf = "dask_cudf.backends:CudfDXBackendEntrypoint" [project.optional-dependencies] test = [ "dask-cuda==24.12.*,>=0.0.0a0", - "numba-cuda>=0.0.13", + "numba-cuda>=0.0.13,<0.0.18", "pytest-cov", "pytest-xdist", "pytest<8", From e399e9596d9fe1cf2df0ff1270e2c0c764331b8e Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Fri, 8 Nov 2024 16:23:25 -0800 Subject: [PATCH 06/19] Use pylibcudf enums in cudf Python quantile (#17287) Shouldn't need to use the "private" `pylibcudf.libcudf` types anymore now that the Python side enums are exposed Authors: - Matthew Roeschke (https://github.com/mroeschke) Approvers: - Matthew Murray (https://github.com/Matt711) URL: https://github.com/rapidsai/cudf/pull/17287 --- python/cudf/cudf/_lib/quantiles.pyx | 28 +++--------------- python/cudf/cudf/_lib/types.pxd | 5 ---- python/cudf/cudf/_lib/types.pyx | 44 ----------------------------- python/cudf/cudf/core/frame.py | 12 ++++---- 4 files changed, 10 insertions(+), 79 deletions(-) diff 
--git a/python/cudf/cudf/_lib/quantiles.pyx b/python/cudf/cudf/_lib/quantiles.pyx index 7666b7ff8da..509cfe5e9f8 100644 --- a/python/cudf/cudf/_lib/quantiles.pyx +++ b/python/cudf/cudf/_lib/quantiles.pyx @@ -6,14 +6,6 @@ from libcpp cimport bool from libcpp.vector cimport vector from cudf._lib.column cimport Column -from cudf._lib.types cimport ( - underlying_type_t_interpolation, - underlying_type_t_sorted, -) - -from cudf._lib.types import Interpolation - -from pylibcudf.libcudf.types cimport interpolation, sorted from cudf._lib.utils cimport columns_from_pylibcudf_table @@ -28,17 +20,13 @@ def quantile( Column ordered_indices, bool exact, ): - cdef interpolation c_interp = ( - Interpolation[interp.upper()] - ) - return Column.from_pylibcudf( plc.quantiles.quantile( input.to_pylibcudf(mode="read"), q, - c_interp, + plc.types.Interpolation[interp.upper()], ordered_indices.to_pylibcudf(mode="read"), - exact + exact ) ) @@ -51,22 +39,14 @@ def quantile_table( list column_order, list null_precedence, ): - - cdef interpolation c_interp = ( - interp - ) - cdef sorted c_is_input_sorted = ( - is_input_sorted - ) - return columns_from_pylibcudf_table( plc.quantiles.quantiles( plc.Table([ c.to_pylibcudf(mode="read") for c in source_columns ]), q, - c_interp, - c_is_input_sorted, + interp, + is_input_sorted, column_order, null_precedence ) diff --git a/python/cudf/cudf/_lib/types.pxd b/python/cudf/cudf/_lib/types.pxd index 4fd3d31841e..c2b760490c1 100644 --- a/python/cudf/cudf/_lib/types.pxd +++ b/python/cudf/cudf/_lib/types.pxd @@ -7,12 +7,7 @@ cimport pylibcudf.libcudf.types as libcudf_types from pylibcudf.libcudf.column.column_view cimport column_view from pylibcudf.libcudf.lists.lists_column_view cimport lists_column_view -ctypedef bool underlying_type_t_order -ctypedef bool underlying_type_t_null_order -ctypedef bool underlying_type_t_sorted -ctypedef int32_t underlying_type_t_interpolation ctypedef int32_t underlying_type_t_type_id -ctypedef bool 
underlying_type_t_null_policy cdef dtype_from_column_view(column_view cv) diff --git a/python/cudf/cudf/_lib/types.pyx b/python/cudf/cudf/_lib/types.pyx index 861bb063707..f169ea12b10 100644 --- a/python/cudf/cudf/_lib/types.pyx +++ b/python/cudf/cudf/_lib/types.pyx @@ -11,12 +11,6 @@ cimport pylibcudf.libcudf.types as libcudf_types from pylibcudf.libcudf.column.column_view cimport column_view from pylibcudf.libcudf.lists.lists_column_view cimport lists_column_view -from cudf._lib.types cimport ( - underlying_type_t_interpolation, - underlying_type_t_order, - underlying_type_t_sorted, -) - import pylibcudf import cudf @@ -151,44 +145,6 @@ datetime_unit_map = { size_type_dtype = LIBCUDF_TO_SUPPORTED_NUMPY_TYPES[pylibcudf.types.SIZE_TYPE_ID] -class Interpolation(IntEnum): - LINEAR = ( - libcudf_types.interpolation.LINEAR - ) - LOWER = ( - libcudf_types.interpolation.LOWER - ) - HIGHER = ( - libcudf_types.interpolation.HIGHER - ) - MIDPOINT = ( - libcudf_types.interpolation.MIDPOINT - ) - NEAREST = ( - libcudf_types.interpolation.NEAREST - ) - - -class Order(IntEnum): - ASCENDING = libcudf_types.order.ASCENDING - DESCENDING = libcudf_types.order.DESCENDING - - -class Sorted(IntEnum): - YES = libcudf_types.sorted.YES - NO = libcudf_types.sorted.NO - - -class NullOrder(IntEnum): - BEFORE = libcudf_types.null_order.BEFORE - AFTER = libcudf_types.null_order.AFTER - - -class NullHandling(IntEnum): - INCLUDE = libcudf_types.null_policy.INCLUDE - EXCLUDE = libcudf_types.null_policy.EXCLUDE - - cdef dtype_from_lists_column_view(column_view cv): # lists_column_view have no default constructor, so we heap # allocate it to get around Cython's limitation of requiring diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index 205edd91d9d..2b4a17f9559 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -16,6 +16,8 @@ import pyarrow as pa from typing_extensions import Self +import pylibcudf as plc + import cudf from cudf import 
_lib as libcudf from cudf.api.types import is_dtype_equal, is_scalar @@ -789,15 +791,13 @@ def _quantile_table( column_order=(), null_precedence=(), ): - interpolation = libcudf.types.Interpolation[interpolation] + interpolation = plc.types.Interpolation[interpolation] - is_sorted = libcudf.types.Sorted["YES" if is_sorted else "NO"] + is_sorted = plc.types.Sorted["YES" if is_sorted else "NO"] - column_order = [libcudf.types.Order[key] for key in column_order] + column_order = [plc.types.Order[key] for key in column_order] - null_precedence = [ - libcudf.types.NullOrder[key] for key in null_precedence - ] + null_precedence = [plc.types.NullOrder[key] for key in null_precedence] return self._from_columns_like_self( libcudf.quantiles.quantile_table( From 7a499f645c040c300e466721a39be65e3e1b054e Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Fri, 8 Nov 2024 17:38:47 -0800 Subject: [PATCH 07/19] Use more pylibcudf Python enums in cudf._lib (#17288) Similar to https://github.com/rapidsai/cudf/pull/17287. 
Also remove a `plc` naming shadowing Authors: - Matthew Roeschke (https://github.com/mroeschke) Approvers: - Matthew Murray (https://github.com/Matt711) URL: https://github.com/rapidsai/cudf/pull/17288 --- python/cudf/cudf/_lib/groupby.pyx | 7 ++----- python/cudf/cudf/_lib/json.pyx | 2 +- python/cudf/cudf/_lib/lists.pyx | 8 ++++++-- 3 files changed, 9 insertions(+), 8 deletions(-) diff --git a/python/cudf/cudf/_lib/groupby.pyx b/python/cudf/cudf/_lib/groupby.pyx index c199ed96d4f..1ce6dfab15e 100644 --- a/python/cudf/cudf/_lib/groupby.pyx +++ b/python/cudf/cudf/_lib/groupby.pyx @@ -18,7 +18,6 @@ from cudf._lib.utils cimport columns_from_pylibcudf_table from cudf._lib.scalar import as_device_scalar -from pylibcudf.libcudf.replace cimport replace_policy from pylibcudf.libcudf.scalar.scalar cimport scalar import pylibcudf @@ -244,13 +243,11 @@ cdef class GroupBy: return columns_from_pylibcudf_table(shifts), columns_from_pylibcudf_table(keys) def replace_nulls(self, list values, object method): - # TODO: This is using an enum (replace_policy) that has not been exposed in - # pylibcudf yet. We'll want to fix that import once it is in pylibcudf. 
_, replaced = self._groupby.replace_nulls( pylibcudf.table.Table([c.to_pylibcudf(mode="read") for c in values]), [ - replace_policy.PRECEDING - if method == 'ffill' else replace_policy.FOLLOWING + pylibcudf.replace.ReplacePolicy.PRECEDING + if method == 'ffill' else pylibcudf.replace.ReplacePolicy.FOLLOWING ] * len(values), ) diff --git a/python/cudf/cudf/_lib/json.pyx b/python/cudf/cudf/_lib/json.pyx index fb149603960..7dc9cd01a00 100644 --- a/python/cudf/cudf/_lib/json.pyx +++ b/python/cudf/cudf/_lib/json.pyx @@ -104,7 +104,7 @@ cpdef read_json(object filepaths_or_buffers, ) df = cudf.DataFrame._from_data( *_data_from_columns( - columns=[Column.from_pylibcudf(plc) for plc in res_cols], + columns=[Column.from_pylibcudf(col) for col in res_cols], column_names=res_col_names, index_names=None ) diff --git a/python/cudf/cudf/_lib/lists.pyx b/python/cudf/cudf/_lib/lists.pyx index 12432ac6d5d..a91d44274e5 100644 --- a/python/cudf/cudf/_lib/lists.pyx +++ b/python/cudf/cudf/_lib/lists.pyx @@ -4,7 +4,7 @@ from cudf.core.buffer import acquire_spill_lock from libcpp cimport bool -from pylibcudf.libcudf.types cimport null_order, size_type +from pylibcudf.libcudf.types cimport size_type from cudf._lib.column cimport Column from cudf._lib.utils cimport columns_from_pylibcudf_table @@ -49,7 +49,11 @@ def sort_lists(Column col, bool ascending, str na_position): plc.lists.sort_lists( col.to_pylibcudf(mode="read"), ascending, - null_order.BEFORE if na_position == "first" else null_order.AFTER, + ( + plc.types.NullOrder.BEFORE + if na_position == "first" + else plc.types.NullOrder.AFTER + ), False, ) ) From 5cbdcd07a71fd63813840fdf270d7aec62f1c844 Mon Sep 17 00:00:00 2001 From: Shruti Shivakumar Date: Fri, 8 Nov 2024 21:53:45 -0500 Subject: [PATCH 08/19] Expose delimiter character in JSON reader options to JSON reader APIs (#17266) Fixes #17261 Removes delimiter symbol group from whitespace normalization FST since it is run post-tokenization. 
Authors: - Shruti Shivakumar (https://github.com/shrshi) - Nghia Truong (https://github.com/ttnghia) - Karthikeyan (https://github.com/karthikeyann) Approvers: - Nghia Truong (https://github.com/ttnghia) - David Wendt (https://github.com/davidwendt) - Karthikeyan (https://github.com/karthikeyann) URL: https://github.com/rapidsai/cudf/pull/17266 --- cpp/include/cudf/io/detail/json.hpp | 8 +-- cpp/src/io/json/json_normalization.cu | 49 ++++++++++--------- cpp/src/io/json/read_json.cu | 3 +- .../io/json/json_quote_normalization_test.cpp | 21 ++++++-- 4 files changed, 49 insertions(+), 32 deletions(-) diff --git a/cpp/include/cudf/io/detail/json.hpp b/cpp/include/cudf/io/detail/json.hpp index 940d03cdb41..2e2ac43d6fe 100644 --- a/cpp/include/cudf/io/detail/json.hpp +++ b/cpp/include/cudf/io/detail/json.hpp @@ -57,11 +57,13 @@ void write_json(data_sink* sink, /** * @brief Normalize single quotes to double quotes using FST * - * @param indata Input device buffer - * @param stream CUDA stream used for device memory operations and kernel launches - * @param mr Device memory resource to use for device memory allocation + * @param indata Input device buffer + * @param delimiter Line-separating delimiter character in JSONL inputs + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource to use for device memory allocation */ void normalize_single_quotes(datasource::owning_buffer& indata, + char delimiter, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr); diff --git a/cpp/src/io/json/json_normalization.cu b/cpp/src/io/json/json_normalization.cu index 34a87918e57..1b61be20202 100644 --- a/cpp/src/io/json/json_normalization.cu +++ b/cpp/src/io/json/json_normalization.cu @@ -58,7 +58,7 @@ enum class dfa_symbol_group_id : uint32_t { DOUBLE_QUOTE_CHAR, ///< Quote character SG: " SINGLE_QUOTE_CHAR, ///< Quote character SG: ' ESCAPE_CHAR, ///< Escape character SG: '\' - NEWLINE_CHAR, ///< Newline character 
SG: '\n' + DELIM_CHAR, ///< Delimiter character SG OTHER_SYMBOLS, ///< SG implicitly matching all other characters NUM_SYMBOL_GROUPS ///< Total number of symbol groups }; @@ -72,13 +72,17 @@ constexpr auto TT_SEC = dfa_states::TT_SEC; constexpr auto TT_NUM_STATES = static_cast(dfa_states::TT_NUM_STATES); constexpr auto NUM_SYMBOL_GROUPS = static_cast(dfa_symbol_group_id::NUM_SYMBOL_GROUPS); -// The i-th string representing all the characters of a symbol group -std::array, NUM_SYMBOL_GROUPS - 1> const qna_sgs{ - {{'\"'}, {'\''}, {'\\'}, {'\n'}}}; +auto get_sgid_lut(SymbolT delim) +{ + // The i-th string representing all the characters of a symbol group + std::array, NUM_SYMBOL_GROUPS - 1> symbol_groups{ + {{'\"'}, {'\''}, {'\\'}, {delim}}}; + return symbol_groups; +} // Transition table std::array, TT_NUM_STATES> const qna_state_tt{{ - /* IN_STATE " ' \ \n OTHER */ + /* IN_STATE " ' \ OTHER */ /* TT_OOS */ {{TT_DQS, TT_SQS, TT_OOS, TT_OOS, TT_OOS}}, /* TT_DQS */ {{TT_OOS, TT_DQS, TT_DEC, TT_OOS, TT_DQS}}, /* TT_SQS */ {{TT_SQS, TT_OOS, TT_SEC, TT_OOS, TT_SQS}}, @@ -199,28 +203,26 @@ struct TransduceToNormalizedQuotes { namespace normalize_whitespace { +// We do not need a symbol group for the delimiter character since whitespace normalization +// now occurs after tokenization. 
enum class dfa_symbol_group_id : uint32_t { DOUBLE_QUOTE_CHAR, ///< Quote character SG: " ESCAPE_CHAR, ///< Escape character SG: '\\' - NEWLINE_CHAR, ///< Newline character SG: '\n' WHITESPACE_SYMBOLS, ///< Whitespace characters SG: '\t' or ' ' OTHER_SYMBOLS, ///< SG implicitly matching all other characters NUM_SYMBOL_GROUPS ///< Total number of symbol groups }; // Alias for readability of symbol group ids constexpr auto NUM_SYMBOL_GROUPS = static_cast(dfa_symbol_group_id::NUM_SYMBOL_GROUPS); -// The i-th string representing all the characters of a symbol group -std::array, NUM_SYMBOL_GROUPS - 1> const wna_sgs{ - {{'"'}, {'\\'}, {'\n'}, {' ', '\t'}}}; + +std::array, NUM_SYMBOL_GROUPS - 1> const wna_sgs{{{'"'}, {'\\'}, {' ', '\t'}}}; /** * -------- FST states --------- * ----------------------------- * TT_OOS | Out-of-string state handling whitespace and non-whitespace chars outside double - * | quotes as well as any other character not enclosed by a string. Also handles - * | newline character present within a string - * TT_DQS | Double-quoted string state handling all characters within double quotes except - * | newline character + * | quotes as well as any other character not enclosed by a string. + * TT_DQS | Double-quoted string state handling all characters within double quotes * TT_DEC | State handling escaped characters inside double-quoted string. Note that this * | state is necessary to process escaped double-quote characters. Without this * | state, whitespaces following escaped double quotes inside strings may be removed. 
@@ -235,10 +237,10 @@ constexpr auto TT_NUM_STATES = static_cast(dfa_states::TT_NUM_STATES); // Transition table std::array, TT_NUM_STATES> const wna_state_tt{ - {/* IN_STATE " \ \n OTHER */ - /* TT_OOS */ {{TT_DQS, TT_OOS, TT_OOS, TT_OOS, TT_OOS}}, - /* TT_DQS */ {{TT_OOS, TT_DEC, TT_OOS, TT_DQS, TT_DQS}}, - /* TT_DEC */ {{TT_DQS, TT_DQS, TT_DQS, TT_DQS, TT_DQS}}}}; + {/* IN_STATE " \ OTHER */ + /* TT_OOS */ {{TT_DQS, TT_OOS, TT_OOS, TT_OOS}}, + /* TT_DQS */ {{TT_OOS, TT_DEC, TT_DQS, TT_DQS}}, + /* TT_DEC */ {{TT_DQS, TT_DQS, TT_DQS, TT_DQS}}}}; // The DFA's starting state constexpr StateT start_state = static_cast(TT_OOS); @@ -302,18 +304,19 @@ struct TransduceToNormalizedWS { namespace detail { void normalize_single_quotes(datasource::owning_buffer& indata, + char delimiter, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); static constexpr std::int32_t min_out = 0; static constexpr std::int32_t max_out = 2; - auto parser = - fst::detail::make_fst(fst::detail::make_symbol_group_lut(normalize_quotes::qna_sgs), - fst::detail::make_transition_table(normalize_quotes::qna_state_tt), - fst::detail::make_translation_functor( - normalize_quotes::TransduceToNormalizedQuotes{}), - stream); + auto parser = fst::detail::make_fst( + fst::detail::make_symbol_group_lut(normalize_quotes::get_sgid_lut(delimiter)), + fst::detail::make_transition_table(normalize_quotes::qna_state_tt), + fst::detail::make_translation_functor( + normalize_quotes::TransduceToNormalizedQuotes{}), + stream); rmm::device_buffer outbuf(indata.size() * 2, stream, mr); cudf::detail::device_scalar outbuf_size(stream, mr); diff --git a/cpp/src/io/json/read_json.cu b/cpp/src/io/json/read_json.cu index 2bc15ea19cb..279f5e71351 100644 --- a/cpp/src/io/json/read_json.cu +++ b/cpp/src/io/json/read_json.cu @@ -248,7 +248,8 @@ table_with_metadata read_batch(host_span> sources, // If input JSON buffer has single quotes and option to normalize single quotes is enabled, // invoke 
pre-processing FST if (reader_opts.is_enabled_normalize_single_quotes()) { - normalize_single_quotes(bufview, stream, cudf::get_current_device_resource_ref()); + normalize_single_quotes( + bufview, reader_opts.get_delimiter(), stream, cudf::get_current_device_resource_ref()); } auto buffer = diff --git a/cpp/tests/io/json/json_quote_normalization_test.cpp b/cpp/tests/io/json/json_quote_normalization_test.cpp index c8c2d18903f..0fbd7da7f4d 100644 --- a/cpp/tests/io/json/json_quote_normalization_test.cpp +++ b/cpp/tests/io/json/json_quote_normalization_test.cpp @@ -34,7 +34,9 @@ // Base test fixture for tests struct JsonNormalizationTest : public cudf::test::BaseFixture {}; -void run_test(std::string const& host_input, std::string const& expected_host_output) +void run_test(std::string const& host_input, + std::string const& expected_host_output, + char delimiter = '\n') { // RMM memory resource std::shared_ptr rsc = @@ -46,7 +48,7 @@ void run_test(std::string const& host_input, std::string const& expected_host_ou // Preprocessing FST cudf::io::datasource::owning_buffer device_data(std::move(device_input)); - cudf::io::json::detail::normalize_single_quotes(device_data, stream_view, rsc.get()); + cudf::io::json::detail::normalize_single_quotes(device_data, delimiter, stream_view, rsc.get()); std::string preprocessed_host_output(device_data.size(), 0); CUDF_CUDA_TRY(cudaMemcpyAsync(preprocessed_host_output.data(), @@ -172,6 +174,13 @@ TEST_F(JsonNormalizationTest, GroundTruth_QuoteNormalization_Invalid_WrongBraces run_test(input, output); } +TEST_F(JsonNormalizationTest, GroundTruth_QuoteNormalization_NonNewlineDelimiter) +{ + std::string input{"{\"a\": \"1\n2\"}z{\'a\': 12}"}; + std::string output{"{\"a\": \"1\n2\"}z{\"a\": 12}"}; + run_test(input, output, 'z'); +} + TEST_F(JsonNormalizationTest, ReadJsonOption) { // RMM memory resource @@ -179,22 +188,24 @@ TEST_F(JsonNormalizationTest, ReadJsonOption) std::make_shared(); // Test input - std::string const host_input 
 = R"({"A":'TEST"'})"; + std::string const host_input = R"({"a": "1\n2"}h{'a': 12})"; cudf::io::json_reader_options input_options = cudf::io::json_reader_options::builder( cudf::io::source_info{host_input.data(), host_input.size()}) .lines(true) + .delimiter('h') .normalize_single_quotes(true); cudf::io::table_with_metadata processed_table = cudf::io::read_json(input_options, cudf::test::get_default_stream(), rsc.get()); // Expected table - std::string const expected_input = R"({"A":"TEST\""})"; + std::string const expected_input = R"({"a": "1\n2"}h{"a": 12})"; cudf::io::json_reader_options expected_input_options = cudf::io::json_reader_options::builder( cudf::io::source_info{expected_input.data(), expected_input.size()}) - .lines(true); + .lines(true) + .delimiter('h'); cudf::io::table_with_metadata expected_table = cudf::io::read_json(expected_input_options, cudf::test::get_default_stream(), rsc.get()); From 84743c3d413f386077ff6f5f162e5d5159449ccd Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Mon, 11 Nov 2024 18:19:28 -0600 Subject: [PATCH 09/19] Fix `Dataframe.__setitem__` slow-downs (#17222) Fixes: #17140 This PR fixes slow-downs in `DataFrame.__setitem__` by properly passing in CPU objects where needed instead of passing a GPU object and then failing and performing a GPU -> CPU transfer. `DataFrame.__setitem__` first argument can be a column(pd.Index), in our fast path this will be converted to `cudf.Index` and thus there will be failure from cudf side and then the transfer to CPU + slow-path executes, this is the primary reason for slowdown. This PR maintains a dict mapping of such special functions where we shouldn't be converting the objects to fast path. 
Authors: - GALI PREM SAGAR (https://github.com/galipremsagar) Approvers: - Matthew Murray (https://github.com/Matt711) URL: https://github.com/rapidsai/cudf/pull/17222 --- python/cudf/cudf/pandas/fast_slow_proxy.py | 49 ++++++++++++++++++- .../cudf_pandas_tests/test_cudf_pandas.py | 23 +++++++++ 2 files changed, 71 insertions(+), 1 deletion(-) diff --git a/python/cudf/cudf/pandas/fast_slow_proxy.py b/python/cudf/cudf/pandas/fast_slow_proxy.py index 99c0cb82f41..9768a6c4a2f 100644 --- a/python/cudf/cudf/pandas/fast_slow_proxy.py +++ b/python/cudf/cudf/pandas/fast_slow_proxy.py @@ -33,6 +33,20 @@ def call_operator(fn, args, kwargs): "EXECUTE_SLOW": 0x0571B0, } +# This is a dict of functions that are known to have arguments that +# need to be transformed from fast to slow only. i.e., Some cudf functions +# error on passing a device object but don't error on passing a host object. +# For example: DataFrame.__setitem__(arg, value) errors on passing a +# cudf.Index object but doesn't error on passing a pd.Index object. +# Hence we need to transform the arg from fast to slow only. So, we use +# a dictionary like: +# {"DataFrame.__setitem__": {0}} +# where the keys are the function names and the values are the indices +# (0-based) of the arguments that need to be transformed. + +_SPECIAL_FUNCTIONS_ARGS_MAP = { + "DataFrame.__setitem__": {0}, +} _WRAPPER_ASSIGNMENTS = tuple( attr @@ -875,6 +889,10 @@ def __name__(self, value): pass setattr(self._fsproxy_slow, "__name__", value) + @property + def _customqualname(self): + return self._fsproxy_slow.__qualname__ + def _assert_fast_slow_eq(left, right): if _is_final_type(type(left)) or type(left) in NUMPY_TYPES: @@ -1011,7 +1029,36 @@ def _transform_arg( # use __reduce_ex__ instead... 
if type(arg) is tuple: # Must come first to avoid infinite recursion - return tuple(_transform_arg(a, attribute_name, seen) for a in arg) + if ( + len(arg) > 0 + and isinstance(arg[0], _MethodProxy) + and arg[0]._customqualname in _SPECIAL_FUNCTIONS_ARGS_MAP + ): + indices_map = _SPECIAL_FUNCTIONS_ARGS_MAP[ + arg[0]._customqualname + ] + method_proxy, original_args, original_kwargs = arg + + original_args = tuple( + _transform_arg(a, "_fsproxy_slow", seen) + if i - 1 in indices_map + else _transform_arg(a, attribute_name, seen) + for i, a in enumerate(original_args) + ) + original_kwargs = _transform_arg( + original_kwargs, attribute_name, seen + ) + return tuple( + ( + _transform_arg(method_proxy, attribute_name, seen), + original_args, + original_kwargs, + ) + ) + else: + return tuple( + _transform_arg(a, attribute_name, seen) for a in arg + ) elif hasattr(arg, "__getnewargs_ex__"): # Partial implementation of to reconstruct with # transformed pieces diff --git a/python/cudf/cudf_pandas_tests/test_cudf_pandas.py b/python/cudf/cudf_pandas_tests/test_cudf_pandas.py index e260b448219..d48fbad0ec3 100644 --- a/python/cudf/cudf_pandas_tests/test_cudf_pandas.py +++ b/python/cudf/cudf_pandas_tests/test_cudf_pandas.py @@ -12,6 +12,7 @@ import pickle import subprocess import tempfile +import time import types from io import BytesIO, StringIO @@ -1795,3 +1796,25 @@ def test_iter_doesnot_raise(monkeypatch): monkeycontext.setenv("CUDF_PANDAS_FAIL_ON_FALLBACK", "True") for _ in s: pass + + +def test_dataframe_setitem_slowdown(): + # We are explicitly testing the slowdown of the setitem operation + df = xpd.DataFrame( + {"a": [1, 2, 3] * 100000, "b": [1, 2, 3] * 100000} + ).astype("float64") + df = xpd.DataFrame({"a": df["a"].repeat(1000), "b": df["b"].repeat(1000)}) + new_df = df + 1 + start_time = time.time() + df[df.columns] = new_df + end_time = time.time() + delta = int(end_time - start_time) + if delta > 5: + pytest.fail(f"Test took too long to run, runtime: {delta}") + 
+ +def test_dataframe_setitem(): + df = xpd.DataFrame({"a": [1, 2, 3], "b": [1, 2, 3]}).astype("float64") + new_df = df + 1 + df[df.columns] = new_df + tm.assert_equal(df, new_df) From 61031ccd5977d5d85bf0b8e9c32bea1c853a25ae Mon Sep 17 00:00:00 2001 From: Shruti Shivakumar Date: Mon, 11 Nov 2024 21:57:47 -0500 Subject: [PATCH 10/19] Expose streams in public quantile APIs (#17257) Adds stream parameter to ``` cudf::quantile cudf::quantiles cudf::percentile_approx ``` Added stream gtests to verify correct stream forwarding. Reference: #13744 Authors: - Shruti Shivakumar (https://github.com/shrshi) Approvers: - Paul Mattione (https://github.com/pmattione-nvidia) - David Wendt (https://github.com/davidwendt) URL: https://github.com/rapidsai/cudf/pull/17257 --- cpp/include/cudf/quantiles.hpp | 6 +++ cpp/src/quantiles/quantile.cu | 3 +- cpp/src/quantiles/quantiles.cu | 11 ++--- cpp/src/quantiles/tdigest/tdigest.cu | 3 +- cpp/tests/CMakeLists.txt | 1 + cpp/tests/streams/quantile_test.cpp | 74 ++++++++++++++++++++++++++++ 6 files changed, 88 insertions(+), 10 deletions(-) create mode 100644 cpp/tests/streams/quantile_test.cpp diff --git a/cpp/include/cudf/quantiles.hpp b/cpp/include/cudf/quantiles.hpp index f6bae170f03..f0039734519 100644 --- a/cpp/include/cudf/quantiles.hpp +++ b/cpp/include/cudf/quantiles.hpp @@ -48,6 +48,7 @@ namespace CUDF_EXPORT cudf { * ignored. * @param[in] exact If true, returns doubles. * If false, returns same type as input. 
+ * @param[in] stream CUDA stream used for device memory operations and kernel launches * @param[in] mr Device memory resource used to allocate the returned column's device memory * @returns Column of specified quantiles, with nulls for indeterminable values @@ -59,6 +60,7 @@ std::unique_ptr quantile( interpolation interp = interpolation::LINEAR, column_view const& ordered_indices = {}, bool exact = true, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); /** @@ -85,6 +87,7 @@ std::unique_ptr quantile( * @param is_input_sorted Indicates if the input has been pre-sorted * @param column_order The desired sort order for each column * @param null_precedence The desired order of null compared to other elements + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned table's device memory * * @returns Table of specified quantiles, with nulls for indeterminable values @@ -98,6 +101,7 @@ std::unique_ptr
quantiles( cudf::sorted is_input_sorted = sorted::NO, std::vector const& column_order = {}, std::vector const& null_precedence = {}, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); /** @@ -114,6 +118,7 @@ std::unique_ptr
quantiles( * * @param input tdigest input data. One tdigest per row * @param percentiles Desired percentiles in range [0, 1] + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned column's device * memory * @@ -125,6 +130,7 @@ std::unique_ptr
quantiles( std::unique_ptr percentile_approx( tdigest::tdigest_column_view const& input, column_view const& percentiles, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); /** @} */ // end of group diff --git a/cpp/src/quantiles/quantile.cu b/cpp/src/quantiles/quantile.cu index 80fd72a3088..21f6fe87a62 100644 --- a/cpp/src/quantiles/quantile.cu +++ b/cpp/src/quantiles/quantile.cu @@ -195,10 +195,11 @@ std::unique_ptr quantile(column_view const& input, interpolation interp, column_view const& ordered_indices, bool exact, + rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); - return detail::quantile(input, q, interp, ordered_indices, exact, cudf::get_default_stream(), mr); + return detail::quantile(input, q, interp, ordered_indices, exact, stream, mr); } } // namespace cudf diff --git a/cpp/src/quantiles/quantiles.cu b/cpp/src/quantiles/quantiles.cu index 69421f3bfc4..a94fb9362b9 100644 --- a/cpp/src/quantiles/quantiles.cu +++ b/cpp/src/quantiles/quantiles.cu @@ -103,17 +103,12 @@ std::unique_ptr
quantiles(table_view const& input, cudf::sorted is_input_sorted, std::vector const& column_order, std::vector const& null_precedence, + rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); - return detail::quantiles(input, - q, - interp, - is_input_sorted, - column_order, - null_precedence, - cudf::get_default_stream(), - mr); + return detail::quantiles( + input, q, interp, is_input_sorted, column_order, null_precedence, stream, mr); } } // namespace cudf diff --git a/cpp/src/quantiles/tdigest/tdigest.cu b/cpp/src/quantiles/tdigest/tdigest.cu index 43c3b0a291b..fb5aebb4b39 100644 --- a/cpp/src/quantiles/tdigest/tdigest.cu +++ b/cpp/src/quantiles/tdigest/tdigest.cu @@ -410,10 +410,11 @@ std::unique_ptr percentile_approx(tdigest_column_view const& input, std::unique_ptr percentile_approx(tdigest_column_view const& input, column_view const& percentiles, + rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); - return tdigest::percentile_approx(input, percentiles, cudf::get_default_stream(), mr); + return tdigest::percentile_approx(input, percentiles, stream, mr); } } // namespace cudf diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index f502195aea4..3a9b930830b 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -711,6 +711,7 @@ ConfigureTest(STREAM_ORCIO_TEST streams/io/orc_test.cpp STREAM_MODE testing) ConfigureTest(STREAM_PARQUETIO_TEST streams/io/parquet_test.cpp STREAM_MODE testing) ConfigureTest(STREAM_PARTITIONING_TEST streams/partitioning_test.cpp STREAM_MODE testing) ConfigureTest(STREAM_POOL_TEST streams/pool_test.cu STREAM_MODE testing) +ConfigureTest(STREAM_QUANTILE_TEST streams/quantile_test.cpp STREAM_MODE testing) ConfigureTest(STREAM_REDUCTION_TEST streams/reduction_test.cpp STREAM_MODE testing) ConfigureTest(STREAM_REPLACE_TEST streams/replace_test.cpp STREAM_MODE testing) ConfigureTest(STREAM_RESHAPE_TEST streams/reshape_test.cpp STREAM_MODE 
testing) diff --git a/cpp/tests/streams/quantile_test.cpp b/cpp/tests/streams/quantile_test.cpp new file mode 100644 index 00000000000..4f4f16a9e70 --- /dev/null +++ b/cpp/tests/streams/quantile_test.cpp @@ -0,0 +1,74 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include + +#include +#include +#include +#include +#include + +#include + +struct QuantileTest : public cudf::test::BaseFixture {}; + +TEST_F(QuantileTest, TestMultiColumnUnsorted) +{ + auto input_a = cudf::test::strings_column_wrapper( + {"C", "B", "A", "A", "D", "B", "D", "B", "D", "C", "C", "C", + "D", "B", "D", "B", "C", "C", "A", "D", "B", "A", "A", "A"}, + {true, true, true, true, true, true, true, true, true, true, true, true, + true, true, true, true, true, true, true, true, true, true, true, true}); + + cudf::test::fixed_width_column_wrapper input_b( + {4, 3, 5, 0, 1, 0, 4, 1, 5, 3, 0, 5, 2, 4, 3, 2, 1, 2, 3, 0, 5, 1, 4, 2}, + {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}); + + auto input = cudf::table_view({input_a, input_b}); + + auto actual = cudf::quantiles(input, + {0.0f, 0.5f, 0.7f, 0.25f, 1.0f}, + cudf::interpolation::NEAREST, + cudf::sorted::NO, + {cudf::order::ASCENDING, cudf::order::DESCENDING}, + {}, + cudf::test::get_default_stream()); +} + +TEST_F(QuantileTest, TestEmpty) +{ + auto input = cudf::test::fixed_width_column_wrapper({}); + cudf::quantile( + input, {0.5, 0.25}, 
cudf::interpolation::LINEAR, {}, true, cudf::test::get_default_stream()); +} + +TEST_F(QuantileTest, EmptyInput) +{ + auto empty_ = cudf::tdigest::detail::make_empty_tdigests_column( + 1, cudf::test::get_default_stream(), cudf::get_current_device_resource_ref()); + cudf::test::fixed_width_column_wrapper percentiles{0.0, 0.25, 0.3}; + + std::vector input; + input.push_back(*empty_); + input.push_back(*empty_); + input.push_back(*empty_); + auto empty = cudf::concatenate(input, cudf::test::get_default_stream()); + + cudf::tdigest::tdigest_column_view tdv(*empty); + auto result = cudf::percentile_approx(tdv, percentiles, cudf::test::get_default_stream()); +} From bdddab39826c061d3fad932aa306ba9313b1d062 Mon Sep 17 00:00:00 2001 From: "Mads R. B. Kristensen" Date: Tue, 12 Nov 2024 04:52:11 +0100 Subject: [PATCH 11/19] cmake option: `CUDF_KVIKIO_REMOTE_IO` (#17291) Compile flag to enable/disable remote IO through KvikIO: `CUDF_KVIKIO_REMOTE_IO` Authors: - Mads R. B. Kristensen (https://github.com/madsbk) Approvers: - Tianyu Liu (https://github.com/kingcrimsontianyu) - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/cudf/pull/17291 --- cpp/CMakeLists.txt | 12 ++++++++++++ cpp/cmake/thirdparty/get_kvikio.cmake | 2 +- cpp/src/io/utilities/datasource.cpp | 19 ++++++++++++++++--- 3 files changed, 29 insertions(+), 4 deletions(-) diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 559826ac232..65b05fd518b 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -90,6 +90,12 @@ option( mark_as_advanced(CUDF_BUILD_STREAMS_TEST_UTIL) option(CUDF_STATIC_LINTERS "Enable static linters during compilation" OFF) +option( + CUDF_KVIKIO_REMOTE_IO + "Enable remote IO (e.g. AWS S3) support through KvikIO. If disabled, cudf-python will still be able to do remote IO through fsspec." 
+ ON +) + message(VERBOSE "CUDF: Build with NVTX support: ${USE_NVTX}") message(VERBOSE "CUDF: Configure CMake to build tests: ${BUILD_TESTS}") message(VERBOSE "CUDF: Configure CMake to build (google & nvbench) benchmarks: ${BUILD_BENCHMARKS}") @@ -109,6 +115,9 @@ message( "CUDF: Enable the -lineinfo option for nvcc (useful for cuda-memcheck / profiler): ${CUDA_ENABLE_LINEINFO}" ) message(VERBOSE "CUDF: Statically link the CUDA runtime: ${CUDA_STATIC_RUNTIME}") +message(VERBOSE + "CUDF: Build with remote IO (e.g. AWS S3) support through KvikIO: ${CUDF_KVIKIO_REMOTE_IO}" +) # Set a default build type if none was specified rapids_cmake_build_type("Release") @@ -890,6 +899,9 @@ target_compile_definitions(cudf PRIVATE "RMM_LOGGING_LEVEL=LIBCUDF_LOGGING_LEVEL # Define spdlog level target_compile_definitions(cudf PUBLIC "SPDLOG_ACTIVE_LEVEL=SPDLOG_LEVEL_${LIBCUDF_LOGGING_LEVEL}") +# Enable remote IO through KvikIO +target_compile_definitions(cudf PRIVATE $<$:CUDF_KVIKIO_REMOTE_IO>) + # Compile stringified JIT sources first add_dependencies(cudf jitify_preprocess_run) diff --git a/cpp/cmake/thirdparty/get_kvikio.cmake b/cpp/cmake/thirdparty/get_kvikio.cmake index c949f48505e..73f875b46c2 100644 --- a/cpp/cmake/thirdparty/get_kvikio.cmake +++ b/cpp/cmake/thirdparty/get_kvikio.cmake @@ -22,7 +22,7 @@ function(find_and_configure_kvikio VERSION) GIT_REPOSITORY https://github.com/rapidsai/kvikio.git GIT_TAG branch-${VERSION} GIT_SHALLOW TRUE SOURCE_SUBDIR cpp - OPTIONS "KvikIO_BUILD_EXAMPLES OFF" + OPTIONS "KvikIO_BUILD_EXAMPLES OFF" "KvikIO_REMOTE_SUPPORT ${CUDF_KVIKIO_REMOTE_IO}" ) include("${rapids-cmake-dir}/export/find_package_root.cmake") diff --git a/cpp/src/io/utilities/datasource.cpp b/cpp/src/io/utilities/datasource.cpp index 9ea39e692b6..5ccc91e4220 100644 --- a/cpp/src/io/utilities/datasource.cpp +++ b/cpp/src/io/utilities/datasource.cpp @@ -26,7 +26,6 @@ #include #include -#include #include @@ -37,6 +36,10 @@ #include #include +#ifdef CUDF_KVIKIO_REMOTE_IO 
+#include +#endif + namespace cudf { namespace io { namespace { @@ -391,6 +394,7 @@ class user_datasource_wrapper : public datasource { datasource* const source; ///< A non-owning pointer to the user-implemented datasource }; +#ifdef CUDF_KVIKIO_REMOTE_IO /** * @brief Remote file source backed by KvikIO, which handles S3 filepaths seamlessly. */ @@ -463,14 +467,23 @@ class remote_file_source : public datasource { static bool is_supported_remote_url(std::string const& url) { // Regular expression to match "s3://" - std::regex pattern{R"(^s3://)", std::regex_constants::icase}; + static std::regex pattern{R"(^s3://)", std::regex_constants::icase}; return std::regex_search(url, pattern); } private: kvikio::RemoteHandle _kvikio_file; }; - +#else +/** + * @brief When KvikIO remote IO is disabled, `is_supported_remote_url()` always returns false. + */ +class remote_file_source : public file_source { + public: + explicit remote_file_source(char const* filepath) : file_source(filepath) {} + static constexpr bool is_supported_remote_url(std::string const&) { return false; } +}; +#endif } // namespace std::unique_ptr datasource::create(std::string const& filepath, From 202c2318282e859c8a156a48cfbc133dd2941117 Mon Sep 17 00:00:00 2001 From: Peixin Date: Tue, 12 Nov 2024 12:36:44 +0800 Subject: [PATCH 12/19] Replace workaround of JNI build with CUDF_KVIKIO_REMOTE_IO=OFF (#17293) JNI build does not require kvikIO, to unblock the build use `CUDF_KVIKIO_REMOTE_IO=OFF` in cpp build phase. this should be merged after https://github.com/rapidsai/cudf/pull/17291 Authors: - Peixin (https://github.com/pxLi) Approvers: - Nghia Truong (https://github.com/ttnghia) URL: https://github.com/rapidsai/cudf/pull/17293 --- java/ci/build-in-docker.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/java/ci/build-in-docker.sh b/java/ci/build-in-docker.sh index 4b5379cf0f1..b85c215d7d1 100755 --- a/java/ci/build-in-docker.sh +++ b/java/ci/build-in-docker.sh @@ -65,7 +65,7 @@ cmake ..
-G"${CMAKE_GENERATOR}" \ -DCUDF_USE_PER_THREAD_DEFAULT_STREAM=$ENABLE_PTDS \ -DRMM_LOGGING_LEVEL=$RMM_LOGGING_LEVEL \ -DBUILD_SHARED_LIBS=OFF \ - -DKvikIO_REMOTE_SUPPORT=OFF + -DCUDF_KVIKIO_REMOTE_IO=OFF if [[ -z "${PARALLEL_LEVEL}" ]]; then cmake --build . From 043bcbdf28aa9f7213c3f1f2b4170f4940c9d39e Mon Sep 17 00:00:00 2001 From: Matthew Murray <41342305+Matt711@users.noreply.github.com> Date: Tue, 12 Nov 2024 07:12:05 -0500 Subject: [PATCH 13/19] [FEA] Report all unsupported operations for a query in cudf.polars (#16960) Closes #16690. The purpose of this PR is to list all of the unique operations that are unsupported by `cudf.polars` when running a query. 1. Question: How to traverse the tree to report the error nodes? Should this be done upstream in Polars? 2. Instead of traversing the query afterwards, we should probably catch each unsupported feature as we translate the IR. Authors: - Matthew Murray (https://github.com/Matt711) Approvers: - Vyas Ramasubramani (https://github.com/vyasr) - Lawrence Mitchell (https://github.com/wence-) URL: https://github.com/rapidsai/cudf/pull/16960 --- python/cudf_polars/cudf_polars/__init__.py | 4 +- python/cudf_polars/cudf_polars/callback.py | 32 +- python/cudf_polars/cudf_polars/dsl/expr.py | 2 + .../cudf_polars/dsl/expressions/base.py | 11 + python/cudf_polars/cudf_polars/dsl/ir.py | 19 +- .../cudf_polars/cudf_polars/dsl/translate.py | 382 ++++++++++-------- .../cudf_polars/testing/asserts.py | 14 +- .../cudf_polars/cudf_polars/utils/dtypes.py | 11 +- python/cudf_polars/docs/overview.md | 4 +- python/cudf_polars/tests/dsl/test_to_ast.py | 4 +- .../cudf_polars/tests/dsl/test_traversal.py | 8 +- .../tests/expressions/test_sort.py | 4 +- python/cudf_polars/tests/test_mapfunction.py | 13 - 13 files changed, 297 insertions(+), 211 deletions(-) diff --git a/python/cudf_polars/cudf_polars/__init__.py b/python/cudf_polars/cudf_polars/__init__.py index 66c15f694ee..ba4858c5619 100644 --- 
a/python/cudf_polars/cudf_polars/__init__.py +++ b/python/cudf_polars/cudf_polars/__init__.py @@ -12,7 +12,7 @@ from cudf_polars._version import __git_commit__, __version__ from cudf_polars.callback import execute_with_cudf -from cudf_polars.dsl.translate import translate_ir +from cudf_polars.dsl.translate import Translator # Check we have a supported polars version from cudf_polars.utils.versions import _ensure_polars_version @@ -22,7 +22,7 @@ __all__: list[str] = [ "execute_with_cudf", - "translate_ir", + "Translator", "__git_commit__", "__version__", ] diff --git a/python/cudf_polars/cudf_polars/callback.py b/python/cudf_polars/cudf_polars/callback.py index 76816ee0a61..ff4933c7564 100644 --- a/python/cudf_polars/cudf_polars/callback.py +++ b/python/cudf_polars/cudf_polars/callback.py @@ -18,7 +18,7 @@ import rmm from rmm._cuda import gpu -from cudf_polars.dsl.translate import translate_ir +from cudf_polars.dsl.translate import Translator if TYPE_CHECKING: from collections.abc import Generator @@ -180,14 +180,30 @@ def execute_with_cudf( ) try: with nvtx.annotate(message="ConvertIR", domain="cudf_polars"): - nt.set_udf( - partial( - _callback, - translate_ir(nt), - device=device, - memory_resource=memory_resource, + translator = Translator(nt) + ir = translator.translate_ir() + ir_translation_errors = translator.errors + if len(ir_translation_errors): + # TODO: Display these errors in user-friendly way. 
+ # tracked in https://github.com/rapidsai/cudf/issues/17051 + unique_errors = sorted(set(ir_translation_errors), key=str) + error_message = "Query contained unsupported operations" + verbose_error_message = ( + f"{error_message}\nThe errors were:\n{unique_errors}" + ) + unsupported_ops_exception = NotImplementedError( + error_message, unique_errors + ) + if bool(int(os.environ.get("POLARS_VERBOSE", 0))): + warnings.warn(verbose_error_message, UserWarning, stacklevel=2) + if raise_on_fail: + raise unsupported_ops_exception + else: + nt.set_udf( + partial( + _callback, ir, device=device, memory_resource=memory_resource + ) ) - ) except exception as e: if bool(int(os.environ.get("POLARS_VERBOSE", 0))): warnings.warn( diff --git a/python/cudf_polars/cudf_polars/dsl/expr.py b/python/cudf_polars/cudf_polars/dsl/expr.py index 1881286ccbb..326d6b65cbe 100644 --- a/python/cudf_polars/cudf_polars/dsl/expr.py +++ b/python/cudf_polars/cudf_polars/dsl/expr.py @@ -20,6 +20,7 @@ AggInfo, Col, ColRef, + ErrorExpr, Expr, NamedExpr, ) @@ -36,6 +37,7 @@ __all__ = [ "Expr", + "ErrorExpr", "NamedExpr", "Literal", "LiteralColumn", diff --git a/python/cudf_polars/cudf_polars/dsl/expressions/base.py b/python/cudf_polars/cudf_polars/dsl/expressions/base.py index 21ba7aea707..23851f91938 100644 --- a/python/cudf_polars/cudf_polars/dsl/expressions/base.py +++ b/python/cudf_polars/cudf_polars/dsl/expressions/base.py @@ -155,6 +155,17 @@ def collect_agg(self, *, depth: int) -> AggInfo: ) # pragma: no cover; check_agg trips first +class ErrorExpr(Expr): + __slots__ = ("error",) + _non_child = ("dtype", "error") + error: str + + def __init__(self, dtype: plc.DataType, error: str) -> None: + self.dtype = dtype + self.error = error + self.children = () + + class NamedExpr: # NamedExpr does not inherit from Expr since it does not appear # when evaluating expressions themselves, only when constructing diff --git a/python/cudf_polars/cudf_polars/dsl/ir.py b/python/cudf_polars/cudf_polars/dsl/ir.py 
index bc42b4a254f..beea5908e56 100644 --- a/python/cudf_polars/cudf_polars/dsl/ir.py +++ b/python/cudf_polars/cudf_polars/dsl/ir.py @@ -42,6 +42,7 @@ __all__ = [ "IR", + "ErrorNode", "PythonScan", "Scan", "Cache", @@ -212,6 +213,22 @@ def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: ) +class ErrorNode(IR): + """Represents an error translating the IR.""" + + __slots__ = ("error",) + _non_child = ( + "schema", + "error", + ) + error: str + """The error.""" + + def __init__(self, schema: Schema, error: str): + self.schema = schema + self.error = error + + class PythonScan(IR): """Representation of input from a python function.""" @@ -1532,7 +1549,7 @@ def __init__(self, schema: Schema, name: str, options: Any, df: IR): raise NotImplementedError( "Unpivot cannot cast all input columns to " f"{self.schema[value_name].id()}" - ) + ) # pragma: no cover self.options = ( tuple(indices), tuple(pivotees), diff --git a/python/cudf_polars/cudf_polars/dsl/translate.py b/python/cudf_polars/cudf_polars/dsl/translate.py index 2711676d31e..e8ed009cdf2 100644 --- a/python/cudf_polars/cudf_polars/dsl/translate.py +++ b/python/cudf_polars/cudf_polars/dsl/translate.py @@ -9,7 +9,7 @@ import json from contextlib import AbstractContextManager, nullcontext from functools import singledispatch -from typing import Any +from typing import TYPE_CHECKING, Any import pyarrow as pa from typing_extensions import assert_never @@ -25,7 +25,123 @@ from cudf_polars.typing import NodeTraverser from cudf_polars.utils import dtypes, sorting -__all__ = ["translate_ir", "translate_named_expr"] +if TYPE_CHECKING: + from cudf_polars.typing import NodeTraverser + +__all__ = ["Translator", "translate_named_expr"] + + +class Translator: + """ + Translates polars-internal IR nodes and expressions to our representation. 
+ + Parameters + ---------- + visitor + Polars NodeTraverser object + """ + + def __init__(self, visitor: NodeTraverser): + self.visitor = visitor + self.errors: list[Exception] = [] + + def translate_ir(self, *, n: int | None = None) -> ir.IR: + """ + Translate a polars-internal IR node to our representation. + + Parameters + ---------- + visitor + Polars NodeTraverser object + n + Optional node to start traversing from, if not provided uses + current polars-internal node. + + Returns + ------- + Translated IR object + + Raises + ------ + NotImplementedError + If the version of Polars IR is unsupported. + + Notes + ----- + Any IR nodes that cannot be translated are replaced by + :class:`ir.ErrorNode` nodes and collected in the `errors` attribute. + After translation is complete, this list of errors should be inspected + to determine if the query is supported. + """ + ctx: AbstractContextManager[None] = ( + set_node(self.visitor, n) if n is not None else noop_context + ) + # IR is versioned with major.minor, minor is bumped for backwards + # compatible changes (e.g. adding new nodes), major is bumped for + # incompatible changes (e.g. renaming nodes). + if (version := self.visitor.version()) >= (4, 0): + e = NotImplementedError( + f"No support for polars IR {version=}" + ) # pragma: no cover; no such version for now.
+ self.errors.append(e) # pragma: no cover + raise e # pragma: no cover + + with ctx: + polars_schema = self.visitor.get_schema() + try: + schema = {k: dtypes.from_polars(v) for k, v in polars_schema.items()} + except Exception as e: + self.errors.append(NotImplementedError(str(e))) + return ir.ErrorNode({}, str(e)) + try: + node = self.visitor.view_current_node() + except Exception as e: + self.errors.append(e) + return ir.ErrorNode(schema, str(e)) + try: + result = _translate_ir(node, self, schema) + except Exception as e: + self.errors.append(e) + return ir.ErrorNode(schema, str(e)) + if any( + isinstance(dtype, pl.Null) + for dtype in pl.datatypes.unpack_dtypes(*polars_schema.values()) + ): + error = NotImplementedError( + f"No GPU support for {result} with Null column dtype." + ) + self.errors.append(error) + return ir.ErrorNode(schema, str(error)) + + return result + + def translate_expr(self, *, n: int) -> expr.Expr: + """ + Translate a polars-internal expression IR into our representation. + + Parameters + ---------- + n + Node to translate, an integer referencing a polars internal node. + + Returns + ------- + Translated IR object. + + Notes + ----- + Any expression nodes that cannot be translated are replaced by + :class:`expr.ErrorExpr` nodes and collected in the `errors` attribute. + After translation is complete, this list of errors should be inspected + to determine if the query is supported.
+ """ + node = self.visitor.view_expression(n) + dtype = dtypes.from_polars(self.visitor.get_dtype(n)) + try: + return _translate_expr(node, self, dtype) + except Exception as e: + self.errors.append(e) + return expr.ErrorExpr(dtype, str(e)) class set_node(AbstractContextManager[None]): @@ -67,7 +183,7 @@ def __exit__(self, *args: Any) -> None: @singledispatch def _translate_ir( - node: Any, visitor: NodeTraverser, schema: dict[str, plc.DataType] + node: Any, translator: Translator, schema: dict[str, plc.DataType] ) -> ir.IR: raise NotImplementedError( f"Translation for {type(node).__name__}" @@ -76,19 +192,19 @@ def _translate_ir( @_translate_ir.register def _( - node: pl_ir.PythonScan, visitor: NodeTraverser, schema: dict[str, plc.DataType] + node: pl_ir.PythonScan, translator: Translator, schema: dict[str, plc.DataType] ) -> ir.IR: scan_fn, with_columns, source_type, predicate, nrows = node.options options = (scan_fn, with_columns, source_type, nrows) predicate = ( - translate_named_expr(visitor, n=predicate) if predicate is not None else None + translate_named_expr(translator, n=predicate) if predicate is not None else None ) return ir.PythonScan(schema, options, predicate) @_translate_ir.register def _( - node: pl_ir.Scan, visitor: NodeTraverser, schema: dict[str, plc.DataType] + node: pl_ir.Scan, translator: Translator, schema: dict[str, plc.DataType] ) -> ir.IR: typ, *options = node.scan_type if typ == "ndjson": @@ -117,7 +233,7 @@ def _( skip_rows, n_rows, row_index, - translate_named_expr(visitor, n=node.predicate) + translate_named_expr(translator, n=node.predicate) if node.predicate is not None else None, ) @@ -125,20 +241,20 @@ def _( @_translate_ir.register def _( - node: pl_ir.Cache, visitor: NodeTraverser, schema: dict[str, plc.DataType] + node: pl_ir.Cache, translator: Translator, schema: dict[str, plc.DataType] ) -> ir.IR: - return ir.Cache(schema, node.id_, translate_ir(visitor, n=node.input)) + return ir.Cache(schema, node.id_, 
translator.translate_ir(n=node.input)) @_translate_ir.register def _( - node: pl_ir.DataFrameScan, visitor: NodeTraverser, schema: dict[str, plc.DataType] + node: pl_ir.DataFrameScan, translator: Translator, schema: dict[str, plc.DataType] ) -> ir.IR: return ir.DataFrameScan( schema, node.df, node.projection, - translate_named_expr(visitor, n=node.selection) + translate_named_expr(translator, n=node.selection) if node.selection is not None else None, ) @@ -146,22 +262,22 @@ def _( @_translate_ir.register def _( - node: pl_ir.Select, visitor: NodeTraverser, schema: dict[str, plc.DataType] + node: pl_ir.Select, translator: Translator, schema: dict[str, plc.DataType] ) -> ir.IR: - with set_node(visitor, node.input): - inp = translate_ir(visitor, n=None) - exprs = [translate_named_expr(visitor, n=e) for e in node.expr] + with set_node(translator.visitor, node.input): + inp = translator.translate_ir(n=None) + exprs = [translate_named_expr(translator, n=e) for e in node.expr] return ir.Select(schema, exprs, node.should_broadcast, inp) @_translate_ir.register def _( - node: pl_ir.GroupBy, visitor: NodeTraverser, schema: dict[str, plc.DataType] + node: pl_ir.GroupBy, translator: Translator, schema: dict[str, plc.DataType] ) -> ir.IR: - with set_node(visitor, node.input): - inp = translate_ir(visitor, n=None) - aggs = [translate_named_expr(visitor, n=e) for e in node.aggs] - keys = [translate_named_expr(visitor, n=e) for e in node.keys] + with set_node(translator.visitor, node.input): + inp = translator.translate_ir(n=None) + aggs = [translate_named_expr(translator, n=e) for e in node.aggs] + keys = [translate_named_expr(translator, n=e) for e in node.keys] return ir.GroupBy( schema, keys, @@ -174,17 +290,17 @@ def _( @_translate_ir.register def _( - node: pl_ir.Join, visitor: NodeTraverser, schema: dict[str, plc.DataType] + node: pl_ir.Join, translator: Translator, schema: dict[str, plc.DataType] ) -> ir.IR: # Join key dtypes are dependent on the schema of the left and # 
right inputs, so these must be translated with the relevant # input active. - with set_node(visitor, node.input_left): - inp_left = translate_ir(visitor, n=None) - left_on = [translate_named_expr(visitor, n=e) for e in node.left_on] - with set_node(visitor, node.input_right): - inp_right = translate_ir(visitor, n=None) - right_on = [translate_named_expr(visitor, n=e) for e in node.right_on] + with set_node(translator.visitor, node.input_left): + inp_left = translator.translate_ir(n=None) + left_on = [translate_named_expr(translator, n=e) for e in node.left_on] + with set_node(translator.visitor, node.input_right): + inp_right = translator.translate_ir(n=None) + right_on = [translate_named_expr(translator, n=e) for e in node.right_on] if (how := node.options[0]) in { "inner", "left", @@ -239,27 +355,27 @@ def _( @_translate_ir.register def _( - node: pl_ir.HStack, visitor: NodeTraverser, schema: dict[str, plc.DataType] + node: pl_ir.HStack, translator: Translator, schema: dict[str, plc.DataType] ) -> ir.IR: - with set_node(visitor, node.input): - inp = translate_ir(visitor, n=None) - exprs = [translate_named_expr(visitor, n=e) for e in node.exprs] + with set_node(translator.visitor, node.input): + inp = translator.translate_ir(n=None) + exprs = [translate_named_expr(translator, n=e) for e in node.exprs] return ir.HStack(schema, exprs, node.should_broadcast, inp) @_translate_ir.register def _( - node: pl_ir.Reduce, visitor: NodeTraverser, schema: dict[str, plc.DataType] + node: pl_ir.Reduce, translator: Translator, schema: dict[str, plc.DataType] ) -> ir.IR: # pragma: no cover; polars doesn't emit this node yet - with set_node(visitor, node.input): - inp = translate_ir(visitor, n=None) - exprs = [translate_named_expr(visitor, n=e) for e in node.expr] + with set_node(translator.visitor, node.input): + inp = translator.translate_ir(n=None) + exprs = [translate_named_expr(translator, n=e) for e in node.expr] return ir.Reduce(schema, exprs, inp) @_translate_ir.register 
def _( - node: pl_ir.Distinct, visitor: NodeTraverser, schema: dict[str, plc.DataType] + node: pl_ir.Distinct, translator: Translator, schema: dict[str, plc.DataType] ) -> ir.IR: (keep, subset, maintain_order, zlice) = node.options keep = ir.Distinct._KEEP_MAP[keep] @@ -270,17 +386,17 @@ def _( subset, zlice, maintain_order, - translate_ir(visitor, n=node.input), + translator.translate_ir(n=node.input), ) @_translate_ir.register def _( - node: pl_ir.Sort, visitor: NodeTraverser, schema: dict[str, plc.DataType] + node: pl_ir.Sort, translator: Translator, schema: dict[str, plc.DataType] ) -> ir.IR: - with set_node(visitor, node.input): - inp = translate_ir(visitor, n=None) - by = [translate_named_expr(visitor, n=e) for e in node.by_column] + with set_node(translator.visitor, node.input): + inp = translator.translate_ir(n=None) + by = [translate_named_expr(translator, n=e) for e in node.by_column] stable, nulls_last, descending = node.sort_options order, null_order = sorting.sort_order( descending, nulls_last=nulls_last, num_keys=len(by) @@ -290,33 +406,35 @@ def _( @_translate_ir.register def _( - node: pl_ir.Slice, visitor: NodeTraverser, schema: dict[str, plc.DataType] + node: pl_ir.Slice, translator: Translator, schema: dict[str, plc.DataType] ) -> ir.IR: - return ir.Slice(schema, node.offset, node.len, translate_ir(visitor, n=node.input)) + return ir.Slice( + schema, node.offset, node.len, translator.translate_ir(n=node.input) + ) @_translate_ir.register def _( - node: pl_ir.Filter, visitor: NodeTraverser, schema: dict[str, plc.DataType] + node: pl_ir.Filter, translator: Translator, schema: dict[str, plc.DataType] ) -> ir.IR: - with set_node(visitor, node.input): - inp = translate_ir(visitor, n=None) - mask = translate_named_expr(visitor, n=node.predicate) + with set_node(translator.visitor, node.input): + inp = translator.translate_ir(n=None) + mask = translate_named_expr(translator, n=node.predicate) return ir.Filter(schema, mask, inp) @_translate_ir.register 
def _( node: pl_ir.SimpleProjection, - visitor: NodeTraverser, + translator: Translator, schema: dict[str, plc.DataType], ) -> ir.IR: - return ir.Projection(schema, translate_ir(visitor, n=node.input)) + return ir.Projection(schema, translator.translate_ir(n=node.input)) @_translate_ir.register def _( - node: pl_ir.MapFunction, visitor: NodeTraverser, schema: dict[str, plc.DataType] + node: pl_ir.MapFunction, translator: Translator, schema: dict[str, plc.DataType] ) -> ir.IR: name, *options = node.function return ir.MapFunction( @@ -324,83 +442,36 @@ def _( name, options, # TODO: merge_sorted breaks this pattern - translate_ir(visitor, n=node.input), + translator.translate_ir(n=node.input), ) @_translate_ir.register def _( - node: pl_ir.Union, visitor: NodeTraverser, schema: dict[str, plc.DataType] + node: pl_ir.Union, translator: Translator, schema: dict[str, plc.DataType] ) -> ir.IR: return ir.Union( - schema, node.options, *(translate_ir(visitor, n=n) for n in node.inputs) + schema, node.options, *(translator.translate_ir(n=n) for n in node.inputs) ) @_translate_ir.register def _( - node: pl_ir.HConcat, visitor: NodeTraverser, schema: dict[str, plc.DataType] + node: pl_ir.HConcat, translator: Translator, schema: dict[str, plc.DataType] ) -> ir.IR: - return ir.HConcat(schema, *(translate_ir(visitor, n=n) for n in node.inputs)) - - -def translate_ir(visitor: NodeTraverser, *, n: int | None = None) -> ir.IR: - """ - Translate a polars-internal IR node to our representation. - - Parameters - ---------- - visitor - Polars NodeTraverser object - n - Optional node to start traversing from, if not provided uses - current polars-internal node. - - Returns - ------- - Translated IR object - - Raises - ------ - NotImplementedError - If we can't translate the nodes due to unsupported functionality. 
- """ - ctx: AbstractContextManager[None] = ( - set_node(visitor, n) if n is not None else noop_context - ) - # IR is versioned with major.minor, minor is bumped for backwards - # compatible changes (e.g. adding new nodes), major is bumped for - # incompatible changes (e.g. renaming nodes). - if (version := visitor.version()) >= (4, 0): - raise NotImplementedError( - f"No support for polars IR {version=}" - ) # pragma: no cover; no such version for now. - - with ctx: - polars_schema = visitor.get_schema() - node = visitor.view_current_node() - schema = {k: dtypes.from_polars(v) for k, v in polars_schema.items()} - result = _translate_ir(node, visitor, schema) - if any( - isinstance(dtype, pl.Null) - for dtype in pl.datatypes.unpack_dtypes(*polars_schema.values()) - ): - raise NotImplementedError( - f"No GPU support for {result} with Null column dtype." - ) - return result + return ir.HConcat(schema, *(translator.translate_ir(n=n) for n in node.inputs)) def translate_named_expr( - visitor: NodeTraverser, *, n: pl_expr.PyExprIR + translator: Translator, *, n: pl_expr.PyExprIR ) -> expr.NamedExpr: """ Translate a polars-internal named expression IR object into our representation. Parameters ---------- - visitor - Polars NodeTraverser object + translator + Translator object n Node to translate, a named expression node. @@ -420,12 +491,12 @@ def translate_named_expr( NotImplementedError If any translation fails due to unsupported functionality. 
""" - return expr.NamedExpr(n.output_name, translate_expr(visitor, n=n.node)) + return expr.NamedExpr(n.output_name, translator.translate_expr(n=n.node)) @singledispatch def _translate_expr( - node: Any, visitor: NodeTraverser, dtype: plc.DataType + node: Any, translator: Translator, dtype: plc.DataType ) -> expr.Expr: raise NotImplementedError( f"Translation for {type(node).__name__}" @@ -433,7 +504,7 @@ def _translate_expr( @_translate_expr.register -def _(node: pl_expr.Function, visitor: NodeTraverser, dtype: plc.DataType) -> expr.Expr: +def _(node: pl_expr.Function, translator: Translator, dtype: plc.DataType) -> expr.Expr: name, *options = node.function_data options = tuple(options) if isinstance(name, pl_expr.StringFunction): @@ -442,7 +513,7 @@ def _(node: pl_expr.Function, visitor: NodeTraverser, dtype: plc.DataType) -> ex pl_expr.StringFunction.StripCharsStart, pl_expr.StringFunction.StripCharsEnd, }: - column, chars = (translate_expr(visitor, n=n) for n in node.input) + column, chars = (translator.translate_expr(n=n) for n in node.input) if isinstance(chars, expr.Literal): if chars.value == pa.scalar(""): # No-op in polars, but libcudf uses empty string @@ -459,11 +530,11 @@ def _(node: pl_expr.Function, visitor: NodeTraverser, dtype: plc.DataType) -> ex dtype, name, options, - *(translate_expr(visitor, n=n) for n in node.input), + *(translator.translate_expr(n=n) for n in node.input), ) elif isinstance(name, pl_expr.BooleanFunction): if name == pl_expr.BooleanFunction.IsBetween: - column, lo, hi = (translate_expr(visitor, n=n) for n in node.input) + column, lo, hi = (translator.translate_expr(n=n) for n in node.input) (closed,) = options lop, rop = expr.BooleanFunction._BETWEEN_OPS[closed] return expr.BinOp( @@ -476,7 +547,7 @@ def _(node: pl_expr.Function, visitor: NodeTraverser, dtype: plc.DataType) -> ex dtype, name, options, - *(translate_expr(visitor, n=n) for n in node.input), + *(translator.translate_expr(n=n) for n in node.input), ) elif 
isinstance(name, pl_expr.TemporalFunction): # functions for which evaluation of the expression may not return @@ -496,14 +567,14 @@ def _(node: pl_expr.Function, visitor: NodeTraverser, dtype: plc.DataType) -> ex dtype, name, options, - *(translate_expr(visitor, n=n) for n in node.input), + *(translator.translate_expr(n=n) for n in node.input), ) if name in needs_cast: return expr.Cast(dtype, result_expr) return result_expr elif isinstance(name, str): - children = (translate_expr(visitor, n=n) for n in node.input) + children = (translator.translate_expr(n=n) for n in node.input) if name == "log": (base,) = options (child,) = children @@ -522,26 +593,26 @@ def _(node: pl_expr.Function, visitor: NodeTraverser, dtype: plc.DataType) -> ex @_translate_expr.register -def _(node: pl_expr.Window, visitor: NodeTraverser, dtype: plc.DataType) -> expr.Expr: +def _(node: pl_expr.Window, translator: Translator, dtype: plc.DataType) -> expr.Expr: # TODO: raise in groupby? if isinstance(node.options, pl_expr.RollingGroupOptions): # pl.col("a").rolling(...) return expr.RollingWindow( - dtype, node.options, translate_expr(visitor, n=node.function) + dtype, node.options, translator.translate_expr(n=node.function) ) elif isinstance(node.options, pl_expr.WindowMapping): # pl.col("a").over(...) 
return expr.GroupedRollingWindow( dtype, node.options, - translate_expr(visitor, n=node.function), - *(translate_expr(visitor, n=n) for n in node.partition_by), + translator.translate_expr(n=node.function), + *(translator.translate_expr(n=n) for n in node.partition_by), ) assert_never(node.options) @_translate_expr.register -def _(node: pl_expr.Literal, visitor: NodeTraverser, dtype: plc.DataType) -> expr.Expr: +def _(node: pl_expr.Literal, translator: Translator, dtype: plc.DataType) -> expr.Expr: if isinstance(node.value, plrs.PySeries): return expr.LiteralColumn(dtype, pl.Series._from_pyseries(node.value)) value = pa.scalar(node.value, type=plc.interop.to_arrow(dtype)) @@ -549,42 +620,42 @@ def _(node: pl_expr.Literal, visitor: NodeTraverser, dtype: plc.DataType) -> exp @_translate_expr.register -def _(node: pl_expr.Sort, visitor: NodeTraverser, dtype: plc.DataType) -> expr.Expr: +def _(node: pl_expr.Sort, translator: Translator, dtype: plc.DataType) -> expr.Expr: # TODO: raise in groupby - return expr.Sort(dtype, node.options, translate_expr(visitor, n=node.expr)) + return expr.Sort(dtype, node.options, translator.translate_expr(n=node.expr)) @_translate_expr.register -def _(node: pl_expr.SortBy, visitor: NodeTraverser, dtype: plc.DataType) -> expr.Expr: +def _(node: pl_expr.SortBy, translator: Translator, dtype: plc.DataType) -> expr.Expr: return expr.SortBy( dtype, node.sort_options, - translate_expr(visitor, n=node.expr), - *(translate_expr(visitor, n=n) for n in node.by), + translator.translate_expr(n=node.expr), + *(translator.translate_expr(n=n) for n in node.by), ) @_translate_expr.register -def _(node: pl_expr.Gather, visitor: NodeTraverser, dtype: plc.DataType) -> expr.Expr: +def _(node: pl_expr.Gather, translator: Translator, dtype: plc.DataType) -> expr.Expr: return expr.Gather( dtype, - translate_expr(visitor, n=node.expr), - translate_expr(visitor, n=node.idx), + translator.translate_expr(n=node.expr), + translator.translate_expr(n=node.idx), ) 
@_translate_expr.register -def _(node: pl_expr.Filter, visitor: NodeTraverser, dtype: plc.DataType) -> expr.Expr: +def _(node: pl_expr.Filter, translator: Translator, dtype: plc.DataType) -> expr.Expr: return expr.Filter( dtype, - translate_expr(visitor, n=node.input), - translate_expr(visitor, n=node.by), + translator.translate_expr(n=node.input), + translator.translate_expr(n=node.by), ) @_translate_expr.register -def _(node: pl_expr.Cast, visitor: NodeTraverser, dtype: plc.DataType) -> expr.Expr: - inner = translate_expr(visitor, n=node.expr) +def _(node: pl_expr.Cast, translator: Translator, dtype: plc.DataType) -> expr.Expr: + inner = translator.translate_expr(n=node.expr) # Push casts into literals so we can handle Cast(Literal(Null)) if isinstance(inner, expr.Literal): return expr.Literal(dtype, inner.value.cast(plc.interop.to_arrow(dtype))) @@ -596,17 +667,17 @@ def _(node: pl_expr.Cast, visitor: NodeTraverser, dtype: plc.DataType) -> expr.E @_translate_expr.register -def _(node: pl_expr.Column, visitor: NodeTraverser, dtype: plc.DataType) -> expr.Expr: +def _(node: pl_expr.Column, translator: Translator, dtype: plc.DataType) -> expr.Expr: return expr.Col(dtype, node.name) @_translate_expr.register -def _(node: pl_expr.Agg, visitor: NodeTraverser, dtype: plc.DataType) -> expr.Expr: +def _(node: pl_expr.Agg, translator: Translator, dtype: plc.DataType) -> expr.Expr: value = expr.Agg( dtype, node.name, node.options, - *(translate_expr(visitor, n=n) for n in node.arguments), + *(translator.translate_expr(n=n) for n in node.arguments), ) if value.name == "count" and value.dtype.id() != plc.TypeId.INT32: return expr.Cast(value.dtype, value) @@ -614,55 +685,30 @@ def _(node: pl_expr.Agg, visitor: NodeTraverser, dtype: plc.DataType) -> expr.Ex @_translate_expr.register -def _(node: pl_expr.Ternary, visitor: NodeTraverser, dtype: plc.DataType) -> expr.Expr: +def _(node: pl_expr.Ternary, translator: Translator, dtype: plc.DataType) -> expr.Expr: return expr.Ternary( 
dtype, - translate_expr(visitor, n=node.predicate), - translate_expr(visitor, n=node.truthy), - translate_expr(visitor, n=node.falsy), + translator.translate_expr(n=node.predicate), + translator.translate_expr(n=node.truthy), + translator.translate_expr(n=node.falsy), ) @_translate_expr.register def _( - node: pl_expr.BinaryExpr, visitor: NodeTraverser, dtype: plc.DataType + node: pl_expr.BinaryExpr, translator: Translator, dtype: plc.DataType ) -> expr.Expr: return expr.BinOp( dtype, expr.BinOp._MAPPING[node.op], - translate_expr(visitor, n=node.left), - translate_expr(visitor, n=node.right), + translator.translate_expr(n=node.left), + translator.translate_expr(n=node.right), ) @_translate_expr.register -def _(node: pl_expr.Len, visitor: NodeTraverser, dtype: plc.DataType) -> expr.Expr: +def _(node: pl_expr.Len, translator: Translator, dtype: plc.DataType) -> expr.Expr: value = expr.Len(dtype) if dtype.id() != plc.TypeId.INT32: return expr.Cast(dtype, value) return value # pragma: no cover; never reached since polars len has uint32 dtype - - -def translate_expr(visitor: NodeTraverser, *, n: int) -> expr.Expr: - """ - Translate a polars-internal expression IR into our representation. - - Parameters - ---------- - visitor - Polars NodeTraverser object - n - Node to translate, an integer referencing a polars internal node. - - Returns - ------- - Translated IR object. - - Raises - ------ - NotImplementedError - If any translation fails due to unsupported functionality. 
- """ - node = visitor.view_expression(n) - dtype = dtypes.from_polars(visitor.get_dtype(n)) - return _translate_expr(node, visitor, dtype) diff --git a/python/cudf_polars/cudf_polars/testing/asserts.py b/python/cudf_polars/cudf_polars/testing/asserts.py index 7b45c1eaa06..2207545aa60 100644 --- a/python/cudf_polars/cudf_polars/testing/asserts.py +++ b/python/cudf_polars/cudf_polars/testing/asserts.py @@ -10,7 +10,7 @@ from polars import GPUEngine from polars.testing.asserts import assert_frame_equal -from cudf_polars.dsl.translate import translate_ir +from cudf_polars.dsl.translate import Translator if TYPE_CHECKING: import polars as pl @@ -117,12 +117,14 @@ def assert_ir_translation_raises(q: pl.LazyFrame, *exceptions: type[Exception]) AssertionError If the specified exceptions were not raised. """ - try: - _ = translate_ir(q._ldf.visit()) - except exceptions: + translator = Translator(q._ldf.visit()) + translator.translate_ir() + if errors := translator.errors: + for err in errors: + assert any( + isinstance(err, err_type) for err_type in exceptions + ), f"Translation DID NOT RAISE {exceptions}" return - except Exception as e: - raise AssertionError(f"Translation DID NOT RAISE {exceptions}") from e else: raise AssertionError(f"Translation DID NOT RAISE {exceptions}") diff --git a/python/cudf_polars/cudf_polars/utils/dtypes.py b/python/cudf_polars/cudf_polars/utils/dtypes.py index a90c283ee54..e7ac72df609 100644 --- a/python/cudf_polars/cudf_polars/utils/dtypes.py +++ b/python/cudf_polars/cudf_polars/utils/dtypes.py @@ -71,11 +71,16 @@ def can_cast(from_: plc.DataType, to: plc.DataType) -> bool: ------- True if casting is supported, False otherwise """ + has_empty = from_.id() == plc.TypeId.EMPTY or to.id() == plc.TypeId.EMPTY return ( ( - plc.traits.is_fixed_width(to) - and plc.traits.is_fixed_width(from_) - and plc.unary.is_supported_cast(from_, to) + from_ == to + or not has_empty + and ( + plc.traits.is_fixed_width(to) + and plc.traits.is_fixed_width(from_) + 
and plc.unary.is_supported_cast(from_, to) + ) ) or (from_.id() == plc.TypeId.STRING and is_numeric_not_bool(to)) or (to.id() == plc.TypeId.STRING and is_numeric_not_bool(from_)) diff --git a/python/cudf_polars/docs/overview.md b/python/cudf_polars/docs/overview.md index 17a94c633f8..2f2361223d2 100644 --- a/python/cudf_polars/docs/overview.md +++ b/python/cudf_polars/docs/overview.md @@ -458,12 +458,12 @@ translate it to our intermediate representation (IR), and then execute and convert back to polars: ```python -from cudf_polars.dsl.translate import translate_ir +from cudf_polars.dsl.translate import Translator q = ... # Convert to our IR -ir = translate_ir(q._ldf.visit()) +ir = Translator(q._ldf.visit()).translate_ir() # DataFrame living on the device result = ir.evaluate(cache={}) diff --git a/python/cudf_polars/tests/dsl/test_to_ast.py b/python/cudf_polars/tests/dsl/test_to_ast.py index 8f10f119199..f6c24da0180 100644 --- a/python/cudf_polars/tests/dsl/test_to_ast.py +++ b/python/cudf_polars/tests/dsl/test_to_ast.py @@ -13,7 +13,7 @@ import cudf_polars.dsl.expr as expr_nodes import cudf_polars.dsl.ir as ir_nodes -from cudf_polars import translate_ir +from cudf_polars import Translator from cudf_polars.containers.dataframe import DataFrame, NamedColumn from cudf_polars.dsl.to_ast import insert_colrefs, to_ast, to_parquet_filter @@ -60,7 +60,7 @@ def df(): ) def test_compute_column(expr, df): q = df.select(expr) - ir = translate_ir(q._ldf.visit()) + ir = Translator(q._ldf.visit()).translate_ir() assert isinstance(ir, ir_nodes.Select) table = ir.children[0].evaluate(cache={}) diff --git a/python/cudf_polars/tests/dsl/test_traversal.py b/python/cudf_polars/tests/dsl/test_traversal.py index 15c644d7978..8958c2a0f84 100644 --- a/python/cudf_polars/tests/dsl/test_traversal.py +++ b/python/cudf_polars/tests/dsl/test_traversal.py @@ -10,7 +10,7 @@ import pylibcudf as plc -from cudf_polars import translate_ir +from cudf_polars import Translator from cudf_polars.dsl 
import expr, ir from cudf_polars.dsl.traversal import ( CachingVisitor, @@ -109,7 +109,7 @@ def test_rewrite_ir_node(): df = pl.LazyFrame({"a": [1, 2, 1], "b": [1, 3, 4]}) q = df.group_by("a").agg(pl.col("b").sum()).sort("b") - orig = translate_ir(q._ldf.visit()) + orig = Translator(q._ldf.visit()).translate_ir() new_df = pl.DataFrame({"a": [1, 1, 2], "b": [-1, -2, -4]}) @@ -150,7 +150,7 @@ def replace_scan(node, rec): mapper = CachingVisitor(replace_scan) - orig = translate_ir(q._ldf.visit()) + orig = Translator(q._ldf.visit()).translate_ir() new = mapper(orig) result = new.evaluate(cache={}).to_polars() @@ -174,7 +174,7 @@ def test_rewrite_names_and_ops(): .collect() ) - qir = translate_ir(q._ldf.visit()) + qir = Translator(q._ldf.visit()).translate_ir() @singledispatch def _transform(e: expr.Expr, fn: ExprTransformer) -> expr.Expr: diff --git a/python/cudf_polars/tests/expressions/test_sort.py b/python/cudf_polars/tests/expressions/test_sort.py index 62df8ce1498..6170281ad54 100644 --- a/python/cudf_polars/tests/expressions/test_sort.py +++ b/python/cudf_polars/tests/expressions/test_sort.py @@ -10,7 +10,7 @@ import pylibcudf as plc -from cudf_polars import translate_ir +from cudf_polars import Translator from cudf_polars.testing.asserts import assert_gpu_result_equal @@ -68,7 +68,7 @@ def test_setsorted(descending, nulls_last, with_nulls): assert_gpu_result_equal(q) - df = translate_ir(q._ldf.visit()).evaluate(cache={}) + df = Translator(q._ldf.visit()).translate_ir().evaluate(cache={}) a = df.column_map["a"] diff --git a/python/cudf_polars/tests/test_mapfunction.py b/python/cudf_polars/tests/test_mapfunction.py index e895f27f637..63aa1c573a9 100644 --- a/python/cudf_polars/tests/test_mapfunction.py +++ b/python/cudf_polars/tests/test_mapfunction.py @@ -93,16 +93,3 @@ def test_unpivot_defaults(): ) q = df.unpivot(index="d") assert_gpu_result_equal(q) - - -def test_unpivot_unsupported_cast_raises(): - df = pl.LazyFrame( - { - "a": ["x", "y", "z"], - "b": 
pl.Series([1, 3, 5], dtype=pl.Int16), - } - ) - - q = df.unpivot(["a", "b"]) - - assert_ir_translation_raises(q, NotImplementedError) From ccfc95a623e13d59a6e4f640ee7c022bda35f763 Mon Sep 17 00:00:00 2001 From: David Wendt <45795991+davidwendt@users.noreply.github.com> Date: Tue, 12 Nov 2024 10:03:06 -0500 Subject: [PATCH 14/19] Add new nvtext minhash_permuted API (#16756) Introduce new nvtext minhash API that takes a single seed for hashing and 2 parameter vectors to calculate the minhash results from the seed hash: ``` std::unique_ptr minhash_permuted( cudf::strings_column_view const& input, uint32_t seed, cudf::device_span parameter_a, cudf::device_span parameter_b, cudf::size_type width, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr); ``` The `seed` is used to hash the `input` using rolling set of substrings `width` characters wide. The hashes are then combined with the values in `parameter_a` and `parameter_b` to calculate a set of 32-bit (or 64-bit) values for each row. Only the minimum value is returned per element of `a` and `b` when combined with all the hashes for a row. Each output row is a set of M values where `M = parameter_a.size() = parameter_b.size()` This implementation is significantly faster than the current minhash which computes hashes for multiple seeds. Included in this PR is also the `minhash64_permuted()` API that is identical but uses 64-bit values for the seed and the parameter values. Also included are new tests and a benchmark as well as the pylibcudf and cudf interfaces. 
Authors: - David Wendt (https://github.com/davidwendt) Approvers: - Matthew Murray (https://github.com/Matt711) - Lawrence Mitchell (https://github.com/wence-) - Karthikeyan (https://github.com/karthikeyann) - Yunsong Wang (https://github.com/PointKernel) URL: https://github.com/rapidsai/cudf/pull/16756 --- cpp/benchmarks/CMakeLists.txt | 4 +- cpp/benchmarks/text/minhash.cpp | 38 +- cpp/include/nvtext/minhash.hpp | 94 +++++ cpp/src/text/minhash.cu | 390 ++++++++++++++++++ cpp/tests/CMakeLists.txt | 1 + cpp/tests/text/minhash_tests.cpp | 267 ++++++------ python/cudf/cudf/_lib/nvtext/minhash.pyx | 28 ++ python/cudf/cudf/_lib/strings/__init__.py | 2 + python/cudf/cudf/core/column/string.py | 107 +++++ .../cudf/cudf/tests/text/test_text_methods.py | 48 +-- .../pylibcudf/libcudf/nvtext/minhash.pxd | 16 + python/pylibcudf/pylibcudf/nvtext/minhash.pxd | 16 + python/pylibcudf/pylibcudf/nvtext/minhash.pyx | 103 +++++ .../pylibcudf/tests/test_nvtext_minhash.py | 12 +- 14 files changed, 949 insertions(+), 177 deletions(-) diff --git a/cpp/benchmarks/CMakeLists.txt b/cpp/benchmarks/CMakeLists.txt index ad090be99f3..59f5602fd5a 100644 --- a/cpp/benchmarks/CMakeLists.txt +++ b/cpp/benchmarks/CMakeLists.txt @@ -348,8 +348,8 @@ ConfigureNVBench(BINARYOP_NVBENCH binaryop/binaryop.cpp binaryop/compiled_binary ConfigureBench(TEXT_BENCH text/subword.cpp) ConfigureNVBench( - TEXT_NVBENCH text/edit_distance.cpp text/hash_ngrams.cpp text/jaccard.cpp text/ngrams.cpp - text/normalize.cpp text/replace.cpp text/tokenize.cpp text/vocab.cpp + TEXT_NVBENCH text/edit_distance.cpp text/hash_ngrams.cpp text/jaccard.cpp text/minhash.cpp + text/ngrams.cpp text/normalize.cpp text/replace.cpp text/tokenize.cpp text/vocab.cpp ) # ################################################################################################## diff --git a/cpp/benchmarks/text/minhash.cpp b/cpp/benchmarks/text/minhash.cpp index 31ce60d8f9a..a80d0dcbdb8 100644 --- a/cpp/benchmarks/text/minhash.cpp +++ 
b/cpp/benchmarks/text/minhash.cpp @@ -20,8 +20,6 @@ #include -#include - #include static void bench_minhash(nvbench::state& state) @@ -29,26 +27,25 @@ static void bench_minhash(nvbench::state& state) auto const num_rows = static_cast(state.get_int64("num_rows")); auto const row_width = static_cast(state.get_int64("row_width")); auto const hash_width = static_cast(state.get_int64("hash_width")); - auto const seed_count = static_cast(state.get_int64("seed_count")); + auto const parameters = static_cast(state.get_int64("parameters")); auto const base64 = state.get_int64("hash_type") == 64; - if (static_cast(num_rows) * static_cast(row_width) >= - static_cast(std::numeric_limits::max())) { - state.skip("Skip benchmarks greater than size_type limit"); - } - data_profile const strings_profile = data_profile_builder().distribution( cudf::type_id::STRING, distribution_id::NORMAL, 0, row_width); auto const strings_table = create_random_table({cudf::type_id::STRING}, row_count{num_rows}, strings_profile); cudf::strings_column_view input(strings_table->view().column(0)); - data_profile const seeds_profile = data_profile_builder().null_probability(0).distribution( - cudf::type_to_id(), distribution_id::NORMAL, 0, row_width); - auto const seed_type = base64 ? cudf::type_id::UINT64 : cudf::type_id::UINT32; - auto const seeds_table = create_random_table({seed_type}, row_count{seed_count}, seeds_profile); - auto seeds = seeds_table->get_column(0); - seeds.set_null_mask(rmm::device_buffer{}, 0); + data_profile const param_profile = data_profile_builder().no_validity().distribution( + cudf::type_to_id(), + distribution_id::NORMAL, + 0u, + std::numeric_limits::max()); + auto const param_type = base64 ? 
cudf::type_id::UINT64 : cudf::type_id::UINT32; + auto const param_table = + create_random_table({param_type, param_type}, row_count{parameters}, param_profile); + auto const parameters_a = param_table->view().column(0); + auto const parameters_b = param_table->view().column(1); state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value())); @@ -57,15 +54,16 @@ static void bench_minhash(nvbench::state& state) state.add_global_memory_writes(num_rows); // output are hashes state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { - auto result = base64 ? nvtext::minhash64(input, seeds.view(), hash_width) - : nvtext::minhash(input, seeds.view(), hash_width); + auto result = base64 + ? nvtext::minhash64_permuted(input, 0, parameters_a, parameters_b, hash_width) + : nvtext::minhash_permuted(input, 0, parameters_a, parameters_b, hash_width); }); } NVBENCH_BENCH(bench_minhash) .set_name("minhash") - .add_int64_axis("num_rows", {1024, 8192, 16364, 131072}) - .add_int64_axis("row_width", {128, 512, 2048}) - .add_int64_axis("hash_width", {5, 10}) - .add_int64_axis("seed_count", {2, 26}) + .add_int64_axis("num_rows", {15000, 30000, 60000}) + .add_int64_axis("row_width", {6000, 28000, 50000}) + .add_int64_axis("hash_width", {12, 24}) + .add_int64_axis("parameters", {26, 260}) .add_int64_axis("hash_type", {32, 64}); diff --git a/cpp/include/nvtext/minhash.hpp b/cpp/include/nvtext/minhash.hpp index 42124461cdf..b2c1a23f57e 100644 --- a/cpp/include/nvtext/minhash.hpp +++ b/cpp/include/nvtext/minhash.hpp @@ -94,6 +94,53 @@ namespace CUDF_EXPORT nvtext { rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); +/** + * @brief Returns the minhash values for each string + * + * This function uses MurmurHash3_x86_32 for the hash algorithm. + * + * The input strings are first hashed using the given `seed` over substrings + * of `width` characters. 
These hash values are then combined with the `a` + * and `b` parameter values using the following formula: + * ``` + * max_hash = max of uint32 + * mp = (1 << 61) - 1 + * hv[i] = hash value of a substring at i + * pv[i] = ((hv[i] * a[i] + b[i]) % mp) & max_hash + * ``` + * + * This calculation is performed on each substring and the minimum value is computed + * as follows: + * ``` + * mh[j,i] = min(pv[i]) for all substrings in row j + * and where i=[0,a.size()) + * ``` + * + * Any null row entries result in corresponding null output rows. + * + * @throw std::invalid_argument if the width < 2 + * @throw std::invalid_argument if parameter_a is empty + * @throw std::invalid_argument if `parameter_b.size() != parameter_a.size()` + * @throw std::overflow_error if `parameter_a.size() * input.size()` exceeds the column size limit + * + * @param input Strings column to compute minhash + * @param seed Seed value used for the hash algorithm + * @param parameter_a Values used for the permuted calculation + * @param parameter_b Values used for the permuted calculation + * @param width The character width of substrings to hash for each row + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned column's device memory + * @return List column of minhash values for each string per seed + */ +std::unique_ptr minhash_permuted( + cudf::strings_column_view const& input, + uint32_t seed, + cudf::device_span parameter_a, + cudf::device_span parameter_b, + cudf::size_type width, + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); + /** * @brief Returns the minhash value for each string * @@ -159,6 +206,53 @@ namespace CUDF_EXPORT nvtext { rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); +/** + * @brief Returns the 
minhash values for each string + * + * This function uses MurmurHash3_x64_128 for the hash algorithm. + * + * The input strings are first hashed using the given `seed` over substrings + * of `width` characters. These hash values are then combined with the `a` + * and `b` parameter values using the following formula: + * ``` + * max_hash = max of uint64 + * mp = (1 << 61) - 1 + * hv[i] = hash value of a substring at i + * pv[i] = ((hv[i] * a[i] + b[i]) % mp) & max_hash + * ``` + * + * This calculation is performed on each substring and the minimum value is computed + * as follows: + * ``` + * mh[j,i] = min(pv[i]) for all substrings in row j + * and where i=[0,a.size()) + * ``` + * + * Any null row entries result in corresponding null output rows. + * + * @throw std::invalid_argument if the width < 2 + * @throw std::invalid_argument if parameter_a is empty + * @throw std::invalid_argument if `parameter_b.size() != parameter_a.size()` + * @throw std::overflow_error if `parameter_a.size() * input.size()` exceeds the column size limit + * + * @param input Strings column to compute minhash + * @param seed Seed value used for the hash algorithm + * @param parameter_a Values used for the permuted calculation + * @param parameter_b Values used for the permuted calculation + * @param width The character width of substrings to hash for each row + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned column's device memory + * @return List column of minhash values for each string per seed + */ +std::unique_ptr minhash64_permuted( + cudf::strings_column_view const& input, + uint64_t seed, + cudf::device_span parameter_a, + cudf::device_span parameter_b, + cudf::size_type width, + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); + /** * @brief Returns the minhash values for each row of strings per 
seed * diff --git a/cpp/src/text/minhash.cu b/cpp/src/text/minhash.cu index a03a34f5fa7..aee83ab35ed 100644 --- a/cpp/src/text/minhash.cu +++ b/cpp/src/text/minhash.cu @@ -18,6 +18,7 @@ #include #include #include +#include #include #include #include @@ -37,9 +38,13 @@ #include #include +#include #include +#include #include #include +#include +#include #include @@ -162,6 +167,339 @@ std::unique_ptr minhash_fn(cudf::strings_column_view const& input, return hashes; } +constexpr cudf::thread_index_type block_size = 256; +// for potentially tuning minhash_seed_kernel independently from block_size +constexpr cudf::thread_index_type tile_size = block_size; + +// Number of a/b parameter values to process per thread. +// The intermediate values are stored in shared-memory and therefore limits this count. +// This value was found to be an efficient size for both uint32 and uint64 +// hash types based on benchmarks. +constexpr cuda::std::size_t params_per_thread = 16; + +// Separate kernels are used to process strings above and below this value (in bytes). +constexpr cudf::size_type wide_string_threshold = 1 << 18; // 256K +// The number of blocks per string for the above-threshold kernel processing. +constexpr cudf::size_type blocks_per_string = 64; +// The above values were determined using the redpajama and books_sample datasets + +/** + * @brief Hashing kernel launched as a thread per tile-size (block or warp) + * + * This kernel computes the hashes for each string using the seed and the specified + * hash function. The width is used to compute rolling substrings to hash over. + * The hashes are stored in d_hashes to be used in the minhash_permuted_kernel. + * + * This kernel also counts the number of strings above the wide_string_threshold + * and proactively initializes the output values for those strings. 
+ * + * @tparam HashFunction The hash function to use for this kernel + * @tparam hash_value_type Derived from HashFunction result_type + * + * @param d_strings The input strings to hash + * @param seed The seed used for the hash function + * @param width Width in characters used for determining substrings to hash + * @param d_hashes The resulting hash values are stored here + * @param threshold_count Stores the number of strings above wide_string_threshold + * @param param_count Number of parameters (used for the proactive initialize) + * @param d_results Final results vector (used for the proactive initialize) + */ +template +CUDF_KERNEL void minhash_seed_kernel(cudf::column_device_view const d_strings, + hash_value_type seed, + cudf::size_type width, + hash_value_type* d_hashes, + cudf::size_type* threshold_count, + cudf::size_type param_count, + hash_value_type* d_results) +{ + auto const tid = cudf::detail::grid_1d::global_thread_id(); + auto const str_idx = tid / tile_size; + if (str_idx >= d_strings.size()) { return; } + if (d_strings.is_null(str_idx)) { return; } + + // retrieve this string's offset to locate the output position in d_hashes + auto const offsets = d_strings.child(cudf::strings_column_view::offsets_column_index); + auto const offsets_itr = + cudf::detail::input_offsetalator(offsets.head(), offsets.type(), d_strings.offset()); + auto const offset = offsets_itr[str_idx]; + auto const size_bytes = static_cast(offsets_itr[str_idx + 1] - offset); + if (size_bytes == 0) { return; } + + auto const d_str = cudf::string_view(d_strings.head() + offset, size_bytes); + auto const lane_idx = tid % tile_size; + + // hashes for this string/thread are stored here + auto seed_hashes = d_hashes + offset - offsets_itr[0] + lane_idx; + + auto const begin = d_str.data() + lane_idx; + auto const end = d_str.data() + d_str.size_bytes(); + auto const hasher = HashFunction(seed); + + for (auto itr = begin; itr < end; itr += tile_size, seed_hashes += tile_size) { + if 
(cudf::strings::detail::is_utf8_continuation_char(*itr)) { + *seed_hashes = 0; + continue; + } + auto const check_str = // used for counting 'width' characters + cudf::string_view(itr, static_cast(thrust::distance(itr, end))); + auto const [bytes, left] = cudf::strings::detail::bytes_to_character_position(check_str, width); + if ((itr != d_str.data()) && (left > 0)) { + // true itr+width is past the end of the string + *seed_hashes = 0; + continue; + } + + auto const hash_str = cudf::string_view(itr, bytes); + hash_value_type hv; + if constexpr (std::is_same_v) { + hv = hasher(hash_str); + } else { + hv = thrust::get<0>(hasher(hash_str)); + } + // disallowing hash to zero case + *seed_hashes = cuda::std::max(hv, hash_value_type{1}); + } + + // logic appended here so an extra kernel is not required + if (size_bytes >= wide_string_threshold) { + if (lane_idx == 0) { + // count the number of wide strings + cuda::atomic_ref ref{*threshold_count}; + ref.fetch_add(1, cuda::std::memory_order_relaxed); + } + // initialize the output -- only needed for wider strings + auto d_output = d_results + (str_idx * param_count); + for (auto i = lane_idx; i < param_count; i += tile_size) { + d_output[i] = std::numeric_limits::max(); + } + } +} + +/** + * @brief Permutation calculation kernel + * + * This kernel uses the hashes from the minhash_seed_kernel and the parameter_a and + * parameter_b values to compute the final output results. + * The output is the number of input rows (N) by the number of parameter values (M). + * Each output[i] is the calculated result for parameter_a/b[0:M]. + * + * This kernel is launched with either 1 block per string for strings + * below the wide_string_threshold or blocks per string = blocks_per_string + * for strings above wide_string_threshold.
+ * + * @tparam hash_value_type Derived from HashFunction result_type + * @tparam blocks_per_string Number of blocks used to process each string + * + * @param d_strings The input strings to hash + * @param indices The indices of the strings in d_strings to process + * @param parameter_a 1st set of parameters for the calculation result + * @param parameter_b 2nd set of parameters for the calculation result + * @param width Used for calculating the number of available hashes in each string + * @param d_hashes The hash values computed in minhash_seed_kernel + * @param d_results Final results vector of calculated values + */ +template +CUDF_KERNEL void minhash_permuted_kernel(cudf::column_device_view const d_strings, + cudf::device_span indices, + cudf::device_span parameter_a, + cudf::device_span parameter_b, + cudf::size_type width, + hash_value_type const* d_hashes, + hash_value_type* d_results) +{ + auto const tid = cudf::detail::grid_1d::global_thread_id(); + auto const idx = (tid / blocks_per_string) / block_size; + if (idx >= indices.size()) { return; } + auto const str_idx = indices[idx]; + if (d_strings.is_null(str_idx)) { return; } + + auto const block = cooperative_groups::this_thread_block(); + int const section_idx = block.group_index().x % blocks_per_string; + + auto const offsets = d_strings.child(cudf::strings_column_view::offsets_column_index); + auto const offsets_itr = + cudf::detail::input_offsetalator(offsets.head(), offsets.type(), d_strings.offset()); + auto const offset = offsets_itr[str_idx]; + auto const size_bytes = static_cast(offsets_itr[str_idx + 1] - offset); + + // number of items to process in this block; + // last block also includes any remainder values from the size_bytes/blocks_per_string truncation + // example: + // each section_size for string with size 588090 and blocks_per_string=64 is 9188 + // except the last section which is 9188 + (588090 % 64) = 9246 + auto const section_size = + (size_bytes / blocks_per_string) + +
(section_idx < (blocks_per_string - 1) ? 0 : size_bytes % blocks_per_string); + auto const section_offset = section_idx * (size_bytes / blocks_per_string); + + // hash values for this block/section + auto const seed_hashes = d_hashes + offset - offsets_itr[0] + section_offset; + // width used here as a max value since a string's char-count <= byte-count + auto const hashes_size = + section_idx < (blocks_per_string - 1) + ? section_size + : cuda::std::max(static_cast(size_bytes > 0), section_size - width + 1); + + auto const init = size_bytes == 0 ? 0 : std::numeric_limits::max(); + auto const lane_idx = block.thread_rank(); + auto const d_output = d_results + (str_idx * parameter_a.size()); + + auto const begin = seed_hashes + lane_idx; + auto const end = seed_hashes + hashes_size; + + // constants used in the permutation calculations + constexpr uint64_t mersenne_prime = (1UL << 61) - 1; + constexpr hash_value_type hash_max = std::numeric_limits::max(); + + // found to be an efficient shared memory size for both hash types + __shared__ hash_value_type block_values[block_size * params_per_thread]; + + for (std::size_t i = 0; i < parameter_a.size(); i += params_per_thread) { + // initialize this block's chunk of shared memory + // each thread handles params_per_thread of values + auto const chunk_values = block_values + (lane_idx * params_per_thread); + thrust::uninitialized_fill(thrust::seq, chunk_values, chunk_values + params_per_thread, init); + block.sync(); + + auto const param_count = + cuda::std::min(static_cast(params_per_thread), parameter_a.size() - i); + + // each lane accumulates min hashes in its shared memory + for (auto itr = begin; itr < end; itr += block_size) { + auto const hv = *itr; + // 0 is used as a skip sentinel for UTF-8 and trailing bytes + if (hv == 0) { continue; } + + for (std::size_t param_idx = i; param_idx < (i + param_count); ++param_idx) { + // permutation formula used by datatrove + hash_value_type const v = + ((hv * 
parameter_a[param_idx] + parameter_b[param_idx]) % mersenne_prime) & hash_max; + auto const block_idx = ((param_idx % params_per_thread) * block_size) + lane_idx; + block_values[block_idx] = cuda::std::min(v, block_values[block_idx]); + } + } + block.sync(); + + // reduce each parameter values vector to a single min value; + // assumes that the block_size > params_per_thread; + // each thread reduces a block_size of parameter values (thread per parameter) + if (lane_idx < param_count) { + auto const values = block_values + (lane_idx * block_size); + // cooperative groups does not have a min function and cub::BlockReduce was slower + auto const minv = + thrust::reduce(thrust::seq, values, values + block_size, init, thrust::minimum{}); + if constexpr (blocks_per_string > 1) { + // accumulates mins for each block into d_output + cuda::atomic_ref ref{d_output[lane_idx + i]}; + ref.fetch_min(minv, cuda::std::memory_order_relaxed); + } else { + d_output[lane_idx + i] = minv; + } + } + block.sync(); + } +} + +template +std::unique_ptr minhash_fn(cudf::strings_column_view const& input, + hash_value_type seed, + cudf::device_span parameter_a, + cudf::device_span parameter_b, + cudf::size_type width, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) +{ + CUDF_EXPECTS(width >= 2, + "Parameter width should be an integer value of 2 or greater", + std::invalid_argument); + CUDF_EXPECTS(!parameter_a.empty(), "Parameters A and B cannot be empty", std::invalid_argument); + CUDF_EXPECTS(parameter_a.size() == parameter_b.size(), + "Parameters A and B should have the same number of elements", + std::invalid_argument); + CUDF_EXPECTS( + (static_cast(input.size()) * parameter_a.size()) < + static_cast(std::numeric_limits::max()), + "The number of parameters times the number of input rows exceeds the column size limit", + std::overflow_error); + + auto const output_type = cudf::data_type{cudf::type_to_id()}; + if (input.is_empty()) { return 
cudf::make_empty_column(output_type); } + + auto const d_strings = cudf::column_device_view::create(input.parent(), stream); + + auto results = + cudf::make_numeric_column(output_type, + input.size() * static_cast(parameter_a.size()), + cudf::mask_state::UNALLOCATED, + stream, + mr); + auto d_results = results->mutable_view().data(); + + cudf::detail::grid_1d grid{static_cast(input.size()) * block_size, + block_size}; + auto const hashes_size = input.chars_size(stream); + auto d_hashes = rmm::device_uvector(hashes_size, stream); + auto d_threshold_count = cudf::detail::device_scalar(0, stream); + + minhash_seed_kernel + <<>>(*d_strings, + seed, + width, + d_hashes.data(), + d_threshold_count.data(), + parameter_a.size(), + d_results); + auto const threshold_count = d_threshold_count.value(stream); + + auto indices = rmm::device_uvector(input.size(), stream); + thrust::sequence(rmm::exec_policy(stream), indices.begin(), indices.end()); + cudf::size_type threshold_index = threshold_count < input.size() ? 
input.size() : 0; + + // if we counted a split of above/below threshold then + // compute partitions based on the size of each string + if ((threshold_count > 0) && (threshold_count < input.size())) { + auto sizes = rmm::device_uvector(input.size(), stream); + thrust::transform(rmm::exec_policy_nosync(stream), + thrust::counting_iterator(0), + thrust::counting_iterator(input.size()), + sizes.data(), + cuda::proclaim_return_type( + [d_strings = *d_strings] __device__(auto idx) -> cudf::size_type { + if (d_strings.is_null(idx)) { return 0; } + return d_strings.element(idx).size_bytes(); + })); + thrust::sort_by_key( + rmm::exec_policy_nosync(stream), sizes.begin(), sizes.end(), indices.begin()); + auto const lb = thrust::lower_bound( + rmm::exec_policy_nosync(stream), sizes.begin(), sizes.end(), wide_string_threshold); + threshold_index = static_cast(thrust::distance(sizes.begin(), lb)); + } + + // handle the strings below the threshold width + if (threshold_index > 0) { + auto d_indices = cudf::device_span(indices.data(), threshold_index); + cudf::detail::grid_1d grid{static_cast(d_indices.size()) * block_size, + block_size}; + minhash_permuted_kernel + <<>>( + *d_strings, d_indices, parameter_a, parameter_b, width, d_hashes.data(), d_results); + } + + // handle the strings above the threshold width + if (threshold_index < input.size()) { + auto const count = static_cast(input.size() - threshold_index); + auto d_indices = + cudf::device_span(indices.data() + threshold_index, count); + cudf::detail::grid_1d grid{count * block_size * blocks_per_string, block_size}; + minhash_permuted_kernel + <<>>( + *d_strings, d_indices, parameter_a, parameter_b, width, d_hashes.data(), d_results); + } + + return results; +} + /** * @brief Compute the minhash of each list row of strings for each seed * @@ -309,6 +647,20 @@ std::unique_ptr minhash(cudf::strings_column_view const& input, return build_list_result(input.parent(), std::move(hashes), seeds.size(), stream, mr); } 
+std::unique_ptr minhash(cudf::strings_column_view const& input, + uint32_t seed, + cudf::device_span parameter_a, + cudf::device_span parameter_b, + cudf::size_type width, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) +{ + using HashFunction = cudf::hashing::detail::MurmurHash3_x86_32; + auto hashes = + detail::minhash_fn(input, seed, parameter_a, parameter_b, width, stream, mr); + return build_list_result(input.parent(), std::move(hashes), parameter_a.size(), stream, mr); +} + std::unique_ptr minhash64(cudf::strings_column_view const& input, cudf::numeric_scalar const& seed, cudf::size_type width, @@ -333,6 +685,20 @@ std::unique_ptr minhash64(cudf::strings_column_view const& input, return build_list_result(input.parent(), std::move(hashes), seeds.size(), stream, mr); } +std::unique_ptr minhash64(cudf::strings_column_view const& input, + uint64_t seed, + cudf::device_span parameter_a, + cudf::device_span parameter_b, + cudf::size_type width, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) +{ + using HashFunction = cudf::hashing::detail::MurmurHash3_x64_128; + auto hashes = + detail::minhash_fn(input, seed, parameter_a, parameter_b, width, stream, mr); + return build_list_result(input.parent(), std::move(hashes), parameter_a.size(), stream, mr); +} + std::unique_ptr word_minhash(cudf::lists_column_view const& input, cudf::device_span seeds, rmm::cuda_stream_view stream, @@ -374,6 +740,18 @@ std::unique_ptr minhash(cudf::strings_column_view const& input, return detail::minhash(input, seeds, width, stream, mr); } +std::unique_ptr minhash_permuted(cudf::strings_column_view const& input, + uint32_t seed, + cudf::device_span parameter_a, + cudf::device_span parameter_b, + cudf::size_type width, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) +{ + CUDF_FUNC_RANGE(); + return detail::minhash(input, seed, parameter_a, parameter_b, width, stream, mr); +} + std::unique_ptr minhash64(cudf::strings_column_view 
const& input, cudf::numeric_scalar seed, cudf::size_type width, @@ -394,6 +772,18 @@ std::unique_ptr minhash64(cudf::strings_column_view const& input, return detail::minhash64(input, seeds, width, stream, mr); } +std::unique_ptr minhash64_permuted(cudf::strings_column_view const& input, + uint64_t seed, + cudf::device_span parameter_a, + cudf::device_span parameter_b, + cudf::size_type width, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) +{ + CUDF_FUNC_RANGE(); + return detail::minhash64(input, seed, parameter_a, parameter_b, width, stream, mr); +} + std::unique_ptr word_minhash(cudf::lists_column_view const& input, cudf::device_span seeds, rmm::cuda_stream_view stream, diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index 3a9b930830b..cbca0ceef77 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -610,6 +610,7 @@ ConfigureTest( text/bpe_tests.cpp text/edit_distance_tests.cpp text/jaccard_tests.cpp + text/minhash_tests.cpp text/ngrams_tests.cpp text/ngrams_tokenize_tests.cpp text/normalize_tests.cpp diff --git a/cpp/tests/text/minhash_tests.cpp b/cpp/tests/text/minhash_tests.cpp index ef35a4472cf..042ac44621e 100644 --- a/cpp/tests/text/minhash_tests.cpp +++ b/cpp/tests/text/minhash_tests.cpp @@ -28,155 +28,169 @@ struct MinHashTest : public cudf::test::BaseFixture {}; -TEST_F(MinHashTest, Basic) +TEST_F(MinHashTest, Permuted) { - auto validity = cudf::test::iterators::null_at(1); auto input = cudf::test::strings_column_wrapper({"doc 1", - "", "this is doc 2", - "", "doc 3", "d", - "The quick brown fox jumpéd over the lazy brown dog."}, - validity); + "The quick brown fox jumpéd over the lazy brown dog.", + "line six", + "line seven", + "line eight", + "line nine", + "line ten"}); auto view = cudf::strings_column_view(input); - auto results = nvtext::minhash(view); + auto first = thrust::counting_iterator(10); + auto params = cudf::test::fixed_width_column_wrapper(first, first + 3); + auto results = + 
nvtext::minhash_permuted(view, 0, cudf::column_view(params), cudf::column_view(params), 4); - auto expected = cudf::test::fixed_width_column_wrapper( - {1207251914u, 0u, 21141582u, 0u, 1207251914u, 655955059u, 86520422u}, validity); + using LCW32 = cudf::test::lists_column_wrapper; + // clang-format off + LCW32 expected({ + LCW32{1392101586u, 394869177u, 811528444u}, + LCW32{ 211415830u, 187088503u, 130291444u}, + LCW32{2098117052u, 394869177u, 799753544u}, + LCW32{2264583304u, 2920538364u, 3576493424u}, + LCW32{ 253327882u, 41747273u, 302030804u}, + LCW32{2109809594u, 1017470651u, 326988172u}, + LCW32{1303819864u, 850676747u, 147107852u}, + LCW32{ 736021564u, 720812292u, 1405158760u}, + LCW32{ 902780242u, 134064807u, 1613944636u}, + LCW32{ 547084870u, 1748895564u, 656501844u} + }); + // clang-format on CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); - auto results64 = nvtext::minhash64(view); - auto expected64 = cudf::test::fixed_width_column_wrapper({774489391575805754ul, - 0ul, - 3232308021562742685ul, - 0ul, - 13145552576991307582ul, - 14660046701545912182ul, - 398062025280761388ul}, - validity); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results64, expected64); -} + auto params64 = cudf::test::fixed_width_column_wrapper(first, first + 3); + auto results64 = nvtext::minhash64_permuted( + view, 0, cudf::column_view(params64), cudf::column_view(params64), 4); -TEST_F(MinHashTest, LengthEqualsWidth) -{ - auto input = cudf::test::strings_column_wrapper({"abcdé", "fghjk", "lmnop", "qrstu", "vwxyz"}); - auto view = cudf::strings_column_view(input); - auto results = nvtext::minhash(view, 0, 5); - auto expected = cudf::test::fixed_width_column_wrapper( - {3825281041u, 2728681928u, 1984332911u, 3965004915u, 192452857u}); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); + using LCW64 = cudf::test::lists_column_wrapper; + // clang-format off + LCW64 expected64({ + LCW64{ 827364888116975697ul, 1601854279692781452ul, 70500662054893256ul}, + LCW64{ 18312093741021833ul, 
133793446674258329ul, 21974512489226198ul}, + LCW64{ 22474244732520567ul, 1638811775655358395ul, 949306297364502264ul}, + LCW64{1332357434996402861ul, 2157346081260151330ul, 676491718310205848ul}, + LCW64{ 65816830624808020ul, 43323600380520789ul, 63511816333816345ul}, + LCW64{ 629657184954525200ul, 49741036507643002ul, 97466271004074331ul}, + LCW64{ 301611977846331113ul, 101188874709594830ul, 97466271004074331ul}, + LCW64{ 121498891461700668ul, 171065800427907402ul, 97466271004074331ul}, + LCW64{ 54617739511834072ul, 231454301607238929ul, 97466271004074331ul}, + LCW64{ 576418665851990314ul, 231454301607238929ul, 97466271004074331ul} + }); + // clang-format on + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results64, expected64); } -TEST_F(MinHashTest, MultiSeed) +TEST_F(MinHashTest, PermutedWide) { - auto input = - cudf::test::strings_column_wrapper({"doc 1", - "this is doc 2", - "doc 3", - "d", - "The quick brown fox jumpéd over the lazy brown dog."}); - - auto view = cudf::strings_column_view(input); + std::string const small(2 << 10, 'x'); // below wide_string_threshold + std::string const wide(2 << 19, 'y'); // above wide_string_threshold + auto input = cudf::test::strings_column_wrapper({small, wide}); + auto view = cudf::strings_column_view(input); - auto seeds = cudf::test::fixed_width_column_wrapper({0, 1, 2}); - auto results = nvtext::minhash(view, cudf::column_view(seeds)); + auto first = thrust::counting_iterator(20); + auto params = cudf::test::fixed_width_column_wrapper(first, first + 3); + auto results = + nvtext::minhash_permuted(view, 0, cudf::column_view(params), cudf::column_view(params), 4); - using LCW = cudf::test::lists_column_wrapper; + using LCW32 = cudf::test::lists_column_wrapper; // clang-format off - LCW expected({LCW{1207251914u, 1677652962u, 1061355987u}, - LCW{ 21141582u, 580916568u, 1258052021u}, - LCW{1207251914u, 943567174u, 1109272887u}, - LCW{ 655955059u, 488346356u, 2394664816u}, - LCW{ 86520422u, 236622901u, 102546228u}}); + LCW32 
expected({ + LCW32{1731998032u, 315359380u, 3193688024u}, + LCW32{1293098788u, 2860992281u, 133918478u} + }); // clang-format on CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); - auto seeds64 = cudf::test::fixed_width_column_wrapper({0, 1, 2}); - auto results64 = nvtext::minhash64(view, cudf::column_view(seeds64)); + auto params64 = cudf::test::fixed_width_column_wrapper(first, first + 3); + auto results64 = nvtext::minhash64_permuted( + view, 0, cudf::column_view(params64), cudf::column_view(params64), 4); using LCW64 = cudf::test::lists_column_wrapper; // clang-format off - LCW64 expected64({LCW64{ 774489391575805754ul, 10435654231793485448ul, 1188598072697676120ul}, - LCW64{ 3232308021562742685ul, 4445611509348165860ul, 1188598072697676120ul}, - LCW64{13145552576991307582ul, 6846192680998069919ul, 1188598072697676120ul}, - LCW64{14660046701545912182ul, 17106501326045553694ul, 17713478494106035784ul}, - LCW64{ 398062025280761388ul, 377720198157450084ul, 984941365662009329ul}}); + LCW64 expected64({ + LCW64{1818322427062143853ul, 641024893347719371ul, 1769570368846988848ul}, + LCW64{1389920339306667795ul, 421787002125838902ul, 1759496674158703968ul} + }); // clang-format on CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results64, expected64); } -TEST_F(MinHashTest, MultiSeedWithNullInputRow) +TEST_F(MinHashTest, PermutedManyParameters) { - auto validity = cudf::test::iterators::null_at(1); - auto input = cudf::test::strings_column_wrapper({"abcdéfgh", "", "", "stuvwxyz"}, validity); - auto view = cudf::strings_column_view(input); + std::string const small(2 << 10, 'x'); + std::string const wide(2 << 19, 'y'); + auto input = cudf::test::strings_column_wrapper({small, wide}); + auto view = cudf::strings_column_view(input); - auto seeds = cudf::test::fixed_width_column_wrapper({1, 2}); - auto results = nvtext::minhash(view, cudf::column_view(seeds)); + auto first = thrust::counting_iterator(20); + // more than params_per_thread + auto params = 
cudf::test::fixed_width_column_wrapper(first, first + 31); + auto results = + nvtext::minhash_permuted(view, 0, cudf::column_view(params), cudf::column_view(params), 4); - using LCW = cudf::test::lists_column_wrapper; - LCW expected({LCW{484984072u, 1074168784u}, LCW{}, LCW{0u, 0u}, LCW{571652169u, 173528385u}}, - validity); + using LCW32 = cudf::test::lists_column_wrapper; + // clang-format off + LCW32 expected({ + LCW32{1731998032u, 315359380u, 3193688024u, 1777049372u, 360410720u, 3238739364u, 1822100712u, 405462060u, + 3283790704u, 1867152052u, 450513400u, 3328842044u, 1912203392u, 495564740u, 3373893384u, 1957254732u, + 540616080u, 3418944724u, 2002306072u, 585667420u, 3463996064u, 2047357412u, 630718760u, 3509047404u, + 2092408752u, 675770100u, 3554098744u, 2137460092u, 720821440u, 3599150084u, 2182511432u}, + LCW32{1293098788u, 2860992281u, 133918478u, 1701811971u, 3269705464u, 542631661u, 2110525154u, 3678418647u, + 951344844u, 2519238337u, 4087131830u, 1360058027u, 2927951520u, 200877717u, 1768771210u, 3336664703u, + 609590900u, 2177484393u, 3745377886u, 1018304083u, 2586197576u, 4154091069u, 1427017266u, 2994910759u, + 267836956u, 1835730449u, 3403623942u, 676550139u, 2244443632u, 3812337125u, 1085263322u} + }); + // clang-format on CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); - auto seeds64 = cudf::test::fixed_width_column_wrapper({11, 22}); - auto results64 = nvtext::minhash64(view, cudf::column_view(seeds64)); + // more than params_per_thread + auto params64 = cudf::test::fixed_width_column_wrapper(first, first + 31); + auto results64 = nvtext::minhash64_permuted( + view, 0, cudf::column_view(params64), cudf::column_view(params64), 4); using LCW64 = cudf::test::lists_column_wrapper; - LCW64 expected64({LCW64{2597399324547032480ul, 4461410998582111052ul}, - LCW64{}, - LCW64{0ul, 0ul}, - LCW64{2717781266371273264ul, 6977325820868387259ul}}, - validity); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results64, expected64); -} - -TEST_F(MinHashTest, 
WordsMinHash) -{ - using LCWS = cudf::test::lists_column_wrapper; - auto validity = cudf::test::iterators::null_at(1); - - LCWS input( - {LCWS({"hello", "abcdéfgh"}), - LCWS{}, - LCWS({"rapids", "moré", "test", "text"}), - LCWS({"The", "quick", "brown", "fox", "jumpéd", "over", "the", "lazy", "brown", "dog"})}, - validity); - - auto view = cudf::lists_column_view(input); - - auto seeds = cudf::test::fixed_width_column_wrapper({1, 2}); - auto results = nvtext::word_minhash(view, cudf::column_view(seeds)); - using LCW32 = cudf::test::lists_column_wrapper; - LCW32 expected({LCW32{2069617641u, 1975382903u}, - LCW32{}, - LCW32{657297235u, 1010955999u}, - LCW32{644643885u, 310002789u}}, - validity); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); - - auto seeds64 = cudf::test::fixed_width_column_wrapper({11, 22}); - auto results64 = nvtext::word_minhash64(view, cudf::column_view(seeds64)); - using LCW64 = cudf::test::lists_column_wrapper; - LCW64 expected64({LCW64{1940333969930105370ul, 272615362982418219ul}, - LCW64{}, - LCW64{5331949571924938590ul, 2088583894581919741ul}, - LCW64{3400468157617183341ul, 2398577492366130055ul}}, - validity); + // clang-format off + LCW64 expected64({ + LCW64{1818322427062143853, 641024893347719371, 1769570368846988848, 592272835132564366, + 1720818310631833835, 543520776917409353, 1672066252416678822, 494768718702254348, + 1623314194201523817, 446016660487099335, 1574562135986368804, 397264602271944322, + 1525810077771213799, 348512544056789317, 1477058019556058786, 299760485841634304, + 1428305961340903773, 251008427626479291, 1379553903125748768, 202256369411324286, + 1330801844910593755, 153504311196169273, 1282049786695438742, 104752252981014268, + 1233297728480283737, 56000194765859255, 1184545670265128724, 7248136550704242, + 1135793612049973719, 2264339087549243188, 1087041553834818706}, + LCW64{1389920339306667795, 421787002125838902, 1759496674158703968, 791363336977875075, + 2129073009010740141, 1160939671829911248, 
192806334649082363, 1530516006681947421, + 562382669501118536, 1900092341533983602, 931959004353154709, 2269668676386019775, + 1301535339205190882, 333402002024361997, 1671111674057227055, 702978336876398170, + 2040688008909263228, 1072554671728434343, 104421334547605450, 1442131006580470516, + 473997669399641631, 1811707341432506689, 843574004251677804, 2181283676284542862, + 1213150339103713977, 245017001922885084, 1582726673955750150, 614593336774921257, + 1952303008807786323, 984169671626957438, 16036334446128545} + }); + // clang-format on CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results64, expected64); } TEST_F(MinHashTest, EmptyTest) { - auto input = cudf::make_empty_column(cudf::data_type{cudf::type_id::STRING}); - auto view = cudf::strings_column_view(input->view()); - auto results = nvtext::minhash(view); + auto input = cudf::make_empty_column(cudf::data_type{cudf::type_id::STRING}); + auto view = cudf::strings_column_view(input->view()); + auto params = cudf::test::fixed_width_column_wrapper({1, 2, 3}); + auto results = + nvtext::minhash_permuted(view, 0, cudf::column_view(params), cudf::column_view(params), 4); EXPECT_EQ(results->size(), 0); - results = nvtext::minhash64(view); + auto params64 = cudf::test::fixed_width_column_wrapper({1, 2, 3}); + results = nvtext::minhash64_permuted( + view, 0, cudf::column_view(params64), cudf::column_view(params64), 4); EXPECT_EQ(results->size(), 0); } @@ -184,20 +198,39 @@ TEST_F(MinHashTest, ErrorsTest) { auto input = cudf::test::strings_column_wrapper({"this string intentionally left blank"}); auto view = cudf::strings_column_view(input); - EXPECT_THROW(nvtext::minhash(view, 0, 0), std::invalid_argument); - EXPECT_THROW(nvtext::minhash64(view, 0, 0), std::invalid_argument); - auto seeds = cudf::test::fixed_width_column_wrapper(); - EXPECT_THROW(nvtext::minhash(view, cudf::column_view(seeds)), std::invalid_argument); - auto seeds64 = cudf::test::fixed_width_column_wrapper(); - EXPECT_THROW(nvtext::minhash64(view, 
cudf::column_view(seeds64)), std::invalid_argument); + auto empty = cudf::test::fixed_width_column_wrapper(); + EXPECT_THROW( + nvtext::minhash_permuted(view, 0, cudf::column_view(empty), cudf::column_view(empty), 0), + std::invalid_argument); + auto empty64 = cudf::test::fixed_width_column_wrapper(); + EXPECT_THROW( + nvtext::minhash64_permuted(view, 0, cudf::column_view(empty64), cudf::column_view(empty64), 0), + std::invalid_argument); + EXPECT_THROW( + nvtext::minhash_permuted(view, 0, cudf::column_view(empty), cudf::column_view(empty), 4), + std::invalid_argument); + EXPECT_THROW( + nvtext::minhash64_permuted(view, 0, cudf::column_view(empty64), cudf::column_view(empty64), 4), + std::invalid_argument); std::vector h_input(50000, ""); input = cudf::test::strings_column_wrapper(h_input.begin(), h_input.end()); view = cudf::strings_column_view(input); auto const zeroes = thrust::constant_iterator(0); - seeds = cudf::test::fixed_width_column_wrapper(zeroes, zeroes + 50000); - EXPECT_THROW(nvtext::minhash(view, cudf::column_view(seeds)), std::overflow_error); - seeds64 = cudf::test::fixed_width_column_wrapper(zeroes, zeroes + 50000); - EXPECT_THROW(nvtext::minhash64(view, cudf::column_view(seeds64)), std::overflow_error); + auto params = cudf::test::fixed_width_column_wrapper(zeroes, zeroes + 50000); + EXPECT_THROW( + nvtext::minhash_permuted(view, 0, cudf::column_view(params), cudf::column_view(params), 4), + std::overflow_error); + auto params64 = cudf::test::fixed_width_column_wrapper(zeroes, zeroes + 50000); + EXPECT_THROW(nvtext::minhash64_permuted( + view, 0, cudf::column_view(params64), cudf::column_view(params64), 4), + std::overflow_error); + + EXPECT_THROW( + nvtext::minhash_permuted(view, 0, cudf::column_view(params), cudf::column_view(empty), 4), + std::invalid_argument); + EXPECT_THROW( + nvtext::minhash64_permuted(view, 0, cudf::column_view(params64), cudf::column_view(empty64), 4), + std::invalid_argument); } diff --git 
a/python/cudf/cudf/_lib/nvtext/minhash.pyx b/python/cudf/cudf/_lib/nvtext/minhash.pyx index 5e39cafa47b..25cfcf99ca6 100644 --- a/python/cudf/cudf/_lib/nvtext/minhash.pyx +++ b/python/cudf/cudf/_lib/nvtext/minhash.pyx @@ -1,5 +1,7 @@ # Copyright (c) 2023-2024, NVIDIA CORPORATION. +from libc.stdint cimport uint32_t, uint64_t + from cudf.core.buffer import acquire_spill_lock from cudf._lib.column cimport Column @@ -17,6 +19,19 @@ def minhash(Column input, Column seeds, int width=4): return Column.from_pylibcudf(result) +@acquire_spill_lock() +def minhash_permuted(Column input, uint32_t seed, Column a, Column b, int width): + return Column.from_pylibcudf( + nvtext.minhash.minhash_permuted( + input.to_pylibcudf(mode="read"), + seed, + a.to_pylibcudf(mode="read"), + b.to_pylibcudf(mode="read"), + width, + ) + ) + + @acquire_spill_lock() def minhash64(Column input, Column seeds, int width=4): result = nvtext.minhash.minhash64( @@ -27,6 +42,19 @@ def minhash64(Column input, Column seeds, int width=4): return Column.from_pylibcudf(result) +@acquire_spill_lock() +def minhash64_permuted(Column input, uint64_t seed, Column a, Column b, int width): + return Column.from_pylibcudf( + nvtext.minhash.minhash64_permuted( + input.to_pylibcudf(mode="read"), + seed, + a.to_pylibcudf(mode="read"), + b.to_pylibcudf(mode="read"), + width, + ) + ) + + @acquire_spill_lock() def word_minhash(Column input, Column seeds): result = nvtext.minhash.word_minhash( diff --git a/python/cudf/cudf/_lib/strings/__init__.py b/python/cudf/cudf/_lib/strings/__init__.py index ffa5e603408..4c0ec2d9ac5 100644 --- a/python/cudf/cudf/_lib/strings/__init__.py +++ b/python/cudf/cudf/_lib/strings/__init__.py @@ -9,6 +9,8 @@ from cudf._lib.nvtext.minhash import ( minhash, minhash64, + minhash64_permuted, + minhash_permuted, word_minhash, word_minhash64, ) diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py index 856ce0f75de..3d70b01b7e4 100644 --- 
a/python/cudf/cudf/core/column/string.py +++ b/python/cudf/cudf/core/column/string.py @@ -5350,11 +5350,65 @@ def minhash( libstrings.minhash(self._column, seeds_column, width) ) + def minhash_permuted( + self, seed: np.uint32, a: ColumnLike, b: ColumnLike, width: int + ) -> SeriesOrIndex: + """ + Compute the minhash of a strings column. + + This uses the MurmurHash3_x86_32 algorithm for the hash function. + + Calculation uses the formula (hv * a + b) % mersenne_prime + where hv is the hash of a substring of width characters, + a and b are provided values and mersenne_prime is 2^61-1. + + Parameters + ---------- + seed : uint32 + The seed used for the hash algorithm. + a : ColumnLike + Values for minhash calculation. + Must be of type uint32. + b : ColumnLike + Values for minhash calculation. + Must be of type uint32. + width : int + The width of the substring to hash. + + Examples + -------- + >>> import cudf + >>> import numpy as np + >>> s = cudf.Series(['this is my', 'favorite book']) + >>> a = cudf.Series([1, 2, 3], dtype=np.uint32) + >>> b = cudf.Series([4, 5, 6], dtype=np.uint32) + >>> s.str.minhash_permuted(0, a=a, b=b, width=5) + 0 [1305480171, 462824409, 74608232] + 1 [32665388, 65330773, 97996158] + dtype: list + """ + a_column = column.as_column(a) + if a_column.dtype != np.uint32: + raise ValueError( + f"Expecting a Series with dtype uint32, got {type(a)}" + ) + b_column = column.as_column(b) + if b_column.dtype != np.uint32: + raise ValueError( + f"Expecting a Series with dtype uint32, got {type(b)}" + ) + return self._return_or_inplace( + libstrings.minhash_permuted( + self._column, seed, a_column, b_column, width + ) + ) + def minhash64( self, seeds: ColumnLike | None = None, width: int = 4 ) -> SeriesOrIndex: """ Compute the minhash of a strings column. + This uses the MurmurHash3_x64_128 algorithm for the hash function. This function generates 2 uint64 values but only the first uint64 value is used. 
@@ -5390,6 +5444,59 @@ def minhash64( libstrings.minhash64(self._column, seeds_column, width) ) + def minhash64_permuted( + self, seed: np.uint64, a: ColumnLike, b: ColumnLike, width: int + ) -> SeriesOrIndex: + """ + Compute the minhash of a strings column. + This uses the MurmurHash3_x64_128 algorithm for the hash function. + + Calculation uses the formula (hv * a + b) % mersenne_prime + where hv is the hash of a substring of width characters, + a and b are provided values and mersenne_prime is 2^61-1. + + Parameters + ---------- + seed : uint64 + The seed used for the hash algorithm. + a : ColumnLike + Values for minhash calculation. + Must be of type uint64. + b : ColumnLike + Values for minhash calculation. + Must be of type uint64. + width : int + The width of the substring to hash. + + Examples + -------- + >>> import cudf + >>> import numpy as np + >>> s = cudf.Series(['this is my', 'favorite book', 'to read']) + >>> a = cudf.Series([2, 3], dtype=np.uint64) + >>> b = cudf.Series([5, 6], dtype=np.uint64) + >>> s.str.minhash64_permuted(0, a=a, b=b, width=5) + 0 [172452388517576012, 316595762085180527] + 1 [71427536958126239, 58787297728258215] + 2 [423885828176437114, 1140588505926961370] + dtype: list + """ + a_column = column.as_column(a) + if a_column.dtype != np.uint64: + raise ValueError( + f"Expecting a Series with dtype uint64, got {type(a)}" + ) + b_column = column.as_column(b) + if b_column.dtype != np.uint64: + raise ValueError( + f"Expecting a Series with dtype uint64, got {type(b)}" + ) + return self._return_or_inplace( + libstrings.minhash64_permuted( + self._column, seed, a_column, b_column, width + ) + ) + def word_minhash(self, seeds: ColumnLike | None = None) -> SeriesOrIndex: """ Compute the minhash of a list column of strings. 
diff --git a/python/cudf/cudf/tests/text/test_text_methods.py b/python/cudf/cudf/tests/text/test_text_methods.py index 997ca357986..47e541fdcef 100644 --- a/python/cudf/cudf/tests/text/test_text_methods.py +++ b/python/cudf/cudf/tests/text/test_text_methods.py @@ -882,68 +882,48 @@ def test_is_vowel_consonant(): assert_eq(expected, actual) -def test_minhash(): +def test_minhash_permuted(): strings = cudf.Series(["this is my", "favorite book", None, ""]) + params = cudf.Series([1, 2, 3], dtype=np.uint32) expected = cudf.Series( [ - cudf.Series([21141582], dtype=np.uint32), - cudf.Series([962346254], dtype=np.uint32), - None, - cudf.Series([0], dtype=np.uint32), - ] - ) - actual = strings.str.minhash() - assert_eq(expected, actual) - seeds = cudf.Series([0, 1, 2], dtype=np.uint32) - expected = cudf.Series( - [ - cudf.Series([1305480167, 668155704, 34311509], dtype=np.uint32), - cudf.Series([32665384, 3470118, 363147162], dtype=np.uint32), + cudf.Series([1305480168, 462824406, 74608229], dtype=np.uint32), + cudf.Series([32665385, 65330770, 97996155], dtype=np.uint32), None, cudf.Series([0, 0, 0], dtype=np.uint32), ] ) - actual = strings.str.minhash(seeds=seeds, width=5) + actual = strings.str.minhash_permuted(0, a=params, b=params, width=5) assert_eq(expected, actual) - expected = cudf.Series( - [ - cudf.Series([3232308021562742685], dtype=np.uint64), - cudf.Series([23008204270530356], dtype=np.uint64), - None, - cudf.Series([0], dtype=np.uint64), - ] - ) - actual = strings.str.minhash64() - assert_eq(expected, actual) - seeds = cudf.Series([0, 1, 2], dtype=np.uint64) + params = cudf.Series([1, 2, 3], dtype=np.uint64) expected = cudf.Series( [ cudf.Series( - [7082801294247314046, 185949556058924788, 167570629329462454], + [105531920695060180, 172452388517576009, 316595762085180524], dtype=np.uint64, ), cudf.Series( - [382665377781028452, 86243762733551437, 7688750597953083512], + [35713768479063122, 71427536958126236, 58787297728258212], dtype=np.uint64, ), None, 
cudf.Series([0, 0, 0], dtype=np.uint64), ] ) - actual = strings.str.minhash64(seeds=seeds, width=5) + actual = strings.str.minhash64_permuted(0, a=params, b=params, width=5) assert_eq(expected, actual) # test wrong seed types with pytest.raises(ValueError): - strings.str.minhash(seeds="a") + strings.str.minhash_permuted(1, a="a", b="b", width=7) with pytest.raises(ValueError): - seeds = cudf.Series([0, 1, 2], dtype=np.int32) - strings.str.minhash(seeds=seeds) + params = cudf.Series([0, 1, 2], dtype=np.int32) + strings.str.minhash_permuted(1, a=params, b=params, width=6) with pytest.raises(ValueError): - seeds = cudf.Series([0, 1, 2], dtype=np.uint32) - strings.str.minhash64(seeds=seeds) + params = cudf.Series([0, 1, 2], dtype=np.uint32) + strings.str.minhash64_permuted(1, a=params, b=params, width=8) def test_word_minhash(): diff --git a/python/pylibcudf/pylibcudf/libcudf/nvtext/minhash.pxd b/python/pylibcudf/pylibcudf/libcudf/nvtext/minhash.pxd index 41250037dcf..ebf8eda1ce3 100644 --- a/python/pylibcudf/pylibcudf/libcudf/nvtext/minhash.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/nvtext/minhash.pxd @@ -22,6 +22,14 @@ cdef extern from "nvtext/minhash.hpp" namespace "nvtext" nogil: const size_type width, ) except + + cdef unique_ptr[column] minhash_permuted( + const column_view &strings, + const uint32_t seed, + const column_view &a, + const column_view &b, + const size_type width, + ) except + + cdef unique_ptr[column] minhash64( const column_view &strings, const column_view &seeds, @@ -34,6 +42,14 @@ cdef extern from "nvtext/minhash.hpp" namespace "nvtext" nogil: const size_type width, ) except + + cdef unique_ptr[column] minhash64_permuted( + const column_view &strings, + const uint64_t seed, + const column_view &a, + const column_view &b, + const size_type width, + ) except + + cdef unique_ptr[column] word_minhash( const column_view &input, const column_view &seeds diff --git a/python/pylibcudf/pylibcudf/nvtext/minhash.pxd 
b/python/pylibcudf/pylibcudf/nvtext/minhash.pxd index 97e8c9dc83c..6b544282f44 100644 --- a/python/pylibcudf/pylibcudf/nvtext/minhash.pxd +++ b/python/pylibcudf/pylibcudf/nvtext/minhash.pxd @@ -11,8 +11,24 @@ ctypedef fused ColumnOrScalar: cpdef Column minhash(Column input, ColumnOrScalar seeds, size_type width=*) +cpdef Column minhash_permuted( + Column input, + uint32_t seed, + Column a, + Column b, + size_type width +) + cpdef Column minhash64(Column input, ColumnOrScalar seeds, size_type width=*) +cpdef Column minhash64_permuted( + Column input, + uint64_t seed, + Column a, + Column b, + size_type width +) + cpdef Column word_minhash(Column input, Column seeds) cpdef Column word_minhash64(Column input, Column seeds) diff --git a/python/pylibcudf/pylibcudf/nvtext/minhash.pyx b/python/pylibcudf/pylibcudf/nvtext/minhash.pyx index f1e012e60e5..5a51e32b287 100644 --- a/python/pylibcudf/pylibcudf/nvtext/minhash.pyx +++ b/python/pylibcudf/pylibcudf/nvtext/minhash.pyx @@ -8,6 +8,8 @@ from pylibcudf.libcudf.column.column cimport column from pylibcudf.libcudf.nvtext.minhash cimport ( minhash as cpp_minhash, minhash64 as cpp_minhash64, + minhash64_permuted as cpp_minhash64_permuted, + minhash_permuted as cpp_minhash_permuted, word_minhash as cpp_word_minhash, word_minhash64 as cpp_word_minhash64, ) @@ -16,6 +18,7 @@ from pylibcudf.libcudf.types cimport size_type from pylibcudf.scalar cimport Scalar from cython.operator import dereference +import warnings cpdef Column minhash(Column input, ColumnOrScalar seeds, size_type width=4): @@ -40,6 +43,12 @@ cpdef Column minhash(Column input, ColumnOrScalar seeds, size_type width=4): Column List column of minhash values for each string per seed """ + warnings.warn( + "Starting in version 25.02, the signature of this function will " + "be changed to match pylibcudf.nvtext.minhash_permuted.", + FutureWarning + ) + cdef unique_ptr[column] c_result if not isinstance(seeds, (Column, Scalar)): @@ -55,6 +64,50 @@ cpdef Column 
minhash(Column input, ColumnOrScalar seeds, size_type width=4): return Column.from_libcudf(move(c_result)) +cpdef Column minhash_permuted( + Column input, + uint32_t seed, + Column a, + Column b, + size_type width +): + """ + Returns the minhash values for each string. + This function uses MurmurHash3_x86_32 for the hash algorithm. + + For details, see :cpp:func:`minhash_permuted`. + + Parameters + ---------- + input : Column + Strings column to compute minhash + seed : uint32_t + Seed used for the hash function + a : Column + 1st parameter value used for the minhash algorithm. + b : Column + 2nd parameter value used for the minhash algorithm. + width : size_type + Character width used for apply substrings; + + Returns + ------- + Column + List column of minhash values for each string per seed + """ + cdef unique_ptr[column] c_result + + with nogil: + c_result = cpp_minhash_permuted( + input.view(), + seed, + a.view(), + b.view(), + width + ) + + return Column.from_libcudf(move(c_result)) + cpdef Column minhash64(Column input, ColumnOrScalar seeds, size_type width=4): """ Returns the minhash values for each string per seed. @@ -77,6 +130,12 @@ cpdef Column minhash64(Column input, ColumnOrScalar seeds, size_type width=4): Column List column of minhash values for each string per seed """ + warnings.warn( + "Starting in version 25.02, the signature of this function will " + "be changed to match pylibcudf.nvtext.minhash64_permuted.", + FutureWarning + ) + cdef unique_ptr[column] c_result if not isinstance(seeds, (Column, Scalar)): @@ -92,6 +151,50 @@ cpdef Column minhash64(Column input, ColumnOrScalar seeds, size_type width=4): return Column.from_libcudf(move(c_result)) +cpdef Column minhash64_permuted( + Column input, + uint64_t seed, + Column a, + Column b, + size_type width +): + """ + Returns the minhash values for each string. + This function uses MurmurHash3_x64_128 for the hash algorithm. + + For details, see :cpp:func:`minhash64_permuted`. 
+ + Parameters + ---------- + input : Column + Strings column to compute minhash + seed : uint64_t + Seed used for the hash function + a : Column + 1st parameter value used for the minhash algorithm. + b : Column + 2nd parameter value used for the minhash algorithm. + width : size_type + Character width used for apply substrings; + + Returns + ------- + Column + List column of minhash values for each string per seed + """ + cdef unique_ptr[column] c_result + + with nogil: + c_result = cpp_minhash64_permuted( + input.view(), + seed, + a.view(), + b.view(), + width + ) + + return Column.from_libcudf(move(c_result)) + cpdef Column word_minhash(Column input, Column seeds): """ Returns the minhash values for each row of strings per seed. diff --git a/python/pylibcudf/pylibcudf/tests/test_nvtext_minhash.py b/python/pylibcudf/pylibcudf/tests/test_nvtext_minhash.py index ead9ee094af..ec533e64307 100644 --- a/python/pylibcudf/pylibcudf/tests/test_nvtext_minhash.py +++ b/python/pylibcudf/pylibcudf/tests/test_nvtext_minhash.py @@ -21,15 +21,19 @@ def word_minhash_input_data(request): @pytest.mark.parametrize("width", [5, 12]) -def test_minhash(minhash_input_data, width): +def test_minhash_permuted(minhash_input_data, width): input_arr, seeds, seed_type = minhash_input_data minhash_func = ( - plc.nvtext.minhash.minhash + plc.nvtext.minhash.minhash_permuted if seed_type == pa.uint32() - else plc.nvtext.minhash.minhash64 + else plc.nvtext.minhash.minhash64_permuted ) result = minhash_func( - plc.interop.from_arrow(input_arr), plc.interop.from_arrow(seeds), width + plc.interop.from_arrow(input_arr), + 0, + plc.interop.from_arrow(seeds), + plc.interop.from_arrow(seeds), + width, ) pa_result = plc.interop.to_arrow(result) assert all(len(got) == len(seeds) for got, s in zip(pa_result, input_arr)) From 7682edbfd418cf30c0f5494dbed36a5dbb102c06 Mon Sep 17 00:00:00 2001 From: Lawrence Mitchell Date: Tue, 12 Nov 2024 15:57:36 +0000 Subject: [PATCH 15/19] Add type stubs for pylibcudf 
(#17258) Having looked at a bunch of the automation options, I just did it by hand. A followup will add some automation to add docstrings (so we can see those via LSP integration in editors) and do some simple validation. - Closes #15190 Authors: - Lawrence Mitchell (https://github.com/wence-) - Vyas Ramasubramani (https://github.com/vyasr) Approvers: - Vyas Ramasubramani (https://github.com/vyasr) - Matthew Murray (https://github.com/Matt711) URL: https://github.com/rapidsai/cudf/pull/17258 --- docs/cudf/source/conf.py | 73 ++++++- docs/cudf/source/developer_guide/pylibcudf.md | 73 ++++++- python/cudf/cudf/_lib/labeling.pyx | 4 +- python/cudf/cudf/_lib/lists.pyx | 24 +-- .../cudf_polars/containers/dataframe.py | 2 +- .../cudf_polars/dsl/expressions/datetime.py | 4 +- .../cudf_polars/dsl/expressions/literal.py | 2 +- python/cudf_polars/cudf_polars/dsl/ir.py | 2 +- python/pylibcudf/pylibcudf/aggregation.pyi | 110 +++++++++++ python/pylibcudf/pylibcudf/aggregation.pyx | 34 ++++ python/pylibcudf/pylibcudf/binaryop.pyi | 54 +++++ python/pylibcudf/pylibcudf/binaryop.pyx | 1 + python/pylibcudf/pylibcudf/column.pyi | 48 +++++ python/pylibcudf/pylibcudf/column.pyx | 5 + .../pylibcudf/pylibcudf/column_factories.pyi | 20 ++ .../pylibcudf/pylibcudf/column_factories.pyx | 9 + python/pylibcudf/pylibcudf/concatenate.pyi | 8 + python/pylibcudf/pylibcudf/concatenate.pyx | 1 + .../pylibcudf/pylibcudf/contiguous_split.pyi | 14 ++ .../pylibcudf/pylibcudf/contiguous_split.pyx | 11 ++ python/pylibcudf/pylibcudf/copying.pyi | 54 +++++ python/pylibcudf/pylibcudf/copying.pyx | 17 ++ python/pylibcudf/pylibcudf/datetime.pyi | 45 +++++ python/pylibcudf/pylibcudf/datetime.pyx | 18 ++ python/pylibcudf/pylibcudf/experimental.pyi | 5 + python/pylibcudf/pylibcudf/experimental.pyx | 2 + python/pylibcudf/pylibcudf/expressions.pyi | 79 ++++++++ python/pylibcudf/pylibcudf/expressions.pyx | 12 +- python/pylibcudf/pylibcudf/filling.pyi | 17 ++ python/pylibcudf/pylibcudf/filling.pyx | 8 + 
python/pylibcudf/pylibcudf/gpumemoryview.pyi | 9 + python/pylibcudf/pylibcudf/gpumemoryview.pyx | 3 + python/pylibcudf/pylibcudf/groupby.pyi | 38 ++++ python/pylibcudf/pylibcudf/groupby.pyx | 6 + python/pylibcudf/pylibcudf/hashing.pyi | 18 ++ python/pylibcudf/pylibcudf/hashing.pyx | 13 ++ python/pylibcudf/pylibcudf/interop.pyi | 52 +++++ python/pylibcudf/pylibcudf/interop.pyx | 8 + python/pylibcudf/pylibcudf/io/__init__.py | 16 ++ python/pylibcudf/pylibcudf/io/avro.pyi | 11 ++ python/pylibcudf/pylibcudf/io/avro.pyx | 2 + python/pylibcudf/pylibcudf/io/csv.pyi | 54 +++++ python/pylibcudf/pylibcudf/io/csv.pyx | 2 + python/pylibcudf/pylibcudf/io/datasource.pyi | 4 + python/pylibcudf/pylibcudf/io/datasource.pyx | 2 + python/pylibcudf/pylibcudf/io/json.pyi | 50 +++++ python/pylibcudf/pylibcudf/io/json.pyx | 1 + python/pylibcudf/pylibcudf/io/orc.pyi | 41 ++++ python/pylibcudf/pylibcudf/io/orc.pyx | 10 + python/pylibcudf/pylibcudf/io/parquet.pyi | 36 ++++ python/pylibcudf/pylibcudf/io/parquet.pyx | 4 + .../pylibcudf/io/parquet_metadata.pyx | 9 +- python/pylibcudf/pylibcudf/io/text.pyx | 9 + python/pylibcudf/pylibcudf/io/timezone.pyi | 7 + python/pylibcudf/pylibcudf/io/timezone.pyx | 1 + python/pylibcudf/pylibcudf/io/types.pyi | 97 +++++++++ python/pylibcudf/pylibcudf/io/types.pyx | 18 ++ python/pylibcudf/pylibcudf/join.pyi | 78 ++++++++ python/pylibcudf/pylibcudf/join.pyx | 18 ++ python/pylibcudf/pylibcudf/json.pyi | 23 +++ python/pylibcudf/pylibcudf/json.pyx | 3 + python/pylibcudf/pylibcudf/labeling.pxd | 4 +- python/pylibcudf/pylibcudf/labeling.pyi | 17 ++ python/pylibcudf/pylibcudf/labeling.pyx | 24 +-- .../pylibcudf/libcudf/CMakeLists.txt | 1 + .../pylibcudf/libcudf/lists/CMakeLists.txt | 23 +++ .../pylibcudf/libcudf/lists/combine.pxd | 8 +- .../pylibcudf/libcudf/lists/combine.pyx | 0 .../pylibcudf/libcudf/lists/contains.pyx | 0 python/pylibcudf/pylibcudf/lists.pxd | 30 ++- python/pylibcudf/pylibcudf/lists.pyi | 70 +++++++ python/pylibcudf/pylibcudf/lists.pyx | 185 
++++++++---------- python/pylibcudf/pylibcudf/merge.pyi | 11 ++ python/pylibcudf/pylibcudf/merge.pyx | 1 + python/pylibcudf/pylibcudf/null_mask.pyi | 14 ++ python/pylibcudf/pylibcudf/null_mask.pyx | 7 + .../pylibcudf/nvtext/byte_pair_encode.pyi | 11 ++ .../pylibcudf/nvtext/byte_pair_encode.pyx | 3 + .../pylibcudf/nvtext/edit_distance.pyi | 6 + .../pylibcudf/nvtext/edit_distance.pyx | 1 + .../pylibcudf/nvtext/generate_ngrams.pyi | 10 + .../pylibcudf/nvtext/generate_ngrams.pyx | 5 + python/pylibcudf/pylibcudf/nvtext/jaccard.pyi | 5 + python/pylibcudf/pylibcudf/nvtext/jaccard.pyx | 1 + python/pylibcudf/pylibcudf/nvtext/minhash.pyi | 13 ++ python/pylibcudf/pylibcudf/nvtext/minhash.pyx | 6 + .../pylibcudf/nvtext/ngrams_tokenize.pyi | 8 + .../pylibcudf/nvtext/ngrams_tokenize.pyx | 1 + .../pylibcudf/pylibcudf/nvtext/normalize.pyi | 6 + .../pylibcudf/pylibcudf/nvtext/normalize.pyx | 1 + python/pylibcudf/pylibcudf/nvtext/replace.pyi | 17 ++ python/pylibcudf/pylibcudf/nvtext/replace.pyx | 1 + python/pylibcudf/pylibcudf/nvtext/stemmer.pyi | 8 + python/pylibcudf/pylibcudf/nvtext/stemmer.pyx | 1 + .../pylibcudf/nvtext/subword_tokenize.pyi | 15 ++ .../pylibcudf/nvtext/subword_tokenize.pyx | 3 + .../pylibcudf/pylibcudf/nvtext/tokenize.pyi | 26 +++ .../pylibcudf/pylibcudf/nvtext/tokenize.pyx | 12 ++ python/pylibcudf/pylibcudf/partitioning.pyi | 14 ++ python/pylibcudf/pylibcudf/partitioning.pyx | 5 + python/pylibcudf/pylibcudf/py.typed | 0 python/pylibcudf/pylibcudf/quantiles.pyi | 23 +++ python/pylibcudf/pylibcudf/quantiles.pyx | 1 + python/pylibcudf/pylibcudf/reduce.pyi | 16 ++ python/pylibcudf/pylibcudf/reduce.pyx | 1 + python/pylibcudf/pylibcudf/replace.pyi | 29 +++ python/pylibcudf/pylibcudf/replace.pyx | 8 + python/pylibcudf/pylibcudf/reshape.pyi | 7 + python/pylibcudf/pylibcudf/reshape.pyx | 1 + python/pylibcudf/pylibcudf/rolling.pyi | 12 ++ python/pylibcudf/pylibcudf/rolling.pyx | 1 + python/pylibcudf/pylibcudf/round.pyi | 15 ++ python/pylibcudf/pylibcudf/round.pyx | 1 + 
python/pylibcudf/pylibcudf/scalar.pyi | 10 + python/pylibcudf/pylibcudf/scalar.pyx | 4 + python/pylibcudf/pylibcudf/search.pyi | 19 ++ python/pylibcudf/pylibcudf/search.pyx | 1 + python/pylibcudf/pylibcudf/sorting.pyi | 64 ++++++ python/pylibcudf/pylibcudf/sorting.pyx | 12 ++ .../pylibcudf/pylibcudf/stream_compaction.pxd | 2 + .../pylibcudf/pylibcudf/stream_compaction.pyi | 53 +++++ .../pylibcudf/pylibcudf/stream_compaction.pyx | 12 ++ .../pylibcudf/pylibcudf/strings/__init__.py | 4 +- .../pylibcudf/strings/attributes.pyi | 7 + .../pylibcudf/strings/attributes.pyx | 1 + .../pylibcudf/strings/capitalize.pyi | 12 ++ .../pylibcudf/strings/capitalize.pyx | 1 + python/pylibcudf/pylibcudf/strings/case.pyi | 7 + python/pylibcudf/pylibcudf/strings/case.pyx | 1 + .../pylibcudf/strings/char_types.pyi | 30 +++ .../pylibcudf/strings/char_types.pyx | 5 + .../pylibcudf/pylibcudf/strings/combine.pyi | 34 ++++ .../pylibcudf/pylibcudf/strings/combine.pyx | 7 + .../pylibcudf/pylibcudf/strings/contains.pyi | 14 ++ .../pylibcudf/pylibcudf/strings/contains.pyx | 1 + .../pylibcudf/strings/convert/__init__.py | 12 ++ .../strings/convert/convert_booleans.pyi | 9 + .../strings/convert/convert_booleans.pyx | 1 + .../strings/convert/convert_datetime.pyi | 12 ++ .../strings/convert/convert_datetime.pyx | 1 + .../strings/convert/convert_durations.pyi | 9 + .../strings/convert/convert_durations.pyx | 1 + .../strings/convert/convert_fixed_point.pyi | 10 + .../strings/convert/convert_fixed_point.pyx | 2 + .../strings/convert/convert_floats.pyi | 8 + .../strings/convert/convert_floats.pyx | 1 + .../strings/convert/convert_integers.pyi | 11 ++ .../strings/convert/convert_integers.pyx | 8 + .../strings/convert/convert_ipv4.pyi | 7 + .../strings/convert/convert_ipv4.pyx | 1 + .../strings/convert/convert_lists.pyi | 10 + .../strings/convert/convert_lists.pyx | 1 + .../strings/convert/convert_urls.pyi | 6 + .../strings/convert/convert_urls.pyx | 1 + .../pylibcudf/pylibcudf/strings/extract.pyi | 8 + 
.../pylibcudf/pylibcudf/strings/extract.pyx | 1 + python/pylibcudf/pylibcudf/strings/find.pyi | 14 ++ python/pylibcudf/pylibcudf/strings/find.pyx | 1 + .../pylibcudf/strings/find_multiple.pyi | 5 + .../pylibcudf/strings/find_multiple.pyx | 1 + .../pylibcudf/pylibcudf/strings/findall.pyi | 7 + .../pylibcudf/pylibcudf/strings/findall.pyx | 1 + .../pylibcudf/pylibcudf/strings/padding.pyi | 9 + .../pylibcudf/pylibcudf/strings/padding.pyx | 1 + .../pylibcudf/strings/regex_flags.pyi | 7 + .../pylibcudf/strings/regex_flags.pyx | 2 + .../pylibcudf/strings/regex_program.pyi | 8 + .../pylibcudf/strings/regex_program.pyx | 3 + python/pylibcudf/pylibcudf/strings/repeat.pyi | 5 + python/pylibcudf/pylibcudf/strings/repeat.pyx | 1 + .../pylibcudf/pylibcudf/strings/replace.pyi | 14 ++ .../pylibcudf/pylibcudf/strings/replace.pyx | 1 + .../pylibcudf/strings/replace_re.pyi | 27 +++ .../pylibcudf/strings/replace_re.pyx | 1 + .../pylibcudf/pylibcudf/strings/side_type.pyi | 7 + .../pylibcudf/pylibcudf/strings/side_type.pyx | 2 + python/pylibcudf/pylibcudf/strings/slice.pyi | 11 ++ python/pylibcudf/pylibcudf/strings/slice.pyx | 1 + .../pylibcudf/strings/split/__init__.py | 2 + .../pylibcudf/strings/split/partition.pyi | 8 + .../pylibcudf/strings/split/partition.pyx | 1 + .../pylibcudf/strings/split/split.pyi | 27 +++ .../pylibcudf/strings/split/split.pyx | 10 + python/pylibcudf/pylibcudf/strings/strip.pyi | 11 ++ python/pylibcudf/pylibcudf/strings/strip.pyx | 1 + .../pylibcudf/pylibcudf/strings/translate.pyi | 20 ++ .../pylibcudf/pylibcudf/strings/translate.pyx | 1 + python/pylibcudf/pylibcudf/strings/wrap.pyi | 5 + python/pylibcudf/pylibcudf/strings/wrap.pyx | 1 + python/pylibcudf/pylibcudf/table.pyi | 9 + python/pylibcudf/pylibcudf/table.pyx | 3 + .../pylibcudf/tests/test_binaryops.py | 14 -- .../pylibcudf/tests/test_labeling.py | 8 +- .../pylibcudf/pylibcudf/tests/test_lists.py | 83 ++++---- .../pylibcudf/tests/test_string_attributes.py | 2 +- python/pylibcudf/pylibcudf/traits.pyi | 
23 +++ python/pylibcudf/pylibcudf/traits.pyx | 21 ++ python/pylibcudf/pylibcudf/transform.pyi | 16 ++ python/pylibcudf/pylibcudf/transform.pyx | 9 + python/pylibcudf/pylibcudf/transpose.pyi | 4 + python/pylibcudf/pylibcudf/transpose.pyx | 1 + python/pylibcudf/pylibcudf/types.pyi | 86 ++++++++ python/pylibcudf/pylibcudf/types.pyx | 16 ++ python/pylibcudf/pylibcudf/unary.pyi | 38 ++++ python/pylibcudf/pylibcudf/unary.pyx | 10 + python/pylibcudf/pyproject.toml | 23 ++- 206 files changed, 2863 insertions(+), 228 deletions(-) create mode 100644 python/pylibcudf/pylibcudf/aggregation.pyi create mode 100644 python/pylibcudf/pylibcudf/binaryop.pyi create mode 100644 python/pylibcudf/pylibcudf/column.pyi create mode 100644 python/pylibcudf/pylibcudf/column_factories.pyi create mode 100644 python/pylibcudf/pylibcudf/concatenate.pyi create mode 100644 python/pylibcudf/pylibcudf/contiguous_split.pyi create mode 100644 python/pylibcudf/pylibcudf/copying.pyi create mode 100644 python/pylibcudf/pylibcudf/datetime.pyi create mode 100644 python/pylibcudf/pylibcudf/experimental.pyi create mode 100644 python/pylibcudf/pylibcudf/expressions.pyi create mode 100644 python/pylibcudf/pylibcudf/filling.pyi create mode 100644 python/pylibcudf/pylibcudf/gpumemoryview.pyi create mode 100644 python/pylibcudf/pylibcudf/groupby.pyi create mode 100644 python/pylibcudf/pylibcudf/hashing.pyi create mode 100644 python/pylibcudf/pylibcudf/interop.pyi create mode 100644 python/pylibcudf/pylibcudf/io/avro.pyi create mode 100644 python/pylibcudf/pylibcudf/io/csv.pyi create mode 100644 python/pylibcudf/pylibcudf/io/datasource.pyi create mode 100644 python/pylibcudf/pylibcudf/io/json.pyi create mode 100644 python/pylibcudf/pylibcudf/io/orc.pyi create mode 100644 python/pylibcudf/pylibcudf/io/parquet.pyi create mode 100644 python/pylibcudf/pylibcudf/io/timezone.pyi create mode 100644 python/pylibcudf/pylibcudf/io/types.pyi create mode 100644 python/pylibcudf/pylibcudf/join.pyi create mode 100644 
python/pylibcudf/pylibcudf/json.pyi create mode 100644 python/pylibcudf/pylibcudf/labeling.pyi create mode 100644 python/pylibcudf/pylibcudf/libcudf/lists/CMakeLists.txt create mode 100644 python/pylibcudf/pylibcudf/libcudf/lists/combine.pyx create mode 100644 python/pylibcudf/pylibcudf/libcudf/lists/contains.pyx create mode 100644 python/pylibcudf/pylibcudf/lists.pyi create mode 100644 python/pylibcudf/pylibcudf/merge.pyi create mode 100644 python/pylibcudf/pylibcudf/null_mask.pyi create mode 100644 python/pylibcudf/pylibcudf/nvtext/byte_pair_encode.pyi create mode 100644 python/pylibcudf/pylibcudf/nvtext/edit_distance.pyi create mode 100644 python/pylibcudf/pylibcudf/nvtext/generate_ngrams.pyi create mode 100644 python/pylibcudf/pylibcudf/nvtext/jaccard.pyi create mode 100644 python/pylibcudf/pylibcudf/nvtext/minhash.pyi create mode 100644 python/pylibcudf/pylibcudf/nvtext/ngrams_tokenize.pyi create mode 100644 python/pylibcudf/pylibcudf/nvtext/normalize.pyi create mode 100644 python/pylibcudf/pylibcudf/nvtext/replace.pyi create mode 100644 python/pylibcudf/pylibcudf/nvtext/stemmer.pyi create mode 100644 python/pylibcudf/pylibcudf/nvtext/subword_tokenize.pyi create mode 100644 python/pylibcudf/pylibcudf/nvtext/tokenize.pyi create mode 100644 python/pylibcudf/pylibcudf/partitioning.pyi create mode 100644 python/pylibcudf/pylibcudf/py.typed create mode 100644 python/pylibcudf/pylibcudf/quantiles.pyi create mode 100644 python/pylibcudf/pylibcudf/reduce.pyi create mode 100644 python/pylibcudf/pylibcudf/replace.pyi create mode 100644 python/pylibcudf/pylibcudf/reshape.pyi create mode 100644 python/pylibcudf/pylibcudf/rolling.pyi create mode 100644 python/pylibcudf/pylibcudf/round.pyi create mode 100644 python/pylibcudf/pylibcudf/scalar.pyi create mode 100644 python/pylibcudf/pylibcudf/search.pyi create mode 100644 python/pylibcudf/pylibcudf/sorting.pyi create mode 100644 python/pylibcudf/pylibcudf/stream_compaction.pyi create mode 100644 
python/pylibcudf/pylibcudf/strings/attributes.pyi create mode 100644 python/pylibcudf/pylibcudf/strings/capitalize.pyi create mode 100644 python/pylibcudf/pylibcudf/strings/case.pyi create mode 100644 python/pylibcudf/pylibcudf/strings/char_types.pyi create mode 100644 python/pylibcudf/pylibcudf/strings/combine.pyi create mode 100644 python/pylibcudf/pylibcudf/strings/contains.pyi create mode 100644 python/pylibcudf/pylibcudf/strings/convert/convert_booleans.pyi create mode 100644 python/pylibcudf/pylibcudf/strings/convert/convert_datetime.pyi create mode 100644 python/pylibcudf/pylibcudf/strings/convert/convert_durations.pyi create mode 100644 python/pylibcudf/pylibcudf/strings/convert/convert_fixed_point.pyi create mode 100644 python/pylibcudf/pylibcudf/strings/convert/convert_floats.pyi create mode 100644 python/pylibcudf/pylibcudf/strings/convert/convert_integers.pyi create mode 100644 python/pylibcudf/pylibcudf/strings/convert/convert_ipv4.pyi create mode 100644 python/pylibcudf/pylibcudf/strings/convert/convert_lists.pyi create mode 100644 python/pylibcudf/pylibcudf/strings/convert/convert_urls.pyi create mode 100644 python/pylibcudf/pylibcudf/strings/extract.pyi create mode 100644 python/pylibcudf/pylibcudf/strings/find.pyi create mode 100644 python/pylibcudf/pylibcudf/strings/find_multiple.pyi create mode 100644 python/pylibcudf/pylibcudf/strings/findall.pyi create mode 100644 python/pylibcudf/pylibcudf/strings/padding.pyi create mode 100644 python/pylibcudf/pylibcudf/strings/regex_flags.pyi create mode 100644 python/pylibcudf/pylibcudf/strings/regex_program.pyi create mode 100644 python/pylibcudf/pylibcudf/strings/repeat.pyi create mode 100644 python/pylibcudf/pylibcudf/strings/replace.pyi create mode 100644 python/pylibcudf/pylibcudf/strings/replace_re.pyi create mode 100644 python/pylibcudf/pylibcudf/strings/side_type.pyi create mode 100644 python/pylibcudf/pylibcudf/strings/slice.pyi create mode 100644 
python/pylibcudf/pylibcudf/strings/split/partition.pyi create mode 100644 python/pylibcudf/pylibcudf/strings/split/split.pyi create mode 100644 python/pylibcudf/pylibcudf/strings/strip.pyi create mode 100644 python/pylibcudf/pylibcudf/strings/translate.pyi create mode 100644 python/pylibcudf/pylibcudf/strings/wrap.pyi create mode 100644 python/pylibcudf/pylibcudf/table.pyi create mode 100644 python/pylibcudf/pylibcudf/traits.pyi create mode 100644 python/pylibcudf/pylibcudf/transform.pyi create mode 100644 python/pylibcudf/pylibcudf/transpose.pyi create mode 100644 python/pylibcudf/pylibcudf/types.pyi create mode 100644 python/pylibcudf/pylibcudf/unary.pyi diff --git a/docs/cudf/source/conf.py b/docs/cudf/source/conf.py index 0d463b918d3..fbb9ca4b128 100644 --- a/docs/cudf/source/conf.py +++ b/docs/cudf/source/conf.py @@ -26,16 +26,18 @@ import tempfile import warnings import xml.etree.ElementTree as ET +from enum import IntEnum +from typing import Any +import cudf from docutils.nodes import Text from packaging.version import Version -from sphinx.addnodes import pending_xref -from sphinx.highlighting import lexers -from sphinx.ext import intersphinx from pygments.lexer import RegexLexer from pygments.token import Text as PText - -import cudf +from sphinx.addnodes import pending_xref +from sphinx.ext import intersphinx +from sphinx.ext.autodoc import ClassDocumenter, bool_option +from sphinx.highlighting import lexers class PseudoLexer(RegexLexer): @@ -342,7 +344,10 @@ def clean_all_xml_files(path): "cudf.Series": ("cudf.core.series.Series", "cudf.Series"), "cudf.Index": ("cudf.core.index.Index", "cudf.Index"), "cupy.core.core.ndarray": ("cupy.ndarray", "cupy.ndarray"), - "DeviceBuffer": ("rmm.pylibrmm.device_buffer.DeviceBuffer", "rmm.DeviceBuffer"), + "DeviceBuffer": ( + "rmm.pylibrmm.device_buffer.DeviceBuffer", + "rmm.DeviceBuffer", + ), } @@ -373,7 +378,14 @@ def _generate_namespaces(namespaces): _all_namespaces = _generate_namespaces( { # Note that 
io::datasource is actually a nested class - "cudf": {"io", "io::datasource", "strings", "ast", "ast::expression", "io::text"}, + "cudf": { + "io", + "io::datasource", + "strings", + "ast", + "ast::expression", + "io::text", + }, "numeric": {}, "nvtext": {}, } @@ -642,9 +654,54 @@ def linkcode_resolve(domain, info) -> str | None: f"branch-{version}/python/cudf/cudf/{fn}{linespec}" ) + # Needed for avoid build warning for PandasCompat extension suppress_warnings = ["myst.domains"] + +class PLCIntEnumDocumenter(ClassDocumenter): + objtype = "enum" + directivetype = "attribute" + priority = 10 + ClassDocumenter.priority + + option_spec = dict(ClassDocumenter.option_spec) + + @classmethod + def can_document_member( + cls, member: Any, membername: str, isattr: bool, parent: Any + ) -> bool: + try: + return issubclass( + member, IntEnum + ) and member.__module__.startswith("pylibcudf") + except TypeError: + return False + + def add_directive_header(self, sig: str) -> None: + self.directivetype = "attribute" + super().add_directive_header(sig) + + def add_content(self, more_content) -> None: + doc_as_attr = self.doc_as_attr + self.doc_as_attr = False + super().add_content(more_content) + self.doc_as_attr = doc_as_attr + source_name = self.get_sourcename() + enum_object: IntEnum = self.object + + if self.object.__name__ != "Kind": + self.add_line(f"See also :cpp:enum:`cudf::{self.object.__name__}`.", source_name) + self.add_line("", source_name) + self.add_line("Enum members", source_name) + self.add_line("", source_name) + + for the_member_name in enum_object.__members__: # type: ignore[attr-defined] + self.add_line( + f"* ``{the_member_name}``", source_name + ) + self.add_line("", source_name) + + def setup(app): app.add_css_file("https://docs.rapids.ai/assets/css/custom.css") app.add_js_file( @@ -652,3 +709,5 @@ def setup(app): ) app.connect("doctree-read", resolve_aliases) app.connect("missing-reference", on_missing_reference) + app.setup_extension("sphinx.ext.autodoc") 
+ app.add_autodocumenter(PLCIntEnumDocumenter) diff --git a/docs/cudf/source/developer_guide/pylibcudf.md b/docs/cudf/source/developer_guide/pylibcudf.md index 39840e72e21..1ee828e7c4e 100644 --- a/docs/cudf/source/developer_guide/pylibcudf.md +++ b/docs/cudf/source/developer_guide/pylibcudf.md @@ -15,7 +15,8 @@ To satisfy the goals of pylibcudf, we impose the following set of design princip - All typing in code should be written using Cython syntax, not PEP 484 Python typing syntax. Not only does this ensure compatibility with Cython < 3, but even with Cython 3 PEP 484 support remains incomplete as of this writing. - All cudf code should interact only with pylibcudf, never with libcudf directly. This is not currently the case, but is the direction that the library is moving towards. - Ideally, pylibcudf should depend on no RAPIDS component other than rmm, and should in general have minimal runtime dependencies. - +- Type stubs are provided and generated manually. When adding new + functionality, ensure that the matching type stub is appropriately updated. ## Relationship to libcudf @@ -249,3 +250,73 @@ In the event that libcudf provides multiple overloads for the same function with and set arguments not shared between overloads to `None`. If a user tries to pass in an unsupported argument for a specific overload type, you should raise `ValueError`. Finally, consider making an libcudf issue if you think this inconsistency can be addressed on the libcudf side. + +### Type stubs + +Since static type checkers like `mypy` and `pyright` cannot parse +Cython code, we provide type stubs for the pylibcudf package. These +are currently maintained manually, alongside the matching pylibcudf +files. + +Every `pyx` file should have a matching `pyi` file that provides the +type stubs. Most functions can be exposed straightforwardly. Some +guiding principles: + +- For typed integer arguments in libcudf, use `int` as a type + annotation. 
+- For functions which are annotated as a `list` in Cython, but the + function body does more detailed checking, try to encode the + detailed information in the type. +- For Cython fused types there are two options: + 1. If the fused type appears only once in the function signature, + use a `Union` type; + 2. If the fused type appears more than once (or as both an input + and output type), use a `TypeVar` with + the variants in the fused type provided as constraints. + + +As an example, `pylibcudf.copying.split` is typed in Cython as: + +```cython +ctypedef fused ColumnOrTable: + Table + Column + +cpdef list split(ColumnOrTable input, list splits): ... +``` + +Here the fused type appears only once in the signature, and the `list` +types do not specify their element types. However, if we provide a `Column` +as input, we receive a `list[Column]` as output, and if we provide a +`Table` we receive `list[Table]` as output. + +In the type stub, we can encode this with a `TypeVar`. We can also +provide typing for the `splits` argument that indicates that the split +values must be integers: + +```python +ColumnOrTable = TypeVar("ColumnOrTable", Column, Table) + +def split(input: ColumnOrTable, splits: list[int]) -> list[ColumnOrTable]: ... +``` + +Conversely, `pylibcudf.copying.scatter` uses a fused type only once in +its input: + +```cython +ctypedef fused TableOrListOfScalars: + Table + list + +cpdef Table scatter( + TableOrListOfScalars source, Column scatter_map, Table target +) +``` + +In the type stub, we can use a normal union in this case: + +```python +def scatter( + source: Table | list[Scalar], scatter_map: Column, target: Table +) -> Table: ...
+``` diff --git a/python/cudf/cudf/_lib/labeling.pyx b/python/cudf/cudf/_lib/labeling.pyx index 3966cce8981..524bfd3b2e8 100644 --- a/python/cudf/cudf/_lib/labeling.pyx +++ b/python/cudf/cudf/_lib/labeling.pyx @@ -17,8 +17,8 @@ def label_bins(Column input, Column left_edges, cbool left_inclusive, plc_column = plc.labeling.label_bins( input.to_pylibcudf(mode="read"), left_edges.to_pylibcudf(mode="read"), - left_inclusive, + plc.labeling.Inclusive.YES if left_inclusive else plc.labeling.Inclusive.NO, right_edges.to_pylibcudf(mode="read"), - right_inclusive + plc.labeling.Inclusive.YES if right_inclusive else plc.labeling.Inclusive.NO, ) return Column.from_pylibcudf(plc_column) diff --git a/python/cudf/cudf/_lib/lists.pyx b/python/cudf/cudf/_lib/lists.pyx index a91d44274e5..9a2aa4a6130 100644 --- a/python/cudf/cudf/_lib/lists.pyx +++ b/python/cudf/cudf/_lib/lists.pyx @@ -4,7 +4,9 @@ from cudf.core.buffer import acquire_spill_lock from libcpp cimport bool -from pylibcudf.libcudf.types cimport size_type +from pylibcudf.libcudf.types cimport ( + nan_equality, null_equality, null_order, order, size_type +) from cudf._lib.column cimport Column from cudf._lib.utils cimport columns_from_pylibcudf_table @@ -37,8 +39,8 @@ def distinct(Column col, bool nulls_equal, bool nans_all_equal): return Column.from_pylibcudf( plc.lists.distinct( col.to_pylibcudf(mode="read"), - nulls_equal, - nans_all_equal, + null_equality.EQUAL if nulls_equal else null_equality.UNEQUAL, + nan_equality.ALL_EQUAL if nans_all_equal else nan_equality.UNEQUAL, ) ) @@ -48,12 +50,8 @@ def sort_lists(Column col, bool ascending, str na_position): return Column.from_pylibcudf( plc.lists.sort_lists( col.to_pylibcudf(mode="read"), - ascending, - ( - plc.types.NullOrder.BEFORE - if na_position == "first" - else plc.types.NullOrder.AFTER - ), + order.ASCENDING if ascending else order.DESCENDING, + null_order.BEFORE if na_position == "first" else null_order.AFTER, False, ) ) @@ -95,7 +93,7 @@ def 
index_of_scalar(Column col, object py_search_key): plc.lists.index_of( col.to_pylibcudf(mode="read"), py_search_key.device_value.c_value, - True, + plc.lists.DuplicateFindOption.FIND_FIRST, ) ) @@ -106,7 +104,7 @@ def index_of_column(Column col, Column search_keys): plc.lists.index_of( col.to_pylibcudf(mode="read"), search_keys.to_pylibcudf(mode="read"), - True, + plc.lists.DuplicateFindOption.FIND_FIRST, ) ) @@ -127,7 +125,9 @@ def concatenate_list_elements(Column input_column, dropna=False): return Column.from_pylibcudf( plc.lists.concatenate_list_elements( input_column.to_pylibcudf(mode="read"), - dropna, + plc.lists.ConcatenateNullPolicy.IGNORE + if dropna + else plc.lists.ConcatenateNullPolicy.NULLIFY_OUTPUT_ROW, ) ) diff --git a/python/cudf_polars/cudf_polars/containers/dataframe.py b/python/cudf_polars/cudf_polars/containers/dataframe.py index 08bc9d0ea3f..7560a0f5a64 100644 --- a/python/cudf_polars/cudf_polars/containers/dataframe.py +++ b/python/cudf_polars/cudf_polars/containers/dataframe.py @@ -60,7 +60,7 @@ def to_polars(self) -> pl.DataFrame: # To guarantee we produce correct names, we therefore # serialise with names we control and rename with that map. 
name_map = {f"column_{i}": name for i, name in enumerate(self.column_map)} - table: pa.Table = plc.interop.to_arrow( + table = plc.interop.to_arrow( self.table, [plc.interop.ColumnMetadata(name=name) for name in name_map], ) diff --git a/python/cudf_polars/cudf_polars/dsl/expressions/datetime.py b/python/cudf_polars/cudf_polars/dsl/expressions/datetime.py index 65fa4bfa62f..cd8e5c6a4eb 100644 --- a/python/cudf_polars/cudf_polars/dsl/expressions/datetime.py +++ b/python/cudf_polars/cudf_polars/dsl/expressions/datetime.py @@ -27,7 +27,9 @@ class TemporalFunction(Expr): __slots__ = ("name", "options") - _COMPONENT_MAP: ClassVar[dict[pl_expr.TemporalFunction, str]] = { + _COMPONENT_MAP: ClassVar[ + dict[pl_expr.TemporalFunction, plc.datetime.DatetimeComponent] + ] = { pl_expr.TemporalFunction.Year: plc.datetime.DatetimeComponent.YEAR, pl_expr.TemporalFunction.Month: plc.datetime.DatetimeComponent.MONTH, pl_expr.TemporalFunction.Day: plc.datetime.DatetimeComponent.DAY, diff --git a/python/cudf_polars/cudf_polars/dsl/expressions/literal.py b/python/cudf_polars/cudf_polars/dsl/expressions/literal.py index c16313bf83c..7eba0c110ab 100644 --- a/python/cudf_polars/cudf_polars/dsl/expressions/literal.py +++ b/python/cudf_polars/cudf_polars/dsl/expressions/literal.py @@ -58,7 +58,7 @@ def collect_agg(self, *, depth: int) -> AggInfo: class LiteralColumn(Expr): __slots__ = ("value",) _non_child = ("dtype", "value") - value: pa.Array[Any, Any] + value: pa.Array[Any] def __init__(self, dtype: plc.DataType, value: pl.Series) -> None: self.dtype = dtype diff --git a/python/cudf_polars/cudf_polars/dsl/ir.py b/python/cudf_polars/cudf_polars/dsl/ir.py index beea5908e56..1f935190f28 100644 --- a/python/cudf_polars/cudf_polars/dsl/ir.py +++ b/python/cudf_polars/cudf_polars/dsl/ir.py @@ -517,7 +517,7 @@ def do_evaluate( # Mask must have been applied. 
return df elif typ == "ndjson": - json_schema: list[tuple[str, str, list]] = [ + json_schema: list[plc.io.json.NameAndType] = [ (name, typ, []) for name, typ in schema.items() ] plc_tbl_w_meta = plc.io.json.read_json( diff --git a/python/pylibcudf/pylibcudf/aggregation.pyi b/python/pylibcudf/pylibcudf/aggregation.pyi new file mode 100644 index 00000000000..a59e2a9dc93 --- /dev/null +++ b/python/pylibcudf/pylibcudf/aggregation.pyi @@ -0,0 +1,110 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from enum import IntEnum + +from pylibcudf.types import ( + DataType, + Interpolation, + NanEquality, + NullEquality, + NullOrder, + NullPolicy, + Order, +) + +class Kind(IntEnum): + SUM = ... + PRODUCT = ... + MIN = ... + MAX = ... + COUNT_VALID = ... + COUNT_ALL = ... + ANY = ... + ALL = ... + SUM_OF_SQUARES = ... + MEAN = ... + VARIANCE = ... + STD = ... + MEDIAN = ... + QUANTILE = ... + ARGMAX = ... + ARGMIN = ... + NUNIQUE = ... + NTH_ELEMENT = ... + RANK = ... + COLLECT_LIST = ... + COLLECT_SET = ... + PTX = ... + CUDA = ... + CORRELATION = ... + COVARIANCE = ... + +class CorrelationType(IntEnum): + PEARSON = ... + KENDALL = ... + SPEARMAN = ... + +class EWMHistory(IntEnum): + INFINITE = ... + FINITE = ... + +class RankMethod(IntEnum): + FIRST = ... + AVERAGE = ... + MIN = ... + MAX = ... + DENSE = ... + +class RankPercentage(IntEnum): + NONE = ... + ZERO_NORMALIZED = ... + ONE_NORMALIZED = ... + +class UdfType(IntEnum): + CUDA = ... + PTX = ... + +class Aggregation: + def __init__(self): ... + def kind(self) -> Kind: ... + +def sum() -> Aggregation: ... +def product() -> Aggregation: ... +def min() -> Aggregation: ... +def max() -> Aggregation: ... +def count(null_handling: NullPolicy = NullPolicy.INCLUDE) -> Aggregation: ... +def any() -> Aggregation: ... +def all() -> Aggregation: ... +def sum_of_squares() -> Aggregation: ... +def mean() -> Aggregation: ... +def variance(ddof: int = 1) -> Aggregation: ... +def std(ddof: int = 1) -> Aggregation: ... 
+def median() -> Aggregation: ... +def quantile( + quantiles: list[float], interp: Interpolation = Interpolation.LINEAR +) -> Aggregation: ... +def argmax() -> Aggregation: ... +def argmin() -> Aggregation: ... +def ewma(center_of_mass: float, history: EWMHistory) -> Aggregation: ... +def nunique(null_handling: NullPolicy = NullPolicy.EXCLUDE) -> Aggregation: ... +def nth_element( + n: int, null_handling: NullPolicy = NullPolicy.INCLUDE +) -> Aggregation: ... +def collect_list( + null_handling: NullPolicy = NullPolicy.INCLUDE, +) -> Aggregation: ... +def collect_set( + null_handling: NullPolicy = NullPolicy.INCLUDE, + nulls_equal: NullEquality = NullEquality.EQUAL, + nans_equal: NanEquality = NanEquality.ALL_EQUAL, +) -> Aggregation: ... +def udf(operation: str, output_type: DataType) -> Aggregation: ... +def correlation(type: CorrelationType, min_periods: int) -> Aggregation: ... +def covariance(min_periods: int, ddof: int) -> Aggregation: ... +def rank( + method: RankMethod, + column_order: Order = Order.ASCENDING, + null_handling: NullPolicy = NullPolicy.EXCLUDE, + null_precedence: NullOrder = NullOrder.AFTER, + percentage: RankPercentage = RankPercentage.NONE, +) -> Aggregation: ... 
diff --git a/python/pylibcudf/pylibcudf/aggregation.pyx b/python/pylibcudf/pylibcudf/aggregation.pyx index e510b738f70..662f76d5c8e 100644 --- a/python/pylibcudf/pylibcudf/aggregation.pyx +++ b/python/pylibcudf/pylibcudf/aggregation.pyx @@ -64,6 +64,40 @@ from pylibcudf.libcudf.aggregation import udf_type as UdfType # no-cython-lint from .types cimport DataType +__all__ = [ + "Aggregation", + "CorrelationType", + "EWMHistory", + "Kind", + "RankMethod", + "RankPercentage", + "UdfType", + "all", + "any", + "argmax", + "argmin", + "collect_list", + "collect_set", + "correlation", + "count", + "covariance", + "ewma", + "max", + "mean", + "median", + "min", + "nth_element", + "nunique", + "product", + "quantile", + "rank", + "std", + "sum", + "sum_of_squares", + "udf", + "variance", +] + cdef class Aggregation: """A type of aggregation to perform. diff --git a/python/pylibcudf/pylibcudf/binaryop.pyi b/python/pylibcudf/pylibcudf/binaryop.pyi new file mode 100644 index 00000000000..f745e6c6854 --- /dev/null +++ b/python/pylibcudf/pylibcudf/binaryop.pyi @@ -0,0 +1,54 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from enum import IntEnum + +from pylibcudf.column import Column +from pylibcudf.scalar import Scalar +from pylibcudf.types import DataType + +class BinaryOperator(IntEnum): + ADD = ... + SUB = ... + MUL = ... + DIV = ... + TRUE_DIV = ... + FLOOR_DIV = ... + MOD = ... + PMOD = ... + PYMOD = ... + POW = ... + INT_POW = ... + LOG_BASE = ... + ATAN2 = ... + SHIFT_LEFT = ... + SHIFT_RIGHT = ... + SHIFT_RIGHT_UNSIGNED = ... + BITWISE_AND = ... + BITWISE_OR = ... + BITWISE_XOR = ... + LOGICAL_AND = ... + LOGICAL_OR = ... + EQUAL = ... + NOT_EQUAL = ... + LESS = ... + GREATER = ... + LESS_EQUAL = ... + GREATER_EQUAL = ... + NULL_EQUALS = ... + NULL_MAX = ... + NULL_MIN = ... + NULL_NOT_EQUALS = ... + GENERIC_BINARY = ... + NULL_LOGICAL_AND = ... + NULL_LOGICAL_OR = ... + INVALID_BINARY = ... 
+ +def binary_operation( + lhs: Column | Scalar, + rhs: Column | Scalar, + op: BinaryOperator, + output_type: DataType, +) -> Column: ... +def is_supported_operation( + out: DataType, lhs: DataType, rhs: DataType, op: BinaryOperator +) -> bool: ... diff --git a/python/pylibcudf/pylibcudf/binaryop.pyx b/python/pylibcudf/pylibcudf/binaryop.pyx index eef73bf4e9d..b7b4ecc6e83 100644 --- a/python/pylibcudf/pylibcudf/binaryop.pyx +++ b/python/pylibcudf/pylibcudf/binaryop.pyx @@ -16,6 +16,7 @@ from .column cimport Column from .scalar cimport Scalar from .types cimport DataType +__all__ = ["BinaryOperator", "binary_operation", "is_supported_operation"] cpdef Column binary_operation( LeftBinaryOperand lhs, diff --git a/python/pylibcudf/pylibcudf/column.pyi b/python/pylibcudf/pylibcudf/column.pyi new file mode 100644 index 00000000000..c9f70de3dbf --- /dev/null +++ b/python/pylibcudf/pylibcudf/column.pyi @@ -0,0 +1,48 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from collections.abc import Sequence +from typing import Any + +from pylibcudf.gpumemoryview import gpumemoryview +from pylibcudf.scalar import Scalar +from pylibcudf.types import DataType + +class Column: + def __init__( + self, + data_type: DataType, + size: int, + data: gpumemoryview | None, + mask: gpumemoryview | None, + null_count: int, + offset: int, + children: list[Column], + ) -> None: ... + def type(self) -> DataType: ... + def child(self, index: int) -> Column: ... + def size(self) -> int: ... + def null_count(self) -> int: ... + def offset(self) -> int: ... + def data(self) -> gpumemoryview | None: ... + def null_mask(self) -> gpumemoryview | None: ... + def children(self) -> list[Column]: ... + def copy(self) -> Column: ... + def with_mask( + self, mask: gpumemoryview | None, null_count: int + ) -> Column: ... + def list_view(self) -> ListColumnView: ... + @staticmethod + def from_scalar(scalar: Scalar, size: int) -> Column: ... 
+ @staticmethod + def all_null_like(like: Column, size: int) -> Column: ... + @staticmethod + def from_cuda_array_interface_obj(obj: Any) -> Column: ... + +class ListColumnView: + def __init__(self, column: Column): ... + def child(self) -> Column: ... + def offsets(self) -> Column: ... + +def is_c_contiguous( + shape: Sequence[int], strides: Sequence[int], itemsize: int +) -> bool: ... diff --git a/python/pylibcudf/pylibcudf/column.pyx b/python/pylibcudf/pylibcudf/column.pyx index 4e5698566d0..9bb5574608e 100644 --- a/python/pylibcudf/pylibcudf/column.pyx +++ b/python/pylibcudf/pylibcudf/column.pyx @@ -17,6 +17,7 @@ from .utils cimport int_to_bitmask_ptr, int_to_void_ptr import functools +__all__ = ["Column", "ListColumnView", "is_c_contiguous"] cdef class Column: """A container of nullable device data as a column of elements. @@ -61,6 +62,8 @@ cdef class Column: self._children = children self._num_children = len(children) + __hash__ = None + cdef column_view view(self) nogil: """Generate a libcudf column_view to pass to libcudf algorithms. @@ -384,6 +387,8 @@ cdef class ListColumnView: raise TypeError("Column is not a list type") self._column = col + __hash__ = None + cpdef child(self): """The data column of the underlying list column.""" return self._column.child(1) diff --git a/python/pylibcudf/pylibcudf/column_factories.pyi b/python/pylibcudf/pylibcudf/column_factories.pyi new file mode 100644 index 00000000000..c87fe423acb --- /dev/null +++ b/python/pylibcudf/pylibcudf/column_factories.pyi @@ -0,0 +1,20 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. +from pylibcudf.column import Column +from pylibcudf.types import DataType, MaskState, TypeId + +def make_empty_column(type_or_id: DataType | TypeId) -> Column: ... +def make_numeric_column( + type_: DataType, size: int, mstate: MaskState +) -> Column: ... +def make_fixed_point_column( + type_: DataType, size: int, mstate: MaskState +) -> Column: ... 
+def make_timestamp_column( + type_: DataType, size: int, mstate: MaskState +) -> Column: ... +def make_duration_column( + type_: DataType, size: int, mstate: MaskState +) -> Column: ... +def make_fixed_width_column( + type_: DataType, size: int, mstate: MaskState +) -> Column: ... diff --git a/python/pylibcudf/pylibcudf/column_factories.pyx b/python/pylibcudf/pylibcudf/column_factories.pyx index ac942a620b5..c4969a7f502 100644 --- a/python/pylibcudf/pylibcudf/column_factories.pyx +++ b/python/pylibcudf/pylibcudf/column_factories.pyx @@ -17,6 +17,15 @@ from .types cimport DataType, type_id from .types import MaskState, TypeId +__all__ = [ + "make_duration_column", + "make_empty_column", + "make_fixed_point_column", + "make_fixed_width_column", + "make_numeric_column", + "make_timestamp_column", +] + cpdef Column make_empty_column(MakeEmptyColumnOperand type_or_id): """Creates an empty column of the specified type. diff --git a/python/pylibcudf/pylibcudf/concatenate.pyi b/python/pylibcudf/pylibcudf/concatenate.pyi new file mode 100644 index 00000000000..79076f509e0 --- /dev/null +++ b/python/pylibcudf/pylibcudf/concatenate.pyi @@ -0,0 +1,8 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from pylibcudf.column import Column +from pylibcudf.table import Table + +def concatenate[ColumnOrTable: (Column, Table)]( + objects: list[ColumnOrTable], +) -> ColumnOrTable: ... diff --git a/python/pylibcudf/pylibcudf/concatenate.pyx b/python/pylibcudf/pylibcudf/concatenate.pyx index 10c860d97bb..42c5f34cf3e 100644 --- a/python/pylibcudf/pylibcudf/concatenate.pyx +++ b/python/pylibcudf/pylibcudf/concatenate.pyx @@ -12,6 +12,7 @@ from pylibcudf.libcudf.table.table_view cimport table_view from .column cimport Column from .table cimport Table +__all__ = ["concatenate"] cpdef concatenate(list objects): """Concatenate columns or tables. 
diff --git a/python/pylibcudf/pylibcudf/contiguous_split.pyi b/python/pylibcudf/pylibcudf/contiguous_split.pyi new file mode 100644 index 00000000000..dd6328fbf23 --- /dev/null +++ b/python/pylibcudf/pylibcudf/contiguous_split.pyi @@ -0,0 +1,14 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from pylibcudf.gpumemoryview import gpumemoryview +from pylibcudf.table import Table + +class PackedColumns: + def __init__(self): ... + def release(self) -> tuple[memoryview, gpumemoryview]: ... + +def pack(input: Table) -> PackedColumns: ... +def unpack(input: PackedColumns) -> Table: ... +def unpack_from_memoryviews( + metadata: memoryview, gpu_data: gpumemoryview +) -> Table: ... diff --git a/python/pylibcudf/pylibcudf/contiguous_split.pyx b/python/pylibcudf/pylibcudf/contiguous_split.pyx index ed926a3fcc0..94873e079c9 100644 --- a/python/pylibcudf/pylibcudf/contiguous_split.pyx +++ b/python/pylibcudf/pylibcudf/contiguous_split.pyx @@ -20,6 +20,13 @@ from .table cimport Table from .utils cimport int_to_void_ptr +__all__ = [ + "PackedColumns", + "pack", + "unpack", + "unpack_from_memoryviews", +] + cdef class HostBuffer: """Owning host buffer that implements the buffer protocol""" cdef unique_ptr[vector[uint8_t]] c_obj @@ -38,6 +45,8 @@ cdef class HostBuffer: out.strides[0] = 1 return out + __hash__ = None + def __getbuffer__(self, Py_buffer *buffer, int flags): buffer.buf = dereference(self.c_obj).data() buffer.format = NULL # byte @@ -69,6 +78,8 @@ cdef class PackedColumns: "Use one of the factories." ) + __hash__ = None + @staticmethod cdef PackedColumns from_libcudf(unique_ptr[packed_columns] data): """Create a Python PackedColumns from a libcudf packed_columns.""" diff --git a/python/pylibcudf/pylibcudf/copying.pyi b/python/pylibcudf/pylibcudf/copying.pyi new file mode 100644 index 00000000000..6cf4ed48724 --- /dev/null +++ b/python/pylibcudf/pylibcudf/copying.pyi @@ -0,0 +1,54 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. 
+ +from enum import IntEnum +from typing import TypeVar + +from pylibcudf.column import Column +from pylibcudf.scalar import Scalar +from pylibcudf.table import Table + +class MaskAllocationPolicy(IntEnum): + NEVER = ... + RETAIN = ... + ALWAYS = ... + +class OutOfBoundsPolicy(IntEnum): + NULLIFY = ... + DONT_CHECK = ... + +ColumnOrTable = TypeVar("ColumnOrTable", Column, Table) + +def gather( + source_table: Table, gather_map: Column, bounds_policy: OutOfBoundsPolicy +) -> Table: ... +def scatter( + source: Table | list[Scalar], scatter_map: Column, target_table: Table +) -> Table: ... +def empty_like(input: ColumnOrTable) -> ColumnOrTable: ... +def allocate_like( + input_column: Column, policy: MaskAllocationPolicy, size: int | None = None +) -> Column: ... +def copy_range_in_place( + input_column: Column, + target_column: Column, + input_begin: int, + input_end: int, + target_begin: int, +) -> Column: ... +def copy_range( + input_column: Column, + target_column: Column, + input_begin: int, + input_end: int, + target_begin: int, +) -> Column: ... +def shift(input: Column, offset: int, fill_value: Scalar) -> Column: ... +def slice(input: ColumnOrTable, indices: list[int]) -> list[ColumnOrTable]: ... +def split(input: ColumnOrTable, splits: list[int]) -> list[ColumnOrTable]: ... +def copy_if_else( + lhs: Column | Scalar, rhs: Column | Scalar, boolean_mask: Column +) -> Column: ... +def boolean_mask_scatter( + input: Table | list[Scalar], target: Table, boolean_mask: Column +) -> Table: ... +def get_element(input_column: Column, index: int) -> Scalar: ... 
diff --git a/python/pylibcudf/pylibcudf/copying.pyx b/python/pylibcudf/pylibcudf/copying.pyx index 4938f1a3dda..fb8b6f9890e 100644 --- a/python/pylibcudf/pylibcudf/copying.pyx +++ b/python/pylibcudf/pylibcudf/copying.pyx @@ -36,6 +36,23 @@ from .table cimport Table from .utils cimport _as_vector +__all__ = [ + "MaskAllocationPolicy", + "OutOfBoundsPolicy", + "allocate_like", + "boolean_mask_scatter", + "copy_if_else", + "copy_range", + "copy_range_in_place", + "empty_like", + "gather", + "get_element", + "scatter", + "shift", + "slice", + "split", +] + cpdef Table gather( Table source_table, Column gather_map, diff --git a/python/pylibcudf/pylibcudf/datetime.pyi b/python/pylibcudf/pylibcudf/datetime.pyi new file mode 100644 index 00000000000..6a3ae7953d9 --- /dev/null +++ b/python/pylibcudf/pylibcudf/datetime.pyi @@ -0,0 +1,45 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from enum import IntEnum + +from pylibcudf.column import Column +from pylibcudf.scalar import Scalar + +class DatetimeComponent(IntEnum): + YEAR = ... + MONTH = ... + DAY = ... + WEEKDAY = ... + HOUR = ... + MINUTE = ... + SECOND = ... + MILLISECOND = ... + MICROSECOND = ... + NANOSECOND = ... + +class RoundingFrequency(IntEnum): + DAY = ... + HOUR = ... + MINUTE = ... + SECOND = ... + MILLISECOND = ... + MICROSECOND = ... + NANOSECOND = ... + +def extract_millisecond_fraction(input: Column) -> Column: ... +def extract_microsecond_fraction(input: Column) -> Column: ... +def extract_nanosecond_fraction(input: Column) -> Column: ... +def extract_datetime_component( + input: Column, component: DatetimeComponent +) -> Column: ... +def ceil_datetimes(input: Column, freq: RoundingFrequency) -> Column: ... +def floor_datetimes(input: Column, freq: RoundingFrequency) -> Column: ... +def round_datetimes(input: Column, freq: RoundingFrequency) -> Column: ... +def add_calendrical_months( + input: Column, months: Column | Scalar +) -> Column: ... +def day_of_year(input: Column) -> Column: ... 
+def is_leap_year(input: Column) -> Column: ... +def last_day_of_month(input: Column) -> Column: ... +def extract_quarter(input: Column) -> Column: ... +def days_in_month(input: Column) -> Column: ... diff --git a/python/pylibcudf/pylibcudf/datetime.pyx b/python/pylibcudf/pylibcudf/datetime.pyx index 9e5e709d81d..b100e3e22d0 100644 --- a/python/pylibcudf/pylibcudf/datetime.pyx +++ b/python/pylibcudf/pylibcudf/datetime.pyx @@ -29,6 +29,24 @@ from cython.operator cimport dereference from .column cimport Column +__all__ = [ + "DatetimeComponent", + "RoundingFrequency", + "add_calendrical_months", + "ceil_datetimes", + "day_of_year", + "days_in_month", + "extract_datetime_component", + "extract_microsecond_fraction", + "extract_millisecond_fraction", + "extract_nanosecond_fraction", + "extract_quarter", + "floor_datetimes", + "is_leap_year", + "last_day_of_month", + "round_datetimes", +] + cpdef Column extract_millisecond_fraction( Column input ): diff --git a/python/pylibcudf/pylibcudf/experimental.pyi b/python/pylibcudf/pylibcudf/experimental.pyi new file mode 100644 index 00000000000..bbfb86b0ff6 --- /dev/null +++ b/python/pylibcudf/pylibcudf/experimental.pyi @@ -0,0 +1,5 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +def enable_prefetching(key: str) -> None: ... +def disable_prefetching(key: str) -> None: ... +def prefetch_debugging(enable: bool) -> None: ... diff --git a/python/pylibcudf/pylibcudf/experimental.pyx b/python/pylibcudf/pylibcudf/experimental.pyx index b25a53e13b2..d94d6d087ac 100644 --- a/python/pylibcudf/pylibcudf/experimental.pyx +++ b/python/pylibcudf/pylibcudf/experimental.pyx @@ -5,6 +5,8 @@ from libcpp.string cimport string from pylibcudf.libcudf cimport experimental as cpp_experimental +__all__ = ["disable_prefetching", "enable_prefetching", "prefetch_debugging"] + cpdef enable_prefetching(str key): """Turn on prefetch instructions for the given key. 
diff --git a/python/pylibcudf/pylibcudf/expressions.pyi b/python/pylibcudf/pylibcudf/expressions.pyi new file mode 100644 index 00000000000..12b473d8605 --- /dev/null +++ b/python/pylibcudf/pylibcudf/expressions.pyi @@ -0,0 +1,79 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. +from enum import IntEnum + +from pylibcudf.scalar import Scalar + +class TableReference(IntEnum): + LEFT = ... + RIGHT = ... + +class ASTOperator(IntEnum): + ADD = ... + SUB = ... + MUL = ... + DIV = ... + TRUE_DIV = ... + FLOOR_DIV = ... + MOD = ... + PYMOD = ... + POW = ... + EQUAL = ... + NULL_EQUAL = ... + NOT_EQUAL = ... + LESS = ... + GREATER = ... + LESS_EQUAL = ... + GREATER_EQUAL = ... + BITWISE_AND = ... + BITWISE_OR = ... + BITWISE_XOR = ... + NULL_LOGICAL_AND = ... + LOGICAL_AND = ... + NULL_LOGICAL_OR = ... + LOGICAL_OR = ... + IDENTITY = ... + IS_NULL = ... + SIN = ... + COS = ... + TAN = ... + ARCSIN = ... + ARCCOS = ... + ARCTAN = ... + SINH = ... + COSH = ... + TANH = ... + ARCSINH = ... + ARCCOSH = ... + ARCTANH = ... + EXP = ... + LOG = ... + SQRT = ... + CBRT = ... + CEIL = ... + FLOOR = ... + ABS = ... + RINT = ... + BIT_INVERT = ... + NOT = ... + +class Expression: + def __init__(self): ... + +class Literal(Expression): + def __init__(self, value: Scalar): ... + +class ColumnReference(Expression): + def __init__( + self, index: int, table_source: TableReference = TableReference.LEFT + ): ... + +class ColumnNameReference(Expression): + def __init__(self, name: str): ... + +class Operation(Expression): + def __init__( + self, + op: ASTOperator, + left: Expression, + right: Expression | None = None, + ): ... 
diff --git a/python/pylibcudf/pylibcudf/expressions.pyx b/python/pylibcudf/pylibcudf/expressions.pyx index 1535f68366b..0f12cfe313c 100644 --- a/python/pylibcudf/pylibcudf/expressions.pyx +++ b/python/pylibcudf/pylibcudf/expressions.pyx @@ -49,6 +49,16 @@ from .types cimport DataType # Aliases for simplicity ctypedef unique_ptr[libcudf_exp.expression] expression_ptr +__all__ = [ + "ASTOperator", + "ColumnNameReference", + "ColumnReference", + "Expression", + "Literal", + "Operation", + "TableReference", +] + # Define this class just to have a docstring for it cdef class Expression: """ @@ -58,7 +68,7 @@ cdef class Expression: For details, see :cpp:class:`cudf::ast::expression`. """ - pass + __hash__ = None cdef class Literal(Expression): """ diff --git a/python/pylibcudf/pylibcudf/filling.pyi b/python/pylibcudf/pylibcudf/filling.pyi new file mode 100644 index 00000000000..0b5e29bdc32 --- /dev/null +++ b/python/pylibcudf/pylibcudf/filling.pyi @@ -0,0 +1,17 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from pylibcudf.column import Column +from pylibcudf.scalar import Scalar +from pylibcudf.table import Table + +def fill( + destination: Column, begin: int, end: int, value: Scalar +) -> Column: ... +def fill_in_place( + destination: Column, begin: int, end: int, value: Scalar +) -> None: ... +def sequence(size: int, init: Scalar, step: Scalar) -> Column: ... +def repeat(input_table: Table, count: Column | int) -> Table: ... +def calendrical_month_sequence( + n: int, init: Scalar, months: int +) -> Column: ... 
diff --git a/python/pylibcudf/pylibcudf/filling.pyx b/python/pylibcudf/pylibcudf/filling.pyx index 313605ead16..ea5b45ff7c2 100644 --- a/python/pylibcudf/pylibcudf/filling.pyx +++ b/python/pylibcudf/pylibcudf/filling.pyx @@ -19,6 +19,14 @@ from .scalar cimport Scalar from .table cimport Table +__all__ = [ + "fill", + "fill_in_place", + "repeat", + "sequence", + "calendrical_month_sequence", +] + cpdef Column fill( Column destination, size_type begin, diff --git a/python/pylibcudf/pylibcudf/gpumemoryview.pyi b/python/pylibcudf/pylibcudf/gpumemoryview.pyi new file mode 100644 index 00000000000..50f1f39a515 --- /dev/null +++ b/python/pylibcudf/pylibcudf/gpumemoryview.pyi @@ -0,0 +1,9 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from collections.abc import Mapping +from typing import Any + +class gpumemoryview: + def __init__(self, data: Any): ... + @property + def __cuda_array_interface__(self) -> Mapping[str, Any]: ... diff --git a/python/pylibcudf/pylibcudf/gpumemoryview.pyx b/python/pylibcudf/pylibcudf/gpumemoryview.pyx index 0904022a944..41316eddb60 100644 --- a/python/pylibcudf/pylibcudf/gpumemoryview.pyx +++ b/python/pylibcudf/pylibcudf/gpumemoryview.pyx @@ -1,5 +1,6 @@ # Copyright (c) 2023-2024, NVIDIA CORPORATION. +__all__ = ["gpumemoryview"] cdef class gpumemoryview: """Minimal representation of a memory buffer. @@ -25,3 +26,5 @@ cdef class gpumemoryview: @property def __cuda_array_interface__(self): return self.obj.__cuda_array_interface__ + + __hash__ = None diff --git a/python/pylibcudf/pylibcudf/groupby.pyi b/python/pylibcudf/pylibcudf/groupby.pyi new file mode 100644 index 00000000000..883ad6e34cf --- /dev/null +++ b/python/pylibcudf/pylibcudf/groupby.pyi @@ -0,0 +1,38 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. 
+ +from pylibcudf.aggregation import Aggregation +from pylibcudf.column import Column +from pylibcudf.replace import ReplacePolicy +from pylibcudf.scalar import Scalar +from pylibcudf.table import Table +from pylibcudf.types import NullOrder, NullPolicy, Order, Sorted + +class GroupByRequest: + def __init__( + self, values: Column, aggregations: list[Aggregation] + ) -> None: ... + +class GroupBy: + def __init__( + self, + keys: Table, + null_handling: NullPolicy = NullPolicy.EXCLUDE, + keys_are_sorted: Sorted = Sorted.NO, + column_order: list[Order] | None = None, + null_precedence: list[NullOrder] | None = None, + ) -> None: ... + def aggregate( + self, requests: list[GroupByRequest] + ) -> tuple[Table, list[Table]]: ... + def scan( + self, requests: list[GroupByRequest] + ) -> tuple[Table, list[Table]]: ... + def shift( + self, values: Table, offset: list[int], fill_values: list[Scalar] + ) -> tuple[Table, Table]: ... + def replace_nulls( + self, value: Table, replace_policies: list[ReplacePolicy] + ) -> tuple[Table, Table]: ... + def get_groups( + self, values: Table | None = None + ) -> tuple[list[int], Table, Table]: ... diff --git a/python/pylibcudf/pylibcudf/groupby.pyx b/python/pylibcudf/pylibcudf/groupby.pyx index 71f9ecb0453..e6cb3ac81a7 100644 --- a/python/pylibcudf/pylibcudf/groupby.pyx +++ b/python/pylibcudf/pylibcudf/groupby.pyx @@ -25,6 +25,8 @@ from .types cimport null_order, null_policy, order, sorted from .utils cimport _as_vector +__all__ = ["GroupBy", "GroupByRequest"] + cdef class GroupByRequest: """A request for a groupby aggregation or scan. @@ -45,6 +47,8 @@ cdef class GroupByRequest: self._values = values self._aggregations = aggregations + __hash__ = None + cdef aggregation_request _to_libcudf_agg_request(self) except *: """Convert to a libcudf aggregation_request object. 
@@ -127,6 +131,8 @@ cdef class GroupBy: # deallocated from under us: self._keys = keys + __hash__ = None + @staticmethod cdef tuple _parse_outputs( pair[unique_ptr[table], vector[aggregation_result]] c_res diff --git a/python/pylibcudf/pylibcudf/hashing.pyi b/python/pylibcudf/pylibcudf/hashing.pyi new file mode 100644 index 00000000000..a849f5d0729 --- /dev/null +++ b/python/pylibcudf/pylibcudf/hashing.pyi @@ -0,0 +1,18 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from typing import Final + +from pylibcudf.column import Column +from pylibcudf.table import Table + +LIBCUDF_DEFAULT_HASH_SEED: Final[int] + +def murmurhash3_x86_32(input: Table, seed: int = ...) -> Column: ... +def murmurhash3_x64_128(input: Table, seed: int = ...) -> Table: ... +def xxhash_64(input: Table, seed: int = ...) -> Column: ... +def md5(input: Table) -> Column: ... +def sha1(input: Table) -> Column: ... +def sha224(input: Table) -> Column: ... +def sha256(input: Table) -> Column: ... +def sha384(input: Table) -> Column: ... +def sha512(input: Table) -> Column: ... diff --git a/python/pylibcudf/pylibcudf/hashing.pyx b/python/pylibcudf/pylibcudf/hashing.pyx index 9ea3d4d1bda..548cffc0ce8 100644 --- a/python/pylibcudf/pylibcudf/hashing.pyx +++ b/python/pylibcudf/pylibcudf/hashing.pyx @@ -20,6 +20,19 @@ from pylibcudf.libcudf.table.table cimport table from .column cimport Column from .table cimport Table +__all__ = [ + "LIBCUDF_DEFAULT_HASH_SEED", + "md5", + "murmurhash3_x64_128", + "murmurhash3_x86_32", + "sha1", + "sha224", + "sha256", + "sha384", + "sha512", + "xxhash_64", +] + LIBCUDF_DEFAULT_HASH_SEED = DEFAULT_HASH_SEED cpdef Column murmurhash3_x86_32( diff --git a/python/pylibcudf/pylibcudf/interop.pyi b/python/pylibcudf/pylibcudf/interop.pyi new file mode 100644 index 00000000000..63de816010b --- /dev/null +++ b/python/pylibcudf/pylibcudf/interop.pyi @@ -0,0 +1,52 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. 
+ +from collections.abc import Iterable, Mapping +from dataclasses import dataclass +from typing import Any, overload + +import pyarrow as pa + +from pylibcudf.column import Column +from pylibcudf.scalar import Scalar +from pylibcudf.table import Table +from pylibcudf.types import DataType + +@dataclass +class ColumnMetadata: + name: str = ... + children_meta: list[ColumnMetadata] = ... + +@overload +def from_arrow(obj: pa.DataType) -> DataType: ... +@overload +def from_arrow( + obj: pa.Scalar[Any], *, data_type: DataType | None = None +) -> Scalar: ... +@overload +def from_arrow(obj: pa.Array[Any]) -> Column: ... +@overload +def from_arrow(obj: pa.Table) -> Table: ... +@overload +def to_arrow( + obj: DataType, + *, + precision: int | None = None, + fields: Iterable[pa.Field[pa.DataType] | tuple[str, pa.DataType]] + | Mapping[str, pa.DataType] + | None = None, + value_type: pa.DataType | None = None, +) -> pa.DataType: ... +@overload +def to_arrow( + obj: Table, metadata: list[ColumnMetadata | str] | None = None +) -> pa.Table: ... +@overload +def to_arrow( + obj: Column, metadata: ColumnMetadata | str | None = None +) -> pa.Array[Any]: ... +@overload +def to_arrow( + obj: Scalar, metadata: ColumnMetadata | str | None = None +) -> pa.Scalar[Any]: ... +def from_dlpack(managed_tensor: Any) -> Table: ... +def to_dlpack(input: Table) -> Any: ... 
diff --git a/python/pylibcudf/pylibcudf/interop.pyx b/python/pylibcudf/pylibcudf/interop.pyx index 61e812353b7..bd5397ac328 100644 --- a/python/pylibcudf/pylibcudf/interop.pyx +++ b/python/pylibcudf/pylibcudf/interop.pyx @@ -38,6 +38,14 @@ from .scalar cimport Scalar from .table cimport Table from .types cimport DataType, type_id +__all__ = [ + "ColumnMetadata", + "from_arrow", + "from_dlpack", + "to_arrow", + "to_dlpack", +] + ARROW_TO_PYLIBCUDF_TYPES = { pa.int8(): type_id.INT8, pa.int16(): type_id.INT16, diff --git a/python/pylibcudf/pylibcudf/io/__init__.py b/python/pylibcudf/pylibcudf/io/__init__.py index 9e8e0f6e080..f913a400684 100644 --- a/python/pylibcudf/pylibcudf/io/__init__.py +++ b/python/pylibcudf/pylibcudf/io/__init__.py @@ -13,3 +13,19 @@ types, ) from .types import SinkInfo, SourceInfo, TableWithMetadata + +__all__ = [ + "SinkInfo", + "SourceInfo", + "TableWithMetadata", + "avro", + "csv", + "datasource", + "json", + "orc", + "parquet", + "parquet_metadata", + "text", + "timezone", + "types", +] diff --git a/python/pylibcudf/pylibcudf/io/avro.pyi b/python/pylibcudf/pylibcudf/io/avro.pyi new file mode 100644 index 00000000000..49c2f083702 --- /dev/null +++ b/python/pylibcudf/pylibcudf/io/avro.pyi @@ -0,0 +1,11 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. +from pylibcudf.io.types import SourceInfo, TableWithMetadata + +__all__ = ["read_avro"] + +def read_avro( + source_info: SourceInfo, + columns: list[str] | None = None, + skip_rows: int = 0, + num_rows: int = -1, +) -> TableWithMetadata: ... 
diff --git a/python/pylibcudf/pylibcudf/io/avro.pyx b/python/pylibcudf/pylibcudf/io/avro.pyx index fe765b34f82..4271333511a 100644 --- a/python/pylibcudf/pylibcudf/io/avro.pyx +++ b/python/pylibcudf/pylibcudf/io/avro.pyx @@ -10,6 +10,8 @@ from pylibcudf.libcudf.io.avro cimport ( ) from pylibcudf.libcudf.types cimport size_type +__all__ = ["read_avro"] + cpdef TableWithMetadata read_avro( SourceInfo source_info, diff --git a/python/pylibcudf/pylibcudf/io/csv.pyi b/python/pylibcudf/pylibcudf/io/csv.pyi new file mode 100644 index 00000000000..356825a927d --- /dev/null +++ b/python/pylibcudf/pylibcudf/io/csv.pyi @@ -0,0 +1,54 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from collections.abc import Mapping + +from pylibcudf.io.types import ( + CompressionType, + QuoteStyle, + SourceInfo, + TableWithMetadata, +) +from pylibcudf.types import DataType + +def read_csv( + source_info: SourceInfo, + *, + compression: CompressionType = CompressionType.AUTO, + byte_range_offset: int = 0, + byte_range_size: int = 0, + col_names: list[str] | None = None, + prefix: str = "", + mangle_dupe_cols: bool = True, + usecols: list[int] | list[str] | None = None, + nrows: int = -1, + skiprows: int = 0, + skipfooter: int = 0, + header: int = 0, + lineterminator: str = "\n", + delimiter: str | None = None, + thousands: str | None = None, + decimal: str = ".", + comment: str | None = None, + delim_whitespace: bool = False, + skipinitialspace: bool = False, + skip_blank_lines: bool = True, + quoting: QuoteStyle = QuoteStyle.MINIMAL, + quotechar: str = '"', + doublequote: bool = True, + parse_dates: list[str] | list[int] | None = None, + parse_hex: list[str] | list[int] | None = None, + # Technically this should be dict/list + # but using a fused type prevents using None as default + dtypes: Mapping[str, DataType] | list[DataType] | None = None, + true_values: list[str] | None = None, + false_values: list[str] | None = None, + na_values: list[str] | None = None, + keep_default_na: bool = 
True, + na_filter: bool = True, + dayfirst: bool = False, + # Note: These options are supported by the libcudf reader + # but are not exposed here since there is no demand for them + # on the Python side yet. + # detect_whitespace_around_quotes: bool = False, + # timestamp_type: DataType = DataType(type_id.EMPTY), +) -> TableWithMetadata: ... diff --git a/python/pylibcudf/pylibcudf/io/csv.pyx b/python/pylibcudf/pylibcudf/io/csv.pyx index 2c61cc42d82..858e580ab34 100644 --- a/python/pylibcudf/pylibcudf/io/csv.pyx +++ b/python/pylibcudf/pylibcudf/io/csv.pyx @@ -19,6 +19,8 @@ from pylibcudf.libcudf.types cimport data_type, size_type from pylibcudf.types cimport DataType +__all__ = ["read_csv"] + cdef tuple _process_parse_dates_hex(list cols): cdef vector[string] str_cols cdef vector[int] int_cols diff --git a/python/pylibcudf/pylibcudf/io/datasource.pyi b/python/pylibcudf/pylibcudf/io/datasource.pyi new file mode 100644 index 00000000000..e52197f793b --- /dev/null +++ b/python/pylibcudf/pylibcudf/io/datasource.pyi @@ -0,0 +1,4 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +class Datasource: + def __init__(self): ... diff --git a/python/pylibcudf/pylibcudf/io/datasource.pyx b/python/pylibcudf/pylibcudf/io/datasource.pyx index 02418444caa..aac1c0d1014 100644 --- a/python/pylibcudf/pylibcudf/io/datasource.pyx +++ b/python/pylibcudf/pylibcudf/io/datasource.pyx @@ -2,8 +2,10 @@ from pylibcudf.libcudf.io.datasource cimport datasource +__all__ = ["Datasource"] cdef class Datasource: + __hash__ = None cdef datasource* get_datasource(self) except * nogil: with gil: raise NotImplementedError("get_datasource() should not " diff --git a/python/pylibcudf/pylibcudf/io/json.pyi b/python/pylibcudf/pylibcudf/io/json.pyi new file mode 100644 index 00000000000..b2bc6a43700 --- /dev/null +++ b/python/pylibcudf/pylibcudf/io/json.pyi @@ -0,0 +1,50 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. 
+from collections.abc import Mapping +from typing import TypeAlias + +from pylibcudf.column import Column +from pylibcudf.io.types import ( + CompressionType, + JSONRecoveryMode, + SinkInfo, + SourceInfo, + TableWithMetadata, +) +from pylibcudf.types import DataType + +ChildNameToTypeMap: TypeAlias = Mapping[str, ChildNameToTypeMap] + +NameAndType: TypeAlias = tuple[str, DataType, list[NameAndType]] + +def read_json( + source_info: SourceInfo, + dtypes: list[NameAndType] | None = None, + compression: CompressionType = CompressionType.AUTO, + lines: bool = False, + byte_range_offset: int = 0, + byte_range_size: int = 0, + keep_quotes: bool = False, + mixed_types_as_string: bool = False, + prune_columns: bool = False, + recovery_mode: JSONRecoveryMode = JSONRecoveryMode.FAIL, +) -> TableWithMetadata: ... +def write_json( + sink_info: SinkInfo, + table_w_meta: TableWithMetadata, + na_rep: str = "", + include_nulls: bool = False, + lines: bool = False, + rows_per_chunk: int = 2**32 - 1, + true_value: str = "true", + false_value: str = "false", +) -> None: ... +def chunked_read_json( + source_info: SourceInfo, + dtypes: list[NameAndType] | None = None, + compression: CompressionType = CompressionType.AUTO, + keep_quotes: bool = False, + mixed_types_as_string: bool = False, + prune_columns: bool = False, + recovery_mode: JSONRecoveryMode = JSONRecoveryMode.FAIL, + chunk_size: int = 100_000_000, +) -> tuple[list[Column], list[str], ChildNameToTypeMap]: ... 
diff --git a/python/pylibcudf/pylibcudf/io/json.pyx b/python/pylibcudf/pylibcudf/io/json.pyx index 65f78f830f1..ad2989925c9 100644 --- a/python/pylibcudf/pylibcudf/io/json.pyx +++ b/python/pylibcudf/pylibcudf/io/json.pyx @@ -23,6 +23,7 @@ from pylibcudf.libcudf.io.types cimport ( from pylibcudf.libcudf.types cimport data_type, size_type from pylibcudf.types cimport DataType +__all__ = ["chunked_read_json", "read_json", "write_json"] cdef map[string, schema_element] _generate_schema_map(list dtypes): cdef map[string, schema_element] schema_map diff --git a/python/pylibcudf/pylibcudf/io/orc.pyi b/python/pylibcudf/pylibcudf/io/orc.pyi new file mode 100644 index 00000000000..4cf87f1a832 --- /dev/null +++ b/python/pylibcudf/pylibcudf/io/orc.pyi @@ -0,0 +1,41 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from typing import Any + +from pylibcudf.io.types import SourceInfo, TableWithMetadata +from pylibcudf.types import DataType + +def read_orc( + source_info: SourceInfo, + columns: list[str] | None = None, + stripes: list[list[int]] | None = None, + skip_rows: int = 0, + nrows: int = -1, + use_index: bool = True, + use_np_dtypes: bool = True, + timestamp_type: DataType | None = None, + decimal128_columns: list[str] | None = None, +) -> TableWithMetadata: ... + +class OrcColumnStatistics: + def __init__(self): ... + @property + def number_of_values(self) -> int | None: ... + @property + def has_null(self) -> bool | None: ... + def __getitem__(self, item: str) -> Any: ... + def __contains__(self, item: str) -> bool: ... + def get[T](self, item: str, default: None | T = None) -> T | None: ... + +class ParsedOrcStatistics: + def __init__(self): ... + @property + def column_names(self) -> list[str]: ... + @property + def file_stats(self) -> list[OrcColumnStatistics]: ... + @property + def stripes_stats(self) -> list[OrcColumnStatistics]: ... + +def read_parsed_orc_statistics( + source_info: SourceInfo, +) -> ParsedOrcStatistics: ... 
diff --git a/python/pylibcudf/pylibcudf/io/orc.pyx b/python/pylibcudf/pylibcudf/io/orc.pyx index 70e0a7995a2..4270f5b4f95 100644 --- a/python/pylibcudf/pylibcudf/io/orc.pyx +++ b/python/pylibcudf/pylibcudf/io/orc.pyx @@ -30,6 +30,12 @@ from pylibcudf.libcudf.types cimport size_type from pylibcudf.types cimport DataType from pylibcudf.variant cimport get_if, holds_alternative +__all__ = [ + "OrcColumnStatistics", + "ParsedOrcStatistics", + "read_orc", + "read_parsed_orc_statistics", +] cdef class OrcColumnStatistics: def __init__(self): @@ -39,6 +45,8 @@ cdef class OrcColumnStatistics: "use `OrcColumnStatistics.from_libcudf` instead." ) + __hash__ = None + @property def number_of_values(self): if self.number_of_values_c.has_value(): @@ -183,6 +191,8 @@ cdef class OrcColumnStatistics: cdef class ParsedOrcStatistics: + __hash__ = None + @property def column_names(self): return [name.decode() for name in self.c_obj.column_names] diff --git a/python/pylibcudf/pylibcudf/io/parquet.pyi b/python/pylibcudf/pylibcudf/io/parquet.pyi new file mode 100644 index 00000000000..bcf1d1cce09 --- /dev/null +++ b/python/pylibcudf/pylibcudf/io/parquet.pyi @@ -0,0 +1,36 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from pylibcudf.expressions import Expression +from pylibcudf.io.types import SourceInfo, TableWithMetadata + +class ChunkedParquetReader: + def __init__( + self, + source_info: SourceInfo, + columns: list[str] | None = None, + row_groups: list[list[int]] | None = None, + use_pandas_metadata: bool = True, + convert_strings_to_categories: bool = False, + skip_rows: int = 0, + nrows: int = 0, + chunk_read_limit: int = 0, + pass_read_limit: int = 1024000000, + allow_mismatched_pq_schemas: bool = False, + ) -> None: ... + def has_next(self) -> bool: ... + def read_chunk(self) -> TableWithMetadata: ... 
+ +def read_parquet( + source_info: SourceInfo, + columns: list[str] | None = None, + row_groups: list[list[int]] | None = None, + filters: Expression | None = None, + convert_strings_to_categories: bool = False, + use_pandas_metadata: bool = True, + skip_rows: int = 0, + nrows: int = -1, + allow_mismatched_pq_schemas: bool = False, + # disabled see comment in parquet.pyx for more + # reader_column_schema: ReaderColumnSchema = *, + # timestamp_type: DataType = * +) -> TableWithMetadata: ... diff --git a/python/pylibcudf/pylibcudf/io/parquet.pyx b/python/pylibcudf/pylibcudf/io/parquet.pyx index 981ca7b8159..b76a352d633 100644 --- a/python/pylibcudf/pylibcudf/io/parquet.pyx +++ b/python/pylibcudf/pylibcudf/io/parquet.pyx @@ -16,6 +16,8 @@ from pylibcudf.libcudf.io.parquet cimport ( from pylibcudf.libcudf.io.types cimport table_with_metadata from pylibcudf.libcudf.types cimport size_type +__all__ = ["ChunkedParquetReader", "read_parquet"] + cdef parquet_reader_options _setup_parquet_reader_options( SourceInfo source_info, @@ -123,6 +125,8 @@ cdef class ChunkedParquetReader: ) ) + __hash__ = None + cpdef bool has_next(self): """ Returns True if there is another chunk in the Parquet file diff --git a/python/pylibcudf/pylibcudf/io/parquet_metadata.pyx b/python/pylibcudf/pylibcudf/io/parquet_metadata.pyx index 352905ff0f8..0ad4dafb0cf 100644 --- a/python/pylibcudf/pylibcudf/io/parquet_metadata.pyx +++ b/python/pylibcudf/pylibcudf/io/parquet_metadata.pyx @@ -4,6 +4,13 @@ from pylibcudf.io.types cimport SourceInfo from pylibcudf.libcudf.io cimport parquet_metadata as cpp_parquet_metadata +__all__ = [ + "ParquetColumnSchema", + "ParquetMetadata", + "ParquetSchema", + "read_parquet_metadata", +] + cdef class ParquetColumnSchema: """ Schema of a parquet column, including the nested columns. @@ -164,7 +171,7 @@ cdef class ParquetMetadata: Returns ------- - dict[bytes, bytes] + dict[str, str] Key value metadata as a map. 
""" return {key.decode(): val.decode() for key, val in self.meta.metadata()} diff --git a/python/pylibcudf/pylibcudf/io/text.pyx b/python/pylibcudf/pylibcudf/io/text.pyx index 667a054baaa..d3cbdc4cd60 100644 --- a/python/pylibcudf/pylibcudf/io/text.pyx +++ b/python/pylibcudf/pylibcudf/io/text.pyx @@ -10,6 +10,15 @@ from pylibcudf.column cimport Column from pylibcudf.libcudf.column.column cimport column from pylibcudf.libcudf.io cimport text as cpp_text +__all__ = [ + "DataChunkSource", + "ParseOptions", + "make_source", + "make_source_from_bgzip_file", + "make_source_from_file", + "multibyte_split", +] + cdef class ParseOptions: """ Parsing options for `multibyte_split` diff --git a/python/pylibcudf/pylibcudf/io/timezone.pyi b/python/pylibcudf/pylibcudf/io/timezone.pyi new file mode 100644 index 00000000000..0582800c4af --- /dev/null +++ b/python/pylibcudf/pylibcudf/io/timezone.pyi @@ -0,0 +1,7 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from pylibcudf.table import Table + +def make_timezone_transition_table( + tzif_dir: str, timezone_name: str +) -> Table: ... diff --git a/python/pylibcudf/pylibcudf/io/timezone.pyx b/python/pylibcudf/pylibcudf/io/timezone.pyx index f120b65fb2c..af7cf8a4ee5 100644 --- a/python/pylibcudf/pylibcudf/io/timezone.pyx +++ b/python/pylibcudf/pylibcudf/io/timezone.pyx @@ -11,6 +11,7 @@ from pylibcudf.libcudf.table.table cimport table from ..table cimport Table +__all__ = ["make_timezone_transition_table"] cpdef Table make_timezone_transition_table(str tzif_dir, str timezone_name): """ diff --git a/python/pylibcudf/pylibcudf/io/types.pyi b/python/pylibcudf/pylibcudf/io/types.pyi new file mode 100644 index 00000000000..a4f4fc13bdc --- /dev/null +++ b/python/pylibcudf/pylibcudf/io/types.pyi @@ -0,0 +1,97 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. 
+import io +import os +from collections.abc import Mapping +from enum import IntEnum +from typing import Any, Literal, TypeAlias, overload + +from pylibcudf.column import Column +from pylibcudf.io.datasource import Datasource +from pylibcudf.table import Table + +class JSONRecoveryMode(IntEnum): + FAIL = ... + RECOVER_WITH_NULL = ... + +class CompressionType(IntEnum): + NONE = ... + AUTO = ... + SNAPPY = ... + GZIP = ... + BZIP2 = ... + BROTLI = ... + ZIP = ... + XZ = ... + ZLIB = ... + LZ4 = ... + LZO = ... + ZSTD = ... + +class ColumnEncoding(IntEnum): + USE_DEFAULT = ... + DICTIONARY = ... + PLAIN = ... + DELTA_BINARY_PACKED = ... + DELTA_LENGTH_BYTE_ARRAY = ... + DELTA_BYTE_ARRAY = ... + BYTE_STREAM_SPLIT = ... + DIRECT = ... + DIRECT_V2 = ... + DICTIONARY_V2 = ... + +class DictionaryPolicy(IntEnum): + NEVER = ... + ADAPTIVE = ... + ALWAYS = ... + +class StatisticsFreq(IntEnum): + STATISTICS_NONE = ... + STATISTICS_ROWGROUP = ... + STATISTICS_PAGE = ... + STATISTICS_COLUMN = ... + +class QuoteStyle(IntEnum): + MINIMAL = ... + ALL = ... + NONNUMERIC = ... + NONE = ... + +ColumnNameSpec: TypeAlias = tuple[str, list[ColumnNameSpec]] +ChildNameSpec: TypeAlias = Mapping[str, ChildNameSpec] + +class TableWithMetadata: + tbl: Table + def __init__( + self, tbl: Table, column_names: list[ColumnNameSpec] + ) -> None: ... + @property + def columns(self) -> list[Column]: ... + @overload + def column_names(self, include_children: Literal[False]) -> list[str]: ... + @overload + def column_names( + self, include_children: Literal[True] + ) -> list[ColumnNameSpec]: ... + @overload + def column_names( + self, include_children: bool = False + ) -> list[str] | list[ColumnNameSpec]: ... + @property + def child_names(self) -> ChildNameSpec: ... + @property + def per_file_user_data(self) -> list[Mapping[str, str]]: ... + +class SourceInfo: + def __init__( + self, sources: list[str] | list[os.PathLike[Any]] | list[Datasource] + ) -> None: ... 
+ +class SinkInfo: + def __init__( + self, + sinks: list[os.PathLike[Any]] + | list[io.StringIO] + | list[io.BytesIO] + | list[io.TextIOBase] + | list[str], + ) -> None: ... diff --git a/python/pylibcudf/pylibcudf/io/types.pyx b/python/pylibcudf/pylibcudf/io/types.pyx index c129903f8f1..5db4eeb9583 100644 --- a/python/pylibcudf/pylibcudf/io/types.pyx +++ b/python/pylibcudf/pylibcudf/io/types.pyx @@ -28,9 +28,21 @@ from pylibcudf.libcudf.io.types import ( compression_type as CompressionType, # no-cython-lint column_encoding as ColumnEncoding, # no-cython-lint dictionary_policy as DictionaryPolicy, # no-cython-lint + quote_style as QuoteStyle, # no-cython-lint statistics_freq as StatisticsFreq, # no-cython-lint ) +__all__ = [ + "ColumnEncoding", + "CompressionType", + "DictionaryPolicy", + "JSONRecoveryMode", + "QuoteStyle", + "SinkInfo", + "SourceInfo", + "StatisticsFreq", + "TableWithMetadata", +] cdef class TableWithMetadata: """A container holding a table and its associated metadata @@ -54,6 +66,8 @@ cdef class TableWithMetadata: self.metadata.schema_info = self._make_column_info(column_names) + __hash__ = None + cdef vector[column_name_info] _make_column_info(self, list column_names): cdef vector[column_name_info] col_name_infos cdef column_name_info info @@ -219,6 +233,8 @@ cdef class SourceInfo: self.c_obj = source_info(c_host_buffers) + __hash__ = None + # Adapts a python io.IOBase object as a libcudf IO data_sink. This lets you # write from cudf to any python file-like object (File/BytesIO/SocketIO etc) @@ -301,3 +317,5 @@ cdef class SinkInfo: else: # we don't have sinks so we must have paths to sinks self.c_obj = sink_info(paths) + + __hash__ = None diff --git a/python/pylibcudf/pylibcudf/join.pyi b/python/pylibcudf/pylibcudf/join.pyi new file mode 100644 index 00000000000..f34357baa67 --- /dev/null +++ b/python/pylibcudf/pylibcudf/join.pyi @@ -0,0 +1,78 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. 
+ +from pylibcudf.column import Column +from pylibcudf.expressions import Expression +from pylibcudf.table import Table +from pylibcudf.types import NullEquality + +def inner_join( + left_keys: Table, right_keys: Table, nulls_equal: NullEquality +) -> tuple[Column, Column]: ... +def left_join( + left_keys: Table, right_keys: Table, nulls_equal: NullEquality +) -> tuple[Column, Column]: ... +def full_join( + left_keys: Table, right_keys: Table, nulls_equal: NullEquality +) -> tuple[Column, Column]: ... +def left_semi_join( + left_keys: Table, right_keys: Table, nulls_equal: NullEquality +) -> Column: ... +def left_anti_join( + left_keys: Table, right_keys: Table, nulls_equal: NullEquality +) -> Column: ... +def cross_join(left: Table, right: Table) -> Table: ... +def conditional_inner_join( + left: Table, right: Table, binary_predicate: Expression +) -> tuple[Column, Column]: ... +def conditional_left_join( + left: Table, right: Table, binary_predicate: Expression +) -> tuple[Column, Column]: ... +def conditional_full_join( + left: Table, right: Table, binary_predicate: Expression +) -> tuple[Column, Column]: ... +def conditional_left_semi_join( + left: Table, right: Table, binary_predicate: Expression +) -> Column: ... +def conditional_left_anti_join( + left: Table, right: Table, binary_predicate: Expression +) -> Column: ... +def mixed_inner_join( + left_keys: Table, + right_keys: Table, + left_conditional: Table, + right_conditional: Table, + binary_predicate: Expression, + nulls_equal: NullEquality, +) -> tuple[Column, Column]: ... +def mixed_left_join( + left_keys: Table, + right_keys: Table, + left_conditional: Table, + right_conditional: Table, + binary_predicate: Expression, + nulls_equal: NullEquality, +) -> tuple[Column, Column]: ... +def mixed_full_join( + left_keys: Table, + right_keys: Table, + left_conditional: Table, + right_conditional: Table, + binary_predicate: Expression, + nulls_equal: NullEquality, +) -> tuple[Column, Column]: ... 
+def mixed_left_semi_join( + left_keys: Table, + right_keys: Table, + left_conditional: Table, + right_conditional: Table, + binary_predicate: Expression, + nulls_equal: NullEquality, +) -> Column: ... +def mixed_left_anti_join( + left_keys: Table, + right_keys: Table, + left_conditional: Table, + right_conditional: Table, + binary_predicate: Expression, + nulls_equal: NullEquality, +) -> Column: ... diff --git a/python/pylibcudf/pylibcudf/join.pyx b/python/pylibcudf/pylibcudf/join.pyx index 0d841eee194..c2efe05ffc4 100644 --- a/python/pylibcudf/pylibcudf/join.pyx +++ b/python/pylibcudf/pylibcudf/join.pyx @@ -15,6 +15,24 @@ from .column cimport Column from .expressions cimport Expression from .table cimport Table +__all__ = [ + "conditional_full_join", + "conditional_inner_join", + "conditional_left_anti_join", + "conditional_left_join", + "conditional_left_semi_join", + "cross_join", + "full_join", + "inner_join", + "left_anti_join", + "left_join", + "left_semi_join", + "mixed_full_join", + "mixed_inner_join", + "mixed_left_anti_join", + "mixed_left_join", + "mixed_left_semi_join", +] cdef Column _column_from_gather_map(cpp_join.gather_map_type gather_map): # helper to convert a gather map to a Column diff --git a/python/pylibcudf/pylibcudf/json.pyi b/python/pylibcudf/pylibcudf/json.pyi new file mode 100644 index 00000000000..b93d4876dab --- /dev/null +++ b/python/pylibcudf/pylibcudf/json.pyi @@ -0,0 +1,23 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from pylibcudf.column import Column +from pylibcudf.scalar import Scalar + +class GetJsonObjectOptions: + def __init__( + self, + *, + allow_single_quotes: bool = False, + strip_quotes_from_single_strings: bool = True, + missing_fields_as_nulls: bool = False, + ) -> None: ... + def get_allow_single_quotes(self) -> bool: ... + def get_strip_quotes_from_single_strings(self) -> bool: ... + def get_missing_fields_as_nulls(self) -> bool: ... + def set_allow_single_quotes(self, val: bool) -> None: ... 
+ def set_strip_quotes_from_single_strings(self, val: bool) -> None: ... + def set_missing_fields_as_nulls(self, val: bool) -> None: ... + +def get_json_object( + col: Column, json_path: Scalar, options: GetJsonObjectOptions | None = None +) -> Column: ... diff --git a/python/pylibcudf/pylibcudf/json.pyx b/python/pylibcudf/pylibcudf/json.pyx index ebb82f80408..5ec1e1be971 100644 --- a/python/pylibcudf/pylibcudf/json.pyx +++ b/python/pylibcudf/pylibcudf/json.pyx @@ -10,6 +10,7 @@ from pylibcudf.libcudf.column.column cimport column from pylibcudf.libcudf.scalar.scalar cimport string_scalar from pylibcudf.scalar cimport Scalar +__all__ = ["GetJsonObjectOptions", "get_json_object"] cdef class GetJsonObjectOptions: """Settings for ``get_json_object()``""" @@ -26,6 +27,8 @@ cdef class GetJsonObjectOptions: ) self.set_missing_fields_as_nulls(missing_fields_as_nulls) + __hash__ = None + def get_allow_single_quotes(self): """ Returns true/false depending on whether single-quotes for representing strings diff --git a/python/pylibcudf/pylibcudf/labeling.pxd b/python/pylibcudf/pylibcudf/labeling.pxd index 6f8797ae7d3..b1f9f2e806d 100644 --- a/python/pylibcudf/pylibcudf/labeling.pxd +++ b/python/pylibcudf/pylibcudf/labeling.pxd @@ -8,7 +8,7 @@ from .column cimport Column cpdef Column label_bins( Column input, Column left_edges, - bool left_inclusive, + inclusive left_inclusive, Column right_edges, - bool right_inclusive + inclusive right_inclusive ) diff --git a/python/pylibcudf/pylibcudf/labeling.pyi b/python/pylibcudf/pylibcudf/labeling.pyi new file mode 100644 index 00000000000..c3a75d10baf --- /dev/null +++ b/python/pylibcudf/pylibcudf/labeling.pyi @@ -0,0 +1,17 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from enum import IntEnum + +from pylibcudf.column import Column + +class Inclusive(IntEnum): + YES = ... + NO = ... 
+ +def label_bins( + input: Column, + left_edges: Column, + left_inclusive: Inclusive, + right_edges: Column, + right_inclusive: Inclusive, +) -> Column: ... diff --git a/python/pylibcudf/pylibcudf/labeling.pyx b/python/pylibcudf/pylibcudf/labeling.pyx index 226a9e14172..cae1830f6b9 100644 --- a/python/pylibcudf/pylibcudf/labeling.pyx +++ b/python/pylibcudf/pylibcudf/labeling.pyx @@ -10,13 +10,14 @@ from pylibcudf.libcudf.labeling import inclusive as Inclusive # no-cython-lint from .column cimport Column +__all__ = ["Inclusive", "label_bins"] cpdef Column label_bins( Column input, Column left_edges, - bool left_inclusive, + inclusive left_inclusive, Column right_edges, - bool right_inclusive + inclusive right_inclusive ): """Labels elements based on membership in the specified bins. @@ -28,11 +29,11 @@ cpdef Column label_bins( Column of input elements to label according to the specified bins. left_edges : Column Column of the left edge of each bin. - left_inclusive : bool + left_inclusive : Inclusive Whether or not the left edge is inclusive. right_edges : Column Column of the right edge of each bin. - right_inclusive : bool + right_inclusive : Inclusive Whether or not the right edge is inclusive. Returns @@ -42,24 +43,13 @@ cpdef Column label_bins( according to the specified bins. 
""" cdef unique_ptr[column] c_result - cdef inclusive c_left_inclusive = ( - inclusive.YES - if left_inclusive - else inclusive.NO - ) - cdef inclusive c_right_inclusive = ( - inclusive.YES - if right_inclusive - else inclusive.NO - ) - with nogil: c_result = cpp_labeling.label_bins( input.view(), left_edges.view(), - c_left_inclusive, + left_inclusive, right_edges.view(), - c_right_inclusive, + right_inclusive, ) return Column.from_libcudf(move(c_result)) diff --git a/python/pylibcudf/pylibcudf/libcudf/CMakeLists.txt b/python/pylibcudf/pylibcudf/libcudf/CMakeLists.txt index 15beaee47d4..00669ff579a 100644 --- a/python/pylibcudf/pylibcudf/libcudf/CMakeLists.txt +++ b/python/pylibcudf/pylibcudf/libcudf/CMakeLists.txt @@ -24,4 +24,5 @@ rapids_cython_create_modules( LINKED_LIBRARIES "${linked_libraries}" ASSOCIATED_TARGETS cudf MODULE_PREFIX cpp ) add_subdirectory(io) +add_subdirectory(lists) add_subdirectory(strings) diff --git a/python/pylibcudf/pylibcudf/libcudf/lists/CMakeLists.txt b/python/pylibcudf/pylibcudf/libcudf/lists/CMakeLists.txt new file mode 100644 index 00000000000..c896db2c85a --- /dev/null +++ b/python/pylibcudf/pylibcudf/libcudf/lists/CMakeLists.txt @@ -0,0 +1,23 @@ +# ============================================================================= +# Copyright (c) 2024, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software distributed under the License +# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express +# or implied. See the License for the specific language governing permissions and limitations under +# the License. 
+# ============================================================================= + +set(cython_sources combine.pyx contains.pyx) + +set(linked_libraries cudf::cudf) + +rapids_cython_create_modules( + CXX + SOURCE_FILES "${cython_sources}" + LINKED_LIBRARIES "${linked_libraries}" ASSOCIATED_TARGETS cudf MODULE_PREFIX cpp_lists +) diff --git a/python/pylibcudf/pylibcudf/libcudf/lists/combine.pxd b/python/pylibcudf/pylibcudf/libcudf/lists/combine.pxd index d077958ce03..09a5d84c64f 100644 --- a/python/pylibcudf/pylibcudf/libcudf/lists/combine.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/lists/combine.pxd @@ -1,5 +1,6 @@ # Copyright (c) 2021-2024, NVIDIA CORPORATION. +from libc.stdint cimport int32_t from libcpp.memory cimport unique_ptr from pylibcudf.libcudf.column.column cimport column from pylibcudf.libcudf.column.column_view cimport column_view @@ -9,10 +10,9 @@ from pylibcudf.libcudf.table.table_view cimport table_view cdef extern from "cudf/lists/combine.hpp" namespace \ "cudf::lists" nogil: - ctypedef enum concatenate_null_policy: - IGNORE "cudf::lists::concatenate_null_policy::IGNORE" - NULLIFY_OUTPUT_ROW \ - "cudf::lists::concatenate_null_policy::NULLIFY_OUTPUT_ROW" + cpdef enum class concatenate_null_policy(int32_t): + IGNORE + NULLIFY_OUTPUT_ROW cdef unique_ptr[column] concatenate_rows( const table_view input_table diff --git a/python/pylibcudf/pylibcudf/libcudf/lists/combine.pyx b/python/pylibcudf/pylibcudf/libcudf/lists/combine.pyx new file mode 100644 index 00000000000..e69de29bb2d diff --git a/python/pylibcudf/pylibcudf/libcudf/lists/contains.pyx b/python/pylibcudf/pylibcudf/libcudf/lists/contains.pyx new file mode 100644 index 00000000000..e69de29bb2d diff --git a/python/pylibcudf/pylibcudf/lists.pxd b/python/pylibcudf/pylibcudf/lists.pxd index e7d006e6e2e..10c1c26e24e 100644 --- a/python/pylibcudf/pylibcudf/lists.pxd +++ b/python/pylibcudf/pylibcudf/lists.pxd @@ -1,7 +1,11 @@ # Copyright (c) 2024, NVIDIA CORPORATION. 
from libcpp cimport bool -from pylibcudf.libcudf.types cimport null_order, size_type +from pylibcudf.libcudf.types cimport ( + nan_equality, null_equality, null_order, order, size_type +) +from pylibcudf.libcudf.lists.combine cimport concatenate_null_policy +from pylibcudf.libcudf.lists.contains cimport duplicate_find_option from .column cimport Column from .scalar cimport Scalar @@ -19,13 +23,13 @@ cpdef Table explode_outer(Table, size_type explode_column_idx) cpdef Column concatenate_rows(Table) -cpdef Column concatenate_list_elements(Column, bool dropna) +cpdef Column concatenate_list_elements(Column, concatenate_null_policy null_policy) cpdef Column contains(Column, ColumnOrScalar) cpdef Column contains_nulls(Column) -cpdef Column index_of(Column, ColumnOrScalar, bool) +cpdef Column index_of(Column, ColumnOrScalar, duplicate_find_option) cpdef Column reverse(Column) @@ -37,16 +41,24 @@ cpdef Column count_elements(Column) cpdef Column sequences(Column, Column, Column steps = *) -cpdef Column sort_lists(Column, bool, null_order, bool stable = *) +cpdef Column sort_lists(Column, order, null_order, bool stable = *) -cpdef Column difference_distinct(Column, Column, bool nulls_equal=*, bool nans_equal=*) +cpdef Column difference_distinct( + Column, Column, null_equality nulls_equal=*, nan_equality nans_equal=* +) -cpdef Column have_overlap(Column, Column, bool nulls_equal=*, bool nans_equal=*) +cpdef Column have_overlap( + Column, Column, null_equality nulls_equal=*, nan_equality nans_equal=* +) -cpdef Column intersect_distinct(Column, Column, bool nulls_equal=*, bool nans_equal=*) +cpdef Column intersect_distinct( + Column, Column, null_equality nulls_equal=*, nan_equality nans_equal=* +) -cpdef Column union_distinct(Column, Column, bool nulls_equal=*, bool nans_equal=*) +cpdef Column union_distinct( + Column, Column, null_equality nulls_equal=*, nan_equality nans_equal=* +) cpdef Column apply_boolean_mask(Column, Column) -cpdef Column distinct(Column, bool, bool) 
+cpdef Column distinct(Column, null_equality, nan_equality) diff --git a/python/pylibcudf/pylibcudf/lists.pyi b/python/pylibcudf/pylibcudf/lists.pyi new file mode 100644 index 00000000000..dff6c400638 --- /dev/null +++ b/python/pylibcudf/pylibcudf/lists.pyi @@ -0,0 +1,70 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from enum import IntEnum + +from pylibcudf.column import Column +from pylibcudf.scalar import Scalar +from pylibcudf.table import Table +from pylibcudf.types import NanEquality, NullEquality, NullOrder, Order + +class ConcatenateNullPolicy(IntEnum): + IGNORE = ... + NULLIFY_OUTPUT_ROW = ... + +class DuplicateFindOption(IntEnum): + FIND_FIRST = ... + FIND_LAST = ... + +def explode_outer(input: Table, explode_column_idx: int) -> Table: ... +def concatenate_rows(input: Table) -> Column: ... +def concatenate_list_elements( + input: Column, null_policy: ConcatenateNullPolicy +) -> Column: ... +def contains(input: Column, search_key: Column | Scalar) -> Column: ... +def contains_nulls(input: Column) -> Column: ... +def index_of( + input: Column, + search_key: Column | Scalar, + find_option: DuplicateFindOption, +) -> Column: ... +def reverse(input: Column) -> Column: ... +def segmented_gather(input: Column, gather_map_list: Column) -> Column: ... +def extract_list_element(input: Column, index: Column | int) -> Column: ... +def count_elements(input: Column) -> Column: ... +def sequences( + starts: Column, sizes: Column, steps: Column | None = None +) -> Column: ... +def sort_lists( + input: Column, + sort_order: Order, + na_position: NullOrder, + stable: bool = False, +) -> Column: ... +def difference_distinct( + lhs: Column, + rhs: Column, + nulls_equal: NullEquality = NullEquality.EQUAL, + nans_equal: NanEquality = NanEquality.ALL_EQUAL, +) -> Column: ... +def have_overlap( + lhs: Column, + rhs: Column, + nulls_equal: NullEquality = NullEquality.EQUAL, + nans_equal: NanEquality = NanEquality.ALL_EQUAL, +) -> Column: ... 
+def intersect_distinct( + lhs: Column, + rhs: Column, + nulls_equal: NullEquality = NullEquality.EQUAL, + nans_equal: NanEquality = NanEquality.ALL_EQUAL, +) -> Column: ... +def union_distinct( + lhs: Column, + rhs: Column, + nulls_equal: NullEquality = NullEquality.EQUAL, + nans_equal: NanEquality = NanEquality.ALL_EQUAL, +) -> Column: ... +def apply_boolean_mask(input: Column, boolean_mask: Column) -> Column: ... +def distinct( + input: Column, nulls_equal: NullEquality, nans_equal: NanEquality +) -> Column: ... diff --git a/python/pylibcudf/pylibcudf/lists.pyx b/python/pylibcudf/pylibcudf/lists.pyx index ecaf62d6895..ccc56eaa520 100644 --- a/python/pylibcudf/pylibcudf/lists.pyx +++ b/python/pylibcudf/pylibcudf/lists.pyx @@ -42,10 +42,35 @@ from pylibcudf.libcudf.types cimport ( ) from pylibcudf.lists cimport ColumnOrScalar, ColumnOrSizeType +from pylibcudf.libcudf.lists.combine import concatenate_null_policy as ConcatenateNullPolicy # no-cython-lint +from pylibcudf.libcudf.lists.contains import duplicate_find_option as DuplicateFindOption # no-cython-lint + from .column cimport Column, ListColumnView from .scalar cimport Scalar from .table cimport Table +__all__ = [ + "ConcatenateNullPolicy", + "DuplicateFindOption", + "apply_boolean_mask", + "concatenate_list_elements", + "concatenate_rows", + "contains", + "contains_nulls", + "count_elements", + "difference_distinct", + "distinct", + "explode_outer", + "extract_list_element", + "have_overlap", + "index_of", + "intersect_distinct", + "reverse", + "segmented_gather", + "sequences", + "sort_lists", + "union_distinct", +] cpdef Table explode_outer(Table input, size_type explode_column_idx): """Explode a column of lists into rows. 
@@ -97,7 +122,9 @@ cpdef Column concatenate_rows(Table input): return Column.from_libcudf(move(c_result)) -cpdef Column concatenate_list_elements(Column input, bool dropna): +cpdef Column concatenate_list_elements( + Column input, concatenate_null_policy null_policy +): """Concatenate multiple lists on the same row into a single list. For details, see :cpp:func:`concatenate_list_elements`. @@ -106,20 +133,14 @@ cpdef Column concatenate_list_elements(Column input, bool dropna): ---------- input : Column The input column - dropna : bool - If true, null list elements will be ignored - from concatenation. Otherwise any input null values will result in - the corresponding output row being set to null. + null_policy : ConcatenateNullPolicy + How to treat null list elements. Returns ------- Column A new Column of concatenated list elements """ - cdef concatenate_null_policy null_policy = ( - concatenate_null_policy.IGNORE if dropna - else concatenate_null_policy.NULLIFY_OUTPUT_ROW - ) cdef unique_ptr[column] c_result with nogil: @@ -191,7 +212,9 @@ cpdef Column contains_nulls(Column input): return Column.from_libcudf(move(c_result)) -cpdef Column index_of(Column input, ColumnOrScalar search_key, bool find_first_option): +cpdef Column index_of( + Column input, ColumnOrScalar search_key, duplicate_find_option find_option +): """Create a column of index values indicating the position of a search key row within the corresponding list row in the lists column. @@ -207,9 +230,8 @@ cpdef Column index_of(Column input, ColumnOrScalar search_key, bool find_first_o The input column. search_key : Union[Column, Scalar] The search key. - find_first_option : bool - If true, index_of returns the first match. - Otherwise the last match is returned. + find_option : DuplicateFindOption + Which match to return if there are duplicates. 
Returns ------- @@ -220,11 +242,6 @@ cpdef Column index_of(Column input, ColumnOrScalar search_key, bool find_first_o """ cdef unique_ptr[column] c_result cdef ListColumnView list_view = input.list_view() - cdef cpp_contains.duplicate_find_option find_option = ( - cpp_contains.duplicate_find_option.FIND_FIRST if find_first_option - else cpp_contains.duplicate_find_option.FIND_LAST - ) - with nogil: c_result = cpp_contains.index_of( list_view.view(), @@ -380,7 +397,7 @@ cpdef Column sequences(Column starts, Column sizes, Column steps = None): cpdef Column sort_lists( Column input, - bool ascending, + order sort_order, null_order na_position, bool stable = False ): @@ -392,8 +409,8 @@ cpdef Column sort_lists( ---------- input : Column The input column. - ascending : bool - If true, the sort order is ascending. Otherwise, the sort order is descending. + sort_order : Order + Sort order in the list. na_position : NullOrder If na_position equals NullOrder.FIRST, then the null values in the output column are placed first. Otherwise, they are be placed after. @@ -409,21 +426,17 @@ cpdef Column sort_lists( cdef unique_ptr[column] c_result cdef ListColumnView list_view = input.list_view() - cdef order c_sort_order = ( - order.ASCENDING if ascending else order.DESCENDING - ) - with nogil: if stable: c_result = cpp_stable_sort_lists( list_view.view(), - c_sort_order, + sort_order, na_position, ) else: c_result = cpp_sort_lists( list_view.view(), - c_sort_order, + sort_order, na_position, ) return Column.from_libcudf(move(c_result)) @@ -432,8 +445,8 @@ cpdef Column sort_lists( cpdef Column difference_distinct( Column lhs, Column rhs, - bool nulls_equal=True, - bool nans_equal=True + null_equality nulls_equal=null_equality.EQUAL, + nan_equality nans_equal=nan_equality.ALL_EQUAL, ): """Create a column of index values indicating the position of a search key row within the corresponding list row in the lists column.
@@ -446,11 +459,10 @@ cpdef Column difference_distinct( The input lists column of elements that may be included. rhs : Column The input lists column of elements to exclude. - nulls_equal : bool, default True - If true, null elements are considered equal. Otherwise, unequal. - nans_equal : bool, default True - If true, libcudf will treat nan elements from {-nan, +nan} - as equal. Otherwise, unequal. Otherwise, unequal. + nulls_equal : NullEquality, default EQUAL + Are nulls considered equal. + nans_equal : NanEquality, default ALL_EQUAL + Are nans considered equal. Returns ------- @@ -461,19 +473,12 @@ cpdef Column difference_distinct( cdef ListColumnView lhs_view = lhs.list_view() cdef ListColumnView rhs_view = rhs.list_view() - cdef null_equality c_nulls_equal = ( - null_equality.EQUAL if nulls_equal else null_equality.UNEQUAL - ) - cdef nan_equality c_nans_equal = ( - nan_equality.ALL_EQUAL if nans_equal else nan_equality.UNEQUAL - ) - with nogil: c_result = cpp_set_operations.difference_distinct( lhs_view.view(), rhs_view.view(), - c_nulls_equal, - c_nans_equal, + nulls_equal, + nans_equal, ) return Column.from_libcudf(move(c_result)) @@ -481,8 +486,8 @@ cpdef Column difference_distinct( cpdef Column have_overlap( Column lhs, Column rhs, - bool nulls_equal=True, - bool nans_equal=True + null_equality nulls_equal=null_equality.EQUAL, + nan_equality nans_equal=nan_equality.ALL_EQUAL, ): """Check if lists at each row of the given lists columns overlap. @@ -494,11 +499,10 @@ cpdef Column have_overlap( The input lists column for one side. rhs : Column The input lists column for the other side. - nulls_equal : bool, default True - If true, null elements are considered equal. Otherwise, unequal. - nans_equal : bool, default True - If true, libcudf will treat nan elements from {-nan, +nan} - as equal. Otherwise, unequal. Otherwise, unequal. + nulls_equal : NullEquality, default EQUAL + Are nulls considered equal. 
+ nans_equal : NanEquality, default ALL_EQUAL + Are nans considered equal. Returns ------- @@ -509,19 +513,12 @@ cpdef Column have_overlap( cdef ListColumnView lhs_view = lhs.list_view() cdef ListColumnView rhs_view = rhs.list_view() - cdef null_equality c_nulls_equal = ( - null_equality.EQUAL if nulls_equal else null_equality.UNEQUAL - ) - cdef nan_equality c_nans_equal = ( - nan_equality.ALL_EQUAL if nans_equal else nan_equality.UNEQUAL - ) - with nogil: c_result = cpp_set_operations.have_overlap( lhs_view.view(), rhs_view.view(), - c_nulls_equal, - c_nans_equal, + nulls_equal, + nans_equal, ) return Column.from_libcudf(move(c_result)) @@ -529,8 +526,8 @@ cpdef Column have_overlap( cpdef Column intersect_distinct( Column lhs, Column rhs, - bool nulls_equal=True, - bool nans_equal=True + null_equality nulls_equal=null_equality.EQUAL, + nan_equality nans_equal=nan_equality.ALL_EQUAL, ): """Create a lists column of distinct elements common to two input lists columns. @@ -542,11 +539,10 @@ cpdef Column intersect_distinct( The input lists column of elements that may be included. rhs : Column The input lists column of elements to exclude. - nulls_equal : bool, default True - If true, null elements are considered equal. Otherwise, unequal. - nans_equal : bool, default True - If true, libcudf will treat nan elements from {-nan, +nan} - as equal. Otherwise, unequal. Otherwise, unequal. + nulls_equal : NullEquality, default EQUAL + Are nulls considered equal. + nans_equal : NanEquality, default ALL_EQUAL + Are nans considered equal. 
Returns ------- @@ -557,19 +553,12 @@ cpdef Column intersect_distinct( cdef ListColumnView lhs_view = lhs.list_view() cdef ListColumnView rhs_view = rhs.list_view() - cdef null_equality c_nulls_equal = ( - null_equality.EQUAL if nulls_equal else null_equality.UNEQUAL - ) - cdef nan_equality c_nans_equal = ( - nan_equality.ALL_EQUAL if nans_equal else nan_equality.UNEQUAL - ) - with nogil: c_result = cpp_set_operations.intersect_distinct( lhs_view.view(), rhs_view.view(), - c_nulls_equal, - c_nans_equal, + nulls_equal, + nans_equal, ) return Column.from_libcudf(move(c_result)) @@ -577,8 +566,8 @@ cpdef Column intersect_distinct( cpdef Column union_distinct( Column lhs, Column rhs, - bool nulls_equal=True, - bool nans_equal=True + null_equality nulls_equal=null_equality.EQUAL, + nan_equality nans_equal=nan_equality.ALL_EQUAL, ): """Create a lists column of distinct elements found in either of two input lists columns. @@ -591,11 +580,10 @@ cpdef Column union_distinct( The input lists column of elements that may be included. rhs : Column The input lists column of elements to exclude. - nulls_equal : bool, default True - If true, null elements are considered equal. Otherwise, unequal. - nans_equal : bool, default True - If true, libcudf will treat nan elements from {-nan, +nan} - as equal. Otherwise, unequal. Otherwise, unequal. + nulls_equal : NullEquality, default EQUAL + Are nulls considered equal. + nans_equal : NanEquality, default ALL_EQUAL + Are nans considered equal. 
Returns ------- @@ -606,19 +594,12 @@ cpdef Column union_distinct( cdef ListColumnView lhs_view = lhs.list_view() cdef ListColumnView rhs_view = rhs.list_view() - cdef null_equality c_nulls_equal = ( - null_equality.EQUAL if nulls_equal else null_equality.UNEQUAL - ) - cdef nan_equality c_nans_equal = ( - nan_equality.ALL_EQUAL if nans_equal else nan_equality.UNEQUAL - ) - with nogil: c_result = cpp_set_operations.union_distinct( lhs_view.view(), rhs_view.view(), - c_nulls_equal, - c_nans_equal, + nulls_equal, + nans_equal, ) return Column.from_libcudf(move(c_result)) @@ -651,7 +632,7 @@ cpdef Column apply_boolean_mask(Column input, Column boolean_mask): return Column.from_libcudf(move(c_result)) -cpdef Column distinct(Column input, bool nulls_equal, bool nans_equal): +cpdef Column distinct(Column input, null_equality nulls_equal, nan_equality nans_equal): """Create a new list column without duplicate elements in each list. For details, see :cpp:func:`distinct`. @@ -660,11 +641,10 @@ cpdef Column distinct(Column input, bool nulls_equal, bool nans_equal): ---------- input : Column The input column. - nulls_equal : bool - If true, null elements are considered equal. Otherwise, unequal. - nans_equal : bool - If true, libcudf will treat nan elements from {-nan, +nan} - as equal. Otherwise, unequal. Otherwise, unequal. + nulls_equal : NullEquality + Are nulls considered equal. + nans_equal : NanEquality + Are nans considered equal. 
Returns ------- @@ -674,17 +654,10 @@ cpdef Column distinct(Column input, bool nulls_equal, bool nans_equal): cdef unique_ptr[column] c_result cdef ListColumnView list_view = input.list_view() - cdef null_equality c_nulls_equal = ( - null_equality.EQUAL if nulls_equal else null_equality.UNEQUAL - ) - cdef nan_equality c_nans_equal = ( - nan_equality.ALL_EQUAL if nans_equal else nan_equality.UNEQUAL - ) - with nogil: c_result = cpp_distinct( list_view.view(), - c_nulls_equal, - c_nans_equal, + nulls_equal, + nans_equal, ) return Column.from_libcudf(move(c_result)) diff --git a/python/pylibcudf/pylibcudf/merge.pyi b/python/pylibcudf/pylibcudf/merge.pyi new file mode 100644 index 00000000000..b18eb01f8a2 --- /dev/null +++ b/python/pylibcudf/pylibcudf/merge.pyi @@ -0,0 +1,11 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from pylibcudf.table import Table +from pylibcudf.types import NullOrder, Order + +def merge( + tables_to_merge: list[Table], + key_cols: list[int], + column_order: list[Order], + null_precedence: list[NullOrder], +) -> Table: ... diff --git a/python/pylibcudf/pylibcudf/merge.pyx b/python/pylibcudf/pylibcudf/merge.pyx index 61a21aafdb2..c051cdc0c66 100644 --- a/python/pylibcudf/pylibcudf/merge.pyx +++ b/python/pylibcudf/pylibcudf/merge.pyx @@ -10,6 +10,7 @@ from pylibcudf.libcudf.types cimport null_order, order, size_type from .table cimport Table +__all__ = ["merge"] cpdef Table merge ( list tables_to_merge, diff --git a/python/pylibcudf/pylibcudf/null_mask.pyi b/python/pylibcudf/pylibcudf/null_mask.pyi new file mode 100644 index 00000000000..1a6d96a0822 --- /dev/null +++ b/python/pylibcudf/pylibcudf/null_mask.pyi @@ -0,0 +1,14 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from rmm.pylibrmm.device_buffer import DeviceBuffer + +from pylibcudf.column import Column +from pylibcudf.types import MaskState + +def copy_bitmask(col: Column) -> DeviceBuffer: ... +def bitmask_allocation_size_bytes(number_of_bits: int) -> int: ... 
+def create_null_mask( + size: int, state: MaskState = MaskState.UNINITIALIZED +) -> DeviceBuffer: ... +def bitmask_and(columns: list[Column]) -> tuple[DeviceBuffer, int]: ... +def bitmask_or(columns: list[Column]) -> tuple[DeviceBuffer, int]: ... diff --git a/python/pylibcudf/pylibcudf/null_mask.pyx b/python/pylibcudf/pylibcudf/null_mask.pyx index 74180951562..adc264e9af6 100644 --- a/python/pylibcudf/pylibcudf/null_mask.pyx +++ b/python/pylibcudf/pylibcudf/null_mask.pyx @@ -14,6 +14,13 @@ from pylibcudf.libcudf.types import mask_state as MaskState # no-cython-lint from .column cimport Column from .table cimport Table +__all__ = [ + "bitmask_allocation_size_bytes", + "bitmask_and", + "bitmask_or", + "copy_bitmask", + "create_null_mask", +] cdef DeviceBuffer buffer_to_python(device_buffer buf): return DeviceBuffer.c_from_unique_ptr(make_unique[device_buffer](move(buf))) diff --git a/python/pylibcudf/pylibcudf/nvtext/byte_pair_encode.pyi b/python/pylibcudf/pylibcudf/nvtext/byte_pair_encode.pyi new file mode 100644 index 00000000000..ca39aa16d7e --- /dev/null +++ b/python/pylibcudf/pylibcudf/nvtext/byte_pair_encode.pyi @@ -0,0 +1,11 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from pylibcudf.column import Column +from pylibcudf.scalar import Scalar + +class BPEMergePairs: + def __init__(self, merge_pairs: Column): ... + +def byte_pair_encoding( + input: Column, merge_pairs: BPEMergePairs, separator: Scalar | None = None +) -> Column: ... diff --git a/python/pylibcudf/pylibcudf/nvtext/byte_pair_encode.pyx b/python/pylibcudf/pylibcudf/nvtext/byte_pair_encode.pyx index 76caad276d4..7565b21084f 100644 --- a/python/pylibcudf/pylibcudf/nvtext/byte_pair_encode.pyx +++ b/python/pylibcudf/pylibcudf/nvtext/byte_pair_encode.pyx @@ -16,6 +16,7 @@ from pylibcudf.libcudf.scalar.scalar_factories cimport ( ) from pylibcudf.scalar cimport Scalar +__all__ = ["BPEMergePairs", "byte_pair_encoding"] cdef class BPEMergePairs: """The table of merge pairs for the BPE encoder. 
@@ -27,6 +28,8 @@ cdef class BPEMergePairs: with nogil: self.c_obj = move(cpp_load_merge_pairs(c_pairs)) + __hash__ = None + cpdef Column byte_pair_encoding( Column input, BPEMergePairs merge_pairs, diff --git a/python/pylibcudf/pylibcudf/nvtext/edit_distance.pyi b/python/pylibcudf/pylibcudf/nvtext/edit_distance.pyi new file mode 100644 index 00000000000..85bbbb880ee --- /dev/null +++ b/python/pylibcudf/pylibcudf/nvtext/edit_distance.pyi @@ -0,0 +1,6 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from pylibcudf.column import Column + +def edit_distance(input: Column, targets: Column) -> Column: ... +def edit_distance_matrix(input: Column) -> Column: ... diff --git a/python/pylibcudf/pylibcudf/nvtext/edit_distance.pyx b/python/pylibcudf/pylibcudf/nvtext/edit_distance.pyx index dcacb2e1267..eceeaff24e3 100644 --- a/python/pylibcudf/pylibcudf/nvtext/edit_distance.pyx +++ b/python/pylibcudf/pylibcudf/nvtext/edit_distance.pyx @@ -9,6 +9,7 @@ from pylibcudf.libcudf.nvtext.edit_distance cimport ( edit_distance_matrix as cpp_edit_distance_matrix, ) +__all__ = ["edit_distance", "edit_distance_matrix"] cpdef Column edit_distance(Column input, Column targets): """ diff --git a/python/pylibcudf/pylibcudf/nvtext/generate_ngrams.pyi b/python/pylibcudf/pylibcudf/nvtext/generate_ngrams.pyi new file mode 100644 index 00000000000..2757518379d --- /dev/null +++ b/python/pylibcudf/pylibcudf/nvtext/generate_ngrams.pyi @@ -0,0 +1,10 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from pylibcudf.column import Column +from pylibcudf.scalar import Scalar + +def generate_ngrams( + input: Column, ngrams: int, separator: Scalar +) -> Column: ... +def generate_character_ngrams(input: Column, ngrams: int = 2) -> Column: ... +def hash_character_ngrams(input: Column, ngrams: int = 2) -> Column: ... 
diff --git a/python/pylibcudf/pylibcudf/nvtext/generate_ngrams.pyx b/python/pylibcudf/pylibcudf/nvtext/generate_ngrams.pyx index 09859d09e9e..521bc0ef4a4 100644 --- a/python/pylibcudf/pylibcudf/nvtext/generate_ngrams.pyx +++ b/python/pylibcudf/pylibcudf/nvtext/generate_ngrams.pyx @@ -14,6 +14,11 @@ from pylibcudf.libcudf.scalar.scalar cimport string_scalar from pylibcudf.libcudf.types cimport size_type from pylibcudf.scalar cimport Scalar +__all__ = [ + "generate_ngrams", + "generate_character_ngrams", + "hash_character_ngrams", +] cpdef Column generate_ngrams(Column input, size_type ngrams, Scalar separator): """ diff --git a/python/pylibcudf/pylibcudf/nvtext/jaccard.pyi b/python/pylibcudf/pylibcudf/nvtext/jaccard.pyi new file mode 100644 index 00000000000..18263c5c8fd --- /dev/null +++ b/python/pylibcudf/pylibcudf/nvtext/jaccard.pyi @@ -0,0 +1,5 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from pylibcudf.column import Column + +def jaccard_index(input1: Column, input2: Column, width: int) -> Column: ... diff --git a/python/pylibcudf/pylibcudf/nvtext/jaccard.pyx b/python/pylibcudf/pylibcudf/nvtext/jaccard.pyx index 3d8669865d9..90cace088f7 100644 --- a/python/pylibcudf/pylibcudf/nvtext/jaccard.pyx +++ b/python/pylibcudf/pylibcudf/nvtext/jaccard.pyx @@ -10,6 +10,7 @@ from pylibcudf.libcudf.nvtext.jaccard cimport ( ) from pylibcudf.libcudf.types cimport size_type +__all__ = ["jaccard_index"] cpdef Column jaccard_index(Column input1, Column input2, size_type width): """ diff --git a/python/pylibcudf/pylibcudf/nvtext/minhash.pyi b/python/pylibcudf/pylibcudf/nvtext/minhash.pyi new file mode 100644 index 00000000000..a2d9b6364f7 --- /dev/null +++ b/python/pylibcudf/pylibcudf/nvtext/minhash.pyi @@ -0,0 +1,13 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from pylibcudf.column import Column +from pylibcudf.scalar import Scalar + +def minhash( + input: Column, seeds: Column | Scalar, width: int = 4 +) -> Column: ... 
+def minhash64( + input: Column, seeds: Column | Scalar, width: int = 4 +) -> Column: ... +def word_minhash(input: Column, seeds: Column) -> Column: ... +def word_minhash64(input: Column, seeds: Column) -> Column: ... diff --git a/python/pylibcudf/pylibcudf/nvtext/minhash.pyx b/python/pylibcudf/pylibcudf/nvtext/minhash.pyx index 5a51e32b287..5448cc6de9b 100644 --- a/python/pylibcudf/pylibcudf/nvtext/minhash.pyx +++ b/python/pylibcudf/pylibcudf/nvtext/minhash.pyx @@ -20,6 +20,12 @@ from pylibcudf.scalar cimport Scalar from cython.operator import dereference import warnings +__all__ = [ + "minhash", + "minhash64", + "word_minhash", + "word_minhash64", +] cpdef Column minhash(Column input, ColumnOrScalar seeds, size_type width=4): """ diff --git a/python/pylibcudf/pylibcudf/nvtext/ngrams_tokenize.pyi b/python/pylibcudf/pylibcudf/nvtext/ngrams_tokenize.pyi new file mode 100644 index 00000000000..224640ed44d --- /dev/null +++ b/python/pylibcudf/pylibcudf/nvtext/ngrams_tokenize.pyi @@ -0,0 +1,8 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from pylibcudf.column import Column +from pylibcudf.scalar import Scalar + +def ngrams_tokenize( + input: Column, ngrams: int, delimiter: Scalar, separator: Scalar +) -> Column: ... 
diff --git a/python/pylibcudf/pylibcudf/nvtext/ngrams_tokenize.pyx b/python/pylibcudf/pylibcudf/nvtext/ngrams_tokenize.pyx index 8a1854c5f0d..771c7c019fc 100644 --- a/python/pylibcudf/pylibcudf/nvtext/ngrams_tokenize.pyx +++ b/python/pylibcudf/pylibcudf/nvtext/ngrams_tokenize.pyx @@ -12,6 +12,7 @@ from pylibcudf.libcudf.scalar.scalar cimport string_scalar from pylibcudf.libcudf.types cimport size_type from pylibcudf.scalar cimport Scalar +__all__ = ["ngrams_tokenize"] cpdef Column ngrams_tokenize( Column input, diff --git a/python/pylibcudf/pylibcudf/nvtext/normalize.pyi b/python/pylibcudf/pylibcudf/nvtext/normalize.pyi new file mode 100644 index 00000000000..1d90a5a8960 --- /dev/null +++ b/python/pylibcudf/pylibcudf/nvtext/normalize.pyi @@ -0,0 +1,6 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from pylibcudf.column import Column + +def normalize_spaces(input: Column) -> Column: ... +def normalize_characters(input: Column, do_lower_case: bool) -> Column: ... diff --git a/python/pylibcudf/pylibcudf/nvtext/normalize.pyx b/python/pylibcudf/pylibcudf/nvtext/normalize.pyx index 637d900b659..b259ccaefa6 100644 --- a/python/pylibcudf/pylibcudf/nvtext/normalize.pyx +++ b/python/pylibcudf/pylibcudf/nvtext/normalize.pyx @@ -10,6 +10,7 @@ from pylibcudf.libcudf.nvtext.normalize cimport ( normalize_spaces as cpp_normalize_spaces, ) +__all__ = ["normalize_characters", "normalize_spaces"] cpdef Column normalize_spaces(Column input): """ diff --git a/python/pylibcudf/pylibcudf/nvtext/replace.pyi b/python/pylibcudf/pylibcudf/nvtext/replace.pyi new file mode 100644 index 00000000000..1f1ac72ce7c --- /dev/null +++ b/python/pylibcudf/pylibcudf/nvtext/replace.pyi @@ -0,0 +1,17 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from pylibcudf.column import Column +from pylibcudf.scalar import Scalar + +def replace_tokens( + input: Column, + targets: Column, + replacements: Column, + delimiter: Scalar | None = None, +) -> Column: ... 
+def filter_tokens( + input: Column, + min_token_length: int, + replacement: Scalar | None = None, + delimiter: Scalar | None = None, +) -> Column: ... diff --git a/python/pylibcudf/pylibcudf/nvtext/replace.pyx b/python/pylibcudf/pylibcudf/nvtext/replace.pyx index b65348ce14d..a27592fb434 100644 --- a/python/pylibcudf/pylibcudf/nvtext/replace.pyx +++ b/python/pylibcudf/pylibcudf/nvtext/replace.pyx @@ -16,6 +16,7 @@ from pylibcudf.libcudf.scalar.scalar_factories cimport ( from pylibcudf.libcudf.types cimport size_type from pylibcudf.scalar cimport Scalar +__all__ = ["filter_tokens", "replace_tokens"] cpdef Column replace_tokens( Column input, diff --git a/python/pylibcudf/pylibcudf/nvtext/stemmer.pyi b/python/pylibcudf/pylibcudf/nvtext/stemmer.pyi new file mode 100644 index 00000000000..d6ba1d189bd --- /dev/null +++ b/python/pylibcudf/pylibcudf/nvtext/stemmer.pyi @@ -0,0 +1,8 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from pylibcudf.column import Column + +def is_letter( + input: Column, check_vowels: bool, indices: Column | int +) -> Column: ... +def porter_stemmer_measure(input: Column) -> Column: ... diff --git a/python/pylibcudf/pylibcudf/nvtext/stemmer.pyx b/python/pylibcudf/pylibcudf/nvtext/stemmer.pyx index 854d1053624..c9e4f1274e4 100644 --- a/python/pylibcudf/pylibcudf/nvtext/stemmer.pyx +++ b/python/pylibcudf/pylibcudf/nvtext/stemmer.pyx @@ -12,6 +12,7 @@ from pylibcudf.libcudf.nvtext.stemmer cimport ( ) from pylibcudf.libcudf.types cimport size_type +__all__ = ["is_letter", "porter_stemmer_measure"] cpdef Column is_letter( Column input, diff --git a/python/pylibcudf/pylibcudf/nvtext/subword_tokenize.pyi b/python/pylibcudf/pylibcudf/nvtext/subword_tokenize.pyi new file mode 100644 index 00000000000..f6618e296b1 --- /dev/null +++ b/python/pylibcudf/pylibcudf/nvtext/subword_tokenize.pyi @@ -0,0 +1,15 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from pylibcudf.column import Column + +class HashedVocabulary: + def __init__(self, hash_file: str): ... 
+ +def subword_tokenize( + input: Column, + vocabulary_table: HashedVocabulary, + max_sequence_length: int, + stride: int, + do_lower_case: bool, + do_truncate: bool, +) -> tuple[Column, Column, Column]: ... diff --git a/python/pylibcudf/pylibcudf/nvtext/subword_tokenize.pyx b/python/pylibcudf/pylibcudf/nvtext/subword_tokenize.pyx index 04643d3bd84..14fb6f5fe1e 100644 --- a/python/pylibcudf/pylibcudf/nvtext/subword_tokenize.pyx +++ b/python/pylibcudf/pylibcudf/nvtext/subword_tokenize.pyx @@ -13,6 +13,7 @@ from pylibcudf.libcudf.nvtext.subword_tokenize cimport ( tokenizer_result as cpp_tokenizer_result, ) +__all__ = ["HashedVocabulary", "subword_tokenize"] cdef class HashedVocabulary: """The vocabulary data for use with the subword_tokenize function. @@ -24,6 +25,8 @@ cdef class HashedVocabulary: with nogil: self.c_obj = move(cpp_load_vocabulary_file(c_hash_file)) + __hash__ = None + cpdef tuple[Column, Column, Column] subword_tokenize( Column input, HashedVocabulary vocabulary_table, diff --git a/python/pylibcudf/pylibcudf/nvtext/tokenize.pyi b/python/pylibcudf/pylibcudf/nvtext/tokenize.pyi new file mode 100644 index 00000000000..b9aa2393514 --- /dev/null +++ b/python/pylibcudf/pylibcudf/nvtext/tokenize.pyi @@ -0,0 +1,26 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from pylibcudf.column import Column +from pylibcudf.scalar import Scalar + +class TokenizeVocabulary: + def __init__(self, vocab: Column): ... + +def tokenize_scalar( + input: Column, delimiter: Scalar | None = None +) -> Column: ... +def tokenize_column(input: Column, delimiters: Column) -> Column: ... +def count_tokens_scalar( + input: Column, delimiter: Scalar | None = None +) -> Column: ... +def count_tokens_column(input: Column, delimiters: Column) -> Column: ... +def character_tokenize(input: Column) -> Column: ... +def detokenize( + input: Column, row_indices: Column, separator: Scalar | None = None +) -> Column: ... 
+def tokenize_with_vocabulary( + input: Column, + vocabulary: TokenizeVocabulary, + delimiter: Scalar, + default_id: int = -1, +) -> Column: ... diff --git a/python/pylibcudf/pylibcudf/nvtext/tokenize.pyx b/python/pylibcudf/pylibcudf/nvtext/tokenize.pyx index ec02e8ebf4e..43d426489b4 100644 --- a/python/pylibcudf/pylibcudf/nvtext/tokenize.pyx +++ b/python/pylibcudf/pylibcudf/nvtext/tokenize.pyx @@ -20,6 +20,16 @@ from pylibcudf.libcudf.scalar.scalar_factories cimport ( ) from pylibcudf.libcudf.types cimport size_type +__all__ = [ + "TokenizeVocabulary", + "character_tokenize", + "count_tokens_column", + "count_tokens_scalar", + "detokenize", + "tokenize_column", + "tokenize_scalar", + "tokenize_with_vocabulary", +] cdef class TokenizeVocabulary: """The Vocabulary object to be used with ``tokenize_with_vocabulary``. @@ -31,6 +41,8 @@ cdef class TokenizeVocabulary: with nogil: self.c_obj = move(cpp_load_vocabulary(c_vocab)) + __hash__ = None + cpdef Column tokenize_scalar(Column input, Scalar delimiter=None): """ Returns a single column of strings by tokenizing the input diff --git a/python/pylibcudf/pylibcudf/partitioning.pyi b/python/pylibcudf/pylibcudf/partitioning.pyi new file mode 100644 index 00000000000..48a2ade23f1 --- /dev/null +++ b/python/pylibcudf/pylibcudf/partitioning.pyi @@ -0,0 +1,14 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from pylibcudf.column import Column +from pylibcudf.table import Table + +def hash_partition( + input: Table, columns_to_hash: list[int], num_partitions: int +) -> tuple[Table, list[int]]: ... +def partition( + t: Table, partition_map: Column, num_partitions: int +) -> tuple[Table, list[int]]: ... +def round_robin_partition( + input: Table, num_partitions: int, start_partition: int = 0 +) -> tuple[Table, list[int]]: ... 
diff --git a/python/pylibcudf/pylibcudf/partitioning.pyx b/python/pylibcudf/pylibcudf/partitioning.pyx index 3cff4843735..1dacabceb06 100644 --- a/python/pylibcudf/pylibcudf/partitioning.pyx +++ b/python/pylibcudf/pylibcudf/partitioning.pyx @@ -11,6 +11,11 @@ from pylibcudf.libcudf.table.table cimport table from .column cimport Column from .table cimport Table +__all__ = [ + "hash_partition", + "partition", + "round_robin_partition", +] cpdef tuple[Table, list] hash_partition( Table input, diff --git a/python/pylibcudf/pylibcudf/py.typed b/python/pylibcudf/pylibcudf/py.typed new file mode 100644 index 00000000000..e69de29bb2d diff --git a/python/pylibcudf/pylibcudf/quantiles.pyi b/python/pylibcudf/pylibcudf/quantiles.pyi new file mode 100644 index 00000000000..dca6eed013a --- /dev/null +++ b/python/pylibcudf/pylibcudf/quantiles.pyi @@ -0,0 +1,23 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from collections.abc import Sequence + +from pylibcudf.column import Column +from pylibcudf.table import Table +from pylibcudf.types import Interpolation, NullOrder, Order, Sorted + +def quantile( + input: Column, + q: Sequence[float], + interp: Interpolation = Interpolation.LINEAR, + ordered_indices: Column | None = None, + exact: bool = True, +) -> Column: ... +def quantiles( + input: Table, + q: Sequence[float], + interp: Interpolation = Interpolation.NEAREST, + is_input_sorted: Sorted = Sorted.NO, + column_order: list[Order] | None = None, + null_precedence: list[NullOrder] | None = None, +) -> Table: ... 
diff --git a/python/pylibcudf/pylibcudf/quantiles.pyx b/python/pylibcudf/pylibcudf/quantiles.pyx index 7d92b598bd0..634218586ac 100644 --- a/python/pylibcudf/pylibcudf/quantiles.pyx +++ b/python/pylibcudf/pylibcudf/quantiles.pyx @@ -17,6 +17,7 @@ from .column cimport Column from .table cimport Table from .types cimport interpolation +__all__ = ["quantile", "quantiles"] cpdef Column quantile( Column input, diff --git a/python/pylibcudf/pylibcudf/reduce.pyi b/python/pylibcudf/pylibcudf/reduce.pyi new file mode 100644 index 00000000000..a09949b7b30 --- /dev/null +++ b/python/pylibcudf/pylibcudf/reduce.pyi @@ -0,0 +1,16 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from enum import IntEnum + +from pylibcudf.aggregation import Aggregation +from pylibcudf.column import Column +from pylibcudf.scalar import Scalar +from pylibcudf.types import DataType + +class ScanType(IntEnum): + INCLUSIVE = ... + EXCLUSIVE = ... + +def reduce(col: Column, agg: Aggregation, data_type: DataType) -> Scalar: ... +def scan(col: Column, agg: Aggregation, inclusive: ScanType) -> Column: ... +def minmax(col: Column) -> tuple[Scalar, Scalar]: ... diff --git a/python/pylibcudf/pylibcudf/reduce.pyx b/python/pylibcudf/pylibcudf/reduce.pyx index d9ec3a9bdc4..1d6ffd9de10 100644 --- a/python/pylibcudf/pylibcudf/reduce.pyx +++ b/python/pylibcudf/pylibcudf/reduce.pyx @@ -16,6 +16,7 @@ from .types cimport DataType from pylibcudf.libcudf.reduce import scan_type as ScanType # no-cython-lint +__all__ = ["ScanType", "minmax", "reduce", "scan"] cpdef Scalar reduce(Column col, Aggregation agg, DataType data_type): """Perform a reduction on a column diff --git a/python/pylibcudf/pylibcudf/replace.pyi b/python/pylibcudf/pylibcudf/replace.pyi new file mode 100644 index 00000000000..eed7a2a6c52 --- /dev/null +++ b/python/pylibcudf/pylibcudf/replace.pyi @@ -0,0 +1,29 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. 
+ +from enum import IntEnum + +from pylibcudf.column import Column +from pylibcudf.scalar import Scalar + +class ReplacePolicy(IntEnum): + PRECEDING = ... + FOLLOWING = ... + +def replace_nulls( + source_column: Column, replacement: Column | Scalar | ReplacePolicy +) -> Column: ... +def find_and_replace_all( + source_column: Column, + values_to_replace: Column, + replacement_values: Column, +) -> Column: ... +def clamp( + source_column: Column, + lo: Scalar, + hi: Scalar, + lo_replace: Scalar | None = None, + hi_replace: Scalar | None = None, +) -> Column: ... +def normalize_nans_and_zeros( + source_column: Column, inplace: bool = False +) -> Column: ... diff --git a/python/pylibcudf/pylibcudf/replace.pyx b/python/pylibcudf/pylibcudf/replace.pyx index f77eba7ace5..51be2b29277 100644 --- a/python/pylibcudf/pylibcudf/replace.pyx +++ b/python/pylibcudf/pylibcudf/replace.pyx @@ -15,6 +15,14 @@ from pylibcudf.libcudf.replace import \ from .column cimport Column from .scalar cimport Scalar +__all__ = [ + "ReplacePolicy", + "clamp", + "find_and_replace_all", + "normalize_nans_and_zeros", + "replace_nulls", +] + cpdef Column replace_nulls(Column source_column, ReplacementType replacement): """Replace nulls in source_column. diff --git a/python/pylibcudf/pylibcudf/reshape.pyi b/python/pylibcudf/pylibcudf/reshape.pyi new file mode 100644 index 00000000000..d8d0ffcc3e0 --- /dev/null +++ b/python/pylibcudf/pylibcudf/reshape.pyi @@ -0,0 +1,7 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from pylibcudf.column import Column +from pylibcudf.table import Table + +def interleave_columns(source_table: Table) -> Column: ... +def tile(source_table: Table, count: int) -> Table: ... 
diff --git a/python/pylibcudf/pylibcudf/reshape.pyx b/python/pylibcudf/pylibcudf/reshape.pyx index 6540b5198ab..bdc212a1985 100644 --- a/python/pylibcudf/pylibcudf/reshape.pyx +++ b/python/pylibcudf/pylibcudf/reshape.pyx @@ -13,6 +13,7 @@ from pylibcudf.libcudf.types cimport size_type from .column cimport Column from .table cimport Table +__all__ = ["interleave_columns", "tile"] cpdef Column interleave_columns(Table source_table): """Interleave columns of a table into a single column. diff --git a/python/pylibcudf/pylibcudf/rolling.pyi b/python/pylibcudf/pylibcudf/rolling.pyi new file mode 100644 index 00000000000..ca0111e01ec --- /dev/null +++ b/python/pylibcudf/pylibcudf/rolling.pyi @@ -0,0 +1,12 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from pylibcudf.aggregation import Aggregation +from pylibcudf.column import Column + +def rolling_window[WindowType: (Column, int)]( + source: Column, + preceding_window: WindowType, + following_window: WindowType, + min_periods: int, + agg: Aggregation, +) -> Column: ... diff --git a/python/pylibcudf/pylibcudf/rolling.pyx b/python/pylibcudf/pylibcudf/rolling.pyx index 4fd0b005431..11acf57ccf4 100644 --- a/python/pylibcudf/pylibcudf/rolling.pyx +++ b/python/pylibcudf/pylibcudf/rolling.pyx @@ -11,6 +11,7 @@ from pylibcudf.libcudf.types cimport size_type from .aggregation cimport Aggregation from .column cimport Column +__all__ = ["rolling_window"] cpdef Column rolling_window( Column source, diff --git a/python/pylibcudf/pylibcudf/round.pyi b/python/pylibcudf/pylibcudf/round.pyi new file mode 100644 index 00000000000..410cf5de586 --- /dev/null +++ b/python/pylibcudf/pylibcudf/round.pyi @@ -0,0 +1,15 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from enum import IntEnum + +from pylibcudf.column import Column + +class RoundingMethod(IntEnum): + HALF_UP = ... + HALF_EVEN = ... + +def round( + source: Column, + decimal_places: int = 0, + round_method: RoundingMethod = RoundingMethod.HALF_UP, +) -> Column: ... 
diff --git a/python/pylibcudf/pylibcudf/round.pyx b/python/pylibcudf/pylibcudf/round.pyx index 689363e652d..09e5a9cc3bc 100644 --- a/python/pylibcudf/pylibcudf/round.pyx +++ b/python/pylibcudf/pylibcudf/round.pyx @@ -11,6 +11,7 @@ from pylibcudf.libcudf.column.column cimport column from .column cimport Column +__all__ = ["RoundingMethod", "round"] cpdef Column round( Column source, diff --git a/python/pylibcudf/pylibcudf/scalar.pyi b/python/pylibcudf/pylibcudf/scalar.pyi new file mode 100644 index 00000000000..0b72b10ef86 --- /dev/null +++ b/python/pylibcudf/pylibcudf/scalar.pyi @@ -0,0 +1,10 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from pylibcudf.column import Column +from pylibcudf.types import DataType + +class Scalar: + def type(self) -> DataType: ... + def is_valid(self) -> bool: ... + @staticmethod + def empty_like(column: Column) -> Scalar: ... diff --git a/python/pylibcudf/pylibcudf/scalar.pyx b/python/pylibcudf/pylibcudf/scalar.pyx index d4888a62ad1..1ac014e891e 100644 --- a/python/pylibcudf/pylibcudf/scalar.pyx +++ b/python/pylibcudf/pylibcudf/scalar.pyx @@ -11,6 +11,8 @@ from rmm.pylibrmm.memory_resource cimport get_current_device_resource from .column cimport Column from .types cimport DataType +__all__ = ["Scalar"] + # The DeviceMemoryResource attribute could be released prematurely # by the gc if the Scalar is in a reference cycle. Removing the tp_clear @@ -37,6 +39,8 @@ cdef class Scalar: # DeviceScalar. raise ValueError("Scalar should be constructed with a factory") + __hash__ = None + cdef const scalar* get(self) noexcept nogil: return self.c_obj.get() diff --git a/python/pylibcudf/pylibcudf/search.pyi b/python/pylibcudf/pylibcudf/search.pyi new file mode 100644 index 00000000000..7f292b129b2 --- /dev/null +++ b/python/pylibcudf/pylibcudf/search.pyi @@ -0,0 +1,19 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. 
+ +from pylibcudf.column import Column +from pylibcudf.table import Table +from pylibcudf.types import NullOrder, Order + +def lower_bound( + haystack: Table, + needles: Table, + column_order: list[Order], + null_precedence: list[NullOrder], +) -> Column: ... +def upper_bound( + haystack: Table, + needles: Table, + column_order: list[Order], + null_precedence: list[NullOrder], +) -> Column: ... +def contains(haystack: Column, needles: Column) -> Column: ... diff --git a/python/pylibcudf/pylibcudf/search.pyx b/python/pylibcudf/pylibcudf/search.pyx index 1a870248046..50353fcd0cc 100644 --- a/python/pylibcudf/pylibcudf/search.pyx +++ b/python/pylibcudf/pylibcudf/search.pyx @@ -10,6 +10,7 @@ from pylibcudf.libcudf.types cimport null_order, order from .column cimport Column from .table cimport Table +__all__ = ["contains", "lower_bound", "upper_bound"] cpdef Column lower_bound( Table haystack, diff --git a/python/pylibcudf/pylibcudf/sorting.pyi b/python/pylibcudf/pylibcudf/sorting.pyi new file mode 100644 index 00000000000..5255d869a4d --- /dev/null +++ b/python/pylibcudf/pylibcudf/sorting.pyi @@ -0,0 +1,64 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from pylibcudf.aggregation import RankMethod +from pylibcudf.column import Column +from pylibcudf.table import Table +from pylibcudf.types import NullOrder, NullPolicy, Order + +def sorted_order( + source_table: Table, + column_order: list[Order], + null_precedence: list[NullOrder], +) -> Column: ... +def stable_sorted_order( + source_table: Table, + column_order: list[Order], + null_precedence: list[NullOrder], +) -> Column: ... +def rank( + input_view: Column, + method: RankMethod, + column_order: Order, + null_handling: NullPolicy, + null_precedence: NullOrder, + percentage: bool, +) -> Column: ... +def is_sorted( + tbl: Table, column_order: list[Order], null_precedence: list[NullOrder] +) -> bool: ... 
+def segmented_sort_by_key( + values: Table, + keys: Table, + segment_offsets: Column, + column_order: list[Order], + null_precedence: list[NullOrder], +) -> Table: ... +def stable_segmented_sort_by_key( + values: Table, + keys: Table, + segment_offsets: Column, + column_order: list[Order], + null_precedence: list[NullOrder], +) -> Table: ... +def sort_by_key( + values: Table, + keys: Table, + column_order: list[Order], + null_precedence: list[NullOrder], +) -> Table: ... +def stable_sort_by_key( + values: Table, + keys: Table, + column_order: list[Order], + null_precedence: list[NullOrder], +) -> Table: ... +def sort( + source_table: Table, + column_order: list[Order], + null_precedence: list[NullOrder], +) -> Table: ... +def stable_sort( + source_table: Table, + column_order: list[Order], + null_precedence: list[NullOrder], +) -> Table: ... diff --git a/python/pylibcudf/pylibcudf/sorting.pyx b/python/pylibcudf/pylibcudf/sorting.pyx index fc40f03e1fd..fb29ef8c571 100644 --- a/python/pylibcudf/pylibcudf/sorting.pyx +++ b/python/pylibcudf/pylibcudf/sorting.pyx @@ -12,6 +12,18 @@ from pylibcudf.libcudf.types cimport null_order, null_policy, order from .column cimport Column from .table cimport Table +__all__ = [ + "is_sorted", + "rank", + "segmented_sort_by_key", + "sort", + "sort_by_key", + "sorted_order", + "stable_segmented_sort_by_key", + "stable_sort", + "stable_sort_by_key", + "stable_sorted_order", +] cpdef Column sorted_order(Table source_table, list column_order, list null_precedence): """Computes the row indices required to sort the table. 
diff --git a/python/pylibcudf/pylibcudf/stream_compaction.pxd b/python/pylibcudf/pylibcudf/stream_compaction.pxd index a4f39792f0c..a20a23e2e58 100644 --- a/python/pylibcudf/pylibcudf/stream_compaction.pxd +++ b/python/pylibcudf/pylibcudf/stream_compaction.pxd @@ -17,6 +17,8 @@ cpdef Table drop_nulls(Table source_table, list keys, size_type keep_threshold) cpdef Table drop_nans(Table source_table, list keys, size_type keep_threshold) +cpdef Table apply_boolean_mask(Table source_table, Column boolean_mask) + cpdef Table unique( Table input, list keys, diff --git a/python/pylibcudf/pylibcudf/stream_compaction.pyi b/python/pylibcudf/pylibcudf/stream_compaction.pyi new file mode 100644 index 00000000000..99cade48309 --- /dev/null +++ b/python/pylibcudf/pylibcudf/stream_compaction.pyi @@ -0,0 +1,53 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from enum import IntEnum + +from pylibcudf.column import Column +from pylibcudf.table import Table +from pylibcudf.types import NanEquality, NanPolicy, NullEquality, NullPolicy + +class DuplicateKeepOption(IntEnum): + KEEP_ANY = ... + KEEP_FIRST = ... + KEEP_LAST = ... + KEEP_NONE = ... + +def drop_nulls( + source_table: Table, keys: list[int], keep_threshold: int +) -> Table: ... +def drop_nans( + source_table: Table, keys: list[int], keep_threshold: int +) -> Table: ... +def apply_boolean_mask(source_table: Table, boolean_mask: Column) -> Table: ... +def unique( + input: Table, + keys: list[int], + keep: DuplicateKeepOption, + nulls_equal: NullEquality, +) -> Table: ... +def distinct( + input: Table, + keys: list[int], + keep: DuplicateKeepOption, + nulls_equal: NullEquality, + nans_equal: NanEquality, +) -> Table: ... +def distinct_indices( + input: Table, + keep: DuplicateKeepOption, + nulls_equal: NullEquality, + nans_equal: NanEquality, +) -> Column: ... +def stable_distinct( + input: Table, + keys: list[int], + keep: DuplicateKeepOption, + nulls_equal: NullEquality, + nans_equal: NanEquality, +) -> Table: ... 
+def unique_count( + source: Column, null_handling: NullPolicy, nan_handling: NanPolicy +) -> int: ... +def distinct_count( + source: Column, null_handling: NullPolicy, nan_handling: NanPolicy +) -> int: ... diff --git a/python/pylibcudf/pylibcudf/stream_compaction.pyx b/python/pylibcudf/pylibcudf/stream_compaction.pyx index 2145398a191..6e403ca1b07 100644 --- a/python/pylibcudf/pylibcudf/stream_compaction.pyx +++ b/python/pylibcudf/pylibcudf/stream_compaction.pyx @@ -21,6 +21,18 @@ from pylibcudf.libcudf.stream_compaction import \ from .column cimport Column from .table cimport Table +__all__ = [ + "DuplicateKeepOption", + "apply_boolean_mask", + "distinct", + "distinct_count", + "distinct_indices", + "drop_nans", + "drop_nulls", + "stable_distinct", + "unique", + "unique_count", +] cpdef Table drop_nulls(Table source_table, list keys, size_type keep_threshold): """Filters out rows from the input table based on the presence of nulls. diff --git a/python/pylibcudf/pylibcudf/strings/__init__.py b/python/pylibcudf/pylibcudf/strings/__init__.py index fa7294c7dbd..67054f0b447 100644 --- a/python/pylibcudf/pylibcudf/strings/__init__.py +++ b/python/pylibcudf/pylibcudf/strings/__init__.py @@ -28,6 +28,7 @@ from .side_type import SideType __all__ = [ + "SideType", "attributes", "capitalize", "case", @@ -46,9 +47,8 @@ "replace", "replace_re", "slice", - "strip", "split", - "SideType", + "strip", "translate", "wrap", ] diff --git a/python/pylibcudf/pylibcudf/strings/attributes.pyi b/python/pylibcudf/pylibcudf/strings/attributes.pyi new file mode 100644 index 00000000000..7fd5c9773d4 --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/attributes.pyi @@ -0,0 +1,7 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from pylibcudf.column import Column + +def count_characters(source_strings: Column) -> Column: ... +def count_bytes(source_strings: Column) -> Column: ... +def code_points(source_strings: Column) -> Column: ... 
diff --git a/python/pylibcudf/pylibcudf/strings/attributes.pyx b/python/pylibcudf/pylibcudf/strings/attributes.pyx index 8e46a32835d..f1eb09b4965 100644 --- a/python/pylibcudf/pylibcudf/strings/attributes.pyx +++ b/python/pylibcudf/pylibcudf/strings/attributes.pyx @@ -6,6 +6,7 @@ from pylibcudf.column cimport Column from pylibcudf.libcudf.column.column cimport column from pylibcudf.libcudf.strings cimport attributes as cpp_attributes +__all__ = ["code_points", "count_bytes", "count_characters"] cpdef Column count_characters(Column source_strings): """ diff --git a/python/pylibcudf/pylibcudf/strings/capitalize.pyi b/python/pylibcudf/pylibcudf/strings/capitalize.pyi new file mode 100644 index 00000000000..5c6689418e2 --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/capitalize.pyi @@ -0,0 +1,12 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from pylibcudf.column import Column +from pylibcudf.scalar import Scalar +from pylibcudf.strings.char_types import StringCharacterTypes + +def capitalize(input: Column, delimiters: Scalar | None = None) -> Column: ... +def title( + input: Column, + sequence_type: StringCharacterTypes = StringCharacterTypes.ALPHA, +) -> Column: ... +def is_title(input: Column) -> Column: ... diff --git a/python/pylibcudf/pylibcudf/strings/capitalize.pyx b/python/pylibcudf/pylibcudf/strings/capitalize.pyx index 06b991c3cf1..a54480b8e4a 100644 --- a/python/pylibcudf/pylibcudf/strings/capitalize.pyx +++ b/python/pylibcudf/pylibcudf/strings/capitalize.pyx @@ -14,6 +14,7 @@ from pylibcudf.strings.char_types cimport string_character_types from cython.operator import dereference +__all__ = ["capitalize", "is_title", "title"] cpdef Column capitalize( Column input, diff --git a/python/pylibcudf/pylibcudf/strings/case.pyi b/python/pylibcudf/pylibcudf/strings/case.pyi new file mode 100644 index 00000000000..4e50db4d1da --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/case.pyi @@ -0,0 +1,7 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. 
+ +from pylibcudf.column import Column + +def to_lower(input: Column) -> Column: ... +def to_upper(input: Column) -> Column: ... +def swapcase(input: Column) -> Column: ... diff --git a/python/pylibcudf/pylibcudf/strings/case.pyx b/python/pylibcudf/pylibcudf/strings/case.pyx index 9e6cd7717d3..d0e054bef72 100644 --- a/python/pylibcudf/pylibcudf/strings/case.pyx +++ b/python/pylibcudf/pylibcudf/strings/case.pyx @@ -6,6 +6,7 @@ from pylibcudf.column cimport Column from pylibcudf.libcudf.column.column cimport column from pylibcudf.libcudf.strings cimport case as cpp_case +__all__ = ["swapcase", "to_lower", "to_upper"] cpdef Column to_lower(Column input): cdef unique_ptr[column] c_result diff --git a/python/pylibcudf/pylibcudf/strings/char_types.pyi b/python/pylibcudf/pylibcudf/strings/char_types.pyi new file mode 100644 index 00000000000..daa36cbb68d --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/char_types.pyi @@ -0,0 +1,30 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from enum import IntEnum + +from pylibcudf.column import Column +from pylibcudf.scalar import Scalar + +class StringCharacterTypes(IntEnum): + DECIMAL = ... + NUMERIC = ... + DIGIT = ... + ALPHA = ... + SPACE = ... + UPPER = ... + LOWER = ... + ALPHANUM = ... + CASE_TYPES = ... + ALL_TYPES = ... + +def all_characters_of_type( + source_strings: Column, + types: StringCharacterTypes, + verify_types: StringCharacterTypes, +) -> Column: ... +def filter_characters_of_type( + source_strings: Column, + types_to_remove: StringCharacterTypes, + replacement: Scalar, + types_to_keep: StringCharacterTypes, +) -> Column: ... 
diff --git a/python/pylibcudf/pylibcudf/strings/char_types.pyx b/python/pylibcudf/pylibcudf/strings/char_types.pyx index cb04efe5e8f..0af4a1f9c37 100644 --- a/python/pylibcudf/pylibcudf/strings/char_types.pyx +++ b/python/pylibcudf/pylibcudf/strings/char_types.pyx @@ -12,6 +12,11 @@ from cython.operator import dereference from pylibcudf.libcudf.strings.char_types import \ string_character_types as StringCharacterTypes # no-cython-lint +__all__ = [ + "StringCharacterTypes", + "all_characters_of_type", + "filter_characters_of_type", +] cpdef Column all_characters_of_type( Column source_strings, diff --git a/python/pylibcudf/pylibcudf/strings/combine.pyi b/python/pylibcudf/pylibcudf/strings/combine.pyi new file mode 100644 index 00000000000..3094b20f141 --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/combine.pyi @@ -0,0 +1,34 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from enum import IntEnum + +from pylibcudf.column import Column +from pylibcudf.scalar import Scalar +from pylibcudf.table import Table + +class SeparatorOnNulls(IntEnum): + YES = ... + NO = ... + +class OutputIfEmptyList(IntEnum): + EMPTY_STRING = ... + NULL_ELEMENT = ... + +def concatenate( + strings_columns: Table, + separator: Column | Scalar, + narep: Scalar | None = None, + col_narep: Scalar | None = None, + separate_nulls: SeparatorOnNulls = SeparatorOnNulls.YES, +) -> Column: ... +def join_strings( + input: Column, separator: Scalar, narep: Scalar +) -> Column: ... +def join_list_elements( + lists_strings_column: Column, + separator: Column | Scalar, + separator_narep: Scalar, + string_narep: Scalar, + separate_nulls: SeparatorOnNulls, + empty_list_policy: OutputIfEmptyList, +) -> Column: ... 
diff --git a/python/pylibcudf/pylibcudf/strings/combine.pyx b/python/pylibcudf/pylibcudf/strings/combine.pyx index f17d5265ab4..dc1e72c799b 100644 --- a/python/pylibcudf/pylibcudf/strings/combine.pyx +++ b/python/pylibcudf/pylibcudf/strings/combine.pyx @@ -17,6 +17,13 @@ from pylibcudf.libcudf.strings.combine import \ from pylibcudf.libcudf.strings.combine import \ separator_on_nulls as SeparatorOnNulls # no-cython-lint +__all__ = [ + "OutputIfEmptyList", + "SeparatorOnNulls", + "concatenate", + "join_list_elements", + "join_strings", +] cpdef Column concatenate( Table strings_columns, diff --git a/python/pylibcudf/pylibcudf/strings/contains.pyi b/python/pylibcudf/pylibcudf/strings/contains.pyi new file mode 100644 index 00000000000..1f0620383b3 --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/contains.pyi @@ -0,0 +1,14 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from pylibcudf.column import Column +from pylibcudf.scalar import Scalar +from pylibcudf.strings.regex_program import RegexProgram + +def contains_re(input: Column, prog: RegexProgram) -> Column: ... +def count_re(input: Column, prog: RegexProgram) -> Column: ... +def matches_re(input: Column, prog: RegexProgram) -> Column: ... +def like( + input: Column, + pattern: Column | Scalar, + escape_character: Scalar | None = None, +) -> Column: ... 
diff --git a/python/pylibcudf/pylibcudf/strings/contains.pyx b/python/pylibcudf/pylibcudf/strings/contains.pyx index d4b1130241d..7b4c53ed853 100644 --- a/python/pylibcudf/pylibcudf/strings/contains.pyx +++ b/python/pylibcudf/pylibcudf/strings/contains.pyx @@ -12,6 +12,7 @@ from pylibcudf.libcudf.scalar.scalar_factories cimport ( from pylibcudf.libcudf.strings cimport contains as cpp_contains from pylibcudf.strings.regex_program cimport RegexProgram +__all__ = ["contains_re", "count_re", "like", "matches_re"] cpdef Column contains_re( Column input, diff --git a/python/pylibcudf/pylibcudf/strings/convert/__init__.py b/python/pylibcudf/pylibcudf/strings/convert/__init__.py index aa27a7c8929..08b5034456e 100644 --- a/python/pylibcudf/pylibcudf/strings/convert/__init__.py +++ b/python/pylibcudf/pylibcudf/strings/convert/__init__.py @@ -10,3 +10,15 @@ convert_lists, convert_urls, ) + +__all__ = [ + "convert_booleans", + "convert_datetime", + "convert_durations", + "convert_fixed_point", + "convert_floats", + "convert_integers", + "convert_ipv4", + "convert_lists", + "convert_urls", +] diff --git a/python/pylibcudf/pylibcudf/strings/convert/convert_booleans.pyi b/python/pylibcudf/pylibcudf/strings/convert/convert_booleans.pyi new file mode 100644 index 00000000000..77c09242e9a --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/convert/convert_booleans.pyi @@ -0,0 +1,9 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from pylibcudf.column import Column +from pylibcudf.scalar import Scalar + +def to_booleans(input: Column, true_string: Scalar) -> Column: ... +def from_booleans( + booleans: Column, true_string: Scalar, false_string: Scalar +) -> Column: ... 
diff --git a/python/pylibcudf/pylibcudf/strings/convert/convert_booleans.pyx b/python/pylibcudf/pylibcudf/strings/convert/convert_booleans.pyx index dc12b291b11..1899a3b27cc 100644 --- a/python/pylibcudf/pylibcudf/strings/convert/convert_booleans.pyx +++ b/python/pylibcudf/pylibcudf/strings/convert/convert_booleans.pyx @@ -12,6 +12,7 @@ from pylibcudf.scalar cimport Scalar from cython.operator import dereference +__all__ = ["from_booleans", "to_booleans"] cpdef Column to_booleans(Column input, Scalar true_string): """ diff --git a/python/pylibcudf/pylibcudf/strings/convert/convert_datetime.pyi b/python/pylibcudf/pylibcudf/strings/convert/convert_datetime.pyi new file mode 100644 index 00000000000..c6857169765 --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/convert/convert_datetime.pyi @@ -0,0 +1,12 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from pylibcudf.column import Column +from pylibcudf.types import DataType + +def to_timestamps( + input: Column, timestamp_type: DataType, format: str +) -> Column: ... +def from_timestamps( + timestamps: Column, format: str, input_strings_names: Column +) -> Column: ... +def is_timestamp(input: Column, format: str) -> Column: ... 
diff --git a/python/pylibcudf/pylibcudf/strings/convert/convert_datetime.pyx b/python/pylibcudf/pylibcudf/strings/convert/convert_datetime.pyx index 0ee60812e00..f1cd684166c 100644 --- a/python/pylibcudf/pylibcudf/strings/convert/convert_datetime.pyx +++ b/python/pylibcudf/pylibcudf/strings/convert/convert_datetime.pyx @@ -11,6 +11,7 @@ from pylibcudf.libcudf.strings.convert cimport ( from pylibcudf.types import DataType +__all__ = ["from_timestamps", "is_timestamp", "to_timestamps"] cpdef Column to_timestamps( Column input, diff --git a/python/pylibcudf/pylibcudf/strings/convert/convert_durations.pyi b/python/pylibcudf/pylibcudf/strings/convert/convert_durations.pyi new file mode 100644 index 00000000000..a5787a5fe49 --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/convert/convert_durations.pyi @@ -0,0 +1,9 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from pylibcudf.column import Column +from pylibcudf.types import DataType + +def to_durations( + input: Column, duration_type: DataType, format: str +) -> Column: ... +def from_durations(durations: Column, format: str | None = None) -> Column: ... diff --git a/python/pylibcudf/pylibcudf/strings/convert/convert_durations.pyx b/python/pylibcudf/pylibcudf/strings/convert/convert_durations.pyx index 31980ace418..a9654afd00a 100644 --- a/python/pylibcudf/pylibcudf/strings/convert/convert_durations.pyx +++ b/python/pylibcudf/pylibcudf/strings/convert/convert_durations.pyx @@ -11,6 +11,7 @@ from pylibcudf.libcudf.strings.convert cimport ( from pylibcudf.types import DataType +__all__ = ["from_durations", "to_durations"] cpdef Column to_durations( Column input, diff --git a/python/pylibcudf/pylibcudf/strings/convert/convert_fixed_point.pyi b/python/pylibcudf/pylibcudf/strings/convert/convert_fixed_point.pyi new file mode 100644 index 00000000000..1192d3dfcd6 --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/convert/convert_fixed_point.pyi @@ -0,0 +1,10 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. 
+ +from pylibcudf.column import Column +from pylibcudf.types import DataType + +def to_fixed_point(input: Column, output_type: DataType) -> Column: ... +def from_fixed_point(input: Column) -> Column: ... +def is_fixed_point( + input: Column, decimal_type: DataType | None = None +) -> Column: ... diff --git a/python/pylibcudf/pylibcudf/strings/convert/convert_fixed_point.pyx b/python/pylibcudf/pylibcudf/strings/convert/convert_fixed_point.pyx index 962a47dfadf..00cbc822f36 100644 --- a/python/pylibcudf/pylibcudf/strings/convert/convert_fixed_point.pyx +++ b/python/pylibcudf/pylibcudf/strings/convert/convert_fixed_point.pyx @@ -9,6 +9,8 @@ from pylibcudf.libcudf.strings.convert cimport ( ) from pylibcudf.types cimport DataType, type_id +__all__ = ["from_fixed_point", "is_fixed_point", "to_fixed_point"] + cpdef Column to_fixed_point(Column input, DataType output_type): """ diff --git a/python/pylibcudf/pylibcudf/strings/convert/convert_floats.pyi b/python/pylibcudf/pylibcudf/strings/convert/convert_floats.pyi new file mode 100644 index 00000000000..ddf4042e10d --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/convert/convert_floats.pyi @@ -0,0 +1,8 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from pylibcudf.column import Column +from pylibcudf.types import DataType + +def to_floats(strings: Column, output_type: DataType) -> Column: ... +def from_floats(floats: Column) -> Column: ... +def is_float(input: Column) -> Column: ... 
diff --git a/python/pylibcudf/pylibcudf/strings/convert/convert_floats.pyx b/python/pylibcudf/pylibcudf/strings/convert/convert_floats.pyx index 1296f4f9db5..b5199aac577 100644 --- a/python/pylibcudf/pylibcudf/strings/convert/convert_floats.pyx +++ b/python/pylibcudf/pylibcudf/strings/convert/convert_floats.pyx @@ -9,6 +9,7 @@ from pylibcudf.libcudf.strings.convert cimport ( ) from pylibcudf.types cimport DataType +__all__ = ["from_floats", "is_float", "to_floats"] cpdef Column to_floats(Column strings, DataType output_type): """ diff --git a/python/pylibcudf/pylibcudf/strings/convert/convert_integers.pyi b/python/pylibcudf/pylibcudf/strings/convert/convert_integers.pyi new file mode 100644 index 00000000000..b96226fba90 --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/convert/convert_integers.pyi @@ -0,0 +1,11 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from pylibcudf.column import Column +from pylibcudf.types import DataType + +def to_integers(input: Column, output_type: DataType) -> Column: ... +def from_integers(integers: Column) -> Column: ... +def is_integer(input: Column, int_type: DataType | None = None) -> Column: ... +def hex_to_integers(input: Column, output_type: DataType) -> Column: ... +def is_hex(input: Column) -> Column: ... +def integers_to_hex(input: Column) -> Column: ... 
diff --git a/python/pylibcudf/pylibcudf/strings/convert/convert_integers.pyx b/python/pylibcudf/pylibcudf/strings/convert/convert_integers.pyx index 5558683a502..12984e15ce9 100644 --- a/python/pylibcudf/pylibcudf/strings/convert/convert_integers.pyx +++ b/python/pylibcudf/pylibcudf/strings/convert/convert_integers.pyx @@ -9,6 +9,14 @@ from pylibcudf.libcudf.strings.convert cimport ( ) from pylibcudf.types cimport DataType +__all__ = [ + "from_integers", + "hex_to_integers", + "integers_to_hex", + "is_hex", + "is_integer", + "to_integers" +] cpdef Column to_integers(Column input, DataType output_type): """ diff --git a/python/pylibcudf/pylibcudf/strings/convert/convert_ipv4.pyi b/python/pylibcudf/pylibcudf/strings/convert/convert_ipv4.pyi new file mode 100644 index 00000000000..b017b32598c --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/convert/convert_ipv4.pyi @@ -0,0 +1,7 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from pylibcudf.column import Column + +def ipv4_to_integers(input: Column) -> Column: ... +def integers_to_ipv4(integers: Column) -> Column: ... +def is_ipv4(input: Column) -> Column: ... 
diff --git a/python/pylibcudf/pylibcudf/strings/convert/convert_ipv4.pyx b/python/pylibcudf/pylibcudf/strings/convert/convert_ipv4.pyx index 834781f95f3..e7c6aae4fa8 100644 --- a/python/pylibcudf/pylibcudf/strings/convert/convert_ipv4.pyx +++ b/python/pylibcudf/pylibcudf/strings/convert/convert_ipv4.pyx @@ -6,6 +6,7 @@ from pylibcudf.column cimport Column from pylibcudf.libcudf.column.column cimport column from pylibcudf.libcudf.strings.convert cimport convert_ipv4 as cpp_convert_ipv4 +__all__ = ["integers_to_ipv4", "ipv4_to_integers", "is_ipv4"] cpdef Column ipv4_to_integers(Column input): """ diff --git a/python/pylibcudf/pylibcudf/strings/convert/convert_lists.pyi b/python/pylibcudf/pylibcudf/strings/convert/convert_lists.pyi new file mode 100644 index 00000000000..6ab3a4183e9 --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/convert/convert_lists.pyi @@ -0,0 +1,10 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from pylibcudf.column import Column +from pylibcudf.scalar import Scalar + +def format_list_column( + input: Column, + na_rep: Scalar | None = None, + separators: Column | None = None, +) -> Column: ... diff --git a/python/pylibcudf/pylibcudf/strings/convert/convert_lists.pyx b/python/pylibcudf/pylibcudf/strings/convert/convert_lists.pyx index cbfe5f5aa8b..518f72f6644 100644 --- a/python/pylibcudf/pylibcudf/strings/convert/convert_lists.pyx +++ b/python/pylibcudf/pylibcudf/strings/convert/convert_lists.pyx @@ -17,6 +17,7 @@ from pylibcudf.types cimport type_id from cython.operator import dereference +__all__ = ["format_list_column"] cpdef Column format_list_column( Column input, diff --git a/python/pylibcudf/pylibcudf/strings/convert/convert_urls.pyi b/python/pylibcudf/pylibcudf/strings/convert/convert_urls.pyi new file mode 100644 index 00000000000..49b8468957c --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/convert/convert_urls.pyi @@ -0,0 +1,6 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. 
+ +from pylibcudf.column import Column + +def url_encode(input: Column) -> Column: ... +def url_decode(input: Column) -> Column: ... diff --git a/python/pylibcudf/pylibcudf/strings/convert/convert_urls.pyx b/python/pylibcudf/pylibcudf/strings/convert/convert_urls.pyx index 82f8a75f1d9..bd5e23bca43 100644 --- a/python/pylibcudf/pylibcudf/strings/convert/convert_urls.pyx +++ b/python/pylibcudf/pylibcudf/strings/convert/convert_urls.pyx @@ -6,6 +6,7 @@ from pylibcudf.column cimport Column from pylibcudf.libcudf.column.column cimport column from pylibcudf.libcudf.strings.convert cimport convert_urls as cpp_convert_urls +__all__ = ["url_decode", "url_encode"] cpdef Column url_encode(Column input): """ diff --git a/python/pylibcudf/pylibcudf/strings/extract.pyi b/python/pylibcudf/pylibcudf/strings/extract.pyi new file mode 100644 index 00000000000..4354bd3072d --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/extract.pyi @@ -0,0 +1,8 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from pylibcudf.column import Column +from pylibcudf.strings.regex_program import RegexProgram +from pylibcudf.table import Table + +def extract(input: Column, prog: RegexProgram) -> Table: ... +def extract_all_record(input: Column, prog: RegexProgram) -> Column: ... 
diff --git a/python/pylibcudf/pylibcudf/strings/extract.pyx b/python/pylibcudf/pylibcudf/strings/extract.pyx index b56eccc8287..0ce70666e92 100644 --- a/python/pylibcudf/pylibcudf/strings/extract.pyx +++ b/python/pylibcudf/pylibcudf/strings/extract.pyx @@ -9,6 +9,7 @@ from pylibcudf.libcudf.table.table cimport table from pylibcudf.strings.regex_program cimport RegexProgram from pylibcudf.table cimport Table +__all__ = ["extract", "extract_all_record"] cpdef Table extract(Column input, RegexProgram prog): """ diff --git a/python/pylibcudf/pylibcudf/strings/find.pyi b/python/pylibcudf/pylibcudf/strings/find.pyi new file mode 100644 index 00000000000..3d04a9c3161 --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/find.pyi @@ -0,0 +1,14 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from pylibcudf.column import Column +from pylibcudf.scalar import Scalar + +def find( + input: Column, target: Column | Scalar, start: int = 0, stop: int = -1 +) -> Column: ... +def rfind( + input: Column, target: Scalar, start: int = 0, stop: int = -1 +) -> Column: ... +def contains(input: Column, target: Column | Scalar) -> Column: ... +def starts_with(input: Column, target: Column | Scalar) -> Column: ... +def ends_with(input: Column, target: Column | Scalar) -> Column: ... 
diff --git a/python/pylibcudf/pylibcudf/strings/find.pyx b/python/pylibcudf/pylibcudf/strings/find.pyx index 6fc6dca24fd..f0af339ff08 100644 --- a/python/pylibcudf/pylibcudf/strings/find.pyx +++ b/python/pylibcudf/pylibcudf/strings/find.pyx @@ -10,6 +10,7 @@ from cython.operator import dereference from pylibcudf.libcudf.scalar.scalar cimport string_scalar +__all__ = ["contains", "ends_with", "find", "rfind", "starts_with"] cpdef Column find( Column input, diff --git a/python/pylibcudf/pylibcudf/strings/find_multiple.pyi b/python/pylibcudf/pylibcudf/strings/find_multiple.pyi new file mode 100644 index 00000000000..3d46fd2fa6d --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/find_multiple.pyi @@ -0,0 +1,5 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from pylibcudf.column import Column + +def find_multiple(input: Column, targets: Column) -> Column: ... diff --git a/python/pylibcudf/pylibcudf/strings/find_multiple.pyx b/python/pylibcudf/pylibcudf/strings/find_multiple.pyx index 672aa606bd0..c9ce734b4be 100644 --- a/python/pylibcudf/pylibcudf/strings/find_multiple.pyx +++ b/python/pylibcudf/pylibcudf/strings/find_multiple.pyx @@ -6,6 +6,7 @@ from pylibcudf.column cimport Column from pylibcudf.libcudf.column.column cimport column from pylibcudf.libcudf.strings cimport find_multiple as cpp_find_multiple +__all__ = ["find_multiple"] cpdef Column find_multiple(Column input, Column targets): """ diff --git a/python/pylibcudf/pylibcudf/strings/findall.pyi b/python/pylibcudf/pylibcudf/strings/findall.pyi new file mode 100644 index 00000000000..77e38581d22 --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/findall.pyi @@ -0,0 +1,7 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from pylibcudf.column import Column +from pylibcudf.strings.regex_program import RegexProgram + +def find_re(input: Column, pattern: RegexProgram) -> Column: ... +def findall(input: Column, pattern: RegexProgram) -> Column: ... 
diff --git a/python/pylibcudf/pylibcudf/strings/findall.pyx b/python/pylibcudf/pylibcudf/strings/findall.pyx index 89fa4302824..23c84675a16 100644 --- a/python/pylibcudf/pylibcudf/strings/findall.pyx +++ b/python/pylibcudf/pylibcudf/strings/findall.pyx @@ -7,6 +7,7 @@ from pylibcudf.libcudf.column.column cimport column from pylibcudf.libcudf.strings cimport findall as cpp_findall from pylibcudf.strings.regex_program cimport RegexProgram +__all__ = ["findall", "find_re"] cpdef Column findall(Column input, RegexProgram pattern): """ diff --git a/python/pylibcudf/pylibcudf/strings/padding.pyi b/python/pylibcudf/pylibcudf/strings/padding.pyi new file mode 100644 index 00000000000..a991935e6e5 --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/padding.pyi @@ -0,0 +1,9 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from pylibcudf.column import Column +from pylibcudf.strings.side_type import SideType + +def pad( + input: Column, width: int, side: SideType, fill_char: str +) -> Column: ... +def zfill(input: Column, width: int) -> Column: ... diff --git a/python/pylibcudf/pylibcudf/strings/padding.pyx b/python/pylibcudf/pylibcudf/strings/padding.pyx index f6950eecf60..0e349a7be47 100644 --- a/python/pylibcudf/pylibcudf/strings/padding.pyx +++ b/python/pylibcudf/pylibcudf/strings/padding.pyx @@ -6,6 +6,7 @@ from pylibcudf.libcudf.column.column cimport column from pylibcudf.libcudf.strings cimport padding as cpp_padding from pylibcudf.libcudf.strings.side_type cimport side_type +__all__ = ["pad", "zfill"] cpdef Column pad(Column input, size_type width, side_type side, str fill_char): """ diff --git a/python/pylibcudf/pylibcudf/strings/regex_flags.pyi b/python/pylibcudf/pylibcudf/strings/regex_flags.pyi new file mode 100644 index 00000000000..c551cebf181 --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/regex_flags.pyi @@ -0,0 +1,7 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. +from enum import IntEnum + +class RegexFlags(IntEnum): + DEFAULT = ... + MULTILINE = ... 
+ DOTALL = ... diff --git a/python/pylibcudf/pylibcudf/strings/regex_flags.pyx b/python/pylibcudf/pylibcudf/strings/regex_flags.pyx index ce3b6b10a42..65b504e0dc7 100644 --- a/python/pylibcudf/pylibcudf/strings/regex_flags.pyx +++ b/python/pylibcudf/pylibcudf/strings/regex_flags.pyx @@ -2,3 +2,5 @@ from pylibcudf.libcudf.strings.regex_flags import \ regex_flags as RegexFlags # no-cython-lint + +__all__ = ["RegexFlags"] diff --git a/python/pylibcudf/pylibcudf/strings/regex_program.pyi b/python/pylibcudf/pylibcudf/strings/regex_program.pyi new file mode 100644 index 00000000000..9abd6fa7802 --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/regex_program.pyi @@ -0,0 +1,8 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from pylibcudf.strings.regex_flags import RegexFlags + +class RegexProgram: + def __init__(self): ... + @staticmethod + def create(pattern: str, flags: RegexFlags) -> RegexProgram: ... diff --git a/python/pylibcudf/pylibcudf/strings/regex_program.pyx b/python/pylibcudf/pylibcudf/strings/regex_program.pyx index 91f585cd637..46bfde074d2 100644 --- a/python/pylibcudf/pylibcudf/strings/regex_program.pyx +++ b/python/pylibcudf/pylibcudf/strings/regex_program.pyx @@ -11,6 +11,7 @@ from pylibcudf.strings.regex_flags import RegexFlags from pylibcudf.strings.regex_flags cimport regex_flags +__all__ = ["RegexProgram"] cdef class RegexProgram: """Regex program class. @@ -24,6 +25,8 @@ cdef class RegexProgram: def __init__(self, *args, **kwargs): raise ValueError("Do not instantiate RegexProgram directly, use create") + __hash__ = None + @staticmethod def create(str pattern, int flags): """Create a program from a pattern. diff --git a/python/pylibcudf/pylibcudf/strings/repeat.pyi b/python/pylibcudf/pylibcudf/strings/repeat.pyi new file mode 100644 index 00000000000..93a46b71caa --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/repeat.pyi @@ -0,0 +1,5 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. 
+ +from pylibcudf.column import Column + +def repeat_strings(input: Column, repeat_times: Column | int) -> Column: ... diff --git a/python/pylibcudf/pylibcudf/strings/repeat.pyx b/python/pylibcudf/pylibcudf/strings/repeat.pyx index fb2bb13c666..a497b1f438e 100644 --- a/python/pylibcudf/pylibcudf/strings/repeat.pyx +++ b/python/pylibcudf/pylibcudf/strings/repeat.pyx @@ -6,6 +6,7 @@ from pylibcudf.libcudf.column.column cimport column from pylibcudf.libcudf.strings cimport repeat as cpp_repeat from pylibcudf.libcudf.types cimport size_type +__all__ = ["repeat_strings"] cpdef Column repeat_strings(Column input, ColumnorSizeType repeat_times): """ diff --git a/python/pylibcudf/pylibcudf/strings/replace.pyi b/python/pylibcudf/pylibcudf/strings/replace.pyi new file mode 100644 index 00000000000..64df09ef7e8 --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/replace.pyi @@ -0,0 +1,14 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from pylibcudf.column import Column +from pylibcudf.scalar import Scalar + +def replace( + input: Column, target: Scalar, repl: Scalar, maxrepl: int = -1 +) -> Column: ... +def replace_multiple( + input: Column, target: Column, repl: Column, maxrepl: int = -1 +) -> Column: ... +def replace_slice( + input: Column, repl: Scalar | None = None, start: int = 0, stop: int = -1 +) -> Column: ... 
diff --git a/python/pylibcudf/pylibcudf/strings/replace.pyx b/python/pylibcudf/pylibcudf/strings/replace.pyx index 2b94f5e3fee..3ba6c1b5530 100644 --- a/python/pylibcudf/pylibcudf/strings/replace.pyx +++ b/python/pylibcudf/pylibcudf/strings/replace.pyx @@ -16,6 +16,7 @@ from pylibcudf.libcudf.strings.replace cimport ( from pylibcudf.libcudf.types cimport size_type from pylibcudf.scalar cimport Scalar +__all__ = ["replace", "replace_multiple", "replace_slice"] cpdef Column replace( Column input, diff --git a/python/pylibcudf/pylibcudf/strings/replace_re.pyi b/python/pylibcudf/pylibcudf/strings/replace_re.pyi new file mode 100644 index 00000000000..056bafbf7ef --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/replace_re.pyi @@ -0,0 +1,27 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from typing import overload + +from pylibcudf.column import Column +from pylibcudf.scalar import Scalar +from pylibcudf.strings.regex_flags import RegexFlags +from pylibcudf.strings.regex_program import RegexProgram + +@overload +def replace_re( + input: Column, + pattern: RegexProgram, + replacement: Scalar, + max_replace_count: int = -1, +) -> Column: ... +@overload +def replace_re( + input: Column, + patterns: list[str], + replacement: Column, + max_replace_count: int = -1, + flags: RegexFlags = RegexFlags.DEFAULT, +) -> Column: ... +def replace_with_backrefs( + input: Column, prog: RegexProgram, replacement: str +) -> Column: ... 
diff --git a/python/pylibcudf/pylibcudf/strings/replace_re.pyx b/python/pylibcudf/pylibcudf/strings/replace_re.pyx index ccc33fd4425..bdabc779ddf 100644 --- a/python/pylibcudf/pylibcudf/strings/replace_re.pyx +++ b/python/pylibcudf/pylibcudf/strings/replace_re.pyx @@ -16,6 +16,7 @@ from pylibcudf.scalar cimport Scalar from pylibcudf.strings.regex_flags cimport regex_flags from pylibcudf.strings.regex_program cimport RegexProgram +__all__ = ["replace_re", "replace_with_backrefs"] cpdef Column replace_re( Column input, diff --git a/python/pylibcudf/pylibcudf/strings/side_type.pyi b/python/pylibcudf/pylibcudf/strings/side_type.pyi new file mode 100644 index 00000000000..532edd60077 --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/side_type.pyi @@ -0,0 +1,7 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. +from enum import IntEnum + +class SideType(IntEnum): + LEFT = ... + RIGHT = ... + BOTH = ... diff --git a/python/pylibcudf/pylibcudf/strings/side_type.pyx b/python/pylibcudf/pylibcudf/strings/side_type.pyx index cf0c770cc11..87db4206a9c 100644 --- a/python/pylibcudf/pylibcudf/strings/side_type.pyx +++ b/python/pylibcudf/pylibcudf/strings/side_type.pyx @@ -1,3 +1,5 @@ # Copyright (c) 2024, NVIDIA CORPORATION. from pylibcudf.libcudf.strings.side_type import \ side_type as SideType # no-cython-lint + +__all__ = ["SideType"] diff --git a/python/pylibcudf/pylibcudf/strings/slice.pyi b/python/pylibcudf/pylibcudf/strings/slice.pyi new file mode 100644 index 00000000000..7bf9a7cb8c6 --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/slice.pyi @@ -0,0 +1,11 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from pylibcudf.column import Column +from pylibcudf.scalar import Scalar + +def slice_strings( + input: Column, + start: Column | Scalar | None = None, + stop: Column | Scalar | None = None, + step: Scalar | None = None, +) -> Column: ... 
diff --git a/python/pylibcudf/pylibcudf/strings/slice.pyx b/python/pylibcudf/pylibcudf/strings/slice.pyx index 70d10cab36c..d32de7c50e0 100644 --- a/python/pylibcudf/pylibcudf/strings/slice.pyx +++ b/python/pylibcudf/pylibcudf/strings/slice.pyx @@ -14,6 +14,7 @@ from pylibcudf.scalar cimport Scalar from cython.operator import dereference +__all__ = ["slice_strings"] cpdef Column slice_strings( Column input, diff --git a/python/pylibcudf/pylibcudf/strings/split/__init__.py b/python/pylibcudf/pylibcudf/strings/split/__init__.py index 2033e5e275b..db2a597882e 100644 --- a/python/pylibcudf/pylibcudf/strings/split/__init__.py +++ b/python/pylibcudf/pylibcudf/strings/split/__init__.py @@ -1,2 +1,4 @@ # Copyright (c) 2024, NVIDIA CORPORATION. from . import partition, split + +__all__ = ["partition", "split"] diff --git a/python/pylibcudf/pylibcudf/strings/split/partition.pyi b/python/pylibcudf/pylibcudf/strings/split/partition.pyi new file mode 100644 index 00000000000..f19a463bd7e --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/split/partition.pyi @@ -0,0 +1,8 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from pylibcudf.column import Column +from pylibcudf.scalar import Scalar +from pylibcudf.table import Table + +def partition(input: Column, delimiter: Scalar | None = None) -> Table: ... +def rpartition(input: Column, delimiter: Scalar | None = None) -> Table: ... 
diff --git a/python/pylibcudf/pylibcudf/strings/split/partition.pyx b/python/pylibcudf/pylibcudf/strings/split/partition.pyx index 0fb4f186c41..75537ea46d3 100644 --- a/python/pylibcudf/pylibcudf/strings/split/partition.pyx +++ b/python/pylibcudf/pylibcudf/strings/split/partition.pyx @@ -13,6 +13,7 @@ from pylibcudf.table cimport Table from cython.operator import dereference +__all__ = ["partition", "rpartition"] cpdef Table partition(Column input, Scalar delimiter=None): """ diff --git a/python/pylibcudf/pylibcudf/strings/split/split.pyi b/python/pylibcudf/pylibcudf/strings/split/split.pyi new file mode 100644 index 00000000000..3ccf0bc2a01 --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/split/split.pyi @@ -0,0 +1,27 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from pylibcudf.column import Column +from pylibcudf.scalar import Scalar +from pylibcudf.strings.regex_program import RegexProgram +from pylibcudf.table import Table + +def split( + strings_column: Column, delimiter: Scalar, maxsplit: int +) -> Table: ... +def rsplit( + strings_column: Column, delimiter: Scalar, maxsplit: int +) -> Table: ... +def split_record( + strings: Column, delimiter: Scalar, maxsplit: int +) -> Column: ... +def rsplit_record( + strings: Column, delimiter: Scalar, maxsplit: int +) -> Column: ... +def split_re(input: Column, prog: RegexProgram, maxsplit: int) -> Table: ... +def rsplit_re(input: Column, prog: RegexProgram, maxsplit: int) -> Table: ... +def split_record_re( + input: Column, prog: RegexProgram, maxsplit: int +) -> Column: ... +def rsplit_record_re( + input: Column, prog: RegexProgram, maxsplit: int +) -> Column: ... 
diff --git a/python/pylibcudf/pylibcudf/strings/split/split.pyx b/python/pylibcudf/pylibcudf/strings/split/split.pyx index e3827f6645e..90087f996f0 100644 --- a/python/pylibcudf/pylibcudf/strings/split/split.pyx +++ b/python/pylibcudf/pylibcudf/strings/split/split.pyx @@ -13,6 +13,16 @@ from pylibcudf.table cimport Table from cython.operator import dereference +__all__ = [ + "rsplit", + "rsplit_re", + "rsplit_record", + "rsplit_record_re", + "split", + "split_re", + "split_record", + "split_record_re", +] cpdef Table split(Column strings_column, Scalar delimiter, size_type maxsplit): """ diff --git a/python/pylibcudf/pylibcudf/strings/strip.pyi b/python/pylibcudf/pylibcudf/strings/strip.pyi new file mode 100644 index 00000000000..680355fc88f --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/strip.pyi @@ -0,0 +1,11 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from pylibcudf.column import Column +from pylibcudf.scalar import Scalar +from pylibcudf.strings.side_type import SideType + +def strip( + input: Column, + side: SideType = SideType.BOTH, + to_strip: Scalar | None = None, +) -> Column: ... diff --git a/python/pylibcudf/pylibcudf/strings/strip.pyx b/python/pylibcudf/pylibcudf/strings/strip.pyx index 429a23c3cdf..805d959891b 100644 --- a/python/pylibcudf/pylibcudf/strings/strip.pyx +++ b/python/pylibcudf/pylibcudf/strings/strip.pyx @@ -13,6 +13,7 @@ from pylibcudf.libcudf.strings cimport strip as cpp_strip from pylibcudf.scalar cimport Scalar from pylibcudf.strings.side_type cimport side_type +__all__ = ["strip"] cpdef Column strip( Column input, diff --git a/python/pylibcudf/pylibcudf/strings/translate.pyi b/python/pylibcudf/pylibcudf/strings/translate.pyi new file mode 100644 index 00000000000..7158b6eb05c --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/translate.pyi @@ -0,0 +1,20 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. 
+from collections.abc import Mapping +from enum import IntEnum + +from pylibcudf.column import Column +from pylibcudf.scalar import Scalar + +class FilterType(IntEnum): + KEEP = ... + REMOVE = ... + +def translate( + input: Column, chars_table: Mapping[int | str, int | str] +) -> Column: ... +def filter_characters( + input: Column, + characters_to_filter: Mapping[int | str, int | str], + keep_characters: FilterType, + replacement: Scalar, +) -> Column: ... diff --git a/python/pylibcudf/pylibcudf/strings/translate.pyx b/python/pylibcudf/pylibcudf/strings/translate.pyx index d85da8e6cdd..ba1e8dc5d27 100644 --- a/python/pylibcudf/pylibcudf/strings/translate.pyx +++ b/python/pylibcudf/pylibcudf/strings/translate.pyx @@ -14,6 +14,7 @@ from cython.operator import dereference from pylibcudf.libcudf.strings.translate import \ filter_type as FilterType # no-cython-lint +__all__ = ["FilterType", "filter_characters", "translate"] cdef vector[pair[char_utf8, char_utf8]] _table_to_c_table(dict table): """ diff --git a/python/pylibcudf/pylibcudf/strings/wrap.pyi b/python/pylibcudf/pylibcudf/strings/wrap.pyi new file mode 100644 index 00000000000..5658f279197 --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/wrap.pyi @@ -0,0 +1,5 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from pylibcudf.column import Column + +def wrap(input: Column, width: int) -> Column: ... 
diff --git a/python/pylibcudf/pylibcudf/strings/wrap.pyx b/python/pylibcudf/pylibcudf/strings/wrap.pyx index 2ced250f837..b696eb48e47 100644 --- a/python/pylibcudf/pylibcudf/strings/wrap.pyx +++ b/python/pylibcudf/pylibcudf/strings/wrap.pyx @@ -7,6 +7,7 @@ from pylibcudf.libcudf.column.column cimport column from pylibcudf.libcudf.strings cimport wrap as cpp_wrap from pylibcudf.libcudf.types cimport size_type +__all__ = ["wrap"] cpdef Column wrap(Column input, size_type width): """ diff --git a/python/pylibcudf/pylibcudf/table.pyi b/python/pylibcudf/pylibcudf/table.pyi new file mode 100644 index 00000000000..5aef7e009c8 --- /dev/null +++ b/python/pylibcudf/pylibcudf/table.pyi @@ -0,0 +1,9 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from pylibcudf.column import Column + +class Table: + def __init__(self, column: list[Column]): ... + def num_columns(self) -> int: ... + def num_rows(self) -> int: ... + def columns(self) -> list[Column]: ... diff --git a/python/pylibcudf/pylibcudf/table.pyx b/python/pylibcudf/pylibcudf/table.pyx index d0d6f2343d0..0c1e88a927c 100644 --- a/python/pylibcudf/pylibcudf/table.pyx +++ b/python/pylibcudf/pylibcudf/table.pyx @@ -10,6 +10,7 @@ from pylibcudf.libcudf.table.table cimport table from .column cimport Column +__all__ = ["Table"] cdef class Table: """A list of columns of the same size. @@ -24,6 +25,8 @@ cdef class Table: raise ValueError("All columns must be pylibcudf Column objects") self._columns = columns + __hash__ = None + cdef table_view view(self) nogil: """Generate a libcudf table_view to pass to libcudf algorithms. 
diff --git a/python/pylibcudf/pylibcudf/tests/test_binaryops.py b/python/pylibcudf/pylibcudf/tests/test_binaryops.py index bbb08e8b95a..a33122221f6 100644 --- a/python/pylibcudf/pylibcudf/tests/test_binaryops.py +++ b/python/pylibcudf/pylibcudf/tests/test_binaryops.py @@ -541,13 +541,6 @@ def py_shift_right_unsigned(x, y): plc.binaryop.BinaryOperator.LOGICAL_AND, pa.compute.and_, ), - ( - "int64", - "int64", - "int64", - plc.binaryop.BinaryOperator.LOGICAL_AND, - pa.compute.and_, - ), ( "int64", "int64", @@ -562,13 +555,6 @@ def py_shift_right_unsigned(x, y): plc.binaryop.BinaryOperator.LOGICAL_OR, pa.compute.or_, ), - ( - "int64", - "int64", - "int64", - plc.binaryop.BinaryOperator.LOGICAL_OR, - pa.compute.or_, - ), ( "int64", "int64", diff --git a/python/pylibcudf/pylibcudf/tests/test_labeling.py b/python/pylibcudf/pylibcudf/tests/test_labeling.py index beacfc63ce5..946d583d1cc 100644 --- a/python/pylibcudf/pylibcudf/tests/test_labeling.py +++ b/python/pylibcudf/pylibcudf/tests/test_labeling.py @@ -6,8 +6,12 @@ import pylibcudf as plc -@pytest.mark.parametrize("left_inclusive", [True, False]) -@pytest.mark.parametrize("right_inclusive", [True, False]) +@pytest.mark.parametrize( + "left_inclusive", [plc.labeling.Inclusive.YES, plc.labeling.Inclusive.NO] +) +@pytest.mark.parametrize( + "right_inclusive", [plc.labeling.Inclusive.YES, plc.labeling.Inclusive.NO] +) def test_label_bins(left_inclusive, right_inclusive): in_col = plc.interop.from_arrow(pa.array([1, 2, 3])) left_edges = plc.interop.from_arrow(pa.array([0, 5])) diff --git a/python/pylibcudf/pylibcudf/tests/test_lists.py b/python/pylibcudf/pylibcudf/tests/test_lists.py index f3ef555f11d..8c1229c2a04 100644 --- a/python/pylibcudf/pylibcudf/tests/test_lists.py +++ b/python/pylibcudf/pylibcudf/tests/test_lists.py @@ -62,12 +62,12 @@ def test_concatenate_rows(test_data): [ ( [[[1, 2], [3, 4], [5]], [[6], None, [7, 8, 9]]], - False, + plc.lists.ConcatenateNullPolicy.NULLIFY_OUTPUT_ROW, [[1, 2, 3, 4, 5], None], 
), ( [[[1, 2], [3, 4], [5, None]], [[6], [None], [7, 8, 9]]], - True, + plc.lists.ConcatenateNullPolicy.IGNORE, [[1, 2, 3, 4, 5, None], [6, None, 7, 8, 9]], ), ], @@ -138,7 +138,9 @@ def test_index_of_scalar(list_column, scalar): plc_column = plc.interop.from_arrow(arr) plc_scalar = plc.interop.from_arrow(scalar) - res = plc.lists.index_of(plc_column, plc_scalar, True) + res = plc.lists.index_of( + plc_column, plc_scalar, plc.lists.DuplicateFindOption.FIND_FIRST + ) expect = pa.array([1, -1, -1, -1], type=pa.int32()) @@ -150,7 +152,9 @@ def test_index_of_list_column(list_column, search_key_column): arr2, expect = search_key_column plc_column1 = plc.interop.from_arrow(arr1) plc_column2 = plc.interop.from_arrow(arr2) - res = plc.lists.index_of(plc_column1, plc_column2, True) + res = plc.lists.index_of( + plc_column1, plc_column2, plc.lists.DuplicateFindOption.FIND_FIRST + ) expect = pa.array(search_key_column[1], type=pa.int32()) @@ -227,39 +231,34 @@ def test_sequences(): @pytest.mark.parametrize( - "ascending,na_position,expected", + "order,na_position,expected", [ ( - True, + plc.types.Order.ASCENDING, plc.types.NullOrder.BEFORE, [[1, 2, 3, 4], [None, 1, 2, 4], [-10, 0, 10, 10]], ), ( - True, + plc.types.Order.ASCENDING, plc.types.NullOrder.AFTER, [[1, 2, 3, 4], [1, 2, 4, None], [-10, 0, 10, 10]], ), ( - False, + plc.types.Order.DESCENDING, plc.types.NullOrder.BEFORE, [[4, 3, 2, 1], [4, 2, 1, None], [10, 10, 0, -10]], ), ( - False, - plc.types.NullOrder.AFTER, - [[4, 3, 2, 1], [None, 4, 2, 1], [10, 10, 0, -10]], - ), - ( - False, + plc.types.Order.DESCENDING, plc.types.NullOrder.AFTER, [[4, 3, 2, 1], [None, 4, 2, 1], [10, 10, 0, -10]], ), ], ) -def test_sort_lists(lists_column, ascending, na_position, expected): +def test_sort_lists(lists_column, order, na_position, expected): plc_column = plc.interop.from_arrow(pa.array(lists_column)) - res = plc.lists.sort_lists(plc_column, ascending, na_position, False) - res_stable = plc.lists.sort_lists(plc_column, ascending, 
na_position, True) + res = plc.lists.sort_lists(plc_column, order, na_position, False) + res_stable = plc.lists.sort_lists(plc_column, order, na_position, True) expect = pa.array(expected) @@ -272,44 +271,44 @@ def test_sort_lists(lists_column, ascending, na_position, expected): [ ( plc.lists.difference_distinct, - True, - True, + plc.types.NanEquality.ALL_EQUAL, + plc.types.NullEquality.EQUAL, [[], [1, 2, 3], None, [4, 5]], ), ( plc.lists.difference_distinct, - False, - True, + plc.types.NanEquality.UNEQUAL, + plc.types.NullEquality.EQUAL, [[], [1, 2, 3], None, [4, None, 5]], ), ( plc.lists.have_overlap, - True, - True, + plc.types.NanEquality.ALL_EQUAL, + plc.types.NullEquality.EQUAL, [True, False, None, True], ), ( plc.lists.have_overlap, - False, - False, + plc.types.NanEquality.UNEQUAL, + plc.types.NullEquality.UNEQUAL, [True, False, None, False], ), ( plc.lists.intersect_distinct, - True, - True, + plc.types.NanEquality.ALL_EQUAL, + plc.types.NullEquality.EQUAL, [[np.nan, 1, 2], [], None, [None]], ), ( plc.lists.intersect_distinct, - True, - False, + plc.types.NanEquality.ALL_EQUAL, + plc.types.NullEquality.UNEQUAL, [[1, 2], [], None, [None]], ), ( plc.lists.union_distinct, - False, - True, + plc.types.NanEquality.UNEQUAL, + plc.types.NullEquality.EQUAL, [ [np.nan, 2, 1, 3], [1, 2, 3, 4, 5], @@ -319,8 +318,8 @@ def test_sort_lists(lists_column, ascending, na_position, expected): ), ( plc.lists.union_distinct, - False, - False, + plc.types.NanEquality.UNEQUAL, + plc.types.NullEquality.UNEQUAL, [ [np.nan, np.nan, 2, 1, np.nan, 3], [1, 2, 3, 4, 5], @@ -352,20 +351,24 @@ def test_set_operations( @pytest.mark.parametrize( "nans_equal,nulls_equal,expected", [ - (True, True, [[np.nan, 0, 1, 2, 3], [3, 1, 2], None, [4, None, 5]]), ( - False, - True, + plc.types.NanEquality.ALL_EQUAL, + plc.types.NullEquality.EQUAL, + [[np.nan, 0, 1, 2, 3], [3, 1, 2], None, [4, None, 5]], + ), + ( + plc.types.NanEquality.UNEQUAL, + plc.types.NullEquality.EQUAL, [[np.nan, 0, 1, 2, 3], 
[3, 1, 2], None, [4, None, None, 5]], ), ( - True, - False, + plc.types.NanEquality.ALL_EQUAL, + plc.types.NullEquality.UNEQUAL, [[np.nan, np.nan, 0, 1, 2, 3], [3, 1, 2], None, [4, None, 5]], ), ( - False, - False, + plc.types.NanEquality.UNEQUAL, + plc.types.NullEquality.UNEQUAL, [ [np.nan, np.nan, 0, 1, 2, 3], [3, 1, 2], diff --git a/python/pylibcudf/pylibcudf/tests/test_string_attributes.py b/python/pylibcudf/pylibcudf/tests/test_string_attributes.py index f461657281a..e85cd1cc443 100644 --- a/python/pylibcudf/pylibcudf/tests/test_string_attributes.py +++ b/python/pylibcudf/pylibcudf/tests/test_string_attributes.py @@ -8,7 +8,7 @@ import pylibcudf as plc -@pytest.fixture() +@pytest.fixture def str_data(): pa_data = pa.array(["A", None]) return pa_data, plc.interop.from_arrow(pa_data) diff --git a/python/pylibcudf/pylibcudf/traits.pyi b/python/pylibcudf/pylibcudf/traits.pyi new file mode 100644 index 00000000000..fdb31a262cf --- /dev/null +++ b/python/pylibcudf/pylibcudf/traits.pyi @@ -0,0 +1,23 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from pylibcudf.types import DataType + +def is_relationally_comparable(typ: DataType) -> bool: ... +def is_equality_comparable(typ: DataType) -> bool: ... +def is_numeric(typ: DataType) -> bool: ... +def is_numeric_not_bool(typ: DataType) -> bool: ... +def is_index_type(typ: DataType) -> bool: ... +def is_unsigned(typ: DataType) -> bool: ... +def is_integral(typ: DataType) -> bool: ... +def is_integral_not_bool(typ: DataType) -> bool: ... +def is_floating_point(typ: DataType) -> bool: ... +def is_boolean(typ: DataType) -> bool: ... +def is_timestamp(typ: DataType) -> bool: ... +def is_fixed_point(typ: DataType) -> bool: ... +def is_duration(typ: DataType) -> bool: ... +def is_chrono(typ: DataType) -> bool: ... +def is_dictionary(typ: DataType) -> bool: ... +def is_fixed_width(typ: DataType) -> bool: ... +def is_compound(typ: DataType) -> bool: ... +def is_nested(typ: DataType) -> bool: ... 
+def is_bit_castable(source: DataType, target: DataType) -> bool: ... diff --git a/python/pylibcudf/pylibcudf/traits.pyx b/python/pylibcudf/pylibcudf/traits.pyx index 9c52e0ac1ab..3cf0a3a4b3b 100644 --- a/python/pylibcudf/pylibcudf/traits.pyx +++ b/python/pylibcudf/pylibcudf/traits.pyx @@ -5,6 +5,27 @@ from pylibcudf.libcudf.utilities cimport traits from .types cimport DataType +__all__ = [ + "is_bit_castable", + "is_boolean", + "is_chrono", + "is_compound", + "is_dictionary", + "is_duration", + "is_equality_comparable", + "is_fixed_point", + "is_fixed_width", + "is_floating_point", + "is_index_type", + "is_integral", + "is_integral_not_bool", + "is_nested", + "is_numeric", + "is_numeric_not_bool", + "is_relationally_comparable", + "is_timestamp", + "is_unsigned", +] cpdef bool is_relationally_comparable(DataType typ): """Checks if the given data type supports relational comparisons. diff --git a/python/pylibcudf/pylibcudf/transform.pyi b/python/pylibcudf/pylibcudf/transform.pyi new file mode 100644 index 00000000000..5cbd2e635f0 --- /dev/null +++ b/python/pylibcudf/pylibcudf/transform.pyi @@ -0,0 +1,16 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. +from pylibcudf.column import Column +from pylibcudf.expressions import Expression +from pylibcudf.gpumemoryview import gpumemoryview +from pylibcudf.table import Table +from pylibcudf.types import DataType + +def nans_to_nulls(input: Column) -> tuple[gpumemoryview, int]: ... +def compute_column(input: Table, expr: Expression) -> Column: ... +def bools_to_mask(input: Column) -> tuple[gpumemoryview, int]: ... +def mask_to_bools(bitmask: int, begin_bit: int, end_bit: int) -> Column: ... +def transform( + input: Column, unary_udf: str, output_type: DataType, is_ptx: bool +) -> Column: ... +def encode(input: Table) -> tuple[Table, Column]: ... +def one_hot_encode(input: Column, categories: Column) -> Table: ... 
diff --git a/python/pylibcudf/pylibcudf/transform.pyx b/python/pylibcudf/pylibcudf/transform.pyx index e8d95cadb0c..9700bcff221 100644 --- a/python/pylibcudf/pylibcudf/transform.pyx +++ b/python/pylibcudf/pylibcudf/transform.pyx @@ -18,6 +18,15 @@ from .gpumemoryview cimport gpumemoryview from .types cimport DataType from .utils cimport int_to_bitmask_ptr +__all__ = [ + "bools_to_mask", + "compute_column", + "encode", + "mask_to_bools", + "nans_to_nulls", + "one_hot_encode", + "transform", +] cpdef tuple[gpumemoryview, int] nans_to_nulls(Column input): """Create a null mask preserving existing nulls and converting nans to null. diff --git a/python/pylibcudf/pylibcudf/transpose.pyi b/python/pylibcudf/pylibcudf/transpose.pyi new file mode 100644 index 00000000000..a84ab8a60ea --- /dev/null +++ b/python/pylibcudf/pylibcudf/transpose.pyi @@ -0,0 +1,4 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. +from pylibcudf.table import Table + +def transpose(input_table: Table) -> Table: ... diff --git a/python/pylibcudf/pylibcudf/transpose.pyx b/python/pylibcudf/pylibcudf/transpose.pyx index a24f937ced3..5eb3e58cebc 100644 --- a/python/pylibcudf/pylibcudf/transpose.pyx +++ b/python/pylibcudf/pylibcudf/transpose.pyx @@ -9,6 +9,7 @@ from pylibcudf.libcudf.table.table_view cimport table_view from .column cimport Column from .table cimport Table +__all__ = ["transpose"] cpdef Table transpose(Table input_table): """Transpose a Table. diff --git a/python/pylibcudf/pylibcudf/types.pyi b/python/pylibcudf/pylibcudf/types.pyi new file mode 100644 index 00000000000..c91a95414bd --- /dev/null +++ b/python/pylibcudf/pylibcudf/types.pyi @@ -0,0 +1,86 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. +from enum import IntEnum +from typing import Final + +class Interpolation(IntEnum): + LINEAR = ... + LOWER = ... + HIGHER = ... + MIDPOINT = ... + NEAREST = ... + +class MaskState(IntEnum): + UNALLOCATED = ... + UNINITIALIZED = ... + ALL_VALID = ... + ALL_NULL = ... 
+ +class NanEquality(IntEnum): + ALL_EQUAL = ... + UNEQUAL = ... + +class NanPolicy(IntEnum): + NAN_IS_NULL = ... + NAN_IS_VALID = ... + +class NullEquality(IntEnum): + EQUAL = ... + UNEQUAL = ... + +class NullOrder(IntEnum): + AFTER = ... + BEFORE = ... + +class NullPolicy(IntEnum): + EXCLUDE = ... + INCLUDE = ... + +class Order(IntEnum): + ASCENDING = ... + DESCENDING = ... + +class Sorted(IntEnum): + NO = ... + YES = ... + +class TypeId(IntEnum): + EMPTY = ... + INT8 = ... + INT16 = ... + INT32 = ... + INT64 = ... + UINT8 = ... + UINT16 = ... + UINT32 = ... + UINT64 = ... + FLOAT32 = ... + FLOAT64 = ... + BOOL8 = ... + TIMESTAMP_DAYS = ... + TIMESTAMP_SECONDS = ... + TIMESTAMP_MILLISECONDS = ... + TIMESTAMP_MICROSECONDS = ... + TIMESTAMP_NANOSECONDS = ... + DURATION_DAYS = ... + DURATION_SECONDS = ... + DURATION_MILLISECONDS = ... + DURATION_MICROSECONDS = ... + DURATION_NANOSECONDS = ... + DICTIONARY32 = ... + STRING = ... + LIST = ... + DECIMAL32 = ... + DECIMAL64 = ... + DECIMAL128 = ... + STRUCT = ... + NUM_TYPE_IDS = ... + +class DataType: + def __init__(self, type_id: TypeId, scale: int = 0): ... + def id(self) -> TypeId: ... + def scale(self) -> int: ... + +def size_of(t: DataType) -> int: ... 
+ +SIZE_TYPE: Final[DataType] +SIZE_TYPE_ID: Final[TypeId] diff --git a/python/pylibcudf/pylibcudf/types.pyx b/python/pylibcudf/pylibcudf/types.pyx index a0c31f994a3..afa1b56f38a 100644 --- a/python/pylibcudf/pylibcudf/types.pyx +++ b/python/pylibcudf/pylibcudf/types.pyx @@ -20,6 +20,22 @@ from pylibcudf.libcudf.types import null_order as NullOrder # no-cython-lint, i from pylibcudf.libcudf.types import order as Order # no-cython-lint, isort:skip from pylibcudf.libcudf.types import sorted as Sorted # no-cython-lint, isort:skip +__all__ = [ + "DataType", + "Interpolation", + "MaskState", + "NanEquality", + "NanPolicy", + "NullEquality", + "NullOrder", + "NullPolicy", + "Order", + "SIZE_TYPE", + "SIZE_TYPE_ID", + "Sorted", + "TypeId", + "size_of" +] cdef class DataType: """Indicator for the logical data type of an element in a column. diff --git a/python/pylibcudf/pylibcudf/unary.pyi b/python/pylibcudf/pylibcudf/unary.pyi new file mode 100644 index 00000000000..7aa23b618f4 --- /dev/null +++ b/python/pylibcudf/pylibcudf/unary.pyi @@ -0,0 +1,38 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from enum import IntEnum + +from pylibcudf.column import Column +from pylibcudf.types import DataType + +class UnaryOperator(IntEnum): + SIN = ... + COS = ... + TAN = ... + ARCSIN = ... + ARCCOS = ... + ARCTAN = ... + SINH = ... + COSH = ... + TANH = ... + ARCSINH = ... + ARCCOSH = ... + ARCTANH = ... + EXP = ... + LOG = ... + SQRT = ... + CBRT = ... + CEIL = ... + FLOOR = ... + ABS = ... + RINT = ... + BIT_INVERT = ... + NOT = ... + +def unary_operation(input: Column, op: UnaryOperator) -> Column: ... +def is_null(input: Column) -> Column: ... +def is_valid(input: Column) -> Column: ... +def cast(input: Column, data_type: DataType) -> Column: ... +def is_nan(input: Column) -> Column: ... +def is_not_nan(input: Column) -> Column: ... +def is_supported_cast(from_: DataType, to: DataType) -> bool: ... 
diff --git a/python/pylibcudf/pylibcudf/unary.pyx b/python/pylibcudf/pylibcudf/unary.pyx index 53e8c382b5e..b738ab53d1b 100644 --- a/python/pylibcudf/pylibcudf/unary.pyx +++ b/python/pylibcudf/pylibcudf/unary.pyx @@ -13,6 +13,16 @@ from pylibcudf.libcudf.unary import \ from .column cimport Column from .types cimport DataType +__all__ = [ + "UnaryOperator", + "cast", + "is_nan", + "is_not_nan", + "is_null", + "is_supported_cast", + "is_valid", + "unary_operation", +] cpdef Column unary_operation(Column input, unary_operator op): """Perform a unary operation on a column. diff --git a/python/pylibcudf/pyproject.toml b/python/pylibcudf/pyproject.toml index ac3018b9333..83ed95823da 100644 --- a/python/pylibcudf/pyproject.toml +++ b/python/pylibcudf/pyproject.toml @@ -56,13 +56,30 @@ Documentation = "https://docs.rapids.ai/api/cudf/stable/" [tool.ruff] extend = "../../pyproject.toml" +[tool.ruff.lint] +extend-select = [ + "TCH", # flake8-type-checking + "TID", # flake8-tidy-imports + "PT", # flake8-pytest-style +] +extend-ignore = [ + "PT011", # pytest.raises(...) 
is too broad +] + +[tool.ruff.lint.flake8-pytest-style] +# https://docs.astral.sh/ruff/settings/#lintflake8-pytest-style +fixture-parentheses = false +mark-parentheses = false +parametrize-names-type = "csv" +parametrize-values-type = "list" +parametrize-values-row-type = "tuple" + [tool.ruff.lint.isort] combine-as-imports = true -known-first-party = ["cudf"] -section-order = ["future", "standard-library", "third-party", "dask", "rapids", "first-party", "local-folder"] +known-first-party = ["pylibcudf"] +section-order = ["future", "standard-library", "third-party", "rapids", "first-party", "local-folder"] [tool.ruff.lint.isort.sections] -dask = ["dask", "distributed", "dask_cuda"] rapids = ["rmm"] [tool.ruff.lint.per-file-ignores] From 796de4bd5131c38428b609c543323193f298624e Mon Sep 17 00:00:00 2001 From: David Wendt <45795991+davidwendt@users.noreply.github.com> Date: Tue, 12 Nov 2024 11:59:04 -0500 Subject: [PATCH 16/19] Add cudf::strings::contains_multiple (#16900) Add new `cudf::strings::contains_multiple` API to search multiple targets within a strings column. Output is a table where the number of columns is the number of targets and each row is a boolean indicating that target was found at the row or not. 
This PR is to help in collaboration with #16641 Authors: - David Wendt (https://github.com/davidwendt) - GALI PREM SAGAR (https://github.com/galipremsagar) - Chong Gao (https://github.com/res-life) - Bradley Dice (https://github.com/bdice) Approvers: - Chong Gao (https://github.com/res-life) - Yunsong Wang (https://github.com/PointKernel) - MithunR (https://github.com/mythrocks) - Tianyu Liu (https://github.com/kingcrimsontianyu) - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/cudf/pull/16900 --- cpp/CMakeLists.txt | 1 + cpp/benchmarks/CMakeLists.txt | 1 + cpp/benchmarks/string/find.cpp | 14 +- cpp/benchmarks/string/find_multiple.cpp | 77 +++++ cpp/include/cudf/strings/find_multiple.hpp | 40 ++- cpp/src/strings/search/contains_multiple.cu | 316 ++++++++++++++++++++ cpp/src/strings/search/find_multiple.cu | 5 +- cpp/tests/strings/find_multiple_tests.cpp | 155 +++++++++- cpp/tests/strings/find_tests.cpp | 4 +- 9 files changed, 592 insertions(+), 21 deletions(-) create mode 100644 cpp/benchmarks/string/find_multiple.cpp create mode 100644 cpp/src/strings/search/contains_multiple.cu diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 65b05fd518b..e237b0b2856 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -705,6 +705,7 @@ add_library( src/strings/replace/replace_slice.cu src/strings/reverse.cu src/strings/scan/scan_inclusive.cu + src/strings/search/contains_multiple.cu src/strings/search/findall.cu src/strings/search/find.cu src/strings/search/find_multiple.cu diff --git a/cpp/benchmarks/CMakeLists.txt b/cpp/benchmarks/CMakeLists.txt index 59f5602fd5a..419b78db9b0 100644 --- a/cpp/benchmarks/CMakeLists.txt +++ b/cpp/benchmarks/CMakeLists.txt @@ -375,6 +375,7 @@ ConfigureNVBench( string/count.cpp string/extract.cpp string/find.cpp + string/find_multiple.cpp string/join_strings.cpp string/lengths.cpp string/like.cpp diff --git a/cpp/benchmarks/string/find.cpp b/cpp/benchmarks/string/find.cpp index 996bdcf0332..3ea3ff13a2f 
100644 --- a/cpp/benchmarks/string/find.cpp +++ b/cpp/benchmarks/string/find.cpp @@ -20,9 +20,7 @@ #include #include -#include #include -#include #include #include @@ -44,15 +42,13 @@ static void bench_find_string(nvbench::state& state) auto const col = create_string_column(n_rows, row_width, hit_rate); auto const input = cudf::strings_column_view(col->view()); - std::vector h_targets({"5W", "5W43", "0987 5W43"}); - cudf::string_scalar target(h_targets[2]); - cudf::test::strings_column_wrapper targets(h_targets.begin(), h_targets.end()); + cudf::string_scalar target("0987 5W43"); state.set_cuda_stream(nvbench::make_cuda_stream_view(stream.value())); auto const chars_size = input.chars_size(stream); state.add_element_count(chars_size, "chars_size"); state.add_global_memory_reads(chars_size); - if (api.substr(0, 4) == "find") { + if (api == "find") { state.add_global_memory_writes(input.size()); } else { state.add_global_memory_writes(input.size()); @@ -61,10 +57,6 @@ static void bench_find_string(nvbench::state& state) if (api == "find") { state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { cudf::strings::find(input, target); }); - } else if (api == "find_multi") { - state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { - cudf::strings::find_multiple(input, cudf::strings_column_view(targets)); - }); } else if (api == "contains") { state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { cudf::strings::contains(input, target); }); @@ -79,7 +71,7 @@ static void bench_find_string(nvbench::state& state) NVBENCH_BENCH(bench_find_string) .set_name("find_string") - .add_string_axis("api", {"find", "find_multi", "contains", "starts_with", "ends_with"}) + .add_string_axis("api", {"find", "contains", "starts_with", "ends_with"}) .add_int64_axis("row_width", {32, 64, 128, 256, 512, 1024}) .add_int64_axis("num_rows", {260'000, 1'953'000, 16'777'216}) .add_int64_axis("hit_rate", {20, 80}); // percentage diff --git 
a/cpp/benchmarks/string/find_multiple.cpp b/cpp/benchmarks/string/find_multiple.cpp new file mode 100644 index 00000000000..0e780fdb302 --- /dev/null +++ b/cpp/benchmarks/string/find_multiple.cpp @@ -0,0 +1,77 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include + +#include + +#include +#include +#include +#include + +#include + +static void bench_find_string(nvbench::state& state) +{ + auto const n_rows = static_cast(state.get_int64("num_rows")); + auto const row_width = static_cast(state.get_int64("row_width")); + auto const hit_rate = static_cast(state.get_int64("hit_rate")); + auto const target_count = static_cast(state.get_int64("targets")); + auto const api = state.get_string("api"); + + auto const stream = cudf::get_default_stream(); + auto const col = create_string_column(n_rows, row_width, hit_rate); + auto const input = cudf::strings_column_view(col->view()); + + // Note that these all match the first row of the raw_data in create_string_column. + // This is so the hit_rate can be properly accounted for. 
+ std::vector const target_data( + {" abc", "W43", "0987 5W43", "123 abc", "23 abc", "3 abc", "7 5W43", "87 5W43", "987 5W43"}); + auto h_targets = std::vector{}; + for (cudf::size_type i = 0; i < target_count; i++) { + h_targets.emplace_back(target_data[i % target_data.size()]); + } + cudf::test::strings_column_wrapper targets(h_targets.begin(), h_targets.end()); + + state.set_cuda_stream(nvbench::make_cuda_stream_view(stream.value())); + auto const chars_size = input.chars_size(stream); + state.add_global_memory_reads(chars_size); + if (api == "find") { + state.add_global_memory_writes(input.size()); + } else { + state.add_global_memory_writes(input.size()); + } + + if (api == "find") { + state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { + cudf::strings::find_multiple(input, cudf::strings_column_view(targets)); + }); + } else if (api == "contains") { + state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { + cudf::strings::contains_multiple(input, cudf::strings_column_view(targets)); + }); + } +} + +NVBENCH_BENCH(bench_find_string) + .set_name("find_multiple") + .add_string_axis("api", {"find", "contains"}) + .add_int64_axis("targets", {10, 20, 40}) + .add_int64_axis("row_width", {32, 64, 128, 256}) + .add_int64_axis("num_rows", {32768, 262144, 2097152}) + .add_int64_axis("hit_rate", {20, 80}); // percentage diff --git a/cpp/include/cudf/strings/find_multiple.hpp b/cpp/include/cudf/strings/find_multiple.hpp index 1fe446db8da..e090766dd07 100644 --- a/cpp/include/cudf/strings/find_multiple.hpp +++ b/cpp/include/cudf/strings/find_multiple.hpp @@ -28,8 +28,42 @@ namespace strings { */ /** - * @brief Returns a lists column with character position values where each - * of the target strings are found in each string. + * @brief Searches for the given target strings within each string in the provided column + * + * Each column in the result table corresponds to the result for the target string at the same + * ordinal. i.e. 
0th column is the BOOL8 column result for the 0th target string, 1st for 1st, + * etc. + * + * If the target is not found for a string, false is returned for that entry in the output column. + * If the target is an empty string, true is returned for all non-null entries in the output column. + * + * Any null input strings return corresponding null entries in the output columns. + * + * @code{.pseudo} + * input = ["a", "b", "c"] + * targets = ["a", "c"] + * output is a table with two boolean columns: + * column 0: [true, false, false] + * column 1: [false, false, true] + * @endcode + * + * @throw std::invalid_argument if `targets` is empty or contains nulls + * + * @param input Strings instance for this operation + * @param targets UTF-8 encoded strings to search for in each string in `input` + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned column's device memory + * @return Table of BOOL8 columns + */ +std::unique_ptr
contains_multiple( + strings_column_view const& input, + strings_column_view const& targets, + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + +/** + * @brief Searches for the given target strings within each string in the provided column + * and returns the position the targets were found * * The size of the output column is `input.size()`. * Each row of the output column is of size `targets.size()`. @@ -45,7 +79,7 @@ namespace strings { * [-1,-1, 1 ]} // for "def": "a" and "b" not found, "e" at pos 1 * @endcode * - * @throw cudf::logic_error if `targets` is empty or contains nulls + * @throw std::invalid_argument if `targets` is empty or contains nulls * * @param input Strings instance for this operation * @param targets Strings to search for in each string diff --git a/cpp/src/strings/search/contains_multiple.cu b/cpp/src/strings/search/contains_multiple.cu new file mode 100644 index 00000000000..1183e3e4038 --- /dev/null +++ b/cpp/src/strings/search/contains_multiple.cu @@ -0,0 +1,316 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +namespace cudf { +namespace strings { +namespace detail { +namespace { + +/** + * @brief Threshold to decide on using string or warp parallel functions. + * + * If the average byte length of a string in a column exceeds this value then + * a warp-parallel function is used. + */ +constexpr size_type AVG_CHAR_BYTES_THRESHOLD = 64; + +/** + * @brief Kernel for finding multiple targets in each row of input strings + * + * The d_first_bytes is sorted and unique so the d_indices and d_offsets + * are used to map the corresponding character to its d_targets entry. + * + * Example + * d_targets = ["foo", "hello", "world", "hi"] + * - sorted first-chars: ['f','h','h','w'] + * d_indices = [0, 3, 1, 2] + * d_first_bytes = ['f', 'h', 'w'] (unique) + * d_offsets = [0, 1, 3] + * unique_count = 3 + * + * If 'h' is found, lower_bound produces pos=1 in d_first_bytes. + * This corresponds to d_offsets[1]==1 which has two values: + * - (d_offsets[2] - d_offsets[1]) = (3 - 1) = 2. + * Set map_idx = d_offsets[1] = 1 and the two targets to check are sequential + * in the d_indices array: + * - tgt1_idx = d_indices[map_idx] = 3 --> d_targets[3] == 'hi' + * - tgt2_idx = d_indices[map_idx+1] = 1 --> d_targets[1] == 'hello' + * The logic now only needs to check for either of these 2 targets. + * + * This kernel works in either thread-per-string or warp-per-string depending + * on the template parameter. If tile_size==1, then this kernel executes as + * a thread-per-string. If tile_size=32, then it executes as a warp-per-string. + * No other options are supported for now. 
+ * + * @tparam tile_size Number of threads per string + * @param d_strings Input strings + * @param d_targets Target strings to search within input strings + * @param d_first_bytes Sorted, unique list of first bytes of the target strings + * @param d_indices Indices to map sorted d_first_bytes to d_targets + * @param d_offsets Offsets to map d_indices to d_targets + * @param unique_count Number of unique values in d_first_bytes (and d_offsets) + * @param working_memory Global memory to use if shared-memory is too small + * @param d_results Bool results for each target within each string row + */ +template +CUDF_KERNEL void multi_contains_kernel(column_device_view const d_strings, + column_device_view const d_targets, + u_char const* d_first_bytes, + size_type const* d_indices, + size_type const* d_offsets, + size_type unique_count, + bool* working_memory, + cudf::device_span d_results) +{ + auto const idx = cudf::detail::grid_1d::global_thread_id(); + auto const str_idx = idx / tile_size; + if (str_idx >= d_strings.size()) { return; } + if (d_strings.is_null(str_idx)) { return; } + + // get the string for this tile + auto const d_str = d_strings.element(str_idx); + + namespace cg = cooperative_groups; + auto const tile = cg::tiled_partition(cg::this_thread_block()); + auto const lane_idx = tile.thread_rank(); + auto const num_targets = d_targets.size(); + + // size of shared_bools = num_targets * block_size + // each thread uses num_targets bools + extern __shared__ bool shared_bools[]; + // bools for the current string + auto bools = working_memory == nullptr + ? 
(shared_bools + (tile.meta_group_rank() * tile_size * num_targets)) + : (working_memory + (str_idx * tile_size * num_targets)); + + // initialize result: set true if target is empty, false otherwise + for (auto target_idx = lane_idx; target_idx < num_targets; target_idx += tile_size) { + auto const d_target = d_targets.element(target_idx); + if constexpr (tile_size == 1) { + d_results[target_idx][str_idx] = d_target.empty(); + } else { + auto const begin = bools + (target_idx * tile_size); + thrust::uninitialized_fill(thrust::seq, begin, begin + tile_size, d_target.empty()); + } + } + tile.sync(); + + auto const last_ptr = d_first_bytes + unique_count; + for (size_type str_byte_idx = lane_idx; str_byte_idx < d_str.size_bytes(); + str_byte_idx += tile_size) { + // search for byte in first_bytes array + auto const sptr = d_str.data() + str_byte_idx; + auto const chr = static_cast(*sptr); + auto const byte_ptr = thrust::lower_bound(thrust::seq, d_first_bytes, last_ptr, chr); + // if not found, continue to next byte + if ((byte_ptr == last_ptr) || (*byte_ptr != chr)) { continue; } + // compute index of matched byte + auto const offset_idx = static_cast(thrust::distance(d_first_bytes, byte_ptr)); + auto map_idx = d_offsets[offset_idx]; + auto const last_idx = (offset_idx + 1) < unique_count ? d_offsets[offset_idx + 1] : num_targets; + // check for targets that begin with chr + while (map_idx < last_idx) { + auto const target_idx = d_indices[map_idx++]; + auto const bool_idx = (target_idx * tile_size) + lane_idx; + auto const found = tile_size == 1 ? 
d_results[target_idx][str_idx] : bools[bool_idx]; + if (!found) { // not found before + auto const d_target = d_targets.element(target_idx); + if ((d_str.size_bytes() - str_byte_idx) >= d_target.size_bytes()) { + // first char already checked, so just check the [1, end) chars match + auto const tp = d_target.data(); + if (thrust::equal(thrust::seq, tp + 1, tp + d_target.size_bytes(), sptr + 1)) { + if constexpr (tile_size == 1) { + d_results[target_idx][str_idx] = true; + } else { + bools[bool_idx] = true; + } + } + } + } + } + } + + if constexpr (tile_size > 1) { + tile.sync(); + // reduce the bools for each target to store in the result + for (auto target_idx = lane_idx; target_idx < num_targets; target_idx += tile_size) { + auto const begin = bools + (target_idx * tile_size); + d_results[target_idx][str_idx] = + thrust::any_of(thrust::seq, begin, begin + tile_size, thrust::identity{}); + // cooperative_group any() implementation was almost 3x slower than this parallel reduce + } + } +} +} // namespace + +std::unique_ptr
contains_multiple(strings_column_view const& input, + strings_column_view const& targets, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + CUDF_EXPECTS( + not targets.is_empty(), "Must specify at least one target string.", std::invalid_argument); + CUDF_EXPECTS(not targets.has_nulls(), "Target strings cannot be null", std::invalid_argument); + + auto const d_strings = column_device_view::create(input.parent(), stream); + auto const d_targets = column_device_view::create(targets.parent(), stream); + + // copy the first byte of each target and sort them + auto first_bytes = rmm::device_uvector(targets.size(), stream); + auto indices = rmm::device_uvector(targets.size(), stream); + { + auto tgt_itr = thrust::make_transform_iterator( + d_targets->begin(), + cuda::proclaim_return_type([] __device__(auto const& d_tgt) -> u_char { + return d_tgt.empty() ? u_char{0} : static_cast(d_tgt.data()[0]); + })); + auto count_itr = thrust::make_counting_iterator(0); + auto keys_out = first_bytes.begin(); + auto vals_out = indices.begin(); + auto num_items = targets.size(); + auto cmp_op = thrust::less(); + auto sv = stream.value(); + + std::size_t tmp_bytes = 0; + cub::DeviceMergeSort::SortPairsCopy( + nullptr, tmp_bytes, tgt_itr, count_itr, keys_out, vals_out, num_items, cmp_op, sv); + auto tmp_stg = rmm::device_buffer(tmp_bytes, stream); + cub::DeviceMergeSort::SortPairsCopy( + tmp_stg.data(), tmp_bytes, tgt_itr, count_itr, keys_out, vals_out, num_items, cmp_op, sv); + } + + // remove duplicates to help speed up lower_bound + auto offsets = rmm::device_uvector(targets.size(), stream); + thrust::sequence(rmm::exec_policy_nosync(stream), offsets.begin(), offsets.end()); + auto const end = thrust::unique_by_key( + rmm::exec_policy_nosync(stream), first_bytes.begin(), first_bytes.end(), offsets.begin()); + auto const unique_count = + static_cast(thrust::distance(first_bytes.begin(), end.first)); + + // create output columns + auto const results_iter = 
cudf::detail::make_counting_transform_iterator(0, [&](int i) { + return make_numeric_column(data_type{type_id::BOOL8}, + input.size(), + cudf::detail::copy_bitmask(input.parent(), stream, mr), + input.null_count(), + stream, + mr); + }); + auto results = std::vector>(results_iter, results_iter + targets.size()); + auto d_results = [&] { + auto host_results_pointer_iter = + thrust::make_transform_iterator(results.begin(), [](auto const& results_column) { + return results_column->mutable_view().template data(); + }); + auto host_results_pointers = + std::vector(host_results_pointer_iter, host_results_pointer_iter + results.size()); + return cudf::detail::make_device_uvector_async(host_results_pointers, stream, mr); + }(); + + constexpr cudf::thread_index_type block_size = 256; + // calculated (benchmarked) for efficient use of shared-memory + constexpr size_type targets_threshold = 32; + + auto d_first_bytes = first_bytes.data(); + auto d_indices = indices.data(); + auto d_offsets = offsets.data(); + + bool const row_parallel = ((input.null_count() == input.size()) || + ((input.chars_size(stream) / (input.size() - input.null_count())) <= + AVG_CHAR_BYTES_THRESHOLD)); + + if (row_parallel) { + // Smaller strings perform better with a row per string + cudf::detail::grid_1d grid{static_cast(input.size()), block_size}; + multi_contains_kernel<1> + <<>>(*d_strings, + *d_targets, + d_first_bytes, + d_indices, + d_offsets, + unique_count, + nullptr, + d_results); + } else { + constexpr cudf::thread_index_type tile_size = cudf::detail::warp_size; + + auto const shared_mem_size = + (targets.size() <= targets_threshold) ? (block_size * targets.size()) : 0; + auto const work_mem_size = + (targets.size() <= targets_threshold) ? 
0 : tile_size * targets.size() * input.size(); + auto working_memory = rmm::device_uvector(work_mem_size, stream); + + cudf::detail::grid_1d grid{static_cast(input.size()) * tile_size, + block_size}; + multi_contains_kernel + <<>>( + *d_strings, + *d_targets, + d_first_bytes, + d_indices, + d_offsets, + unique_count, + working_memory.data(), + d_results); + } + + return std::make_unique
(std::move(results)); +} + +} // namespace detail + +std::unique_ptr
contains_multiple(strings_column_view const& strings, + strings_column_view const& targets, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + CUDF_FUNC_RANGE(); + return detail::contains_multiple(strings, targets, stream, mr); +} + +} // namespace strings +} // namespace cudf diff --git a/cpp/src/strings/search/find_multiple.cu b/cpp/src/strings/search/find_multiple.cu index ec7015878dd..67226b259d4 100644 --- a/cpp/src/strings/search/find_multiple.cu +++ b/cpp/src/strings/search/find_multiple.cu @@ -42,8 +42,9 @@ std::unique_ptr find_multiple(strings_column_view const& input, { auto const strings_count = input.size(); auto const targets_count = targets.size(); - CUDF_EXPECTS(targets_count > 0, "Must include at least one search target"); - CUDF_EXPECTS(!targets.has_nulls(), "Search targets cannot contain null strings"); + CUDF_EXPECTS(targets_count > 0, "Must include at least one search target", std::invalid_argument); + CUDF_EXPECTS( + !targets.has_nulls(), "Search targets cannot contain null strings", std::invalid_argument); auto strings_column = column_device_view::create(input.parent(), stream); auto d_strings = *strings_column; diff --git a/cpp/tests/strings/find_multiple_tests.cpp b/cpp/tests/strings/find_multiple_tests.cpp index 41a5940c880..3c8483b153d 100644 --- a/cpp/tests/strings/find_multiple_tests.cpp +++ b/cpp/tests/strings/find_multiple_tests.cpp @@ -17,6 +17,7 @@ #include #include #include +#include #include #include @@ -75,8 +76,158 @@ TEST_F(StringsFindMultipleTest, ErrorTest) auto const zero_size_strings_column = cudf::make_empty_column(cudf::type_id::STRING)->view(); auto empty_view = cudf::strings_column_view(zero_size_strings_column); // targets must have at least one string - EXPECT_THROW(cudf::strings::find_multiple(strings_view, empty_view), cudf::logic_error); + EXPECT_THROW(cudf::strings::find_multiple(strings_view, empty_view), std::invalid_argument); + EXPECT_THROW(cudf::strings::contains_multiple(strings_view, 
empty_view), std::invalid_argument); // targets cannot have nulls - EXPECT_THROW(cudf::strings::find_multiple(strings_view, strings_view), cudf::logic_error); + EXPECT_THROW(cudf::strings::find_multiple(strings_view, strings_view), std::invalid_argument); + EXPECT_THROW(cudf::strings::contains_multiple(strings_view, strings_view), std::invalid_argument); +} + +TEST_F(StringsFindMultipleTest, MultiContains) +{ + constexpr int num_rows = 1024 + 1; + // replicate the following 9 rows: + std::vector s = { + "Héllo, there world and goodbye", + "quick brown fox jumped over the lazy brown dog; the fat cats jump in place without moving", + "the following code snippet demonstrates how to use search for values in an ordered range", + "it returns the last position where value could be inserted without violating the ordering", + "algorithms execution is parallelized as determined by an execution policy. t", + "he this is a continuation of previous row to make sure string boundaries are honored", + "abcdefghijklmnopqrstuvwxyz 0123456789 ABCDEFGHIJKLMNOPQRSTUVWXYZ !@#$%^&*()~", + "", + ""}; + + // replicate strings + auto string_itr = + cudf::detail::make_counting_transform_iterator(0, [&](auto i) { return s[i % s.size()]; }); + + // nulls: 8, 8 + 1 * 9, 8 + 2 * 9 ...... 
+ auto string_v = cudf::detail::make_counting_transform_iterator( + 0, [&](auto i) { return (i + 1) % s.size() != 0; }); + + auto const strings = + cudf::test::strings_column_wrapper(string_itr, string_itr + num_rows, string_v); + auto strings_view = cudf::strings_column_view(strings); + std::vector match_targets({" the ", "a", "", "é"}); + cudf::test::strings_column_wrapper multi_targets_column(match_targets.begin(), + match_targets.end()); + auto results = + cudf::strings::contains_multiple(strings_view, cudf::strings_column_view(multi_targets_column)); + + std::vector ret_0 = {0, 1, 0, 1, 0, 0, 0, 0, 0}; + std::vector ret_1 = {1, 1, 1, 1, 1, 1, 1, 0, 0}; + std::vector ret_2 = {1, 1, 1, 1, 1, 1, 1, 1, 0}; + std::vector ret_3 = {1, 0, 0, 0, 0, 0, 0, 0, 0}; + + auto make_bool_col_fn = [&string_v, &num_rows](std::vector bools) { + auto iter = cudf::detail::make_counting_transform_iterator( + 0, [&](auto i) { return bools[i % bools.size()]; }); + return cudf::test::fixed_width_column_wrapper(iter, iter + num_rows, string_v); + }; + + auto expected_0 = make_bool_col_fn(ret_0); + auto expected_1 = make_bool_col_fn(ret_1); + auto expected_2 = make_bool_col_fn(ret_2); + auto expected_3 = make_bool_col_fn(ret_3); + + auto expected = cudf::table_view({expected_0, expected_1, expected_2, expected_3}); + CUDF_TEST_EXPECT_TABLES_EQUIVALENT(results->view(), expected); +} + +TEST_F(StringsFindMultipleTest, MultiContainsMoreTargets) +{ + auto const strings = cudf::test::strings_column_wrapper{ + "quick brown fox jumped over the lazy brown dog; the fat cats jump in place without moving " + "quick brown fox jumped", + "the following code snippet demonstrates how to use search for values in an ordered rangethe " + "following code snippet", + "thé it returns the last position where value could be inserted without violating ordering thé " + "it returns the last position"}; + auto strings_view = cudf::strings_column_view(strings); + std::vector targets({"lazy brown", "non-exist", 
""}); + + std::vector> expects; + expects.push_back(cudf::test::fixed_width_column_wrapper({1, 0, 0})); + expects.push_back(cudf::test::fixed_width_column_wrapper({0, 0, 0})); + expects.push_back(cudf::test::fixed_width_column_wrapper({1, 1, 1})); + + std::vector match_targets; + int max_num_targets = 50; + + for (int num_targets = 1; num_targets < max_num_targets; num_targets++) { + match_targets.clear(); + for (int i = 0; i < num_targets; i++) { + match_targets.push_back(targets[i % targets.size()]); + } + + cudf::test::strings_column_wrapper multi_targets_column(match_targets.begin(), + match_targets.end()); + auto results = cudf::strings::contains_multiple( + strings_view, cudf::strings_column_view(multi_targets_column)); + EXPECT_EQ(results->num_columns(), num_targets); + for (int i = 0; i < num_targets; i++) { + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(results->get_column(i), expects[i % expects.size()]); + } + } +} + +TEST_F(StringsFindMultipleTest, MultiContainsLongStrings) +{ + constexpr int num_rows = 1024 + 1; + // replicate the following 7 rows: + std::vector s = { + "quick brown fox jumped over the lazy brown dog; the fat cats jump in place without moving " + "quick brown fox jumped", + "the following code snippet demonstrates how to use search for values in an ordered rangethe " + "following code snippet", + "thé it returns the last position where value could be inserted without violating ordering thé " + "it returns the last position", + "algorithms execution is parallelized as determined by an execution policy. 
t algorithms " + "execution is parallelized as ", + "he this is a continuation of previous row to make sure string boundaries are honored he this " + "is a continuation of previous row", + "abcdefghijklmnopqrstuvwxyz 0123456789 ABCDEFGHIJKLMNOPQRSTUVWXYZ " + "!@#$%^&*()~abcdefghijklmnopqrstuvwxyz 0123456789 ABCDEFGHIJKL", + ""}; + + // replicate strings + auto string_itr = + cudf::detail::make_counting_transform_iterator(0, [&](auto i) { return s[i % s.size()]; }); + + // nulls: 6, 6 + 1 * 7, 6 + 2 * 7 ...... + auto string_v = cudf::detail::make_counting_transform_iterator( + 0, [&](auto i) { return (i + 1) % s.size() != 0; }); + + auto const strings = + cudf::test::strings_column_wrapper(string_itr, string_itr + num_rows, string_v); + + auto sv = cudf::strings_column_view(strings); + auto targets = cudf::test::strings_column_wrapper({" the ", "search", "", "string", "ox", "é "}); + auto results = cudf::strings::contains_multiple(sv, cudf::strings_column_view(targets)); + + std::vector ret_0 = {1, 0, 1, 0, 0, 0, 0}; + std::vector ret_1 = {0, 1, 0, 0, 0, 0, 0}; + std::vector ret_2 = {1, 1, 1, 1, 1, 1, 0}; + std::vector ret_3 = {0, 0, 0, 0, 1, 0, 0}; + std::vector ret_4 = {1, 0, 0, 0, 0, 0, 0}; + std::vector ret_5 = {0, 0, 1, 0, 0, 0, 0}; + + auto make_bool_col_fn = [&string_v, &num_rows](std::vector bools) { + auto iter = cudf::detail::make_counting_transform_iterator( + 0, [&](auto i) { return bools[i % bools.size()]; }); + return cudf::test::fixed_width_column_wrapper(iter, iter + num_rows, string_v); + }; + + auto expected_0 = make_bool_col_fn(ret_0); + auto expected_1 = make_bool_col_fn(ret_1); + auto expected_2 = make_bool_col_fn(ret_2); + auto expected_3 = make_bool_col_fn(ret_3); + auto expected_4 = make_bool_col_fn(ret_4); + auto expected_5 = make_bool_col_fn(ret_5); + + auto expected = + cudf::table_view({expected_0, expected_1, expected_2, expected_3, expected_4, expected_5}); + CUDF_TEST_EXPECT_TABLES_EQUIVALENT(results->view(), expected); } diff --git 
a/cpp/tests/strings/find_tests.cpp b/cpp/tests/strings/find_tests.cpp index 2da95ba5c27..a3066c40650 100644 --- a/cpp/tests/strings/find_tests.cpp +++ b/cpp/tests/strings/find_tests.cpp @@ -17,16 +17,14 @@ #include #include #include +#include -#include #include #include #include #include #include -#include - #include struct StringsFindTest : public cudf::test::BaseFixture {}; From 1f9ad2f33867789d734c9be9bbacaabe1e348884 Mon Sep 17 00:00:00 2001 From: James Lamb Date: Tue, 12 Nov 2024 16:20:29 -0600 Subject: [PATCH 17/19] enforce wheel size limits, README formatting in CI (#17284) Contributes to https://github.com/rapidsai/build-planning/issues/110 Proposes adding 2 types of validation on wheels in CI, to ensure we continue to produce wheels that are suitable for PyPI. * checks on wheel size (compressed), - *to be sure they're under PyPI limits* - *and to prompt discussion on PRs that significantly increase wheel sizes* * checks on README formatting - *to ensure they'll render properly as the PyPI project homepages* - *e.g. like how https://github.com/scikit-learn/scikit-learn/blob/main/README.rst becomes https://pypi.org/project/scikit-learn/* ## Notes for Reviewers ### How I tested this Initially set the size threshold for `libcudf` to a value that I knew it'd violate (75MB compressed, when the wheels are 400+ MB compressed). Saw CI fail as expected, and print a summary with the expected contents. 
```text checking 'final_dist/libcudf_cu11-24.12.0a333-py3-none-manylinux_2_28_aarch64.whl' ----- package inspection summary ----- file size * compressed size: 0.4G * uncompressed size: 0.6G * compression space saving: 34.6% contents * directories: 164 * files: 1974 (2 compiled) size by extension * .so - 0.6G (97.0%) * .h - 6.7M (1.0%) * no-extension - 4.8M (0.7%) * .cuh - 3.8M (0.6%) * .hpp - 2.2M (0.3%) * .a - 1.1M (0.2%) * .inl - 0.8M (0.1%) * .cmake - 0.1M (0.0%) * .md - 8.3K (0.0%) * .py - 4.0K (0.0%) * .pc - 0.2K (0.0%) * .txt - 34.0B (0.0%) largest files * (0.6G) libcudf/lib64/libcudf.so * (3.3M) libcudf/bin/flatc * (1.0M) libcudf/lib64/libflatbuffers.a * (0.5M) libcudf/include/libcudf/rapids/libcudacxx/cuda/std/__atomic/functions/cuda_ptx_generated.h * (0.2M) libcudf_cu11-24.12.0a333.dist-info/RECORD ------------ check results ----------- 1. [distro-too-large-compressed] Compressed size 0.4G is larger than the allowed size (75.0M). errors found while checking: 1 ``` ([build link](https://github.com/rapidsai/cudf/actions/runs/11748370606/job/32732391718?pr=17284#step:13:3062)) Updated that threshold in `python/libcudf/pyproject.toml`, and saw the build succeed (but the summary still printed). 
# Authors: - James Lamb (https://github.com/jameslamb) Approvers: - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/cudf/pull/17284 --- ci/build_wheel_cudf.sh | 2 ++ ci/build_wheel_cudf_polars.sh | 1 + ci/build_wheel_dask_cudf.sh | 1 + ci/build_wheel_libcudf.sh | 2 ++ ci/build_wheel_pylibcudf.sh | 2 ++ ci/validate_wheel.sh | 21 +++++++++++++++++++++ python/cudf/pyproject.toml | 8 ++++++++ python/cudf_kafka/pyproject.toml | 8 ++++++++ python/cudf_polars/pyproject.toml | 8 ++++++++ python/custreamz/pyproject.toml | 8 ++++++++ python/dask_cudf/pyproject.toml | 8 ++++++++ python/libcudf/pyproject.toml | 8 ++++++++ python/pylibcudf/pyproject.toml | 8 ++++++++ 13 files changed, 85 insertions(+) create mode 100755 ci/validate_wheel.sh diff --git a/ci/build_wheel_cudf.sh b/ci/build_wheel_cudf.sh index ae4eb0d5c66..32dd5a7fa62 100755 --- a/ci/build_wheel_cudf.sh +++ b/ci/build_wheel_cudf.sh @@ -27,4 +27,6 @@ python -m auditwheel repair \ -w ${package_dir}/final_dist \ ${package_dir}/dist/* +./ci/validate_wheel.sh ${package_dir} final_dist + RAPIDS_PY_WHEEL_NAME="cudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-upload-wheels-to-s3 python ${package_dir}/final_dist diff --git a/ci/build_wheel_cudf_polars.sh b/ci/build_wheel_cudf_polars.sh index 79853cdbdb2..38048125247 100755 --- a/ci/build_wheel_cudf_polars.sh +++ b/ci/build_wheel_cudf_polars.sh @@ -6,6 +6,7 @@ set -euo pipefail package_dir="python/cudf_polars" ./ci/build_wheel.sh cudf-polars ${package_dir} +./ci/validate_wheel.sh ${package_dir} dist RAPIDS_PY_CUDA_SUFFIX="$(rapids-wheel-ctk-name-gen ${RAPIDS_CUDA_VERSION})" RAPIDS_PY_WHEEL_NAME="cudf_polars_${RAPIDS_PY_CUDA_SUFFIX}" RAPIDS_PY_WHEEL_PURE="1" rapids-upload-wheels-to-s3 python ${package_dir}/dist diff --git a/ci/build_wheel_dask_cudf.sh b/ci/build_wheel_dask_cudf.sh index 00c64afa2ef..b0ae2f23abc 100755 --- a/ci/build_wheel_dask_cudf.sh +++ b/ci/build_wheel_dask_cudf.sh @@ -6,6 +6,7 @@ set -euo pipefail package_dir="python/dask_cudf" 
./ci/build_wheel.sh dask-cudf ${package_dir} +./ci/validate_wheel.sh ${package_dir} dist RAPIDS_PY_CUDA_SUFFIX="$(rapids-wheel-ctk-name-gen ${RAPIDS_CUDA_VERSION})" RAPIDS_PY_WHEEL_NAME="dask_cudf_${RAPIDS_PY_CUDA_SUFFIX}" RAPIDS_PY_WHEEL_PURE="1" rapids-upload-wheels-to-s3 python ${package_dir}/dist diff --git a/ci/build_wheel_libcudf.sh b/ci/build_wheel_libcudf.sh index aabd3814a24..af49942c8cd 100755 --- a/ci/build_wheel_libcudf.sh +++ b/ci/build_wheel_libcudf.sh @@ -37,4 +37,6 @@ python -m auditwheel repair \ -w ${package_dir}/final_dist \ ${package_dir}/dist/* +./ci/validate_wheel.sh ${package_dir} final_dist + RAPIDS_PY_WHEEL_NAME="${package_name}_${RAPIDS_PY_CUDA_SUFFIX}" rapids-upload-wheels-to-s3 cpp "${package_dir}/final_dist" diff --git a/ci/build_wheel_pylibcudf.sh b/ci/build_wheel_pylibcudf.sh index c4a89f20f5f..5a8f3397714 100755 --- a/ci/build_wheel_pylibcudf.sh +++ b/ci/build_wheel_pylibcudf.sh @@ -25,4 +25,6 @@ python -m auditwheel repair \ -w ${package_dir}/final_dist \ ${package_dir}/dist/* +./ci/validate_wheel.sh ${package_dir} final_dist + RAPIDS_PY_WHEEL_NAME="pylibcudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-upload-wheels-to-s3 python ${package_dir}/final_dist diff --git a/ci/validate_wheel.sh b/ci/validate_wheel.sh new file mode 100755 index 00000000000..5910a5c59fe --- /dev/null +++ b/ci/validate_wheel.sh @@ -0,0 +1,21 @@ +#!/bin/bash +# Copyright (c) 2024, NVIDIA CORPORATION. 
+ +set -euo pipefail + +package_dir=$1 +wheel_dir_relative_path=$2 + +cd "${package_dir}" + +rapids-logger "validate packages with 'pydistcheck'" + +pydistcheck \ + --inspect \ + "$(echo ${wheel_dir_relative_path}/*.whl)" + +rapids-logger "validate packages with 'twine'" + +twine check \ + --strict \ + "$(echo ${wheel_dir_relative_path}/*.whl)" diff --git a/python/cudf/pyproject.toml b/python/cudf/pyproject.toml index ca6dbddfecc..280dd52bb22 100644 --- a/python/cudf/pyproject.toml +++ b/python/cudf/pyproject.toml @@ -83,6 +83,14 @@ cudf-pandas-tests = [ Homepage = "https://github.com/rapidsai/cudf" Documentation = "https://docs.rapids.ai/api/cudf/stable/" +[tool.pydistcheck] +select = [ + "distro-too-large-compressed", +] + +# PyPI limit is 100 MiB, fail CI before we get too close to that +max_allowed_size_compressed = '75M' + [tool.pytest.ini_options] addopts = "--tb=native --strict-config --strict-markers" empty_parameter_set_mark = "fail_at_collect" diff --git a/python/cudf_kafka/pyproject.toml b/python/cudf_kafka/pyproject.toml index ec0bc0eb22b..b2ea3f06e48 100644 --- a/python/cudf_kafka/pyproject.toml +++ b/python/cudf_kafka/pyproject.toml @@ -47,6 +47,14 @@ rapids = ["rmm", "cudf", "dask_cudf"] [tool.ruff.lint.per-file-ignores] "__init__.py" = ["E402", "F401"] +[tool.pydistcheck] +select = [ + "distro-too-large-compressed", +] + +# PyPI limit is 100 MiB, fail CI before we get too close to that +max_allowed_size_compressed = '75M' + [tool.pytest.ini_options] addopts = "--tb=native --strict-config --strict-markers" empty_parameter_set_mark = "fail_at_collect" diff --git a/python/cudf_polars/pyproject.toml b/python/cudf_polars/pyproject.toml index 2e75dff5c9e..32ea142a96c 100644 --- a/python/cudf_polars/pyproject.toml +++ b/python/cudf_polars/pyproject.toml @@ -49,6 +49,14 @@ license-files = ["LICENSE"] [tool.setuptools.dynamic] version = {file = "cudf_polars/VERSION"} +[tool.pydistcheck] +select = [ + "distro-too-large-compressed", +] + +# PyPI limit is 100 
MiB, fail CI before we get too close to that +max_allowed_size_compressed = '75M' + [tool.pytest.ini_options] addopts = "--tb=native --strict-config --strict-markers" empty_parameter_set_mark = "fail_at_collect" diff --git a/python/custreamz/pyproject.toml b/python/custreamz/pyproject.toml index d3baf3bf4d2..dd67a019c77 100644 --- a/python/custreamz/pyproject.toml +++ b/python/custreamz/pyproject.toml @@ -65,6 +65,14 @@ include = [ ] exclude = ["*tests*"] +[tool.pydistcheck] +select = [ + "distro-too-large-compressed", +] + +# PyPI limit is 100 MiB, fail CI before we get too close to that +max_allowed_size_compressed = '75M' + [tool.ruff] extend = "../../pyproject.toml" diff --git a/python/dask_cudf/pyproject.toml b/python/dask_cudf/pyproject.toml index c4bfc3054bc..07d9143db36 100644 --- a/python/dask_cudf/pyproject.toml +++ b/python/dask_cudf/pyproject.toml @@ -81,6 +81,14 @@ section-order = ["future", "standard-library", "third-party", "dask", "rapids", dask = ["dask", "distributed", "dask_cuda"] rapids = ["rmm", "cudf"] +[tool.pydistcheck] +select = [ + "distro-too-large-compressed", +] + +# PyPI limit is 100 MiB, fail CI before we get too close to that +max_allowed_size_compressed = '75M' + [tool.pytest.ini_options] addopts = "--tb=native --strict-config --strict-markers" empty_parameter_set_mark = "fail_at_collect" diff --git a/python/libcudf/pyproject.toml b/python/libcudf/pyproject.toml index 62726bb0df4..8c650eb2144 100644 --- a/python/libcudf/pyproject.toml +++ b/python/libcudf/pyproject.toml @@ -48,6 +48,14 @@ Homepage = "https://github.com/rapidsai/cudf" [project.entry-points."cmake.prefix"] libcudf = "libcudf" +[tool.pydistcheck] +select = [ + "distro-too-large-compressed", +] + +# PyPI limit is 600 MiB, fail CI before we get too close to that +max_allowed_size_compressed = '525M' + [tool.scikit-build] build-dir = "build/{wheel_tag}" cmake.build-type = "Release" diff --git a/python/pylibcudf/pyproject.toml b/python/pylibcudf/pyproject.toml index 
83ed95823da..e83db47830c 100644 --- a/python/pylibcudf/pyproject.toml +++ b/python/pylibcudf/pyproject.toml @@ -85,6 +85,14 @@ rapids = ["rmm"] [tool.ruff.lint.per-file-ignores] "__init__.py" = ["E402", "F401"] +[tool.pydistcheck] +select = [ + "distro-too-large-compressed", +] + +# PyPI limit is 100 MiB, fail CI before we get too close to that +max_allowed_size_compressed = '75M' + [tool.pytest.ini_options] # --import-mode=importlib because two test_json.py exists and tests directory is not a structured module addopts = "--tb=native --strict-config --strict-markers --import-mode=importlib" From bbaa1ab1eab41d26ca2b280b3b48a73ed3f411b9 Mon Sep 17 00:00:00 2001 From: Lawrence Mitchell Date: Tue, 12 Nov 2024 22:57:21 +0000 Subject: [PATCH 18/19] Support polars 1.13 (#17299) Polars 1.13 is out, so add support for that. I needed to change some of the logic in the callback raising after @Matt711's changes, I am not sure why tests were passing previously. Authors: - Lawrence Mitchell (https://github.com/wence-) Approvers: - Matthew Murray (https://github.com/Matt711) - GALI PREM SAGAR (https://github.com/galipremsagar) - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/cudf/pull/17299 --- ci/test_cudf_polars_polars_tests.sh | 23 +----- ci/test_wheel_cudf_polars.sh | 23 +----- .../all_cuda-118_arch-x86_64.yaml | 2 +- .../all_cuda-125_arch-x86_64.yaml | 2 +- conda/recipes/cudf-polars/meta.yaml | 2 +- dependencies.yaml | 2 +- python/cudf_polars/cudf_polars/callback.py | 75 ++++++++----------- python/cudf_polars/cudf_polars/dsl/ir.py | 3 +- .../cudf_polars/cudf_polars/dsl/nodebase.py | 4 +- .../cudf_polars/cudf_polars/testing/plugin.py | 2 +- python/cudf_polars/pyproject.toml | 2 +- python/cudf_polars/tests/test_config.py | 2 +- 12 files changed, 44 insertions(+), 98 deletions(-) diff --git a/ci/test_cudf_polars_polars_tests.sh b/ci/test_cudf_polars_polars_tests.sh index f5bcdc62604..fefe26984cb 100755 --- a/ci/test_cudf_polars_polars_tests.sh 
+++ b/ci/test_cudf_polars_polars_tests.sh @@ -3,22 +3,6 @@ set -eou pipefail -# We will only fail these tests if the PR touches code in pylibcudf -# or cudf_polars itself. -# Note, the three dots mean we are doing diff between the merge-base -# of upstream and HEAD. So this is asking, "does _this branch_ touch -# files in cudf_polars/pylibcudf", rather than "are there changes -# between upstream and this branch which touch cudf_polars/pylibcudf" -# TODO: is the target branch exposed anywhere in an environment variable? -if [ -n "$(git diff --name-only origin/branch-24.12...HEAD -- python/cudf_polars/ python/cudf/cudf/_lib/pylibcudf/)" ]; -then - HAS_CHANGES=1 - rapids-logger "PR has changes in cudf-polars/pylibcudf, test fails treated as failure" -else - HAS_CHANGES=0 - rapids-logger "PR does not have changes in cudf-polars/pylibcudf, test fails NOT treated as failure" -fi - rapids-logger "Download wheels" RAPIDS_PY_CUDA_SUFFIX="$(rapids-wheel-ctk-name-gen ${RAPIDS_CUDA_VERSION})" @@ -63,9 +47,4 @@ if [ ${EXITCODE} != 0 ]; then else rapids-logger "Running polars test suite PASSED" fi - -if [ ${HAS_CHANGES} == 1 ]; then - exit ${EXITCODE} -else - exit 0 -fi +exit ${EXITCODE} diff --git a/ci/test_wheel_cudf_polars.sh b/ci/test_wheel_cudf_polars.sh index 2884757e46b..6c827406f78 100755 --- a/ci/test_wheel_cudf_polars.sh +++ b/ci/test_wheel_cudf_polars.sh @@ -3,22 +3,6 @@ set -eou pipefail -# We will only fail these tests if the PR touches code in pylibcudf -# or cudf_polars itself. -# Note, the three dots mean we are doing diff between the merge-base -# of upstream and HEAD. So this is asking, "does _this branch_ touch -# files in cudf_polars/pylibcudf", rather than "are there changes -# between upstream and this branch which touch cudf_polars/pylibcudf" -# TODO: is the target branch exposed anywhere in an environment variable? 
-if [ -n "$(git diff --name-only origin/branch-24.12...HEAD -- python/cudf_polars/ python/pylibcudf/)" ]; -then - HAS_CHANGES=1 - rapids-logger "PR has changes in cudf-polars/pylibcudf, test fails treated as failure" -else - HAS_CHANGES=0 - rapids-logger "PR does not have changes in cudf-polars/pylibcudf, test fails NOT treated as failure" -fi - rapids-logger "Download wheels" RAPIDS_PY_CUDA_SUFFIX="$(rapids-wheel-ctk-name-gen ${RAPIDS_CUDA_VERSION})" @@ -65,9 +49,4 @@ if [ ${EXITCODE} != 0 ]; then else rapids-logger "Testing PASSED" fi - -if [ ${HAS_CHANGES} == 1 ]; then - exit ${EXITCODE} -else - exit 0 -fi +exit ${EXITCODE} diff --git a/conda/environments/all_cuda-118_arch-x86_64.yaml b/conda/environments/all_cuda-118_arch-x86_64.yaml index 01764411346..e91443ddba8 100644 --- a/conda/environments/all_cuda-118_arch-x86_64.yaml +++ b/conda/environments/all_cuda-118_arch-x86_64.yaml @@ -66,7 +66,7 @@ dependencies: - pandas - pandas>=2.0,<2.2.4dev0 - pandoc -- polars>=1.11,<1.13 +- polars>=1.11,<1.14 - pre-commit - ptxcompiler - pyarrow>=14.0.0,<19.0.0a0 diff --git a/conda/environments/all_cuda-125_arch-x86_64.yaml b/conda/environments/all_cuda-125_arch-x86_64.yaml index 9074e6332d9..2dccb595e59 100644 --- a/conda/environments/all_cuda-125_arch-x86_64.yaml +++ b/conda/environments/all_cuda-125_arch-x86_64.yaml @@ -64,7 +64,7 @@ dependencies: - pandas - pandas>=2.0,<2.2.4dev0 - pandoc -- polars>=1.11,<1.13 +- polars>=1.11,<1.14 - pre-commit - pyarrow>=14.0.0,<19.0.0a0 - pydata-sphinx-theme!=0.14.2 diff --git a/conda/recipes/cudf-polars/meta.yaml b/conda/recipes/cudf-polars/meta.yaml index edf92b930d9..7a477291e7a 100644 --- a/conda/recipes/cudf-polars/meta.yaml +++ b/conda/recipes/cudf-polars/meta.yaml @@ -43,7 +43,7 @@ requirements: run: - python - pylibcudf ={{ version }} - - polars >=1.11,<1.12 + - polars >=1.11,<1.14 - {{ pin_compatible('cuda-version', max_pin='x', min_pin='x') }} test: diff --git a/dependencies.yaml b/dependencies.yaml index 
e47e0c7523c..b5165f82d5f 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -734,7 +734,7 @@ dependencies: common: - output_types: [conda, requirements, pyproject] packages: - - polars>=1.11,<1.13 + - polars>=1.11,<1.14 run_dask_cudf: common: - output_types: [conda, requirements, pyproject] diff --git a/python/cudf_polars/cudf_polars/callback.py b/python/cudf_polars/cudf_polars/callback.py index ff4933c7564..d085f21e0ad 100644 --- a/python/cudf_polars/cudf_polars/callback.py +++ b/python/cudf_polars/cudf_polars/callback.py @@ -148,12 +148,7 @@ def _callback( return ir.evaluate(cache={}).to_polars() -def execute_with_cudf( - nt: NodeTraverser, - *, - config: GPUEngine, - exception: type[Exception] | tuple[type[Exception], ...] = Exception, -) -> None: +def execute_with_cudf(nt: NodeTraverser, *, config: GPUEngine) -> None: """ A post optimization callback that attempts to execute the plan with cudf. @@ -165,10 +160,15 @@ def execute_with_cudf( config GPUEngine configuration object - exception - Optional exception, or tuple of exceptions, to catch during - translation. Defaults to ``Exception``. + Raises + ------ + ValueError + If the config contains unsupported keys. + NotImplementedError + If translation of the plan is unsupported. + Notes + ----- The NodeTraverser is mutated if the libcudf executor can handle the plan. """ device = config.device @@ -178,38 +178,27 @@ def execute_with_cudf( raise ValueError( f"Engine configuration contains unsupported settings {unsupported}" ) - try: - with nvtx.annotate(message="ConvertIR", domain="cudf_polars"): - translator = Translator(nt) - ir = translator.translate_ir() - ir_translation_errors = translator.errors - if len(ir_translation_errors): - # TODO: Display these errors in user-friendly way. 
- # tracked in https://github.com/rapidsai/cudf/issues/17051 - unique_errors = sorted(set(ir_translation_errors), key=str) - error_message = "Query contained unsupported operations" - verbose_error_message = ( - f"{error_message}\nThe errors were:\n{unique_errors}" - ) - unsupported_ops_exception = NotImplementedError( - error_message, unique_errors - ) - if bool(int(os.environ.get("POLARS_VERBOSE", 0))): - warnings.warn(verbose_error_message, UserWarning, stacklevel=2) - if raise_on_fail: - raise unsupported_ops_exception - else: - nt.set_udf( - partial( - _callback, ir, device=device, memory_resource=memory_resource - ) - ) - except exception as e: - if bool(int(os.environ.get("POLARS_VERBOSE", 0))): - warnings.warn( - f"Query execution with GPU not supported, reason: {type(e)}: {e}", - PerformanceWarning, - stacklevel=2, + with nvtx.annotate(message="ConvertIR", domain="cudf_polars"): + translator = Translator(nt) + ir = translator.translate_ir() + ir_translation_errors = translator.errors + if len(ir_translation_errors): + # TODO: Display these errors in user-friendly way. + # tracked in https://github.com/rapidsai/cudf/issues/17051 + unique_errors = sorted(set(ir_translation_errors), key=str) + formatted_errors = "\n".join( + f"- {e.__class__.__name__}: {e}" for e in unique_errors + ) + error_message = ( + "Query execution with GPU not possible: unsupported operations." 
+ f"\nThe errors were:\n{formatted_errors}" + ) + exception = NotImplementedError(error_message, unique_errors) + if bool(int(os.environ.get("POLARS_VERBOSE", 0))): + warnings.warn(error_message, PerformanceWarning, stacklevel=2) + if raise_on_fail: + raise exception + else: + nt.set_udf( + partial(_callback, ir, device=device, memory_resource=memory_resource) ) - if raise_on_fail: - raise diff --git a/python/cudf_polars/cudf_polars/dsl/ir.py b/python/cudf_polars/cudf_polars/dsl/ir.py index 1f935190f28..98e8a83b04e 100644 --- a/python/cudf_polars/cudf_polars/dsl/ir.py +++ b/python/cudf_polars/cudf_polars/dsl/ir.py @@ -227,6 +227,7 @@ class ErrorNode(IR): def __init__(self, schema: Schema, error: str): self.schema = schema self.error = error + self.children = () class PythonScan(IR): @@ -546,7 +547,7 @@ def do_evaluate( # shifts the row index. # But prior to 1.13, polars had this wrong, so we match behaviour # https://github.com/pola-rs/polars/issues/19607 - offset += skip_rows # pragma: no cover; polars 1.13 not yet released + offset += skip_rows dtype = schema[name] step = plc.interop.from_arrow( pa.scalar(1, type=plc.interop.to_arrow(dtype)) diff --git a/python/cudf_polars/cudf_polars/dsl/nodebase.py b/python/cudf_polars/cudf_polars/dsl/nodebase.py index 228d300f467..dd5c40a00be 100644 --- a/python/cudf_polars/cudf_polars/dsl/nodebase.py +++ b/python/cudf_polars/cudf_polars/dsl/nodebase.py @@ -43,9 +43,7 @@ class Node(Generic[T]): def _ctor_arguments(self, children: Sequence[T]) -> Sequence[Any | T]: return (*(getattr(self, attr) for attr in self._non_child), *children) - def reconstruct( - self, children: Sequence[T] - ) -> Self: # pragma: no cover; not yet used + def reconstruct(self, children: Sequence[T]) -> Self: """ Rebuild this node with new children. 
diff --git a/python/cudf_polars/cudf_polars/testing/plugin.py b/python/cudf_polars/cudf_polars/testing/plugin.py index 2f95cd38c57..080a1af6e19 100644 --- a/python/cudf_polars/cudf_polars/testing/plugin.py +++ b/python/cudf_polars/cudf_polars/testing/plugin.py @@ -40,7 +40,7 @@ def pytest_configure(config: pytest.Config) -> None: ) config.addinivalue_line( "filterwarnings", - "ignore:.*Query execution with GPU not supported", + "ignore:.*Query execution with GPU not possible", ) diff --git a/python/cudf_polars/pyproject.toml b/python/cudf_polars/pyproject.toml index 32ea142a96c..785e87391e7 100644 --- a/python/cudf_polars/pyproject.toml +++ b/python/cudf_polars/pyproject.toml @@ -19,7 +19,7 @@ authors = [ license = { text = "Apache 2.0" } requires-python = ">=3.10" dependencies = [ - "polars>=1.11,<1.13", + "polars>=1.11,<1.14", "pylibcudf==24.12.*,>=0.0.0a0", ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. classifiers = [ diff --git a/python/cudf_polars/tests/test_config.py b/python/cudf_polars/tests/test_config.py index 9900f598e5f..25b71716eed 100644 --- a/python/cudf_polars/tests/test_config.py +++ b/python/cudf_polars/tests/test_config.py @@ -30,7 +30,7 @@ def raise_unimplemented(self, *args): pytest.raises(pl.exceptions.ComputeError), pytest.warns( pl.exceptions.PerformanceWarning, - match="Query execution with GPU not supported", + match="Query execution with GPU not possible", ), ): # And ensure that collecting issues the correct warning. From 487f97c036ae7919e98ddc8bf5412a8002a493c5 Mon Sep 17 00:00:00 2001 From: Vukasin Milovanovic Date: Tue, 12 Nov 2024 15:20:58 -0800 Subject: [PATCH 19/19] Always prefer `device_read`s and `device_write`s when kvikIO is enabled (#17260) Issue #17259 Avoid checking the `_gds_read_preferred_threshold` threshold when deciding whether `device_read`/`device_write` is preferred to host IO + copy. The reasons are twofold: 1.
KvikIO already has an internal threshold for GDS use so we don't need to check on our end as well. 2. Without actual GDS use, kvikIO uses a pinned bounce buffer to efficiently copy to/from the device. Authors: - Vukasin Milovanovic (https://github.com/vuule) Approvers: - Tianyu Liu (https://github.com/kingcrimsontianyu) - Basit Ayantunde (https://github.com/lamarrr) URL: https://github.com/rapidsai/cudf/pull/17260 --- cpp/src/io/utilities/data_sink.cpp | 8 ++++++-- cpp/src/io/utilities/datasource.cpp | 8 ++++++-- 2 files changed, 12 insertions(+), 4 deletions(-) diff --git a/cpp/src/io/utilities/data_sink.cpp b/cpp/src/io/utilities/data_sink.cpp index 15de5d85614..68377ad6d5f 100644 --- a/cpp/src/io/utilities/data_sink.cpp +++ b/cpp/src/io/utilities/data_sink.cpp @@ -72,8 +72,12 @@ class file_sink : public data_sink { [[nodiscard]] bool is_device_write_preferred(size_t size) const override { - if (size < _gds_write_preferred_threshold) { return false; } - return supports_device_write(); + if (!supports_device_write()) { return false; } + + // Always prefer device writes if kvikio is enabled + if (!_kvikio_file.closed()) { return true; } + + return size >= _gds_write_preferred_threshold; } std::future device_write_async(void const* gpu_data, diff --git a/cpp/src/io/utilities/datasource.cpp b/cpp/src/io/utilities/datasource.cpp index 5ccc91e4220..0870e4a84a7 100644 --- a/cpp/src/io/utilities/datasource.cpp +++ b/cpp/src/io/utilities/datasource.cpp @@ -95,8 +95,12 @@ class file_source : public datasource { [[nodiscard]] bool is_device_read_preferred(size_t size) const override { - if (size < _gds_read_preferred_threshold) { return false; } - return supports_device_read(); + if (!supports_device_read()) { return false; } + + // Always prefer device reads if kvikio is enabled + if (!_kvikio_file.closed()) { return true; } + + return size >= _gds_read_preferred_threshold; } std::future device_read_async(size_t offset,