Skip to content

Commit

Permalink
Merge branch 'branch-24.12' into catboost-integration-tests
Browse files Browse the repository at this point in the history
  • Loading branch information
Matt711 authored Nov 13, 2024
2 parents dbd6d90 + 487f97c commit b173250
Show file tree
Hide file tree
Showing 316 changed files with 6,237 additions and 1,208 deletions.
4 changes: 2 additions & 2 deletions .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -90,8 +90,8 @@ repos:
entry: |
# Check for usage of default_rng without seeding
default_rng\(\)|
# Check for usage of np.random.seed
np.random.seed\(
# Check for usage of np.random.seed (NPY002 only disallows this being called)
np.random.seed
language: pygrep
types: [python]
- id: cmake-format
Expand Down
2 changes: 2 additions & 0 deletions ci/build_wheel_cudf.sh
Original file line number Diff line number Diff line change
Expand Up @@ -27,4 +27,6 @@ python -m auditwheel repair \
-w ${package_dir}/final_dist \
${package_dir}/dist/*

./ci/validate_wheel.sh ${package_dir} final_dist

RAPIDS_PY_WHEEL_NAME="cudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-upload-wheels-to-s3 python ${package_dir}/final_dist
1 change: 1 addition & 0 deletions ci/build_wheel_cudf_polars.sh
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ set -euo pipefail
package_dir="python/cudf_polars"

./ci/build_wheel.sh cudf-polars ${package_dir}
./ci/validate_wheel.sh ${package_dir} dist

RAPIDS_PY_CUDA_SUFFIX="$(rapids-wheel-ctk-name-gen ${RAPIDS_CUDA_VERSION})"
RAPIDS_PY_WHEEL_NAME="cudf_polars_${RAPIDS_PY_CUDA_SUFFIX}" RAPIDS_PY_WHEEL_PURE="1" rapids-upload-wheels-to-s3 python ${package_dir}/dist
1 change: 1 addition & 0 deletions ci/build_wheel_dask_cudf.sh
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ set -euo pipefail
package_dir="python/dask_cudf"

./ci/build_wheel.sh dask-cudf ${package_dir}
./ci/validate_wheel.sh ${package_dir} dist

RAPIDS_PY_CUDA_SUFFIX="$(rapids-wheel-ctk-name-gen ${RAPIDS_CUDA_VERSION})"
RAPIDS_PY_WHEEL_NAME="dask_cudf_${RAPIDS_PY_CUDA_SUFFIX}" RAPIDS_PY_WHEEL_PURE="1" rapids-upload-wheels-to-s3 python ${package_dir}/dist
2 changes: 2 additions & 0 deletions ci/build_wheel_libcudf.sh
Original file line number Diff line number Diff line change
Expand Up @@ -37,4 +37,6 @@ python -m auditwheel repair \
-w ${package_dir}/final_dist \
${package_dir}/dist/*

./ci/validate_wheel.sh ${package_dir} final_dist

RAPIDS_PY_WHEEL_NAME="${package_name}_${RAPIDS_PY_CUDA_SUFFIX}" rapids-upload-wheels-to-s3 cpp "${package_dir}/final_dist"
2 changes: 2 additions & 0 deletions ci/build_wheel_pylibcudf.sh
Original file line number Diff line number Diff line change
Expand Up @@ -25,4 +25,6 @@ python -m auditwheel repair \
-w ${package_dir}/final_dist \
${package_dir}/dist/*

./ci/validate_wheel.sh ${package_dir} final_dist

RAPIDS_PY_WHEEL_NAME="pylibcudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-upload-wheels-to-s3 python ${package_dir}/final_dist
23 changes: 1 addition & 22 deletions ci/test_cudf_polars_polars_tests.sh
Original file line number Diff line number Diff line change
Expand Up @@ -3,22 +3,6 @@

set -eou pipefail

# We will only fail these tests if the PR touches code in pylibcudf
# or cudf_polars itself.
# Note, the three dots mean we are doing diff between the merge-base
# of upstream and HEAD. So this is asking, "does _this branch_ touch
# files in cudf_polars/pylibcudf", rather than "are there changes
# between upstream and this branch which touch cudf_polars/pylibcudf"
# TODO: is the target branch exposed anywhere in an environment variable?
if [ -n "$(git diff --name-only origin/branch-24.12...HEAD -- python/cudf_polars/ python/cudf/cudf/_lib/pylibcudf/)" ];
then
HAS_CHANGES=1
rapids-logger "PR has changes in cudf-polars/pylibcudf, test fails treated as failure"
else
HAS_CHANGES=0
rapids-logger "PR does not have changes in cudf-polars/pylibcudf, test fails NOT treated as failure"
fi

rapids-logger "Download wheels"

RAPIDS_PY_CUDA_SUFFIX="$(rapids-wheel-ctk-name-gen ${RAPIDS_CUDA_VERSION})"
Expand Down Expand Up @@ -63,9 +47,4 @@ if [ ${EXITCODE} != 0 ]; then
else
rapids-logger "Running polars test suite PASSED"
fi

if [ ${HAS_CHANGES} == 1 ]; then
exit ${EXITCODE}
else
exit 0
fi
exit ${EXITCODE}
23 changes: 1 addition & 22 deletions ci/test_wheel_cudf_polars.sh
Original file line number Diff line number Diff line change
Expand Up @@ -3,22 +3,6 @@

set -eou pipefail

# We will only fail these tests if the PR touches code in pylibcudf
# or cudf_polars itself.
# Note, the three dots mean we are doing diff between the merge-base
# of upstream and HEAD. So this is asking, "does _this branch_ touch
# files in cudf_polars/pylibcudf", rather than "are there changes
# between upstream and this branch which touch cudf_polars/pylibcudf"
# TODO: is the target branch exposed anywhere in an environment variable?
if [ -n "$(git diff --name-only origin/branch-24.12...HEAD -- python/cudf_polars/ python/pylibcudf/)" ];
then
HAS_CHANGES=1
rapids-logger "PR has changes in cudf-polars/pylibcudf, test fails treated as failure"
else
HAS_CHANGES=0
rapids-logger "PR does not have changes in cudf-polars/pylibcudf, test fails NOT treated as failure"
fi

rapids-logger "Download wheels"

RAPIDS_PY_CUDA_SUFFIX="$(rapids-wheel-ctk-name-gen ${RAPIDS_CUDA_VERSION})"
Expand Down Expand Up @@ -65,9 +49,4 @@ if [ ${EXITCODE} != 0 ]; then
else
rapids-logger "Testing PASSED"
fi

if [ ${HAS_CHANGES} == 1 ]; then
exit ${EXITCODE}
else
exit 0
fi
exit ${EXITCODE}
21 changes: 21 additions & 0 deletions ci/validate_wheel.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
#!/bin/bash
# Copyright (c) 2024, NVIDIA CORPORATION.

set -euo pipefail

package_dir=$1
wheel_dir_relative_path=$2

cd "${package_dir}"

rapids-logger "validate packages with 'pydistcheck'"

pydistcheck \
--inspect \
"$(echo ${wheel_dir_relative_path}/*.whl)"

rapids-logger "validate packages with 'twine'"

twine check \
--strict \
"$(echo ${wheel_dir_relative_path}/*.whl)"
4 changes: 2 additions & 2 deletions conda/environments/all_cuda-118_arch-x86_64.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,7 @@ dependencies:
- nbsphinx
- ninja
- notebook
- numba-cuda>=0.0.13
- numba-cuda>=0.0.13,<0.0.18
- numpy>=1.23,<3.0a0
- numpydoc
- nvcc_linux-64=11.8
Expand All @@ -66,7 +66,7 @@ dependencies:
- pandas
- pandas>=2.0,<2.2.4dev0
- pandoc
- polars>=1.11,<1.13
- polars>=1.11,<1.14
- pre-commit
- ptxcompiler
- pyarrow>=14.0.0,<19.0.0a0
Expand Down
4 changes: 2 additions & 2 deletions conda/environments/all_cuda-125_arch-x86_64.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,7 @@ dependencies:
- nbsphinx
- ninja
- notebook
- numba-cuda>=0.0.13
- numba-cuda>=0.0.13,<0.0.18
- numpy>=1.23,<3.0a0
- numpydoc
- nvcomp==4.1.0.6
Expand All @@ -64,7 +64,7 @@ dependencies:
- pandas
- pandas>=2.0,<2.2.4dev0
- pandoc
- polars>=1.11,<1.13
- polars>=1.11,<1.14
- pre-commit
- pyarrow>=14.0.0,<19.0.0a0
- pydata-sphinx-theme!=0.14.2
Expand Down
2 changes: 1 addition & 1 deletion conda/recipes/cudf-polars/meta.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@ requirements:
run:
- python
- pylibcudf ={{ version }}
- polars >=1.11,<1.12
- polars >=1.11,<1.14
- {{ pin_compatible('cuda-version', max_pin='x', min_pin='x') }}

test:
Expand Down
2 changes: 1 addition & 1 deletion conda/recipes/cudf/meta.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -80,7 +80,7 @@ requirements:
- typing_extensions >=4.0.0
- pandas >=2.0,<2.2.4dev0
- cupy >=12.0.0
- numba-cuda >=0.0.13
- numba-cuda >=0.0.13,<0.0.18
- numpy >=1.23,<3.0a0
- pyarrow>=14.0.0,<18.0.0a0
- libcudf ={{ version }}
Expand Down
18 changes: 17 additions & 1 deletion cpp/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -90,6 +90,12 @@ option(
mark_as_advanced(CUDF_BUILD_STREAMS_TEST_UTIL)
option(CUDF_STATIC_LINTERS "Enable static linters during compilation" OFF)

option(
CUDF_KVIKIO_REMOTE_IO
"Enable remote IO (e.g. AWS S3) support through KvikIO. If disabled, cudf-python will still be able to do remote IO through fsspec."
ON
)

message(VERBOSE "CUDF: Build with NVTX support: ${USE_NVTX}")
message(VERBOSE "CUDF: Configure CMake to build tests: ${BUILD_TESTS}")
message(VERBOSE "CUDF: Configure CMake to build (google & nvbench) benchmarks: ${BUILD_BENCHMARKS}")
Expand All @@ -109,6 +115,9 @@ message(
"CUDF: Enable the -lineinfo option for nvcc (useful for cuda-memcheck / profiler): ${CUDA_ENABLE_LINEINFO}"
)
message(VERBOSE "CUDF: Statically link the CUDA runtime: ${CUDA_STATIC_RUNTIME}")
message(VERBOSE
"CUDF: Build with remote IO (e.g. AWS S3) support through KvikIO: ${CUDF_KVIKIO_REMOTE_IO}"
)

# Set a default build type if none was specified
rapids_cmake_build_type("Release")
Expand Down Expand Up @@ -394,11 +403,14 @@ add_library(
src/filling/repeat.cu
src/filling/sequence.cu
src/groupby/groupby.cu
src/groupby/hash/compute_aggregations.cu
src/groupby/hash/compute_aggregations_null.cu
src/groupby/hash/compute_global_memory_aggs.cu
src/groupby/hash/compute_global_memory_aggs_null.cu
src/groupby/hash/compute_groupby.cu
src/groupby/hash/compute_mapping_indices.cu
src/groupby/hash/compute_mapping_indices_null.cu
src/groupby/hash/compute_shared_memory_aggs.cu
src/groupby/hash/compute_single_pass_aggs.cu
src/groupby/hash/create_sparse_results_table.cu
src/groupby/hash/flatten_single_pass_aggs.cpp
src/groupby/hash/groupby.cu
Expand Down Expand Up @@ -693,6 +705,7 @@ add_library(
src/strings/replace/replace_slice.cu
src/strings/reverse.cu
src/strings/scan/scan_inclusive.cu
src/strings/search/contains_multiple.cu
src/strings/search/findall.cu
src/strings/search/find.cu
src/strings/search/find_multiple.cu
Expand Down Expand Up @@ -887,6 +900,9 @@ target_compile_definitions(cudf PRIVATE "RMM_LOGGING_LEVEL=LIBCUDF_LOGGING_LEVEL
# Define spdlog level
target_compile_definitions(cudf PUBLIC "SPDLOG_ACTIVE_LEVEL=SPDLOG_LEVEL_${LIBCUDF_LOGGING_LEVEL}")

# Enable remote IO through KvikIO
target_compile_definitions(cudf PRIVATE $<$<BOOL:${CUDF_KVIKIO_REMOTE_IO}>:CUDF_KVIKIO_REMOTE_IO>)

# Compile stringified JIT sources first
add_dependencies(cudf jitify_preprocess_run)

Expand Down
5 changes: 3 additions & 2 deletions cpp/benchmarks/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -348,8 +348,8 @@ ConfigureNVBench(BINARYOP_NVBENCH binaryop/binaryop.cpp binaryop/compiled_binary
ConfigureBench(TEXT_BENCH text/subword.cpp)

ConfigureNVBench(
TEXT_NVBENCH text/edit_distance.cpp text/hash_ngrams.cpp text/jaccard.cpp text/ngrams.cpp
text/normalize.cpp text/replace.cpp text/tokenize.cpp text/vocab.cpp
TEXT_NVBENCH text/edit_distance.cpp text/hash_ngrams.cpp text/jaccard.cpp text/minhash.cpp
text/ngrams.cpp text/normalize.cpp text/replace.cpp text/tokenize.cpp text/vocab.cpp
)

# ##################################################################################################
Expand All @@ -375,6 +375,7 @@ ConfigureNVBench(
string/count.cpp
string/extract.cpp
string/find.cpp
string/find_multiple.cpp
string/join_strings.cpp
string/lengths.cpp
string/like.cpp
Expand Down
14 changes: 3 additions & 11 deletions cpp/benchmarks/string/find.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -20,9 +20,7 @@
#include <cudf_test/column_wrapper.hpp>

#include <cudf/scalar/scalar.hpp>
#include <cudf/strings/combine.hpp>
#include <cudf/strings/find.hpp>
#include <cudf/strings/find_multiple.hpp>
#include <cudf/strings/strings_column_view.hpp>
#include <cudf/utilities/default_stream.hpp>

Expand All @@ -44,15 +42,13 @@ static void bench_find_string(nvbench::state& state)
auto const col = create_string_column(n_rows, row_width, hit_rate);
auto const input = cudf::strings_column_view(col->view());

std::vector<std::string> h_targets({"5W", "5W43", "0987 5W43"});
cudf::string_scalar target(h_targets[2]);
cudf::test::strings_column_wrapper targets(h_targets.begin(), h_targets.end());
cudf::string_scalar target("0987 5W43");

state.set_cuda_stream(nvbench::make_cuda_stream_view(stream.value()));
auto const chars_size = input.chars_size(stream);
state.add_element_count(chars_size, "chars_size");
state.add_global_memory_reads<nvbench::int8_t>(chars_size);
if (api.substr(0, 4) == "find") {
if (api == "find") {
state.add_global_memory_writes<nvbench::int32_t>(input.size());
} else {
state.add_global_memory_writes<nvbench::int8_t>(input.size());
Expand All @@ -61,10 +57,6 @@ static void bench_find_string(nvbench::state& state)
if (api == "find") {
state.exec(nvbench::exec_tag::sync,
[&](nvbench::launch& launch) { cudf::strings::find(input, target); });
} else if (api == "find_multi") {
state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) {
cudf::strings::find_multiple(input, cudf::strings_column_view(targets));
});
} else if (api == "contains") {
state.exec(nvbench::exec_tag::sync,
[&](nvbench::launch& launch) { cudf::strings::contains(input, target); });
Expand All @@ -79,7 +71,7 @@ static void bench_find_string(nvbench::state& state)

NVBENCH_BENCH(bench_find_string)
.set_name("find_string")
.add_string_axis("api", {"find", "find_multi", "contains", "starts_with", "ends_with"})
.add_string_axis("api", {"find", "contains", "starts_with", "ends_with"})
.add_int64_axis("row_width", {32, 64, 128, 256, 512, 1024})
.add_int64_axis("num_rows", {260'000, 1'953'000, 16'777'216})
.add_int64_axis("hit_rate", {20, 80}); // percentage
77 changes: 77 additions & 0 deletions cpp/benchmarks/string/find_multiple.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,77 @@
/*
* Copyright (c) 2024, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#include <benchmarks/common/generate_input.hpp>
#include <benchmarks/fixture/benchmark_fixture.hpp>

#include <cudf_test/column_wrapper.hpp>

#include <cudf/strings/find.hpp>
#include <cudf/strings/find_multiple.hpp>
#include <cudf/strings/strings_column_view.hpp>
#include <cudf/utilities/default_stream.hpp>

#include <nvbench/nvbench.cuh>

static void bench_find_string(nvbench::state& state)
{
auto const n_rows = static_cast<cudf::size_type>(state.get_int64("num_rows"));
auto const row_width = static_cast<cudf::size_type>(state.get_int64("row_width"));
auto const hit_rate = static_cast<cudf::size_type>(state.get_int64("hit_rate"));
auto const target_count = static_cast<cudf::size_type>(state.get_int64("targets"));
auto const api = state.get_string("api");

auto const stream = cudf::get_default_stream();
auto const col = create_string_column(n_rows, row_width, hit_rate);
auto const input = cudf::strings_column_view(col->view());

// Note that these all match the first row of the raw_data in create_string_column.
// This is so the hit_rate can properly accounted for.
std::vector<std::string> const target_data(
{" abc", "W43", "0987 5W43", "123 abc", "23 abc", "3 abc", "7 5W43", "87 5W43", "987 5W43"});
auto h_targets = std::vector<std::string>{};
for (cudf::size_type i = 0; i < target_count; i++) {
h_targets.emplace_back(target_data[i % target_data.size()]);
}
cudf::test::strings_column_wrapper targets(h_targets.begin(), h_targets.end());

state.set_cuda_stream(nvbench::make_cuda_stream_view(stream.value()));
auto const chars_size = input.chars_size(stream);
state.add_global_memory_reads<nvbench::int8_t>(chars_size);
if (api == "find") {
state.add_global_memory_writes<nvbench::int32_t>(input.size());
} else {
state.add_global_memory_writes<nvbench::int8_t>(input.size());
}

if (api == "find") {
state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) {
cudf::strings::find_multiple(input, cudf::strings_column_view(targets));
});
} else if (api == "contains") {
state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) {
cudf::strings::contains_multiple(input, cudf::strings_column_view(targets));
});
}
}

NVBENCH_BENCH(bench_find_string)
.set_name("find_multiple")
.add_string_axis("api", {"find", "contains"})
.add_int64_axis("targets", {10, 20, 40})
.add_int64_axis("row_width", {32, 64, 128, 256})
.add_int64_axis("num_rows", {32768, 262144, 2097152})
.add_int64_axis("hit_rate", {20, 80}); // percentage
Loading

0 comments on commit b173250

Please sign in to comment.