Skip to content

Commit

Permalink
Added strings AST vs BINARY_OP benchmarks (#17128)
Browse files Browse the repository at this point in the history
This merge request adds benchmarks that compare string comparisons evaluated through the AST API against the equivalent BINARY_OP calls. It also moves the common string input generator function into the shared benchmarks header, since it is repeated across other benchmarks.

Authors:
  - Basit Ayantunde (https://github.com/lamarrr)

Approvers:
  - David Wendt (https://github.com/davidwendt)
  - Yunsong Wang (https://github.com/PointKernel)

URL: #17128
  • Loading branch information
lamarrr authored Oct 28, 2024
1 parent abecd0b commit 4c04b7c
Show file tree
Hide file tree
Showing 8 changed files with 246 additions and 123 deletions.
95 changes: 94 additions & 1 deletion cpp/benchmarks/ast/transform.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -16,16 +16,29 @@

#include <benchmarks/common/generate_input.hpp>

#include <cudf_test/column_wrapper.hpp>

#include <cudf/ast/expressions.hpp>
#include <cudf/column/column.hpp>
#include <cudf/strings/strings_column_view.hpp>
#include <cudf/table/table.hpp>
#include <cudf/transform.hpp>
#include <cudf/types.hpp>
#include <cudf/utilities/default_stream.hpp>
#include <cudf/utilities/error.hpp>

#include <rmm/cuda_stream_view.hpp>

#include <thrust/iterator/counting_iterator.h>

#include <nvbench/nvbench.cuh>
#include <nvbench/types.cuh>

#include <algorithm>
#include <cstddef>
#include <cstdint>
#include <cstdio>
#include <iterator>
#include <list>
#include <memory>
#include <optional>
Expand Down Expand Up @@ -86,7 +99,71 @@ static void BM_ast_transform(nvbench::state& state)
auto const& expression_tree_root = expressions.back();

// Use the number of bytes read from global memory
state.add_global_memory_reads<key_type>(table_size * (tree_levels + 1));
state.add_global_memory_reads<key_type>(static_cast<size_t>(table_size) * (tree_levels + 1));
state.add_global_memory_writes<key_type>(table_size);

state.exec(nvbench::exec_tag::sync,
[&](nvbench::launch&) { cudf::compute_column(table, expression_tree_root); });
}

template <cudf::ast::ast_operator cmp_op, cudf::ast::ast_operator reduce_op>
static void BM_string_compare_ast_transform(nvbench::state& state)
{
auto const string_width = static_cast<cudf::size_type>(state.get_int64("string_width"));
auto const num_rows = static_cast<cudf::size_type>(state.get_int64("num_rows"));
auto const num_comparisons = static_cast<cudf::size_type>(state.get_int64("num_comparisons"));
auto const hit_rate = static_cast<cudf::size_type>(state.get_int64("hit_rate"));

CUDF_EXPECTS(num_comparisons > 0, "benchmarks require 1 or more comparisons");

// Create table data
auto const num_cols = num_comparisons * 2;
std::vector<std::unique_ptr<cudf::column>> columns;
std::for_each(
thrust::make_counting_iterator(0), thrust::make_counting_iterator(num_cols), [&](size_t) {
columns.emplace_back(create_string_column(num_rows, string_width, hit_rate));
});

cudf::table table{std::move(columns)};
cudf::table_view const table_view = table.view();

int64_t const chars_size = std::accumulate(
table_view.begin(),
table_view.end(),
static_cast<int64_t>(0),
[](int64_t size, auto& column) -> int64_t {
return size + cudf::strings_column_view{column}.chars_size(cudf::get_default_stream());
});

// Create column references
auto column_refs = std::vector<cudf::ast::column_reference>();
std::transform(thrust::make_counting_iterator(0),
thrust::make_counting_iterator(num_cols),
std::back_inserter(column_refs),
[](auto const& column_id) { return cudf::ast::column_reference(column_id); });

// Create expression trees
std::list<cudf::ast::operation> expressions;

// Construct AST tree (a == b && c == d && e == f && ...)

expressions.emplace_back(cudf::ast::operation(cmp_op, column_refs[0], column_refs[1]));

std::for_each(thrust::make_counting_iterator(1),
thrust::make_counting_iterator(num_comparisons),
[&](size_t idx) {
auto const& lhs = expressions.back();
auto const& rhs = expressions.emplace_back(
cudf::ast::operation(cmp_op, column_refs[idx * 2], column_refs[idx * 2 + 1]));
expressions.emplace_back(cudf::ast::operation(reduce_op, lhs, rhs));
});

auto const& expression_tree_root = expressions.back();

// Use the number of bytes read from global memory
state.add_element_count(chars_size, "chars_size");
state.add_global_memory_reads<nvbench::uint8_t>(chars_size);
state.add_global_memory_writes<nvbench::int32_t>(num_rows);

state.exec(nvbench::exec_tag::sync,
[&](nvbench::launch&) { cudf::compute_column(table, expression_tree_root); });
Expand Down Expand Up @@ -115,3 +192,19 @@ AST_TRANSFORM_BENCHMARK_DEFINE(
ast_int32_imbalanced_reuse_nulls, int32_t, TreeType::IMBALANCED_LEFT, true, true);
AST_TRANSFORM_BENCHMARK_DEFINE(
ast_double_imbalanced_unique_nulls, double, TreeType::IMBALANCED_LEFT, false, true);

// Declares and registers one string-comparison AST benchmark.
//
// Expands to a static function `name` that forwards the nvbench state to
// BM_string_compare_ast_transform<cmp_op, reduce_op>, then registers that function with
// NVBENCH_BENCH under the same name and attaches the four int64 axes the benchmark body
// reads: "string_width", "num_rows", "num_comparisons" and "hit_rate".
//
// name      - identifier of the generated function; also used (stringified) as the benchmark name
// cmp_op    - cudf::ast::ast_operator passed as the first template argument
// reduce_op - cudf::ast::ast_operator passed as the second template argument
//
// The expansion deliberately ends without a semicolon so the use site supplies it.
#define AST_STRING_COMPARE_TRANSFORM_BENCHMARK_DEFINE(name, cmp_op, reduce_op) \
static void name(::nvbench::state& st) \
{ \
::BM_string_compare_ast_transform<cmp_op, reduce_op>(st); \
} \
NVBENCH_BENCH(name) \
.set_name(#name) \
.add_int64_axis("string_width", {32, 64, 128, 256}) \
.add_int64_axis("num_rows", {32768, 262144, 2097152}) \
.add_int64_axis("num_comparisons", {1, 2, 3, 4}) \
.add_int64_axis("hit_rate", {50, 100})

// String equality comparisons whose results are combined with logical AND.
AST_STRING_COMPARE_TRANSFORM_BENCHMARK_DEFINE(ast_string_equal_logical_and,
cudf::ast::ast_operator::EQUAL,
cudf::ast::ast_operator::LOGICAL_AND);
82 changes: 80 additions & 2 deletions cpp/benchmarks/binaryop/binaryop.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -17,12 +17,18 @@
#include <benchmarks/common/generate_input.hpp>

#include <cudf/binaryop.hpp>
#include <cudf/column/column.hpp>
#include <cudf/strings/strings_column_view.hpp>
#include <cudf/table/table_view.hpp>
#include <cudf/types.hpp>

#include <thrust/iterator/counting_iterator.h>

#include <nvbench/nvbench.cuh>

#include <algorithm>
#include <cstddef>
#include <memory>

// This set of benchmarks is designed to be a comparison for the AST benchmarks

Expand All @@ -44,7 +50,8 @@ static void BM_binaryop_transform(nvbench::state& state)
cudf::table_view table{*source_table};

// Use the number of bytes read from global memory
state.add_global_memory_reads<key_type>(table_size * (tree_levels + 1));
state.add_global_memory_reads<key_type>(static_cast<size_t>(table_size) * (tree_levels + 1));
state.add_global_memory_writes<key_type>(table_size);

state.exec(nvbench::exec_tag::sync, [&](nvbench::launch&) {
// Execute tree that chains additions like (((a + b) + c) + d)
Expand All @@ -64,11 +71,65 @@ static void BM_binaryop_transform(nvbench::state& state)
});
}

/**
 * @brief Benchmarks a chain of string comparisons evaluated as individual binary operations.
 *
 * Builds `num_comparisons * 2` string columns with `create_string_column`, compares them
 * pairwise with `cmp_op` and folds the per-pair results together with `reduce_op`, i.e.
 * `(a cmp b) reduce (c cmp d) reduce ...`.
 *
 * Axes read from `state`: "string_width", "num_rows", "num_comparisons", "hit_rate".
 *
 * @tparam cmp_op    Binary operator applied to each pair of string columns
 * @tparam reduce_op Binary operator used to combine the comparison results
 * @param state      nvbench state supplying the axis values and running the measurement
 */
template <cudf::binary_operator cmp_op, cudf::binary_operator reduce_op>
static void BM_string_compare_binaryop_transform(nvbench::state& state)
{
  // Benchmark axes
  auto const string_width    = static_cast<cudf::size_type>(state.get_int64("string_width"));
  auto const num_rows        = static_cast<cudf::size_type>(state.get_int64("num_rows"));
  auto const num_comparisons = static_cast<cudf::size_type>(state.get_int64("num_comparisons"));
  auto const hit_rate        = static_cast<cudf::size_type>(state.get_int64("hit_rate"));

  CUDF_EXPECTS(num_comparisons > 0, "benchmarks require 1 or more comparisons");

  // Every comparison consumes two input columns
  auto const num_cols = num_comparisons * 2;
  std::vector<std::unique_ptr<cudf::column>> columns;
  for (cudf::size_type col = 0; col < num_cols; ++col) {
    columns.emplace_back(create_string_column(num_rows, string_width, hit_rate));
  }

  cudf::table table{std::move(columns)};
  cudf::table_view const table_view = table.view();

  // Total number of character bytes over all input columns
  int64_t chars_size = 0;
  for (auto const& column : table_view) {
    chars_size += cudf::strings_column_view{column}.chars_size(cudf::get_default_stream());
  }

  // Result type of every comparison and reduction
  auto constexpr bool_type = cudf::data_type{cudf::type_id::BOOL8};

  // Report the amount of data moved through global memory.
  // NOTE(review): writes are counted as int32 per row although the result type above is
  // BOOL8; this mirrors the AST string benchmark's accounting - confirm it is intended.
  state.add_element_count(chars_size, "chars_size");
  state.add_global_memory_reads<nvbench::uint8_t>(chars_size);
  state.add_global_memory_writes<nvbench::int32_t>(num_rows);

  // Evaluate (a == b && c == d && e == f && ...) one binary operation at a time
  state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) {
    rmm::cuda_stream_view stream{launch.get_stream().get_stream()};

    // Seed the running result with the first pair
    auto reduction =
      cudf::binary_operation(table.get_column(0), table.get_column(1), cmp_op, bool_type, stream);

    for (cudf::size_type pair = 1; pair < num_comparisons; ++pair) {
      auto comparison = cudf::binary_operation(
        table.get_column(pair * 2), table.get_column(pair * 2 + 1), cmp_op, bool_type, stream);
      auto reduced =
        cudf::binary_operation(*comparison, *reduction, reduce_op, bool_type, stream);
      // Wait for the stream before the previous intermediates are released
      stream.synchronize();
      reduction = std::move(reduced);
    }
  });
}

#define BINARYOP_TRANSFORM_BENCHMARK_DEFINE(name, key_type, tree_type, reuse_columns) \
\
static void name(::nvbench::state& st) \
{ \
BM_binaryop_transform<key_type, tree_type, reuse_columns>(st); \
::BM_binaryop_transform<key_type, tree_type, reuse_columns>(st); \
} \
NVBENCH_BENCH(name) \
.add_int64_axis("tree_levels", {1, 2, 5, 10}) \
Expand All @@ -86,3 +147,20 @@ BINARYOP_TRANSFORM_BENCHMARK_DEFINE(binaryop_double_imbalanced_unique,
double,
TreeType::IMBALANCED_LEFT,
false);

// Declares and registers one string-comparison binary-operation benchmark.
//
// Expands to a static function `name` that forwards the nvbench state to
// BM_string_compare_binaryop_transform<cmp_op, reduce_op>, then registers that function with
// NVBENCH_BENCH under the same name and attaches the four int64 axes the benchmark body
// reads: "string_width", "num_rows", "num_comparisons" and "hit_rate".
//
// name      - identifier of the generated function; also used (stringified) as the benchmark name
// cmp_op    - cudf::binary_operator passed as the first template argument
// reduce_op - cudf::binary_operator passed as the second template argument
//
// The expansion deliberately ends without a semicolon so the use site supplies it.
#define STRING_COMPARE_BINARYOP_TRANSFORM_BENCHMARK_DEFINE(name, cmp_op, reduce_op) \
\
static void name(::nvbench::state& st) \
{ \
::BM_string_compare_binaryop_transform<cmp_op, reduce_op>(st); \
} \
NVBENCH_BENCH(name) \
.set_name(#name) \
.add_int64_axis("string_width", {32, 64, 128, 256}) \
.add_int64_axis("num_rows", {32768, 262144, 2097152}) \
.add_int64_axis("num_comparisons", {1, 2, 3, 4}) \
.add_int64_axis("hit_rate", {50, 100})

// String equality comparisons whose results are combined with logical AND.
STRING_COMPARE_BINARYOP_TRANSFORM_BENCHMARK_DEFINE(string_compare_binaryop_transform,
cudf::binary_operator::EQUAL,
cudf::binary_operator::LOGICAL_AND);
2 changes: 1 addition & 1 deletion cpp/benchmarks/binaryop/compiled_binaryop.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@ void BM_compiled_binaryop(nvbench::state& state, cudf::binary_operator binop)
// use number of bytes read and written to global memory
state.add_global_memory_reads<TypeLhs>(table_size);
state.add_global_memory_reads<TypeRhs>(table_size);
state.add_global_memory_reads<TypeOut>(table_size);
state.add_global_memory_writes<TypeOut>(table_size);

state.exec(nvbench::exec_tag::sync,
[&](nvbench::launch&) { cudf::binary_operation(lhs, rhs, binop, output_dtype); });
Expand Down
56 changes: 56 additions & 0 deletions cpp/benchmarks/common/generate_input.cu
Original file line number Diff line number Diff line change
Expand Up @@ -17,13 +17,17 @@
#include "generate_input.hpp"
#include "random_distribution_factory.cuh"

#include <cudf_test/column_wrapper.hpp>

#include <cudf/column/column.hpp>
#include <cudf/column/column_factories.hpp>
#include <cudf/copying.hpp>
#include <cudf/detail/gather.hpp>
#include <cudf/detail/valid_if.cuh>
#include <cudf/filling.hpp>
#include <cudf/null_mask.hpp>
#include <cudf/scalar/scalar_factories.hpp>
#include <cudf/strings/combine.hpp>
#include <cudf/table/table.hpp>
#include <cudf/types.hpp>
#include <cudf/utilities/default_stream.hpp>
Expand Down Expand Up @@ -918,6 +922,58 @@ std::unique_ptr<cudf::table> create_sequence_table(std::vector<cudf::type_id> co
return std::make_unique<cudf::table>(std::move(columns));
}

// Builds a strings column of `num_rows` rows by gathering from a small fixed pool of sample
// strings.
//
// - `row_width`: when `row_width / 32` is greater than 1, every sample string is concatenated
//   with itself that many times before gathering; otherwise the samples are used as-is.
// - `hit_rate`: percentage of rows that are forced to the first sample string (pool index 0).
//   `num_rows * hit_rate / 100` rows, spaced `num_rows / num_matches` apart starting at row 0,
//   are set to index 0; the remaining rows use randomly generated pool indices configured
//   with the bounds [1, pool size - 1].
//
// When the computed number of matching rows is zero (for example `hit_rate == 0` or
// `num_rows == 0`) no rows are forced to index 0. This also avoids dividing by zero when
// computing the spacing between matches.
std::unique_ptr<cudf::column> create_string_column(cudf::size_type num_rows,
                                                   cudf::size_type row_width,
                                                   int32_t hit_rate)
{
  // build input table using the following data
  auto raw_data = cudf::test::strings_column_wrapper(
                    {
                      "123 abc 4567890 DEFGHI 0987 5W43",  // matches both patterns;
                      "012345 6789 01234 56789 0123 456",  // the rest do not match
                      "abc 4567890 DEFGHI 0987 Wxyz 123",
                      "abcdefghijklmnopqrstuvwxyz 01234",
                      "",
                      "AbcéDEFGHIJKLMNOPQRSTUVWXYZ 01",
                      "9876543210,abcdefghijklmnopqrstU",
                      "9876543210,abcdefghijklmnopqrstU",
                      "123 édf 4567890 DéFG 0987 X5",
                      "1",
                    })
                    .release();

  // Widen the samples by concatenating each one with itself row_width / 32 times
  auto const repeat_count = row_width / 32;
  if (repeat_count > 1) {
    std::vector<cudf::column_view> columns;
    for (int i = 0; i < repeat_count; ++i) {
      columns.push_back(raw_data->view());
    }
    raw_data = cudf::strings::concatenate(cudf::table_view(columns));
  }
  auto data_view = raw_data->view();

  // compute number of rows in num_rows that should match
  auto const num_matches = (static_cast<int64_t>(num_rows) * hit_rate) / 100;

  // Create a randomized gather-map to build a column out of the strings in data.
  // Index 0 is excluded from the configured range so that matches are placed only by the
  // scatter step below.
  data_profile gather_profile =
    data_profile_builder().cardinality(0).null_probability(0.0).distribution(
      cudf::type_id::INT32, distribution_id::UNIFORM, 1, data_view.size() - 1);
  auto gather_table =
    create_random_table({cudf::type_id::INT32}, row_count{num_rows}, gather_profile);
  gather_table->get_column(0).set_null_mask(rmm::device_buffer{}, 0);

  // Place 0-index values throughout the gather-map at a fixed stride. Skipped entirely when
  // no row should match: the stride would otherwise be computed as num_rows / 0.
  if (num_matches > 0) {
    auto const match_stride = static_cast<int32_t>(num_rows / num_matches);
    auto scatter_data       = cudf::sequence(static_cast<cudf::size_type>(num_matches),
                                       cudf::numeric_scalar<int32_t>(0),
                                       cudf::numeric_scalar<int32_t>(match_stride));
    auto zero_scalar        = cudf::numeric_scalar<int32_t>(0);
    gather_table = cudf::scatter({zero_scalar}, scatter_data->view(), gather_table->view());
  }

  auto const gather_map = gather_table->view().column(0);
  auto result           = cudf::gather(cudf::table_view({data_view}), gather_map);

  // release() yields a temporary vector of columns; move the single column out of it
  return std::move(result->release().front());
}

std::pair<rmm::device_buffer, cudf::size_type> create_random_null_mask(
cudf::size_type size, std::optional<double> null_probability, unsigned seed)
{
Expand Down
12 changes: 12 additions & 0 deletions cpp/benchmarks/common/generate_input.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -670,6 +670,18 @@ std::unique_ptr<cudf::column> create_random_column(cudf::type_id dtype_id,
data_profile const& data_params = data_profile{},
unsigned seed = 1);

/**
 * @brief Deterministically generates a large string column filled with data with the given
 * parameters.
 *
 * Rows are gathered from a small fixed pool of sample strings. A percentage of the rows,
 * given by `hit_rate`, is set to the first sample string; the remaining rows are picked from
 * the other samples.
 *
 * @param num_rows Number of rows in the output column
 * @param row_width Nominal width of each string in the column. The sample strings are
 * repeated `row_width / 32` times when that quotient is greater than 1, so the width is
 * effectively applied in multiples of 32
 * @param hit_rate The hit rate percentage, ranging from 0 to 100; the share of rows set to the
 * first sample string
 * @return The generated strings column
 */
std::unique_ptr<cudf::column> create_string_column(cudf::size_type num_rows,
cudf::size_type row_width,
int32_t hit_rate);

/**
* @brief Generate sequence columns starting with value 0 in first row and increasing by 1 in
* subsequent rows.
Expand Down
Loading

0 comments on commit 4c04b7c

Please sign in to comment.