From 1b925bfc7741eb22fed0a978fa0e1d0d5dfee601 Mon Sep 17 00:00:00 2001 From: Vukasin Milovanovic Date: Mon, 25 Sep 2023 13:09:16 -0700 Subject: [PATCH] Add Parquet reader benchmarks for row selection (#14147) Re-enabled the group of benchmarks that compares row selection options in Parquet reader. Use `read_parquet_metadata` to get the column names and number of row groups. Clean up read chunk computation for ORC and Parquet benchmarks. Authors: - Vukasin Milovanovic (https://github.com/vuule) Approvers: - https://github.com/nvdbaranec - Nghia Truong (https://github.com/ttnghia) URL: https://github.com/rapidsai/cudf/pull/14147 --- cpp/benchmarks/io/cuio_common.cpp | 18 ++--- cpp/benchmarks/io/orc/orc_reader_options.cpp | 12 ++-- .../io/parquet/parquet_reader_options.cpp | 65 +++++++++++-------- 3 files changed, 53 insertions(+), 42 deletions(-) diff --git a/cpp/benchmarks/io/cuio_common.cpp b/cpp/benchmarks/io/cuio_common.cpp index 6b8af91b842..b1aaef41340 100644 --- a/cpp/benchmarks/io/cuio_common.cpp +++ b/cpp/benchmarks/io/cuio_common.cpp @@ -15,6 +15,7 @@ */ #include +#include #include #include @@ -141,17 +142,18 @@ std::vector select_column_names(std::vector const& col return col_names_to_read; } -std::vector segments_in_chunk(int num_segments, int num_chunks, int chunk) +std::vector segments_in_chunk(int num_segments, int num_chunks, int chunk_idx) { CUDF_EXPECTS(num_segments >= num_chunks, "Number of chunks cannot be greater than the number of segments in the file"); - auto start_segment = [num_segments, num_chunks](int chunk) { - return num_segments * chunk / num_chunks; - }; - std::vector selected_segments; - for (auto segment = start_segment(chunk); segment < start_segment(chunk + 1); ++segment) { - selected_segments.push_back(segment); - } + CUDF_EXPECTS(chunk_idx < num_chunks, + "Chunk index must be smaller than the number of chunks in the file"); + + auto const segments_in_chunk = cudf::util::div_rounding_up_unsafe(num_segments, num_chunks); + auto const begin_segment = std::min(chunk_idx * segments_in_chunk, num_segments); + auto const end_segment = std::min(begin_segment + segments_in_chunk, num_segments); + std::vector selected_segments(end_segment - begin_segment); + std::iota(selected_segments.begin(), selected_segments.end(), begin_segment); return selected_segments; } diff --git a/cpp/benchmarks/io/orc/orc_reader_options.cpp b/cpp/benchmarks/io/orc/orc_reader_options.cpp index 647a411c89d..1f656f7ea70 100644 --- a/cpp/benchmarks/io/orc/orc_reader_options.cpp +++ b/cpp/benchmarks/io/orc/orc_reader_options.cpp @@ -19,6 +19,7 @@ #include #include +#include #include #include #include @@ -30,7 +31,7 @@ constexpr int64_t data_size = 512 << 20; // The number of separate read calls to use when reading files in multiple chunks // Each call reads roughly equal amounts of data -constexpr int32_t chunked_read_num_chunks = 8; +constexpr int32_t chunked_read_num_chunks = 4; std::vector get_top_level_col_names(cudf::io::source_info const& source) { @@ -88,7 +89,7 @@ void BM_orc_read_varying_options(nvbench::state& state, auto const num_stripes = cudf::io::read_orc_metadata(source_sink.make_source_info()).num_stripes(); - cudf::size_type const chunk_row_cnt = view.num_rows() / num_chunks; + auto const chunk_row_cnt = cudf::util::div_rounding_up_unsafe(view.num_rows(), num_chunks); auto mem_stats_logger = cudf::memory_stats_logger(); state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value())); @@ -99,7 +100,6 @@ void BM_orc_read_varying_options(nvbench::state& state, timer.start(); cudf::size_type rows_read = 0; for (int32_t chunk = 0; chunk < num_chunks; ++chunk) { - auto const is_last_chunk = chunk == (num_chunks - 1); switch (RowSelection) { case row_selection::ALL: break; case row_selection::STRIPES: @@ -108,7 +108,6 @@ void BM_orc_read_varying_options(nvbench::state& state, case row_selection::NROWS: read_options.set_skip_rows(chunk * chunk_row_cnt); read_options.set_num_rows(chunk_row_cnt); - if (is_last_chunk) read_options.set_num_rows(-1); break; default: CUDF_FAIL("Unsupported row selection method"); } @@ -132,9 +131,6 @@ using col_selections = nvbench::enum_type_list; -using row_selections = - nvbench::enum_type_list; - NVBENCH_BENCH_TYPES(BM_orc_read_varying_options, NVBENCH_TYPE_AXES(col_selections, nvbench::enum_type_list, @@ -146,6 +142,8 @@ NVBENCH_BENCH_TYPES(BM_orc_read_varying_options, {"column_selection", "row_selection", "uses_index", "uses_numpy_dtype", "timestamp_type"}) .set_min_samples(4); +using row_selections = + nvbench::enum_type_list; NVBENCH_BENCH_TYPES(BM_orc_read_varying_options, NVBENCH_TYPE_AXES(nvbench::enum_type_list, row_selections, diff --git a/cpp/benchmarks/io/parquet/parquet_reader_options.cpp b/cpp/benchmarks/io/parquet/parquet_reader_options.cpp index 4105f2182d7..9f221de7da2 100644 --- a/cpp/benchmarks/io/parquet/parquet_reader_options.cpp +++ b/cpp/benchmarks/io/parquet/parquet_reader_options.cpp @@ -19,6 +19,7 @@ #include #include +#include #include #include @@ -26,21 +27,21 @@ // Size of the data in the benchmark dataframe; chosen to be low enough to allow benchmarks to // run on most GPUs, but large enough to allow highest throughput -constexpr std::size_t data_size = 512 << 20; -constexpr std::size_t row_group_size = 128 << 20; +constexpr std::size_t data_size = 512 << 20; +// The number of separate read calls to use when reading files in multiple chunks +// Each call reads roughly equal amounts of data +constexpr int32_t chunked_read_num_chunks = 4; std::vector get_top_level_col_names(cudf::io::source_info const& source) { - cudf::io::parquet_reader_options const read_options = - cudf::io::parquet_reader_options::builder(source); - auto const schema = cudf::io::read_parquet(read_options).metadata.schema_info; - - std::vector names; - names.reserve(schema.size()); - std::transform(schema.cbegin(), schema.cend(), std::back_inserter(names), [](auto const& c) { - return c.name; - }); - return names; + auto const top_lvl_cols = cudf::io::read_parquet_metadata(source).schema().root().children(); + std::vector col_names; + std::transform(top_lvl_cols.cbegin(), + top_lvl_cols.cend(), + std::back_inserter(col_names), + [](auto const& col_meta) { return col_meta.name(); }); + + return col_names; } template , nvbench::enum_type>) { + auto const num_chunks = RowSelection == row_selection::ALL ? 1 : chunked_read_num_chunks; + auto constexpr str_to_categories = ConvertsStrings == converts_strings::YES; auto constexpr uses_pd_metadata = UsesPandasMetadata == uses_pandas_metadata::YES; @@ -87,9 +90,8 @@ void BM_parquet_read_options(nvbench::state& state, .use_pandas_metadata(uses_pd_metadata) .timestamp_type(ts_type); - // TODO: add read_parquet_metadata to properly calculate #row_groups - auto constexpr num_row_groups = data_size / row_group_size; - auto constexpr num_chunks = 1; + auto const num_row_groups = read_parquet_metadata(source_sink.make_source_info()).num_rowgroups(); + auto const chunk_row_cnt = cudf::util::div_rounding_up_unsafe(view.num_rows(), num_chunks); auto mem_stats_logger = cudf::memory_stats_logger(); state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value())); @@ -100,18 +102,15 @@ void BM_parquet_read_options(nvbench::state& state, timer.start(); cudf::size_type rows_read = 0; for (int32_t chunk = 0; chunk < num_chunks; ++chunk) { - auto const is_last_chunk = chunk == (num_chunks - 1); switch (RowSelection) { case row_selection::ALL: break; case row_selection::ROW_GROUPS: { - auto row_groups_to_read = segments_in_chunk(num_row_groups, num_chunks, chunk); - if (is_last_chunk) { - // Need to assume that an additional "overflow" row group is present - row_groups_to_read.push_back(num_row_groups); - } - read_options.set_row_groups({row_groups_to_read}); + read_options.set_row_groups({segments_in_chunk(num_row_groups, num_chunks, chunk)}); } break; - case row_selection::NROWS: [[fallthrough]]; + case row_selection::NROWS: + read_options.set_skip_rows(chunk * chunk_row_cnt); + read_options.set_num_rows(chunk_row_cnt); + break; default: CUDF_FAIL("Unsupported row selection method"); } @@ -130,14 +129,26 @@ void BM_parquet_read_options(nvbench::state& state, state.add_buffer_size(source_sink.size(), "encoded_file_size", "encoded_file_size"); } +using row_selections = + nvbench::enum_type_list; +NVBENCH_BENCH_TYPES(BM_parquet_read_options, + NVBENCH_TYPE_AXES(nvbench::enum_type_list, + row_selections, + nvbench::enum_type_list, + nvbench::enum_type_list, + nvbench::enum_type_list)) + .set_name("parquet_read_row_selection") + .set_type_axes_names({"column_selection", + "row_selection", + "str_to_categories", + "uses_pandas_metadata", + "timestamp_type"}) + .set_min_samples(4); + using col_selections = nvbench::enum_type_list; - -// TODO: row_selection::ROW_GROUPS disabled until we add an API to read metadata from a parquet file -// and determine num row groups. https://github.com/rapidsai/cudf/pull/9963#issuecomment-1004832863 - NVBENCH_BENCH_TYPES(BM_parquet_read_options, NVBENCH_TYPE_AXES(col_selections, nvbench::enum_type_list,