From 826fdb7dd9eb1c82640820651a165cb5a6eca588 Mon Sep 17 00:00:00 2001 From: mwish Date: Wed, 7 Aug 2024 15:30:05 +0800 Subject: [PATCH 1/4] Parquet: print sorting columns in metadata output when present --- cpp/src/parquet/printer.cc | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/cpp/src/parquet/printer.cc b/cpp/src/parquet/printer.cc index 33df5925a1cf1..2f6ba6e0b4840 100644 --- a/cpp/src/parquet/printer.cc +++ b/cpp/src/parquet/printer.cc @@ -128,6 +128,15 @@ void ParquetFilePrinter::DebugPrint(std::ostream& stream, std::list selecte stream << "--- Total Bytes: " << group_metadata->total_byte_size() << " ---\n"; stream << "--- Total Compressed Bytes: " << group_metadata->total_compressed_size() << " ---\n"; + auto sorting_columns = group_metadata->sorting_columns(); + if (!sorting_columns.empty()) { + stream << "--- Sort Columns:\n"; + for (auto column : sorting_columns) { + stream << "column_idx: " << column.column_idx + << ", descending: " << column.descending + << ", nulls_first: " << column.nulls_first << "\n"; + } + } stream << "--- Rows: " << group_metadata->num_rows() << " ---\n"; // Print column metadata @@ -267,6 +276,20 @@ void ParquetFilePrinter::JSONPrint(std::ostream& stream, std::list selected stream << " \"TotalBytes\": \"" << group_metadata->total_byte_size() << "\", "; stream << " \"TotalCompressedBytes\": \"" << group_metadata->total_compressed_size() << "\", "; + auto row_group_sorting_columns = group_metadata->sorting_columns(); + if (!row_group_sorting_columns.empty()) { + stream << " \"SortColumns\": ["; + for (size_t i = 0; i < row_group_sorting_columns.size(); i++) { + stream << "{\"column_idx\":" << row_group_sorting_columns[i].column_idx + << ", \"descending\":" << row_group_sorting_columns[i].descending + << ", \"nulls_first\": " << row_group_sorting_columns[i].nulls_first + << "}"; + if (i + 1 != row_group_sorting_columns.size()) { + stream << ", "; + } + } + stream << "], "; + } stream << " \"Rows\": \"" << 
group_metadata->num_rows() << "\",\n"; // Print column metadata From 4aaa382659678c09cff1a01cb1230a4541a441f2 Mon Sep 17 00:00:00 2001 From: mwish Date: Fri, 8 Nov 2024 13:20:57 +0800 Subject: [PATCH 2/4] add testing, and format json output --- cpp/src/parquet/printer.cc | 9 +++++---- cpp/src/parquet/reader_test.cc | 12 +++++++++++- 2 files changed, 16 insertions(+), 5 deletions(-) diff --git a/cpp/src/parquet/printer.cc b/cpp/src/parquet/printer.cc index 4d2cb75b15e64..82029edcb208c 100644 --- a/cpp/src/parquet/printer.cc +++ b/cpp/src/parquet/printer.cc @@ -296,17 +296,18 @@ void ParquetFilePrinter::JSONPrint(std::ostream& stream, std::list selected << "\", "; auto row_group_sorting_columns = group_metadata->sorting_columns(); if (!row_group_sorting_columns.empty()) { - stream << " \"SortColumns\": ["; + stream << " \"SortColumns\": [\n"; for (size_t i = 0; i < row_group_sorting_columns.size(); i++) { - stream << "{\"column_idx\":" << row_group_sorting_columns[i].column_idx + stream << " {\"column_idx\":" << row_group_sorting_columns[i].column_idx << ", \"descending\":" << row_group_sorting_columns[i].descending << ", \"nulls_first\": " << row_group_sorting_columns[i].nulls_first << "}"; if (i + 1 != row_group_sorting_columns.size()) { - stream << ", "; + stream << ","; } + stream << '\n'; } - stream << "], "; + stream << " ], "; } stream << " \"Rows\": \"" << group_metadata->num_rows() << "\",\n"; diff --git a/cpp/src/parquet/reader_test.cc b/cpp/src/parquet/reader_test.cc index 688c875b9ec0f..df8c5c63bfeea 100644 --- a/cpp/src/parquet/reader_test.cc +++ b/cpp/src/parquet/reader_test.cc @@ -1180,6 +1180,16 @@ TEST_F(TestJSONWithLocalFile, JSONOutputFLBA) { EXPECT_THAT(json_content, testing::HasSubstr(json_contains)); } +TEST_F(TestJSONWithLocalFile, JSONOutputSortColumns) { + std::string json_content = ReadFromLocalFile("sort_columns.parquet"); + + std::string json_contains = R"###("SortColumns": [ + {"column_idx":0, "descending":1, "nulls_first": 1}, + 
{"column_idx":1, "descending":0, "nulls_first": 0} + ])###"; + EXPECT_THAT(json_content, testing::HasSubstr(json_contains)); +} + // GH-44101: Test that JSON output is valid JSON TEST_F(TestJSONWithLocalFile, ValidJsonOutput) { auto check_json_valid = [](std::string_view json_string) -> ::arrow::Status { @@ -1196,7 +1206,7 @@ TEST_F(TestJSONWithLocalFile, ValidJsonOutput) { std::vector check_file_lists = { "data_index_bloom_encoding_with_length.parquet", "data_index_bloom_encoding_stats.parquet", "alltypes_tiny_pages_plain.parquet", - "concatenated_gzip_members.parquet", "nulls.snappy.parquet"}; + "concatenated_gzip_members.parquet", "nulls.snappy.parquet", "sort_columns.parquet"}; for (const auto& file : check_file_lists) { std::string json_content = ReadFromLocalFile(file); ASSERT_OK(check_json_valid(json_content)) From 9a98505bc10327d20f621f6842139073fe765383 Mon Sep 17 00:00:00 2001 From: mwish Date: Fri, 8 Nov 2024 15:50:18 +0800 Subject: [PATCH 3/4] more indent --- cpp/src/parquet/printer.cc | 4 ++-- cpp/src/parquet/reader_test.cc | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/cpp/src/parquet/printer.cc b/cpp/src/parquet/printer.cc index 82029edcb208c..730e1e17ab23d 100644 --- a/cpp/src/parquet/printer.cc +++ b/cpp/src/parquet/printer.cc @@ -298,8 +298,8 @@ void ParquetFilePrinter::JSONPrint(std::ostream& stream, std::list selected if (!row_group_sorting_columns.empty()) { stream << " \"SortColumns\": [\n"; for (size_t i = 0; i < row_group_sorting_columns.size(); i++) { - stream << " {\"column_idx\":" << row_group_sorting_columns[i].column_idx - << ", \"descending\":" << row_group_sorting_columns[i].descending + stream << " {\"column_idx\": " << row_group_sorting_columns[i].column_idx + << ", \"descending\": " << row_group_sorting_columns[i].descending << ", \"nulls_first\": " << row_group_sorting_columns[i].nulls_first << "}"; if (i + 1 != row_group_sorting_columns.size()) { diff --git a/cpp/src/parquet/reader_test.cc 
b/cpp/src/parquet/reader_test.cc index df8c5c63bfeea..accfec0cf7723 100644 --- a/cpp/src/parquet/reader_test.cc +++ b/cpp/src/parquet/reader_test.cc @@ -1184,8 +1184,8 @@ TEST_F(TestJSONWithLocalFile, JSONOutputSortColumns) { std::string json_content = ReadFromLocalFile("sort_columns.parquet"); std::string json_contains = R"###("SortColumns": [ - {"column_idx":0, "descending":1, "nulls_first": 1}, - {"column_idx":1, "descending":0, "nulls_first": 0} + {"column_idx": 0, "descending": 1, "nulls_first": 1}, + {"column_idx": 1, "descending": 0, "nulls_first": 0} ])###"; EXPECT_THAT(json_content, testing::HasSubstr(json_contains)); } From 5ed5caeb9af76ce750ea45262997b52555b1ea8d Mon Sep 17 00:00:00 2001 From: mwish Date: Fri, 8 Nov 2024 17:08:12 +0800 Subject: [PATCH 4/4] fix lint --- cpp/src/parquet/reader_test.cc | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/cpp/src/parquet/reader_test.cc b/cpp/src/parquet/reader_test.cc index accfec0cf7723..62a971799c2db 100644 --- a/cpp/src/parquet/reader_test.cc +++ b/cpp/src/parquet/reader_test.cc @@ -1205,8 +1205,11 @@ TEST_F(TestJSONWithLocalFile, ValidJsonOutput) { }; std::vector check_file_lists = { "data_index_bloom_encoding_with_length.parquet", - "data_index_bloom_encoding_stats.parquet", "alltypes_tiny_pages_plain.parquet", - "concatenated_gzip_members.parquet", "nulls.snappy.parquet", "sort_columns.parquet"}; + "data_index_bloom_encoding_stats.parquet", + "alltypes_tiny_pages_plain.parquet", + "concatenated_gzip_members.parquet", + "nulls.snappy.parquet", + "sort_columns.parquet"}; for (const auto& file : check_file_lists) { std::string json_content = ReadFromLocalFile(file); ASSERT_OK(check_json_valid(json_content))