From 111caf65e623ea2d8c850cd1a8d527a2839af277 Mon Sep 17 00:00:00 2001 From: David Sisson Date: Fri, 23 Jun 2023 01:22:20 -0700 Subject: [PATCH] feat: add root names to the textplan Root names are added to both the parser and converter in this PR. The root relation is not being treated as a relation internally and instead is treated merely as annotation containing a list of names. This is primarily because the relation-related codepaths make assumptions that wouldn't apply to the root (such as having a valid Relation proto as its data type). --- src/substrait/textplan/SymbolTable.h | 1 + src/substrait/textplan/SymbolTablePrinter.cpp | 57 +++++++++++++++++++ .../converter/InitialPlanProtoVisitor.cpp | 9 +++ .../parser/SubstraitPlanRelationVisitor.cpp | 3 + .../textplan/parser/SubstraitPlanVisitor.cpp | 31 ++++++++++ .../parser/grammar/SubstraitPlanLexer.g4 | 1 + .../parser/grammar/SubstraitPlanParser.g4 | 3 + .../parser/tests/TextPlanParserTest.cpp | 2 +- 8 files changed, 106 insertions(+), 1 deletion(-) diff --git a/src/substrait/textplan/SymbolTable.h b/src/substrait/textplan/SymbolTable.h index 19e6d419..a0b4ad58 100644 --- a/src/substrait/textplan/SymbolTable.h +++ b/src/substrait/textplan/SymbolTable.h @@ -25,6 +25,7 @@ enum class SymbolType { kSource = 7, kSourceDetail = 8, kField = 9, + kRoot = 10, kUnknown = -1, }; diff --git a/src/substrait/textplan/SymbolTablePrinter.cpp b/src/substrait/textplan/SymbolTablePrinter.cpp index 121274a6..2faa1c85 100644 --- a/src/substrait/textplan/SymbolTablePrinter.cpp +++ b/src/substrait/textplan/SymbolTablePrinter.cpp @@ -158,6 +158,44 @@ std::string outputRelationsSection(const SymbolTable& symbolTable) { return text.str(); } +std::string outputRootSection(const SymbolTable& symbolTable) { + std::stringstream text; + bool hasPreviousText = false; + for (const SymbolInfo& info : symbolTable) { + if (info.type != SymbolType::kRoot) { + continue; + } + auto names = ANY_CAST(std::vector, info.blob); + if (names.empty()) { + // No point in printing an empty section. + continue; + } + if (hasPreviousText) { + text << "\n"; + } + text << "root {" + << "\n"; + text << " names = ["; + bool hadName = false; + for (const auto& name : names) { + if (hadName) { + text << ",\n"; + } else { + text << "\n"; + } + text << " " << name; + hadName = true; + } + if (hadName) { + text << "\n"; + } + text << " ]\n"; + text << "}\n"; + hasPreviousText = true; + } + return text.str(); +} + std::string outputSchemaSection(const SymbolTable& symbolTable) { std::stringstream text; bool hasPreviousText = false; @@ -427,6 +465,15 @@ std::string SymbolTablePrinter::outputToText(const SymbolTable& symbolTable) { hasPreviousText = true; } + newText = outputRootSection(symbolTable); + if (!newText.empty()) { + if (hasPreviousText) { + text << "\n"; + } + text << newText; + hasPreviousText = true; + } + newText = outputSchemaSection(symbolTable); if (!newText.empty()) { if (hasPreviousText) { @@ -676,6 +723,16 @@ ::substrait::proto::Plan SymbolTablePrinter::outputToBinaryPlan( addInputsToRelation( *relationData->newPipelines[0], relation->mutable_root()->mutable_input()); + + const auto& rootSymbol = + symbolTable.nthSymbolByType(0, SymbolType::kRoot); + if (rootSymbol != SymbolInfo::kUnknown) { + const auto& rootNames = + ANY_CAST(std::vector, rootSymbol.blob); + for (const auto& name : rootNames) { + relation->mutable_root()->add_names(name); + } + } } } diff --git a/src/substrait/textplan/converter/InitialPlanProtoVisitor.cpp b/src/substrait/textplan/converter/InitialPlanProtoVisitor.cpp index 53a5ce24..566bc77b 100644 --- a/src/substrait/textplan/converter/InitialPlanProtoVisitor.cpp +++ b/src/substrait/textplan/converter/InitialPlanProtoVisitor.cpp @@ -167,6 +167,15 @@ std::any InitialPlanProtoVisitor::visitRelation( std::any InitialPlanProtoVisitor::visitRelationRoot( const ::substrait::proto::RelRoot& relation) { + auto uniqueName = symbolTable_->getUniqueName("root"); + std::vector names; + names.insert(names.end(), relation.names().begin(), relation.names().end()); + symbolTable_->defineSymbol( + uniqueName, + PROTO_LOCATION(relation), + SymbolType::kRoot, + SourceType::kUnknown, + names); BasePlanProtoVisitor::visitRelationRoot(relation); return std::nullopt; } diff --git a/src/substrait/textplan/parser/SubstraitPlanRelationVisitor.cpp b/src/substrait/textplan/parser/SubstraitPlanRelationVisitor.cpp index 471c1b29..9ed04f84 100644 --- a/src/substrait/textplan/parser/SubstraitPlanRelationVisitor.cpp +++ b/src/substrait/textplan/parser/SubstraitPlanRelationVisitor.cpp @@ -258,6 +258,9 @@ std::any SubstraitPlanRelationVisitor::visitRelation( // This error has been previously dealt with thus we can safely skip it. return defaultResult(); } + if (symbol->type == SymbolType::kRoot) { + return defaultResult(); + } auto relationData = ANY_CAST(std::shared_ptr, symbol->blob); ::substrait::proto::Rel relation; diff --git a/src/substrait/textplan/parser/SubstraitPlanVisitor.cpp b/src/substrait/textplan/parser/SubstraitPlanVisitor.cpp index 33657ad3..dd12f569 100644 --- a/src/substrait/textplan/parser/SubstraitPlanVisitor.cpp +++ b/src/substrait/textplan/parser/SubstraitPlanVisitor.cpp @@ -9,10 +9,13 @@ #include "substrait/textplan/Finally.h" #include "substrait/textplan/Location.h" #include "substrait/textplan/StructuredSymbolData.h" +#include "substrait/textplan/SymbolTable.h" #include "substrait/type/Type.h" namespace io::substrait::textplan { +const std::string kRootName{"root"}; + // Removes leading and trailing quotation marks. std::string extractFromString(std::string s) { if (s.size() < 2) { @@ -160,6 +163,34 @@ std::any SubstraitPlanVisitor::visitSchema_item( std::any SubstraitPlanVisitor::visitRelation( SubstraitPlanParser::RelationContext* ctx) { + if (!ctx->id().empty()) { + auto prevRoot = symbolTable_->lookupSymbolByName(kRootName); + if (prevRoot != nullptr) { + if (prevRoot->type == SymbolType::kRoot) { + errorListener_->addError( + ctx->getStart(), "A root relation was already defined."); + } else { + errorListener_->addError( + ctx->getStart(), "A relation named root was already defined."); + } + return nullptr; + } + std::vector names; + for (const auto& id : ctx->id()) { + names.push_back(id->getText()); + } + symbolTable_->defineSymbol( + kRootName, + Location(ctx), + SymbolType::kRoot, + SourceType::kUnknown, + names); + return nullptr; + } + if (ctx->relation_type() == nullptr) { + errorListener_->addError(ctx->getStart(), "Could not parse this relation."); + return nullptr; + } auto relType = ANY_CAST(RelationType, visitRelation_type(ctx->relation_type())); if (ctx->relation_ref() == nullptr) { diff --git a/src/substrait/textplan/parser/grammar/SubstraitPlanLexer.g4 b/src/substrait/textplan/parser/grammar/SubstraitPlanLexer.g4 index e96edf5f..1ad91127 100644 --- a/src/substrait/textplan/parser/grammar/SubstraitPlanLexer.g4 +++ b/src/substrait/textplan/parser/grammar/SubstraitPlanLexer.g4 @@ -50,6 +50,7 @@ NAMED_TABLE: 'NAMED_TABLE'; EXTENSION_TABLE: 'EXTENSION_TABLE'; SOURCE: 'SOURCE'; +ROOT: 'ROOT'; ITEMS: 'ITEMS'; NAMES: 'NAMES'; URI_FILE: 'URI_FILE'; diff --git a/src/substrait/textplan/parser/grammar/SubstraitPlanParser.g4 b/src/substrait/textplan/parser/grammar/SubstraitPlanParser.g4 index f5520241..8124f64c 100644 --- a/src/substrait/textplan/parser/grammar/SubstraitPlanParser.g4 +++ b/src/substrait/textplan/parser/grammar/SubstraitPlanParser.g4 @@ -40,6 +40,7 @@ pipeline // TODO -- Make the token order involving ids consistent between relations and other top-level entities. relation : relation_type RELATION relation_ref LEFTBRACE relation_detail* RIGHTBRACE + | ROOT LEFTBRACE NAMES EQUAL LEFTBRACKET id (COMMA id)* COMMA? RIGHTBRACKET RIGHTBRACE ; relation_type @@ -209,6 +210,8 @@ id simple_id : IDENTIFIER | FILTER + | ROOT + | SOURCE | SCHEMA | NULLVAL | SORT diff --git a/src/substrait/textplan/parser/tests/TextPlanParserTest.cpp b/src/substrait/textplan/parser/tests/TextPlanParserTest.cpp index d6c0f169..25e3baa6 100644 --- a/src/substrait/textplan/parser/tests/TextPlanParserTest.cpp +++ b/src/substrait/textplan/parser/tests/TextPlanParserTest.cpp @@ -1001,7 +1001,7 @@ std::vector getTestCases() { "1:0 → extraneous input 'relation' expecting {, " "'EXTENSION_SPACE', 'SCHEMA', 'PIPELINES', 'FILTER', " "'GROUPING', 'MEASURE', 'SORT', 'COUNT', 'TYPE', 'SOURCE', " - "'NULL', IDENTIFIER}", + "'ROOT', 'NULL', IDENTIFIER}", "1:24 → mismatched input '{' expecting 'RELATION'", "1:9 → Unrecognized relation type: notyperelation", }),