diff --git a/src/web/vaev-markup/html.cpp b/src/web/vaev-markup/html.cpp
index 70aa7270..66c58fa6 100644
--- a/src/web/vaev-markup/html.cpp
+++ b/src/web/vaev-markup/html.cpp
@@ -3963,6 +3963,137 @@ static void insertAComment(HtmlParser &b, HtmlToken const &t) {
     location.insert(comment);
 }
 
+// Picks the insertion mode matching the stack of open elements, scanning it from the current node down; see https://html.spec.whatwg.org/multipage/parsing.html#reset-the-insertion-mode-appropriately
+static void resetTheInsertionModeAppropriately(HtmlParser &b) {
+    // 1. Let last be false.
+    bool _last = false;
+
+    // 2. Let node be the last node in the stack of open elements.
+    // 3. Loop: If node is the first node in the stack of open elements, then set last to true, and,
+    // if the parser was created as part of the HTML fragment parsing algorithm (fragment case),
+    // set node to the context element passed to that algorithm. (TODO: the fragment case is not handled here.)
+
+    auto nodeIdx = b._openElements.len() - 1; // NOTE(review): assumes a non-empty stack, otherwise this index wraps; confirm against callers
+    while (true) {
+        auto node = b._openElements[nodeIdx];
+
+        if (nodeIdx == 0)
+            _last = true;
+
+        // 4. If node is a select element, run these substeps:
+        if (node->tagName == Html::SELECT) {
+            // 4.1 If last is true, jump to the step below labeled done.
+            if (_last) {
+                b._switchTo(HtmlParser::Mode::IN_SELECT);
+                return;
+            }
+
+            // 4.2 Let ancestor be node.
+            auto ancestorIdx = nodeIdx; // start from the select itself, not from the top of the stack
+
+            // 4.3 Loop: If ancestor is the first node in the stack of open elements, jump to the step below labeled done.
+            while (ancestorIdx != 0) {
+                // 4.4 Let ancestor be the node before ancestor in the stack of open elements.
+                ancestorIdx--;
+
+                // 4.5 If ancestor is a template node, jump to the step below labeled done.
+                if (b._openElements[ancestorIdx]->tagName == Html::TEMPLATE)
+                    break;
+
+                // 4.6 If ancestor is a table node, switch the insertion mode to "in select in table" and return.
+                if (b._openElements[ancestorIdx]->tagName == Html::TABLE) {
+                    b._switchTo(HtmlParser::Mode::IN_SELECT_IN_TABLE);
+                    return;
+                }
+
+                // 4.7 Jump back to the step labeled loop.
+            }
+
+            // 4.8 Done: Switch the insertion mode to "in select" and return.
+            b._switchTo(HtmlParser::Mode::IN_SELECT);
+            return;
+        }
+
+        // 5. If node is a td or th element and last is false, then switch the insertion mode to "in cell" and return.
+        if ((node->tagName == Html::TD or node->tagName == Html::TH) and not _last) {
+            b._switchTo(HtmlParser::Mode::IN_CELL);
+            return;
+        }
+
+        // 6. If node is a tr element, then switch the insertion mode to "in row" and return.
+        if (node->tagName == Html::TR) {
+            b._switchTo(HtmlParser::Mode::IN_ROW);
+            return;
+        }
+
+        // 7. If node is a tbody, thead, or tfoot element, then switch the insertion mode to "in table body" and return.
+        if (node->tagName == Html::TBODY or node->tagName == Html::THEAD or node->tagName == Html::TFOOT) {
+            b._switchTo(HtmlParser::Mode::IN_TABLE_BODY);
+            return;
+        }
+
+        // 8. If node is a caption element, then switch the insertion mode to "in caption" and return.
+        if (node->tagName == Html::CAPTION) {
+            b._switchTo(HtmlParser::Mode::IN_CAPTION);
+            return;
+        }
+
+        // 9. If node is a colgroup element, then switch the insertion mode to "in column group" and return.
+        if (node->tagName == Html::COLGROUP) {
+            b._switchTo(HtmlParser::Mode::IN_COLUMN_GROUP);
+            return;
+        }
+
+        // 10. If node is a table element, then switch the insertion mode to "in table" and return.
+        if (node->tagName == Html::TABLE) {
+            b._switchTo(HtmlParser::Mode::IN_TABLE);
+            return;
+        }
+
+        // TODO (not implemented): 11. If node is a template element, then switch the insertion mode to the current template insertion mode and return.
+
+        // 12. If node is a head element and last is false, then switch the insertion mode to "in head" and return.
+        if (node->tagName == Html::HEAD and not _last) {
+            b._switchTo(HtmlParser::Mode::IN_HEAD);
+            return;
+        }
+
+        // 13. If node is a body element, then switch the insertion mode to "in body" and return.
+        if (node->tagName == Html::BODY) {
+            b._switchTo(HtmlParser::Mode::IN_BODY);
+            return;
+        }
+
+        // 14. If node is a frameset element, then switch the insertion mode to "in frameset" and return. (fragment case)
+        if (node->tagName == Html::FRAMESET) {
+            b._switchTo(HtmlParser::Mode::IN_FRAMESET);
+            return;
+        }
+
+        // 15. If node is an html element, run these substeps:
+        if (node->tagName == Html::HTML) {
+            // 15.1 If the head element pointer is null, switch the insertion mode to "before head" and return. (fragment case)
+            if (not b._headElement)
+                b._switchTo(HtmlParser::Mode::BEFORE_HEAD);
+
+            // 15.2 Otherwise, the head element pointer is not null, switch the insertion mode to "after head" and return.
+            else
+                b._switchTo(HtmlParser::Mode::AFTER_HEAD);
+
+            return;
+        }
+
+        // 16. If last is true, then switch the insertion mode to "in body" and return. (fragment case)
+        if (_last) {
+            b._switchTo(HtmlParser::Mode::IN_BODY);
+            return; // must stop here: nodeIdx is 0 and decrementing it would wrap around
+        }
+
+        // 17-18. Let node now be the node before node in the stack of open elements, then return to the step labeled loop.
+        nodeIdx--;
+    }
+}
+
 // 13.2.6.2 MARK: Parsing elements that contain only text
 // https://html.spec.whatwg.org/multipage/parsing.html#parsing-elements-that-contain-only-text
 
@@ -3986,7 +4117,7 @@ static constexpr Array IMPLIED_END_TAGS = {
     Html::DD, Html::DT, Html::LI, Html::OPTION, Html::OPTGROUP, Html::P, Html::RB, Html::RP, Html::RT, Html::RTC
 };
 
-static void generateImpliedEndTags(HtmlParser &b, TagName except) {
+static void generateImpliedEndTags(HtmlParser &b, Opt<TagName> except = NONE) {
     while (contains(IMPLIED_END_TAGS, last(b._openElements)->tagName) and
            last(b._openElements)->tagName != except) {
         b._openElements.popBack();
@@ -4671,6 +4802,19 @@ void HtmlParser::_handleInBody(HtmlToken const &t) {
     // TODO: An end tag token whose tag name is one of: "applet", "marquee", "object"
 
     // TODO: A start tag whose tag name is "table"
+    else if (t.type == HtmlToken::START_TAG and t.name == "table") {
+        // TODO: If the Document is not set to quirks mode,
+        // and the stack of open elements has a p element in button scope, then close a p
element.
+
+        // Insert an HTML element for the token.
+        insertHtmlElement(*this, t);
+
+        // Set the frameset-ok flag to "not ok".
+        _framesetOk = false;
+
+        // Switch the insertion mode to "in table".
+        _switchTo(Mode::IN_TABLE);
+    }
 
     // TODO: An end tag whose tag name is "br"
 
@@ -4814,6 +4958,641 @@ void HtmlParser::_handleText(HtmlToken const &t) {
     // FIXME: Implement the rest of the rules
 }
 
+static void _inTableModeAnythingElse(HtmlParser &b, HtmlToken const &t) { // the "anything else" entry of the "in table" insertion mode
+    // Parse error.
+    b._raise();
+
+    // Enable foster parenting,
+    b._fosterParenting = true;
+
+    // process the token using the rules for the "in body" insertion mode,
+    b._acceptIn(HtmlParser::Mode::IN_BODY, t);
+
+    // and then disable foster parenting.
+    b._fosterParenting = false;
+}
+
+// 13.2.6.4.9 MARK: The "in table" insertion mode
+// https://html.spec.whatwg.org/multipage/parsing.html#parsing-main-intable
+void HtmlParser::_handleInTable(HtmlToken const &t) {
+    auto _clearTheStackBackToATableContext = [&]() {
+        while (last(_openElements)->tagName != Html::TABLE and
+               last(_openElements)->tagName != Html::TEMPLATE and
+               last(_openElements)->tagName != Html::HTML) {
+
+            _openElements.popBack();
+        }
+    };
+
+    // A character token, if the current node is table, tbody, template, tfoot, thead, or tr element
+    if (t.type == HtmlToken::CHARACTER and
+        (last(_openElements)->tagName == Html::TABLE or last(_openElements)->tagName == Html::TBODY or
+         last(_openElements)->tagName == Html::TEMPLATE or last(_openElements)->tagName == Html::TFOOT or
+         last(_openElements)->tagName == Html::THEAD or last(_openElements)->tagName == Html::TR
+        )) {
+        // Let the pending table character tokens be an empty list of tokens.
+        _pendingTableCharacterTokens.clear();
+
+        // Let the original insertion mode be the current insertion mode.
+        _originalInsertionMode = _insertionMode;
+
+        // Switch the insertion mode to "in table text" and reprocess the token.
+        _switchTo(Mode::IN_TABLE_TEXT);
+        accept(t);
+    }
+
+    // A comment token
+    else if (t.type == HtmlToken::COMMENT) {
+        // Insert a comment.
+        insertAComment(*this, t);
+    }
+
+    // A DOCTYPE token
+    else if (t.type == HtmlToken::DOCTYPE) {
+        // Parse error. Ignore the token.
+        _raise();
+    }
+
+    // A start tag whose tag name is "caption"
+    else if (t.type == HtmlToken::START_TAG and t.name == "caption") {
+        // Clear the stack back to a table context. (See below.)
+        _clearTheStackBackToATableContext();
+
+        // TODO: Insert a marker at the end of the list of active formatting elements.
+
+        // Insert an HTML element for the token, then switch the insertion mode to "in caption".
+        insertHtmlElement(*this, t);
+        _switchTo(Mode::IN_CAPTION);
+    }
+
+    // A start tag whose tag name is "colgroup"
+    else if (t.type == HtmlToken::START_TAG and t.name == "colgroup") {
+        // Clear the stack back to a table context. (See below.)
+        _clearTheStackBackToATableContext();
+
+        // Insert an HTML element for the token, then switch the insertion mode to "in column group".
+        insertHtmlElement(*this, t);
+        _switchTo(Mode::IN_COLUMN_GROUP);
+    }
+
+    // A start tag whose tag name is "col"
+    else if (t.type == HtmlToken::START_TAG and t.name == "col") {
+        // Clear the stack back to a table context. (See below.)
+        _clearTheStackBackToATableContext();
+
+        // Insert an HTML element for a "colgroup" start tag token with no attributes, then switch the insertion mode to "in column group".
+        HtmlToken colGroupToken;
+        colGroupToken.type = HtmlToken::START_TAG;
+        colGroupToken.name = "colgroup"s;
+        insertAForeignElement(*this, colGroupToken, Vaev::HTML);
+        _switchTo(Mode::IN_COLUMN_GROUP);
+
+        // Reprocess the current token.
+        accept(t);
+    }
+
+    // A start tag whose tag name is one of: "tbody", "tfoot", "thead"
+    else if (t.type == HtmlToken::START_TAG and
+             (t.name == "tbody" or t.name == "tfoot" or t.name == "thead")) {
+        // Clear the stack back to a table context. (See below.)
+        _clearTheStackBackToATableContext();
+
+        // Insert an HTML element for the token, then switch the insertion mode to "in table body".
+        insertHtmlElement(*this, t);
+        _switchTo(Mode::IN_TABLE_BODY);
+    }
+
+    // A start tag whose tag name is one of: "td", "th", "tr"
+    else if (t.type == HtmlToken::START_TAG and
+             (t.name == "td" or t.name == "th" or t.name == "tr")) {
+        // Clear the stack back to a table context. (See below.)
+        _clearTheStackBackToATableContext();
+
+        // Insert an HTML element for a "tbody" start tag token with no attributes, then switch the insertion mode to "in table body".
+        HtmlToken tableBodyToken;
+        tableBodyToken.type = HtmlToken::START_TAG;
+        tableBodyToken.name = "tbody"s;
+        insertAForeignElement(*this, tableBodyToken, Vaev::HTML);
+        _switchTo(Mode::IN_TABLE_BODY);
+
+        // Reprocess the current token.
+        accept(t);
+    }
+
+    // A start tag whose tag name is "table"
+    else if (t.type == HtmlToken::START_TAG and t.name == "table") {
+        // Parse error.
+        _raise();
+
+        // If the stack of open elements does not have a table element in table scope, ignore the token.
+        if (not _hasElementInTableScope(Html::TABLE))
+            return;
+
+        // Otherwise:
+
+        // Pop elements from this stack until a table element has been popped from the stack.
+        while (Karm::any(_openElements) and _openElements.popBack()->tagName != Html::TABLE) {
+            // do nothing
+        }
+
+        // Reset the insertion mode appropriately.
+        resetTheInsertionModeAppropriately(*this);
+
+        // Reprocess the token.
+        accept(t);
+    }
+
+    // An end tag whose tag name is "table"
+    else if (t.type == HtmlToken::END_TAG and t.name == "table") {
+        // If the stack of open elements does not have a table element in table scope, this is a parse error;
+        // ignore the token.
+        if (not _hasElementInTableScope(Html::TABLE)) {
+            _raise();
+            return;
+        }
+
+        // Pop elements from this stack until a table element has been popped from the stack.
+        while (Karm::any(_openElements) and _openElements.popBack()->tagName != Html::TABLE) {
+            // do nothing
+        }
+
+        // Reset the insertion mode appropriately.
+        resetTheInsertionModeAppropriately(*this);
+    }
+
+    // An end tag whose tag name is one of: "body", "caption", "col", "colgroup", "html", "tbody", "td", "tfoot", "th", "thead", "tr"
+    else if (t.type == HtmlToken::END_TAG and
+             (t.name == "body" or t.name == "caption" or t.name == "col" or
+              t.name == "colgroup" or t.name == "html" or t.name == "tbody" or
+              t.name == "td" or t.name == "tfoot" or t.name == "th" or
+              t.name == "thead" or t.name == "tr"
+             )) {
+        // Parse error. Ignore the token.
+        _raise();
+    }
+
+    // A start tag whose tag name is one of: "style", "script", "template"
+    else if (t.type == HtmlToken::START_TAG and
+             (t.name == "style" or t.name == "script" or t.name == "template")) {
+        // Process the token using the rules for the "in head" insertion mode.
+        _acceptIn(Mode::IN_HEAD, t);
+    }
+
+    // An end tag whose tag name is "template"
+    else if (t.type == HtmlToken::END_TAG and t.name == "template") {
+        // Process the token using the rules for the "in head" insertion mode.
+        _acceptIn(Mode::IN_HEAD, t);
+    }
+
+    // A start tag whose tag name is "input"
+    else if (t.type == HtmlToken::START_TAG and t.name == "input") {
+
+        // If the token does not have an attribute with the name "type",
+        // or if it does, but that attribute's value is not an ASCII case-insensitive match for the string "hidden",
+        bool hasHiddenAsTypeAttrValue = false;
+        for (auto &[name, value] : t.attrs) {
+            if (name == "type") {
+                // TODO: ASCII case-insensitive match
+                if (value == "hidden") {
+                    hasHiddenAsTypeAttrValue = true;
+                }
+
+                break;
+            }
+        }
+
+        // then: act as described in the "anything else" entry below.
+        if (not hasHiddenAsTypeAttrValue) { // only type=hidden inputs may stay inside the table
+            _inTableModeAnythingElse(*this, t);
+            return;
+        }
+
+        // Parse error.
+        _raise();
+
+        // Insert an HTML element for the token.
+        insertHtmlElement(*this, t);
+
+        // Pop that input element off the stack of open elements.
+        _openElements.popBack();
+
+        // TODO: Acknowledge the token's self-closing flag, if it is set.
+    }
+
+    // A start tag whose tag name is "form"
+    else if (t.type == HtmlToken::START_TAG and t.name == "form") {
+        // Parse error.
+        _raise();
+
+        // If there is a template element on the stack of open elements, or if the form element pointer is not null, ignore the token.
+        for (auto &el : _openElements) {
+            if (el->tagName == Html::TEMPLATE)
+                return;
+        }
+
+        if (_formElement)
+            return;
+
+        // Insert an HTML element for the token, and set the form element pointer to point to the element created.
+        // NOTE: the element is created from `t` itself rather than from a synthetic,
+        // attribute-less "form" token, so that whatever the original start tag
+        // carried ends up on the inserted element.
+
+        _formElement = insertAForeignElement(*this, t, Vaev::HTML);
+
+        // Pop that form element off the stack of open elements.
+        _openElements.popBack();
+    }
+
+    // An end-of-file token
+    else if (t.type == HtmlToken::END_OF_FILE) {
+        // Process the token using the rules for the "in body" insertion mode.
+        _acceptIn(Mode::IN_BODY, t);
+    }
+
+    // Anything else
+    else {
+        _inTableModeAnythingElse(*this, t);
+    }
+}
+
+// 13.2.6.4.10 MARK: The "in table text" insertion mode
+// https://html.spec.whatwg.org/multipage/parsing.html#parsing-main-intabletext
+void HtmlParser::_handleInTableText(HtmlToken const &t) {
+
+    // A character token that is U+0000 NULL
+    if (t.type == HtmlToken::CHARACTER and t.rune == '\0') {
+        // Parse error. Ignore the token.
+        _raise();
+    }
+
+    // Any other character token
+    else if (t.type == HtmlToken::CHARACTER) {
+        // Append the character token to the pending table character tokens list.
+        _pendingTableCharacterTokens.pushBack(t);
+    }
+
+    else {
+        // If any of the tokens in the pending table character tokens list are character tokens that are not ASCII
+        // whitespace,
+        // then this is a parse error:
+        bool hasNonWhitespace = false;
+        for (auto const &token : _pendingTableCharacterTokens) {
+            if (token.rune != '\t' and token.rune != '\n' and
+                token.rune != '\f' and token.rune != '\r' and token.rune != ' ') {
+                hasNonWhitespace = true;
+                break;
+            }
+        }
+
+        if (hasNonWhitespace) {
+            // reprocess the character tokens in the pending table character tokens list using the rules given in
+            // the "anything else" entry in the "in table" insertion mode.
+            for (auto const &token : _pendingTableCharacterTokens) {
+                _inTableModeAnythingElse(*this, token);
+            }
+        } else {
+            // Otherwise, insert the characters given by the pending table character tokens list.
+            for (auto const &token : _pendingTableCharacterTokens) {
+                insertACharacter(*this, token);
+            }
+        }
+
+        // Switch the insertion mode to the original insertion mode and reprocess the token.
+        _switchTo(_originalInsertionMode);
+        accept(t);
+    }
+}
+
+// 13.2.6.4.13 MARK: The "in table body" insertion mode
+// https://html.spec.whatwg.org/multipage/parsing.html#parsing-main-intbody
+void HtmlParser::_handleInTableBody(HtmlToken const &t) {
+    auto _clearTheStackBackToATableBodyContext = [&]() {
+        while (last(_openElements)->tagName != Html::TBODY and
+               last(_openElements)->tagName != Html::TFOOT and
+               last(_openElements)->tagName != Html::THEAD and
+               last(_openElements)->tagName != Html::TEMPLATE and
+               last(_openElements)->tagName != Html::HTML) {
+
+            _openElements.popBack();
+        }
+    };
+
+    // A start tag whose tag name is "tr"
+    if (t.type == HtmlToken::START_TAG and t.name == "tr") {
+        // Clear the stack back to a table body context. (See below.)
+        _clearTheStackBackToATableBodyContext();
+
+        // Insert an HTML element for the token, then switch the insertion mode to "in row".
+        insertHtmlElement(*this, t);
+        _switchTo(Mode::IN_ROW);
+    }
+
+    // A start tag whose tag name is one of: "th", "td"
+    else if (t.type == HtmlToken::START_TAG and (t.name == "th" or t.name == "td")) {
+        _raise();
+
+        // Clear the stack back to a table body context. (See below.)
+        _clearTheStackBackToATableBodyContext();
+
+        // Insert an HTML element for a "tr" start tag token with no attributes, then switch the insertion mode to "in row".
+        HtmlToken tableRowToken;
+        tableRowToken.type = HtmlToken::START_TAG;
+        tableRowToken.name = "tr"s;
+        insertAForeignElement(*this, tableRowToken, Vaev::HTML);
+
+        _switchTo(Mode::IN_ROW);
+
+        accept(t);
+    }
+
+    else if (t.type == HtmlToken::END_TAG and (t.name == "tbody" or t.name == "tfoot" or t.name == "thead")) {
+        // If the stack of open elements does not have an element in table scope that is an HTML element with the same
+        // tag name as the token, this is a parse error; ignore the token.
+
+        if (not _hasElementInTableScope(TagName::make(t.name, Vaev::HTML))) {
+            _raise();
+            return;
+        }
+
+        // Clear the stack back to a table body context. (See below.)
+        _clearTheStackBackToATableBodyContext();
+
+        // Pop the current node from the stack of open elements. Switch the insertion mode to "in table".
+        _openElements.popBack();
+        _switchTo(Mode::IN_TABLE);
+    }
+
+    else if ((t.type == HtmlToken::START_TAG and
+              (t.name == "caption" or t.name == "col" or t.name == "colgroup" or
+               t.name == "tbody" or t.name == "tfoot" or t.name == "thead"
+              )) or
+             (t.type == HtmlToken::END_TAG and t.name == "table"
+             )) {
+
+        // If the stack of open elements does not have a tbody, thead, or tfoot element in table scope,
+        // TODO: consider refactor so _hasElementInScope accepts list instead of single element
+        if (not _hasElementInTableScope(Html::TBODY) and
+            not _hasElementInTableScope(Html::THEAD) and
+            not _hasElementInTableScope(Html::TFOOT)) {
+            // this is a parse error; ignore the token.
+            _raise();
+            return;
+        }
+
+        // Otherwise:
+
+        // Clear the stack back to a table body context. (See below.)
+        _clearTheStackBackToATableBodyContext();
+
+        // Pop the current node from the stack of open elements. Switch the insertion mode to "in table".
+        _openElements.popBack();
+        _switchTo(Mode::IN_TABLE);
+
+        // Reprocess the token.
+        accept(t);
+    }
+
+    // An end tag whose tag name is one of: "body", "caption", "col", "colgroup", "html", "td", "th", "tr"
+    else if (t.type == HtmlToken::END_TAG and
+             (t.name == "body" or t.name == "caption" or t.name == "col" or
+              t.name == "colgroup" or t.name == "html" or
+              t.name == "td" or t.name == "th" or t.name == "tr"
+             )) {
+        // Parse error. Ignore the token.
+        _raise();
+    }
+
+    else {
+        // Process the token using the rules for the "in table" insertion mode.
+        _acceptIn(Mode::IN_TABLE, t);
+    }
+}
+
+// 13.2.6.4.14 MARK: The "in row" insertion mode
+// https://html.spec.whatwg.org/multipage/parsing.html#parsing-main-intr
+void HtmlParser::_handleInTableRow(HtmlToken const &t) {
+    auto _clearTheStackBackToATableRowContext = [&]() {
+        while (last(_openElements)->tagName != Html::TR and
+               last(_openElements)->tagName != Html::TEMPLATE and
+               last(_openElements)->tagName != Html::HTML) {
+
+            _openElements.popBack();
+        }
+    };
+
+    // A start tag whose tag name is one of: "th", "td"
+    if (t.type == HtmlToken::START_TAG and (t.name == "th" or t.name == "td")) {
+        // Clear the stack back to a table row context. (See below.)
+        _clearTheStackBackToATableRowContext();
+
+        // Insert an HTML element for the token, then switch the insertion mode to "in cell".
+        insertHtmlElement(*this, t);
+        _switchTo(Mode::IN_CELL);
+
+        // TODO: Insert a marker at the end of the list of active formatting elements.
+    }
+
+    // An end tag whose tag name is "tr"
+    else if (t.type == HtmlToken::END_TAG and t.name == "tr") {
+        if (not _hasElementInTableScope(Html::TR)) {
+            _raise();
+            return;
+        }
+
+        // Otherwise:
+
+        // Clear the stack back to a table row context. (See below.)
+        _clearTheStackBackToATableRowContext();
+
+        // Pop the current node (which will be a tr element) from the stack of open elements.
+        _openElements.popBack();
+
+        // Switch the insertion mode to "in table body".
+        _switchTo(Mode::IN_TABLE_BODY);
+    }
+
+    // A start tag whose tag name is one of: "caption", "col", "colgroup", "tbody", "tfoot", "thead", "tr"
+    // An end tag whose tag name is "table"
+    else if ((t.type == HtmlToken::START_TAG and
+              (t.name == "caption" or t.name == "col" or t.name == "colgroup" or
+               t.name == "tbody" or t.name == "tfoot" or t.name == "thead" or t.name == "tr"
+              )) or
+             (t.type == HtmlToken::END_TAG and t.name == "table"
+             )) {
+
+        // If the stack of open elements does not have a tr element in table scope,
+        if (not _hasElementInTableScope(Html::TR)) {
+            // this is a parse error; ignore the token.
+            _raise();
+            return;
+        }
+
+        // Otherwise:
+
+        // Clear the stack back to a table row context. (See below.)
+        _clearTheStackBackToATableRowContext();
+
+        // Pop the current node (which will be a tr element) from the stack of open elements.
+        _openElements.popBack();
+
+        // Switch the insertion mode to "in table body".
+        _switchTo(Mode::IN_TABLE_BODY);
+
+        // Reprocess the token.
+        accept(t);
+    }
+
+    // An end tag whose tag name is one of: "tbody", "tfoot", "thead"
+    else if (t.type == HtmlToken::END_TAG and (t.name == "tbody" or t.name == "tfoot" or t.name == "thead")) {
+        // If the stack of open elements does not have an element in table scope that is an HTML element with the same
+        // tag name as the token,
+
+        if (not _hasElementInTableScope(TagName::make(t.name, Vaev::HTML))) {
+            // this is a parse error; ignore the token.
+            _raise();
+            return;
+        }
+
+        // Clear the stack back to a table row context. (See below.)
+        _clearTheStackBackToATableRowContext();
+
+        // Pop the current node (which will be a tr element) from the stack of open elements.
+        _openElements.popBack();
+
+        // Switch the insertion mode to "in table body".
+        _switchTo(Mode::IN_TABLE_BODY);
+
+        // Reprocess the token.
+        accept(t);
+    }
+
+    // An end tag whose tag name is one of: "body", "caption", "col", "colgroup", "html", "td", "th"
+    else if (t.type == HtmlToken::END_TAG and
+             (t.name == "body" or t.name == "caption" or t.name == "col" or
+              t.name == "colgroup" or t.name == "html" or
+              t.name == "td" or t.name == "th"
+             )) {
+        // Parse error. Ignore the token.
+        _raise();
+    }
+
+    else {
+        // Process the token using the rules for the "in table" insertion mode.
+        _acceptIn(Mode::IN_TABLE, t);
+    }
+}
+
+// 13.2.6.4.15 MARK: The "in cell" insertion mode
+// https://html.spec.whatwg.org/multipage/parsing.html#parsing-main-intd
+void HtmlParser::_handleInCell(HtmlToken const &t) {
+    auto _closeTheCell = [&]() {
+        // Generate implied end tags.
+        generateImpliedEndTags(*this);
+
+        // If the current node is not now a td element or a th element, then this is a parse error.
+        if (last(_openElements)->tagName != Html::TD and last(_openElements)->tagName != Html::TH) {
+            _raise();
+        }
+
+        // Pop elements from the stack of open elements until a td element or a th element has been popped from the stack.
+        while (Karm::any(_openElements)) {
+            auto poppedEl = _openElements.popBack();
+            if (poppedEl->tagName == Html::TD or poppedEl->tagName == Html::TH)
+                break;
+        }
+
+        // TODO: Clear the list of active formatting elements up to the last marker.
+
+        // Switch the insertion mode to "in row".
+        _switchTo(Mode::IN_ROW);
+    };
+
+    // An end tag whose tag name is one of: "td", "th"
+    if (t.type == HtmlToken::END_TAG and (t.name == "td" or t.name == "th")) {
+        // If the stack of open elements does not have an element in table scope that is an HTML element with the same
+        // tag name as that of the token,
+        TagName tokenTagName{TagName::make(t.name, Vaev::HTML)};
+
+        if (not _hasElementInTableScope(tokenTagName)) {
+            // this is a parse error; ignore the token.
+            _raise();
+            return;
+        }
+
+        // Otherwise:
+
+        // Generate implied end tags.
+        generateImpliedEndTags(*this);
+
+        // Now, if the current node is not an HTML element with the same tag name as the token,
+        if (last(_openElements)->tagName != tokenTagName) {
+            // then this is a parse error.
+            _raise();
+        }
+
+        // Pop elements from the stack of open elements until an HTML element with the same tag name as
+        // the token has been popped from the stack.
+        while (Karm::any(_openElements) and _openElements.popBack()->tagName != tokenTagName) {
+            // do nothing
+        }
+
+        // TODO: Clear the list of active formatting elements up to the last marker.
+
+        // Switch the insertion mode to "in row".
+        _switchTo(Mode::IN_ROW);
+    }
+
+    // A start tag whose tag name is one of: "caption", "col", "colgroup", "tbody", "td", "tfoot", "th", "thead", "tr"
+    else if (t.type == HtmlToken::START_TAG and
+             (t.name == "caption" or t.name == "col" or t.name == "colgroup" or
+              t.name == "tbody" or t.name == "td" or t.name == "tfoot" or
+              t.name == "th" or t.name == "thead" or t.name == "tr"
+             )) {
+
+        // Assert: The stack of open elements has a td or th element in table scope.
+        if (not _hasElementInTableScope(Html::TD) and not _hasElementInTableScope(Html::TH)) {
+            _raise();
+            // FIXME: should this be a panic()?
+        }
+
+        // Close the cell (see below) and reprocess the token.
+        _closeTheCell();
+        accept(t);
+    }
+
+    // An end tag whose tag name is one of: "body", "caption", "col", "colgroup", "html"
+    else if (t.type == HtmlToken::END_TAG and
+             (t.name == "body" or t.name == "caption" or t.name == "col" or
+              t.name == "colgroup" or t.name == "html"
+             )) {
+        // Parse error. Ignore the token.
+        _raise();
+    }
+
+    // An end tag whose tag name is one of: "table", "tbody", "tfoot", "thead", "tr"
+    else if (t.type == HtmlToken::END_TAG and
+             (t.name == "table" or t.name == "tbody" or t.name == "tfoot" or t.name == "thead" or t.name == "tr")) {
+
+        // If the stack of open elements does not have an element in table scope that is an HTML element with the same
+        // tag name as the token,
+        if (not _hasElementInTableScope(TagName::make(t.name, Vaev::HTML))) {
+            // this is a parse error; ignore the token.
+            _raise();
+            return;
+        }
+
+        // Otherwise, close the cell (see below) and reprocess the token.
+        _closeTheCell();
+        accept(t);
+    }
+
+    else {
+        // Process the token using the rules for the "in body" insertion mode.
+        _acceptIn(Mode::IN_BODY, t);
+    }
+}
+
 // 3.2.6.4.22 MARK: The "after after body" insertion mode
 // https://html.spec.whatwg.org/multipage/parsing.html#the-after-after-body-insertion-mode
 void HtmlParser::_handleAfterBody(HtmlToken const &t) {
@@ -4886,12 +5665,12 @@ void HtmlParser::_acceptIn(Mode mode, HtmlToken const &t) {
         _handleText(t);
         break;
 
-    // TODO: https://html.spec.whatwg.org/multipage/parsing.html#parsing-main-intable
     case Mode::IN_TABLE:
+        _handleInTable(t);
         break;
 
-    // TODO: https://html.spec.whatwg.org/multipage/parsing.html#parsing-main-intabletext
     case Mode::IN_TABLE_TEXT:
+        _handleInTableText(t);
         break;
 
     // TODO: https://html.spec.whatwg.org/multipage/parsing.html#parsing-main-incaption
@@ -4902,16 +5681,16 @@ void HtmlParser::_acceptIn(Mode mode, HtmlToken const &t) {
     case Mode::IN_COLUMN_GROUP:
         break;
 
-    // TODO: https://html.spec.whatwg.org/multipage/parsing.html#parsing-main-intablebody
     case Mode::IN_TABLE_BODY:
+        _handleInTableBody(t);
         break;
 
-    // TODO: https://html.spec.whatwg.org/multipage/parsing.html#parsing-main-inrow
     case Mode::IN_ROW:
+        _handleInTableRow(t);
         break;
 
-    // TODO: https://html.spec.whatwg.org/multipage/parsing.html#parsing-main-incell
     case Mode::IN_CELL:
+        _handleInCell(t);
         break;
 
     // TODO: https://html.spec.whatwg.org/multipage/parsing.html#parsing-main-inselect
diff --git a/src/web/vaev-markup/html.h b/src/web/vaev-markup/html.h
index fec5f803..a0b26560 100644
--- a/src/web/vaev-markup/html.h
+++ b/src/web/vaev-markup/html.h
@@ -228,6 +228,7 @@ struct HtmlParser : public HtmlSink {
 
     bool _scriptingEnabled = false;
     bool _framesetOk = true;
+    bool _fosterParenting = false; // set only while an "in table" token is reprocessed with the "in body" rules
     Mode _insertionMode = Mode::INITIAL;
     Mode _originalInsertionMode = Mode::INITIAL;
 
@@ -238,6 +239,8 @@ struct HtmlParser : public HtmlSink {
     Opt> _headElement;
     Opt> _formElement;
 
+    Vec<HtmlToken> _pendingTableCharacterTokens; // character tokens buffered by the "in table text" mode
+
     HtmlParser(Strong document)
         : _document(document) {
         _lexer.bind(*this);
@@ -321,6 +324,16 @@ struct HtmlParser : public HtmlSink {
 
     void
_handleText(HtmlToken const &t);
 
+    void _handleInTable(HtmlToken const &t); // "in table" insertion mode
+
+    void _handleInTableText(HtmlToken const &t); // "in table text" insertion mode
+
+    void _handleInTableBody(HtmlToken const &t); // "in table body" insertion mode
+
+    void _handleInTableRow(HtmlToken const &t); // "in row" insertion mode
+
+    void _handleInCell(HtmlToken const &t); // "in cell" insertion mode
+
     void _handleAfterBody(HtmlToken const &t);
 
     void _switchTo(Mode mode);
diff --git a/src/web/vaev-markup/tests/test-html-parser.cpp b/src/web/vaev-markup/tests/test-html-parser.cpp
index 55419151..767651a5 100644
--- a/src/web/vaev-markup/tests/test-html-parser.cpp
+++ b/src/web/vaev-markup/tests/test-html-parser.cpp
@@ -324,5 +324,108 @@ test$("parse-input-element") {
     return Ok();
 }
 
+test$("parse-empty-table-element") { // expects html > body > a single table element with no children
+    auto dom = makeStrong(Mime::Url());
+    Markup::HtmlParser parser{dom}; // NOTE(review): the markup literal passed to write() below is not readable in this view
+
+    parser.write("
");
+
+    expect$(dom->nodeType() == NodeType::DOCUMENT);
+    expect$(dom->hasChildren());
+
+    auto html = try$(dom->firstChild().cast());
+    expect$(html->tagName == Html::HTML);
+    expect$(html->children().len() == 2);
+
+    auto body = try$(html->firstChild()->nextSibling().cast()); // body is looked up as the second child of html
+    expect$(body->tagName == Html::BODY);
+    expect$(body->children().len() == 1);
+
+    auto table = try$(body->firstChild().cast());
+    expect$(table->tagName == Html::TABLE);
+    expect$(not table->hasChildren());
+
+    return Ok();
+}
+
+test$("parse-table-element") { // expects the chain table > thead > tr > th > text "hi", one child at each level
+    auto dom = makeStrong(Mime::Url());
+    Markup::HtmlParser parser{dom}; // NOTE(review): the markup literal passed to write() below is not readable in this view
+
+    parser.write("
hi
");
+
+    expect$(dom->nodeType() == NodeType::DOCUMENT);
+    expect$(dom->hasChildren());
+
+    auto html = try$(dom->firstChild().cast());
+    expect$(html->tagName == Html::HTML);
+    expect$(html->children().len() == 2);
+
+    auto body = try$(html->firstChild()->nextSibling().cast()); // body is looked up as the second child of html
+    expect$(body->tagName == Html::BODY);
+    expect$(body->children().len() == 1);
+
+    auto table = try$(body->firstChild().cast());
+    expect$(table->tagName == Html::TABLE);
+    expect$(table->children().len() == 1);
+
+    auto thead = try$(table->firstChild().cast());
+    expect$(thead->tagName == Html::THEAD);
+    expect$(thead->children().len() == 1);
+
+    auto headerRow = try$(thead->firstChild().cast());
+    expect$(headerRow->tagName == Html::TR);
+    expect$(headerRow->children().len() == 1);
+
+    auto headerCell = try$(headerRow->firstChild().cast());
+    expect$(headerCell->tagName == Html::TH);
+    expect$(headerCell->children().len() == 1);
+
+    auto text = headerCell->firstChild();
+    expect$(text->nodeType() == NodeType::TEXT);
+    expect$(try$(text.cast())->data == "hi");
+
+    return Ok();
+}
+
+test$("parse-table-element-create-body-tr-scope") { // expects table > tbody > tr > th > "hi"; presumably tbody/tr are implied by the parser, confirm against the input
+    auto dom = makeStrong(Mime::Url());
+    Markup::HtmlParser parser{dom}; // NOTE(review): the markup literal passed to write() below is not readable in this view
+
+    parser.write("
hi
");
+
+    expect$(dom->nodeType() == NodeType::DOCUMENT);
+    expect$(dom->hasChildren());
+
+    auto html = try$(dom->firstChild().cast());
+    expect$(html->tagName == Html::HTML);
+    expect$(html->children().len() == 2);
+
+    auto body = try$(html->firstChild()->nextSibling().cast()); // body is looked up as the second child of html
+    expect$(body->tagName == Html::BODY);
+    expect$(body->children().len() == 1);
+
+    auto table = try$(body->firstChild().cast());
+    expect$(table->tagName == Html::TABLE);
+    expect$(table->children().len() == 1);
+
+    auto tbody = try$(table->firstChild().cast());
+    expect$(tbody->tagName == Html::TBODY);
+    expect$(tbody->children().len() == 1);
+
+    auto row = try$(tbody->firstChild().cast());
+    expect$(row->tagName == Html::TR);
+    expect$(row->children().len() == 1);
+
+    auto header = try$(row->firstChild().cast());
+    expect$(header->tagName == Html::TH);
+    expect$(header->children().len() == 1);
+
+    auto text = header->firstChild();
+    expect$(text->nodeType() == NodeType::TEXT);
+    expect$(try$(text.cast())->data == "hi");
+
+    return Ok();
+}
 
 } // namespace Vaev::Markup::Tests