Skip to content

Commit

Permalink
vaev-markup: create own buffer for peeking in html parser
Browse files Browse the repository at this point in the history
Avoid mixup between compliant temporary buffer and our own workaround buffer
for "peeking"
  • Loading branch information
pauloamed committed Jan 6, 2025
1 parent 457eef8 commit d74fe13
Show file tree
Hide file tree
Showing 3 changed files with 39 additions and 8 deletions.
17 changes: 9 additions & 8 deletions src/web/vaev-markup/html.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1778,27 +1778,28 @@ void HtmlLexer::consume(Rune rune, bool isEof) {
// 13.2.5.42 MARK: Markup declaration open state
// If the next few characters are:

_temp.append(rune);
peekerForSingleState.append(rune);

// Two U+002D HYPHEN-MINUS characters (-)
// Consume those two characters, create a comment token whose data
// is the empty string, and switch to the comment start state.
if (auto r = startWith("--"s, _temp.str()); r != Match::NO) {
if (auto r = startWith("--"s, peekerForSingleState.str()); r != Match::NO) {
if (r == Match::PARTIAL)
break;

_temp.clear();
peekerForSingleState.clear();
_begin(HtmlToken::COMMENT);
_switchTo(State::COMMENT_START);
}

// ASCII case-insensitive match for the word "DOCTYPE"
// Consume those characters and switch to the DOCTYPE state.

else if (auto r = startWith("DOCTYPE"s, _temp.str()); r != Match::NO) {
else if (auto r = startWith("DOCTYPE"s, peekerForSingleState.str()); r != Match::NO) {
if (r == Match::PARTIAL)
break;

_temp.clear();
peekerForSingleState.clear();
_switchTo(State::DOCTYPE);
}

Expand All @@ -1810,12 +1811,12 @@ void HtmlLexer::consume(Rune rune, bool isEof) {
// error. Create a comment token whose data is the "[CDATA[" string.
// Switch to the bogus comment state.

else if (auto r = startWith("[CDATA["s, _temp.str()); r != Match::NO) {
else if (auto r = startWith("[CDATA["s, peekerForSingleState.str()); r != Match::NO) {
if (r == Match::PARTIAL)
break;

// NOSPEC: This is in reality more complicated
_temp.clear();
peekerForSingleState.clear();
_switchTo(State::CDATA_SECTION);
}

Expand All @@ -1824,7 +1825,7 @@ void HtmlLexer::consume(Rune rune, bool isEof) {
// comment token whose data is the empty string. Switch to the bogus
// comment state (don't consume anything in the current state).
else {
_temp.clear();
peekerForSingleState.clear();
_raise("incorrectly-opened-comment");
_begin(HtmlToken::COMMENT);
_reconsumeIn(State::BOGUS_COMMENT, rune);
Expand Down
1 change: 1 addition & 0 deletions src/web/vaev-markup/html.h
Original file line number Diff line number Diff line change
Expand Up @@ -100,6 +100,7 @@ struct HtmlLexer {
Rune _currChar = 0;
StringBuilder _builder, _commentBuilder;
StringBuilder _temp;
StringBuilder peekerForSingleState;

HtmlToken &_begin(HtmlToken::Type type) {
_token = HtmlToken{};
Expand Down
29 changes: 29 additions & 0 deletions src/web/vaev-markup/tests/test-xhtml-parser.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -156,4 +156,33 @@ test$("parse-doctype") {
return Ok();
}

// Parses a lone <title> element and checks that the resulting document
// reports the element's text as its title.
test$("parse-title") {
    auto input = Io::SScan("<title>the title</title>");
    auto document = makeStrong<Markup::Document>(Mime::Url());

    // Feed the markup through the parser, filling `document`.
    XmlParser parser{};
    try$(parser.parse(input, Vaev::HTML, *document));

    expect$(document->title() == "the title");
    return Ok();
}

// Parses a <title> element followed by a comment whose body contains
// tag-like text ("<meta>") and checks that the comment node keeps that
// text verbatim as its data.
test$("parse-comment-with-gt-symb") {
    auto input = Io::SScan(
        "<title>im a title!</title>"
        "<!-- a b <meta> c d -->"
    );
    auto document = makeStrong<Markup::Document>(Mime::Url());

    XmlParser parser{};
    try$(parser.parse(input, Vaev::HTML, *document));

    // The first child is expected to be the <title> element.
    expect$(document->hasChildren());
    auto titleNode = document->firstChild();
    expect$(titleNode->nodeType() == NodeType::ELEMENT);

    // Its next sibling is expected to be the comment, with its data untouched.
    auto commentNode = titleNode->nextSibling();
    expect$(commentNode->nodeType() == NodeType::COMMENT);
    expect$(try$(commentNode.cast<Comment>())->data == " a b <meta> c d ");

    return Ok();
}

} // namespace Vaev::Markup::Tests

0 comments on commit d74fe13

Please sign in to comment.