Skip to content

Commit

Permalink
Improve how overlapping quotes are parsed in HTML elements
Browse files Browse the repository at this point in the history
  • Loading branch information
Blake-Madden committed Dec 4, 2023
1 parent cd5219f commit 57b6b80
Show file tree
Hide file tree
Showing 2 changed files with 60 additions and 14 deletions.
24 changes: 10 additions & 14 deletions src/import/html_extract_text.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1334,30 +1334,26 @@ namespace lily_of_the_valley
else if (text[0] == L'<')
{ ++text; }

bool is_inside_of_quotes = false;
bool is_inside_of_single_quotes = false;
bool is_inside_of_double_quotes{ false };
bool is_inside_of_single_quotes{ false };
long openTagCount{ 0 };
while (text)
{
if (text[0] == 0)
{ return nullptr; }
else if (text[0] == 0x22) // double quote
// flip the state of double or single quote if not inside of
// the other type of quotes
else if (!is_inside_of_single_quotes && text[0] == L'\"')
{
is_inside_of_quotes = !is_inside_of_quotes;
// whether this double quote ends a quote pair or starts a new one, turn this flag
// off. This means that a double quote can close a single quote.
is_inside_of_single_quotes = false;
is_inside_of_double_quotes = !is_inside_of_double_quotes;
}
// if a single quote already started a quote pair (and this is closing it) or
// we are not inside of a double quote then count single quotes
else if ((!is_inside_of_quotes || is_inside_of_single_quotes) && text[0] == 0x27) //single quote
else if (!is_inside_of_double_quotes && text[0] == L'\'')
{
is_inside_of_quotes = !is_inside_of_quotes;
is_inside_of_single_quotes = true;
is_inside_of_single_quotes = !is_inside_of_single_quotes;
}
else if (!is_inside_of_quotes && text[0] == L'<')
else if (!is_inside_of_double_quotes && !is_inside_of_single_quotes && text[0] == L'<')
{ ++openTagCount; }
else if (!is_inside_of_quotes && text[0] == L'>')
else if (!is_inside_of_double_quotes && !is_inside_of_single_quotes && text[0] == L'>')
{
if (openTagCount == 0)
{ return text; }
Expand Down
50 changes: 50 additions & 0 deletions tests/htmlimporttests.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1008,6 +1008,56 @@ TEST_CASE("HTML Parser", "[html import]")
CHECK(std::wcscmp(p, L"List. (pane)") == 0);
delete[] text;
}
// An <input> element whose attributes are single quoted, with JavaScript
// handler values that contain double-quoted strings. The text extracted
// around the element is expected to be "Hello \n\nthere".
SECTION("Embedded JS Quotes")
    {
    html_extract_text filter_html;
    const wchar_t* const markup = LR"(<html>Hello <input type='submit' id='gform_submit_button_12' class='gform_button button' value='Submit' onclick='if(window["gf_submitting_12"]){return false;} window["gf_submitting_12"]=true; ' onkeypress='if( event.keyCode == 13 ){ if(window["gf_submitting_12"]){return false;} window["gf_submitting_12"]=true; jQuery("#gform_12").trigger("submit",[true]); }' />there</html>)";
    const wchar_t* const extracted =
        filter_html(markup, std::wcslen(markup), true, false);
    CHECK(std::wstring(L"Hello \n\nthere") == extracted);
    }
// Counterpart of the single-quoted case: the <input> attributes are double
// quoted and the JavaScript handler values contain single-quoted strings
// (the onkeypress value additionally contains double quotes of its own).
// The text extracted around the element is expected to be "Hello \n\nthere".
SECTION("Embedded JS Quotes 2")
    {
    html_extract_text filter_html;
    const wchar_t* const markup = LR"(<html>Hello <input type="submit" id="gform_submit_button_12" class="gform_button button" value="Submit" onclick="if(window['gf_submitting_12']){return false;} window['gf_submitting_12']=true; " onkeypress="if( event.keyCode == 13 ){ if(window["gf_submitting_12"]){return false;} window["gf_submitting_12"]=true; jQuery('#gform_12').trigger('submit', [true]); }" />there</html>)";
    const wchar_t* const extracted =
        filter_html(markup, std::wcslen(markup), true, false);
    CHECK(std::wstring(L"Hello \n\nthere") == extracted);
    }
// Runs a series of anchor elements with differently quoted attribute values
// through one shared filter, in order, and compares each extracted text
// against its expected value.
SECTION("Elements with Quotes")
    {
    html_extract_text filter_html;

    // One input/expected-output pair per check.
    struct QuoteCase
        {
        const wchar_t* markup;
        const wchar_t* expected;
        };

    const QuoteCase quoteCases[] = {
        // plain double-quoted and single-quoted values
        { LR"(Hello <a hef="submit">there)", L"Hello there" },
        { LR"(Hello <a hef='submit'>there)", L"Hello there" },
        // one kind of quote nested inside the other
        { LR"(Hello <a hef='su"b"mit'>there)", L"Hello there" },
        { LR"(Hello <a hef="sub'm'it">there)", L"Hello there" },
        // several attributes mixing both quote kinds
        { LR"(Hello <a hef='submit' name="name" value="5">there)", L"Hello there" },
        // angle brackets inside quoted values
        { LR"(Hello <a hef='su<>bmit' name="na<>me" value="<5">there)", L"Hello there" },
        // Mismatch, will be trash. Just read what we can.
        { LR"(Hello <a hef='submit">there)", L"Hello " },
        { LR"(Hello <a hef="submit'>there)", L"Hello " }
    };

    for (const auto& quoteCase : quoteCases)
        {
        const wchar_t* const extracted =
            filter_html(quoteCase.markup, std::wcslen(quoteCase.markup), true, false);
        CHECK(std::wstring(quoteCase.expected) == extracted);
        }
    }
SECTION("Compare Entities")
{
const wchar_t* text = L"<span>List.</span> \r\n (pane)";
Expand Down

0 comments on commit 57b6b80

Please sign in to comment.