From 57b6b80e2783fc485db3d9496bfb82bb82415004 Mon Sep 17 00:00:00 2001 From: Blake-Madden Date: Mon, 4 Dec 2023 16:38:51 -0500 Subject: [PATCH] Improve how overlapping quotes are parsed in HTML elements --- src/import/html_extract_text.cpp | 24 +++++++-------- tests/htmlimporttests.cpp | 50 ++++++++++++++++++++++++++++++++ 2 files changed, 60 insertions(+), 14 deletions(-) diff --git a/src/import/html_extract_text.cpp b/src/import/html_extract_text.cpp index 4f50153d..bf110116 100644 --- a/src/import/html_extract_text.cpp +++ b/src/import/html_extract_text.cpp @@ -1334,30 +1334,26 @@ namespace lily_of_the_valley else if (text[0] == L'<') { ++text; } - bool is_inside_of_quotes = false; - bool is_inside_of_single_quotes = false; + bool is_inside_of_double_quotes{ false }; + bool is_inside_of_single_quotes{ false }; long openTagCount{ 0 }; while (text) { if (text[0] == 0) { return nullptr; } - else if (text[0] == 0x22) // double quote + // flip the state of double or single quote if not inside of + // the other type of quotes + else if (!is_inside_of_single_quotes && text[0] == L'\"') { - is_inside_of_quotes = !is_inside_of_quotes; - // whether this double quote ends a quote pair or starts a new one, turn this flag - // off. This means that a double quote can close a single quote. - is_inside_of_single_quotes = false; + is_inside_of_double_quotes = !is_inside_of_double_quotes; } - // if a single quote already started a quote pair (and this is closing it) or - // we are not inside of a double quote then count single quotes - else if ((!is_inside_of_quotes || is_inside_of_single_quotes) && text[0] == 0x27) //single quote + else if (!is_inside_of_double_quotes && text[0] == L'\'') { - is_inside_of_quotes = !is_inside_of_quotes; - is_inside_of_single_quotes = true; + is_inside_of_single_quotes = !is_inside_of_single_quotes; } - else if (!is_inside_of_quotes && text[0] == L'<') + else if (!is_inside_of_double_quotes && !is_inside_of_single_quotes && text[0] == L'<') { ++openTagCount; } - else if (!is_inside_of_quotes && text[0] == L'>') + else if (!is_inside_of_double_quotes && !is_inside_of_single_quotes && text[0] == L'>') { if (openTagCount == 0) { return text; } diff --git a/tests/htmlimporttests.cpp b/tests/htmlimporttests.cpp index 3d4e2a26..097f8fd6 100644 --- a/tests/htmlimporttests.cpp +++ b/tests/htmlimporttests.cpp @@ -1008,6 +1008,56 @@ TEST_CASE("HTML Parser", "[html import]") CHECK(std::wcscmp(p, L"List. (pane)") == 0); delete[] text; } + SECTION("Embedded JS Quotes") + { + html_extract_text filter_html; + const wchar_t* text = LR"(Hello there)"; + const wchar_t* p = filter_html(text, std::wcslen(text), true, false); + CHECK(std::wstring(L"Hello \n\nthere") == p); + } + SECTION("Embedded JS Quotes 2") + { + html_extract_text filter_html; + const wchar_t* text = LR"(Hello there)"; + const wchar_t* p = filter_html(text, std::wcslen(text), true, false); + CHECK(std::wstring(L"Hello \n\nthere") == p); + } + SECTION("Elements with Quotes") + { + html_extract_text filter_html; + const wchar_t* text = LR"(Hello there)"; + const wchar_t* p = filter_html(text, std::wcslen(text), true, false); + CHECK(std::wstring(L"Hello there") == p); + + text = LR"(Hello there)"; + p = filter_html(text, std::wcslen(text), true, false); + CHECK(std::wstring(L"Hello there") == p); + + text = LR"(Hello there)"; + p = filter_html(text, std::wcslen(text), true, false); + CHECK(std::wstring(L"Hello there") == p); + + text = LR"(Hello there)"; + p = filter_html(text, std::wcslen(text), true, false); + CHECK(std::wstring(L"Hello there") == p); + + text = LR"(Hello there)"; + p = filter_html(text, std::wcslen(text), true, false); + CHECK(std::wstring(L"Hello there") == p); + + text = LR"(Hello there)"; + p = filter_html(text, std::wcslen(text), true, false); + CHECK(std::wstring(L"Hello there") == p); + + // Mismatch, will be trash. Just read what we can. + text = LR"(Hello there)"; + p = filter_html(text, std::wcslen(text), true, false); + CHECK(std::wstring(L"Hello ") == p); + } SECTION("Compare Entities") { const wchar_t* text = L"List. \r\n (pane)";