From aa630bd2b316478b20595ddec830f0f360d077a0 Mon Sep 17 00:00:00 2001 From: Blake-Madden Date: Mon, 4 Dec 2023 16:39:59 -0500 Subject: [PATCH] Improvements to MD parser Better handling of overlapping styles, simplify header parsing --- src/import/markdown_extract_text.cpp | 238 ++++++++++++++++++++++++--- src/import/markdown_extract_text.h | 60 +++---- tests/mdparsetests.cpp | 66 ++++---- 3 files changed, 264 insertions(+), 100 deletions(-) diff --git a/src/import/markdown_extract_text.cpp b/src/import/markdown_extract_text.cpp index 573419fc..6530f168 100644 --- a/src/import/markdown_extract_text.cpp +++ b/src/import/markdown_extract_text.cpp @@ -39,6 +39,72 @@ const wchar_t* lily_of_the_valley::markdown_extract_text::operator()(const std:: bool headerMode{ false }; wchar_t previousChar{ L'\n' }; + const auto parseStyledText = [&](const wchar_t tag) + { + // not styling text, just an orphan character that should be processed as-is + if (*start == tag && + start + 1 < endSentinel) + { + if (start[1] != tag && + !std::iswalnum(start[1]) && + start[1] != L'`') + { + add_character(*start); + previousChar = *start; + ++start; + return true; + } + } + while (*start == tag && + start < endSentinel) + { ++start; } + auto endOfTag = string_util::find_unescaped_char(start, tag); + if (endOfTag == nullptr || (endOfTag >= endSentinel)) + { + log_message(L"Missing matching styling tag in markdown file."); + return false; + } + while (*endOfTag == tag && + (endOfTag < endSentinel)) + { ++endOfTag; } + while (endOfTag < endSentinel && + (std::iswalnum(*endOfTag) || *endOfTag == L'`')) + { + endOfTag = string_util::find_unescaped_char(++endOfTag, tag); + if (endOfTag == nullptr || + endOfTag + 1 >= endSentinel) + { + log_message(L"Missing matching styling tag in markdown file."); + return false; + } + ++endOfTag; + } + // in case we stepped ahead, step back to the last closing tag + if (*endOfTag != tag) + { + --endOfTag; + } + // ...and then move back to the first tag of the consecutive closing tags + while (endOfTag > start && *endOfTag == tag) + { + --endOfTag; + } + if (*endOfTag != tag) + { + ++endOfTag; + } + + [[maybe_unused]] auto retval = m_subParser->operator()( + { start, static_cast(std::distance(start, endOfTag)) }); + add_characters( + { m_subParser->get_filtered_text(), m_subParser->get_filtered_text_length() }); + start = endOfTag; + while (*start == tag && + (start < endSentinel)) + { ++start; } + return true; + }; + while (start != nullptr && *start != 0 && (start < endSentinel)) { if (*start == L'\\') @@ -70,7 +136,7 @@ const wchar_t* lily_of_the_valley::markdown_extract_text::operator()(const std:: start += END_FIGURE.length(); continue; } - else if (std::wcsncmp(start, L"\\@ref(", 6) == 0) + else if (std::wcsncmp(start, L"\\@ref(", 6) == 0) { start += 6; auto endOfTag = string_util::find_unescaped_matching_close_tag(start, L'(', L')'); @@ -88,6 +154,13 @@ const wchar_t* lily_of_the_valley::markdown_extract_text::operator()(const std:: add_characters(L"\n\n"); continue; } + else if (start < endSentinel && + (start[1] == L'\n' || start[1] == L'\r')) + { + headerMode = true; + ++start; + continue; + } // actually is an escape character isEscaping = true; previousChar = *start; @@ -183,6 +256,19 @@ const wchar_t* lily_of_the_valley::markdown_extract_text::operator()(const std:: log_message(L"Bad fenced code block in markdown file."); break; } + bool isMultiline{ false }; + auto scanAhead{ start }; + while (scanAhead < endOfTag) + { + if (*scanAhead == L'\r' || + *scanAhead == L'\n') + { + isMultiline = true; + break; + } + ++scanAhead; + } + bool pastFirstLine{ false }; // tab over each line inside of the code block while (start < endOfTag) { @@ -197,9 +283,13 @@ const wchar_t* lily_of_the_valley::markdown_extract_text::operator()(const std:: ++start; } add_character(L'\t'); + pastFirstLine = true; continue; } - add_character(*start); + if (!isMultiline || pastFirstLine) + { + add_character(*start); + } ++start; } start = endOfTag + 3; @@ -317,29 +407,65 @@ const wchar_t* lily_of_the_valley::markdown_extract_text::operator()(const std:: } continue; } - else if (std::wcsncmp(start, L"`r ", 3) == 0) - { start += 3; } - else if (std::wcsncmp(start, L"`python ", 8) == 0) - { start += 8; } // read content as-is otherwise else { - ++start; - while (start < endSentinel && *start != '`') + if (std::wcsncmp(start, L"`r ", 3) == 0) + { start += 3; } + else if (std::wcsncmp(start, L"`python ", 8) == 0) + { start += 8; } + else { - previousChar = *start; - add_character(*start); ++start; } + // `` section, which can have embedded backticks if (*start == '`') { - ++start; + auto endOfTag = std::wcsstr(++start, L"``"); + if (endOfTag == nullptr || + endOfTag > endSentinel) + { + log_message(L"Bad inline `` code block in markdown file."); + break; + } + // read in content verbatim + while (start < endOfTag) + { + previousChar = *start; + add_character(*start); + ++start; + } + start += 2; + } + // just a single backtick block, should be on one line + else + { + while (start < endSentinel && *start != '`') + { + // inline blocks should be on one line, so bail if we hit a new line + // as this would probably be a missing closing backtick + if (*start == L'\n' || *start == L'\r') + { + log_message(L"Unterminated inline ` code block in markdown file."); + previousChar = *start; + add_character(*start); + ++start; + break; + } + previousChar = *start; + add_character(*start); + ++start; + } + if (*start == '`') + { + ++start; + } } continue; } } } - // images + // images (we don't read in the alt text inside of the [], just skip everything) else if (*start == L'!') { if (!isEscaping && @@ -347,20 +473,24 @@ const wchar_t* lily_of_the_valley::markdown_extract_text::operator()(const std:: start[1] == L'[') { start += 2; - auto endOfTag = string_util::find_unescaped_matching_close_tag(start, L'[', L']'); + auto endOfTag = string_util::find_unescaped_matching_close_tag_same_line(start, L'[', L']'); if (endOfTag == nullptr) { log_message(L"Bad image command in markdown file."); - break; + previousChar = L'['; + add_character(L'['); + continue; } start = ++endOfTag; if (*start == L'(') { - endOfTag = string_util::find_unescaped_matching_close_tag(++start, L'(', L')'); + endOfTag = string_util::find_unescaped_matching_close_tag_same_line(++start, L'(', L')'); if (endOfTag == nullptr) { log_message(L"Bad image command in markdown file."); - break; + previousChar = L'('; + add_character(L'('); + continue; } start = ++endOfTag; } @@ -373,21 +503,28 @@ const wchar_t* lily_of_the_valley::markdown_extract_text::operator()(const std:: if (!isEscaping) { auto labelStart{ ++start }; - auto endOfTag = string_util::find_unescaped_matching_close_tag(start, L'[', L']'); + auto endOfTag = string_util::find_unescaped_matching_close_tag_same_line(start, L'[', L']'); if (endOfTag == nullptr) { - log_message(L"Bad link command in markdown file."); - break; + log_message(L"Bad link command in markdown file. Missing closing ']'"); + // just treat it like a stray '[' and keep going + previousChar = L'['; + add_character(L'['); + continue; } start = ++endOfTag; if (*start == L'(') { auto labelEnd{ start - 1}; - endOfTag = string_util::find_unescaped_matching_close_tag(++start, L'(', L')'); + endOfTag = string_util::find_unescaped_matching_close_tag_same_line(++start, L'(', L')'); if (endOfTag == nullptr) { - log_message(L"Bad link command in markdown file."); - break; + log_message(L"Bad link command in markdown file. Missing closing ')'"); + // read in the label and '(' section after it as-is if closing ')' is missing + start = labelStart; + previousChar = L'['; + add_character(L'['); + continue; } start = ++endOfTag; if (labelStart < labelEnd) @@ -469,6 +606,22 @@ const wchar_t* lily_of_the_valley::markdown_extract_text::operator()(const std:: add_characters(L"\n\n"); continue; } + else if (!isEscaping && + std::wcsncmp(start, L"< br/>", 6) == 0) + { + start += 6; + previousChar = L'\n'; + add_characters(L"\n\n"); + continue; + } + else if (!isEscaping && + std::wcsncmp(start, L"
", 5) == 0) + { + start += 5; + previousChar = L'\n'; + add_characters(L"\n\n"); + continue; + } else if (*start == L'<') { if (!isEscaping && @@ -595,12 +748,43 @@ const wchar_t* lily_of_the_valley::markdown_extract_text::operator()(const std:: } // styling tags that just get removed from raw text else if (!isEscaping && - (*start == L'*' || *start == L'_' || *start == L'~') ) + std::iswspace(previousChar) && + *start == L'*') { - while ((*start == L'*' || *start == L'_' || *start == L'~') && - (start < endSentinel)) - { ++start; } - continue; + if (parseStyledText(L'*')) + { + continue; + } + else + { + break; + } + } + else if (!isEscaping && + std::iswspace(previousChar) && + *start == L'_') + { + if (parseStyledText(L'_')) + { + continue; + } + else + { + break; + } + } + else if (!isEscaping && + std::iswspace(previousChar) && + *start == L'~') + { + if (parseStyledText(L'~')) + { + continue; + } + else + { + break; + } } // table else if (!isEscaping && diff --git a/src/import/markdown_extract_text.h b/src/import/markdown_extract_text.h index df89df2d..1fc62868 100644 --- a/src/import/markdown_extract_text.h +++ b/src/import/markdown_extract_text.h @@ -31,19 +31,15 @@ namespace lily_of_the_valley #endif /** @returns @c true if text marks the start of a Markdown metadata section. @param md_text The start of the Markdown text. - @note Multimarkdown, YAML, and Pandoc sections are supported. + @note YAML sections are supported. @warning @c md_text must be the start of the Markdown document.*/ [[nodiscard]] bool has_metadata_section(const wchar_t* md_text) const { - if (!md_text) + if (md_text == nullptr) { return false; } - // will just be reviewing the first line of text - auto eol = string_util::strcspn_pointer(md_text, L"\n\r", 2); - if (!eol) - { return false; } - - return std::regex_match(md_text, eol, m_metadataSectionStart); + // does the first line start with --- + return (std::wcsncmp(md_text, L"---", 3) == 0); } /** @brief Metadata sections end on the first blank like, so moves to that. @@ -53,39 +49,33 @@ namespace lily_of_the_valley [[nodiscard]] static const wchar_t* find_metadata_section_end(const wchar_t* md_text) noexcept { - if (!md_text) + if (md_text == nullptr) { return nullptr; } - while (md_text[0] != 0 && md_text[1] != 0) + // step over first line + auto eol = string_util::strcspn_pointer(md_text, L"\r\n", 2); + if (eol == nullptr) { - // if a Windows \r\n combination, then if followed by either - // \r or \n means we have a blank line - if (md_text[0] == L'\r' && md_text[1] == L'\n') - { - if (md_text[2] == 0) - { return md_text+2; } - else if (string_util::is_either(md_text[2], L'\n', L'\r')) - { return md_text+3; } - } - // otherwise, if 2 consecutive breaks then we have a blank line - else if (string_util::is_either(md_text[0], L'\n', L'\r') && - string_util::is_either(md_text[1], L'\n', L'\r')) - { return md_text+2; } - ++md_text; + return md_text; + } + // ...and find the terminating --- line + auto endOfYaml = std::wcsstr(eol, L"\n---"); + if (endOfYaml == nullptr) + { + return md_text; + } + endOfYaml = string_util::strcspn_pointer(endOfYaml + 4, L"\r\n", 2); + if (endOfYaml == nullptr) + { + return md_text; + } + while (*endOfYaml == L'\r' || *endOfYaml == L'\n') + { + ++endOfYaml; } - // if stopped on the last character because the whole file is just metadata, - // then step to the end. - if (md_text[0] != 0 && md_text[1] == 0) - { return ++md_text; } - return md_text; + return endOfYaml; } - /** Metadata section types start with these: - - Pandoc: % and a space. - - YAML header: ---*/ - std::wregex m_metadataSectionStart - { LR"(^(%[[:space:]]+|---).*$)" }; - std::unique_ptr m_subParser{ nullptr }; }; } diff --git a/tests/mdparsetests.cpp b/tests/mdparsetests.cpp index 1e6ab2fd..570d82f7 100644 --- a/tests/mdparsetests.cpp +++ b/tests/mdparsetests.cpp @@ -18,52 +18,25 @@ TEST_CASE("Markdown Parser", "[md import]") SECTION("Meta Sections") { lily_of_the_valley::markdown_extract_text md; - // pandoc - auto mdText = L"% title: my book\n\nHere is the *actual* \\*text to **review**."; - CHECK(md.has_metadata_section(mdText)); // YAML - mdText = L"---\n title:my book\n\nHere is the *actual* \\*text to **review**."; + auto mdText = L"---\n title:my book\n\nHere is the *actual* \\*text to **review**."; CHECK(md.has_metadata_section(mdText)); } SECTION("Meta Section End") { - // test \n - auto mdText = L"keyvalue:12\nKey2: re\n\nHere is the *actual* \\*text to **review**."; lily_of_the_valley::markdown_extract_text md; - auto end = md.find_metadata_section_end(mdText); - CHECK(end == mdText+22); - - // test \r - mdText = L"keyvalue:12\rKey2: re\r\rHere is the *actual* \\*text to **review**."; - end = md.find_metadata_section_end(mdText); - CHECK(end == mdText+22); - - // test \r\n combinations - mdText = L"keyvalue:12\r\nKey2: re\r\n\r\nHere is the *actual* \\*text to **review**."; - end = md.find_metadata_section_end(mdText); - CHECK(end == mdText+24); - - mdText = L"keyvalue:12\r\nKey2: re\r\n\nHere is the *actual* \\*text to **review**."; - end = md.find_metadata_section_end(mdText); - CHECK(end == mdText+24); - - // test being at the end of the text - mdText = L"title\r\n"; - end = md.find_metadata_section_end(mdText); - CHECK(end == mdText+7); - - // test being at the end of the text, because there was no blank lines - mdText = L"title"; - end = md.find_metadata_section_end(mdText); - CHECK(end == mdText+5); + CHECK(std::wstring{ md({ L"---\ntitle:my book\n---\nHere is the *actual* text to **review**." }) } == + std::wstring{ L"Here is the actual text to review." }); } SECTION("Newlines") { lily_of_the_valley::markdown_extract_text md; CHECK(std::wstring{ md({ L"# Header\nThis is\na line.\r\nThis is the same line. \nThis is a new line.\r\n\r\nAnother line. \nSame line." }) } == - std::wstring{ L"Header\n\nThis is a line. This is the same line. \n\nThis is a new line.\n\nAnother line. Same line." }); + std::wstring{ L"Header\n\nThis is a line. This is the same line. \n\nThis is a new line.\n\nAnother line. Same line." }); + CHECK(std::wstring{ md({ L"*`where`*\\\nNext line" }) } == + std::wstring{ L"where\n\nNext line" }); } SECTION("Header") @@ -82,8 +55,15 @@ TEST_CASE("Markdown Parser", "[md import]") lily_of_the_valley::markdown_extract_text md; CHECK(std::wstring{ md({ L"This is *italic* and **bold** and also __italic__. 2 \\* 2." }) } == std::wstring{ L"This is italic and bold and also italic. 2 * 2." }); + CHECK(std::wstring{ md({ L"This is _italic and **bold** text_." }) } == + std::wstring{ L"This is italic and bold text." }); + // can't handle "This is *italic and **bold** text*", will have to be a known limitation CHECK(std::wstring{ md({ L"**PGF\\_HOT**" }) } == std::wstring{ L"PGF_HOT" }); + CHECK(std::wstring{ md({ L"TIFF _spe_ci**f**i_c_ *options*" }) } == + std::wstring{ L"TIFF spe_ci**f**i_c options" }); + CHECK(std::wstring{ md({ L"2 * 2" }) } == + std::wstring{ L"2 * 2" }); } SECTION("Blockquoes") @@ -111,14 +91,22 @@ TEST_CASE("Markdown Parser", "[md import]") std::wstring{ L"This is code." }); CHECK(std::wstring{ md({ L"Code `r 2+2`." }) } == std::wstring{ L"Code 2+2." }); + CHECK(std::wstring{ md({ L"``2`2`` `shared_ptr`" }) } == + std::wstring{ L"2`2 shared_ptr" }); + CHECK(std::wstring{ md({ L"### `std::basic_istream::read` processing of `\\r\\n`` =>`\\n`\n `shared_ptr`" }) } == + std::wstring{ L"std::basic_istream::read processing of \\r\\n =>n\n shared_ptr" }); } SECTION("Code block") { lily_of_the_valley::markdown_extract_text md; - auto blah = std::wstring{ md({ L"This\n```\nis code\r\nhere```\n." }) }; + // inline (you aren't supposed to do this with ```, but people do) CHECK(std::wstring{ md({ L"This\n```\nis code\r\nhere```\n." }) } == std::wstring{ L"This \n\tis code\r\n\there\n\n." }); + // remove lang info + auto blah = std::wstring{ md({ L"This\n```cpp\nis code\r\nhere\n```\n." }) }; + CHECK(std::wstring{ md({ L"This\n```cpp\nis code\r\nhere\n```\n." }) } == + std::wstring{ L"This \n\tis code\r\n\there\n\t\n\n." }); } SECTION("Images") @@ -128,9 +116,9 @@ TEST_CASE("Markdown Parser", "[md import]") std::wstring{ L"Tux the penguin." }); // malformed CHECK(std::wstring{ md({ L"Tux ![Tux, the Linux mascot" }) } == - std::wstring{ L"Tux " }); + std::wstring{ L"Tux [Tux, the Linux mascot" }); CHECK(std::wstring{ md({ L"Tux ![Tux, the Linux mascot](/assets/tux.png" }) } == - std::wstring{ L"Tux " }); + std::wstring{ L"Tux (/assets/tux.png" }); } SECTION("Links") @@ -144,9 +132,11 @@ TEST_CASE("Markdown Parser", "[md import]") std::wstring{ L"Tux the Linux mascot the penguin." }); // malformed CHECK(std::wstring{ md({ L"Tux [the Linux mascot" }) } == - std::wstring{ L"Tux " }); + std::wstring{ L"Tux [the Linux mascot" }); CHECK(std::wstring{ md({ L"Tux [the Linux mascot](/assets/tux.png" }) } == - std::wstring{ L"Tux " }); + std::wstring{ L"Tux [the Linux mascot](/assets/tux.png" }); + CHECK(std::wstring{ md({ L"The third member function inserts the sequence [`first`, `last`). You use it" }) } == + std::wstring{ L"The third member function inserts the sequence [first, last). You use it" }); } SECTION("Unordered Lists")