From aa630bd2b316478b20595ddec830f0f360d077a0 Mon Sep 17 00:00:00 2001
From: Blake-Madden <madindayton@outlook.com>
Date: Mon, 4 Dec 2023 16:39:59 -0500
Subject: [PATCH] Improvements to MD parser Better handling of overlapping
 styles, simplify header parsing

---
 src/import/markdown_extract_text.cpp | 238 ++++++++++++++++++++++++---
 src/import/markdown_extract_text.h   |  60 +++----
 tests/mdparsetests.cpp               |  66 ++++----
 3 files changed, 264 insertions(+), 100 deletions(-)

diff --git a/src/import/markdown_extract_text.cpp b/src/import/markdown_extract_text.cpp
index 573419fc..6530f168 100644
--- a/src/import/markdown_extract_text.cpp
+++ b/src/import/markdown_extract_text.cpp
@@ -39,6 +39,72 @@ const wchar_t* lily_of_the_valley::markdown_extract_text::operator()(const std::
     bool headerMode{ false };
     wchar_t previousChar{ L'\n' };
 
+    const auto parseStyledText = [&](const wchar_t tag)
+        {
+        // not styling text, just an orphan character that should be processed as-is
+        if (*start == tag &&
+            start + 1 < endSentinel)
+            {
+            if (start[1] != tag &&
+                !std::iswalnum(start[1]) &&
+                start[1] != L'`')
+                {
+                add_character(*start);
+                previousChar = *start;
+                ++start;
+                return true;
+                }
+            }
+        while (*start == tag &&
+            start < endSentinel)
+            { ++start; }
+        auto endOfTag = string_util::find_unescaped_char(start, tag);
+        if (endOfTag == nullptr || (endOfTag >= endSentinel))
+            {
+            log_message(L"Missing matching styling tag in markdown file.");
+            return false;
+            }
+        while (*endOfTag == tag &&
+            (endOfTag < endSentinel))
+            { ++endOfTag; }
+        while (endOfTag < endSentinel &&
+            (std::iswalnum(*endOfTag) || *endOfTag == L'`'))
+            {
+            endOfTag = string_util::find_unescaped_char(++endOfTag, tag);
+            if (endOfTag == nullptr ||
+                endOfTag + 1 >= endSentinel)
+                {
+                log_message(L"Missing matching styling tag in markdown file.");
+                return false;
+                }
+            ++endOfTag;
+            }
+        // in case we stepped ahead, step back to the last closing tag
+        if (*endOfTag != tag)
+            {
+            --endOfTag;
+            }
+        // ...and then move back to the first tag of the consecutive closing tags
+        while (endOfTag > start && *endOfTag == tag)
+            {
+            --endOfTag;
+            }
+        if (*endOfTag != tag)
+            {
+            ++endOfTag;
+            }
+
+        [[maybe_unused]] auto retval = m_subParser->operator()(
+            { start, static_cast<size_t>(std::distance(start, endOfTag)) });
+        add_characters(
+            { m_subParser->get_filtered_text(), m_subParser->get_filtered_text_length() });
+        start = endOfTag;
+        while (*start == tag &&
+            (start < endSentinel))
+            { ++start; }
+        return true;
+        };
+
     while (start != nullptr && *start != 0 && (start < endSentinel))
         {
         if (*start == L'\\')
@@ -70,7 +136,7 @@ const wchar_t* lily_of_the_valley::markdown_extract_text::operator()(const std::
                     start += END_FIGURE.length();
                     continue;
                     }
-                 else if (std::wcsncmp(start, L"\\@ref(", 6) == 0)
+                else if (std::wcsncmp(start, L"\\@ref(", 6) == 0)
                     {
                     start += 6;
                     auto endOfTag = string_util::find_unescaped_matching_close_tag(start, L'(', L')');
@@ -88,6 +154,13 @@ const wchar_t* lily_of_the_valley::markdown_extract_text::operator()(const std::
                     add_characters(L"\n\n");
                     continue;
                     }
+                else if (start < endSentinel &&
+                    (start[1] == L'\n' || start[1] == L'\r'))
+                    {
+                    headerMode = true;
+                    ++start;
+                    continue;
+                    }
                 // actually is an escape character
                 isEscaping = true;
                 previousChar = *start;
@@ -183,6 +256,19 @@ const wchar_t* lily_of_the_valley::markdown_extract_text::operator()(const std::
                     log_message(L"Bad fenced code block in markdown file.");
                     break;
                     }
+                bool isMultiline{ false };
+                auto scanAhead{ start };
+                while (scanAhead < endOfTag)
+                    {
+                    if (*scanAhead == L'\r' ||
+                        *scanAhead == L'\n')
+                        {
+                        isMultiline = true;
+                        break;
+                        }
+                    ++scanAhead;
+                    }
+                bool pastFirstLine{ false };
                 // tab over each line inside of the code block
                 while (start < endOfTag)
                     {
@@ -197,9 +283,13 @@ const wchar_t* lily_of_the_valley::markdown_extract_text::operator()(const std::
                             ++start;
                             }
                         add_character(L'\t');
+                        pastFirstLine = true;
                         continue;
                         }
-                    add_character(*start);
+                    if (!isMultiline || pastFirstLine)
+                        {
+                        add_character(*start);
+                        }
                     ++start;
                     }
                 start = endOfTag + 3;
@@ -317,29 +407,65 @@ const wchar_t* lily_of_the_valley::markdown_extract_text::operator()(const std::
                         }
                     continue;
                     }
-                else if (std::wcsncmp(start, L"`r ", 3) == 0)
-                    { start += 3; }
-                else if (std::wcsncmp(start, L"`python ", 8) == 0)
-                    { start += 8; }
                 // read content as-is otherwise
                 else
                     {
-                    ++start;
-                    while (start < endSentinel && *start != '`')
+                    if (std::wcsncmp(start, L"`r ", 3) == 0)
+                        { start += 3; }
+                    else if (std::wcsncmp(start, L"`python ", 8) == 0)
+                        { start += 8; }
+                    else
                         {
-                        previousChar = *start;
-                        add_character(*start);
                         ++start;
                         }
+                    // `` section, which can have embedded backticks
                     if (*start == '`')
                         {
-                        ++start;
+                        auto endOfTag = std::wcsstr(++start, L"``");
+                        if (endOfTag == nullptr ||
+                            endOfTag > endSentinel)
+                            {
+                            log_message(L"Bad inline `` code block in markdown file.");
+                            break;
+                            }
+                        // read in content verbatim
+                        while (start < endOfTag)
+                            {
+                            previousChar = *start;
+                            add_character(*start);
+                            ++start;
+                            }
+                        start += 2;
+                        }
+                    // just a single backtick block, should be on one line
+                    else
+                        {
+                        while (start < endSentinel && *start != '`')
+                            {
+                            // inline blocks should be on one line, so bail if we hit a new line
+                            // as this would probably be a missing closing backtick
+                            if (*start == L'\n' || *start == L'\r')
+                                {
+                                log_message(L"Unterminated inline ` code block in markdown file.");
+                                previousChar = *start;
+                                add_character(*start);
+                                ++start;
+                                break;
+                                }
+                            previousChar = *start;
+                            add_character(*start);
+                            ++start;
+                            }
+                        if (*start == '`')
+                            {
+                            ++start;
+                            }
                         }
                     continue;
                     }
                 }
             }
-        // images
+        // images (we don't read in the alt text inside of the [], just skip everything)
         else if (*start == L'!')
             {
             if (!isEscaping &&
@@ -347,20 +473,24 @@ const wchar_t* lily_of_the_valley::markdown_extract_text::operator()(const std::
                 start[1] == L'[')
                 {
                 start += 2;
-                auto endOfTag = string_util::find_unescaped_matching_close_tag(start, L'[', L']');
+                auto endOfTag = string_util::find_unescaped_matching_close_tag_same_line(start, L'[', L']');
                 if (endOfTag == nullptr)
                     {
                     log_message(L"Bad image command in markdown file.");
-                    break;
+                    previousChar = L'[';
+                    add_character(L'[');
+                    continue;
                     }
                 start = ++endOfTag;
                 if (*start == L'(')
                     {
-                    endOfTag = string_util::find_unescaped_matching_close_tag(++start, L'(', L')');
+                    endOfTag = string_util::find_unescaped_matching_close_tag_same_line(++start, L'(', L')');
                     if (endOfTag == nullptr)
                         {
                         log_message(L"Bad image command in markdown file.");
-                        break;
+                        previousChar = L'(';
+                        add_character(L'(');
+                        continue;
                         }
                     start = ++endOfTag;
                     }
@@ -373,21 +503,28 @@ const wchar_t* lily_of_the_valley::markdown_extract_text::operator()(const std::
             if (!isEscaping)
                 {
                 auto labelStart{ ++start };
-                auto endOfTag = string_util::find_unescaped_matching_close_tag(start, L'[', L']');
+                auto endOfTag = string_util::find_unescaped_matching_close_tag_same_line(start, L'[', L']');
                 if (endOfTag == nullptr)
                     {
-                    log_message(L"Bad link command in markdown file.");
-                    break;
+                    log_message(L"Bad link command in markdown file. Missing closing ']'");
+                    // just treat it like a stray '[' and keep going
+                    previousChar = L'[';
+                    add_character(L'[');
+                    continue;
                     }
                 start = ++endOfTag;
                 if (*start == L'(')
                     {
                     auto labelEnd{ start - 1};
-                    endOfTag = string_util::find_unescaped_matching_close_tag(++start, L'(', L')');
+                    endOfTag = string_util::find_unescaped_matching_close_tag_same_line(++start, L'(', L')');
                     if (endOfTag == nullptr)
                         {
-                        log_message(L"Bad link command in markdown file.");
-                        break;
+                        log_message(L"Bad link command in markdown file. Missing closing ')'");
+                        // read in the label and '(' section after it as-is if closing ')' is missing
+                        start = labelStart;
+                        previousChar = L'[';
+                        add_character(L'[');
+                        continue;
                         }
                     start = ++endOfTag;
                     if (labelStart < labelEnd)
@@ -469,6 +606,22 @@ const wchar_t* lily_of_the_valley::markdown_extract_text::operator()(const std::
             add_characters(L"\n\n");
             continue;
             }
+        else if (!isEscaping &&
+            std::wcsncmp(start, L"< br/>", 6) == 0)
+            {
+            start += 6;
+            previousChar = L'\n';
+            add_characters(L"\n\n");
+            continue;
+            }
+        else if (!isEscaping &&
+            std::wcsncmp(start, L"<br/>", 5) == 0)
+            {
+            start += 5;
+            previousChar = L'\n';
+            add_characters(L"\n\n");
+            continue;
+            }
         else if (*start == L'<')
             {
             if (!isEscaping &&
@@ -595,12 +748,43 @@ const wchar_t* lily_of_the_valley::markdown_extract_text::operator()(const std::
             }
         // styling tags that just get removed from raw text
         else if (!isEscaping &&
-                 (*start == L'*' || *start == L'_' || *start == L'~') )
+                 std::iswspace(previousChar) &&
+                 *start == L'*')
             {
-            while ((*start == L'*' || *start == L'_' || *start == L'~') &&
-                   (start < endSentinel))
-                { ++start; }
-            continue;
+            if (parseStyledText(L'*'))
+                {
+                continue;
+                }
+            else
+                {
+                break;
+                }
+            }
+        else if (!isEscaping &&
+                 std::iswspace(previousChar) &&
+                 *start == L'_')
+            {
+            if (parseStyledText(L'_'))
+                {
+                continue;
+                }
+            else
+                {
+                break;
+                }
+            }
+        else if (!isEscaping &&
+                 std::iswspace(previousChar) &&
+                 *start == L'~')
+            {
+            if (parseStyledText(L'~'))
+                {
+                continue;
+                }
+            else
+                {
+                break;
+                }
             }
         // table
         else if (!isEscaping &&
diff --git a/src/import/markdown_extract_text.h b/src/import/markdown_extract_text.h
index df89df2d..1fc62868 100644
--- a/src/import/markdown_extract_text.h
+++ b/src/import/markdown_extract_text.h
@@ -31,19 +31,15 @@ namespace lily_of_the_valley
 #endif
         /** @returns @c true if text marks the start of a Markdown metadata section.
             @param md_text The start of the Markdown text.
-            @note Multimarkdown, YAML, and Pandoc sections are supported.
+            @note YAML sections are supported.
             @warning @c md_text must be the start of the Markdown document.*/
         [[nodiscard]]
         bool has_metadata_section(const wchar_t* md_text) const
             {
-            if (!md_text)
+            if (md_text == nullptr)
                 { return false; }
-            // will just be reviewing the first line of text
-            auto eol = string_util::strcspn_pointer(md_text, L"\n\r", 2);
-            if (!eol)
-                { return false; }
-
-            return std::regex_match(md_text, eol, m_metadataSectionStart);
+            // does the first line start with ---
+            return (std::wcsncmp(md_text, L"---", 3) == 0);
             }
 
         /** @brief Metadata sections end on the first blank like, so moves to that.
@@ -53,39 +49,33 @@ namespace lily_of_the_valley
         [[nodiscard]]
         static const wchar_t* find_metadata_section_end(const wchar_t* md_text) noexcept
             {
-            if (!md_text)
+            if (md_text == nullptr)
                 { return nullptr; }
-            while (md_text[0] != 0 && md_text[1] != 0)
+            // step over first line
+            auto eol = string_util::strcspn_pointer(md_text, L"\r\n", 2);
+            if (eol == nullptr)
                 {
-                // if a Windows \r\n combination, then if followed by either 
-                // \r or \n means we have a blank line
-                if (md_text[0] == L'\r' && md_text[1] == L'\n')
-                    {
-                    if (md_text[2] == 0)
-                        { return md_text+2; }
-                    else if (string_util::is_either(md_text[2], L'\n', L'\r'))
-                        { return md_text+3; }
-                    }
-                // otherwise, if 2 consecutive breaks then we have a blank line
-                else if (string_util::is_either(md_text[0], L'\n', L'\r') &&
-                    string_util::is_either(md_text[1], L'\n', L'\r'))
-                    { return md_text+2; }
-                ++md_text;
+                return md_text;
+                }
+            // ...and find the terminating --- line
+            auto endOfYaml = std::wcsstr(eol, L"\n---");
+            if (endOfYaml == nullptr)
+                {
+                return md_text;
+                }
+            endOfYaml = string_util::strcspn_pointer(endOfYaml + 4, L"\r\n", 2);
+            if (endOfYaml == nullptr)
+                {
+                return md_text;
+                }
+            while (*endOfYaml == L'\r' || *endOfYaml == L'\n')
+                {
+                ++endOfYaml;
                 }
-            // if stopped on the last character because the whole file is just metadata,
-            // then step to the end.
-            if (md_text[0] != 0 && md_text[1] == 0)
-                { return ++md_text; }
 
-            return md_text;
+            return endOfYaml;
             }
 
-        /** Metadata section types start with these:
-            - Pandoc: % and a space.
-            - YAML header: ---*/
-        std::wregex m_metadataSectionStart
-            { LR"(^(%[[:space:]]+|---).*$)" };
-
         std::unique_ptr<markdown_extract_text> m_subParser{ nullptr };
         };
     }
diff --git a/tests/mdparsetests.cpp b/tests/mdparsetests.cpp
index 1e6ab2fd..570d82f7 100644
--- a/tests/mdparsetests.cpp
+++ b/tests/mdparsetests.cpp
@@ -18,52 +18,25 @@ TEST_CASE("Markdown Parser", "[md import]")
     SECTION("Meta Sections")
         {
         lily_of_the_valley::markdown_extract_text md;
-        // pandoc
-        auto mdText = L"% title: my book\n\nHere is the *actual* \\*text to **review**.";
-        CHECK(md.has_metadata_section(mdText));
         // YAML
-        mdText = L"---\n   title:my book\n\nHere is the *actual* \\*text to **review**.";
+        auto mdText = L"---\n   title:my book\n\nHere is the *actual* \\*text to **review**.";
         CHECK(md.has_metadata_section(mdText));
         }
 
     SECTION("Meta Section End")
         {
-        // test \n
-        auto mdText = L"keyvalue:12\nKey2: re\n\nHere is the *actual* \\*text to **review**.";
         lily_of_the_valley::markdown_extract_text md;
-        auto end = md.find_metadata_section_end(mdText);
-        CHECK(end == mdText+22);
-
-        // test \r
-        mdText = L"keyvalue:12\rKey2: re\r\rHere is the *actual* \\*text to **review**.";
-        end = md.find_metadata_section_end(mdText);
-        CHECK(end == mdText+22);
-
-        // test \r\n combinations
-        mdText = L"keyvalue:12\r\nKey2: re\r\n\r\nHere is the *actual* \\*text to **review**.";
-        end = md.find_metadata_section_end(mdText);
-        CHECK(end == mdText+24);
-
-        mdText = L"keyvalue:12\r\nKey2: re\r\n\nHere is the *actual* \\*text to **review**.";
-        end = md.find_metadata_section_end(mdText);
-        CHECK(end == mdText+24);
-
-        // test being at the end of the text
-        mdText = L"title\r\n";
-        end = md.find_metadata_section_end(mdText);
-        CHECK(end == mdText+7);
-
-        // test being at the end of the text, because there was no blank lines
-        mdText = L"title";
-        end = md.find_metadata_section_end(mdText);
-        CHECK(end == mdText+5);
+        CHECK(std::wstring{ md({ L"---\ntitle:my book\n---\nHere is the *actual* text to **review**." }) } ==
+            std::wstring{ L"Here is the actual text to review." });
         }
 
     SECTION("Newlines")
         {
         lily_of_the_valley::markdown_extract_text md;
         CHECK(std::wstring{ md({ L"# Header\nThis is\na line.\r\nThis is the same line.  \nThis is a new line.\r\n\r\nAnother line. \nSame line." }) } ==
-              std::wstring{ L"Header\n\nThis is a line. This is the same line.  \n\nThis is a new line.\n\nAnother line.  Same line." });
+             std::wstring{ L"Header\n\nThis is a line. This is the same line.  \n\nThis is a new line.\n\nAnother line.  Same line." });
+        CHECK(std::wstring{ md({ L"*`where`*\\\nNext line" }) } ==
+            std::wstring{ L"where\n\nNext line" });
         }
 
     SECTION("Header")
@@ -82,8 +55,15 @@ TEST_CASE("Markdown Parser", "[md import]")
         lily_of_the_valley::markdown_extract_text md;
         CHECK(std::wstring{ md({ L"This is *italic* and **bold** and also __italic__. 2 \\* 2." }) } ==
               std::wstring{ L"This is italic and bold and also italic. 2 * 2." });
+        CHECK(std::wstring{ md({ L"This is _italic and **bold** text_." }) } ==
+            std::wstring{ L"This is italic and bold text." });
+        // can't handle "This is *italic and **bold** text*", will have to be a known limitation
         CHECK(std::wstring{ md({ L"**PGF\\_HOT**" }) } ==
             std::wstring{ L"PGF_HOT" });
+        CHECK(std::wstring{ md({ L"TIFF _spe_ci**f**i_c_ *options*" }) } ==
+            std::wstring{ L"TIFF spe_ci**f**i_c options" });
+        CHECK(std::wstring{ md({ L"2 * 2" }) } ==
+            std::wstring{ L"2 * 2" });
         }
 
     SECTION("Blockquoes")
@@ -111,14 +91,22 @@ TEST_CASE("Markdown Parser", "[md import]")
             std::wstring{ L"This is code." });
         CHECK(std::wstring{ md({ L"Code `r 2+2`." }) } ==
               std::wstring{ L"Code 2+2." });
+        CHECK(std::wstring{ md({ L"``2`2`` `shared_ptr`" }) } ==
+              std::wstring{ L"2`2 shared_ptr" });
+        CHECK(std::wstring{ md({ L"### `std::basic_istream::read` processing of `\\r\\n`` =>`\\n`\n `shared_ptr`" }) } ==
+              std::wstring{ L"std::basic_istream::read processing of \\r\\n =>n\n shared_ptr" });
         }
 
     SECTION("Code block")
         {
         lily_of_the_valley::markdown_extract_text md;
-        auto blah = std::wstring{ md({ L"This\n```\nis code\r\nhere```\n." }) };
+        // inline (you aren't supposed to do this with ```, but people do)
         CHECK(std::wstring{ md({ L"This\n```\nis code\r\nhere```\n." }) } ==
               std::wstring{ L"This \n\tis code\r\n\there\n\n." });
+        // remove lang info
+        auto blah = std::wstring{ md({ L"This\n```cpp\nis code\r\nhere\n```\n." }) };
+        CHECK(std::wstring{ md({ L"This\n```cpp\nis code\r\nhere\n```\n." }) } ==
+            std::wstring{ L"This \n\tis code\r\n\there\n\t\n\n." });
         }
 
     SECTION("Images")
@@ -128,9 +116,9 @@ TEST_CASE("Markdown Parser", "[md import]")
               std::wstring{ L"Tux  the penguin." });
         // malformed
         CHECK(std::wstring{ md({ L"Tux ![Tux, the Linux mascot" }) } ==
-            std::wstring{ L"Tux " });
+            std::wstring{ L"Tux [Tux, the Linux mascot" });
         CHECK(std::wstring{ md({ L"Tux ![Tux, the Linux mascot](/assets/tux.png" }) } ==
-            std::wstring{ L"Tux " });
+            std::wstring{ L"Tux (/assets/tux.png" });
         }
 
     SECTION("Links")
@@ -144,9 +132,11 @@ TEST_CASE("Markdown Parser", "[md import]")
             std::wstring{ L"Tux the Linux mascot the penguin." });
         // malformed
         CHECK(std::wstring{ md({ L"Tux [the Linux mascot" }) } ==
-            std::wstring{ L"Tux " });
+            std::wstring{ L"Tux [the Linux mascot" });
         CHECK(std::wstring{ md({ L"Tux [the Linux mascot](/assets/tux.png" }) } ==
-            std::wstring{ L"Tux " });
+            std::wstring{ L"Tux [the Linux mascot](/assets/tux.png" });
+        CHECK(std::wstring{ md({ L"The third member function inserts the sequence [`first`, `last`). You use it" }) } ==
+            std::wstring{ L"The third member function inserts the sequence [first, last). You use it" });
         }
 
     SECTION("Unordered Lists")