From 99e22d51becf63a37a4a54950fac00fe32f0f27e Mon Sep 17 00:00:00 2001
From: Blake-Madden <madindayton@outlook.com>
Date: Wed, 22 Nov 2023 06:51:25 -0500
Subject: [PATCH] Implement markdown parser. Handles most Markdown features;
 includes basic RMarkdown (bookdown) and Quarto features.

---
 src/import/markdown_extract_text.cpp | 486 +++++++++++++++++++++++++++
 src/import/markdown_extract_text.h   |  46 +--
 tests/CMakeLists.txt                 |   1 +
 tests/mdparsetests.cpp               | 102 +++++-
 tools/files.cmake                    |   1 +
 tools/guilibfiles_testing.cmake      |   1 +
 tools/libfiles.cmake                 |   1 +
 7 files changed, 593 insertions(+), 45 deletions(-)
 create mode 100644 src/import/markdown_extract_text.cpp

diff --git a/src/import/markdown_extract_text.cpp b/src/import/markdown_extract_text.cpp
new file mode 100644
index 00000000..559b1793
--- /dev/null
+++ b/src/import/markdown_extract_text.cpp
@@ -0,0 +1,486 @@
+#include "markdown_extract_text.h"
+
+const wchar_t* lily_of_the_valley::markdown_extract_text::operator()(const std::wstring_view md_text)
+    {
+    clear_log();
+    if (md_text.empty())
+        {
+        set_filtered_text_length(0);
+        return nullptr;
+        }
+    // lazily create the parser used for nested content (link labels and R function arguments)
+    if (m_subParser == nullptr)
+        { m_subParser = std::make_unique<markdown_extract_text>(); }
+
+    if (!allocate_text_buffer(md_text.length()))
+        {
+        set_filtered_text_length(0);
+        return nullptr;
+        }
+
+    // find the start of the text body and set up where we halt our searching
+    const wchar_t* const endSentinel = md_text.data() + md_text.length();
+    const wchar_t* start = md_text.data();
+    if (is_metadata_section(start))
+        { start = find_metadata_section_end(start); }
+    // In case the metadata section ate up the whole text (or the part requested to be reviewed).
+    // NOTE(review): this returns a pointer to the end of the input, not the filtered-text buffer.
+    if (start >= endSentinel)
+        { return endSentinel; }
+    while (start < endSentinel && *start != 0 && std::iswspace(*start))
+        { ++start; }
+    // NOTE(review): std::wcsncmp/std::wcsstr below scan until a NUL; presumably the buffer is NUL-terminated
+    bool isEscaping{ false };       // the previous character was an unescaped backslash
+    bool headerMode{ false };       // on a header line (its single newline becomes a paragraph break)
+    wchar_t previousChar{ L'\n' };  // starts as a newline so that the first character is "start of line"
+    // Main scan: a branch either consumes markup and continues, or falls through to the copy at the bottom.
+    while (start != nullptr && (start < endSentinel) && *start != 0)
+        {
+        if (*start == L'\\')
+            {
+            // Previous character was not \, but this one is.
+            // Skip and get ready to escape the next character.
+            if (!isEscaping)
+                {
+                // remove \index{} tags
+                if (std::wcsncmp(start, L"\\index{", 7) == 0)
+                    {
+                    start += 7;
+                    auto endOfTag = string_util::find_unescaped_matching_close_tag(start, L'{', L'}');
+                    if (endOfTag == nullptr)
+                        {
+                        log_message(L"Bad index{} command in markdown file.");
+                        break;
+                        }
+                    start = ++endOfTag;
+                    continue;
+                    }
+                else if (std::wcsncmp(start, L"\\@ref(", 6) == 0)
+                    {
+                    start += 6;
+                    auto endOfTag = string_util::find_unescaped_matching_close_tag(start, L'(', L')');
+                    if (endOfTag == nullptr)
+                        {
+                        log_message(L"Bad cross reference command in markdown file.");
+                        break;
+                        }
+                    start = ++endOfTag;
+                    continue;
+                    }
+                else if (std::wcsncmp(start, L"\\newpage", 8) == 0)
+                    {
+                    start += 8;
+                    add_characters(L"\n\n");
+                    continue;
+                    }
+                // actually is an escape character
+                isEscaping = true;
+                previousChar = *start;
+                ++start;
+                continue;
+                }
+            }
+        // skip over header tags (only when the # is the first thing on the line)
+        else if (*start == L'#')
+            {
+            if (!isEscaping &&
+                (previousChar == L'\n' || previousChar == L'\r'))
+                {
+                while ((start < endSentinel) &&
+                       *start == L'#')
+                    { ++start; }
+                // space between # and header text
+                while ((start < endSentinel) &&
+                       (*start == L' ' || *start == L'\t'))
+                    { ++start; }
+                previousChar = *start;
+                headerMode = true;
+                continue;
+                }
+            }
+        // RMarkdown div fences
+        else if (*start == L':')
+            {
+            if (!isEscaping &&
+                (previousChar == L'\n' || previousChar == L'\r'))
+                {
+                // skip the run of colons at the start of the line
+                while ((start < endSentinel) &&
+                       (*start == L':'))
+                    { ++start; }
+                continue;
+                }
+            }
+        // block quotes
+        else if (*start == L'>')
+            {
+            if (!isEscaping &&
+                (previousChar == L'\n' || previousChar == L'\r'))
+                {
+                size_t tabCount{ 0 };
+                while ((start < endSentinel) &&
+                       *start == L'>')
+                    {
+                    ++tabCount;
+                    ++start;
+                    }
+                // space between > and quote text
+                while ((start < endSentinel) &&
+                       (*start == L' ' || *start == L'\t'))
+                    { ++start; }
+                add_character(L'\t', tabCount);
+                // Flags that we are still at the start of the line,
+                // so that headers and list items can still be parsed correctly.
+                previousChar = L'\n';
+                continue;
+                }
+            }
+        // code blocks
+        else if (*start == L'`')
+            {
+            // fenced section (dropped entirely, up to and including the closing fence)
+            if (!isEscaping && std::wcsncmp(start, L"```", 3) == 0)
+                {
+                start += 3;
+                auto endOfTag = std::wcsstr(start, L"```");
+                if (endOfTag == nullptr)
+                    {
+                    log_message(L"Bad fenced code block in markdown file.");
+                    break;
+                    }
+                start = endOfTag + 3;
+                continue;
+                }
+            // verbatim (inline) code
+            else if (!isEscaping)
+                {
+                // R code should be fully removed (or processed for known functions)
+                if (std::wcsncmp(start, L"`r keys(", 8) == 0)
+                    {
+                    start += 8;
+                    if (start + 1 < endSentinel &&
+                        (start[1] == L'\'' || start[1] == L'"'))
+                        { ++start; }
+                    if (*start == L'\'' || *start == L'"')
+                        {
+                        const auto quoteChar{ *start };
+                        auto endOfTag = string_util::find_unescaped_char(++start, quoteChar);
+                        if (endOfTag == nullptr)
+                            {
+                            log_message(L"Bad 'r keys' code block in markdown file.");
+                            break;
+                            }
+                        [[maybe_unused]] auto retval = m_subParser->operator()(
+                            { start, static_cast<size_t>(std::distance(start, endOfTag)) });
+                        add_character(L'"');
+                        add_characters(
+                            { m_subParser->get_filtered_text(), m_subParser->get_filtered_text_length() });
+                        add_character(L'"');
+                        if (m_subParser->get_filtered_text_length())
+                            {
+                            previousChar =
+                                m_subParser->get_filtered_text()[m_subParser->get_filtered_text_length() - 1];
+                            }
+                        endOfTag = string_util::find_unescaped_char(start, L'`');
+                        if (endOfTag == nullptr)
+                            {
+                            log_message(L"Bad 'r keys' code block in markdown file.");
+                            break;
+                            }
+                        start = ++endOfTag;
+                        }
+                    }
+                else if (std::wcsncmp(start, L"`r drop_cap(", 11) == 0)
+                    {
+                    start += 11; // NOTE(review): only 11 of the literal's 12 chars; start should now be on '('
+                    if (start + 1 < endSentinel &&
+                        (start[1] == L'\'' || start[1] == L'"'))
+                        { ++start; }
+                    if (*start == L'\'' || *start == L'"')
+                        {
+                        const auto quoteChar{ *start };
+                        auto endOfTag = string_util::find_unescaped_char(++start, quoteChar);
+                        if (endOfTag == nullptr)
+                            {
+                            log_message(L"Bad 'r dropcap' code block in markdown file.");
+                            break;
+                            }
+                        [[maybe_unused]] auto retval = m_subParser->operator()(
+                            { start, static_cast<size_t>(std::distance(start, endOfTag)) });
+                        add_character(L'"');
+                        add_characters(
+                            { m_subParser->get_filtered_text(), m_subParser->get_filtered_text_length() });
+                        add_character(L'"');
+                        if (m_subParser->get_filtered_text_length())
+                            {
+                            previousChar =
+                                m_subParser->get_filtered_text()[m_subParser->get_filtered_text_length() - 1];
+                            }
+                        endOfTag = string_util::find_unescaped_char(start, L'`');
+                        if (endOfTag == nullptr)
+                            {
+                            log_message(L"Bad 'r dropcap' code block in markdown file.");
+                            break;
+                            }
+                        start = ++endOfTag;
+                        }
+                    }
+                else if (std::wcsncmp(start, L"`r menu(", 8) == 0)
+                    {
+                    start += 8;
+                    if (start + 1 < endSentinel &&
+                        (start[0] == L'c') &&
+                        (start[1] == L'('))
+                        { start += 2; }
+                    if (*start == L'\'' || *start == L'"')
+                        {
+                        auto endOfTag = std::wcsstr(start, L")`");
+                        if (endOfTag == nullptr)
+                            {
+                            log_message(L"Bad 'r menu' code block in markdown file.");
+                            break;
+                            }
+                        [[maybe_unused]] auto retval = m_subParser->operator()(
+                            { start, static_cast<size_t>(std::distance(start, endOfTag-1)) });
+                        add_characters(
+                            { m_subParser->get_filtered_text(), m_subParser->get_filtered_text_length() });
+                        if (m_subParser->get_filtered_text_length())
+                            {
+                            previousChar =
+                                m_subParser->get_filtered_text()[m_subParser->get_filtered_text_length() - 1];
+                            }
+                        start = endOfTag + 2;
+                        }
+                    }
+                // read content as-is otherwise
+                else
+                    { ++start; }
+                }
+            }
+        // images (the whole tag, including any alt text, is dropped)
+        else if (*start == L'!')
+            {
+            if (!isEscaping &&
+                (start +1 < endSentinel) &&
+                start[1] == L'[')
+                {
+                start += 2;
+                auto endOfTag = string_util::find_unescaped_matching_close_tag(start, L'[', L']');
+                if (endOfTag == nullptr)
+                    {
+                    log_message(L"Bad image command in markdown file.");
+                    break;
+                    }
+                start = ++endOfTag;
+                if (*start == L'(')
+                    {
+                    endOfTag = string_util::find_unescaped_matching_close_tag(++start, L'(', L')');
+                    if (endOfTag == nullptr)
+                        {
+                        log_message(L"Bad image command in markdown file.");
+                        break;
+                        }
+                    start = ++endOfTag;
+                    }
+                continue;
+                }
+            }
+        // links (the label is run through the sub-parser and kept; the target is dropped)
+        else if (*start == L'[')
+            {
+            if (!isEscaping)
+                {
+                auto labelStart{ ++start };
+                auto endOfTag = string_util::find_unescaped_matching_close_tag(start, L'[', L']');
+                if (endOfTag == nullptr)
+                    {
+                    log_message(L"Bad link command in markdown file.");
+                    break;
+                    }
+                start = ++endOfTag;
+                if (*start == L'(')
+                    {
+                    auto labelEnd{ start - 1};
+                    endOfTag = string_util::find_unescaped_matching_close_tag(++start, L'(', L')');
+                    if (endOfTag == nullptr)
+                        {
+                        log_message(L"Bad link command in markdown file.");
+                        break;
+                        }
+                    start = ++endOfTag;
+                    if (labelStart < labelEnd)
+                        {
+                        [[maybe_unused]] auto retval = m_subParser->operator()(
+                            { labelStart, static_cast<size_t>(std::distance(labelStart, labelEnd)) });
+                        add_characters(
+                            { m_subParser->get_filtered_text(), m_subParser->get_filtered_text_length() });
+                        if (m_subParser->get_filtered_text_length())
+                            {
+                            previousChar =
+                                m_subParser->get_filtered_text()[m_subParser->get_filtered_text_length() - 1];
+                            }
+                        }
+                    }
+                continue;
+                }
+            }
+        // IDs
+        else if (*start == L'{')
+            {
+            if (!isEscaping)
+                {
+                // if quarto syntax
+                if (std::wcsncmp(start, L"{{< pagebreak >}}", 17) == 0)
+                    {
+                    start += 17;
+                    add_characters(L"\n\n");
+                    }
+                else
+                    {
+                    ++start; // step over the opening brace (the content of the tag is discarded)
+                    auto endOfTag = string_util::find_unescaped_matching_close_tag(start, L'{', L'}');
+                    if (endOfTag == nullptr)
+                        {
+                        log_message(L"Bad ID command in markdown file.");
+                        break;
+                        }
+                    start = ++endOfTag;
+                    }
+                continue;
+                }
+            }
+        // newlines
+        else if (*start == L'\n' || *start == L'\r')
+            {
+            // two (or more) spaces at the end of a line indicates a paragraph break
+            size_t newlineCount{ 0 };
+            if (previousChar == L' ' &&
+                std::distance(md_text.data(), start) > 2 &&
+                *(start-2) == L' ')
+                { ++newlineCount; }
+            // count the newlines (taking CRLF combos into account)
+            while ((start < endSentinel) &&
+                   (*start == L'\n' || *start == L'\r'))
+                {
+                if (*start == L'\r' &&
+                    (start + 1 < endSentinel) &&
+                    start[1] == L'\n')
+                    {
+                    ++start;
+                    continue;
+                    }
+                ++newlineCount;
+                ++start;
+                // If the next line is a header line divider, then
+                // skip that, switch to header mode, and keep reading
+                // any more newlines
+                if (start + 1 < endSentinel &&
+                    ((start[0] == L'=' && start[1] == L'=') ||
+                     (start[0] == L'-' && start[1] == L'-')))
+                    {
+                    while ((start < endSentinel) &&
+                        (*start == L'=' || *start == L'-'))
+                        { ++start; }
+                    headerMode = true;
+                    }
+                }
+
+            auto scanAhead{ start };
+            while ((scanAhead < endSentinel) &&
+                (*scanAhead == L' ' || *scanAhead == L'\t'))
+                { ++scanAhead; }
+            // next line starts a list item, quoteblock, table, etc., so keep the newline as-is
+            if (newlineCount == 1 &&
+                string_util::is_one_of(*scanAhead, L">-*+|:^"))
+                {
+                add_character(L'\n', newlineCount);
+                previousChar = L'\n';
+                }
+            // same for an ordered list
+            else if (newlineCount == 1 &&
+                (start < endSentinel) &&
+                std::iswdigit(*start))
+                {
+                auto scanAheadDigit{ start };
+                while ((scanAheadDigit < endSentinel) &&
+                    std::iswdigit(*scanAheadDigit))
+                    { ++scanAheadDigit; }
+                if (*scanAheadDigit == L'.')
+                    {
+                    add_character(L'\n', newlineCount);
+                    previousChar = L'\n';
+                    }
+                // not an ordered list, default behavior to read as space
+                else
+                    {
+                    add_character(L' ');
+                    previousChar = L' ';
+                    }
+                }
+            // a single newline that is not at the end of a self-contained line
+            // (e.g., a header) is seen as a space
+            else if (newlineCount == 1 && !headerMode)
+                {
+                add_character(L' ');
+                previousChar = L' ';
+                }
+            else if (newlineCount == 1 && headerMode)
+                {
+                add_characters(L"\n\n");
+                previousChar = L'\n';
+                }
+            else
+                {
+                add_character(L'\n', newlineCount);
+                previousChar = L'\n';
+                }
+            headerMode = false;
+            isEscaping = false;
+            continue;
+            }
+        // styling tags that just get removed from raw text
+        else if (!isEscaping &&
+                 (*start == L'*' || *start == L'_' || *start == L'~') )
+            {
+            while ((start < endSentinel) &&
+                   (*start == L'*' || *start == L'_' || *start == L'~'))
+                { ++start; }
+            }
+        // table
+        else if (!isEscaping &&
+                 (*start == L'|') )
+            {
+            previousChar = L'\t';
+            add_character(L'\t');
+            ++start;
+            continue;
+            }
+        // newline hacks found in tables (just replace with space to keep the table structure).
+        else if (std::wcsncmp(start, L"<br>\\linebreak", 14) == 0)
+            {
+            start += 14;
+            previousChar = L' ';
+            add_character(L' ');
+            }
+        // RMarkdown (Pandoc) comment
+        else if (std::wcsncmp(start, L"<!--", 4) == 0)
+            {
+            const auto endOfTag = std::wcsstr(start, L"-->");
+            if (endOfTag == nullptr)
+                {
+                log_message(L"Bad comment block in markdown file.");
+                break;
+                }
+            start = endOfTag + 3;
+            }
+        // turn off escaping and load the character (unless a branch above consumed the rest of the input)
+        if (start >= endSentinel || *start == 0)
+            { break; }
+        isEscaping = false;
+        previousChar = *start;
+        add_character(*start);
+        ++start;
+        }
+
+    return get_filtered_text();
+    }
diff --git a/src/import/markdown_extract_text.h b/src/import/markdown_extract_text.h
index 86973d66..e11c6914 100644
--- a/src/import/markdown_extract_text.h
+++ b/src/import/markdown_extract_text.h
@@ -18,56 +18,14 @@
 namespace lily_of_the_valley
     {
     /// @brief Extracts plain text from a Markdown file.
-    /// @warning This is in alpha state. Do not use in productino.
     class markdown_extract_text final : public html_extract_text
         {
     public:
         /** @brief Main interface for extracting plain text from a Markdown file.
             @param md_text The markdown text to parse.
-            @param text_length The length of the text.
             @returns The parsed text from the Markdown stream.*/
         [[nodiscard]]
-        const wchar_t* operator()(const wchar_t* md_text,
-                                  const size_t text_length)
-            {
-            clear_log();
-            if (md_text == nullptr || md_text[0] == 0 || text_length == 0)
-                {
-                set_filtered_text_length(0);
-                return nullptr;
-                }
-
-            if (!allocate_text_buffer(text_length))
-                {
-                set_filtered_text_length(0);
-                return nullptr;
-                }
-
-            // find the start of the text body and set up where we halt our searching
-            const wchar_t* const endSentinel = md_text+text_length;
-            if (is_metadata_section(md_text))
-                { md_text = find_metadata_section_end(md_text); }
-            while (md_text < endSentinel && md_text[0] != 0 && std::iswspace(md_text[0]))
-                { ++md_text; }
-            // in case metadata section ate up the whole file
-            // (or at least the part of the file requested to be reviewed)
-            if (md_text >= endSentinel)
-                { return endSentinel; }
-            const wchar_t* start = md_text;
-            const wchar_t* end = md_text;
-
-            while (end != nullptr  && end[0] != 0 && (end < endSentinel))
-                { ++end; }
-            // pick up any remaining text at the end of the text stream..
-            if (end != nullptr && end[0] == 0)
-                { add_characters(start, end-start); }
-            // ...or if end is beyond where we should go in the buffer,
-            // copy up to the end of the buffer
-            else if (end >= endSentinel)
-                { add_characters(start, endSentinel-start); }
-
-            return get_filtered_text();
-            }
+        const wchar_t* operator()(const std::wstring_view md_text);
 #ifndef __UNITTEST
     private:
 #endif
@@ -129,6 +87,8 @@ namespace lily_of_the_valley
               followed by colon.*/
         std::wregex m_metadataSectionStart
             { LR"(^(%[[:space:]]+|---|[[:alpha:]]+([[:space:]]{1}[[:alpha:]]+)?[:]).*$)" };
+
+        std::unique_ptr<markdown_extract_text> m_subParser{ nullptr };
         };
     }
 
diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt
index 5bc20179..6cc38738 100644
--- a/tests/CMakeLists.txt
+++ b/tests/CMakeLists.txt
@@ -36,6 +36,7 @@ SET(LIB_SRC_FILES
     ../src/import/docx_extract_text.cpp
     ../src/import/html_extract_text.cpp
     ../src/import/odt_odp_extract_text.cpp
+    ../src/import/markdown_extract_text.cpp
     ../src/import/postscript_extract_text.cpp
     ../src/import/xlsx_extract_text.cpp
     ../src/import/rtf_extract_text.cpp)
diff --git a/tests/mdparsetests.cpp b/tests/mdparsetests.cpp
index 41ddbef0..c4ea8d78 100644
--- a/tests/mdparsetests.cpp
+++ b/tests/mdparsetests.cpp
@@ -11,9 +11,8 @@ TEST_CASE("Markdown Parser", "[md import]")
         {
         lily_of_the_valley::markdown_extract_text md;
         CHECK(md.is_metadata_section(nullptr) == false);
-        CHECK(md(nullptr, 100) == nullptr);
         CHECK(md.find_metadata_section_end(nullptr) == nullptr);
-        CHECK(md(L"some MD text", 0) == nullptr);
+        CHECK(md({ L"some MD text", 0 }) == nullptr);
         }
 
     SECTION("Meta Sections")
@@ -62,4 +61,103 @@ TEST_CASE("Markdown Parser", "[md import]")
         end = md.find_metadata_section_end(mdText);
         CHECK(end == mdText+5);
         }
+
+    SECTION("Newlines") // lone newline -> space; header line, blank line, or two trailing spaces -> break
+        {
+        lily_of_the_valley::markdown_extract_text md;
+        CHECK(std::wstring{ md({ L"# Header\nThis is\na line.\r\nThis is the same line.  \nThis is a new line.\r\n\r\nAnother line. \nSame line." }) } ==
+              std::wstring{ L"Header\n\nThis is a line. This is the same line.  \n\nThis is a new line.\n\nAnother line.  Same line." });
+        }
+
+    SECTION("Header") // leading #'s, =/- underline rows, and trailing {...} attributes are stripped
+        {
+        lily_of_the_valley::markdown_extract_text md;
+        CHECK(std::wstring{ md({ L"# Header1\n### Header2\n Not a #header" }) } ==
+              std::wstring{ L"Header1\n\nHeader2\n\n Not a #header" });
+        CHECK(std::wstring{ md({ L"Header1\n=========\nHeader2\n--\nNot a =header" }) } ==
+            std::wstring{ L"Header1\n\nHeader2\n\nNot a =header" });
+        CHECK(std::wstring{ md({ L"# Header1 {.unnumbered}\nText" }) } ==
+            std::wstring{ L"Header1 \n\nText" });
+        }
+
+    SECTION("Emphasis") // *, **, and __ markers are removed; an escaped \* stays as a literal *
+        {
+        lily_of_the_valley::markdown_extract_text md;
+        CHECK(std::wstring{ md({ L"This is *italic* and **bold** and also __italic__. 2 \\* 2." }) } ==
+              std::wstring{ L"This is italic and bold and also italic. 2 * 2." });
+        }
+
+    SECTION("Blockquoes") // each leading '>' on a line is expected to come out as one tab
+        {
+        lily_of_the_valley::markdown_extract_text md;
+        CHECK(std::wstring{ md({ L"> This is a quote\n\n>\n\n> End of Quote" }) } ==
+              std::wstring{ L"\tThis is a quote\n\n\t\n\n\tEnd of Quote" });
+        // nested quote (>>) yields two tabs
+        CHECK(std::wstring{ md({ L"> This is a quote\n\n>\n\n>> End of Quote" }) } ==
+            std::wstring{ L"\tThis is a quote\n\n\t\n\n\t\tEnd of Quote" });
+        // header inside a quote: the # is still stripped after the tab
+        CHECK(std::wstring{ md({ L"> # This is a quote header\n\n>\n\n>> End of Quote" }) } ==
+            std::wstring{ L"\tThis is a quote header\n\n\t\n\n\t\tEnd of Quote" });
+        }
+
+    SECTION("Inline code") // the backticks are dropped and the enclosed text is kept
+        {
+        lily_of_the_valley::markdown_extract_text md;
+        CHECK(std::wstring{ md({ L"This `is code`." }) } ==
+              std::wstring{ L"This is code." });
+        }
+
+    SECTION("Images") // the whole ![alt](target) tag is removed, alt text included
+        {
+        lily_of_the_valley::markdown_extract_text md;
+        CHECK(std::wstring{ md({ L"Tux ![Tux, the Linux mascot](/assets/tux.png) the penguin." }) } ==
+              std::wstring{ L"Tux  the penguin." });
+        // malformed (unterminated ] or )): output stops at the start of the tag
+        CHECK(std::wstring{ md({ L"Tux ![Tux, the Linux mascot" }) } ==
+            std::wstring{ L"Tux " });
+        CHECK(std::wstring{ md({ L"Tux ![Tux, the Linux mascot](/assets/tux.png" }) } ==
+            std::wstring{ L"Tux " });
+        }
+
+    SECTION("Links") // the label is kept (with its own styling stripped); the target is dropped
+        {
+        lily_of_the_valley::markdown_extract_text md;
+        CHECK(std::wstring{ md({ L"Tux [the Linux mascot](/assets/tux.png) the penguin." }) } ==
+              std::wstring{ L"Tux the Linux mascot the penguin." });
+        CHECK(std::wstring{ md({ L"Tux [the **Linux** mascot](/assets/tux.png) the penguin." }) } ==
+            std::wstring{ L"Tux the Linux mascot the penguin." });
+        // malformed (unterminated ] or )): output stops at the start of the link
+        CHECK(std::wstring{ md({ L"Tux [the Linux mascot" }) } ==
+            std::wstring{ L"Tux " });
+        CHECK(std::wstring{ md({ L"Tux [the Linux mascot](/assets/tux.png" }) } ==
+            std::wstring{ L"Tux " });
+        }
+
+    SECTION("Unordered Lists") // newline before a "- " item is kept; a wrapped item line joins with a space
+        {
+        lily_of_the_valley::markdown_extract_text md;
+        CHECK(std::wstring{ md({ L"- list one\nhere\n- Item 2\n\nSome -text." }) } ==
+              std::wstring{ L"- list one here\n- Item 2\n\nSome -text." });
+        }
+
+    SECTION("Unordered Nested Lists") // an indented "- " item keeps both its newline and its indentation
+        {
+        lily_of_the_valley::markdown_extract_text md;
+        CHECK(std::wstring{ md({ L"- list one\n  - Item 2" }) } ==
+              std::wstring{ L"- list one\n  - Item 2" });
+        }
+
+    SECTION("Ordered Lists") // newline before "<digits>." is kept; other single newlines become spaces
+        {
+        lily_of_the_valley::markdown_extract_text md;
+        CHECK(std::wstring{ md({ L"1. list one\nhere\n256. Item\n2" }) } ==
+              std::wstring{ L"1. list one here\n256. Item 2" });
+        }
+
+    SECTION("Table") // every '|' becomes a tab and the newline between rows is kept
+        {
+        lily_of_the_valley::markdown_extract_text md;
+        CHECK(std::wstring{ md({ L"| Syntax | Description |\n| --- | ----------- |\n| Header | Title |" }) } ==
+              std::wstring{ L"\t Syntax \t Description \t\n\t --- \t ----------- \t\n\t Header \t Title \t" });
+        }
     }
diff --git a/tools/files.cmake b/tools/files.cmake
index 9d749fd4..2b7d1d75 100644
--- a/tools/files.cmake
+++ b/tools/files.cmake
@@ -56,6 +56,7 @@ SET(WISTERIA_SRC
     src/import/doc_extract_text.cpp
     src/import/docx_extract_text.cpp
     src/import/html_extract_text.cpp
+    src/import/markdown_extract_text.cpp
     src/import/odt_odp_extract_text.cpp
     src/import/postscript_extract_text.cpp
     src/import/rtf_extract_text.cpp
diff --git a/tools/guilibfiles_testing.cmake b/tools/guilibfiles_testing.cmake
index 345a8115..9c210ec4 100644
--- a/tools/guilibfiles_testing.cmake
+++ b/tools/guilibfiles_testing.cmake
@@ -57,6 +57,7 @@ SET(WISTERIA_SRC
     ../../src/import/doc_extract_text.cpp
     ../../src/import/docx_extract_text.cpp
     ../../src/import/html_extract_text.cpp
+    ../../src/import/markdown_extract_text.cpp
     ../../src/import/odt_odp_extract_text.cpp
     ../../src/import/postscript_extract_text.cpp
     ../../src/import/rtf_extract_text.cpp
diff --git a/tools/libfiles.cmake b/tools/libfiles.cmake
index 5aa5cb9e..22a906a2 100644
--- a/tools/libfiles.cmake
+++ b/tools/libfiles.cmake
@@ -53,6 +53,7 @@ SET(WISTERIA_SRC
     src/graphs/wordcloud.cpp
     src/i18n-check/src/i18n_string_util.cpp
     src/import/html_extract_text.cpp
+    src/import/markdown_extract_text.cpp
     src/import/xlsx_extract_text.cpp
     src/ui/controls/thumbnail.cpp
     src/ui/dialogs/imageexportdlg.cpp