From 99e22d51becf63a37a4a54950fac00fe32f0f27e Mon Sep 17 00:00:00 2001
From: Blake-Madden
Date: Wed, 22 Nov 2023 06:51:25 -0500
Subject: [PATCH] Implement markdown parser

Handles most Markdown features, includes basic RMarkdown (bookdown) and Quarto features.
---
 src/import/markdown_extract_text.cpp | 496 +++++++++++++++++++++++++++
 src/import/markdown_extract_text.h   |  46 +--
 tests/CMakeLists.txt                 |   1 +
 tests/mdparsetests.cpp               | 102 +++++-
 tools/files.cmake                    |   1 +
 tools/guilibfiles_testing.cmake      |   1 +
 tools/libfiles.cmake                 |   1 +
 7 files changed, 603 insertions(+), 45 deletions(-)
 create mode 100644 src/import/markdown_extract_text.cpp

diff --git a/src/import/markdown_extract_text.cpp b/src/import/markdown_extract_text.cpp
new file mode 100644
index 00000000..559b1793
--- /dev/null
+++ b/src/import/markdown_extract_text.cpp
@@ -0,0 +1,496 @@
+#include "markdown_extract_text.h"
+
+/// Converts Markdown (including basic RMarkdown and Quarto constructs) into plain text.
+/// Returns the filtered text buffer, or null if @c md_text is empty or the output
+/// buffer cannot be allocated. A malformed construct is reported via log_message()
+/// and ends the parse at that point; text filtered so far is still returned.
+const wchar_t* lily_of_the_valley::markdown_extract_text::operator()(const std::wstring_view md_text)
+    {
+    clear_log();
+    if (md_text.empty())
+        {
+        set_filtered_text_length(0);
+        return nullptr;
+        }
+
+    if (m_subParser == nullptr)
+        { m_subParser = std::make_unique<markdown_extract_text>(); }
+
+    if (!allocate_text_buffer(md_text.length()))
+        {
+        set_filtered_text_length(0);
+        return nullptr;
+        }
+
+    // find the start of the text body and set up where we halt our searching
+    const wchar_t* const endSentinel = md_text.data() + md_text.length();
+    const wchar_t* start = md_text.data();
+    if (is_metadata_section(start))
+        { start = find_metadata_section_end(start); }
+    // in case metadata section ate up the whole file
+    // (or at least the part of the file requested to be reviewed)
+    // NOTE(review): this hands back a pointer into the input rather than the
+    // filtered-text buffer; kept as-is for callers, but confirm it is intended.
+    if (start >= endSentinel)
+        { return endSentinel; }
+    while (start != nullptr && start < endSentinel && *start != 0 && std::iswspace(*start))
+        { ++start; }
+
+    bool isEscaping{ false };
+    bool headerMode{ false };
+    wchar_t previousChar{ L'\n' };
+
+    while (start != nullptr && (start < endSentinel) && *start != 0)
+        {
+        if (*start == L'\\')
+            {
+            // Previous character was not \, but this one is.
+            // Skip and get ready to escape the next character.
+            if (!isEscaping)
+                {
+                // remove \index{} tags
+                if (std::wcsncmp(start, L"\\index{", 7) == 0)
+                    {
+                    start += 7;
+                    auto endOfTag = string_util::find_unescaped_matching_close_tag(start, L'{', L'}');
+                    if (endOfTag == nullptr)
+                        {
+                        log_message(L"Bad index{} command in markdown file.");
+                        break;
+                        }
+                    start = ++endOfTag;
+                    continue;
+                    }
+                // remove \@ref() cross references
+                if (std::wcsncmp(start, L"\\@ref(", 6) == 0)
+                    {
+                    start += 6;
+                    auto endOfTag = string_util::find_unescaped_matching_close_tag(start, L'(', L')');
+                    if (endOfTag == nullptr)
+                        {
+                        log_message(L"Bad cross reference command in markdown file.");
+                        break;
+                        }
+                    start = ++endOfTag;
+                    continue;
+                    }
+                else if (std::wcsncmp(start, L"\\newpage", 8) == 0)
+                    {
+                    start += 8;
+                    add_characters(L"\n\n");
+                    continue;
+                    }
+                // actually is an escape character
+                isEscaping = true;
+                previousChar = *start;
+                ++start;
+                continue;
+                }
+            }
+        // skip over header tags
+        else if (*start == L'#')
+            {
+            if (!isEscaping &&
+                (previousChar == L'\n' || previousChar == L'\r'))
+                {
+                while ((start < endSentinel) &&
+                       *start == L'#')
+                    { ++start; }
+                // space between # and header text
+                while ((start < endSentinel) &&
+                       (*start == L' ' || *start == L'\t'))
+                    { ++start; }
+                previousChar = (start < endSentinel) ? *start : previousChar;
+                headerMode = true;
+                continue;
+                }
+            }
+        // RMarkdown div fences
+        else if (*start == L':')
+            {
+            if (!isEscaping &&
+                (previousChar == L'\n' || previousChar == L'\r'))
+                {
+                // skip over the run of colons that makes up the fence
+                while ((start < endSentinel) &&
+                       (*start == L':'))
+                    { ++start; }
+                continue;
+                }
+            }
+        // block quotes
+        else if (*start == L'>')
+            {
+            if (!isEscaping &&
+                (previousChar == L'\n' || previousChar == L'\r'))
+                {
+                size_t tabCount{ 0 };
+                while ((start < endSentinel) &&
+                       *start == L'>')
+                    {
+                    ++tabCount;
+                    ++start;
+                    }
+                // space between > and quote text
+                while ((start < endSentinel) &&
+                       (*start == L' ' || *start == L'\t'))
+                    { ++start; }
+                add_character(L'\t', tabCount);
+                // Flags that we are still at the start of the line,
+                // so that headers and list items can still be parsed correctly.
+                previousChar = L'\n';
+                continue;
+                }
+            }
+        // code blocks
+        else if (*start == L'`')
+            {
+            // fenced section
+            if (!isEscaping && std::wcsncmp(start, L"```", 3) == 0)
+                {
+                start += 3;
+                auto endOfTag = std::wcsstr(start, L"```");
+                if (endOfTag == nullptr)
+                    {
+                    log_message(L"Bad fenced code block in markdown file.");
+                    break;
+                    }
+                start = endOfTag + 3;
+                continue;
+                }
+            // verbatim (inline) code
+            else if (!isEscaping)
+                {
+                // R code should be fully removed (or processed for known functions)
+                if (std::wcsncmp(start, L"`r keys(", 8) == 0)
+                    {
+                    start += 8;
+                    if (start + 1 < endSentinel &&
+                        (start[1] == L'\'' || start[1] == L'"'))
+                        { ++start; }
+                    if ((start < endSentinel) && (*start == L'\'' || *start == L'"'))
+                        {
+                        const auto quoteChar{ *start };
+                        auto endOfTag = string_util::find_unescaped_char(++start, quoteChar);
+                        if (endOfTag == nullptr)
+                            {
+                            log_message(L"Bad 'r keys' code block in markdown file.");
+                            break;
+                            }
+                        [[maybe_unused]] auto retval = m_subParser->operator()(
+                            { start, static_cast<size_t>(std::distance(start, endOfTag)) });
+                        add_character(L'"');
+                        add_characters(
+                            { m_subParser->get_filtered_text(), m_subParser->get_filtered_text_length() });
+                        add_character(L'"');
+                        if (m_subParser->get_filtered_text_length())
+                            {
+                            previousChar =
+                                m_subParser->get_filtered_text()[m_subParser->get_filtered_text_length() - 1];
+                            }
+                        endOfTag = string_util::find_unescaped_char(start, L'`');
+                        if (endOfTag == nullptr)
+                            {
+                            log_message(L"Bad 'r keys' code block in markdown file.");
+                            break;
+                            }
+                        start = ++endOfTag;
+                        }
+                    }
+                else if (std::wcsncmp(start, L"`r drop_cap(", 12) == 0)
+                    {
+                    start += 12;
+                    if (start + 1 < endSentinel &&
+                        (start[1] == L'\'' || start[1] == L'"'))
+                        { ++start; }
+                    if ((start < endSentinel) && (*start == L'\'' || *start == L'"'))
+                        {
+                        const auto quoteChar{ *start };
+                        auto endOfTag = string_util::find_unescaped_char(++start, quoteChar);
+                        if (endOfTag == nullptr)
+                            {
+                            log_message(L"Bad 'r dropcap' code block in markdown file.");
+                            break;
+                            }
+                        [[maybe_unused]] auto retval = m_subParser->operator()(
+                            { start, static_cast<size_t>(std::distance(start, endOfTag)) });
+                        add_character(L'"');
+                        add_characters(
+                            { m_subParser->get_filtered_text(), m_subParser->get_filtered_text_length() });
+                        add_character(L'"');
+                        if (m_subParser->get_filtered_text_length())
+                            {
+                            previousChar =
+                                m_subParser->get_filtered_text()[m_subParser->get_filtered_text_length() - 1];
+                            }
+                        endOfTag = string_util::find_unescaped_char(start, L'`');
+                        if (endOfTag == nullptr)
+                            {
+                            log_message(L"Bad 'r dropcap' code block in markdown file.");
+                            break;
+                            }
+                        start = ++endOfTag;
+                        }
+                    }
+                else if (std::wcsncmp(start, L"`r menu(", 8) == 0)
+                    {
+                    start += 8;
+                    if (start + 1 < endSentinel &&
+                        (start[0] == L'c') &&
+                        (start[1] == L'('))
+                        { start += 2; }
+                    if ((start < endSentinel) && (*start == L'\'' || *start == L'"'))
+                        {
+                        auto endOfTag = std::wcsstr(start, L")`");
+                        if (endOfTag == nullptr)
+                            {
+                            log_message(L"Bad 'r menu' code block in markdown file.");
+                            break;
+                            }
+                        [[maybe_unused]] auto retval = m_subParser->operator()(
+                            { start, static_cast<size_t>(std::distance(start, endOfTag-1)) });
+                        add_characters(
+                            { m_subParser->get_filtered_text(), m_subParser->get_filtered_text_length() });
+                        if (m_subParser->get_filtered_text_length())
+                            {
+                            previousChar =
+                                m_subParser->get_filtered_text()[m_subParser->get_filtered_text_length() - 1];
+                            }
+                        start = endOfTag + 2;
+                        }
+                    }
+                // read content as-is otherwise
+                else
+                    { ++start; }
+                }
+            }
+        // images
+        else if (*start == L'!')
+            {
+            if (!isEscaping &&
+                (start + 1 < endSentinel) &&
+                start[1] == L'[')
+                {
+                start += 2;
+                auto endOfTag = string_util::find_unescaped_matching_close_tag(start, L'[', L']');
+                if (endOfTag == nullptr)
+                    {
+                    log_message(L"Bad image command in markdown file.");
+                    break;
+                    }
+                start = ++endOfTag;
+                if ((start < endSentinel) && *start == L'(')
+                    {
+                    endOfTag = string_util::find_unescaped_matching_close_tag(++start, L'(', L')');
+                    if (endOfTag == nullptr)
+                        {
+                        log_message(L"Bad image command in markdown file.");
+                        break;
+                        }
+                    start = ++endOfTag;
+                    }
+                continue;
+                }
+            }
+        // links
+        else if (*start == L'[')
+            {
+            if (!isEscaping)
+                {
+                auto labelStart{ ++start };
+                auto endOfTag = string_util::find_unescaped_matching_close_tag(start, L'[', L']');
+                if (endOfTag == nullptr)
+                    {
+                    log_message(L"Bad link command in markdown file.");
+                    break;
+                    }
+                start = ++endOfTag;
+                if ((start < endSentinel) && *start == L'(')
+                    {
+                    auto labelEnd{ start - 1 };
+                    endOfTag = string_util::find_unescaped_matching_close_tag(++start, L'(', L')');
+                    if (endOfTag == nullptr)
+                        {
+                        log_message(L"Bad link command in markdown file.");
+                        break;
+                        }
+                    start = ++endOfTag;
+                    if (labelStart < labelEnd)
+                        {
+                        [[maybe_unused]] auto retval = m_subParser->operator()(
+                            { labelStart, static_cast<size_t>(std::distance(labelStart, labelEnd)) });
+                        add_characters(
+                            { m_subParser->get_filtered_text(), m_subParser->get_filtered_text_length() });
+                        if (m_subParser->get_filtered_text_length())
+                            {
+                            previousChar =
+                                m_subParser->get_filtered_text()[m_subParser->get_filtered_text_length() - 1];
+                            }
+                        }
+                    }
+                continue;
+                }
+            }
+        // IDs
+        else if (*start == L'{')
+            {
+            if (!isEscaping)
+                {
+                // if quarto syntax
+                if (std::wcsncmp(start, L"{{< pagebreak >}}", 17) == 0)
+                    {
+                    start += 17;
+                    add_characters(L"\n\n");
+                    }
+                else
+                    {
+                    ++start;
+                    auto endOfTag =
+                        string_util::find_unescaped_matching_close_tag(start, L'{', L'}');
+                    if (endOfTag == nullptr)
+                        {
+                        log_message(L"Bad ID command in markdown file.");
+                        break;
+                        }
+                    start = ++endOfTag;
+                    }
+                continue;
+                }
+            }
+        // newlines
+        else if (*start == L'\n' || *start == L'\r')
+            {
+            // two (or more) spaces at the end of a line indicates a paragraph break
+            size_t newlineCount{ 0 };
+            if (previousChar == L' ' &&
+                std::distance(md_text.data(), start) > 2 &&
+                *(start-2) == L' ')
+                { ++newlineCount; }
+            // count the newlines (taking CRLF combos into account)
+            while ((start < endSentinel) &&
+                   (*start == L'\n' || *start == L'\r'))
+                {
+                if (*start == L'\r' &&
+                    (start + 1 < endSentinel) &&
+                    start[1] == L'\n')
+                    {
+                    ++start;
+                    continue;
+                    }
+                ++newlineCount;
+                ++start;
+                // If the next line is a header line divider, then
+                // skip that, switch to header mode, and keep reading
+                // any more newlines
+                if (start + 1 < endSentinel &&
+                    ((start[0] == L'=' && start[1] == L'=') ||
+                     (start[0] == L'-' && start[1] == L'-')))
+                    {
+                    while ((start < endSentinel) &&
+                           (*start == L'=' || *start == L'-'))
+                        { ++start; }
+                    headerMode = true;
+                    }
+                }
+
+            auto scanAhead{ start };
+            while ((scanAhead < endSentinel) &&
+                   (*scanAhead == L' ' || *scanAhead == L'\t'))
+                { ++scanAhead; }
+            // next line starts a list item, quoteblock, table, etc., so keep the newline as-is
+            if (newlineCount == 1 &&
+                string_util::is_one_of(*scanAhead, L">-*+|:^"))
+                {
+                add_character(L'\n', newlineCount);
+                previousChar = L'\n';
+                }
+            // same for an ordered list
+            else if (newlineCount == 1 &&
+                     (start < endSentinel) &&
+                     std::iswdigit(*start))
+                {
+                auto scanAheadDigit{ start };
+                while ((scanAheadDigit < endSentinel) &&
+                       std::iswdigit(*scanAheadDigit))
+                    { ++scanAheadDigit; }
+                if (*scanAheadDigit == L'.')
+                    {
+                    add_character(L'\n', newlineCount);
+                    previousChar = L'\n';
+                    }
+                // not an ordered list, default behavior to read as space
+                else
+                    {
+                    add_character(L' ');
+                    previousChar = L' ';
+                    }
+                }
+            // a single newline not a end of a self-contained line
+            // (e.g., a header) is seen as a space
+            else if (newlineCount == 1 && !headerMode)
+                {
+                add_character(L' ');
+                previousChar = L' ';
+                }
+            else if (newlineCount == 1 && headerMode)
+                {
+                add_characters(L"\n\n");
+                previousChar = L'\n';
+                }
+            else
+                {
+                add_character(L'\n', newlineCount);
+                previousChar = L'\n';
+                }
+            headerMode = false;
+            isEscaping = false;
+            continue;
+            }
+        // styling tags that just get removed from raw text
+        else if (!isEscaping &&
+                 (*start == L'*' || *start == L'_' || *start == L'~') )
+            {
+            while ((start < endSentinel) &&
+                   (*start == L'*' || *start == L'_' || *start == L'~'))
+                { ++start; }
+            }
+        // table
+        else if (!isEscaping &&
+                 (*start == L'|') )
+            {
+            previousChar = L'\t';
+            add_character(L'\t');
+            ++start;
+            continue;
+            }
+        // newline hacks found in tables (just replace with space to keep the table structure).
+        else if (std::wcsncmp(start, L"<br>\\linebreak", 14) == 0)
+            {
+            start += 14;
+            previousChar = L' ';
+            add_character(L' ');
+            }
+        // RMarkdown (Pandoc) comment
+        else if (std::wcsncmp(start, L"<!--", 4) == 0)
+            {
+            auto endOfTag = std::wcsstr(start, L"-->");
+            if (endOfTag == nullptr)
+                {
+                log_message(L"Bad comment block in markdown file.");
+                break;
+                }
+            start = endOfTag + 3;
+            }
+        // Several branches above advance past markup and then fall through to here;
+        // stop rather than copying a character from beyond the end of the input.
+        if (start >= endSentinel || *start == 0)
+            { break; }
+        // turn off escaping and load the character
+        isEscaping = false;
+        previousChar = *start;
+        add_character(*start);
+        ++start;
+        }
+
+    return get_filtered_text();
+    }
diff --git a/src/import/markdown_extract_text.h b/src/import/markdown_extract_text.h
index 86973d66..e11c6914 100644
--- a/src/import/markdown_extract_text.h
+++ b/src/import/markdown_extract_text.h
@@ -18,56 +18,14 @@
 namespace lily_of_the_valley
     {
     /// @brief Extracts plain text from a Markdown file.
-    /// @warning This is in alpha state. Do not use in productino.
     class markdown_extract_text final : public html_extract_text
         {
     public:
         /** @brief Main interface for extracting plain text from a Markdown file.
             @param md_text The markdown text to parse.
-            @param text_length The length of the text.
@returns The parsed text from the Markdown stream.*/ [[nodiscard]] - const wchar_t* operator()(const wchar_t* md_text, - const size_t text_length) - { - clear_log(); - if (md_text == nullptr || md_text[0] == 0 || text_length == 0) - { - set_filtered_text_length(0); - return nullptr; - } - - if (!allocate_text_buffer(text_length)) - { - set_filtered_text_length(0); - return nullptr; - } - - // find the start of the text body and set up where we halt our searching - const wchar_t* const endSentinel = md_text+text_length; - if (is_metadata_section(md_text)) - { md_text = find_metadata_section_end(md_text); } - while (md_text < endSentinel && md_text[0] != 0 && std::iswspace(md_text[0])) - { ++md_text; } - // in case metadata section ate up the whole file - // (or at least the part of the file requested to be reviewed) - if (md_text >= endSentinel) - { return endSentinel; } - const wchar_t* start = md_text; - const wchar_t* end = md_text; - - while (end != nullptr && end[0] != 0 && (end < endSentinel)) - { ++end; } - // pick up any remaining text at the end of the text stream.. 
- if (end != nullptr && end[0] == 0) - { add_characters(start, end-start); } - // ...or if end is beyond where we should go in the buffer, - // copy up to the end of the buffer - else if (end >= endSentinel) - { add_characters(start, endSentinel-start); } - - return get_filtered_text(); - } + const wchar_t* operator()(const std::wstring_view md_text); #ifndef __UNITTEST private: #endif @@ -129,6 +87,8 @@ namespace lily_of_the_valley followed by colon.*/ std::wregex m_metadataSectionStart { LR"(^(%[[:space:]]+|---|[[:alpha:]]+([[:space:]]{1}[[:alpha:]]+)?[:]).*$)" }; + + std::unique_ptr m_subParser{ nullptr }; }; } diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 5bc20179..6cc38738 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -36,6 +36,7 @@ SET(LIB_SRC_FILES ../src/import/docx_extract_text.cpp ../src/import/html_extract_text.cpp ../src/import/odt_odp_extract_text.cpp + ../src/import/markdown_extract_text.cpp ../src/import/postscript_extract_text.cpp ../src/import/xlsx_extract_text.cpp ../src/import/rtf_extract_text.cpp) diff --git a/tests/mdparsetests.cpp b/tests/mdparsetests.cpp index 41ddbef0..c4ea8d78 100644 --- a/tests/mdparsetests.cpp +++ b/tests/mdparsetests.cpp @@ -11,9 +11,8 @@ TEST_CASE("Markdown Parser", "[md import]") { lily_of_the_valley::markdown_extract_text md; CHECK(md.is_metadata_section(nullptr) == false); - CHECK(md(nullptr, 100) == nullptr); CHECK(md.find_metadata_section_end(nullptr) == nullptr); - CHECK(md(L"some MD text", 0) == nullptr); + CHECK(md({ L"some MD text", 0 }) == nullptr); } SECTION("Meta Sections") @@ -62,4 +61,103 @@ TEST_CASE("Markdown Parser", "[md import]") end = md.find_metadata_section_end(mdText); CHECK(end == mdText+5); } + + SECTION("Newlines") + { + lily_of_the_valley::markdown_extract_text md; + CHECK(std::wstring{ md({ L"# Header\nThis is\na line.\r\nThis is the same line. \nThis is a new line.\r\n\r\nAnother line. \nSame line." }) } == + std::wstring{ L"Header\n\nThis is a line. 
This is the same line. \n\nThis is a new line.\n\nAnother line. Same line." }); + } + + SECTION("Header") + { + lily_of_the_valley::markdown_extract_text md; + CHECK(std::wstring{ md({ L"# Header1\n### Header2\n Not a #header" }) } == + std::wstring{ L"Header1\n\nHeader2\n\n Not a #header" }); + CHECK(std::wstring{ md({ L"Header1\n=========\nHeader2\n--\nNot a =header" }) } == + std::wstring{ L"Header1\n\nHeader2\n\nNot a =header" }); + CHECK(std::wstring{ md({ L"# Header1 {.unnumbered}\nText" }) } == + std::wstring{ L"Header1 \n\nText" }); + } + + SECTION("Emphasis") + { + lily_of_the_valley::markdown_extract_text md; + CHECK(std::wstring{ md({ L"This is *italic* and **bold** and also __italic__. 2 \\* 2." }) } == + std::wstring{ L"This is italic and bold and also italic. 2 * 2." }); + } + + SECTION("Blockquoes") + { + lily_of_the_valley::markdown_extract_text md; + CHECK(std::wstring{ md({ L"> This is a quote\n\n>\n\n> End of Quote" }) } == + std::wstring{ L"\tThis is a quote\n\n\t\n\n\tEnd of Quote" }); + // nested + CHECK(std::wstring{ md({ L"> This is a quote\n\n>\n\n>> End of Quote" }) } == + std::wstring{ L"\tThis is a quote\n\n\t\n\n\t\tEnd of Quote" }); + // with header + CHECK(std::wstring{ md({ L"> # This is a quote header\n\n>\n\n>> End of Quote" }) } == + std::wstring{ L"\tThis is a quote header\n\n\t\n\n\t\tEnd of Quote" }); + } + + SECTION("Inline code") + { + lily_of_the_valley::markdown_extract_text md; + CHECK(std::wstring{ md({ L"This `is code`." }) } == + std::wstring{ L"This is code." }); + } + + SECTION("Images") + { + lily_of_the_valley::markdown_extract_text md; + CHECK(std::wstring{ md({ L"Tux ![Tux, the Linux mascot](/assets/tux.png) the penguin." }) } == + std::wstring{ L"Tux the penguin." 
}); + // malformed + CHECK(std::wstring{ md({ L"Tux ![Tux, the Linux mascot" }) } == + std::wstring{ L"Tux " }); + CHECK(std::wstring{ md({ L"Tux ![Tux, the Linux mascot](/assets/tux.png" }) } == + std::wstring{ L"Tux " }); + } + + SECTION("Links") + { + lily_of_the_valley::markdown_extract_text md; + CHECK(std::wstring{ md({ L"Tux [the Linux mascot](/assets/tux.png) the penguin." }) } == + std::wstring{ L"Tux the Linux mascot the penguin." }); + CHECK(std::wstring{ md({ L"Tux [the **Linux** mascot](/assets/tux.png) the penguin." }) } == + std::wstring{ L"Tux the Linux mascot the penguin." }); + // malformed + CHECK(std::wstring{ md({ L"Tux [the Linux mascot" }) } == + std::wstring{ L"Tux " }); + CHECK(std::wstring{ md({ L"Tux [the Linux mascot](/assets/tux.png" }) } == + std::wstring{ L"Tux " }); + } + + SECTION("Unordered Lists") + { + lily_of_the_valley::markdown_extract_text md; + CHECK(std::wstring{ md({ L"- list one\nhere\n- Item 2\n\nSome -text." }) } == + std::wstring{ L"- list one here\n- Item 2\n\nSome -text." }); + } + + SECTION("Unordered Nested Lists") + { + lily_of_the_valley::markdown_extract_text md; + CHECK(std::wstring{ md({ L"- list one\n - Item 2" }) } == + std::wstring{ L"- list one\n - Item 2" }); + } + + SECTION("Ordered Lists") + { + lily_of_the_valley::markdown_extract_text md; + CHECK(std::wstring{ md({ L"1. list one\nhere\n256. Item\n2" }) } == + std::wstring{ L"1. list one here\n256. 
Item 2" }); + } + + SECTION("Table") + { + lily_of_the_valley::markdown_extract_text md; + CHECK(std::wstring{ md({ L"| Syntax | Description |\n| --- | ----------- |\n| Header | Title |" }) } == + std::wstring{ L"\t Syntax \t Description \t\n\t --- \t ----------- \t\n\t Header \t Title \t" }); + } } diff --git a/tools/files.cmake b/tools/files.cmake index 9d749fd4..2b7d1d75 100644 --- a/tools/files.cmake +++ b/tools/files.cmake @@ -56,6 +56,7 @@ SET(WISTERIA_SRC src/import/doc_extract_text.cpp src/import/docx_extract_text.cpp src/import/html_extract_text.cpp + src/import/markdown_extract_text.cpp src/import/odt_odp_extract_text.cpp src/import/postscript_extract_text.cpp src/import/rtf_extract_text.cpp diff --git a/tools/guilibfiles_testing.cmake b/tools/guilibfiles_testing.cmake index 345a8115..9c210ec4 100644 --- a/tools/guilibfiles_testing.cmake +++ b/tools/guilibfiles_testing.cmake @@ -57,6 +57,7 @@ SET(WISTERIA_SRC ../../src/import/doc_extract_text.cpp ../../src/import/docx_extract_text.cpp ../../src/import/html_extract_text.cpp + ../../src/import/markdown_extract_text.cpp ../../src/import/odt_odp_extract_text.cpp ../../src/import/postscript_extract_text.cpp ../../src/import/rtf_extract_text.cpp diff --git a/tools/libfiles.cmake b/tools/libfiles.cmake index 5aa5cb9e..22a906a2 100644 --- a/tools/libfiles.cmake +++ b/tools/libfiles.cmake @@ -53,6 +53,7 @@ SET(WISTERIA_SRC src/graphs/wordcloud.cpp src/i18n-check/src/i18n_string_util.cpp src/import/html_extract_text.cpp + src/import/markdown_extract_text.cpp src/import/xlsx_extract_text.cpp src/ui/controls/thumbnail.cpp src/ui/dialogs/imageexportdlg.cpp