From dd80f23229dd69a138e42934522e89bbd33fb79c Mon Sep 17 00:00:00 2001 From: Valery Mironov <32071355+MBkkt@users.noreply.github.com> Date: Sat, 16 Sep 2023 13:55:05 +0200 Subject: [PATCH] Improves --- .../text/include/boost/text/word_break.hpp | 397 ++---------------- 1 file changed, 34 insertions(+), 363 deletions(-) diff --git a/external/text/include/boost/text/word_break.hpp b/external/text/include/boost/text/word_break.hpp index f53f00732..4f5fc8e37 100644 --- a/external/text/include/boost/text/word_break.hpp +++ b/external/text/include/boost/text/word_break.hpp @@ -19,11 +19,11 @@ namespace boost { namespace text { /** The word properties defined by Unicode. */ - enum word_property { - Other, + enum word_property : uint32_t { CR, LF, Newline, + Other, Katakana, ALetter, MidLetter, @@ -39,7 +39,7 @@ namespace boost { namespace text { WSegSpace, Format, Extend, - ZWJ + ZWJ, }; namespace detail { @@ -141,14 +141,12 @@ namespace boost { namespace text { namespace detail { inline bool skippable(word_property prop) noexcept { - return prop == word_property::Extend || - prop == word_property::Format || prop == word_property::ZWJ; + return word_property::Format <= prop; } inline bool linebreak(word_property prop) noexcept { - return prop == word_property::CR || prop == word_property::LF || - prop == word_property::Newline; + return prop <= word_property::Newline; } inline bool ah_letter(word_property prop) noexcept @@ -209,7 +207,7 @@ namespace boost { namespace text { bool it_points_to_prev = false; - std::array caps; + std::array caps; word_break_emoji_state_t emoji_state; }; @@ -248,28 +246,28 @@ namespace boost { namespace text { // clang-format off // See chart at http://www.unicode.org/Public/UCD/latest/ucd/auxiliary/WordBreakTest.html. -constexpr std::array, 20> word_breaks = {{ -// Other CR LF NL Ktk AL ML MN MNL Num ENL RI HL DQ SQ EP WSSp Fmt Extd ZWJ - {{1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0}}, // Other - {{1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}}, // CR - {{1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}}, // LF - {{1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}}, // Newline - {{1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0}}, // Katakana - {{1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0}}, // ALetter - {{1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0}}, // MidLetter - {{1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0}}, // MidNum - {{1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0}}, // MidNumLet - {{1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0}}, // Numeric - {{1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0}}, // ExtendNumLet - {{1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0}}, // RI - {{1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0}}, // Hebrew_Letter - {{1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0}}, // Double_Quote - {{1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0}}, // Single_Quote - {{1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0}}, // ExtPict - {{1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0}}, // WSegSpace - {{1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0}}, // Format - {{1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0}}, // Extend - {{1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0}}, // ZWJ +static constexpr std::array, 20> word_breaks = {{ +// CR LF NL Other Ktk AL ML MN MNL Num ENL RI HL DQ SQ EP WSSp Fmt Extd ZWJ + {{1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}}, // CR + {{1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}}, // LF + {{1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}}, // Newline + {{1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0}}, // Other + {{1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0}}, // Katakana + {{1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0}}, // ALetter + {{1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0}}, // MidLetter + {{1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0}}, // MidNum + {{1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0}}, // MidNumLet + {{1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0}}, // Numeric + {{1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0}}, // ExtendNumLet + {{1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0}}, // RI + {{1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0}}, // Hebrew_Letter + {{1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0}}, // Double_Quote + {{1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0}}, // Single_Quote + {{1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0}}, // ExtPict + {{1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0}}, // WSegSpace + {{1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0}}, // Format + {{1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0}}, // Extend + {{1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0}}, // ZWJ }}; // clang-format on auto const lhs_int = static_cast(lhs); @@ -277,8 +275,7 @@ constexpr std::array, 20> word_breaks = {{ return word_breaks[lhs_int][rhs_int]; } - // WB4: Except after line breaks, ignore/skip (Extend | Format | - // ZWJ)* + // WB4: Except after line breaks, ignore/skip (Extend | Format | ZWJ)* template void skip_forward( word_break_state& state, @@ -367,329 +364,6 @@ constexpr std::array, 20> word_breaks = {{ WordPropFunc const & word_prop = WordPropFunc{}, WordBreakFunc const & word_break = WordBreakFunc{}) noexcept { - using detail::ph; - using detail::cp_and_word_prop; - - if (it == first) - return it; - - if (it == last && --it == first) - return it; - - detail::word_break_state state; - - state.it = it; - - state.caps[ph::curr] = cp_and_word_prop(*state.it, word_prop); - - // Since word_break is evaluated unconditionally before the other - // rules, we need to do all this here before the special-casing - // below. - if (it != first) { - state.caps[ph::prev] = - cp_and_word_prop(*std::prev(state.it), word_prop); - if (std::prev(state.it) != first) { - state.caps[ph::prev_prev] = - cp_and_word_prop(*std::prev(state.it, 2), word_prop); - } - } - if (std::next(state.it) != last) { - state.caps[ph::next] = - cp_and_word_prop(*std::next(state.it), word_prop); - if (std::next(state.it, 2) != last) { - state.caps[ph::next_next] = - cp_and_word_prop(*std::next(state.it, 2), word_prop); - } - } - if (word_break( - state.caps[ph::prev_prev].cp, - state.caps[ph::prev].cp, - state.caps[ph::curr].cp, - state.caps[ph::next].cp, - state.caps[ph::next_next].cp)) { - return state.it; - } - - // Special case: If state.caps[ph::curr].prop is skippable, we - // need to skip backward until we find a non-skippable. - if (detail::skippable(state.caps[ph::curr].prop)) { - auto const prev = boost::text::find_if_not_backward( - first, it, [word_prop](uint32_t cp) { - return detail::skippable(word_prop(cp)); - }); - if (prev != it) { - state.it = prev; - state.caps[ph::next] = - cp_and_word_prop(*std::next(state.it), word_prop); - state.caps[ph::curr] = - cp_and_word_prop(*state.it, word_prop); - - // If we end up on a non-skippable that should break - // before the skippable(s) we just moved over, break on - // the last skippable. - if (!detail::skippable(state.caps[ph::curr].prop) && - detail::table_word_break( - state.caps[ph::curr].prop, - state.caps[ph::next].prop)) { - return ++state.it; - } - if (state.it == first) - return first; - } - } - - state.caps[ph::prev_prev] = cp_and_word_prop(); - if (std::prev(state.it) != first) { - state.caps[ph::prev_prev] = - cp_and_word_prop(*std::prev(state.it, 2), word_prop); - } - state.caps[ph::prev] = - cp_and_word_prop(*std::prev(state.it), word_prop); - state.caps[ph::next] = cp_and_word_prop(); - state.caps[ph::next_next] = cp_and_word_prop(); - if (std::next(state.it) != last) { - state.caps[ph::next] = - cp_and_word_prop(*std::next(state.it), word_prop); - if (std::next(state.it, 2) != last) { - state.caps[ph::next_next] = - cp_and_word_prop(*std::next(state.it, 2), word_prop); - } - } - - // Since 'it' may be anywhere within the word in which it sits, - // we need to look forward to make sure that next_prop and - // next_next_prop don't point to skippables. - { - if (std::next(state.it) != last) { - auto temp_state = detail::next(state); - detail::skip_forward(temp_state, first, last, word_prop); - if (temp_state.it == last) { - state.caps[ph::next] = cp_and_word_prop(); - state.caps[ph::next_next] = cp_and_word_prop(); - } else { - state.caps[ph::next] = temp_state.caps[ph::curr]; - if (std::next(temp_state.it) != last) { - temp_state = detail::next(temp_state); - detail::skip_forward(temp_state, first, last, word_prop); - if (temp_state.it == last) { - state.caps[ph::next_next] = cp_and_word_prop(); - } else { - state.caps[ph::next_next] = - temp_state.caps[ph::curr]; - } - } - } - } - } - - state.emoji_state = detail::word_break_emoji_state_t::none; - - // WB4: Except after line breaks, ignore/skip (Extend | Format | - // ZWJ)* - auto skip = [word_prop]( - detail::word_break_state state, - CPIter first) { - if (detail::skippable(state.caps[ph::prev].prop)) { - auto temp_it = boost::text::find_if_not_backward( - first, state.it, [word_prop](uint32_t cp) { - return detail::skippable(word_prop(cp)); - }); - if (temp_it == state.it) - return state; - auto temp_prev_cap = cp_and_word_prop(*temp_it, word_prop); - if (!detail::linebreak(temp_prev_cap.prop)) { - state.it = temp_it; - state.it_points_to_prev = true; - state.caps[ph::prev] = temp_prev_cap; - if (temp_it == first) { - state.caps[ph::prev_prev] = cp_and_word_prop(); - } else { - state.caps[ph::prev_prev] = cp_and_word_prop( - *std::prev(temp_it), word_prop); - } - } - } - return state; - }; - - for (; state.it != first; state = detail::prev(state)) { - if (std::prev(state.it) != first) { - state.caps[ph::prev_prev] = - cp_and_word_prop(*std::prev(state.it, 2), word_prop); - } else { - state.caps[ph::prev_prev] = cp_and_word_prop(); - } - - // Check word_break before anything else. - if (word_break( - state.caps[ph::prev_prev].cp, - state.caps[ph::prev].cp, - state.caps[ph::curr].cp, - state.caps[ph::next].cp, - state.caps[ph::next_next].cp)) { - return state.it; - } - - // When we see an RI, back up to the first RI so we can see - // what emoji state we're supposed to be in here. - if (state.emoji_state == - detail::word_break_emoji_state_t::none && - state.caps[ph::curr].prop == - word_property::Regional_Indicator) { - auto temp_state = state; - int ris_before = 0; - while (temp_state.it != first) { - temp_state = skip(temp_state, first); - if (temp_state.it == first) { - if (temp_state.caps[ph::prev].prop == - word_property::Regional_Indicator) { - ++ris_before; - } - break; - } - if (temp_state.caps[ph::prev].prop == - word_property::Regional_Indicator) { - temp_state = detail::prev(temp_state); - if (temp_state.it != first && - std::prev(temp_state.it) != first) { - temp_state.caps[ph::prev_prev] = - cp_and_word_prop( - *std::prev(temp_state.it, 2), - word_prop); - } else { - temp_state.caps[ph::prev_prev] = - cp_and_word_prop(); - } - ++ris_before; - } else { - break; - } - } - state.emoji_state = - (ris_before % 2 == 0) - ? detail::word_break_emoji_state_t::first_emoji - : detail::word_break_emoji_state_t::second_emoji; - } - - // WB3 - if (state.caps[ph::prev].prop == word_property::CR && - state.caps[ph::curr].prop == word_property::LF) { - continue; - } - - // WB3a - if (state.caps[ph::prev].prop == word_property::CR || - state.caps[ph::prev].prop == word_property::LF || - state.caps[ph::prev].prop == word_property::Newline) { - return state.it; - } - - // WB3b - if (state.caps[ph::curr].prop == word_property::CR || - state.caps[ph::curr].prop == word_property::LF || - state.caps[ph::curr].prop == word_property::Newline) { - return state.it; - } - - // WB3c - if (state.caps[ph::prev].prop == word_property::ZWJ && - state.caps[ph::curr].prop == word_property::ExtPict) { - continue; - } - - // WB3d - if (state.caps[ph::prev].prop == word_property::WSegSpace && - state.caps[ph::curr].prop == word_property::WSegSpace) { - continue; - } - - // If we end up breaking durign this iteration, we want the - // break to show up after the skip, so that the skippable - // CPs go with the CP before them. This is to maintain - // symmetry with next_word_break(). - auto after_skip_it = state.it; - - // Puting this here means not having to do it explicitly - // below between prev_prop and prop (and transitively, - // between prop and next_prop). - state = skip(state, first); - - // WB6 - if (detail::ah_letter(state.caps[ph::prev].prop) && - detail::mid_ah(state.caps[ph::curr].prop) && - detail::ah_letter(state.caps[ph::next].prop)) { - continue; - } - - // WB7 - if (detail::mid_ah(state.caps[ph::prev].prop) && - detail::ah_letter(state.caps[ph::curr].prop) && - state.it != first) { - auto const temp_state = skip(detail::prev(state), first); - if (detail::ah_letter(temp_state.caps[ph::prev].prop)) - continue; - } - - // WB7b - if (state.caps[ph::prev].prop == word_property::Hebrew_Letter && - state.caps[ph::curr].prop == word_property::Double_Quote && - state.caps[ph::next].prop == word_property::Hebrew_Letter) { - continue; - } - - // WB7c - if (state.caps[ph::prev].prop == word_property::Double_Quote && - state.caps[ph::curr].prop == word_property::Hebrew_Letter && - state.it != first) { - auto const temp_state = skip(detail::prev(state), first); - if (temp_state.caps[ph::prev].prop == - word_property::Hebrew_Letter) - continue; - } - - // WB11 - if (detail::mid_num(state.caps[ph::prev].prop) && - state.caps[ph::curr].prop == word_property::Numeric && - state.it != first) { - auto const temp_state = skip(detail::prev(state), first); - if (temp_state.caps[ph::prev].prop == - word_property::Numeric) - continue; - } - - // WB12 - if (state.caps[ph::prev].prop == word_property::Numeric && - detail::mid_num(state.caps[ph::curr].prop) && - state.caps[ph::next].prop == word_property::Numeric) { - continue; - } - - if (state.emoji_state == - detail::word_break_emoji_state_t::first_emoji) { - if (state.caps[ph::prev].prop == - word_property::Regional_Indicator) { - state.emoji_state = - detail::word_break_emoji_state_t::second_emoji; - return after_skip_it; - } else { - state.emoji_state = - detail::word_break_emoji_state_t::none; - } - } else if ( - state.emoji_state == - detail::word_break_emoji_state_t::second_emoji && - state.caps[ph::prev].prop == - word_property::Regional_Indicator) { - state.emoji_state = - detail::word_break_emoji_state_t::first_emoji; - continue; - } - - if (detail::table_word_break( - state.caps[ph::prev].prop, state.caps[ph::curr].prop)) - return after_skip_it; - } - return first; } @@ -718,8 +392,10 @@ constexpr std::array, 20> word_breaks = {{ if (++state.it == last) return state.it; + state.caps[ph::prev_prev] = cp_and_word_prop(); state.caps[ph::prev] = cp_and_word_prop(*first, word_prop); state.caps[ph::curr] = cp_and_word_prop(*state.it, word_prop); + state.caps[ph::next] = cp_and_word_prop(); state.it_next = std::next(state.it); state.emoji_state = @@ -739,16 +415,12 @@ constexpr std::array, 20> word_breaks = {{ } // WB3a - if (state.caps[ph::prev].prop == word_property::CR || - state.caps[ph::prev].prop == word_property::LF || - state.caps[ph::prev].prop == word_property::Newline) { + if (detail::linebreak(state.caps[ph::prev].prop)) { return state.it; } // WB3b - if (state.caps[ph::curr].prop == word_property::CR || - state.caps[ph::curr].prop == word_property::LF || - state.caps[ph::curr].prop == word_property::Newline) { + if (detail::linebreak(state.caps[ph::curr].prop)) { return state.it; } @@ -804,8 +476,7 @@ constexpr std::array, 20> word_breaks = {{ } // WB7c - if (state.caps[ph::prev_prev].prop == - word_property::Hebrew_Letter && + if (state.caps[ph::prev_prev].prop == word_property::Hebrew_Letter && state.caps[ph::prev].prop == word_property::Double_Quote && state.caps[ph::curr].prop == word_property::Hebrew_Letter) { continue;