From dd80f23229dd69a138e42934522e89bbd33fb79c Mon Sep 17 00:00:00 2001
From: Valery Mironov <32071355+MBkkt@users.noreply.github.com>
Date: Sat, 16 Sep 2023 13:55:05 +0200
Subject: [PATCH] Reorder word_property for range checks; remove backward word-break body

---
 .../text/include/boost/text/word_break.hpp    | 397 ++----------------
 1 file changed, 34 insertions(+), 363 deletions(-)

diff --git a/external/text/include/boost/text/word_break.hpp b/external/text/include/boost/text/word_break.hpp
index f53f00732..4f5fc8e37 100644
--- a/external/text/include/boost/text/word_break.hpp
+++ b/external/text/include/boost/text/word_break.hpp
@@ -19,11 +19,11 @@
 
 namespace boost { namespace text {
     /** The word properties defined by Unicode. */
-    enum word_property {
-        Other,
+    enum word_property : uint32_t {
         CR,
         LF,
         Newline,
+        Other,
         Katakana,
         ALetter,
         MidLetter,
@@ -39,7 +39,7 @@ namespace boost { namespace text {
         WSegSpace,
         Format,
         Extend,
-        ZWJ
+        ZWJ,
     };
 
     namespace detail {
@@ -141,14 +141,12 @@ namespace boost { namespace text {
     namespace detail {
         inline bool skippable(word_property prop) noexcept
         {
-            return prop == word_property::Extend ||
-                   prop == word_property::Format || prop == word_property::ZWJ;
+            return word_property::Format <= prop; // Format, Extend, ZWJ are declared last
         }
 
         inline bool linebreak(word_property prop) noexcept
         {
-            return prop == word_property::CR || prop == word_property::LF ||
-                   prop == word_property::Newline;
+            return prop <= word_property::Newline; // CR, LF, Newline are declared first
         }
 
         inline bool ah_letter(word_property prop) noexcept
@@ -209,7 +207,7 @@ namespace boost { namespace text {
 
             bool it_points_to_prev = false;
 
-            std::array<cp_and_word_prop, 5> caps;
+            std::array<cp_and_word_prop, 4> caps;
 
             word_break_emoji_state_t emoji_state;
         };
@@ -248,28 +246,28 @@ namespace boost { namespace text {
 
             // clang-format off
 // See chart at http://www.unicode.org/Public/UCD/latest/ucd/auxiliary/WordBreakTest.html.
-constexpr std::array<std::array<bool, 20>, 20> word_breaks = {{
-// Other CR LF NL Ktk AL ML MN MNL Num ENL RI HL DQ SQ EP WSSp Fmt Extd ZWJ
-    {{1, 1, 1, 1, 1,  1, 1, 1, 1,  1,  1,  1, 1, 1, 1, 1, 1,   0,  0,   0}}, // Other
-    {{1, 1, 0, 1, 1,  1, 1, 1, 1,  1,  1,  1, 1, 1, 1, 1, 1,   1,  1,   1}}, // CR
-    {{1, 1, 1, 1, 1,  1, 1, 1, 1,  1,  1,  1, 1, 1, 1, 1, 1,   1,  1,   1}}, // LF
-    {{1, 1, 1, 1, 1,  1, 1, 1, 1,  1,  1,  1, 1, 1, 1, 1, 1,   1,  1,   1}}, // Newline
-    {{1, 1, 1, 1, 0,  1, 1, 1, 1,  1,  0,  1, 1, 1, 1, 1, 1,   0,  0,   0}}, // Katakana
-    {{1, 1, 1, 1, 1,  0, 1, 1, 1,  0,  0,  1, 0, 1, 1, 1, 1,   0,  0,   0}}, // ALetter
-    {{1, 1, 1, 1, 1,  1, 1, 1, 1,  1,  1,  1, 1, 1, 1, 1, 1,   0,  0,   0}}, // MidLetter
-    {{1, 1, 1, 1, 1,  1, 1, 1, 1,  1,  1,  1, 1, 1, 1, 1, 1,   0,  0,   0}}, // MidNum
-    {{1, 1, 1, 1, 1,  1, 1, 1, 1,  1,  1,  1, 1, 1, 1, 1, 1,   0,  0,   0}}, // MidNumLet
-    {{1, 1, 1, 1, 1,  0, 1, 1, 1,  0,  0,  1, 0, 1, 1, 1, 1,   0,  0,   0}}, // Numeric
-    {{1, 1, 1, 1, 0,  0, 1, 1, 1,  0,  0,  1, 0, 1, 1, 1, 1,   0,  0,   0}}, // ExtendNumLet
-    {{1, 1, 1, 1, 1,  1, 1, 1, 1,  1,  1,  0, 1, 1, 1, 1, 1,   0,  0,   0}}, // RI
-    {{1, 1, 1, 1, 1,  0, 1, 1, 1,  0,  0,  1, 0, 1, 0, 1, 1,   0,  0,   0}}, // Hebrew_Letter
-    {{1, 1, 1, 1, 1,  1, 1, 1, 1,  1,  1,  1, 1, 1, 1, 1, 1,   0,  0,   0}}, // Double_Quote
-    {{1, 1, 1, 1, 1,  1, 1, 1, 1,  1,  1,  1, 1, 1, 1, 1, 1,   0,  0,   0}}, // Single_Quote
-    {{1, 1, 1, 1, 1,  1, 1, 1, 1,  1,  1,  1, 1, 1, 1, 1, 1,   0,  0,   0}}, // ExtPict
-    {{1, 1, 1, 1, 1,  1, 1, 1, 1,  1,  1,  1, 1, 1, 1, 1, 1,   0,  0,   0}}, // WSegSpace
-    {{1, 1, 1, 1, 1,  1, 1, 1, 1,  1,  1,  1, 1, 1, 1, 1, 1,   0,  0,   0}}, // Format
-    {{1, 1, 1, 1, 1,  1, 1, 1, 1,  1,  1,  1, 1, 1, 1, 1, 1,   0,  0,   0}}, // Extend
-    {{1, 1, 1, 1, 1,  1, 1, 1, 1,  1,  1,  1, 1, 1, 1, 0, 1,   0,  0,   0}}, // ZWJ
+static constexpr std::array<std::array<bool, 20>, 20> word_breaks = {{
+//    CR LF NL Other Ktk AL ML MN MNL Num ENL RI HL DQ SQ EP WSSp Fmt Extd ZWJ
+    {{1, 0, 1, 1,    1,  1, 1, 1, 1,  1,  1,  1, 1, 1, 1, 1, 1,   1,  1,   1}}, // CR
+    {{1, 1, 1, 1,    1,  1, 1, 1, 1,  1,  1,  1, 1, 1, 1, 1, 1,   1,  1,   1}}, // LF
+    {{1, 1, 1, 1,    1,  1, 1, 1, 1,  1,  1,  1, 1, 1, 1, 1, 1,   1,  1,   1}}, // Newline
+    {{1, 1, 1, 1,    1,  1, 1, 1, 1,  1,  1,  1, 1, 1, 1, 1, 1,   0,  0,   0}}, // Other
+    {{1, 1, 1, 1,    0,  1, 1, 1, 1,  1,  0,  1, 1, 1, 1, 1, 1,   0,  0,   0}}, // Katakana
+    {{1, 1, 1, 1,    1,  0, 1, 1, 1,  0,  0,  1, 0, 1, 1, 1, 1,   0,  0,   0}}, // ALetter
+    {{1, 1, 1, 1,    1,  1, 1, 1, 1,  1,  1,  1, 1, 1, 1, 1, 1,   0,  0,   0}}, // MidLetter
+    {{1, 1, 1, 1,    1,  1, 1, 1, 1,  1,  1,  1, 1, 1, 1, 1, 1,   0,  0,   0}}, // MidNum
+    {{1, 1, 1, 1,    1,  1, 1, 1, 1,  1,  1,  1, 1, 1, 1, 1, 1,   0,  0,   0}}, // MidNumLet
+    {{1, 1, 1, 1,    1,  0, 1, 1, 1,  0,  0,  1, 0, 1, 1, 1, 1,   0,  0,   0}}, // Numeric
+    {{1, 1, 1, 1,    0,  0, 1, 1, 1,  0,  0,  1, 0, 1, 1, 1, 1,   0,  0,   0}}, // ExtendNumLet
+    {{1, 1, 1, 1,    1,  1, 1, 1, 1,  1,  1,  0, 1, 1, 1, 1, 1,   0,  0,   0}}, // RI
+    {{1, 1, 1, 1,    1,  0, 1, 1, 1,  0,  0,  1, 0, 1, 0, 1, 1,   0,  0,   0}}, // Hebrew_Letter
+    {{1, 1, 1, 1,    1,  1, 1, 1, 1,  1,  1,  1, 1, 1, 1, 1, 1,   0,  0,   0}}, // Double_Quote
+    {{1, 1, 1, 1,    1,  1, 1, 1, 1,  1,  1,  1, 1, 1, 1, 1, 1,   0,  0,   0}}, // Single_Quote
+    {{1, 1, 1, 1,    1,  1, 1, 1, 1,  1,  1,  1, 1, 1, 1, 1, 1,   0,  0,   0}}, // ExtPict
+    {{1, 1, 1, 1,    1,  1, 1, 1, 1,  1,  1,  1, 1, 1, 1, 1, 1,   0,  0,   0}}, // WSegSpace
+    {{1, 1, 1, 1,    1,  1, 1, 1, 1,  1,  1,  1, 1, 1, 1, 1, 1,   0,  0,   0}}, // Format
+    {{1, 1, 1, 1,    1,  1, 1, 1, 1,  1,  1,  1, 1, 1, 1, 1, 1,   0,  0,   0}}, // Extend
+    {{1, 1, 1, 1,    1,  1, 1, 1, 1,  1,  1,  1, 1, 1, 1, 0, 1,   0,  0,   0}}, // ZWJ
 }};
             // clang-format on
             auto const lhs_int = static_cast<int>(lhs);
@@ -277,8 +275,7 @@ constexpr std::array<std::array<bool, 20>, 20> word_breaks = {{
             return word_breaks[lhs_int][rhs_int];
         }
 
-        // WB4: Except after line breaks, ignore/skip (Extend | Format |
-        // ZWJ)*
+        // WB4: Except after line breaks, ignore/skip (Extend | Format | ZWJ)*
         template<typename CPIter, typename Sentinel, typename WordPropFunc>
         void skip_forward(
             word_break_state<CPIter>& state,
@@ -367,329 +364,6 @@ constexpr std::array<std::array<bool, 20>, 20> word_breaks = {{
             WordPropFunc const & word_prop = WordPropFunc{},
             WordBreakFunc const & word_break = WordBreakFunc{}) noexcept
         {
-            using detail::ph;
-            using detail::cp_and_word_prop;
-
-            if (it == first)
-                return it;
-
-            if (it == last && --it == first)
-                return it;
-
-            detail::word_break_state<CPIter> state;
-
-            state.it = it;
-
-            state.caps[ph::curr] = cp_and_word_prop(*state.it, word_prop);
-
-            // Since word_break is evaluated unconditionally before the other
-            // rules, we need to do all this here before the special-casing
-            // below.
-            if (it != first) {
-                state.caps[ph::prev] =
-                    cp_and_word_prop(*std::prev(state.it), word_prop);
-                if (std::prev(state.it) != first) {
-                    state.caps[ph::prev_prev] =
-                        cp_and_word_prop(*std::prev(state.it, 2), word_prop);
-                }
-            }
-            if (std::next(state.it) != last) {
-                state.caps[ph::next] =
-                    cp_and_word_prop(*std::next(state.it), word_prop);
-                if (std::next(state.it, 2) != last) {
-                    state.caps[ph::next_next] =
-                        cp_and_word_prop(*std::next(state.it, 2), word_prop);
-                }
-            }
-            if (word_break(
-                    state.caps[ph::prev_prev].cp,
-                    state.caps[ph::prev].cp,
-                    state.caps[ph::curr].cp,
-                    state.caps[ph::next].cp,
-                    state.caps[ph::next_next].cp)) {
-                return state.it;
-            }
-
-            // Special case: If state.caps[ph::curr].prop is skippable, we
-            // need to skip backward until we find a non-skippable.
-            if (detail::skippable(state.caps[ph::curr].prop)) {
-                auto const prev = boost::text::find_if_not_backward(
-                    first, it, [word_prop](uint32_t cp) {
-                        return detail::skippable(word_prop(cp));
-                    });
-                if (prev != it) {
-                    state.it = prev;
-                    state.caps[ph::next] =
-                        cp_and_word_prop(*std::next(state.it), word_prop);
-                    state.caps[ph::curr] =
-                        cp_and_word_prop(*state.it, word_prop);
-
-                    // If we end up on a non-skippable that should break
-                    // before the skippable(s) we just moved over, break on
-                    // the last skippable.
-                    if (!detail::skippable(state.caps[ph::curr].prop) &&
-                        detail::table_word_break(
-                            state.caps[ph::curr].prop,
-                            state.caps[ph::next].prop)) {
-                        return ++state.it;
-                    }
-                    if (state.it == first)
-                        return first;
-                }
-            }
-
-            state.caps[ph::prev_prev] = cp_and_word_prop();
-            if (std::prev(state.it) != first) {
-                state.caps[ph::prev_prev] =
-                    cp_and_word_prop(*std::prev(state.it, 2), word_prop);
-            }
-            state.caps[ph::prev] =
-                cp_and_word_prop(*std::prev(state.it), word_prop);
-            state.caps[ph::next] = cp_and_word_prop();
-            state.caps[ph::next_next] = cp_and_word_prop();
-            if (std::next(state.it) != last) {
-                state.caps[ph::next] =
-                    cp_and_word_prop(*std::next(state.it), word_prop);
-                if (std::next(state.it, 2) != last) {
-                    state.caps[ph::next_next] =
-                        cp_and_word_prop(*std::next(state.it, 2), word_prop);
-                }
-            }
-
-            // Since 'it' may be anywhere within the word in which it sits,
-            // we need to look forward to make sure that next_prop and
-            // next_next_prop don't point to skippables.
-            {
-                if (std::next(state.it) != last) {
-                    auto temp_state = detail::next(state);
-                    detail::skip_forward(temp_state, first, last, word_prop);
-                    if (temp_state.it == last) {
-                        state.caps[ph::next] = cp_and_word_prop();
-                        state.caps[ph::next_next] = cp_and_word_prop();
-                    } else {
-                        state.caps[ph::next] = temp_state.caps[ph::curr];
-                        if (std::next(temp_state.it) != last) {
-                            temp_state = detail::next(temp_state);
-                            detail::skip_forward(temp_state, first, last, word_prop);
-                            if (temp_state.it == last) {
-                                state.caps[ph::next_next] = cp_and_word_prop();
-                            } else {
-                                state.caps[ph::next_next] =
-                                    temp_state.caps[ph::curr];
-                            }
-                        }
-                    }
-                }
-            }
-
-            state.emoji_state = detail::word_break_emoji_state_t::none;
-
-            // WB4: Except after line breaks, ignore/skip (Extend | Format |
-            // ZWJ)*
-            auto skip = [word_prop](
-                            detail::word_break_state<CPIter> state,
-                            CPIter first) {
-                if (detail::skippable(state.caps[ph::prev].prop)) {
-                    auto temp_it = boost::text::find_if_not_backward(
-                        first, state.it, [word_prop](uint32_t cp) {
-                            return detail::skippable(word_prop(cp));
-                        });
-                    if (temp_it == state.it)
-                        return state;
-                    auto temp_prev_cap = cp_and_word_prop(*temp_it, word_prop);
-                    if (!detail::linebreak(temp_prev_cap.prop)) {
-                        state.it = temp_it;
-                        state.it_points_to_prev = true;
-                        state.caps[ph::prev] = temp_prev_cap;
-                        if (temp_it == first) {
-                            state.caps[ph::prev_prev] = cp_and_word_prop();
-                        } else {
-                            state.caps[ph::prev_prev] = cp_and_word_prop(
-                                *std::prev(temp_it), word_prop);
-                        }
-                    }
-                }
-                return state;
-            };
-
-            for (; state.it != first; state = detail::prev(state)) {
-                if (std::prev(state.it) != first) {
-                    state.caps[ph::prev_prev] =
-                        cp_and_word_prop(*std::prev(state.it, 2), word_prop);
-                } else {
-                    state.caps[ph::prev_prev] = cp_and_word_prop();
-                }
-
-                // Check word_break before anything else.
-                if (word_break(
-                        state.caps[ph::prev_prev].cp,
-                        state.caps[ph::prev].cp,
-                        state.caps[ph::curr].cp,
-                        state.caps[ph::next].cp,
-                        state.caps[ph::next_next].cp)) {
-                    return state.it;
-                }
-
-                // When we see an RI, back up to the first RI so we can see
-                // what emoji state we're supposed to be in here.
-                if (state.emoji_state ==
-                        detail::word_break_emoji_state_t::none &&
-                    state.caps[ph::curr].prop ==
-                        word_property::Regional_Indicator) {
-                    auto temp_state = state;
-                    int ris_before = 0;
-                    while (temp_state.it != first) {
-                        temp_state = skip(temp_state, first);
-                        if (temp_state.it == first) {
-                            if (temp_state.caps[ph::prev].prop ==
-                                word_property::Regional_Indicator) {
-                                ++ris_before;
-                            }
-                            break;
-                        }
-                        if (temp_state.caps[ph::prev].prop ==
-                            word_property::Regional_Indicator) {
-                            temp_state = detail::prev(temp_state);
-                            if (temp_state.it != first &&
-                                std::prev(temp_state.it) != first) {
-                                temp_state.caps[ph::prev_prev] =
-                                    cp_and_word_prop(
-                                        *std::prev(temp_state.it, 2),
-                                        word_prop);
-                            } else {
-                                temp_state.caps[ph::prev_prev] =
-                                    cp_and_word_prop();
-                            }
-                            ++ris_before;
-                        } else {
-                            break;
-                        }
-                    }
-                    state.emoji_state =
-                        (ris_before % 2 == 0)
-                            ? detail::word_break_emoji_state_t::first_emoji
-                            : detail::word_break_emoji_state_t::second_emoji;
-                }
-
-                // WB3
-                if (state.caps[ph::prev].prop == word_property::CR &&
-                    state.caps[ph::curr].prop == word_property::LF) {
-                    continue;
-                }
-
-                // WB3a
-                if (state.caps[ph::prev].prop == word_property::CR ||
-                    state.caps[ph::prev].prop == word_property::LF ||
-                    state.caps[ph::prev].prop == word_property::Newline) {
-                    return state.it;
-                }
-
-                // WB3b
-                if (state.caps[ph::curr].prop == word_property::CR ||
-                    state.caps[ph::curr].prop == word_property::LF ||
-                    state.caps[ph::curr].prop == word_property::Newline) {
-                    return state.it;
-                }
-
-                // WB3c
-                if (state.caps[ph::prev].prop == word_property::ZWJ &&
-                    state.caps[ph::curr].prop == word_property::ExtPict) {
-                    continue;
-                }
-
-                // WB3d
-                if (state.caps[ph::prev].prop == word_property::WSegSpace &&
-                    state.caps[ph::curr].prop == word_property::WSegSpace) {
-                    continue;
-                }
-
-                // If we end up breaking durign this iteration, we want the
-                // break to show up after the skip, so that the skippable
-                // CPs go with the CP before them.  This is to maintain
-                // symmetry with next_word_break().
-                auto after_skip_it = state.it;
-
-                // Puting this here means not having to do it explicitly
-                // below between prev_prop and prop (and transitively,
-                // between prop and next_prop).
-                state = skip(state, first);
-
-                // WB6
-                if (detail::ah_letter(state.caps[ph::prev].prop) &&
-                    detail::mid_ah(state.caps[ph::curr].prop) &&
-                    detail::ah_letter(state.caps[ph::next].prop)) {
-                    continue;
-                }
-
-                // WB7
-                if (detail::mid_ah(state.caps[ph::prev].prop) &&
-                    detail::ah_letter(state.caps[ph::curr].prop) &&
-                    state.it != first) {
-                    auto const temp_state = skip(detail::prev(state), first);
-                    if (detail::ah_letter(temp_state.caps[ph::prev].prop))
-                        continue;
-                }
-
-                // WB7b
-                if (state.caps[ph::prev].prop == word_property::Hebrew_Letter &&
-                    state.caps[ph::curr].prop == word_property::Double_Quote &&
-                    state.caps[ph::next].prop == word_property::Hebrew_Letter) {
-                    continue;
-                }
-
-                // WB7c
-                if (state.caps[ph::prev].prop == word_property::Double_Quote &&
-                    state.caps[ph::curr].prop == word_property::Hebrew_Letter &&
-                    state.it != first) {
-                    auto const temp_state = skip(detail::prev(state), first);
-                    if (temp_state.caps[ph::prev].prop ==
-                        word_property::Hebrew_Letter)
-                        continue;
-                }
-
-                // WB11
-                if (detail::mid_num(state.caps[ph::prev].prop) &&
-                    state.caps[ph::curr].prop == word_property::Numeric &&
-                    state.it != first) {
-                    auto const temp_state = skip(detail::prev(state), first);
-                    if (temp_state.caps[ph::prev].prop ==
-                        word_property::Numeric)
-                        continue;
-                }
-
-                // WB12
-                if (state.caps[ph::prev].prop == word_property::Numeric &&
-                    detail::mid_num(state.caps[ph::curr].prop) &&
-                    state.caps[ph::next].prop == word_property::Numeric) {
-                    continue;
-                }
-
-                if (state.emoji_state ==
-                    detail::word_break_emoji_state_t::first_emoji) {
-                    if (state.caps[ph::prev].prop ==
-                        word_property::Regional_Indicator) {
-                        state.emoji_state =
-                            detail::word_break_emoji_state_t::second_emoji;
-                        return after_skip_it;
-                    } else {
-                        state.emoji_state =
-                            detail::word_break_emoji_state_t::none;
-                    }
-                } else if (
-                    state.emoji_state ==
-                        detail::word_break_emoji_state_t::second_emoji &&
-                    state.caps[ph::prev].prop ==
-                        word_property::Regional_Indicator) {
-                    state.emoji_state =
-                        detail::word_break_emoji_state_t::first_emoji;
-                    continue;
-                }
-
-                if (detail::table_word_break(
-                        state.caps[ph::prev].prop, state.caps[ph::curr].prop))
-                    return after_skip_it;
-            }
-
             return first;
         }
 
@@ -718,8 +392,10 @@ constexpr std::array<std::array<bool, 20>, 20> word_breaks = {{
             if (++state.it == last)
                 return state.it;
 
+            state.caps[ph::prev_prev] = cp_and_word_prop();
             state.caps[ph::prev] = cp_and_word_prop(*first, word_prop);
             state.caps[ph::curr] = cp_and_word_prop(*state.it, word_prop);
+            state.caps[ph::next] = cp_and_word_prop();
             state.it_next = std::next(state.it);
 
             state.emoji_state =
@@ -739,16 +415,12 @@ constexpr std::array<std::array<bool, 20>, 20> word_breaks = {{
                 }
 
                 // WB3a
-                if (state.caps[ph::prev].prop == word_property::CR ||
-                    state.caps[ph::prev].prop == word_property::LF ||
-                    state.caps[ph::prev].prop == word_property::Newline) {
+                if (detail::linebreak(state.caps[ph::prev].prop)) {
                     return state.it;
                 }
 
                 // WB3b
-                if (state.caps[ph::curr].prop == word_property::CR ||
-                    state.caps[ph::curr].prop == word_property::LF ||
-                    state.caps[ph::curr].prop == word_property::Newline) {
+                if (detail::linebreak(state.caps[ph::curr].prop)) {
                     return state.it;
                 }
 
@@ -804,8 +476,7 @@ constexpr std::array<std::array<bool, 20>, 20> word_breaks = {{
                 }
 
                 // WB7c
-                if (state.caps[ph::prev_prev].prop ==
-                        word_property::Hebrew_Letter &&
+                if (state.caps[ph::prev_prev].prop == word_property::Hebrew_Letter &&
                     state.caps[ph::prev].prop == word_property::Double_Quote &&
                     state.caps[ph::curr].prop == word_property::Hebrew_Letter) {
                     continue;