Skip to content
This repository has been archived by the owner on May 3, 2024. It is now read-only.

Commit

Permalink
Improve segmentation
Browse files Browse the repository at this point in the history
  • Loading branch information
MBkkt committed Sep 16, 2023
1 parent 1004c42 commit dcd2398
Show file tree
Hide file tree
Showing 2 changed files with 68 additions and 88 deletions.
22 changes: 9 additions & 13 deletions external/text/include/boost/text/grapheme_break.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -110,6 +110,7 @@ namespace boost { namespace text {
template<typename CPIter>
struct grapheme_break_state
{
CPIter it_prev;
CPIter it;

grapheme_property prev_prop;
Expand All @@ -121,7 +122,7 @@ namespace boost { namespace text {
// Steps a grapheme_break_state forward by one position and returns the
// updated copy (the state is taken and returned by value).
//
// After the call, `prev_prop` holds what `prop` held before the call; `prop`
// itself is not refreshed here.
//
// NOTE(review): this listing is a rendered diff without +/- markers. The two
// increment statements below look like the old and the new form of the same
// step rather than two consecutive increments -- confirm against the commit
// before relying on this text.
template<typename CPIter>
grapheme_break_state<CPIter> next(grapheme_break_state<CPIter> state)
{
// Plain advance of the current iterator.
++state.it;
// Post-increment: `it_prev` receives the position `it` had before this
// statement, so the previous element can be read without std::prev.
state.it_prev = state.it++;
state.prev_prop = state.prop;
return state;
}
Expand Down Expand Up @@ -259,12 +260,12 @@ constexpr std::array<std::array<bool, 15>, 15> grapheme_breaks = {{
return first;

grapheme_break_state<CPIter> state;
state.it = first;
state.it = state.it_prev = first;

if (++state.it == last)
return state.it;

state.prev_prop = boost::text::grapheme_prop(*std::prev(state.it));
state.prev_prop = boost::text::grapheme_prop(*state.it_prev);
state.prop = boost::text::grapheme_prop(*state.it);

state.emoji_state =
Expand All @@ -278,22 +279,17 @@ constexpr std::array<std::array<bool, 15>, 15> grapheme_breaks = {{
// GB11
if (state.prev_prop == grapheme_property::ZWJ &&
state.prop == grapheme_property::ExtPict &&
detail::gb11_prefix(first, std::prev(state.it))) {
detail::gb11_prefix(first, state.it_prev)) {
continue;
}

if (state.emoji_state ==
grapheme_break_emoji_state_t::first_emoji) {
if (state.emoji_state == grapheme_break_emoji_state_t::first_emoji) {
state.emoji_state = grapheme_break_emoji_state_t::none;
if (state.prop == grapheme_property::Regional_Indicator) {
state.emoji_state = grapheme_break_emoji_state_t::none;
continue;
} else {
state.emoji_state = grapheme_break_emoji_state_t::none;
}
} else if (
state.prop == grapheme_property::Regional_Indicator) {
state.emoji_state =
grapheme_break_emoji_state_t::first_emoji;
} else if (state.prop == grapheme_property::Regional_Indicator) {
state.emoji_state = grapheme_break_emoji_state_t::first_emoji;
}

if (detail::table_grapheme_break(state.prev_prop, state.prop))
Expand Down
134 changes: 59 additions & 75 deletions external/text/include/boost/text/word_break.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -204,8 +204,10 @@ namespace boost { namespace text {
struct word_break_state
{
word_break_state() {}

CPIter it;
CPIter it_next;
CPIter it_next_next;

bool it_points_to_prev = false;

std::array<cp_and_word_prop, 5> caps;
Expand All @@ -214,11 +216,20 @@ namespace boost { namespace text {
};

// Steps a word_break_state forward by one position and returns the updated
// copy (the state is taken and returned by value).
//
// The five-slot `caps` window is shifted one slot towards the front, and the
// iterators `it`, `it_next` and `it_next_next` each take the value of their
// successor. `last` is only used to stop `it_next_next` from being
// incremented once it already equals `last`.
//
// NOTE(review): this listing is a rendered diff without +/- markers. The two
// signature lines, and the `++state.it` / std::copy lines versus the
// element-wise assignments, look like the old and the new form of the same
// code -- confirm against the commit before relying on this text.
template<typename CPIter>
word_break_state<CPIter> next(word_break_state<CPIter> state)
word_break_state<CPIter> next(word_break_state<CPIter> state, CPIter last)
{
++state.it;
// Shift of caps[1..4] down into caps[0..3] via std::copy.
std::copy(
state.caps.begin() + 1, state.caps.end(), state.caps.begin());
// The same one-slot shift written out element by element.
state.caps[0] = state.caps[1];
state.caps[1] = state.caps[2];
state.caps[2] = state.caps[3];
state.caps[3] = state.caps[4];
// The last slot is reset to a value-initialized entry; this function does
// not refill it.
state.caps[4] = cp_and_word_prop{};

// Slide the three iterators along by one.
state.it = state.it_next;
state.it_next = state.it_next_next;
// Only advance the furthest look-ahead iterator while it has not reached
// `last`, so it is never incremented past the end.
if (state.it_next_next != last) {
++state.it_next_next;
}

return state;
}

Expand Down Expand Up @@ -272,8 +283,8 @@ constexpr std::array<std::array<bool, 20>, 20> word_breaks = {{
// WB4: Except after line breaks, ignore/skip (Extend | Format |
// ZWJ)*
template<typename CPIter, typename Sentinel, typename WordPropFunc>
word_break_state<CPIter> skip_forward(
word_break_state<CPIter> state,
void skip_forward(
word_break_state<CPIter>& state,
CPIter first,
Sentinel last,
WordPropFunc const & word_prop)
Expand All @@ -282,33 +293,31 @@ constexpr std::array<std::array<bool, 20>, 20> word_breaks = {{
!detail::skippable(state.caps[ph::prev].prop) &&
detail::skippable(state.caps[ph::curr].prop)) {
auto last_prop = word_property::Other;
auto temp_it = boost::text::find_if_not(
state.it = boost::text::find_if_not(
state.it, last, [word_prop, &last_prop](uint32_t cp) {
last_prop = word_prop(cp);
return detail::skippable(last_prop);
});
if (temp_it == last) {
--temp_it;
if (state.it == last) {
--state.it;
} else if (last_prop == word_property::ExtPict) {
auto const next_to_last_prop =
word_prop(*std::prev(temp_it));
auto it_prev = std::prev(state.it);
auto const next_to_last_prop = word_prop(*it_prev);
if (next_to_last_prop == word_property::ZWJ)
--temp_it;
state.it = it_prev;
}
state.it = temp_it;
state.caps[ph::curr] = cp_and_word_prop(*temp_it, word_prop);
state.caps[ph::curr] = cp_and_word_prop(*state.it, word_prop);
state.caps[ph::next] = cp_and_word_prop();
state.caps[ph::next_next] = cp_and_word_prop();
if (std::next(state.it) != last) {
state.caps[ph::next] =
cp_and_word_prop(*std::next(state.it), word_prop);
if (std::next(state.it, 2) != last) {
state.caps[ph::next_next] = cp_and_word_prop(
*std::next(state.it, 2), word_prop);
state.it_next_next = state.it_next = std::next(state.it);
if (state.it_next != last) {
state.caps[ph::next] = cp_and_word_prop(*state.it_next, word_prop);
++state.it_next_next;
if (state.it_next_next != last) {
state.caps[ph::next_next] = cp_and_word_prop(*state.it_next_next, word_prop);
}
}
}
return state;
}

template<typename T>
Expand Down Expand Up @@ -460,19 +469,16 @@ constexpr std::array<std::array<bool, 20>, 20> word_breaks = {{
// next_next_prop don't point to skippables.
{
if (std::next(state.it) != last) {
auto temp_state = state;
temp_state = detail::next(temp_state);
temp_state = detail::skip_forward(
temp_state, first, last, word_prop);
auto temp_state = detail::next(state);
detail::skip_forward(temp_state, first, last, word_prop);
if (temp_state.it == last) {
state.caps[ph::next] = cp_and_word_prop();
state.caps[ph::next_next] = cp_and_word_prop();
} else {
state.caps[ph::next] = temp_state.caps[ph::curr];
if (std::next(temp_state.it) != last) {
temp_state = detail::next(temp_state);
temp_state = detail::skip_forward(
temp_state, first, last, word_prop);
detail::skip_forward(temp_state, first, last, word_prop);
if (temp_state.it == last) {
state.caps[ph::next_next] = cp_and_word_prop();
} else {
Expand Down Expand Up @@ -718,33 +724,22 @@ constexpr std::array<std::array<bool, 20>, 20> word_breaks = {{
if (++state.it == last)
return state.it;

state.caps[ph::prev_prev] = cp_and_word_prop();
state.caps[ph::prev] =
cp_and_word_prop(*std::prev(state.it), word_prop);
state.caps[ph::prev] = cp_and_word_prop(*first, word_prop);
state.caps[ph::curr] = cp_and_word_prop(*state.it, word_prop);
state.caps[ph::next] = cp_and_word_prop();
state.caps[ph::next_next] = cp_and_word_prop();
if (std::next(state.it) != last) {
state.caps[ph::next] =
cp_and_word_prop(*std::next(state.it), word_prop);
if (std::next(state.it, 2) != last) {
state.caps[ph::next_next] =
cp_and_word_prop(*std::next(state.it, 2), word_prop);
}
state.it_next_next = state.it_next = std::next(state.it);
if (state.it_next != last) {
state.caps[ph::next] = cp_and_word_prop(*state.it_next, word_prop);
++state.it_next_next;
}

state.emoji_state =
state.caps[ph::prev].prop == word_property::Regional_Indicator
? detail::word_break_emoji_state_t::first_emoji
: detail::word_break_emoji_state_t::none;

for (; state.it != last; state = detail::next(state)) {
if (std::next(state.it) != last &&
std::next(state.it, 2) != last) {
state.caps[ph::next_next] =
cp_and_word_prop(*std::next(state.it, 2), word_prop);
} else {
state.caps[ph::next_next] = cp_and_word_prop();
for (; state.it != last; state = detail::next(state, last)) {
if (state.it_next_next != last) {
state.caps[ph::next_next] = cp_and_word_prop(*state.it_next_next, word_prop);
}

// Check word_break before anything else.
Expand Down Expand Up @@ -792,16 +787,16 @@ constexpr std::array<std::array<bool, 20>, 20> word_breaks = {{
// Putting this here means not having to do it explicitly
// below between prop and next_prop (and transitively,
// between prev_prop and prop).
state = detail::skip_forward(state, first, last, word_prop);
detail::skip_forward(state, first, last, word_prop);
if (state.it == last)
return state.it;

// WB6
if (detail::ah_letter(state.caps[ph::prev].prop) &&
detail::mid_ah(state.caps[ph::curr].prop) &&
std::next(state.it) != last) {
auto const temp_state = detail::skip_forward(
detail::next(state), first, last, word_prop);
state.it_next != last) {
auto temp_state = detail::next(state, last);
detail::skip_forward(temp_state, first, last, word_prop);
if (temp_state.it == last)
return temp_state.it;
if (detail::ah_letter(temp_state.caps[ph::curr].prop))
Expand All @@ -818,9 +813,9 @@ constexpr std::array<std::array<bool, 20>, 20> word_breaks = {{
// WB7b
if (state.caps[ph::prev].prop == word_property::Hebrew_Letter &&
state.caps[ph::curr].prop == word_property::Double_Quote &&
std::next(state.it) != last) {
auto const temp_state = detail::skip_forward(
detail::next(state), first, last, word_prop);
state.it_next != last) {
auto temp_state = detail::next(state, last);
detail::skip_forward(temp_state, first, last, word_prop);
if (temp_state.it == last)
return temp_state.it;
if (temp_state.caps[ph::curr].prop ==
Expand All @@ -846,37 +841,26 @@ constexpr std::array<std::array<bool, 20>, 20> word_breaks = {{
// WB12
if (state.caps[ph::prev].prop == word_property::Numeric &&
detail::mid_num(state.caps[ph::curr].prop) &&
std::next(state.it) != last) {
auto const temp_state = detail::skip_forward(
detail::next(state), first, last, word_prop);
state.it_next != last) {
auto temp_state = detail::next(state, last);
detail::skip_forward(temp_state, first, last, word_prop);
if (temp_state.it == last)
return temp_state.it;
if (temp_state.caps[ph::curr].prop ==
word_property::Numeric)
if (temp_state.caps[ph::curr].prop == word_property::Numeric)
continue;
}

if (state.emoji_state ==
detail::word_break_emoji_state_t::first_emoji) {
if (state.caps[ph::curr].prop ==
word_property::Regional_Indicator) {
state.emoji_state =
detail::word_break_emoji_state_t::none;
if (state.emoji_state == detail::word_break_emoji_state_t::first_emoji) {
state.emoji_state = detail::word_break_emoji_state_t::none;
if (state.caps[ph::curr].prop == word_property::Regional_Indicator) {
continue;
} else {
state.emoji_state =
detail::word_break_emoji_state_t::none;
}
} else if (
state.caps[ph::curr].prop ==
word_property::Regional_Indicator) {
state.emoji_state =
detail::word_break_emoji_state_t::first_emoji;
} else if (state.caps[ph::curr].prop == word_property::Regional_Indicator) {
state.emoji_state = detail::word_break_emoji_state_t::first_emoji;
return state.it;
}

if (detail::table_word_break(
state.caps[ph::prev].prop, state.caps[ph::curr].prop))
if (detail::table_word_break(state.caps[ph::prev].prop, state.caps[ph::curr].prop))
return state.it;
}
return state.it;
Expand Down

0 comments on commit dcd2398

Please sign in to comment.