Skip to content
This repository has been archived by the owner on May 3, 2024. It is now read-only.

Commit

Permalink
wip
Browse files Browse the repository at this point in the history
  • Loading branch information
MBkkt committed Sep 20, 2023
1 parent 791c2fd commit e43d784
Show file tree
Hide file tree
Showing 2 changed files with 329 additions and 3 deletions.
6 changes: 3 additions & 3 deletions core/analysis/segmentation_token_stream.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -328,11 +328,10 @@ bool segmentation_token_stream::next() {
const auto begin = gr_begin.base();
const auto end = gr_end.base();

const size_t length =
const auto length =
static_cast<size_t>(std::distance(begin.base(), end.base()));

if (!length) {
// eof
if (length == 0) { // eof
return false;
}

Expand All @@ -355,6 +354,7 @@ bool segmentation_token_stream::next() {
term.value = {reinterpret_cast<const byte_type*>(&(*begin.base())),
length};
break;
// TODO(MBkkt) do we need to call as_graphemes? Feels like no
case options_t::case_convert_t::LOWER:
term_buf_.clear();
to_lower(as_graphemes(begin, end), from_utf32_back_inserter(term_buf_));
Expand Down
326 changes: 326 additions & 0 deletions external/text/include/boost/text/word_break.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -364,6 +364,332 @@ static constexpr std::array<std::array<bool, 20>, 20> word_breaks = {{
WordPropFunc const & word_prop = WordPropFunc{},
WordBreakFunc const & word_break = WordBreakFunc{}) noexcept
{
using detail::ph;
using detail::cp_and_word_prop;

if (it == first)
return it;

if (it == last && --it == first)
return it;

detail::word_break_state<CPIter> state;

state.it = it;

state.caps[ph::curr] = cp_and_word_prop(*state.it, word_prop);

// Since word_break is evaluated unconditionally before the other
// rules, we need to do all this here before the special-casing
// below.
if (it != first) {
state.caps[ph::prev] =
cp_and_word_prop(*std::prev(state.it), word_prop);
if (std::prev(state.it) != first) {
state.caps[ph::prev_prev] =
cp_and_word_prop(*std::prev(state.it, 2), word_prop);
}
}
if (std::next(state.it) != last) {
state.caps[ph::next] =
cp_and_word_prop(*std::next(state.it), word_prop);
if (std::next(state.it, 2) != last) {
state.caps[ph::next_next] =
cp_and_word_prop(*std::next(state.it, 2), word_prop);
}
}
if (word_break(
state.caps[ph::prev_prev].cp,
state.caps[ph::prev].cp,
state.caps[ph::curr].cp,
state.caps[ph::next].cp,
state.caps[ph::next_next].cp)) {
return state.it;
}

// Special case: If state.caps[ph::curr].prop is skippable, we
// need to skip backward until we find a non-skippable.
if (detail::skippable(state.caps[ph::curr].prop)) {
auto const prev = boost::text::find_if_not_backward(
first, it, [word_prop](uint32_t cp) {
return detail::skippable(word_prop(cp));
});
if (prev != it) {
state.it = prev;
state.caps[ph::next] =
cp_and_word_prop(*std::next(state.it), word_prop);
state.caps[ph::curr] =
cp_and_word_prop(*state.it, word_prop);

// If we end up on a non-skippable that should break
// before the skippable(s) we just moved over, break on
// the last skippable.
if (!detail::skippable(state.caps[ph::curr].prop) &&
detail::table_word_break(
state.caps[ph::curr].prop,
state.caps[ph::next].prop)) {
return ++state.it;
}
if (state.it == first)
return first;
}
}

state.caps[ph::prev_prev] = cp_and_word_prop();
if (std::prev(state.it) != first) {
state.caps[ph::prev_prev] =
cp_and_word_prop(*std::prev(state.it, 2), word_prop);
}
state.caps[ph::prev] =
cp_and_word_prop(*std::prev(state.it), word_prop);
state.caps[ph::next] = cp_and_word_prop();
state.caps[ph::next_next] = cp_and_word_prop();
if (std::next(state.it) != last) {
state.caps[ph::next] =
cp_and_word_prop(*std::next(state.it), word_prop);
if (std::next(state.it, 2) != last) {
state.caps[ph::next_next] =
cp_and_word_prop(*std::next(state.it, 2), word_prop);
}
}

// Since 'it' may be anywhere within the word in which it sits,
// we need to look forward to make sure that next_prop and
// next_next_prop don't point to skippables.
{
if (std::next(state.it) != last) {
auto temp_state = state;
temp_state = detail::next(temp_state);
temp_state = detail::skip_forward(
temp_state, first, last, word_prop);
if (temp_state.it == last) {
state.caps[ph::next] = cp_and_word_prop();
state.caps[ph::next_next] = cp_and_word_prop();
} else {
state.caps[ph::next] = temp_state.caps[ph::curr];
if (std::next(temp_state.it) != last) {
temp_state = detail::next(temp_state);
temp_state = detail::skip_forward(
temp_state, first, last, word_prop);
if (temp_state.it == last) {
state.caps[ph::next_next] = cp_and_word_prop();
} else {
state.caps[ph::next_next] =
temp_state.caps[ph::curr];
}
}
}
}
}

state.emoji_state = detail::word_break_emoji_state_t::none;

// WB4: Except after line breaks, ignore/skip (Extend | Format |
// ZWJ)*
//
// Local helper implementing the backward skip for the rule above. It
// receives its own copy of the word-break state (the parameter shadows
// the enclosing `state`), so the caller only observes changes through
// the returned value. When the previous code point is skippable, the
// returned state has been moved back over the run of skippable code
// points that precedes state.it; otherwise the state is returned as-is.
auto skip = [word_prop](
detail::word_break_state<CPIter> state,
CPIter first) {
// Only act when the code point just before the current one is
// classified as skippable.
if (detail::skippable(state.caps[ph::prev].prop)) {
// Search backward within [first, state.it) for a code point whose
// word property is not skippable.
auto temp_it = boost::text::find_if_not_backward(
first, state.it, [word_prop](uint32_t cp) {
return detail::skippable(word_prop(cp));
});
// NOTE(review): presumably find_if_not_backward returns its second
// argument when nothing matches, making this the "everything back
// to `first` is skippable" case -- confirm against its definition.
if (temp_it == state.it)
return state;
auto temp_prev_cap = cp_and_word_prop(*temp_it, word_prop);
// Skippables that follow a line break are not skipped ("Except
// after line breaks"): in that case the state is left untouched.
if (!detail::linebreak(temp_prev_cap.prop)) {
// Reposition on the non-skippable code point that was found and
// record that the iterator now refers to the `prev` slot rather
// than `curr`.
state.it = temp_it;
state.it_points_to_prev = true;
state.caps[ph::prev] = temp_prev_cap;
// Refresh prev_prev relative to the new position; a
// default-constructed entry is used when nothing precedes it.
if (temp_it == first) {
state.caps[ph::prev_prev] = cp_and_word_prop();
} else {
state.caps[ph::prev_prev] = cp_and_word_prop(
*std::prev(temp_it), word_prop);
}
}
}
return state;
};

for (; state.it != first; state = detail::prev(state)) {
if (std::prev(state.it) != first) {
state.caps[ph::prev_prev] =
cp_and_word_prop(*std::prev(state.it, 2), word_prop);
} else {
state.caps[ph::prev_prev] = cp_and_word_prop();
}

// Check word_break before anything else.
if (word_break(
state.caps[ph::prev_prev].cp,
state.caps[ph::prev].cp,
state.caps[ph::curr].cp,
state.caps[ph::next].cp,
state.caps[ph::next_next].cp)) {
return state.it;
}

// When we see an RI, back up to the first RI so we can see
// what emoji state we're supposed to be in here.
if (state.emoji_state ==
detail::word_break_emoji_state_t::none &&
state.caps[ph::curr].prop ==
word_property::Regional_Indicator) {
auto temp_state = state;
int ris_before = 0;
while (temp_state.it != first) {
temp_state = skip(temp_state, first);
if (temp_state.it == first) {
if (temp_state.caps[ph::prev].prop ==
word_property::Regional_Indicator) {
++ris_before;
}
break;
}
if (temp_state.caps[ph::prev].prop ==
word_property::Regional_Indicator) {
temp_state = detail::prev(temp_state);
if (temp_state.it != first &&
std::prev(temp_state.it) != first) {
temp_state.caps[ph::prev_prev] =
cp_and_word_prop(
*std::prev(temp_state.it, 2),
word_prop);
} else {
temp_state.caps[ph::prev_prev] =
cp_and_word_prop();
}
++ris_before;
} else {
break;
}
}
state.emoji_state =
(ris_before % 2 == 0)
? detail::word_break_emoji_state_t::first_emoji
: detail::word_break_emoji_state_t::second_emoji;
}

// WB3
if (state.caps[ph::prev].prop == word_property::CR &&
state.caps[ph::curr].prop == word_property::LF) {
continue;
}

// WB3a
if (state.caps[ph::prev].prop == word_property::CR ||
state.caps[ph::prev].prop == word_property::LF ||
state.caps[ph::prev].prop == word_property::Newline) {
return state.it;
}

// WB3b
if (state.caps[ph::curr].prop == word_property::CR ||
state.caps[ph::curr].prop == word_property::LF ||
state.caps[ph::curr].prop == word_property::Newline) {
return state.it;
}

// WB3c
if (state.caps[ph::prev].prop == word_property::ZWJ &&
state.caps[ph::curr].prop == word_property::ExtPict) {
continue;
}

// WB3d
if (state.caps[ph::prev].prop == word_property::WSegSpace &&
state.caps[ph::curr].prop == word_property::WSegSpace) {
continue;
}

// If we end up breaking during this iteration, we want the
// break to show up after the skip, so that the skippable
// CPs go with the CP before them. This is to maintain
// symmetry with next_word_break().
auto after_skip_it = state.it;

// Putting this here means not having to do it explicitly
// below between prev_prop and prop (and transitively,
// between prop and next_prop).
state = skip(state, first);

// WB6
if (detail::ah_letter(state.caps[ph::prev].prop) &&
detail::mid_ah(state.caps[ph::curr].prop) &&
detail::ah_letter(state.caps[ph::next].prop)) {
continue;
}

// WB7
if (detail::mid_ah(state.caps[ph::prev].prop) &&
detail::ah_letter(state.caps[ph::curr].prop) &&
state.it != first) {
auto const temp_state = skip(detail::prev(state), first);
if (detail::ah_letter(temp_state.caps[ph::prev].prop))
continue;
}

// WB7b
if (state.caps[ph::prev].prop == word_property::Hebrew_Letter &&
state.caps[ph::curr].prop == word_property::Double_Quote &&
state.caps[ph::next].prop == word_property::Hebrew_Letter) {
continue;
}

// WB7c
if (state.caps[ph::prev].prop == word_property::Double_Quote &&
state.caps[ph::curr].prop == word_property::Hebrew_Letter &&
state.it != first) {
auto const temp_state = skip(detail::prev(state), first);
if (temp_state.caps[ph::prev].prop ==
word_property::Hebrew_Letter)
continue;
}

// WB11
if (detail::mid_num(state.caps[ph::prev].prop) &&
state.caps[ph::curr].prop == word_property::Numeric &&
state.it != first) {
auto const temp_state = skip(detail::prev(state), first);
if (temp_state.caps[ph::prev].prop ==
word_property::Numeric)
continue;
}

// WB12
if (state.caps[ph::prev].prop == word_property::Numeric &&
detail::mid_num(state.caps[ph::curr].prop) &&
state.caps[ph::next].prop == word_property::Numeric) {
continue;
}

if (state.emoji_state ==
detail::word_break_emoji_state_t::first_emoji) {
if (state.caps[ph::prev].prop ==
word_property::Regional_Indicator) {
state.emoji_state =
detail::word_break_emoji_state_t::second_emoji;
return after_skip_it;
} else {
state.emoji_state =
detail::word_break_emoji_state_t::none;
}
} else if (
state.emoji_state ==
detail::word_break_emoji_state_t::second_emoji &&
state.caps[ph::prev].prop ==
word_property::Regional_Indicator) {
state.emoji_state =
detail::word_break_emoji_state_t::first_emoji;
continue;
}

if (detail::table_word_break(
state.caps[ph::prev].prop, state.caps[ph::curr].prop))
return after_skip_it;
}

return first;
}

Expand Down

0 comments on commit e43d784

Please sign in to comment.