wip

iresearch-toolkit · Sep 20, 2023 · e43d784 · e43d784
1 parent 791c2fd
commit e43d784
Show file tree

Hide file tree

Showing 2 changed files with 329 additions and 3 deletions.
diff --git a/core/analysis/segmentation_token_stream.cpp b/core/analysis/segmentation_token_stream.cpp
@@ -328,11 +328,10 @@ bool segmentation_token_stream::next() {
  const auto begin = gr_begin.base();
  const auto end = gr_end.base();
 
- const size_t length =
+ const auto length =
  static_cast<size_t>(std::distance(begin.base(), end.base()));
 
- if (!length) {
- // eof
+ if (length == 0) { // eof
  return false;
  }
 
@@ -355,6 +354,7 @@ bool segmentation_token_stream::next() {
  term.value = {reinterpret_cast<const byte_type*>(&(*begin.base())),
  length};
  break;
+ // TODO(MBkkt) do we need to call as_graphemes? Feels like no
  case options_t::case_convert_t::LOWER:
  term_buf_.clear();
  to_lower(as_graphemes(begin, end), from_utf32_back_inserter(term_buf_));

diff --git a/external/text/include/boost/text/word_break.hpp b/external/text/include/boost/text/word_break.hpp
@@ -364,6 +364,332 @@ static constexpr std::array<std::array<bool, 20>, 20> word_breaks = {{
  WordPropFunc const & word_prop = WordPropFunc{},
  WordBreakFunc const & word_break = WordBreakFunc{}) noexcept
  {
+ using detail::ph;
+ using detail::cp_and_word_prop;
+
+ if (it == first)
+ return it;
+
+ if (it == last && --it == first)
+ return it;
+
+ detail::word_break_state<CPIter> state;
+
+ state.it = it;
+
+ state.caps[ph::curr] = cp_and_word_prop(*state.it, word_prop);
+
+ // Since word_break is evaluated unconditionally before the other
+ // rules, we need to do all this here before the special-casing
+ // below.
+ if (it != first) {
+ state.caps[ph::prev] =
+ cp_and_word_prop(*std::prev(state.it), word_prop);
+ if (std::prev(state.it) != first) {
+ state.caps[ph::prev_prev] =
+ cp_and_word_prop(*std::prev(state.it, 2), word_prop);
+ }
+ }
+ if (std::next(state.it) != last) {
+ state.caps[ph::next] =
+ cp_and_word_prop(*std::next(state.it), word_prop);
+ if (std::next(state.it, 2) != last) {
+ state.caps[ph::next_next] =
+ cp_and_word_prop(*std::next(state.it, 2), word_prop);
+ }
+ }
+ if (word_break(
+ state.caps[ph::prev_prev].cp,
+ state.caps[ph::prev].cp,
+ state.caps[ph::curr].cp,
+ state.caps[ph::next].cp,
+ state.caps[ph::next_next].cp)) {
+ return state.it;
+ }
+
+ // Special case: If state.caps[ph::curr].prop is skippable, we
+ // need to skip backward until we find a non-skippable.
+ if (detail::skippable(state.caps[ph::curr].prop)) {
+ auto const prev = boost::text::find_if_not_backward(
+ first, it, [word_prop](uint32_t cp) {
+ return detail::skippable(word_prop(cp));
+ });
+ if (prev != it) {
+ state.it = prev;
+ state.caps[ph::next] =
+ cp_and_word_prop(*std::next(state.it), word_prop);
+ state.caps[ph::curr] =
+ cp_and_word_prop(*state.it, word_prop);
+
+ // If we end up on a non-skippable that should break
+ // before the skippable(s) we just moved over, break on
+ // the last skippable.
+ if (!detail::skippable(state.caps[ph::curr].prop) &&
+ detail::table_word_break(
+ state.caps[ph::curr].prop,
+ state.caps[ph::next].prop)) {
+ return ++state.it;
+ }
+ if (state.it == first)
+ return first;
+ }
+ }
+
+ state.caps[ph::prev_prev] = cp_and_word_prop();
+ if (std::prev(state.it) != first) {
+ state.caps[ph::prev_prev] =
+ cp_and_word_prop(*std::prev(state.it, 2), word_prop);
+ }
+ state.caps[ph::prev] =
+ cp_and_word_prop(*std::prev(state.it), word_prop);
+ state.caps[ph::next] = cp_and_word_prop();
+ state.caps[ph::next_next] = cp_and_word_prop();
+ if (std::next(state.it) != last) {
+ state.caps[ph::next] =
+ cp_and_word_prop(*std::next(state.it), word_prop);
+ if (std::next(state.it, 2) != last) {
+ state.caps[ph::next_next] =
+ cp_and_word_prop(*std::next(state.it, 2), word_prop);
+ }
+ }
+
+ // Since 'it' may be anywhere within the word in which it sits,
+ // we need to look forward to make sure that next_prop and
+ // next_next_prop don't point to skippables.
+ {
+ if (std::next(state.it) != last) {
+ auto temp_state = state;
+ temp_state = detail::next(temp_state);
+ temp_state = detail::skip_forward(
+ temp_state, first, last, word_prop);
+ if (temp_state.it == last) {
+ state.caps[ph::next] = cp_and_word_prop();
+ state.caps[ph::next_next] = cp_and_word_prop();
+ } else {
+ state.caps[ph::next] = temp_state.caps[ph::curr];
+ if (std::next(temp_state.it) != last) {
+ temp_state = detail::next(temp_state);
+ temp_state = detail::skip_forward(
+ temp_state, first, last, word_prop);
+ if (temp_state.it == last) {
+ state.caps[ph::next_next] = cp_and_word_prop();
+ } else {
+ state.caps[ph::next_next] =
+ temp_state.caps[ph::curr];
+ }
+ }
+ }
+ }
+ }
+
+ state.emoji_state = detail::word_break_emoji_state_t::none;
+
+ // WB4: Except after line breaks, ignore/skip (Extend | Format |
+ // ZWJ)*
+ auto skip = [word_prop](
+ detail::word_break_state<CPIter> state,
+ CPIter first) {
+ if (detail::skippable(state.caps[ph::prev].prop)) {
+ auto temp_it = boost::text::find_if_not_backward(
+ first, state.it, [word_prop](uint32_t cp) {
+ return detail::skippable(word_prop(cp));
+ });
+ if (temp_it == state.it)
+ return state;
+ auto temp_prev_cap = cp_and_word_prop(*temp_it, word_prop);
+ if (!detail::linebreak(temp_prev_cap.prop)) {
+ state.it = temp_it;
+ state.it_points_to_prev = true;
+ state.caps[ph::prev] = temp_prev_cap;
+ if (temp_it == first) {
+ state.caps[ph::prev_prev] = cp_and_word_prop();
+ } else {
+ state.caps[ph::prev_prev] = cp_and_word_prop(
+ *std::prev(temp_it), word_prop);
+ }
+ }
+ }
+ return state;
+ };
+
+ for (; state.it != first; state = detail::prev(state)) {
+ if (std::prev(state.it) != first) {
+ state.caps[ph::prev_prev] =
+ cp_and_word_prop(*std::prev(state.it, 2), word_prop);
+ } else {
+ state.caps[ph::prev_prev] = cp_and_word_prop();
+ }
+
+ // Check word_break before anything else.
+ if (word_break(
+ state.caps[ph::prev_prev].cp,
+ state.caps[ph::prev].cp,
+ state.caps[ph::curr].cp,
+ state.caps[ph::next].cp,
+ state.caps[ph::next_next].cp)) {
+ return state.it;
+ }
+
+ // When we see an RI, back up to the first RI so we can see
+ // what emoji state we're supposed to be in here.
+ if (state.emoji_state ==
+ detail::word_break_emoji_state_t::none &&
+ state.caps[ph::curr].prop ==
+ word_property::Regional_Indicator) {
+ auto temp_state = state;
+ int ris_before = 0;
+ while (temp_state.it != first) {
+ temp_state = skip(temp_state, first);
+ if (temp_state.it == first) {
+ if (temp_state.caps[ph::prev].prop ==
+ word_property::Regional_Indicator) {
+ ++ris_before;
+ }
+ break;
+ }
+ if (temp_state.caps[ph::prev].prop ==
+ word_property::Regional_Indicator) {
+ temp_state = detail::prev(temp_state);
+ if (temp_state.it != first &&
+ std::prev(temp_state.it) != first) {
+ temp_state.caps[ph::prev_prev] =
+ cp_and_word_prop(
+ *std::prev(temp_state.it, 2),
+ word_prop);
+ } else {
+ temp_state.caps[ph::prev_prev] =
+ cp_and_word_prop();
+ }
+ ++ris_before;
+ } else {
+ break;
+ }
+ }
+ state.emoji_state =
+ (ris_before % 2 == 0)
+ ? detail::word_break_emoji_state_t::first_emoji
+ : detail::word_break_emoji_state_t::second_emoji;
+ }
+
+ // WB3
+ if (state.caps[ph::prev].prop == word_property::CR &&
+ state.caps[ph::curr].prop == word_property::LF) {
+ continue;
+ }
+
+ // WB3a
+ if (state.caps[ph::prev].prop == word_property::CR ||
+ state.caps[ph::prev].prop == word_property::LF ||
+ state.caps[ph::prev].prop == word_property::Newline) {
+ return state.it;
+ }
+
+ // WB3b
+ if (state.caps[ph::curr].prop == word_property::CR ||
+ state.caps[ph::curr].prop == word_property::LF ||
+ state.caps[ph::curr].prop == word_property::Newline) {
+ return state.it;
+ }
+
+ // WB3c
+ if (state.caps[ph::prev].prop == word_property::ZWJ &&
+ state.caps[ph::curr].prop == word_property::ExtPict) {
+ continue;
+ }
+
+ // WB3d
+ if (state.caps[ph::prev].prop == word_property::WSegSpace &&
+ state.caps[ph::curr].prop == word_property::WSegSpace) {
+ continue;
+ }
+
+ // If we end up breaking during this iteration, we want the
+ // break to show up after the skip, so that the skippable
+ // CPs go with the CP before them. This is to maintain
+ // symmetry with next_word_break().
+ auto after_skip_it = state.it;
+
+ // Putting this here means not having to do it explicitly
+ // below between prev_prop and prop (and transitively,
+ // between prop and next_prop).
+ state = skip(state, first);
+
+ // WB6
+ if (detail::ah_letter(state.caps[ph::prev].prop) &&
+ detail::mid_ah(state.caps[ph::curr].prop) &&
+ detail::ah_letter(state.caps[ph::next].prop)) {
+ continue;
+ }
+
+ // WB7
+ if (detail::mid_ah(state.caps[ph::prev].prop) &&
+ detail::ah_letter(state.caps[ph::curr].prop) &&
+ state.it != first) {
+ auto const temp_state = skip(detail::prev(state), first);
+ if (detail::ah_letter(temp_state.caps[ph::prev].prop))
+ continue;
+ }
+
+ // WB7b
+ if (state.caps[ph::prev].prop == word_property::Hebrew_Letter &&
+ state.caps[ph::curr].prop == word_property::Double_Quote &&
+ state.caps[ph::next].prop == word_property::Hebrew_Letter) {
+ continue;
+ }
+
+ // WB7c
+ if (state.caps[ph::prev].prop == word_property::Double_Quote &&
+ state.caps[ph::curr].prop == word_property::Hebrew_Letter &&
+ state.it != first) {
+ auto const temp_state = skip(detail::prev(state), first);
+ if (temp_state.caps[ph::prev].prop ==
+ word_property::Hebrew_Letter)
+ continue;
+ }
+
+ // WB11
+ if (detail::mid_num(state.caps[ph::prev].prop) &&
+ state.caps[ph::curr].prop == word_property::Numeric &&
+ state.it != first) {
+ auto const temp_state = skip(detail::prev(state), first);
+ if (temp_state.caps[ph::prev].prop ==
+ word_property::Numeric)
+ continue;
+ }
+
+ // WB12
+ if (state.caps[ph::prev].prop == word_property::Numeric &&
+ detail::mid_num(state.caps[ph::curr].prop) &&
+ state.caps[ph::next].prop == word_property::Numeric) {
+ continue;
+ }
+
+ if (state.emoji_state ==
+ detail::word_break_emoji_state_t::first_emoji) {
+ if (state.caps[ph::prev].prop ==
+ word_property::Regional_Indicator) {
+ state.emoji_state =
+ detail::word_break_emoji_state_t::second_emoji;
+ return after_skip_it;
+ } else {
+ state.emoji_state =
+ detail::word_break_emoji_state_t::none;
+ }
+ } else if (
+ state.emoji_state ==
+ detail::word_break_emoji_state_t::second_emoji &&
+ state.caps[ph::prev].prop ==
+ word_property::Regional_Indicator) {
+ state.emoji_state =
+ detail::word_break_emoji_state_t::first_emoji;
+ continue;
+ }
+
+ if (detail::table_word_break(
+ state.caps[ph::prev].prop, state.caps[ph::curr].prop))
+ return after_skip_it;
+ }
+
  return first;
  }