From 6472fa7bdd7b9b14f157e4f3b9d3a8b8817f2619 Mon Sep 17 00:00:00 2001
From: Valery Mironov <32071355+MBkkt@users.noreply.github.com>
Date: Thu, 14 Sep 2023 18:44:36 +0200
Subject: [PATCH 1/3] Speedup segmentation analyzer

---
 CMakeLists.txt                                |   8 +
 core/analysis/segmentation_token_stream.cpp   |   6 +-
 core/formats/columnstore.cpp                  |  13 +-
 core/index/field_data.cpp                     |  32 +--
 core/index/field_data.hpp                     |  13 +-
 core/index/postings.cpp                       |  59 +++--
 core/index/postings.hpp                       |  40 ++--
 core/utils/hash_set_utils.hpp                 |  39 ++--
 .../include/boost/text/grapheme_break.hpp     |  22 +-
 .../text/include/boost/text/word_break.hpp    | 204 +++++++-----------
 10 files changed, 196 insertions(+), 240 deletions(-)
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 3f4e88d4d..19d2461df 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -97,6 +97,14 @@ endif ()
 if (CMAKE_BUILD_TYPE MATCHES "Coverage")
   set(IRESEARCH_COVERAGE ON)
   set(CMAKE_BUILD_TYPE "Debug")
+elseif (CMAKE_BUILD_TYPE MATCHES "Profile")
+  set(CMAKE_BUILD_TYPE "Release")
+  add_compile_options(
+    -g 
+    -fno-omit-frame-pointer
+  # -fno-inline
+  # -fno-optimize-sibling-calls
+  )
 endif ()
 
 add_option_gprof(FALSE)
diff --git a/core/analysis/segmentation_token_stream.cpp b/core/analysis/segmentation_token_stream.cpp
index 29bb9324d..6c0f81932 100644
--- a/core/analysis/segmentation_token_stream.cpp
+++ b/core/analysis/segmentation_token_stream.cpp
@@ -328,11 +328,10 @@ bool segmentation_token_stream::next() {
     const auto begin = gr_begin.base();
     const auto end = gr_end.base();
 
-    const size_t length =
+    const auto length =
       static_cast<size_t>(std::distance(begin.base(), end.base()));
 
-    if (!length) {
-      // eof
+    if (length == 0) {  // eof
       return false;
     }
 
@@ -355,6 +354,7 @@ bool segmentation_token_stream::next() {
         term.value = {reinterpret_cast<const byte_type*>(&(*begin.base())),
                       length};
         break;
+        // TODO(MBkkt) do we need to call as_graphemes? Feels like no
       case options_t::case_convert_t::LOWER:
         term_buf_.clear();
         to_lower(as_graphemes(begin, end), from_utf32_back_inserter(term_buf_));
diff --git a/core/formats/columnstore.cpp b/core/formats/columnstore.cpp
index f522683ed..4e26296c8 100644
--- a/core/formats/columnstore.cpp
+++ b/core/formats/columnstore.cpp
@@ -256,21 +256,22 @@ void read_compact(irs::index_input& in, irs::encryption::stream* cipher,
   }
 }
 
-struct column_ref_eq : value_ref_eq<column_meta*> {
-  using self_t::operator();
+struct ColumnMetaEq : ValueRefEq<column_meta*> {
+  using is_transparent = void;
+  using Self::operator();
 
-  bool operator()(const ref_t& lhs,
+  bool operator()(const Ref& lhs,
                   const hashed_string_view& rhs) const noexcept {
-    return lhs.second->name == rhs;
+    return lhs.ref->name == rhs;
   }
 
   bool operator()(const hashed_string_view& lhs,
-                  const ref_t& rhs) const noexcept {
+                  const Ref& rhs) const noexcept {
     return this->operator()(rhs, lhs);
   }
 };
 
-using name_to_column_map = flat_hash_set<column_ref_eq>;
+using name_to_column_map = flat_hash_set<ColumnMetaEq>;
 
 class meta_writer final {
  public:
diff --git a/core/index/field_data.cpp b/core/index/field_data.cpp
index 06906f7d9..e4fb2de5b 100644
--- a/core/index/field_data.cpp
+++ b/core/index/field_data.cpp
@@ -1054,23 +1054,23 @@ bool field_data::invert(token_stream& stream, doc_id_t id) {
       last_start_offs_ = start_offset;
     }
 
-    const auto res = terms_.emplace(term->value);
-
-    if (nullptr == res.first) {
-      IRS_LOG_WARN(absl::StrCat("skipping too long term of size '",
-                                term->value.size(), "' in field '", meta_.name,
-                                "'"));
-      IRS_LOG_TRACE(absl::StrCat("field '", meta_.name,
-                                 "' contains too long term '",
-                                 ViewCast<char>(term->value), "'"));
+    auto* posting = terms_.emplace(term->value);
+
+    if (posting == nullptr) {
+      IRS_LOG_WARN(absl::StrCat("skipping too long term of size: ",
+                                term->value.size(), " in field: ", meta_.name));
+      IRS_LOG_TRACE(
+        absl::StrCat("field: ", meta_.name,
+                     " contains too long term: ", ViewCast<char>(term->value)));
       continue;
     }
 
-    (this->*proc_table_[size_t(res.second)])(*res.first, id, pay, offs);
+    (this->*proc_table_[posting->doc == doc_limits::invalid()])(*posting, id,
+                                                                pay, offs);
 
     if (0 == ++stats_.len) {
-      IRS_LOG_ERROR(absl::StrCat("too many tokens in field '", meta_.name,
-                                 "', document '", id, "'"));
+      IRS_LOG_ERROR(absl::StrCat("too many tokens in field: ", meta_.name,
+                                 ", document: ", id));
       return false;
     }
 
@@ -1108,12 +1108,12 @@ field_data* fields_data::emplace(const hashed_string_view& name,
 
   auto it = fields_map_.lazy_emplace(
     name, [&name](const fields_map::constructor& ctor) {
-      ctor(name.hash(), nullptr);
+      ctor(nullptr, name.hash());
     });
 
-  if (!it->second) {
+  if (!it->ref) {
     try {
-      const_cast<field_data*&>(it->second) = &fields_.emplace_back(
+      const_cast<field_data*&>(it->ref) = &fields_.emplace_back(
         name, features, *feature_info_, *cached_columns_, *cached_features_,
         columns, byte_writer_, int_writer_, index_features,
         (nullptr != comparator_));
@@ -1123,7 +1123,7 @@ field_data* fields_data::emplace(const hashed_string_view& name,
     }
   }
 
-  return it->second;
+  return it->ref;
 }
 
 void fields_data::flush(field_writer& fw, flush_state& state) {
diff --git a/core/index/field_data.hpp b/core/index/field_data.hpp
index 8da273d0e..9b3af90b3 100644
--- a/core/index/field_data.hpp
+++ b/core/index/field_data.hpp
@@ -198,21 +198,22 @@ class field_data : util::noncopyable {
 
 class fields_data : util::noncopyable {
  private:
-  struct field_ref_eq : value_ref_eq<field_data*> {
-    using self_t::operator();
+  struct FieldEq : ValueRefEq<field_data*> {
+    using is_transparent = void;
+    using Self::operator();
 
-    bool operator()(const ref_t& lhs,
+    bool operator()(const Ref& lhs,
                     const hashed_string_view& rhs) const noexcept {
-      return lhs.second->meta().name == rhs;
+      return lhs.ref->meta().name == rhs;
     }
 
     bool operator()(const hashed_string_view& lhs,
-                    const ref_t& rhs) const noexcept {
+                    const Ref& rhs) const noexcept {
       return this->operator()(rhs, lhs);
     }
   };
 
-  using fields_map = flat_hash_set<field_ref_eq>;
+  using fields_map = flat_hash_set<FieldEq>;
 
  public:
   using postings_ref_t = std::vector<const posting*>;
diff --git a/core/index/postings.cpp b/core/index/postings.cpp
index b24c61f42..577bbd479 100644
--- a/core/index/postings.cpp
+++ b/core/index/postings.cpp
@@ -34,12 +34,12 @@ namespace irs {
 
 void postings::get_sorted_postings(
   std::vector<const posting*>& postings) const {
-  postings.resize(map_.size());
+  IRS_ASSERT(terms_.size() == postings_.size());
 
-  auto begin = postings.begin();
-  for (auto& entry : map_) {
-    *begin = &postings_[entry.second];
-    ++begin;
+  postings.resize(postings_.size());
+
+  for (auto* p = postings.data(); const auto& posting : postings_) {
+    *p++ = &posting;
   }
 
   std::sort(postings.begin(), postings.end(),
@@ -48,20 +48,20 @@ void postings::get_sorted_postings(
             });
 }
 
-std::pair<posting*, bool> postings::emplace(bytes_view term) {
+posting* postings::emplace(bytes_view term) {
   REGISTER_TIMER_DETAILED();
   auto& parent = writer_.parent();
 
   // maximum number to bytes needed for storage of term length and data
-  const auto max_term_len = term.size();  // + vencode_size(term.size());
+  const auto term_size = term.size();  // + vencode_size(term.size());
 
-  if (writer_t::container::block_type::SIZE < max_term_len) {
+  if (writer_t::container::block_type::SIZE < term_size) {
     // TODO: maybe move big terms it to a separate storage
     // reject terms that do not fit in a block
-    return std::make_pair(nullptr, false);
+    return nullptr;
   }
 
-  const auto slice_end = writer_.pool_offset() + max_term_len;
+  const auto slice_end = writer_.pool_offset() + term_size;
   const auto next_block_start =
     writer_.pool_offset() < parent.value_count()
       ? writer_.position().block_offset() +
@@ -74,34 +74,31 @@ std::pair<posting*, bool> postings::emplace(bytes_view term) {
   }
 
   IRS_ASSERT(size() < doc_limits::eof());  // not larger then the static flag
-  IRS_ASSERT(map_.size() == postings_.size());
+  IRS_ASSERT(terms_.size() == postings_.size());
 
   const hashed_bytes_view hashed_term{term};
 
   bool is_new = false;
-  const auto it = map_.lazy_emplace(
-    hashed_term, [&is_new, hash = hashed_term.hash(),
-                  id = map_.size()](const map_t::constructor& ctor) {
+  const auto it = terms_.lazy_emplace(
+    hashed_term, [&, size = terms_.size()](const auto& ctor) {
+      ctor(size, hashed_term.hash());
       is_new = true;
-      ctor(hash, id);
     });
-
-  if (is_new) {
-    // for new terms also write out their value
-    try {
-      writer_.write(term.data(), term.size());
-      postings_.emplace_back();
-    } catch (...) {
-      // we leave some garbage in block pool
-      map_.erase(it);
-      throw;
-    }
-
-    postings_.back().term = {(writer_.position() - term.size()).buffer(),
-                             term.size()};
+  if (IRS_LIKELY(!is_new)) {
+    return &postings_[it->ref];
+  }
+  // for new terms also write out their value
+  try {
+    auto* start = writer_.position().buffer();
+    writer_.write(term.data(), term_size);
+    IRS_ASSERT(start == (writer_.position() - term_size).buffer());
+    auto& posting = postings_.emplace_back(start, term_size);
+    return &posting;
+  } catch (...) {
+    // we leave some garbage in block pool
+    terms_.erase(it);
+    throw;
   }
-
-  return {&postings_[it->second], is_new};
 }
 
 }  // namespace irs
diff --git a/core/index/postings.hpp b/core/index/postings.hpp
index 621cad99f..5ec06594a 100644
--- a/core/index/postings.hpp
+++ b/core/index/postings.hpp
@@ -29,6 +29,7 @@
 #include "utils/hash_utils.hpp"
 #include "utils/noncopyable.hpp"
 #include "utils/string.hpp"
+#include "utils/type_limits.hpp"
 
 namespace irs {
 
@@ -54,6 +55,9 @@ using byte_block_pool =
   block_pool<byte_type, 32768, ManagedTypedAllocator<byte_type>>;
 
 struct posting {
+  explicit posting(const byte_type* data, size_t size) noexcept
+    : term{data, size} {}
+
   bytes_view term;
   uint64_t doc_code;
   // ...........................................................................
@@ -64,7 +68,7 @@ struct posting {
   // [3] - pointer to prox stream begin
   // ...........................................................................
   size_t int_start;
-  doc_id_t doc;
+  doc_id_t doc{doc_limits::invalid()};
   uint32_t freq;
   uint32_t pos;
   uint32_t offs{0};
@@ -77,10 +81,10 @@ class postings : util::noncopyable {
 
   // cppcheck-suppress constParameter
   explicit postings(writer_t& writer)
-    : map_{0, value_ref_hash{}, term_id_eq{postings_}}, writer_(writer) {}
+    : terms_{0, ValueRefHash{}, TermEq{postings_}}, writer_(writer) {}
 
   void clear() noexcept {
-    map_.clear();
+    terms_.clear();
     postings_.clear();
   }
 
@@ -88,29 +92,28 @@ class postings : util::noncopyable {
   /// sorted order
   void get_sorted_postings(std::vector<const posting*>& postings) const;
 
-  /// @note on error returns std::ptr(nullptr, false)
+  /// @note on error returns nullptr
   /// @note returned poitern remains valid until the next call
-  std::pair<posting*, bool> emplace(bytes_view term);
+  posting* emplace(bytes_view term);
 
-  bool empty() const noexcept { return map_.empty(); }
-  size_t size() const noexcept { return map_.size(); }
+  bool empty() const noexcept { return terms_.empty(); }
+  size_t size() const noexcept { return terms_.size(); }
 
  private:
-  class term_id_eq : public value_ref_eq<size_t> {
-   public:
-    explicit term_id_eq(const std::vector<posting>& data) noexcept
-      : data_(&data) {}
+  struct TermEq : ValueRefEq<size_t> {
+    using is_transparent = void;
+    using Self::operator();
 
-    using self_t::operator();
+    explicit TermEq(const std::vector<posting>& data) noexcept : data_{&data} {}
 
-    bool operator()(const ref_t& lhs,
+    bool operator()(const Ref& lhs,
                     const hashed_bytes_view& rhs) const noexcept {
-      IRS_ASSERT(lhs.second < data_->size());
-      return (*data_)[lhs.second].term == rhs;
+      IRS_ASSERT(lhs.ref < data_->size());
+      return (*data_)[lhs.ref].term == rhs;
     }
 
     bool operator()(const hashed_bytes_view& lhs,
-                    const ref_t& rhs) const noexcept {
+                    const Ref& rhs) const noexcept {
       return this->operator()(rhs, lhs);
     }
 
@@ -118,10 +121,9 @@ class postings : util::noncopyable {
     const std::vector<posting>* data_;
   };
 
-  using map_t = flat_hash_set<term_id_eq>;
-
+  // TODO(MBkkt) Maybe just flat_hash_set<unique_ptr<posting>>?
   std::vector<posting> postings_;
-  map_t map_;
+  flat_hash_set<TermEq> terms_;
   writer_t& writer_;
 };
 
diff --git a/core/utils/hash_set_utils.hpp b/core/utils/hash_set_utils.hpp
index 236e02cee..40091e5ea 100644
--- a/core/utils/hash_set_utils.hpp
+++ b/core/utils/hash_set_utils.hpp
@@ -28,22 +28,20 @@
 
 namespace irs {
 
-////////////////////////////////////////////////////////////////////////////////
-/// @brief first - hash value, second - reference
-////////////////////////////////////////////////////////////////////////////////
 template<typename T>
-using value_ref_t = std::pair<size_t, T>;
+struct ValueRef {
+  explicit ValueRef(T ref, size_t hash) : ref{ref}, hash{hash} {}
 
-////////////////////////////////////////////////////////////////////////////////
-/// @struct transparent hash for value_ref_t
-////////////////////////////////////////////////////////////////////////////////
-class value_ref_hash {
- public:
+  T ref;
+  size_t hash;
+};
+
+struct ValueRefHash {
   using is_transparent = void;
 
   template<typename T>
-  size_t operator()(const value_ref_t<T>& value) const noexcept {
-    return value.first;
+  size_t operator()(const ValueRef<T>& value) const noexcept {
+    return value.hash;
   }
 
   template<typename Char>
@@ -53,19 +51,13 @@ class value_ref_hash {
   }
 };
 
-////////////////////////////////////////////////////////////////////////////////
-/// @struct transparent equality comparator for value_ref_t
-////////////////////////////////////////////////////////////////////////////////
 template<typename T>
-struct value_ref_eq {
-  using is_transparent = void;
-
-  using self_t = value_ref_eq<T>;
-  using ref_t = value_ref_t<T>;
-  using value_t = typename ref_t::second_type;
+struct ValueRefEq {
+  using Self = ValueRefEq<T>;
+  using Ref = ValueRef<T>;
 
-  bool operator()(const ref_t& lhs, const ref_t& rhs) const noexcept {
-    return lhs.second == rhs.second;
+  bool operator()(const Ref& lhs, const Ref& rhs) const noexcept {
+    return lhs.ref == rhs.ref;
   }
 };
 
@@ -74,7 +66,6 @@ struct value_ref_eq {
 ///        rehash may still happen even if enough space was allocated
 ////////////////////////////////////////////////////////////////////////////////
 template<typename Eq>
-using flat_hash_set =
-  absl::flat_hash_set<typename Eq::ref_t, value_ref_hash, Eq>;
+using flat_hash_set = absl::flat_hash_set<typename Eq::Ref, ValueRefHash, Eq>;
 
 }  // namespace irs
diff --git a/external/text/include/boost/text/grapheme_break.hpp b/external/text/include/boost/text/grapheme_break.hpp
index 19790ca25..d6256c891 100644
--- a/external/text/include/boost/text/grapheme_break.hpp
+++ b/external/text/include/boost/text/grapheme_break.hpp
@@ -110,6 +110,7 @@ namespace boost { namespace text {
         template<typename CPIter>
         struct grapheme_break_state
         {
+            CPIter it_prev;
             CPIter it;
 
             grapheme_property prev_prop;
@@ -121,7 +122,7 @@ namespace boost { namespace text {
         template<typename CPIter>
         grapheme_break_state<CPIter> next(grapheme_break_state<CPIter> state)
         {
-            ++state.it;
+            state.it_prev = state.it++;
             state.prev_prop = state.prop;
             return state;
         }
@@ -259,12 +260,12 @@ constexpr std::array<std::array<bool, 15>, 15> grapheme_breaks = {{
                 return first;
 
             grapheme_break_state<CPIter> state;
-            state.it = first;
+            state.it = state.it_prev = first;
 
             if (++state.it == last)
                 return state.it;
 
-            state.prev_prop = boost::text::grapheme_prop(*std::prev(state.it));
+            state.prev_prop = boost::text::grapheme_prop(*state.it_prev);
             state.prop = boost::text::grapheme_prop(*state.it);
 
             state.emoji_state =
@@ -278,22 +279,17 @@ constexpr std::array<std::array<bool, 15>, 15> grapheme_breaks = {{
                 // GB11
                 if (state.prev_prop == grapheme_property::ZWJ &&
                     state.prop == grapheme_property::ExtPict &&
-                    detail::gb11_prefix(first, std::prev(state.it))) {
+                    detail::gb11_prefix(first, state.it_prev)) {
                     continue;
                 }
 
-                if (state.emoji_state ==
-                    grapheme_break_emoji_state_t::first_emoji) {
+                if (state.emoji_state == grapheme_break_emoji_state_t::first_emoji) {
+                    state.emoji_state = grapheme_break_emoji_state_t::none;
                     if (state.prop == grapheme_property::Regional_Indicator) {
-                        state.emoji_state = grapheme_break_emoji_state_t::none;
                         continue;
-                    } else {
-                        state.emoji_state = grapheme_break_emoji_state_t::none;
                     }
-                } else if (
-                    state.prop == grapheme_property::Regional_Indicator) {
-                    state.emoji_state =
-                        grapheme_break_emoji_state_t::first_emoji;
+                } else if (state.prop == grapheme_property::Regional_Indicator) {
+                    state.emoji_state = grapheme_break_emoji_state_t::first_emoji;
                 }
 
                 if (detail::table_grapheme_break(state.prev_prop, state.prop))
diff --git a/external/text/include/boost/text/word_break.hpp b/external/text/include/boost/text/word_break.hpp
index 1f34cb149..02661d030 100644
--- a/external/text/include/boost/text/word_break.hpp
+++ b/external/text/include/boost/text/word_break.hpp
@@ -19,11 +19,11 @@
 
 namespace boost { namespace text {
     /** The word properties defined by Unicode. */
-    enum word_property {
-        Other,
+    enum word_property : uint32_t {
         CR,
         LF,
         Newline,
+        Other,
         Katakana,
         ALetter,
         MidLetter,
@@ -39,7 +39,7 @@ namespace boost { namespace text {
         WSegSpace,
         Format,
         Extend,
-        ZWJ
+        ZWJ,
     };
 
     namespace detail {
@@ -141,14 +141,12 @@ namespace boost { namespace text {
     namespace detail {
         inline bool skippable(word_property prop) noexcept
         {
-            return prop == word_property::Extend ||
-                   prop == word_property::Format || prop == word_property::ZWJ;
+            return word_property::Format <= prop;
         }
 
         inline bool linebreak(word_property prop) noexcept
         {
-            return prop == word_property::CR || prop == word_property::LF ||
-                   prop == word_property::Newline;
+            return prop <= word_property::Newline;
         }
 
         inline bool ah_letter(word_property prop) noexcept
@@ -204,21 +202,29 @@ namespace boost { namespace text {
         struct word_break_state
         {
             word_break_state() {}
-
             CPIter it;
+            CPIter it_next;
+
             bool it_points_to_prev = false;
 
-            std::array<cp_and_word_prop, 5> caps;
+            std::array<cp_and_word_prop, 4> caps;
 
             word_break_emoji_state_t emoji_state;
         };
 
         template<typename CPIter>
-        word_break_state<CPIter> next(word_break_state<CPIter> state)
+        word_break_state<CPIter> next(word_break_state<CPIter> state, CPIter last)
         {
-            ++state.it;
-            std::copy(
-                state.caps.begin() + 1, state.caps.end(), state.caps.begin());
+            state.caps[0] = state.caps[1];
+            state.caps[1] = state.caps[2];
+            state.caps[2] = state.caps[3];
+            state.caps[3] = cp_and_word_prop{};
+
+            state.it = state.it_next;
+            if (state.it_next != last) {
+                ++state.it_next;
+            }
+
             return state;
         }
 
@@ -240,28 +246,28 @@ namespace boost { namespace text {
 
             // clang-format off
 // See chart at http://www.unicode.org/Public/UCD/latest/ucd/auxiliary/WordBreakTest.html.
-constexpr std::array<std::array<bool, 20>, 20> word_breaks = {{
-// Other CR LF NL Ktk AL ML MN MNL Num ENL RI HL DQ SQ EP WSSp Fmt Extd ZWJ
-    {{1, 1, 1, 1, 1,  1, 1, 1, 1,  1,  1,  1, 1, 1, 1, 1, 1,   0,  0,   0}}, // Other
-    {{1, 1, 0, 1, 1,  1, 1, 1, 1,  1,  1,  1, 1, 1, 1, 1, 1,   1,  1,   1}}, // CR
-    {{1, 1, 1, 1, 1,  1, 1, 1, 1,  1,  1,  1, 1, 1, 1, 1, 1,   1,  1,   1}}, // LF
-    {{1, 1, 1, 1, 1,  1, 1, 1, 1,  1,  1,  1, 1, 1, 1, 1, 1,   1,  1,   1}}, // Newline
-    {{1, 1, 1, 1, 0,  1, 1, 1, 1,  1,  0,  1, 1, 1, 1, 1, 1,   0,  0,   0}}, // Katakana
-    {{1, 1, 1, 1, 1,  0, 1, 1, 1,  0,  0,  1, 0, 1, 1, 1, 1,   0,  0,   0}}, // ALetter
-    {{1, 1, 1, 1, 1,  1, 1, 1, 1,  1,  1,  1, 1, 1, 1, 1, 1,   0,  0,   0}}, // MidLetter
-    {{1, 1, 1, 1, 1,  1, 1, 1, 1,  1,  1,  1, 1, 1, 1, 1, 1,   0,  0,   0}}, // MidNum
-    {{1, 1, 1, 1, 1,  1, 1, 1, 1,  1,  1,  1, 1, 1, 1, 1, 1,   0,  0,   0}}, // MidNumLet
-    {{1, 1, 1, 1, 1,  0, 1, 1, 1,  0,  0,  1, 0, 1, 1, 1, 1,   0,  0,   0}}, // Numeric
-    {{1, 1, 1, 1, 0,  0, 1, 1, 1,  0,  0,  1, 0, 1, 1, 1, 1,   0,  0,   0}}, // ExtendNumLet
-    {{1, 1, 1, 1, 1,  1, 1, 1, 1,  1,  1,  0, 1, 1, 1, 1, 1,   0,  0,   0}}, // RI
-    {{1, 1, 1, 1, 1,  0, 1, 1, 1,  0,  0,  1, 0, 1, 0, 1, 1,   0,  0,   0}}, // Hebrew_Letter
-    {{1, 1, 1, 1, 1,  1, 1, 1, 1,  1,  1,  1, 1, 1, 1, 1, 1,   0,  0,   0}}, // Double_Quote
-    {{1, 1, 1, 1, 1,  1, 1, 1, 1,  1,  1,  1, 1, 1, 1, 1, 1,   0,  0,   0}}, // Single_Quote
-    {{1, 1, 1, 1, 1,  1, 1, 1, 1,  1,  1,  1, 1, 1, 1, 1, 1,   0,  0,   0}}, // ExtPict
-    {{1, 1, 1, 1, 1,  1, 1, 1, 1,  1,  1,  1, 1, 1, 1, 1, 1,   0,  0,   0}}, // WSegSpace
-    {{1, 1, 1, 1, 1,  1, 1, 1, 1,  1,  1,  1, 1, 1, 1, 1, 1,   0,  0,   0}}, // Format
-    {{1, 1, 1, 1, 1,  1, 1, 1, 1,  1,  1,  1, 1, 1, 1, 1, 1,   0,  0,   0}}, // Extend
-    {{1, 1, 1, 1, 1,  1, 1, 1, 1,  1,  1,  1, 1, 1, 1, 0, 1,   0,  0,   0}}, // ZWJ
+static constexpr std::array<std::array<bool, 20>, 20> word_breaks = {{
+//    CR LF NL Other Ktk AL ML MN MNL Num ENL RI HL DQ SQ EP WSSp Fmt Extd ZWJ 
+    {{1, 0, 1, 1,    1,  1, 1, 1, 1,  1,  1,  1, 1, 1, 1, 1, 1,   1,  1,   1}}, // CR
+    {{1, 1, 1, 1,    1,  1, 1, 1, 1,  1,  1,  1, 1, 1, 1, 1, 1,   1,  1,   1}}, // LF
+    {{1, 1, 1, 1,    1,  1, 1, 1, 1,  1,  1,  1, 1, 1, 1, 1, 1,   1,  1,   1}}, // Newline
+    {{1, 1, 1, 1,    1,  1, 1, 1, 1,  1,  1,  1, 1, 1, 1, 1, 1,   0,  0,   0}}, // Other
+    {{1, 1, 1, 1,    0,  1, 1, 1, 1,  1,  0,  1, 1, 1, 1, 1, 1,   0,  0,   0}}, // Katakana
+    {{1, 1, 1, 1,    1,  0, 1, 1, 1,  0,  0,  1, 0, 1, 1, 1, 1,   0,  0,   0}}, // ALetter
+    {{1, 1, 1, 1,    1,  1, 1, 1, 1,  1,  1,  1, 1, 1, 1, 1, 1,   0,  0,   0}}, // MidLetter
+    {{1, 1, 1, 1,    1,  1, 1, 1, 1,  1,  1,  1, 1, 1, 1, 1, 1,   0,  0,   0}}, // MidNum
+    {{1, 1, 1, 1,    1,  1, 1, 1, 1,  1,  1,  1, 1, 1, 1, 1, 1,   0,  0,   0}}, // MidNumLet
+    {{1, 1, 1, 1,    1,  0, 1, 1, 1,  0,  0,  1, 0, 1, 1, 1, 1,   0,  0,   0}}, // Numeric
+    {{1, 1, 1, 1,    0,  0, 1, 1, 1,  0,  0,  1, 0, 1, 1, 1, 1,   0,  0,   0}}, // ExtendNumLet
+    {{1, 1, 1, 1,    1,  1, 1, 1, 1,  1,  1,  0, 1, 1, 1, 1, 1,   0,  0,   0}}, // RI
+    {{1, 1, 1, 1,    1,  0, 1, 1, 1,  0,  0,  1, 0, 1, 0, 1, 1,   0,  0,   0}}, // Hebrew_Letter
+    {{1, 1, 1, 1,    1,  1, 1, 1, 1,  1,  1,  1, 1, 1, 1, 1, 1,   0,  0,   0}}, // Double_Quote
+    {{1, 1, 1, 1,    1,  1, 1, 1, 1,  1,  1,  1, 1, 1, 1, 1, 1,   0,  0,   0}}, // Single_Quote
+    {{1, 1, 1, 1,    1,  1, 1, 1, 1,  1,  1,  1, 1, 1, 1, 1, 1,   0,  0,   0}}, // ExtPict
+    {{1, 1, 1, 1,    1,  1, 1, 1, 1,  1,  1,  1, 1, 1, 1, 1, 1,   0,  0,   0}}, // WSegSpace
+    {{1, 1, 1, 1,    1,  1, 1, 1, 1,  1,  1,  1, 1, 1, 1, 1, 1,   0,  0,   0}}, // Format
+    {{1, 1, 1, 1,    1,  1, 1, 1, 1,  1,  1,  1, 1, 1, 1, 1, 1,   0,  0,   0}}, // Extend
+    {{1, 1, 1, 1,    1,  1, 1, 1, 1,  1,  1,  1, 1, 1, 1, 0, 1,   0,  0,   0}}, // ZWJ
 }};
             // clang-format on
             auto const lhs_int = static_cast<int>(lhs);
@@ -269,11 +275,10 @@ constexpr std::array<std::array<bool, 20>, 20> word_breaks = {{
             return word_breaks[lhs_int][rhs_int];
         }
 
-        // WB4: Except after line breaks, ignore/skip (Extend | Format |
-        // ZWJ)*
+        // WB4: Except after line breaks, ignore/skip (Extend | Format | ZWJ)*
         template<typename CPIter, typename Sentinel, typename WordPropFunc>
-        word_break_state<CPIter> skip_forward(
-            word_break_state<CPIter> state,
+        void skip_forward(
+            word_break_state<CPIter>& state,
             CPIter first,
             Sentinel last,
             WordPropFunc const & word_prop)
@@ -282,33 +287,26 @@ constexpr std::array<std::array<bool, 20>, 20> word_breaks = {{
                 !detail::skippable(state.caps[ph::prev].prop) &&
                 detail::skippable(state.caps[ph::curr].prop)) {
                 auto last_prop = word_property::Other;
-                auto temp_it = boost::text::find_if_not(
+                state.it = boost::text::find_if_not(
                     state.it, last, [word_prop, &last_prop](uint32_t cp) {
                         last_prop = word_prop(cp);
                         return detail::skippable(last_prop);
                     });
-                if (temp_it == last) {
-                    --temp_it;
+                if (state.it == last) {
+                    --state.it;
                 } else if (last_prop == word_property::ExtPict) {
-                    auto const next_to_last_prop =
-                        word_prop(*std::prev(temp_it));
+                    auto it_prev = std::prev(state.it);
+                    auto const next_to_last_prop = word_prop(*it_prev);
                     if (next_to_last_prop == word_property::ZWJ)
-                        --temp_it;
+                        state.it = it_prev;
                 }
-                state.it = temp_it;
-                state.caps[ph::curr] = cp_and_word_prop(*temp_it, word_prop);
+                state.caps[ph::curr] = cp_and_word_prop(*state.it, word_prop);
                 state.caps[ph::next] = cp_and_word_prop();
-                state.caps[ph::next_next] = cp_and_word_prop();
-                if (std::next(state.it) != last) {
-                    state.caps[ph::next] =
-                        cp_and_word_prop(*std::next(state.it), word_prop);
-                    if (std::next(state.it, 2) != last) {
-                        state.caps[ph::next_next] = cp_and_word_prop(
-                            *std::next(state.it, 2), word_prop);
-                    }
+                state.it_next = std::next(state.it);
+                if (state.it_next != last) {
+                    state.caps[ph::next] = cp_and_word_prop(*state.it_next, word_prop);
                 }
             }
-            return state;
         }
 
         template<typename T>
@@ -706,6 +704,8 @@ constexpr std::array<std::array<bool, 20>, 20> word_breaks = {{
             WordPropFunc const & word_prop = WordPropFunc{},
             WordBreakFunc const & word_break = WordBreakFunc{}) noexcept
         {
+            static_assert(std::is_same_v<WordBreakFunc, untailored_word_break>,
+                "Otherwise you need to revert changes for this function about next_next");
             using detail::ph;
             using detail::cp_and_word_prop;
 
@@ -719,42 +719,19 @@ constexpr std::array<std::array<bool, 20>, 20> word_breaks = {{
                 return state.it;
 
             state.caps[ph::prev_prev] = cp_and_word_prop();
-            state.caps[ph::prev] =
-                cp_and_word_prop(*std::prev(state.it), word_prop);
+            state.caps[ph::prev] = cp_and_word_prop(*first, word_prop);
             state.caps[ph::curr] = cp_and_word_prop(*state.it, word_prop);
             state.caps[ph::next] = cp_and_word_prop();
-            state.caps[ph::next_next] = cp_and_word_prop();
-            if (std::next(state.it) != last) {
-                state.caps[ph::next] =
-                    cp_and_word_prop(*std::next(state.it), word_prop);
-                if (std::next(state.it, 2) != last) {
-                    state.caps[ph::next_next] =
-                        cp_and_word_prop(*std::next(state.it, 2), word_prop);
-                }
-            }
+            state.it_next = std::next(state.it);
 
             state.emoji_state =
                 state.caps[ph::prev].prop == word_property::Regional_Indicator
                     ? detail::word_break_emoji_state_t::first_emoji
                     : detail::word_break_emoji_state_t::none;
 
-            for (; state.it != last; state = detail::next(state)) {
-                if (std::next(state.it) != last &&
-                    std::next(state.it, 2) != last) {
-                    state.caps[ph::next_next] =
-                        cp_and_word_prop(*std::next(state.it, 2), word_prop);
-                } else {
-                    state.caps[ph::next_next] = cp_and_word_prop();
-                }
-
-                // Check word_break before anything else.
-                if (word_break(
-                        state.caps[ph::prev_prev].cp,
-                        state.caps[ph::prev].cp,
-                        state.caps[ph::curr].cp,
-                        state.caps[ph::next].cp,
-                        state.caps[ph::next_next].cp)) {
-                    return state.it;
+            for (; state.it != last; state = detail::next(state, last)) {
+                if (state.it_next != last) {
+                    state.caps[ph::next] = cp_and_word_prop(*state.it_next, word_prop);
                 }
 
                 // WB3
@@ -764,16 +741,12 @@ constexpr std::array<std::array<bool, 20>, 20> word_breaks = {{
                 }
 
                 // WB3a
-                if (state.caps[ph::prev].prop == word_property::CR ||
-                    state.caps[ph::prev].prop == word_property::LF ||
-                    state.caps[ph::prev].prop == word_property::Newline) {
+                if (detail::linebreak(state.caps[ph::prev].prop)) {
                     return state.it;
                 }
 
                 // WB3b
-                if (state.caps[ph::curr].prop == word_property::CR ||
-                    state.caps[ph::curr].prop == word_property::LF ||
-                    state.caps[ph::curr].prop == word_property::Newline) {
+                if (detail::linebreak(state.caps[ph::curr].prop)) {
                     return state.it;
                 }
 
@@ -792,16 +765,16 @@ constexpr std::array<std::array<bool, 20>, 20> word_breaks = {{
                 // Putting this here means not having to do it explicitly
                 // below between prop and next_prop (and transitively,
                 // between prev_prop and prop).
-                state = detail::skip_forward(state, first, last, word_prop);
+                detail::skip_forward(state, first, last, word_prop);
                 if (state.it == last)
                     return state.it;
 
                 // WB6
                 if (detail::ah_letter(state.caps[ph::prev].prop) &&
                     detail::mid_ah(state.caps[ph::curr].prop) &&
-                    std::next(state.it) != last) {
-                    auto const temp_state = detail::skip_forward(
-                        detail::next(state), first, last, word_prop);
+                    state.it_next != last) {
+                    auto temp_state = detail::next(state, last);
+                    detail::skip_forward(temp_state, first, last, word_prop);
                     if (temp_state.it == last)
                         return temp_state.it;
                     if (detail::ah_letter(temp_state.caps[ph::curr].prop))
@@ -818,19 +791,17 @@ constexpr std::array<std::array<bool, 20>, 20> word_breaks = {{
                 // WB7b
                 if (state.caps[ph::prev].prop == word_property::Hebrew_Letter &&
                     state.caps[ph::curr].prop == word_property::Double_Quote &&
-                    std::next(state.it) != last) {
-                    auto const temp_state = detail::skip_forward(
-                        detail::next(state), first, last, word_prop);
+                    state.it_next != last) {
+                    auto temp_state = detail::next(state, last);
+                    detail::skip_forward(temp_state, first, last, word_prop);
                     if (temp_state.it == last)
                         return temp_state.it;
-                    if (temp_state.caps[ph::curr].prop ==
-                        word_property::Hebrew_Letter)
+                    if (temp_state.caps[ph::curr].prop == word_property::Hebrew_Letter)
                         continue;
                 }
 
                 // WB7c
-                if (state.caps[ph::prev_prev].prop ==
-                        word_property::Hebrew_Letter &&
+                if (state.caps[ph::prev_prev].prop == word_property::Hebrew_Letter &&
                     state.caps[ph::prev].prop == word_property::Double_Quote &&
                     state.caps[ph::curr].prop == word_property::Hebrew_Letter) {
                     continue;
@@ -846,37 +817,26 @@ constexpr std::array<std::array<bool, 20>, 20> word_breaks = {{
                 // WB12
                 if (state.caps[ph::prev].prop == word_property::Numeric &&
                     detail::mid_num(state.caps[ph::curr].prop) &&
-                    std::next(state.it) != last) {
-                    auto const temp_state = detail::skip_forward(
-                        detail::next(state), first, last, word_prop);
+                    state.it_next != last) {
+                    auto temp_state = detail::next(state, last);
+                    detail::skip_forward(temp_state, first, last, word_prop);
                     if (temp_state.it == last)
                         return temp_state.it;
-                    if (temp_state.caps[ph::curr].prop ==
-                        word_property::Numeric)
+                    if (temp_state.caps[ph::curr].prop == word_property::Numeric)
                         continue;
                 }
 
-                if (state.emoji_state ==
-                    detail::word_break_emoji_state_t::first_emoji) {
-                    if (state.caps[ph::curr].prop ==
-                        word_property::Regional_Indicator) {
-                        state.emoji_state =
-                            detail::word_break_emoji_state_t::none;
+                if (state.emoji_state == detail::word_break_emoji_state_t::first_emoji) {
+                    state.emoji_state = detail::word_break_emoji_state_t::none;
+                    if (state.caps[ph::curr].prop == word_property::Regional_Indicator) {
                         continue;
-                    } else {
-                        state.emoji_state =
-                            detail::word_break_emoji_state_t::none;
                     }
-                } else if (
-                    state.caps[ph::curr].prop ==
-                    word_property::Regional_Indicator) {
-                    state.emoji_state =
-                        detail::word_break_emoji_state_t::first_emoji;
+                } else if (state.caps[ph::curr].prop == word_property::Regional_Indicator) {
+                    state.emoji_state = detail::word_break_emoji_state_t::first_emoji;
                     return state.it;
                 }
 
-                if (detail::table_word_break(
-                        state.caps[ph::prev].prop, state.caps[ph::curr].prop))
+                if (detail::table_word_break(state.caps[ph::prev].prop, state.caps[ph::curr].prop))
                     return state.it;
             }
             return state.it;

From e903df12aeaff411af827bacbad4001d27cf572a Mon Sep 17 00:00:00 2001
From: Valery Mironov <32071355+MBkkt@users.noreply.github.com>
Date: Wed, 20 Sep 2023 12:23:44 +0200
Subject: [PATCH 2/3] Fix test

---
 tests/index/postings_tests.cpp | 15 ++++++++-------
 1 file changed, 8 insertions(+), 7 deletions(-)

diff --git a/tests/index/postings_tests.cpp b/tests/index/postings_tests.cpp
index 36d8d1b8a..f7c90ac6d 100644
--- a/tests/index/postings_tests.cpp
+++ b/tests/index/postings_tests.cpp
@@ -55,12 +55,13 @@ void insert_find_core(const std::vector<std::string>& src) {
     const std::string& s = src[i];
     const bytes_view b = detail::to_bytes_view(s);
     auto res = bh.emplace(b);
-    ASSERT_NE(nullptr, res.first);
-    ASSERT_TRUE(res.second);
+    ASSERT_NE(nullptr, res);
+    ASSERT_FALSE(doc_limits::valid(res->doc));
+    res->doc = doc_limits::invalid() + 1;
 
     res = bh.emplace(b);
-    ASSERT_NE(nullptr, res.first);
-    ASSERT_FALSE(res.second);
+    ASSERT_NE(nullptr, res);
+    ASSERT_TRUE(doc_limits::valid(res->doc));
   }
   ASSERT_GT(memory.counter_, 0);
 
@@ -72,21 +73,21 @@ void insert_find_core(const std::vector<std::string>& src) {
     const std::string long_str(block_size - irs::bytes_io<size_t>::vsize(32767),
                                'c');
     auto res = bh.emplace(detail::to_bytes_view(long_str));
-    ASSERT_TRUE(res.second);
+    ASSERT_FALSE(doc_limits::valid(res->doc));
   }
 
   // insert long key
   {
     const std::string too_long_str(block_size, 'c');
     auto res = bh.emplace(detail::to_bytes_view(too_long_str));
-    ASSERT_TRUE(res.second);
+    ASSERT_FALSE(doc_limits::valid(res->doc));
   }
 
   // insert too long key
   {
     const std::string too_long_str(1 + block_size, 'c');
     auto res = bh.emplace(detail::to_bytes_view(too_long_str));
-    ASSERT_FALSE(res.second);
+    ASSERT_EQ(nullptr, res);
   }
   ASSERT_GT(memory.counter_, 0);
   pool.reset();

From 20cbac407b9ce060896f53a1d491dc035df0d0e9 Mon Sep 17 00:00:00 2001
From: Valery Mironov <32071355+MBkkt@users.noreply.github.com>
Date: Wed, 20 Sep 2023 12:26:03 +0200
Subject: [PATCH 3/3] Review suggestion

---
 core/index/field_data.cpp | 8 ++++----
 core/index/postings.cpp   | 3 +--
 2 files changed, 5 insertions(+), 6 deletions(-)

diff --git a/core/index/field_data.cpp b/core/index/field_data.cpp
index e4fb2de5b..4e2b54053 100644
--- a/core/index/field_data.cpp
+++ b/core/index/field_data.cpp
@@ -1054,9 +1054,9 @@ bool field_data::invert(token_stream& stream, doc_id_t id) {
       last_start_offs_ = start_offset;
     }
 
-    auto* posting = terms_.emplace(term->value);
+    auto* p = terms_.emplace(term->value);
 
-    if (posting == nullptr) {
+    if (p == nullptr) {
       IRS_LOG_WARN(absl::StrCat("skipping too long term of size: ",
                                 term->value.size(), " in field: ", meta_.name));
       IRS_LOG_TRACE(
@@ -1065,8 +1065,8 @@ bool field_data::invert(token_stream& stream, doc_id_t id) {
       continue;
     }
 
-    (this->*proc_table_[posting->doc == doc_limits::invalid()])(*posting, id,
-                                                                pay, offs);
+    (this->*proc_table_[!doc_limits::valid(p->doc)])(*p, id, pay, offs);
+    IRS_ASSERT(doc_limits::valid(p->doc));
 
     if (0 == ++stats_.len) {
       IRS_LOG_ERROR(absl::StrCat("too many tokens in field: ", meta_.name,
diff --git a/core/index/postings.cpp b/core/index/postings.cpp
index 577bbd479..9f2f1a3a6 100644
--- a/core/index/postings.cpp
+++ b/core/index/postings.cpp
@@ -92,8 +92,7 @@ posting* postings::emplace(bytes_view term) {
     auto* start = writer_.position().buffer();
     writer_.write(term.data(), term_size);
     IRS_ASSERT(start == (writer_.position() - term_size).buffer());
-    auto& posting = postings_.emplace_back(start, term_size);
-    return &posting;
+    return &postings_.emplace_back(start, term_size);
   } catch (...) {
     // we leave some garbage in block pool
     terms_.erase(it);