This repository has been archived by the owner on May 3, 2024. It is now read-only.

Commit: Speedup segmentation (#561)
* Speedup segmentation analyzer

* Fix test

* Review suggestion
MBkkt authored Sep 20, 2023
1 parent add15fb commit 2350a18
Showing 11 changed files with 203 additions and 247 deletions.
8 changes: 8 additions & 0 deletions CMakeLists.txt
@@ -97,6 +97,14 @@ endif ()
 if (CMAKE_BUILD_TYPE MATCHES "Coverage")
   set(IRESEARCH_COVERAGE ON)
   set(CMAKE_BUILD_TYPE "Debug")
+elseif (CMAKE_BUILD_TYPE MATCHES "Profile")
+  set(CMAKE_BUILD_TYPE "Release")
+  add_compile_options(
+    -g
+    -fno-omit-frame-pointer
+    # -fno-inline
+    # -fno-optimize-sibling-calls
+  )
 endif ()

 add_option_gprof(FALSE)
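Usage note: the new Profile configuration is selected like any other CMake build type, e.g. `cmake -DCMAKE_BUILD_TYPE=Profile <source-dir>`. Judging from the flags above, it keeps Release optimizations while adding debug symbols and frame pointers, so that sampling profilers (e.g. perf) can reconstruct call stacks.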
6 changes: 3 additions & 3 deletions core/analysis/segmentation_token_stream.cpp
@@ -328,11 +328,10 @@ bool segmentation_token_stream::next() {
   const auto begin = gr_begin.base();
   const auto end = gr_end.base();

-  const size_t length =
+  const auto length =
     static_cast<size_t>(std::distance(begin.base(), end.base()));

-  if (!length) {
-    // eof
+  if (length == 0) {  // eof
     return false;
   }

@@ -355,6 +354,7 @@ bool segmentation_token_stream::next() {
       term.value = {reinterpret_cast<const byte_type*>(&(*begin.base())),
                     length};
       break;
+      // TODO(MBkkt) do we need to call as_graphemes? Feels like no
     case options_t::case_convert_t::LOWER:
       term_buf_.clear();
       to_lower(as_graphemes(begin, end), from_utf32_back_inserter(term_buf_));
13 changes: 7 additions & 6 deletions core/formats/columnstore.cpp
@@ -256,21 +256,22 @@ void read_compact(irs::index_input& in, irs::encryption::stream* cipher,
   }
 }

-struct column_ref_eq : value_ref_eq<column_meta*> {
-  using self_t::operator();
+struct ColumnMetaEq : ValueRefEq<column_meta*> {
+  using is_transparent = void;
+  using Self::operator();

-  bool operator()(const ref_t& lhs,
+  bool operator()(const Ref& lhs,
                   const hashed_string_view& rhs) const noexcept {
-    return lhs.second->name == rhs;
+    return lhs.ref->name == rhs;
   }

   bool operator()(const hashed_string_view& lhs,
-                  const ref_t& rhs) const noexcept {
+                  const Ref& rhs) const noexcept {
     return this->operator()(rhs, lhs);
   }
 };

-using name_to_column_map = flat_hash_set<column_ref_eq>;
+using name_to_column_map = flat_hash_set<ColumnMetaEq>;

 class meta_writer final {
  public:
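The `using is_transparent = void;` added above is what opts the comparator into heterogeneous lookup: the set can now be probed with a `hashed_string_view` directly, without materializing a key object first. A minimal self-contained sketch of the same idiom using standard C++20 containers (illustrative only; `flat_hash_set` and `ValueRefEq` are iresearch types, assumed here to behave analogously):

#include <cstddef>
#include <functional>
#include <string>
#include <string_view>
#include <unordered_set>

// Both hash and equality expose is_transparent, so find() accepts any type
// they can handle (here std::string_view) without building a std::string key.
struct StrHash {
  using is_transparent = void;
  std::size_t operator()(std::string_view s) const noexcept {
    return std::hash<std::string_view>{}(s);
  }
};

struct StrEq {
  using is_transparent = void;
  bool operator()(std::string_view lhs, std::string_view rhs) const noexcept {
    return lhs == rhs;
  }
};

int main() {
  std::unordered_set<std::string, StrHash, StrEq> names{"alpha", "beta"};
  std::string_view key{"alpha"};
  return names.find(key) != names.end() ? 0 : 1;  // no temporary std::string
}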
32 changes: 16 additions & 16 deletions core/index/field_data.cpp
@@ -1054,23 +1054,23 @@ bool field_data::invert(token_stream& stream, doc_id_t id) {
     last_start_offs_ = start_offset;
   }

-  const auto res = terms_.emplace(term->value);
-
-  if (nullptr == res.first) {
-    IRS_LOG_WARN(absl::StrCat("skipping too long term of size '",
-                              term->value.size(), "' in field '", meta_.name,
-                              "'"));
-    IRS_LOG_TRACE(absl::StrCat("field '", meta_.name,
-                               "' contains too long term '",
-                               ViewCast<char>(term->value), "'"));
+  auto* p = terms_.emplace(term->value);
+
+  if (p == nullptr) {
+    IRS_LOG_WARN(absl::StrCat("skipping too long term of size: ",
+                              term->value.size(), " in field: ", meta_.name));
+    IRS_LOG_TRACE(
+      absl::StrCat("field: ", meta_.name,
+                   " contains too long term: ", ViewCast<char>(term->value)));
     continue;
   }

-  (this->*proc_table_[size_t(res.second)])(*res.first, id, pay, offs);
+  (this->*proc_table_[!doc_limits::valid(p->doc)])(*p, id, pay, offs);
+  IRS_ASSERT(doc_limits::valid(p->doc));

   if (0 == ++stats_.len) {
-    IRS_LOG_ERROR(absl::StrCat("too many tokens in field '", meta_.name,
-                               "', document '", id, "'"));
+    IRS_LOG_ERROR(absl::StrCat("too many tokens in field: ", meta_.name,
+                               ", document: ", id));
     return false;
   }

@@ -1108,12 +1108,12 @@ field_data* fields_data::emplace(const hashed_string_view& name,

   auto it = fields_map_.lazy_emplace(
     name, [&name](const fields_map::constructor& ctor) {
-      ctor(name.hash(), nullptr);
+      ctor(nullptr, name.hash());
     });

-  if (!it->second) {
+  if (!it->ref) {
     try {
-      const_cast<field_data*&>(it->second) = &fields_.emplace_back(
+      const_cast<field_data*&>(it->ref) = &fields_.emplace_back(
         name, features, *feature_info_, *cached_columns_, *cached_features_,
         columns, byte_writer_, int_writer_, index_features,
         (nullptr != comparator_));
@@ -1123,7 +1123,7 @@ field_data* fields_data::emplace(const hashed_string_view& name,
     }
   }

-  return it->second;
+  return it->ref;
 }

 void fields_data::flush(field_writer& fw, flush_state& state) {
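A reading of the dispatch above: `emplace` now returns a bare `posting*`, and a newly constructed posting starts with `doc == doc_limits::invalid()` (see the postings.hpp change below), so `!doc_limits::valid(p->doc)` is 1 exactly for first-seen terms and selects the "new term" member of `proc_table_`; the chosen handler is then expected to assign a real doc id, which the following `IRS_ASSERT` verifies. A toy sketch of that sentinel-dispatch idiom (names and table layout are illustrative, not the library's):

#include <cassert>
#include <cstdint>

constexpr std::uint32_t kInvalidDoc = 0;  // assumed stand-in for doc_limits::invalid()

struct Posting {
  std::uint32_t doc = kInvalidDoc;  // sentinel: "no document seen yet"
};

void add_first(Posting& p, std::uint32_t doc) { p.doc = doc; }  // new term
void add_next(Posting& p, std::uint32_t doc) { p.doc = doc; }   // existing term

using Handler = void (*)(Posting&, std::uint32_t);
constexpr Handler kProcTable[2] = {add_next, add_first};

void process(Posting& p, std::uint32_t doc) {
  // index 1 (new) while the sentinel is still in place, 0 (existing) after
  kProcTable[p.doc == kInvalidDoc](p, doc);
  assert(p.doc != kInvalidDoc);  // the handler must have assigned a doc id
}

int main() {
  Posting p;
  process(p, 42);  // dispatches to add_first: sentinel still present
  process(p, 43);  // dispatches to add_next
}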
13 changes: 7 additions & 6 deletions core/index/field_data.hpp
@@ -198,21 +198,22 @@ class field_data : util::noncopyable {

 class fields_data : util::noncopyable {
  private:
-  struct field_ref_eq : value_ref_eq<field_data*> {
-    using self_t::operator();
+  struct FieldEq : ValueRefEq<field_data*> {
+    using is_transparent = void;
+    using Self::operator();

-    bool operator()(const ref_t& lhs,
+    bool operator()(const Ref& lhs,
                     const hashed_string_view& rhs) const noexcept {
-      return lhs.second->meta().name == rhs;
+      return lhs.ref->meta().name == rhs;
     }

     bool operator()(const hashed_string_view& lhs,
-                    const ref_t& rhs) const noexcept {
+                    const Ref& rhs) const noexcept {
       return this->operator()(rhs, lhs);
     }
   };

-  using fields_map = flat_hash_set<field_ref_eq>;
+  using fields_map = flat_hash_set<FieldEq>;

  public:
   using postings_ref_t = std::vector<const posting*>;
58 changes: 27 additions & 31 deletions core/index/postings.cpp
@@ -34,12 +34,12 @@ namespace irs {

 void postings::get_sorted_postings(
     std::vector<const posting*>& postings) const {
-  postings.resize(map_.size());
+  IRS_ASSERT(terms_.size() == postings_.size());

-  auto begin = postings.begin();
-  for (auto& entry : map_) {
-    *begin = &postings_[entry.second];
-    ++begin;
+  postings.resize(postings_.size());
+
+  for (auto* p = postings.data(); const auto& posting : postings_) {
+    *p++ = &posting;
   }

   std::sort(postings.begin(), postings.end(),
@@ -48,20 +48,20 @@ void postings::get_sorted_postings(
     });
 }

-std::pair<posting*, bool> postings::emplace(bytes_view term) {
+posting* postings::emplace(bytes_view term) {
   REGISTER_TIMER_DETAILED();
   auto& parent = writer_.parent();

   // maximum number of bytes needed for storage of term length and data
-  const auto max_term_len = term.size();  // + vencode_size(term.size());
+  const auto term_size = term.size();  // + vencode_size(term.size());

-  if (writer_t::container::block_type::SIZE < max_term_len) {
+  if (writer_t::container::block_type::SIZE < term_size) {
     // TODO: maybe move big terms to a separate storage
     // reject terms that do not fit in a block
-    return std::make_pair(nullptr, false);
+    return nullptr;
   }

-  const auto slice_end = writer_.pool_offset() + max_term_len;
+  const auto slice_end = writer_.pool_offset() + term_size;
   const auto next_block_start =
     writer_.pool_offset() < parent.value_count()
       ? writer_.position().block_offset() +
@@ -74,34 +74,30 @@ std::pair<posting*, bool> postings::emplace(bytes_view term) {
   }

   IRS_ASSERT(size() < doc_limits::eof());  // not larger than the static flag
-  IRS_ASSERT(map_.size() == postings_.size());
+  IRS_ASSERT(terms_.size() == postings_.size());

   const hashed_bytes_view hashed_term{term};

   bool is_new = false;
-  const auto it = map_.lazy_emplace(
-    hashed_term, [&is_new, hash = hashed_term.hash(),
-                  id = map_.size()](const map_t::constructor& ctor) {
+  const auto it = terms_.lazy_emplace(
+    hashed_term, [&, size = terms_.size()](const auto& ctor) {
+      ctor(size, hashed_term.hash());
       is_new = true;
-      ctor(hash, id);
     });

-  if (is_new) {
-    // for new terms also write out their value
-    try {
-      writer_.write(term.data(), term.size());
-      postings_.emplace_back();
-    } catch (...) {
-      // we leave some garbage in block pool
-      map_.erase(it);
-      throw;
-    }
-
-    postings_.back().term = {(writer_.position() - term.size()).buffer(),
-                             term.size()};
+  if (IRS_LIKELY(!is_new)) {
+    return &postings_[it->ref];
   }
-
-  return {&postings_[it->second], is_new};
+  // for new terms also write out their value
+  try {
+    auto* start = writer_.position().buffer();
+    writer_.write(term.data(), term_size);
+    IRS_ASSERT(start == (writer_.position() - term_size).buffer());
+    return &postings_.emplace_back(start, term_size);
+  } catch (...) {
+    // we leave some garbage in block pool
+    terms_.erase(it);
+    throw;
+  }
 }

 } // namespace irs
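The restructured control flow also tightens the exception path: the hash-set entry is created in place via `lazy_emplace`, and if writing the term bytes into the block pool then throws, the entry is erased so the set never references unwritten bytes (modulo the acknowledged garbage left in the pool). A condensed sketch of that insert-then-rollback pattern, assuming an Abseil-style container (`absl::flat_hash_set` provides `lazy_emplace`/`erase`; iresearch's `flat_hash_set` is assumed analogous):

#include <string>
#include <string_view>
#include "absl/container/flat_hash_set.h"

// Hypothetical stand-in for writer_.write; imagine it may throw.
void write_to_side_storage(std::string_view) {}

const std::string* intern(absl::flat_hash_set<std::string>& set,
                          std::string_view term) {
  bool is_new = false;
  auto it = set.lazy_emplace(term, [&](const auto& ctor) {
    ctor(std::string{term});  // construct the element in place, exactly once
    is_new = true;            // runs only if the key was absent
  });
  if (!is_new) {
    return &*it;  // existing term: nothing else to do
  }
  try {
    write_to_side_storage(term);
    return &*it;
  } catch (...) {
    set.erase(it);  // roll back so the set stays consistent with storage
    throw;
  }
}

int main() {
  absl::flat_hash_set<std::string> set;
  return intern(set, "hello") != nullptr ? 0 : 1;
}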
40 changes: 21 additions & 19 deletions core/index/postings.hpp
@@ -29,6 +29,7 @@
 #include "utils/hash_utils.hpp"
 #include "utils/noncopyable.hpp"
 #include "utils/string.hpp"
+#include "utils/type_limits.hpp"

 namespace irs {

@@ -54,6 +55,9 @@ using byte_block_pool =
   block_pool<byte_type, 32768, ManagedTypedAllocator<byte_type>>;

 struct posting {
+  explicit posting(const byte_type* data, size_t size) noexcept
+    : term{data, size} {}
+
   bytes_view term;
   uint64_t doc_code;
   // ...........................................................................
@@ -64,7 +68,7 @@ struct posting {
   // [3] - pointer to prox stream begin
   // ...........................................................................
   size_t int_start;
-  doc_id_t doc;
+  doc_id_t doc{doc_limits::invalid()};
   uint32_t freq;
   uint32_t pos;
   uint32_t offs{0};
@@ -77,51 +81,49 @@ class postings : util::noncopyable {

   // cppcheck-suppress constParameter
   explicit postings(writer_t& writer)
-    : map_{0, value_ref_hash{}, term_id_eq{postings_}}, writer_(writer) {}
+    : terms_{0, ValueRefHash{}, TermEq{postings_}}, writer_(writer) {}

   void clear() noexcept {
-    map_.clear();
+    terms_.clear();
     postings_.clear();
   }

   /// @brief fill a provided vector with terms and corresponding postings in
   /// sorted order
   void get_sorted_postings(std::vector<const posting*>& postings) const;

-  /// @note on error returns std::ptr(nullptr, false)
-  std::pair<posting*, bool> emplace(bytes_view term);
+  /// @note on error returns nullptr
+  /// @note returned pointer remains valid until the next call
+  posting* emplace(bytes_view term);

-  bool empty() const noexcept { return map_.empty(); }
-  size_t size() const noexcept { return map_.size(); }
+  bool empty() const noexcept { return terms_.empty(); }
+  size_t size() const noexcept { return terms_.size(); }

  private:
-  class term_id_eq : public value_ref_eq<size_t> {
-   public:
-    explicit term_id_eq(const std::vector<posting>& data) noexcept
-      : data_(&data) {}
+  struct TermEq : ValueRefEq<size_t> {
+    using is_transparent = void;
+    using Self::operator();

-    using self_t::operator();
+    explicit TermEq(const std::vector<posting>& data) noexcept : data_{&data} {}

-    bool operator()(const ref_t& lhs,
+    bool operator()(const Ref& lhs,
                     const hashed_bytes_view& rhs) const noexcept {
-      IRS_ASSERT(lhs.second < data_->size());
-      return (*data_)[lhs.second].term == rhs;
+      IRS_ASSERT(lhs.ref < data_->size());
+      return (*data_)[lhs.ref].term == rhs;
     }

     bool operator()(const hashed_bytes_view& lhs,
-                    const ref_t& rhs) const noexcept {
+                    const Ref& rhs) const noexcept {
       return this->operator()(rhs, lhs);
     }

    private:
     const std::vector<posting>* data_;
   };

-  using map_t = flat_hash_set<term_id_eq>;
-
+  // TODO(MBkkt) Maybe just flat_hash_set<unique_ptr<posting>>?
   std::vector<posting> postings_;
-  map_t map_;
+  flat_hash_set<TermEq> terms_;
   writer_t& writer_;
 };
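Stepping back, the data structure after this commit: `postings_` owns all `posting` objects contiguously, and `terms_` stores only indices into that vector, with `TermEq` transparently comparing the indexed term bytes against a probe key. That keeps the hash-table entries small and lets `get_sorted_postings` walk the vector directly. A standalone sketch of this index-into-vector interning idiom with standard C++20 containers (simplified types; not the library's actual API):

#include <cstddef>
#include <functional>
#include <string>
#include <string_view>
#include <unordered_set>
#include <vector>

// Entries live in a vector; the set stores only indices into it.
struct Entry {
  std::string term;
};

struct IndexHash {
  using is_transparent = void;
  const std::vector<Entry>* data;
  std::size_t operator()(std::size_t i) const {
    return std::hash<std::string_view>{}((*data)[i].term);
  }
  std::size_t operator()(std::string_view s) const {
    return std::hash<std::string_view>{}(s);
  }
};

struct IndexEq {
  using is_transparent = void;
  const std::vector<Entry>* data;
  bool operator()(std::size_t a, std::size_t b) const {
    return (*data)[a].term == (*data)[b].term;
  }
  bool operator()(std::size_t a, std::string_view s) const {
    return (*data)[a].term == s;
  }
  bool operator()(std::string_view s, std::size_t a) const {
    return (*data)[a].term == s;
  }
};

class Interner {
 public:
  Interner() : set_{0, IndexHash{&entries_}, IndexEq{&entries_}} {}

  // Returns the index of term, inserting a new Entry on first sight.
  std::size_t emplace(std::string_view term) {
    if (auto it = set_.find(term); it != set_.end()) return *it;
    entries_.push_back(Entry{std::string{term}});
    return *set_.insert(entries_.size() - 1).first;
  }

 private:
  std::vector<Entry> entries_;
  std::unordered_set<std::size_t, IndexHash, IndexEq> set_;
};

In the real `postings` class the equality additionally asserts the index is in range, and the vector elements carry the full posting state; the sketch keeps only the term to stay short.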
