This repository has been archived by the owner on May 3, 2024. It is now read-only.

Speedup segmentation #561

Merged: 3 commits on Sep 20, 2023
Changes from all commits
8 changes: 8 additions & 0 deletions CMakeLists.txt
@@ -97,6 +97,14 @@ endif ()
 if (CMAKE_BUILD_TYPE MATCHES "Coverage")
   set(IRESEARCH_COVERAGE ON)
   set(CMAKE_BUILD_TYPE "Debug")
+elseif (CMAKE_BUILD_TYPE MATCHES "Profile")
+  set(CMAKE_BUILD_TYPE "Release")
+  add_compile_options(
+    -g
+    -fno-omit-frame-pointer
+    # -fno-inline
+    # -fno-optimize-sibling-calls
+  )
 endif ()

 add_option_gprof(FALSE)
6 changes: 3 additions & 3 deletions core/analysis/segmentation_token_stream.cpp
@@ -328,11 +328,10 @@ bool segmentation_token_stream::next() {
   const auto begin = gr_begin.base();
   const auto end = gr_end.base();

-  const size_t length =
+  const auto length =
     static_cast<size_t>(std::distance(begin.base(), end.base()));

-  if (!length) {
-    // eof
+  if (length == 0) { // eof
     return false;
   }

@@ -355,6 +354,7 @@ bool segmentation_token_stream::next() {
       term.value = {reinterpret_cast<const byte_type*>(&(*begin.base())),
                     length};
       break;
+    // TODO(MBkkt) do we need to call as_graphemes? Feels like no
     case options_t::case_convert_t::LOWER:
       term_buf_.clear();
       to_lower(as_graphemes(begin, end), from_utf32_back_inserter(term_buf_));
13 changes: 7 additions & 6 deletions core/formats/columnstore.cpp
@@ -256,21 +256,22 @@ void read_compact(irs::index_input& in, irs::encryption::stream* cipher,
   }
 }

-struct column_ref_eq : value_ref_eq<column_meta*> {
-  using self_t::operator();
+struct ColumnMetaEq : ValueRefEq<column_meta*> {
+  using is_transparent = void;
+  using Self::operator();

-  bool operator()(const ref_t& lhs,
+  bool operator()(const Ref& lhs,
                   const hashed_string_view& rhs) const noexcept {
-    return lhs.second->name == rhs;
+    return lhs.ref->name == rhs;
   }

   bool operator()(const hashed_string_view& lhs,
-                  const ref_t& rhs) const noexcept {
+                  const Ref& rhs) const noexcept {
     return this->operator()(rhs, lhs);
   }
 };

-using name_to_column_map = flat_hash_set<column_ref_eq>;
+using name_to_column_map = flat_hash_set<ColumnMetaEq>;

 class meta_writer final {
  public:
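
The renamed comparator keeps the same heterogeneous-lookup idea: because the equality functor declares is_transparent and accepts a hashed_string_view on either side, the name-to-column set can be probed by name without materializing a stored entry first. A minimal sketch of that pattern, assuming an Abseil-style flat_hash_set; Meta, MetaHash, MetaEq and contains_name are illustrative names, not the library's:

#include <cstddef>
#include <string>
#include <string_view>

#include "absl/container/flat_hash_set.h"
#include "absl/hash/hash.h"

struct Meta {
  std::string name;
};

// Both functors declare `is_transparent`, so find() accepts a
// std::string_view probe directly instead of a stored Meta*.
struct MetaHash {
  using is_transparent = void;
  size_t operator()(std::string_view s) const { return absl::HashOf(s); }
  size_t operator()(const Meta* m) const {
    return absl::HashOf(std::string_view{m->name});
  }
};

struct MetaEq {
  using is_transparent = void;
  bool operator()(const Meta* lhs, const Meta* rhs) const {
    return lhs->name == rhs->name;
  }
  bool operator()(const Meta* lhs, std::string_view rhs) const {
    return lhs->name == rhs;
  }
  bool operator()(std::string_view lhs, const Meta* rhs) const {
    return rhs->name == lhs;
  }
};

using MetaSet = absl::flat_hash_set<Meta*, MetaHash, MetaEq>;

bool contains_name(const MetaSet& set, std::string_view name) {
  return set.find(name) != set.end();  // no temporary Meta is constructed
}
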
32 changes: 16 additions & 16 deletions core/index/field_data.cpp
@@ -1054,23 +1054,23 @@ bool field_data::invert(token_stream& stream, doc_id_t id) {
       last_start_offs_ = start_offset;
     }

-    const auto res = terms_.emplace(term->value);
-
-    if (nullptr == res.first) {
-      IRS_LOG_WARN(absl::StrCat("skipping too long term of size '",
-                                term->value.size(), "' in field '", meta_.name,
-                                "'"));
-      IRS_LOG_TRACE(absl::StrCat("field '", meta_.name,
-                                 "' contains too long term '",
-                                 ViewCast<char>(term->value), "'"));
+    auto* p = terms_.emplace(term->value);
+
+    if (p == nullptr) {
+      IRS_LOG_WARN(absl::StrCat("skipping too long term of size: ",
+                                term->value.size(), " in field: ", meta_.name));
+      IRS_LOG_TRACE(
+        absl::StrCat("field: ", meta_.name,
+                     " contains too long term: ", ViewCast<char>(term->value)));
       continue;
     }

-    (this->*proc_table_[size_t(res.second)])(*res.first, id, pay, offs);
+    (this->*proc_table_[!doc_limits::valid(p->doc)])(*p, id, pay, offs);
+    IRS_ASSERT(doc_limits::valid(p->doc));

     if (0 == ++stats_.len) {
-      IRS_LOG_ERROR(absl::StrCat("too many tokens in field '", meta_.name,
-                                 "', document '", id, "'"));
+      IRS_LOG_ERROR(absl::StrCat("too many tokens in field: ", meta_.name,
+                                 ", document: ", id));
       return false;
     }

@@ -1108,12 +1108,12 @@ field_data* fields_data::emplace(const hashed_string_view& name,

   auto it = fields_map_.lazy_emplace(
     name, [&name](const fields_map::constructor& ctor) {
-      ctor(name.hash(), nullptr);
+      ctor(nullptr, name.hash());
     });

-  if (!it->second) {
+  if (!it->ref) {
     try {
-      const_cast<field_data*&>(it->second) = &fields_.emplace_back(
+      const_cast<field_data*&>(it->ref) = &fields_.emplace_back(
         name, features, *feature_info_, *cached_columns_, *cached_features_,
         columns, byte_writer_, int_writer_, index_features,
         (nullptr != comparator_));
@@ -1123,7 +1123,7 @@
     }
   }

-  return it->second;
+  return it->ref;
 }

 void fields_data::flush(field_writer& fw, flush_state& state) {
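
In the rewritten call site above, field_data::invert no longer receives a std::pair<posting*, bool> from terms_.emplace(); whether the term is new is read off the posting itself, which starts life holding the invalid-document sentinel (see doc_id_t doc{doc_limits::invalid()}; in postings.hpp below). A rough sketch of that contract, where doc_id_t, doc_limits and add_term_hit are simplified stand-ins rather than the library's exact definitions:

#include <cstdint>

using doc_id_t = uint32_t;

// Illustrative stand-in for the library's doc_limits helpers.
struct doc_limits {
  static constexpr doc_id_t invalid() { return 0; }
  static constexpr bool valid(doc_id_t id) { return id != invalid(); }
};

struct posting {
  doc_id_t doc{doc_limits::invalid()};  // sentinel until the first doc is seen
  uint32_t freq{0};
};

// The "is this term new?" bit that used to travel in the bool of
// std::pair<posting*, bool> is now implied by the sentinel, so one branch
// on the returned posting is enough.
void add_term_hit(posting& p, doc_id_t id) {
  if (!doc_limits::valid(p.doc)) {
    // first occurrence of the term: start a new posting
    p.doc = id;
    p.freq = 1;
  } else {
    // term already indexed: extend the existing posting
    p.freq += 1;
  }
}
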
13 changes: 7 additions & 6 deletions core/index/field_data.hpp
@@ -198,21 +198,22 @@ class field_data : util::noncopyable {

 class fields_data : util::noncopyable {
  private:
-  struct field_ref_eq : value_ref_eq<field_data*> {
-    using self_t::operator();
+  struct FieldEq : ValueRefEq<field_data*> {
+    using is_transparent = void;
+    using Self::operator();

-    bool operator()(const ref_t& lhs,
+    bool operator()(const Ref& lhs,
                     const hashed_string_view& rhs) const noexcept {
-      return lhs.second->meta().name == rhs;
+      return lhs.ref->meta().name == rhs;
     }

     bool operator()(const hashed_string_view& lhs,
-                    const ref_t& rhs) const noexcept {
+                    const Ref& rhs) const noexcept {
       return this->operator()(rhs, lhs);
     }
   };

-  using fields_map = flat_hash_set<field_ref_eq>;
+  using fields_map = flat_hash_set<FieldEq>;

  public:
   using postings_ref_t = std::vector<const posting*>;
58 changes: 27 additions & 31 deletions core/index/postings.cpp
@@ -34,12 +34,12 @@ namespace irs {

 void postings::get_sorted_postings(
   std::vector<const posting*>& postings) const {
-  postings.resize(map_.size());
+  IRS_ASSERT(terms_.size() == postings_.size());

-  auto begin = postings.begin();
-  for (auto& entry : map_) {
-    *begin = &postings_[entry.second];
-    ++begin;
+  postings.resize(postings_.size());
+
+  for (auto* p = postings.data(); const auto& posting : postings_) {
+    *p++ = &posting;
   }

   std::sort(postings.begin(), postings.end(),
@@ -48,20 +48,20 @@ void postings::get_sorted_postings(
             });
 }

-std::pair<posting*, bool> postings::emplace(bytes_view term) {
+posting* postings::emplace(bytes_view term) {
   REGISTER_TIMER_DETAILED();
   auto& parent = writer_.parent();

   // maximum number to bytes needed for storage of term length and data
-  const auto max_term_len = term.size(); // + vencode_size(term.size());
+  const auto term_size = term.size(); // + vencode_size(term.size());

-  if (writer_t::container::block_type::SIZE < max_term_len) {
+  if (writer_t::container::block_type::SIZE < term_size) {
     // TODO: maybe move big terms it to a separate storage
     // reject terms that do not fit in a block
-    return std::make_pair(nullptr, false);
+    return nullptr;
   }

-  const auto slice_end = writer_.pool_offset() + max_term_len;
+  const auto slice_end = writer_.pool_offset() + term_size;
   const auto next_block_start =
     writer_.pool_offset() < parent.value_count()
       ? writer_.position().block_offset() +
@@ -74,34 +74,30 @@ std::pair<posting*, bool> postings::emplace(bytes_view term) {
   }

   IRS_ASSERT(size() < doc_limits::eof()); // not larger then the static flag
-  IRS_ASSERT(map_.size() == postings_.size());
+  IRS_ASSERT(terms_.size() == postings_.size());

   const hashed_bytes_view hashed_term{term};

   bool is_new = false;
-  const auto it = map_.lazy_emplace(
-    hashed_term, [&is_new, hash = hashed_term.hash(),
-                  id = map_.size()](const map_t::constructor& ctor) {
+  const auto it = terms_.lazy_emplace(
+    hashed_term, [&, size = terms_.size()](const auto& ctor) {
+      ctor(size, hashed_term.hash());
       is_new = true;
-      ctor(hash, id);
     });

-  if (is_new) {
-    // for new terms also write out their value
-    try {
-      writer_.write(term.data(), term.size());
-      postings_.emplace_back();
-    } catch (...) {
-      // we leave some garbage in block pool
-      map_.erase(it);
-      throw;
-    }
-
-    postings_.back().term = {(writer_.position() - term.size()).buffer(),
-                             term.size()};
+  if (IRS_LIKELY(!is_new)) {
+    return &postings_[it->ref];
+  }
+  // for new terms also write out their value
+  try {
+    auto* start = writer_.position().buffer();
+    writer_.write(term.data(), term_size);
+    IRS_ASSERT(start == (writer_.position() - term_size).buffer());
+    return &postings_.emplace_back(start, term_size);
+  } catch (...) {
+    // we leave some garbage in block pool
+    terms_.erase(it);
+    throw;
   }
-
-  return {&postings_[it->second], is_new};
 }

 } // namespace irs
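
postings::emplace keeps every posting in a contiguous std::vector while the hash set stores only small references that are resolved through that vector, and lazy_emplace constructs the reference only when the term is absent. A simplified sketch of that layout, assuming an Abseil-style flat_hash_set with transparent hash and equality; PostingsIndex, Posting, Hash and Eq are illustrative names, and unlike the real code the sketch appends to the vector before the lookup instead of storing a precomputed hash and erasing the map entry when the append fails:

#include <cstddef>
#include <string>
#include <string_view>
#include <vector>

#include "absl/container/flat_hash_set.h"
#include "absl/hash/hash.h"

struct Posting {
  std::string term;
};

class PostingsIndex {
 public:
  PostingsIndex() : set_(0, Hash{&postings_}, Eq{&postings_}) {}

  // Returns the posting for `term`, creating it on first use. The pointer is
  // only valid until the next call because the vector may reallocate.
  Posting* emplace(std::string_view term) {
    // Append first so every index stored in the set refers to a live element
    // (the real code appends only for genuinely new terms).
    postings_.emplace_back(Posting{std::string{term}});
    bool is_new = false;
    const auto it = set_.lazy_emplace(term, [&](const auto& ctor) {
      is_new = true;
      ctor(postings_.size() - 1);  // the set stores only an index
    });
    if (!is_new) {
      postings_.pop_back();  // term was already indexed: drop the extra copy
    }
    return &postings_[*it];
  }

  size_t size() const noexcept { return set_.size(); }

 private:
  // Transparent hash/equality resolve stored indices through postings_, so a
  // lookup by std::string_view needs no temporary key object.
  struct Hash {
    using is_transparent = void;
    const std::vector<Posting>* data;
    size_t operator()(std::string_view s) const { return absl::HashOf(s); }
    size_t operator()(size_t idx) const {
      return absl::HashOf(std::string_view{(*data)[idx].term});
    }
  };
  struct Eq {
    using is_transparent = void;
    const std::vector<Posting>* data;
    bool operator()(size_t lhs, size_t rhs) const { return lhs == rhs; }
    bool operator()(size_t lhs, std::string_view rhs) const {
      return (*data)[lhs].term == rhs;
    }
    bool operator()(std::string_view lhs, size_t rhs) const {
      return (*data)[rhs].term == lhs;
    }
  };

  std::vector<Posting> postings_;
  absl::flat_hash_set<size_t, Hash, Eq> set_;
};
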
40 changes: 21 additions & 19 deletions core/index/postings.hpp
@@ -29,6 +29,7 @@
 #include "utils/hash_utils.hpp"
 #include "utils/noncopyable.hpp"
 #include "utils/string.hpp"
+#include "utils/type_limits.hpp"

 namespace irs {

@@ -54,6 +55,9 @@ using byte_block_pool =
   block_pool<byte_type, 32768, ManagedTypedAllocator<byte_type>>;

 struct posting {
+  explicit posting(const byte_type* data, size_t size) noexcept
+    : term{data, size} {}
+
   bytes_view term;
   uint64_t doc_code;
   // ...........................................................................
@@ -64,7 +68,7 @@ struct posting {
   // [3] - pointer to prox stream begin
   // ...........................................................................
   size_t int_start;
-  doc_id_t doc;
+  doc_id_t doc{doc_limits::invalid()};
   uint32_t freq;
   uint32_t pos;
   uint32_t offs{0};
@@ -77,51 +81,49 @@ class postings : util::noncopyable {

   // cppcheck-suppress constParameter
   explicit postings(writer_t& writer)
-    : map_{0, value_ref_hash{}, term_id_eq{postings_}}, writer_(writer) {}
+    : terms_{0, ValueRefHash{}, TermEq{postings_}}, writer_(writer) {}

   void clear() noexcept {
-    map_.clear();
+    terms_.clear();
     postings_.clear();
   }

   /// @brief fill a provided vector with terms and corresponding postings in
   /// sorted order
   void get_sorted_postings(std::vector<const posting*>& postings) const;

-  /// @note on error returns std::ptr(nullptr, false)
-  std::pair<posting*, bool> emplace(bytes_view term);
+  /// @note on error returns nullptr
+  /// @note returned poitern remains valid until the next call
+  posting* emplace(bytes_view term);

-  bool empty() const noexcept { return map_.empty(); }
-  size_t size() const noexcept { return map_.size(); }
+  bool empty() const noexcept { return terms_.empty(); }
+  size_t size() const noexcept { return terms_.size(); }

  private:
-  class term_id_eq : public value_ref_eq<size_t> {
-   public:
-    explicit term_id_eq(const std::vector<posting>& data) noexcept
-      : data_(&data) {}
+  struct TermEq : ValueRefEq<size_t> {
+    using is_transparent = void;
+    using Self::operator();

-    using self_t::operator();
+    explicit TermEq(const std::vector<posting>& data) noexcept : data_{&data} {}

-    bool operator()(const ref_t& lhs,
+    bool operator()(const Ref& lhs,
                     const hashed_bytes_view& rhs) const noexcept {
-      IRS_ASSERT(lhs.second < data_->size());
-      return (*data_)[lhs.second].term == rhs;
+      IRS_ASSERT(lhs.ref < data_->size());
+      return (*data_)[lhs.ref].term == rhs;
     }

     bool operator()(const hashed_bytes_view& lhs,
-                    const ref_t& rhs) const noexcept {
+                    const Ref& rhs) const noexcept {
       return this->operator()(rhs, lhs);
     }

   private:
    const std::vector<posting>* data_;
  };

-  using map_t = flat_hash_set<term_id_eq>;
-
+  // TODO(MBkkt) Maybe just flat_hash_set<unique_ptr<posting>>?
   std::vector<posting> postings_;
-  map_t map_;
+  flat_hash_set<TermEq> terms_;
   writer_t& writer_;
 };