From 6472fa7bdd7b9b14f157e4f3b9d3a8b8817f2619 Mon Sep 17 00:00:00 2001 From: Valery Mironov <32071355+MBkkt@users.noreply.github.com> Date: Thu, 14 Sep 2023 18:44:36 +0200 Subject: [PATCH 1/3] Speedup segmentation analyzer --- CMakeLists.txt | 8 + core/analysis/segmentation_token_stream.cpp | 6 +- core/formats/columnstore.cpp | 13 +- core/index/field_data.cpp | 32 +-- core/index/field_data.hpp | 13 +- core/index/postings.cpp | 59 +++-- core/index/postings.hpp | 40 ++-- core/utils/hash_set_utils.hpp | 39 ++-- .../include/boost/text/grapheme_break.hpp | 22 +- .../text/include/boost/text/word_break.hpp | 204 +++++++----------- 10 files changed, 196 insertions(+), 240 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 3f4e88d4d..19d2461df 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -97,6 +97,14 @@ endif () if (CMAKE_BUILD_TYPE MATCHES "Coverage") set(IRESEARCH_COVERAGE ON) set(CMAKE_BUILD_TYPE "Debug") +elseif (CMAKE_BUILD_TYPE MATCHES "Profile") + set(CMAKE_BUILD_TYPE "Release") + add_compile_options( + -g + -fno-omit-frame-pointer + # -fno-inline + # -fno-optimize-sibling-calls + ) endif () add_option_gprof(FALSE) diff --git a/core/analysis/segmentation_token_stream.cpp b/core/analysis/segmentation_token_stream.cpp index 29bb9324d..6c0f81932 100644 --- a/core/analysis/segmentation_token_stream.cpp +++ b/core/analysis/segmentation_token_stream.cpp @@ -328,11 +328,10 @@ bool segmentation_token_stream::next() { const auto begin = gr_begin.base(); const auto end = gr_end.base(); - const size_t length = + const auto length = static_cast(std::distance(begin.base(), end.base())); - if (!length) { - // eof + if (length == 0) { // eof return false; } @@ -355,6 +354,7 @@ bool segmentation_token_stream::next() { term.value = {reinterpret_cast(&(*begin.base())), length}; break; + // TODO(MBkkt) do we need to call as_graphemes? Feels like no case options_t::case_convert_t::LOWER: term_buf_.clear(); to_lower(as_graphemes(begin, end), from_utf32_back_inserter(term_buf_)); diff --git a/core/formats/columnstore.cpp b/core/formats/columnstore.cpp index f522683ed..4e26296c8 100644 --- a/core/formats/columnstore.cpp +++ b/core/formats/columnstore.cpp @@ -256,21 +256,22 @@ void read_compact(irs::index_input& in, irs::encryption::stream* cipher, } } -struct column_ref_eq : value_ref_eq { - using self_t::operator(); +struct ColumnMetaEq : ValueRefEq { + using is_transparent = void; + using Self::operator(); - bool operator()(const ref_t& lhs, + bool operator()(const Ref& lhs, const hashed_string_view& rhs) const noexcept { - return lhs.second->name == rhs; + return lhs.ref->name == rhs; } bool operator()(const hashed_string_view& lhs, - const ref_t& rhs) const noexcept { + const Ref& rhs) const noexcept { return this->operator()(rhs, lhs); } }; -using name_to_column_map = flat_hash_set; +using name_to_column_map = flat_hash_set; class meta_writer final { public: diff --git a/core/index/field_data.cpp b/core/index/field_data.cpp index 06906f7d9..e4fb2de5b 100644 --- a/core/index/field_data.cpp +++ b/core/index/field_data.cpp @@ -1054,23 +1054,23 @@ bool field_data::invert(token_stream& stream, doc_id_t id) { last_start_offs_ = start_offset; } - const auto res = terms_.emplace(term->value); - - if (nullptr == res.first) { - IRS_LOG_WARN(absl::StrCat("skipping too long term of size '", - term->value.size(), "' in field '", meta_.name, - "'")); - IRS_LOG_TRACE(absl::StrCat("field '", meta_.name, - "' contains too long term '", - ViewCast(term->value), "'")); + auto* posting = terms_.emplace(term->value); + + if (posting == nullptr) { + IRS_LOG_WARN(absl::StrCat("skipping too long term of size: ", + term->value.size(), " in field: ", meta_.name)); + IRS_LOG_TRACE( + absl::StrCat("field: ", meta_.name, + " contains too long term: ", ViewCast(term->value))); continue; } - (this->*proc_table_[size_t(res.second)])(*res.first, id, pay, offs); + (this->*proc_table_[posting->doc == doc_limits::invalid()])(*posting, id, + pay, offs); if (0 == ++stats_.len) { - IRS_LOG_ERROR(absl::StrCat("too many tokens in field '", meta_.name, - "', document '", id, "'")); + IRS_LOG_ERROR(absl::StrCat("too many tokens in field: ", meta_.name, + ", document: ", id)); return false; } @@ -1108,12 +1108,12 @@ field_data* fields_data::emplace(const hashed_string_view& name, auto it = fields_map_.lazy_emplace( name, [&name](const fields_map::constructor& ctor) { - ctor(name.hash(), nullptr); + ctor(nullptr, name.hash()); }); - if (!it->second) { + if (!it->ref) { try { - const_cast(it->second) = &fields_.emplace_back( + const_cast(it->ref) = &fields_.emplace_back( name, features, *feature_info_, *cached_columns_, *cached_features_, columns, byte_writer_, int_writer_, index_features, (nullptr != comparator_)); @@ -1123,7 +1123,7 @@ field_data* fields_data::emplace(const hashed_string_view& name, } } - return it->second; + return it->ref; } void fields_data::flush(field_writer& fw, flush_state& state) { diff --git a/core/index/field_data.hpp b/core/index/field_data.hpp index 8da273d0e..9b3af90b3 100644 --- a/core/index/field_data.hpp +++ b/core/index/field_data.hpp @@ -198,21 +198,22 @@ class field_data : util::noncopyable { class fields_data : util::noncopyable { private: - struct field_ref_eq : value_ref_eq { - using self_t::operator(); + struct FieldEq : ValueRefEq { + using is_transparent = void; + using Self::operator(); - bool operator()(const ref_t& lhs, + bool operator()(const Ref& lhs, const hashed_string_view& rhs) const noexcept { - return lhs.second->meta().name == rhs; + return lhs.ref->meta().name == rhs; } bool operator()(const hashed_string_view& lhs, - const ref_t& rhs) const noexcept { + const Ref& rhs) const noexcept { return this->operator()(rhs, lhs); } }; - using fields_map = flat_hash_set; + using fields_map = flat_hash_set; public: using postings_ref_t = std::vector; diff --git a/core/index/postings.cpp b/core/index/postings.cpp index b24c61f42..577bbd479 100644 --- a/core/index/postings.cpp +++ b/core/index/postings.cpp @@ -34,12 +34,12 @@ namespace irs { void postings::get_sorted_postings( std::vector& postings) const { - postings.resize(map_.size()); + IRS_ASSERT(terms_.size() == postings_.size()); - auto begin = postings.begin(); - for (auto& entry : map_) { - *begin = &postings_[entry.second]; - ++begin; + postings.resize(postings_.size()); + + for (auto* p = postings.data(); const auto& posting : postings_) { + *p++ = &posting; } std::sort(postings.begin(), postings.end(), @@ -48,20 +48,20 @@ void postings::get_sorted_postings( }); } -std::pair postings::emplace(bytes_view term) { +posting* postings::emplace(bytes_view term) { REGISTER_TIMER_DETAILED(); auto& parent = writer_.parent(); // maximum number to bytes needed for storage of term length and data - const auto max_term_len = term.size(); // + vencode_size(term.size()); + const auto term_size = term.size(); // + vencode_size(term.size()); - if (writer_t::container::block_type::SIZE < max_term_len) { + if (writer_t::container::block_type::SIZE < term_size) { // TODO: maybe move big terms it to a separate storage // reject terms that do not fit in a block - return std::make_pair(nullptr, false); + return nullptr; } - const auto slice_end = writer_.pool_offset() + max_term_len; + const auto slice_end = writer_.pool_offset() + term_size; const auto next_block_start = writer_.pool_offset() < parent.value_count() ? writer_.position().block_offset() + @@ -74,34 +74,31 @@ std::pair postings::emplace(bytes_view term) { } IRS_ASSERT(size() < doc_limits::eof()); // not larger then the static flag - IRS_ASSERT(map_.size() == postings_.size()); + IRS_ASSERT(terms_.size() == postings_.size()); const hashed_bytes_view hashed_term{term}; bool is_new = false; - const auto it = map_.lazy_emplace( - hashed_term, [&is_new, hash = hashed_term.hash(), - id = map_.size()](const map_t::constructor& ctor) { + const auto it = terms_.lazy_emplace( + hashed_term, [&, size = terms_.size()](const auto& ctor) { + ctor(size, hashed_term.hash()); is_new = true; - ctor(hash, id); }); - - if (is_new) { - // for new terms also write out their value - try { - writer_.write(term.data(), term.size()); - postings_.emplace_back(); - } catch (...) { - // we leave some garbage in block pool - map_.erase(it); - throw; - } - - postings_.back().term = {(writer_.position() - term.size()).buffer(), - term.size()}; + if (IRS_LIKELY(!is_new)) { + return &postings_[it->ref]; + } + // for new terms also write out their value + try { + auto* start = writer_.position().buffer(); + writer_.write(term.data(), term_size); + IRS_ASSERT(start == (writer_.position() - term_size).buffer()); + auto& posting = postings_.emplace_back(start, term_size); + return &posting; + } catch (...) { + // we leave some garbage in block pool + terms_.erase(it); + throw; } - - return {&postings_[it->second], is_new}; } } // namespace irs diff --git a/core/index/postings.hpp b/core/index/postings.hpp index 621cad99f..5ec06594a 100644 --- a/core/index/postings.hpp +++ b/core/index/postings.hpp @@ -29,6 +29,7 @@ #include "utils/hash_utils.hpp" #include "utils/noncopyable.hpp" #include "utils/string.hpp" +#include "utils/type_limits.hpp" namespace irs { @@ -54,6 +55,9 @@ using byte_block_pool = block_pool>; struct posting { + explicit posting(const byte_type* data, size_t size) noexcept + : term{data, size} {} + bytes_view term; uint64_t doc_code; // ........................................................................... @@ -64,7 +68,7 @@ struct posting { // [3] - pointer to prox stream begin // ........................................................................... size_t int_start; - doc_id_t doc; + doc_id_t doc{doc_limits::invalid()}; uint32_t freq; uint32_t pos; uint32_t offs{0}; @@ -77,10 +81,10 @@ class postings : util::noncopyable { // cppcheck-suppress constParameter explicit postings(writer_t& writer) - : map_{0, value_ref_hash{}, term_id_eq{postings_}}, writer_(writer) {} + : terms_{0, ValueRefHash{}, TermEq{postings_}}, writer_(writer) {} void clear() noexcept { - map_.clear(); + terms_.clear(); postings_.clear(); } @@ -88,29 +92,28 @@ class postings : util::noncopyable { /// sorted order void get_sorted_postings(std::vector& postings) const; - /// @note on error returns std::ptr(nullptr, false) + /// @note on error returns nullptr /// @note returned poitern remains valid until the next call - std::pair emplace(bytes_view term); + posting* emplace(bytes_view term); - bool empty() const noexcept { return map_.empty(); } - size_t size() const noexcept { return map_.size(); } + bool empty() const noexcept { return terms_.empty(); } + size_t size() const noexcept { return terms_.size(); } private: - class term_id_eq : public value_ref_eq { - public: - explicit term_id_eq(const std::vector& data) noexcept - : data_(&data) {} + struct TermEq : ValueRefEq { + using is_transparent = void; + using Self::operator(); - using self_t::operator(); + explicit TermEq(const std::vector& data) noexcept : data_{&data} {} - bool operator()(const ref_t& lhs, + bool operator()(const Ref& lhs, const hashed_bytes_view& rhs) const noexcept { - IRS_ASSERT(lhs.second < data_->size()); - return (*data_)[lhs.second].term == rhs; + IRS_ASSERT(lhs.ref < data_->size()); + return (*data_)[lhs.ref].term == rhs; } bool operator()(const hashed_bytes_view& lhs, - const ref_t& rhs) const noexcept { + const Ref& rhs) const noexcept { return this->operator()(rhs, lhs); } @@ -118,10 +121,9 @@ class postings : util::noncopyable { const std::vector* data_; }; - using map_t = flat_hash_set; - + // TODO(MBkkt) Maybe just flat_hash_set>? std::vector postings_; - map_t map_; + flat_hash_set terms_; writer_t& writer_; }; diff --git a/core/utils/hash_set_utils.hpp b/core/utils/hash_set_utils.hpp index 236e02cee..40091e5ea 100644 --- a/core/utils/hash_set_utils.hpp +++ b/core/utils/hash_set_utils.hpp @@ -28,22 +28,20 @@ namespace irs { -//////////////////////////////////////////////////////////////////////////////// -/// @brief first - hash value, second - reference -//////////////////////////////////////////////////////////////////////////////// template -using value_ref_t = std::pair; +struct ValueRef { + explicit ValueRef(T ref, size_t hash) : ref{ref}, hash{hash} {} -//////////////////////////////////////////////////////////////////////////////// -/// @struct transparent hash for value_ref_t -//////////////////////////////////////////////////////////////////////////////// -class value_ref_hash { - public: + T ref; + size_t hash; +}; + +struct ValueRefHash { using is_transparent = void; template - size_t operator()(const value_ref_t& value) const noexcept { - return value.first; + size_t operator()(const ValueRef& value) const noexcept { + return value.hash; } template @@ -53,19 +51,13 @@ class value_ref_hash { } }; -//////////////////////////////////////////////////////////////////////////////// -/// @struct transparent equality comparator for value_ref_t -//////////////////////////////////////////////////////////////////////////////// template -struct value_ref_eq { - using is_transparent = void; - - using self_t = value_ref_eq; - using ref_t = value_ref_t; - using value_t = typename ref_t::second_type; +struct ValueRefEq { + using Self = ValueRefEq; + using Ref = ValueRef; - bool operator()(const ref_t& lhs, const ref_t& rhs) const noexcept { - return lhs.second == rhs.second; + bool operator()(const Ref& lhs, const Ref& rhs) const noexcept { + return lhs.ref == rhs.ref; } }; @@ -74,7 +66,6 @@ struct value_ref_eq { /// rehash may still happen even if enough space was allocated //////////////////////////////////////////////////////////////////////////////// template -using flat_hash_set = - absl::flat_hash_set; +using flat_hash_set = absl::flat_hash_set; } // namespace irs diff --git a/external/text/include/boost/text/grapheme_break.hpp b/external/text/include/boost/text/grapheme_break.hpp index 19790ca25..d6256c891 100644 --- a/external/text/include/boost/text/grapheme_break.hpp +++ b/external/text/include/boost/text/grapheme_break.hpp @@ -110,6 +110,7 @@ namespace boost { namespace text { template struct grapheme_break_state { + CPIter it_prev; CPIter it; grapheme_property prev_prop; @@ -121,7 +122,7 @@ namespace boost { namespace text { template grapheme_break_state next(grapheme_break_state state) { - ++state.it; + state.it_prev = state.it++; state.prev_prop = state.prop; return state; } @@ -259,12 +260,12 @@ constexpr std::array, 15> grapheme_breaks = {{ return first; grapheme_break_state state; - state.it = first; + state.it = state.it_prev = first; if (++state.it == last) return state.it; - state.prev_prop = boost::text::grapheme_prop(*std::prev(state.it)); + state.prev_prop = boost::text::grapheme_prop(*state.it_prev); state.prop = boost::text::grapheme_prop(*state.it); state.emoji_state = @@ -278,22 +279,17 @@ constexpr std::array, 15> grapheme_breaks = {{ // GB11 if (state.prev_prop == grapheme_property::ZWJ && state.prop == grapheme_property::ExtPict && - detail::gb11_prefix(first, std::prev(state.it))) { + detail::gb11_prefix(first, state.it_prev)) { continue; } - if (state.emoji_state == - grapheme_break_emoji_state_t::first_emoji) { + if (state.emoji_state == grapheme_break_emoji_state_t::first_emoji) { + state.emoji_state = grapheme_break_emoji_state_t::none; if (state.prop == grapheme_property::Regional_Indicator) { - state.emoji_state = grapheme_break_emoji_state_t::none; continue; - } else { - state.emoji_state = grapheme_break_emoji_state_t::none; } - } else if ( - state.prop == grapheme_property::Regional_Indicator) { - state.emoji_state = - grapheme_break_emoji_state_t::first_emoji; + } else if (state.prop == grapheme_property::Regional_Indicator) { + state.emoji_state = grapheme_break_emoji_state_t::first_emoji; } if (detail::table_grapheme_break(state.prev_prop, state.prop)) diff --git a/external/text/include/boost/text/word_break.hpp b/external/text/include/boost/text/word_break.hpp index 1f34cb149..02661d030 100644 --- a/external/text/include/boost/text/word_break.hpp +++ b/external/text/include/boost/text/word_break.hpp @@ -19,11 +19,11 @@ namespace boost { namespace text { /** The word properties defined by Unicode. */ - enum word_property { - Other, + enum word_property : uint32_t { CR, LF, Newline, + Other, Katakana, ALetter, MidLetter, @@ -39,7 +39,7 @@ namespace boost { namespace text { WSegSpace, Format, Extend, - ZWJ + ZWJ, }; namespace detail { @@ -141,14 +141,12 @@ namespace boost { namespace text { namespace detail { inline bool skippable(word_property prop) noexcept { - return prop == word_property::Extend || - prop == word_property::Format || prop == word_property::ZWJ; + return word_property::Format <= prop; } inline bool linebreak(word_property prop) noexcept { - return prop == word_property::CR || prop == word_property::LF || - prop == word_property::Newline; + return prop <= word_property::Newline; } inline bool ah_letter(word_property prop) noexcept @@ -204,21 +202,29 @@ namespace boost { namespace text { struct word_break_state { word_break_state() {} - CPIter it; + CPIter it_next; + bool it_points_to_prev = false; - std::array caps; + std::array caps; word_break_emoji_state_t emoji_state; }; template - word_break_state next(word_break_state state) + word_break_state next(word_break_state state, CPIter last) { - ++state.it; - std::copy( - state.caps.begin() + 1, state.caps.end(), state.caps.begin()); + state.caps[0] = state.caps[1]; + state.caps[1] = state.caps[2]; + state.caps[2] = state.caps[3]; + state.caps[3] = cp_and_word_prop{}; + + state.it = state.it_next; + if (state.it_next != last) { + ++state.it_next; + } + return state; } @@ -240,28 +246,28 @@ namespace boost { namespace text { // clang-format off // See chart at http://www.unicode.org/Public/UCD/latest/ucd/auxiliary/WordBreakTest.html. -constexpr std::array, 20> word_breaks = {{ -// Other CR LF NL Ktk AL ML MN MNL Num ENL RI HL DQ SQ EP WSSp Fmt Extd ZWJ - {{1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0}}, // Other - {{1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}}, // CR - {{1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}}, // LF - {{1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}}, // Newline - {{1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0}}, // Katakana - {{1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0}}, // ALetter - {{1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0}}, // MidLetter - {{1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0}}, // MidNum - {{1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0}}, // MidNumLet - {{1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0}}, // Numeric - {{1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0}}, // ExtendNumLet - {{1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0}}, // RI - {{1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0}}, // Hebrew_Letter - {{1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0}}, // Double_Quote - {{1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0}}, // Single_Quote - {{1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0}}, // ExtPict - {{1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0}}, // WSegSpace - {{1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0}}, // Format - {{1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0}}, // Extend - {{1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0}}, // ZWJ +static constexpr std::array, 20> word_breaks = {{ +// CR LF NL Other Ktk AL ML MN MNL Num ENL RI HL DQ SQ EP WSSp Fmt Extd ZWJ + {{1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}}, // CR + {{1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}}, // LF + {{1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}}, // Newline + {{1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0}}, // Other + {{1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0}}, // Katakana + {{1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0}}, // ALetter + {{1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0}}, // MidLetter + {{1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0}}, // MidNum + {{1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0}}, // MidNumLet + {{1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0}}, // Numeric + {{1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0}}, // ExtendNumLet + {{1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0}}, // RI + {{1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0}}, // Hebrew_Letter + {{1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0}}, // Double_Quote + {{1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0}}, // Single_Quote + {{1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0}}, // ExtPict + {{1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0}}, // WSegSpace + {{1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0}}, // Format + {{1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0}}, // Extend + {{1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0}}, // ZWJ }}; // clang-format on auto const lhs_int = static_cast(lhs); @@ -269,11 +275,10 @@ constexpr std::array, 20> word_breaks = {{ return word_breaks[lhs_int][rhs_int]; } - // WB4: Except after line breaks, ignore/skip (Extend | Format | - // ZWJ)* + // WB4: Except after line breaks, ignore/skip (Extend | Format | ZWJ)* template - word_break_state skip_forward( - word_break_state state, + void skip_forward( + word_break_state& state, CPIter first, Sentinel last, WordPropFunc const & word_prop) @@ -282,33 +287,26 @@ constexpr std::array, 20> word_breaks = {{ !detail::skippable(state.caps[ph::prev].prop) && detail::skippable(state.caps[ph::curr].prop)) { auto last_prop = word_property::Other; - auto temp_it = boost::text::find_if_not( + state.it = boost::text::find_if_not( state.it, last, [word_prop, &last_prop](uint32_t cp) { last_prop = word_prop(cp); return detail::skippable(last_prop); }); - if (temp_it == last) { - --temp_it; + if (state.it == last) { + --state.it; } else if (last_prop == word_property::ExtPict) { - auto const next_to_last_prop = - word_prop(*std::prev(temp_it)); + auto it_prev = std::prev(state.it); + auto const next_to_last_prop = word_prop(*it_prev); if (next_to_last_prop == word_property::ZWJ) - --temp_it; + state.it = it_prev; } - state.it = temp_it; - state.caps[ph::curr] = cp_and_word_prop(*temp_it, word_prop); + state.caps[ph::curr] = cp_and_word_prop(*state.it, word_prop); state.caps[ph::next] = cp_and_word_prop(); - state.caps[ph::next_next] = cp_and_word_prop(); - if (std::next(state.it) != last) { - state.caps[ph::next] = - cp_and_word_prop(*std::next(state.it), word_prop); - if (std::next(state.it, 2) != last) { - state.caps[ph::next_next] = cp_and_word_prop( - *std::next(state.it, 2), word_prop); - } + state.it_next = std::next(state.it); + if (state.it_next != last) { + state.caps[ph::next] = cp_and_word_prop(*state.it_next, word_prop); } } - return state; } template @@ -706,6 +704,8 @@ constexpr std::array, 20> word_breaks = {{ WordPropFunc const & word_prop = WordPropFunc{}, WordBreakFunc const & word_break = WordBreakFunc{}) noexcept { + static_assert(std::is_same_v, + "Otherwise you need to revert changes for this function about next_next"); using detail::ph; using detail::cp_and_word_prop; @@ -719,42 +719,19 @@ constexpr std::array, 20> word_breaks = {{ return state.it; state.caps[ph::prev_prev] = cp_and_word_prop(); - state.caps[ph::prev] = - cp_and_word_prop(*std::prev(state.it), word_prop); + state.caps[ph::prev] = cp_and_word_prop(*first, word_prop); state.caps[ph::curr] = cp_and_word_prop(*state.it, word_prop); state.caps[ph::next] = cp_and_word_prop(); - state.caps[ph::next_next] = cp_and_word_prop(); - if (std::next(state.it) != last) { - state.caps[ph::next] = - cp_and_word_prop(*std::next(state.it), word_prop); - if (std::next(state.it, 2) != last) { - state.caps[ph::next_next] = - cp_and_word_prop(*std::next(state.it, 2), word_prop); - } - } + state.it_next = std::next(state.it); state.emoji_state = state.caps[ph::prev].prop == word_property::Regional_Indicator ? detail::word_break_emoji_state_t::first_emoji : detail::word_break_emoji_state_t::none; - for (; state.it != last; state = detail::next(state)) { - if (std::next(state.it) != last && - std::next(state.it, 2) != last) { - state.caps[ph::next_next] = - cp_and_word_prop(*std::next(state.it, 2), word_prop); - } else { - state.caps[ph::next_next] = cp_and_word_prop(); - } - - // Check word_break before anything else. - if (word_break( - state.caps[ph::prev_prev].cp, - state.caps[ph::prev].cp, - state.caps[ph::curr].cp, - state.caps[ph::next].cp, - state.caps[ph::next_next].cp)) { - return state.it; + for (; state.it != last; state = detail::next(state, last)) { + if (state.it_next != last) { + state.caps[ph::next] = cp_and_word_prop(*state.it_next, word_prop); } // WB3 @@ -764,16 +741,12 @@ constexpr std::array, 20> word_breaks = {{ } // WB3a - if (state.caps[ph::prev].prop == word_property::CR || - state.caps[ph::prev].prop == word_property::LF || - state.caps[ph::prev].prop == word_property::Newline) { + if (detail::linebreak(state.caps[ph::prev].prop)) { return state.it; } // WB3b - if (state.caps[ph::curr].prop == word_property::CR || - state.caps[ph::curr].prop == word_property::LF || - state.caps[ph::curr].prop == word_property::Newline) { + if (detail::linebreak(state.caps[ph::curr].prop)) { return state.it; } @@ -792,16 +765,16 @@ constexpr std::array, 20> word_breaks = {{ // Putting this here means not having to do it explicitly // below between prop and next_prop (and transitively, // between prev_prop and prop). - state = detail::skip_forward(state, first, last, word_prop); + detail::skip_forward(state, first, last, word_prop); if (state.it == last) return state.it; // WB6 if (detail::ah_letter(state.caps[ph::prev].prop) && detail::mid_ah(state.caps[ph::curr].prop) && - std::next(state.it) != last) { - auto const temp_state = detail::skip_forward( - detail::next(state), first, last, word_prop); + state.it_next != last) { + auto temp_state = detail::next(state, last); + detail::skip_forward(temp_state, first, last, word_prop); if (temp_state.it == last) return temp_state.it; if (detail::ah_letter(temp_state.caps[ph::curr].prop)) @@ -818,19 +791,17 @@ constexpr std::array, 20> word_breaks = {{ // WB7b if (state.caps[ph::prev].prop == word_property::Hebrew_Letter && state.caps[ph::curr].prop == word_property::Double_Quote && - std::next(state.it) != last) { - auto const temp_state = detail::skip_forward( - detail::next(state), first, last, word_prop); + state.it_next != last) { + auto temp_state = detail::next(state, last); + detail::skip_forward(temp_state, first, last, word_prop); if (temp_state.it == last) return temp_state.it; - if (temp_state.caps[ph::curr].prop == - word_property::Hebrew_Letter) + if (temp_state.caps[ph::curr].prop == word_property::Hebrew_Letter) continue; } // WB7c - if (state.caps[ph::prev_prev].prop == - word_property::Hebrew_Letter && + if (state.caps[ph::prev_prev].prop == word_property::Hebrew_Letter && state.caps[ph::prev].prop == word_property::Double_Quote && state.caps[ph::curr].prop == word_property::Hebrew_Letter) { continue; @@ -846,37 +817,26 @@ constexpr std::array, 20> word_breaks = {{ // WB12 if (state.caps[ph::prev].prop == word_property::Numeric && detail::mid_num(state.caps[ph::curr].prop) && - std::next(state.it) != last) { - auto const temp_state = detail::skip_forward( - detail::next(state), first, last, word_prop); + state.it_next != last) { + auto temp_state = detail::next(state, last); + detail::skip_forward(temp_state, first, last, word_prop); if (temp_state.it == last) return temp_state.it; - if (temp_state.caps[ph::curr].prop == - word_property::Numeric) + if (temp_state.caps[ph::curr].prop == word_property::Numeric) continue; } - if (state.emoji_state == - detail::word_break_emoji_state_t::first_emoji) { - if (state.caps[ph::curr].prop == - word_property::Regional_Indicator) { - state.emoji_state = - detail::word_break_emoji_state_t::none; + if (state.emoji_state == detail::word_break_emoji_state_t::first_emoji) { + state.emoji_state = detail::word_break_emoji_state_t::none; + if (state.caps[ph::curr].prop == word_property::Regional_Indicator) { continue; - } else { - state.emoji_state = - detail::word_break_emoji_state_t::none; } - } else if ( - state.caps[ph::curr].prop == - word_property::Regional_Indicator) { - state.emoji_state = - detail::word_break_emoji_state_t::first_emoji; + } else if (state.caps[ph::curr].prop == word_property::Regional_Indicator) { + state.emoji_state = detail::word_break_emoji_state_t::first_emoji; return state.it; } - if (detail::table_word_break( - state.caps[ph::prev].prop, state.caps[ph::curr].prop)) + if (detail::table_word_break(state.caps[ph::prev].prop, state.caps[ph::curr].prop)) return state.it; } return state.it; From e903df12aeaff411af827bacbad4001d27cf572a Mon Sep 17 00:00:00 2001 From: Valery Mironov <32071355+MBkkt@users.noreply.github.com> Date: Wed, 20 Sep 2023 12:23:44 +0200 Subject: [PATCH 2/3] Fix test --- tests/index/postings_tests.cpp | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/tests/index/postings_tests.cpp b/tests/index/postings_tests.cpp index 36d8d1b8a..f7c90ac6d 100644 --- a/tests/index/postings_tests.cpp +++ b/tests/index/postings_tests.cpp @@ -55,12 +55,13 @@ void insert_find_core(const std::vector& src) { const std::string& s = src[i]; const bytes_view b = detail::to_bytes_view(s); auto res = bh.emplace(b); - ASSERT_NE(nullptr, res.first); - ASSERT_TRUE(res.second); + ASSERT_NE(nullptr, res); + ASSERT_FALSE(doc_limits::valid(res->doc)); + res->doc = doc_limits::invalid() + 1; res = bh.emplace(b); - ASSERT_NE(nullptr, res.first); - ASSERT_FALSE(res.second); + ASSERT_NE(nullptr, res); + ASSERT_TRUE(doc_limits::valid(res->doc)); } ASSERT_GT(memory.counter_, 0); @@ -72,21 +73,21 @@ void insert_find_core(const std::vector& src) { const std::string long_str(block_size - irs::bytes_io::vsize(32767), 'c'); auto res = bh.emplace(detail::to_bytes_view(long_str)); - ASSERT_TRUE(res.second); + ASSERT_FALSE(doc_limits::valid(res->doc)); } // insert long key { const std::string too_long_str(block_size, 'c'); auto res = bh.emplace(detail::to_bytes_view(too_long_str)); - ASSERT_TRUE(res.second); + ASSERT_FALSE(doc_limits::valid(res->doc)); } // insert too long key { const std::string too_long_str(1 + block_size, 'c'); auto res = bh.emplace(detail::to_bytes_view(too_long_str)); - ASSERT_FALSE(res.second); + ASSERT_EQ(nullptr, res); } ASSERT_GT(memory.counter_, 0); pool.reset(); From 20cbac407b9ce060896f53a1d491dc035df0d0e9 Mon Sep 17 00:00:00 2001 From: Valery Mironov <32071355+MBkkt@users.noreply.github.com> Date: Wed, 20 Sep 2023 12:26:03 +0200 Subject: [PATCH 3/3] Review suggestion --- core/index/field_data.cpp | 8 ++++---- core/index/postings.cpp | 3 +-- 2 files changed, 5 insertions(+), 6 deletions(-) diff --git a/core/index/field_data.cpp b/core/index/field_data.cpp index e4fb2de5b..4e2b54053 100644 --- a/core/index/field_data.cpp +++ b/core/index/field_data.cpp @@ -1054,9 +1054,9 @@ bool field_data::invert(token_stream& stream, doc_id_t id) { last_start_offs_ = start_offset; } - auto* posting = terms_.emplace(term->value); + auto* p = terms_.emplace(term->value); - if (posting == nullptr) { + if (p == nullptr) { IRS_LOG_WARN(absl::StrCat("skipping too long term of size: ", term->value.size(), " in field: ", meta_.name)); IRS_LOG_TRACE( @@ -1065,8 +1065,8 @@ bool field_data::invert(token_stream& stream, doc_id_t id) { continue; } - (this->*proc_table_[posting->doc == doc_limits::invalid()])(*posting, id, - pay, offs); + (this->*proc_table_[!doc_limits::valid(p->doc)])(*p, id, pay, offs); + IRS_ASSERT(doc_limits::valid(p->doc)); if (0 == ++stats_.len) { IRS_LOG_ERROR(absl::StrCat("too many tokens in field: ", meta_.name, diff --git a/core/index/postings.cpp b/core/index/postings.cpp index 577bbd479..9f2f1a3a6 100644 --- a/core/index/postings.cpp +++ b/core/index/postings.cpp @@ -92,8 +92,7 @@ posting* postings::emplace(bytes_view term) { auto* start = writer_.position().buffer(); writer_.write(term.data(), term_size); IRS_ASSERT(start == (writer_.position() - term_size).buffer()); - auto& posting = postings_.emplace_back(start, term_size); - return &posting; + return &postings_.emplace_back(start, term_size); } catch (...) { // we leave some garbage in block pool terms_.erase(it);