This repository has been archived by the owner on May 3, 2024. It is now read-only.

Commit: Speedup segmentation (#561)
* Speedup segmentation analyzer

* Fix test

* Review suggestion
MBkkt authored Sep 20, 2023
1 parent add15fb commit 2350a18
Showing 11 changed files with 203 additions and 247 deletions.
8 changes: 8 additions & 0 deletions CMakeLists.txt
@@ -97,6 +97,14 @@ endif ()
 if (CMAKE_BUILD_TYPE MATCHES "Coverage")
   set(IRESEARCH_COVERAGE ON)
   set(CMAKE_BUILD_TYPE "Debug")
+elseif (CMAKE_BUILD_TYPE MATCHES "Profile")
+  set(CMAKE_BUILD_TYPE "Release")
+  add_compile_options(
+    -g
+    -fno-omit-frame-pointer
+    # -fno-inline
+    # -fno-optimize-sibling-calls
+  )
 endif ()

 add_option_gprof(FALSE)
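Usage note: the new Profile configuration is selected like any other CMake build type, e.g. `cmake -DCMAKE_BUILD_TYPE=Profile <source-dir>`. Judging from the flags above, it keeps Release optimizations while adding debug symbols and frame pointers, so that sampling profilers (e.g. perf) can reconstruct call stacks.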
6 changes: 3 additions & 3 deletions core/analysis/segmentation_token_stream.cpp
@@ -328,11 +328,10 @@ bool segmentation_token_stream::next() {
   const auto begin = gr_begin.base();
   const auto end = gr_end.base();

-  const size_t length =
+  const auto length =
     static_cast<size_t>(std::distance(begin.base(), end.base()));

-  if (!length) {
-    // eof
+  if (length == 0) {  // eof
     return false;
   }

@@ -355,6 +354,7 @@ bool segmentation_token_stream::next() {
       term.value = {reinterpret_cast<const byte_type*>(&(*begin.base())),
                     length};
       break;
+      // TODO(MBkkt) do we need to call as_graphemes? Feels like no
     case options_t::case_convert_t::LOWER:
       term_buf_.clear();
       to_lower(as_graphemes(begin, end), from_utf32_back_inserter(term_buf_));
13 changes: 7 additions & 6 deletions core/formats/columnstore.cpp
@@ -256,21 +256,22 @@ void read_compact(irs::index_input& in, irs::encryption::stream* cipher,
   }
 }

-struct column_ref_eq : value_ref_eq<column_meta*> {
-  using self_t::operator();
+struct ColumnMetaEq : ValueRefEq<column_meta*> {
+  using is_transparent = void;
+  using Self::operator();

-  bool operator()(const ref_t& lhs,
+  bool operator()(const Ref& lhs,
                   const hashed_string_view& rhs) const noexcept {
-    return lhs.second->name == rhs;
+    return lhs.ref->name == rhs;
   }

   bool operator()(const hashed_string_view& lhs,
-                  const ref_t& rhs) const noexcept {
+                  const Ref& rhs) const noexcept {
     return this->operator()(rhs, lhs);
   }
 };

-using name_to_column_map = flat_hash_set<column_ref_eq>;
+using name_to_column_map = flat_hash_set<ColumnMetaEq>;

 class meta_writer final {
  public:
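The `using is_transparent = void;` added above is what opts the comparator into heterogeneous lookup: the set can now be probed with a `hashed_string_view` directly, without materializing a key object first. A minimal self-contained sketch of the same idiom using standard C++20 containers (illustrative only; `flat_hash_set` and `ValueRefEq` are iresearch types, assumed here to behave analogously):

#include <cstddef>
#include <functional>
#include <string>
#include <string_view>
#include <unordered_set>

// Both hash and equality expose is_transparent, so find() accepts any type
// they can handle (here std::string_view) without building a std::string key.
struct StrHash {
  using is_transparent = void;
  std::size_t operator()(std::string_view s) const noexcept {
    return std::hash<std::string_view>{}(s);
  }
};

struct StrEq {
  using is_transparent = void;
  bool operator()(std::string_view lhs, std::string_view rhs) const noexcept {
    return lhs == rhs;
  }
};

int main() {
  std::unordered_set<std::string, StrHash, StrEq> names{"alpha", "beta"};
  std::string_view key{"alpha"};
  return names.find(key) != names.end() ? 0 : 1;  // no temporary std::string
}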
32 changes: 16 additions & 16 deletions core/index/field_data.cpp
@@ -1054,23 +1054,23 @@ bool field_data::invert(token_stream& stream, doc_id_t id) {
     last_start_offs_ = start_offset;
   }

-  const auto res = terms_.emplace(term->value);
-
-  if (nullptr == res.first) {
-    IRS_LOG_WARN(absl::StrCat("skipping too long term of size '",
-                              term->value.size(), "' in field '", meta_.name,
-                              "'"));
-    IRS_LOG_TRACE(absl::StrCat("field '", meta_.name,
-                               "' contains too long term '",
-                               ViewCast<char>(term->value), "'"));
+  auto* p = terms_.emplace(term->value);
+
+  if (p == nullptr) {
+    IRS_LOG_WARN(absl::StrCat("skipping too long term of size: ",
+                              term->value.size(), " in field: ", meta_.name));
+    IRS_LOG_TRACE(
+      absl::StrCat("field: ", meta_.name,
+                   " contains too long term: ", ViewCast<char>(term->value)));
     continue;
   }

-  (this->*proc_table_[size_t(res.second)])(*res.first, id, pay, offs);
+  (this->*proc_table_[!doc_limits::valid(p->doc)])(*p, id, pay, offs);
+  IRS_ASSERT(doc_limits::valid(p->doc));

   if (0 == ++stats_.len) {
-    IRS_LOG_ERROR(absl::StrCat("too many tokens in field '", meta_.name,
-                               "', document '", id, "'"));
+    IRS_LOG_ERROR(absl::StrCat("too many tokens in field: ", meta_.name,
+                               ", document: ", id));
     return false;
   }

@@ -1108,12 +1108,12 @@ field_data* fields_data::emplace(const hashed_string_view& name,

   auto it = fields_map_.lazy_emplace(
     name, [&name](const fields_map::constructor& ctor) {
-      ctor(name.hash(), nullptr);
+      ctor(nullptr, name.hash());
     });

-  if (!it->second) {
+  if (!it->ref) {
     try {
-      const_cast<field_data*&>(it->second) = &fields_.emplace_back(
+      const_cast<field_data*&>(it->ref) = &fields_.emplace_back(
         name, features, *feature_info_, *cached_columns_, *cached_features_,
         columns, byte_writer_, int_writer_, index_features,
         (nullptr != comparator_));
@@ -1123,7 +1123,7 @@ field_data* fields_data::emplace(const hashed_string_view& name,
     }
   }

-  return it->second;
+  return it->ref;
 }

 void fields_data::flush(field_writer& fw, flush_state& state) {
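A reading of the dispatch above: `emplace` now returns a bare `posting*`, and a newly constructed posting starts with `doc == doc_limits::invalid()` (see the postings.hpp change below), so `!doc_limits::valid(p->doc)` is 1 exactly for first-seen terms and selects the "new term" member of `proc_table_`; the chosen handler is then expected to assign a real doc id, which the following `IRS_ASSERT` verifies. A toy sketch of that sentinel-dispatch idiom (names and table layout are illustrative, not the library's):

#include <cassert>
#include <cstdint>

constexpr std::uint32_t kInvalidDoc = 0;  // assumed stand-in for doc_limits::invalid()

struct Posting {
  std::uint32_t doc = kInvalidDoc;  // sentinel: "no document seen yet"
};

void add_first(Posting& p, std::uint32_t doc) { p.doc = doc; }  // new term
void add_next(Posting& p, std::uint32_t doc) { p.doc = doc; }   // existing term

using Handler = void (*)(Posting&, std::uint32_t);
constexpr Handler kProcTable[2] = {add_next, add_first};

void process(Posting& p, std::uint32_t doc) {
  // index 1 (new) while the sentinel is still in place, 0 (existing) after
  kProcTable[p.doc == kInvalidDoc](p, doc);
  assert(p.doc != kInvalidDoc);  // the handler must have assigned a doc id
}

int main() {
  Posting p;
  process(p, 42);  // dispatches to add_first: sentinel still present
  process(p, 43);  // dispatches to add_next
}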
13 changes: 7 additions & 6 deletions core/index/field_data.hpp
@@ -198,21 +198,22 @@ class field_data : util::noncopyable {

 class fields_data : util::noncopyable {
  private:
-  struct field_ref_eq : value_ref_eq<field_data*> {
-    using self_t::operator();
+  struct FieldEq : ValueRefEq<field_data*> {
+    using is_transparent = void;
+    using Self::operator();

-    bool operator()(const ref_t& lhs,
+    bool operator()(const Ref& lhs,
                     const hashed_string_view& rhs) const noexcept {
-      return lhs.second->meta().name == rhs;
+      return lhs.ref->meta().name == rhs;
     }

     bool operator()(const hashed_string_view& lhs,
-                    const ref_t& rhs) const noexcept {
+                    const Ref& rhs) const noexcept {
       return this->operator()(rhs, lhs);
     }
   };

-  using fields_map = flat_hash_set<field_ref_eq>;
+  using fields_map = flat_hash_set<FieldEq>;

  public:
   using postings_ref_t = std::vector<const posting*>;
58 changes: 27 additions & 31 deletions core/index/postings.cpp
@@ -34,12 +34,12 @@ namespace irs {

 void postings::get_sorted_postings(
     std::vector<const posting*>& postings) const {
-  postings.resize(map_.size());
+  IRS_ASSERT(terms_.size() == postings_.size());

-  auto begin = postings.begin();
-  for (auto& entry : map_) {
-    *begin = &postings_[entry.second];
-    ++begin;
+  postings.resize(postings_.size());
+
+  for (auto* p = postings.data(); const auto& posting : postings_) {
+    *p++ = &posting;
   }

   std::sort(postings.begin(), postings.end(),
@@ -48,20 +48,20 @@ void postings::get_sorted_postings(
     });
 }

-std::pair<posting*, bool> postings::emplace(bytes_view term) {
+posting* postings::emplace(bytes_view term) {
   REGISTER_TIMER_DETAILED();
   auto& parent = writer_.parent();

   // maximum number of bytes needed for storage of term length and data
-  const auto max_term_len = term.size();  // + vencode_size(term.size());
+  const auto term_size = term.size();  // + vencode_size(term.size());

-  if (writer_t::container::block_type::SIZE < max_term_len) {
+  if (writer_t::container::block_type::SIZE < term_size) {
     // TODO: maybe move big terms to a separate storage
     // reject terms that do not fit in a block
-    return std::make_pair(nullptr, false);
+    return nullptr;
   }

-  const auto slice_end = writer_.pool_offset() + max_term_len;
+  const auto slice_end = writer_.pool_offset() + term_size;
   const auto next_block_start =
     writer_.pool_offset() < parent.value_count()
       ? writer_.position().block_offset() +
@@ -74,34 +74,30 @@ std::pair<posting*, bool> postings::emplace(bytes_view term) {
   }

   IRS_ASSERT(size() < doc_limits::eof());  // not larger than the static flag
-  IRS_ASSERT(map_.size() == postings_.size());
+  IRS_ASSERT(terms_.size() == postings_.size());

   const hashed_bytes_view hashed_term{term};

   bool is_new = false;
-  const auto it = map_.lazy_emplace(
-    hashed_term, [&is_new, hash = hashed_term.hash(),
-                  id = map_.size()](const map_t::constructor& ctor) {
+  const auto it = terms_.lazy_emplace(
+    hashed_term, [&, size = terms_.size()](const auto& ctor) {
+      ctor(size, hashed_term.hash());
       is_new = true;
-      ctor(hash, id);
     });

-  if (is_new) {
-    // for new terms also write out their value
-    try {
-      writer_.write(term.data(), term.size());
-      postings_.emplace_back();
-    } catch (...) {
-      // we leave some garbage in block pool
-      map_.erase(it);
-      throw;
-    }
-
-    postings_.back().term = {(writer_.position() - term.size()).buffer(),
-                             term.size()};
+  if (IRS_LIKELY(!is_new)) {
+    return &postings_[it->ref];
   }
-
-  return {&postings_[it->second], is_new};
+  // for new terms also write out their value
+  try {
+    auto* start = writer_.position().buffer();
+    writer_.write(term.data(), term_size);
+    IRS_ASSERT(start == (writer_.position() - term_size).buffer());
+    return &postings_.emplace_back(start, term_size);
+  } catch (...) {
+    // we leave some garbage in block pool
+    terms_.erase(it);
+    throw;
+  }
 }

 } // namespace irs
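The restructured control flow also tightens the exception path: the hash-set entry is created in place via `lazy_emplace`, and if writing the term bytes into the block pool then throws, the entry is erased so the set never references unwritten bytes (modulo the acknowledged garbage left in the pool). A condensed sketch of that insert-then-rollback pattern, assuming an Abseil-style container (`absl::flat_hash_set` provides `lazy_emplace`/`erase`; iresearch's `flat_hash_set` is assumed analogous):

#include <string>
#include <string_view>
#include "absl/container/flat_hash_set.h"

// Hypothetical stand-in for writer_.write; imagine it may throw.
void write_to_side_storage(std::string_view) {}

const std::string* intern(absl::flat_hash_set<std::string>& set,
                          std::string_view term) {
  bool is_new = false;
  auto it = set.lazy_emplace(term, [&](const auto& ctor) {
    ctor(std::string{term});  // construct the element in place, exactly once
    is_new = true;            // runs only if the key was absent
  });
  if (!is_new) {
    return &*it;  // existing term: nothing else to do
  }
  try {
    write_to_side_storage(term);
    return &*it;
  } catch (...) {
    set.erase(it);  // roll back so the set stays consistent with storage
    throw;
  }
}

int main() {
  absl::flat_hash_set<std::string> set;
  return intern(set, "hello") != nullptr ? 0 : 1;
}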
40 changes: 21 additions & 19 deletions core/index/postings.hpp
@@ -29,6 +29,7 @@
 #include "utils/hash_utils.hpp"
 #include "utils/noncopyable.hpp"
 #include "utils/string.hpp"
+#include "utils/type_limits.hpp"

 namespace irs {

@@ -54,6 +55,9 @@ using byte_block_pool =
   block_pool<byte_type, 32768, ManagedTypedAllocator<byte_type>>;

 struct posting {
+  explicit posting(const byte_type* data, size_t size) noexcept
+    : term{data, size} {}
+
   bytes_view term;
   uint64_t doc_code;
   // ...........................................................................
@@ -64,7 +68,7 @@ struct posting {
   // [3] - pointer to prox stream begin
   // ...........................................................................
   size_t int_start;
-  doc_id_t doc;
+  doc_id_t doc{doc_limits::invalid()};
   uint32_t freq;
   uint32_t pos;
   uint32_t offs{0};
@@ -77,51 +81,49 @@ class postings : util::noncopyable {

   // cppcheck-suppress constParameter
   explicit postings(writer_t& writer)
-    : map_{0, value_ref_hash{}, term_id_eq{postings_}}, writer_(writer) {}
+    : terms_{0, ValueRefHash{}, TermEq{postings_}}, writer_(writer) {}

   void clear() noexcept {
-    map_.clear();
+    terms_.clear();
     postings_.clear();
   }

   /// @brief fill a provided vector with terms and corresponding postings in
   /// sorted order
   void get_sorted_postings(std::vector<const posting*>& postings) const;

-  /// @note on error returns std::ptr(nullptr, false)
-  std::pair<posting*, bool> emplace(bytes_view term);
+  /// @note on error returns nullptr
+  /// @note returned pointer remains valid until the next call
+  posting* emplace(bytes_view term);

-  bool empty() const noexcept { return map_.empty(); }
-  size_t size() const noexcept { return map_.size(); }
+  bool empty() const noexcept { return terms_.empty(); }
+  size_t size() const noexcept { return terms_.size(); }

  private:
-  class term_id_eq : public value_ref_eq<size_t> {
-   public:
-    explicit term_id_eq(const std::vector<posting>& data) noexcept
-      : data_(&data) {}
+  struct TermEq : ValueRefEq<size_t> {
+    using is_transparent = void;
+    using Self::operator();

-    using self_t::operator();
+    explicit TermEq(const std::vector<posting>& data) noexcept : data_{&data} {}

-    bool operator()(const ref_t& lhs,
+    bool operator()(const Ref& lhs,
                     const hashed_bytes_view& rhs) const noexcept {
-      IRS_ASSERT(lhs.second < data_->size());
-      return (*data_)[lhs.second].term == rhs;
+      IRS_ASSERT(lhs.ref < data_->size());
+      return (*data_)[lhs.ref].term == rhs;
     }

     bool operator()(const hashed_bytes_view& lhs,
-                    const ref_t& rhs) const noexcept {
+                    const Ref& rhs) const noexcept {
       return this->operator()(rhs, lhs);
     }

    private:
     const std::vector<posting>* data_;
   };

-  using map_t = flat_hash_set<term_id_eq>;
-
+  // TODO(MBkkt) Maybe just flat_hash_set<unique_ptr<posting>>?
   std::vector<posting> postings_;
-  map_t map_;
+  flat_hash_set<TermEq> terms_;
   writer_t& writer_;
 };
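Stepping back, the data structure after this commit: `postings_` owns all `posting` objects contiguously, and `terms_` stores only indices into that vector, with `TermEq` transparently comparing the indexed term bytes against a probe key. That keeps the hash-table entries small and lets `get_sorted_postings` walk the vector directly. A standalone sketch of this index-into-vector interning idiom with standard C++20 containers (simplified types; not the library's actual API):

#include <cstddef>
#include <functional>
#include <string>
#include <string_view>
#include <unordered_set>
#include <vector>

// Entries live in a vector; the set stores only indices into it.
struct Entry {
  std::string term;
};

struct IndexHash {
  using is_transparent = void;
  const std::vector<Entry>* data;
  std::size_t operator()(std::size_t i) const {
    return std::hash<std::string_view>{}((*data)[i].term);
  }
  std::size_t operator()(std::string_view s) const {
    return std::hash<std::string_view>{}(s);
  }
};

struct IndexEq {
  using is_transparent = void;
  const std::vector<Entry>* data;
  bool operator()(std::size_t a, std::size_t b) const {
    return (*data)[a].term == (*data)[b].term;
  }
  bool operator()(std::size_t a, std::string_view s) const {
    return (*data)[a].term == s;
  }
  bool operator()(std::string_view s, std::size_t a) const {
    return (*data)[a].term == s;
  }
};

class Interner {
 public:
  Interner() : set_{0, IndexHash{&entries_}, IndexEq{&entries_}} {}

  // Returns the index of term, inserting a new Entry on first sight.
  std::size_t emplace(std::string_view term) {
    if (auto it = set_.find(term); it != set_.end()) return *it;
    entries_.push_back(Entry{std::string{term}});
    return *set_.insert(entries_.size() - 1).first;
  }

 private:
  std::vector<Entry> entries_;
  std::unordered_set<std::size_t, IndexHash, IndexEq> set_;
};

In the real `postings` class the equality additionally asserts the index is in range, and the vector elements carry the full posting state; the sketch keeps only the term to stay short.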
