This repository has been archived by the owner on May 3, 2024. It is now read-only.

Speedup segmentation #561

Merged: 3 commits on Sep 20, 2023
Changes from all commits
8 changes: 8 additions & 0 deletions CMakeLists.txt
@@ -97,6 +97,14 @@ endif ()
 if (CMAKE_BUILD_TYPE MATCHES "Coverage")
   set(IRESEARCH_COVERAGE ON)
   set(CMAKE_BUILD_TYPE "Debug")
+elseif (CMAKE_BUILD_TYPE MATCHES "Profile")
+  set(CMAKE_BUILD_TYPE "Release")
+  add_compile_options(
+    -g
+    -fno-omit-frame-pointer
+    # -fno-inline
+    # -fno-optimize-sibling-calls
+  )
 endif ()

 add_option_gprof(FALSE)
6 changes: 3 additions & 3 deletions core/analysis/segmentation_token_stream.cpp
@@ -328,11 +328,10 @@ bool segmentation_token_stream::next() {
   const auto begin = gr_begin.base();
   const auto end = gr_end.base();

-  const size_t length =
+  const auto length =
     static_cast<size_t>(std::distance(begin.base(), end.base()));

-  if (!length) {
-    // eof
+  if (length == 0) { // eof
     return false;
   }

@@ -355,6 +354,7 @@ bool segmentation_token_stream::next() {
       term.value = {reinterpret_cast<const byte_type*>(&(*begin.base())),
                     length};
       break;
+    // TODO(MBkkt) do we need to call as_graphemes? Feels like no
     case options_t::case_convert_t::LOWER:
       term_buf_.clear();
       to_lower(as_graphemes(begin, end), from_utf32_back_inserter(term_buf_));
13 changes: 7 additions & 6 deletions core/formats/columnstore.cpp
@@ -256,21 +256,22 @@ void read_compact(irs::index_input& in, irs::encryption::stream* cipher,
   }
 }

-struct column_ref_eq : value_ref_eq<column_meta*> {
-  using self_t::operator();
+struct ColumnMetaEq : ValueRefEq<column_meta*> {
+  using is_transparent = void;
+  using Self::operator();

-  bool operator()(const ref_t& lhs,
+  bool operator()(const Ref& lhs,
                   const hashed_string_view& rhs) const noexcept {
-    return lhs.second->name == rhs;
+    return lhs.ref->name == rhs;
   }

   bool operator()(const hashed_string_view& lhs,
-                  const ref_t& rhs) const noexcept {
+                  const Ref& rhs) const noexcept {
     return this->operator()(rhs, lhs);
   }
 };

-using name_to_column_map = flat_hash_set<column_ref_eq>;
+using name_to_column_map = flat_hash_set<ColumnMetaEq>;

 class meta_writer final {
  public:
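
The renamed comparator keeps the same heterogeneous-lookup idea: because the equality functor declares is_transparent and accepts a hashed_string_view on either side, the name-to-column set can be probed by name without materializing a stored entry first. A minimal sketch of that pattern, assuming an Abseil-style flat_hash_set; Meta, MetaHash, MetaEq and contains_name are illustrative names, not the library's:

#include <cstddef>
#include <string>
#include <string_view>

#include "absl/container/flat_hash_set.h"
#include "absl/hash/hash.h"

struct Meta {
  std::string name;
};

// Both functors declare `is_transparent`, so find() accepts a
// std::string_view probe directly instead of a stored Meta*.
struct MetaHash {
  using is_transparent = void;
  size_t operator()(std::string_view s) const { return absl::HashOf(s); }
  size_t operator()(const Meta* m) const {
    return absl::HashOf(std::string_view{m->name});
  }
};

struct MetaEq {
  using is_transparent = void;
  bool operator()(const Meta* lhs, const Meta* rhs) const {
    return lhs->name == rhs->name;
  }
  bool operator()(const Meta* lhs, std::string_view rhs) const {
    return lhs->name == rhs;
  }
  bool operator()(std::string_view lhs, const Meta* rhs) const {
    return rhs->name == lhs;
  }
};

using MetaSet = absl::flat_hash_set<Meta*, MetaHash, MetaEq>;

bool contains_name(const MetaSet& set, std::string_view name) {
  return set.find(name) != set.end();  // no temporary Meta is constructed
}
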
32 changes: 16 additions & 16 deletions core/index/field_data.cpp
@@ -1054,23 +1054,23 @@ bool field_data::invert(token_stream& stream, doc_id_t id) {
       last_start_offs_ = start_offset;
     }

-    const auto res = terms_.emplace(term->value);
-
-    if (nullptr == res.first) {
-      IRS_LOG_WARN(absl::StrCat("skipping too long term of size '",
-                                term->value.size(), "' in field '", meta_.name,
-                                "'"));
-      IRS_LOG_TRACE(absl::StrCat("field '", meta_.name,
-                                 "' contains too long term '",
-                                 ViewCast<char>(term->value), "'"));
+    auto* p = terms_.emplace(term->value);
+
+    if (p == nullptr) {
+      IRS_LOG_WARN(absl::StrCat("skipping too long term of size: ",
+                                term->value.size(), " in field: ", meta_.name));
+      IRS_LOG_TRACE(
+        absl::StrCat("field: ", meta_.name,
+                     " contains too long term: ", ViewCast<char>(term->value)));
       continue;
     }

-    (this->*proc_table_[size_t(res.second)])(*res.first, id, pay, offs);
+    (this->*proc_table_[!doc_limits::valid(p->doc)])(*p, id, pay, offs);
+    IRS_ASSERT(doc_limits::valid(p->doc));

     if (0 == ++stats_.len) {
-      IRS_LOG_ERROR(absl::StrCat("too many tokens in field '", meta_.name,
-                                 "', document '", id, "'"));
+      IRS_LOG_ERROR(absl::StrCat("too many tokens in field: ", meta_.name,
+                                 ", document: ", id));
       return false;
     }

@@ -1108,12 +1108,12 @@ field_data* fields_data::emplace(const hashed_string_view& name,

   auto it = fields_map_.lazy_emplace(
     name, [&name](const fields_map::constructor& ctor) {
-      ctor(name.hash(), nullptr);
+      ctor(nullptr, name.hash());
     });

-  if (!it->second) {
+  if (!it->ref) {
     try {
-      const_cast<field_data*&>(it->second) = &fields_.emplace_back(
+      const_cast<field_data*&>(it->ref) = &fields_.emplace_back(
         name, features, *feature_info_, *cached_columns_, *cached_features_,
         columns, byte_writer_, int_writer_, index_features,
         (nullptr != comparator_));
@@ -1123,7 +1123,7 @@
     }
   }

-  return it->second;
+  return it->ref;
 }

 void fields_data::flush(field_writer& fw, flush_state& state) {
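
In the rewritten call site above, field_data::invert no longer receives a std::pair<posting*, bool> from terms_.emplace(); whether the term is new is read off the posting itself, which starts life holding the invalid-document sentinel (see doc_id_t doc{doc_limits::invalid()}; in postings.hpp below). A rough sketch of that contract, where doc_id_t, doc_limits and add_term_hit are simplified stand-ins rather than the library's exact definitions:

#include <cstdint>

using doc_id_t = uint32_t;

// Illustrative stand-in for the library's doc_limits helpers.
struct doc_limits {
  static constexpr doc_id_t invalid() { return 0; }
  static constexpr bool valid(doc_id_t id) { return id != invalid(); }
};

struct posting {
  doc_id_t doc{doc_limits::invalid()};  // sentinel until the first doc is seen
  uint32_t freq{0};
};

// The "is this term new?" bit that used to travel in the bool of
// std::pair<posting*, bool> is now implied by the sentinel, so one branch
// on the returned posting is enough.
void add_term_hit(posting& p, doc_id_t id) {
  if (!doc_limits::valid(p.doc)) {
    // first occurrence of the term: start a new posting
    p.doc = id;
    p.freq = 1;
  } else {
    // term already indexed: extend the existing posting
    p.freq += 1;
  }
}
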
13 changes: 7 additions & 6 deletions core/index/field_data.hpp
@@ -198,21 +198,22 @@ class field_data : util::noncopyable {

 class fields_data : util::noncopyable {
  private:
-  struct field_ref_eq : value_ref_eq<field_data*> {
-    using self_t::operator();
+  struct FieldEq : ValueRefEq<field_data*> {
+    using is_transparent = void;
+    using Self::operator();

-    bool operator()(const ref_t& lhs,
+    bool operator()(const Ref& lhs,
                     const hashed_string_view& rhs) const noexcept {
-      return lhs.second->meta().name == rhs;
+      return lhs.ref->meta().name == rhs;
     }

     bool operator()(const hashed_string_view& lhs,
-                    const ref_t& rhs) const noexcept {
+                    const Ref& rhs) const noexcept {
       return this->operator()(rhs, lhs);
     }
   };

-  using fields_map = flat_hash_set<field_ref_eq>;
+  using fields_map = flat_hash_set<FieldEq>;

  public:
   using postings_ref_t = std::vector<const posting*>;
58 changes: 27 additions & 31 deletions core/index/postings.cpp
@@ -34,12 +34,12 @@ namespace irs {

 void postings::get_sorted_postings(
   std::vector<const posting*>& postings) const {
-  postings.resize(map_.size());
+  IRS_ASSERT(terms_.size() == postings_.size());

-  auto begin = postings.begin();
-  for (auto& entry : map_) {
-    *begin = &postings_[entry.second];
-    ++begin;
+  postings.resize(postings_.size());
+
+  for (auto* p = postings.data(); const auto& posting : postings_) {
+    *p++ = &posting;
   }

   std::sort(postings.begin(), postings.end(),
@@ -48,20 +48,20 @@ void postings::get_sorted_postings(
             });
 }

-std::pair<posting*, bool> postings::emplace(bytes_view term) {
+posting* postings::emplace(bytes_view term) {
   REGISTER_TIMER_DETAILED();
   auto& parent = writer_.parent();

   // maximum number to bytes needed for storage of term length and data
-  const auto max_term_len = term.size(); // + vencode_size(term.size());
+  const auto term_size = term.size(); // + vencode_size(term.size());

-  if (writer_t::container::block_type::SIZE < max_term_len) {
+  if (writer_t::container::block_type::SIZE < term_size) {
     // TODO: maybe move big terms it to a separate storage
     // reject terms that do not fit in a block
-    return std::make_pair(nullptr, false);
+    return nullptr;
   }

-  const auto slice_end = writer_.pool_offset() + max_term_len;
+  const auto slice_end = writer_.pool_offset() + term_size;
   const auto next_block_start =
     writer_.pool_offset() < parent.value_count()
       ? writer_.position().block_offset() +
@@ -74,34 +74,30 @@ std::pair<posting*, bool> postings::emplace(bytes_view term) {
   }

   IRS_ASSERT(size() < doc_limits::eof()); // not larger then the static flag
-  IRS_ASSERT(map_.size() == postings_.size());
+  IRS_ASSERT(terms_.size() == postings_.size());

   const hashed_bytes_view hashed_term{term};

   bool is_new = false;
-  const auto it = map_.lazy_emplace(
-    hashed_term, [&is_new, hash = hashed_term.hash(),
-                  id = map_.size()](const map_t::constructor& ctor) {
+  const auto it = terms_.lazy_emplace(
+    hashed_term, [&, size = terms_.size()](const auto& ctor) {
+      ctor(size, hashed_term.hash());
       is_new = true;
-      ctor(hash, id);
     });

-  if (is_new) {
-    // for new terms also write out their value
-    try {
-      writer_.write(term.data(), term.size());
-      postings_.emplace_back();
-    } catch (...) {
-      // we leave some garbage in block pool
-      map_.erase(it);
-      throw;
-    }
-
-    postings_.back().term = {(writer_.position() - term.size()).buffer(),
-                             term.size()};
+  if (IRS_LIKELY(!is_new)) {
+    return &postings_[it->ref];
+  }
+  // for new terms also write out their value
+  try {
+    auto* start = writer_.position().buffer();
+    writer_.write(term.data(), term_size);
+    IRS_ASSERT(start == (writer_.position() - term_size).buffer());
+    return &postings_.emplace_back(start, term_size);
+  } catch (...) {
+    // we leave some garbage in block pool
+    terms_.erase(it);
+    throw;
   }
-
-  return {&postings_[it->second], is_new};
 }

 } // namespace irs
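
postings::emplace keeps every posting in a contiguous std::vector while the hash set stores only small references that are resolved through that vector, and lazy_emplace constructs the reference only when the term is absent. A simplified sketch of that layout, assuming an Abseil-style flat_hash_set with transparent hash and equality; PostingsIndex, Posting, Hash and Eq are illustrative names, and unlike the real code the sketch appends to the vector before the lookup instead of storing a precomputed hash and erasing the map entry when the append fails:

#include <cstddef>
#include <string>
#include <string_view>
#include <vector>

#include "absl/container/flat_hash_set.h"
#include "absl/hash/hash.h"

struct Posting {
  std::string term;
};

class PostingsIndex {
 public:
  PostingsIndex() : set_(0, Hash{&postings_}, Eq{&postings_}) {}

  // Returns the posting for `term`, creating it on first use. The pointer is
  // only valid until the next call because the vector may reallocate.
  Posting* emplace(std::string_view term) {
    // Append first so every index stored in the set refers to a live element
    // (the real code appends only for genuinely new terms).
    postings_.emplace_back(Posting{std::string{term}});
    bool is_new = false;
    const auto it = set_.lazy_emplace(term, [&](const auto& ctor) {
      is_new = true;
      ctor(postings_.size() - 1);  // the set stores only an index
    });
    if (!is_new) {
      postings_.pop_back();  // term was already indexed: drop the extra copy
    }
    return &postings_[*it];
  }

  size_t size() const noexcept { return set_.size(); }

 private:
  // Transparent hash/equality resolve stored indices through postings_, so a
  // lookup by std::string_view needs no temporary key object.
  struct Hash {
    using is_transparent = void;
    const std::vector<Posting>* data;
    size_t operator()(std::string_view s) const { return absl::HashOf(s); }
    size_t operator()(size_t idx) const {
      return absl::HashOf(std::string_view{(*data)[idx].term});
    }
  };
  struct Eq {
    using is_transparent = void;
    const std::vector<Posting>* data;
    bool operator()(size_t lhs, size_t rhs) const { return lhs == rhs; }
    bool operator()(size_t lhs, std::string_view rhs) const {
      return (*data)[lhs].term == rhs;
    }
    bool operator()(std::string_view lhs, size_t rhs) const {
      return (*data)[rhs].term == lhs;
    }
  };

  std::vector<Posting> postings_;
  absl::flat_hash_set<size_t, Hash, Eq> set_;
};
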
40 changes: 21 additions & 19 deletions core/index/postings.hpp
@@ -29,6 +29,7 @@
 #include "utils/hash_utils.hpp"
 #include "utils/noncopyable.hpp"
 #include "utils/string.hpp"
+#include "utils/type_limits.hpp"

 namespace irs {

@@ -54,6 +55,9 @@ using byte_block_pool =
   block_pool<byte_type, 32768, ManagedTypedAllocator<byte_type>>;

 struct posting {
+  explicit posting(const byte_type* data, size_t size) noexcept
+    : term{data, size} {}
+
   bytes_view term;
   uint64_t doc_code;
   // ...........................................................................
@@ -64,7 +68,7 @@ struct posting {
   // [3] - pointer to prox stream begin
   // ...........................................................................
   size_t int_start;
-  doc_id_t doc;
+  doc_id_t doc{doc_limits::invalid()};
   uint32_t freq;
   uint32_t pos;
   uint32_t offs{0};
@@ -77,51 +81,49 @@ class postings : util::noncopyable {

   // cppcheck-suppress constParameter
   explicit postings(writer_t& writer)
-    : map_{0, value_ref_hash{}, term_id_eq{postings_}}, writer_(writer) {}
+    : terms_{0, ValueRefHash{}, TermEq{postings_}}, writer_(writer) {}

   void clear() noexcept {
-    map_.clear();
+    terms_.clear();
     postings_.clear();
   }

   /// @brief fill a provided vector with terms and corresponding postings in
   /// sorted order
   void get_sorted_postings(std::vector<const posting*>& postings) const;

-  /// @note on error returns std::ptr(nullptr, false)
-  std::pair<posting*, bool> emplace(bytes_view term);
+  /// @note on error returns nullptr
+  /// @note returned poitern remains valid until the next call
+  posting* emplace(bytes_view term);

-  bool empty() const noexcept { return map_.empty(); }
-  size_t size() const noexcept { return map_.size(); }
+  bool empty() const noexcept { return terms_.empty(); }
+  size_t size() const noexcept { return terms_.size(); }

  private:
-  class term_id_eq : public value_ref_eq<size_t> {
-   public:
-    explicit term_id_eq(const std::vector<posting>& data) noexcept
-      : data_(&data) {}
+  struct TermEq : ValueRefEq<size_t> {
+    using is_transparent = void;
+    using Self::operator();

-    using self_t::operator();
+    explicit TermEq(const std::vector<posting>& data) noexcept : data_{&data} {}

-    bool operator()(const ref_t& lhs,
+    bool operator()(const Ref& lhs,
                     const hashed_bytes_view& rhs) const noexcept {
-      IRS_ASSERT(lhs.second < data_->size());
-      return (*data_)[lhs.second].term == rhs;
+      IRS_ASSERT(lhs.ref < data_->size());
+      return (*data_)[lhs.ref].term == rhs;
     }

     bool operator()(const hashed_bytes_view& lhs,
-                    const ref_t& rhs) const noexcept {
+                    const Ref& rhs) const noexcept {
       return this->operator()(rhs, lhs);
     }

   private:
    const std::vector<posting>* data_;
  };

-  using map_t = flat_hash_set<term_id_eq>;
-
+  // TODO(MBkkt) Maybe just flat_hash_set<unique_ptr<posting>>?
   std::vector<posting> postings_;
-  map_t map_;
+  flat_hash_set<TermEq> terms_;
   writer_t& writer_;
 };