From 160595433f99a495e594925ead333703e05efdd7 Mon Sep 17 00:00:00 2001 From: Valery Mironov <32071355+MBkkt@users.noreply.github.com> Date: Thu, 30 Nov 2023 14:11:49 +0100 Subject: [PATCH] Wildcard analyzer helpers (#578) * WIP * WIP --- core/analysis/analyzers.cpp | 91 +++++++- core/analysis/analyzers.hpp | 9 + core/analysis/minhash_token_stream.cpp | 283 ++++++++---------------- core/analysis/ngram_token_stream.cpp | 6 +- core/search/ngram_similarity_filter.cpp | 28 +-- core/search/ngram_similarity_filter.hpp | 8 +- tests/tests_main.cpp | 3 + 7 files changed, 208 insertions(+), 220 deletions(-) diff --git a/core/analysis/analyzers.cpp b/core/analysis/analyzers.cpp index 9239215cd..eedcaf681 100644 --- a/core/analysis/analyzers.cpp +++ b/core/analysis/analyzers.cpp @@ -23,12 +23,18 @@ #include "analysis/analyzers.hpp" +#include +#include + +#include "analysis/token_streams.hpp" #include "utils/hash_utils.hpp" #include "utils/register.hpp" - -namespace { +#include "utils/vpack_utils.hpp" using namespace irs; +using namespace arangodb; + +namespace { struct key { key(std::string_view type, const irs::type_info& args_format) @@ -70,10 +76,10 @@ struct hash<::key> { }; } // namespace std - +namespace irs::analysis { namespace { -constexpr std::string_view kFileNamePrefix{"libanalyzer-"}; +constexpr std::string_view kFileNamePrefix = "libanalyzer-"; class analyzer_register final : public irs::tagged_generic_register<::key, ::value, std::string_view, @@ -93,9 +99,27 @@ class analyzer_register final } }; -} // namespace +constexpr std::string_view kTypeParam = "type"; +constexpr std::string_view kPropertiesParam = "properties"; +constexpr std::string_view kAnalyzerParam = "analyzer"; -namespace irs::analysis { +std::string_view GetType(velocypack::Slice& input) { + IRS_ASSERT(input.isObject()); + input = input.get(kAnalyzerParam); + if (input.isNone() || input.isNull() || input.isEmptyObject()) { + return irs::string_token_stream::type_name(); + } + if (!input.isObject()) { + return {}; + } + auto type = input.get(kTypeParam); + if (!type.isString()) { + return {}; + } + return type.stringView(); +} + +} // namespace analyzer_registrar::analyzer_registrar( const type_info& type, const type_info& args_format, @@ -195,5 +219,60 @@ bool visit( return analyzer_register::instance().visit(wrapper); } +bool MakeAnalyzer(velocypack::Slice input, analyzer::ptr& output) { + auto type = GetType(input); + if (type.empty()) { + return false; + } + if (type == irs::string_token_stream::type_name()) { + output = {}; + return true; + } + input = input.get(kPropertiesParam); + if (input.isNone()) { + input = velocypack::Slice::emptyObjectSlice(); + } + output = get(type, irs::type::get(), + {input.startAs(), input.byteSize()}); + if (!output) { + // fallback to json format if vpack isn't available + output = get(type, irs::type::get(), + irs::slice_to_string(input)); + } + return output != nullptr; +} + +bool NormalizeAnalyzer(velocypack::Slice input, velocypack::Builder& output) { + auto type = GetType(input); + if (type.empty()) { + return false; + } + velocypack::ObjectBuilder scope{&output, kAnalyzerParam}; + if (type == irs::string_token_stream::type_name()) { + return true; + } + output.add(kTypeParam, velocypack::Value{type}); + input = input.get(kPropertiesParam); + if (input.isNone()) { + input = velocypack::Slice::emptyObjectSlice(); + } + std::string normalized; + if (normalize(normalized, type, irs::type::get(), + {input.startAs(), input.byteSize()})) { + output.add( + kPropertiesParam, + velocypack::Slice{reinterpret_cast(normalized.data())}); + return true; + } + // fallback to json format if vpack isn't available + if (normalize(normalized, type, irs::type::get(), + slice_to_string(input))) { + auto vpack = velocypack::Parser::fromJson(normalized); + output.add(kPropertiesParam, vpack->slice()); + return true; + } + return false; +} + } // namespace analyzers } // namespace irs::analysis diff --git a/core/analysis/analyzers.hpp b/core/analysis/analyzers.hpp index 50535615e..3367b5cff 100644 --- a/core/analysis/analyzers.hpp +++ b/core/analysis/analyzers.hpp @@ -23,12 +23,17 @@ #pragma once +#include + #include #include "analyzer.hpp" #include "shared.hpp" #include "utils/text_format.hpp" +namespace arangodb::velocypack { +class Builder; +} // namespace arangodb::velocypack namespace irs::analysis { using factory_f = analysis::analyzer::ptr (*)(std::string_view args); @@ -71,6 +76,10 @@ void load_all(std::string_view path); bool visit( const std::function& visitor); +bool MakeAnalyzer(arangodb::velocypack::Slice input, analyzer::ptr& output); +bool NormalizeAnalyzer(arangodb::velocypack::Slice input, + arangodb::velocypack::Builder& output); + } // namespace analyzers } // namespace irs::analysis diff --git a/core/analysis/minhash_token_stream.cpp b/core/analysis/minhash_token_stream.cpp index 0dc26ef86..2bc2e2d06 100644 --- a/core/analysis/minhash_token_stream.cpp +++ b/core/analysis/minhash_token_stream.cpp @@ -205,237 +205,140 @@ uint64_t CityHash64(const char* s, size_t len) { } } // namespace - +namespace irs::analysis { namespace { using namespace arangodb; -using namespace irs; -using namespace irs::analysis; -constexpr uint32_t kMinHashes = 1; -constexpr std::string_view kTypeParam{"type"}; -constexpr std::string_view kPropertiesParam{"properties"}; -constexpr std::string_view kAnalyzerParam{"analyzer"}; -constexpr std::string_view kNumHashes{"numHashes"}; +constexpr std::string_view kParseError = + ", failed to parse options for MinHashTokenStream"; +constexpr offset kEmptyOffset; -const offset kEmptyOffset; - -std::pair ParseAnalyzer( - velocypack::Slice slice) { - if (!slice.isObject()) { - return {}; +constexpr uint32_t kMinHashes = 1; +constexpr std::string_view kNumHashes = "numHashes"; + +bool ParseNumHashes(velocypack::Slice input, uint32_t& num_hashes) { + IRS_ASSERT(input.isObject()); + input = input.get(kNumHashes); + if (!input.isNumber()) { + IRS_LOG_ERROR(absl::StrCat( + kNumHashes, " attribute must be positive integer", kParseError)); + return false; } - - const auto typeSlice = slice.get(kTypeParam); - - if (!typeSlice.isString()) { - IRS_LOG_ERROR(absl::StrCat("Failed to read '", kTypeParam, - "' attribute of '", kAnalyzerParam, - "' member as string while constructing " - "MinHashTokenStream from VPack arguments")); - return {}; + num_hashes = input.getNumber(); + if (num_hashes < kMinHashes) { + IRS_LOG_ERROR(absl::StrCat(kNumHashes, " attribute must be at least ", + kMinHashes, kParseError)); + return false; } - - return {typeSlice.stringView(), slice.get(kPropertiesParam)}; + return true; } -bool ParseVPack(velocypack::Slice slice, MinHashTokenStream::Options* opts) { - IRS_ASSERT(opts); - - if (const auto num_hashesSlice = slice.get(kNumHashes); - !num_hashesSlice.isNumber()) { - IRS_LOG_ERROR(absl::StrCat("Failed to read '", kNumHashes, - "' attribute as number while constructing " - "MinHashTokenStream from VPack arguments")); +bool ParseOptions(velocypack::Slice slice, + MinHashTokenStream::Options& options) { + if (!slice.isObject()) { return false; - } else { - opts->num_hashes = num_hashesSlice.getNumbernum_hashes)>(); } - - if (opts->num_hashes < kMinHashes) { - IRS_LOG_ERROR( - "Number of hashes must be at least 1, failed to construct " - "MinHashTokenStream from VPack arguments"); + if (!ParseNumHashes(slice, options.num_hashes)) { return false; } - - if (const auto analyzerSlice = slice.get(kAnalyzerParam); - analyzerSlice.isNone() || analyzerSlice.isNull()) { - opts->analyzer.reset(); - return true; - } else { - auto [type, props] = ParseAnalyzer(analyzerSlice); - - if (IsNull(type)) { - return false; - } - - if (props.isNone()) { - props = velocypack::Slice::emptyObjectSlice(); - } - - auto analyzer = - analyzers::get(type, irs::type::get(), - {props.startAs(), props.byteSize()}); - - if (!analyzer) { - // fallback to json format if vpack isn't available - analyzer = analyzers::get(type, irs::type::get(), - irs::slice_to_string(props)); - } - - if (analyzer) { - opts->analyzer = std::move(analyzer); - return true; - } else { - IRS_LOG_ERROR(absl::StrCat("Failed to create analyzer of type '", type, - "' with properties '", - irs::slice_to_string(props), - "' while constructing MinHashTokenStream " - "pipeline_token_stream from VPack arguments")); - } + if (!analyzers::MakeAnalyzer(slice, options.analyzer)) { + IRS_LOG_ERROR(absl::StrCat("Invalid analyzer definition in ", + slice_to_string(slice), kParseError)); + return false; } + return true; +} - return false; +std::shared_ptr ParseArgs(std::string_view args) try { + return velocypack::Parser::fromJson(args.data(), args.size()); +} catch (const std::exception& e) { + IRS_LOG_ERROR(absl::StrCat("Caught exception: ", e.what(), kParseError)); + return {}; +} catch (...) { + IRS_LOG_ERROR(absl::StrCat("Caught unknown exception", kParseError)); + return {}; } -analyzer::ptr MakeVPack(velocypack::Slice slice) { - MinHashTokenStream::Options opts; - if (ParseVPack(slice, &opts)) { +analyzer::ptr MakeImpl(velocypack::Slice slice) { + if (MinHashTokenStream::Options opts; ParseOptions(slice, opts)) { return std::make_unique(std::move(opts)); } - return nullptr; -} - -irs::analysis::analyzer::ptr MakeVPack(std::string_view args) { - VPackSlice slice(reinterpret_cast(args.data())); - return MakeVPack(slice); + return {}; } -// `args` is a JSON encoded object with the following attributes: -// "analyzer"(object) the analyzer definition containing "type"(string) and -// optional "properties"(object) -analyzer::ptr MakeJson(std::string_view args) { - try { - if (IsNull(args)) { - IRS_LOG_ERROR("Null arguments while constructing MinHashAnalyzer"); - return nullptr; - } - auto vpack = velocypack::Parser::fromJson(args.data(), args.size()); - return MakeVPack(vpack->slice()); - } catch (const VPackException& ex) { - IRS_LOG_ERROR( - absl::StrCat("Caught error '", ex.what(), - "' while constructing MinHashAnalyzer from JSON")); - } catch (...) { - IRS_LOG_ERROR("Caught error while constructing MinHashAnalyzer from JSON"); +bool NormalizeImpl(velocypack::Slice input, velocypack::Builder& output) { + if (!input.isObject()) { + return false; } - return nullptr; -} - -bool MakeVPackOptions(const MinHashTokenStream::Options& opts, - VPackSlice analyzerSlice, velocypack::Builder* out) { - velocypack::Slice props = velocypack::Slice::emptyObjectSlice(); - - if (analyzerSlice.isObject()) { - props = analyzerSlice.get(kPropertiesParam); - if (props.isNone()) { - props = velocypack::Slice::emptyObjectSlice(); - } - } else if (!analyzerSlice.isNone()) { - IRS_LOG_ERROR( - "Failed to normalize definition of MinHashAnalyzer, 'properties' field " - "must be object"); + uint32_t num_hashes = 0; + if (!ParseNumHashes(input, num_hashes)) { return false; } - - velocypack::ObjectBuilder root_scope{out}; - out->add(kNumHashes, velocypack::Value{opts.num_hashes}); - - if (props.isObject() && opts.analyzer) { - const auto type = opts.analyzer->type()().name(); - std::string normalized; - - velocypack::ObjectBuilder analyzer_scope{out, kAnalyzerParam}; - out->add(kTypeParam, velocypack::Value{type}); - - if (analyzers::normalize(normalized, type, - irs::type::get(), - {props.startAs(), props.byteSize()})) { - out->add(kPropertiesParam, - velocypack::Slice{ - reinterpret_cast(normalized.c_str())}); - - return true; - } - - // fallback to json format if vpack isn't available - if (analyzers::normalize(normalized, type, - irs::type::get(), - irs::slice_to_string(props))) { - auto vpack = velocypack::Parser::fromJson(normalized); - out->add(kPropertiesParam, vpack->slice()); - return true; - } - } else if (!opts.analyzer) { - out->add(kAnalyzerParam, velocypack::Slice::emptyObjectSlice()); - return true; + velocypack::ObjectBuilder scope{&output}; + output.add(kNumHashes, velocypack::Value{num_hashes}); + if (!analyzers::NormalizeAnalyzer(input, output)) { + IRS_LOG_ERROR(absl::StrCat("Invalid analyzer definition in ", + slice_to_string(input), kParseError)); + return false; } + return true; +} - return false; +analyzer::ptr MakeVPack(std::string_view args) { + if (args.empty()) { + IRS_LOG_ERROR(absl::StrCat("Empty arguments", kParseError)); + return {}; + } + velocypack::Slice slice{reinterpret_cast(args.data())}; + return MakeImpl(slice); } -bool NormalizeVPack(velocypack::Slice slice, velocypack::Builder* out) { - MinHashTokenStream::Options opts; - if (ParseVPack(slice, &opts)) { - return MakeVPackOptions(opts, slice.get(kAnalyzerParam), out); +analyzer::ptr MakeJson(std::string_view args) { + if (args.empty()) { + IRS_LOG_ERROR(absl::StrCat("Empty arguments", kParseError)); + return {}; } - return false; + auto builder = ParseArgs(args); + if (!builder) { + return {}; + } + return MakeImpl(builder->slice()); } bool NormalizeVPack(std::string_view args, std::string& definition) { - VPackSlice slice(reinterpret_cast(args.data())); - VPackBuilder builder; - bool res = NormalizeVPack(slice, &builder); - if (res) { - definition.assign(builder.slice().startAs(), - builder.slice().byteSize()); + if (args.empty()) { + IRS_LOG_ERROR(absl::StrCat("Empty arguments", kParseError)); + return false; } - return res; + velocypack::Slice input{reinterpret_cast(args.data())}; + velocypack::Builder output; + if (!NormalizeImpl(input, output)) { + return false; + } + definition.assign(output.slice().startAs(), output.slice().byteSize()); + return true; } bool NormalizeJson(std::string_view args, std::string& definition) { - try { - if (IsNull(args)) { - IRS_LOG_ERROR("Null arguments while normalizing MinHashAnalyzer"); - return false; - } - auto vpack = velocypack::Parser::fromJson(args.data(), args.size()); - VPackBuilder builder; - if (NormalizeVPack(vpack->slice(), &builder)) { - definition = builder.toString(); - return !definition.empty(); - } - } catch (const VPackException& ex) { - IRS_LOG_ERROR( - absl::StrCat("Caught error '", ex.what(), - "' while normalizing MinHashAnalyzer from JSON")); - } catch (...) { - IRS_LOG_ERROR( - "Caught error while normalizing MinHashAnalyzerfrom from JSON"); + if (args.empty()) { + IRS_LOG_ERROR(absl::StrCat("Empty arguments", kParseError)); + return false; } - return false; + auto input = ParseArgs(args); + if (!input) { + return {}; + } + velocypack::Builder output; + if (!NormalizeImpl(input->slice(), output)) { + return false; + } + definition = output.toString(); + return !definition.empty(); } -auto sRegisterTypes = []() { - MinHashTokenStream::init(); - return std::nullopt; -}(); - } // namespace -namespace irs::analysis { - void MinHashTokenStream::init() { REGISTER_ANALYZER_VPACK(irs::analysis::MinHashTokenStream, MakeVPack, NormalizeVPack); diff --git a/core/analysis/ngram_token_stream.cpp b/core/analysis/ngram_token_stream.cpp index e9c01788f..7d10e0b56 100644 --- a/core/analysis/ngram_token_stream.cpp +++ b/core/analysis/ngram_token_stream.cpp @@ -455,11 +455,7 @@ bool ngram_token_stream_base::reset(std::string_view value) noexcept { ? data_.size() : std::min(data_.size(), options_.max_gram); buffer_size += max_marker_size; - if (buffer_size > - marked_term_buffer_ - .capacity()) { // until c++20 this check is needed to avoid shrinking - marked_term_buffer_.reserve(buffer_size); - } + marked_term_buffer_.reserve(buffer_size); } return true; } diff --git a/core/search/ngram_similarity_filter.cpp b/core/search/ngram_similarity_filter.cpp index b3740e651..be54d5ee1 100644 --- a/core/search/ngram_similarity_filter.cpp +++ b/core/search/ngram_similarity_filter.cpp @@ -31,33 +31,27 @@ namespace irs { -filter::prepared::ptr by_ngram_similarity::prepare( - const PrepareContext& ctx) const { - const auto& ngrams = options().ngrams; +filter::prepared::ptr by_ngram_similarity::Prepare( + const PrepareContext& ctx, std::string_view field_name, + const options_type& options) { + const auto& ngrams = options.ngrams; - if (ngrams.empty() || field().empty()) { + if (ngrams.empty() || field_name.empty()) { // empty field or terms or invalid threshold return filter::prepared::empty(); } - const auto threshold = std::clamp(options().threshold, 0.f, 1.f); + const auto threshold = std::clamp(options.threshold, 0.f, 1.f); const auto min_match_count = std::clamp(static_cast(std::ceil(ngrams.size() * threshold)), size_t{1}, ngrams.size()); - const auto sub_boost = ctx.boost * boost(); if (ctx.scorers.empty() && 1 == min_match_count) { irs::by_terms disj; - for (auto& terms = disj.mutable_options()->terms; - auto& term : options().ngrams) { + for (auto& terms = disj.mutable_options()->terms; auto& term : ngrams) { terms.emplace(term, irs::kNoBoost); } - *disj.mutable_field() = this->field(); - return disj.prepare({ - .index = ctx.index, - .memory = ctx.memory, - .ctx = ctx.ctx, - .boost = sub_boost, - }); + *disj.mutable_field() = field_name; + return disj.prepare(ctx); } NGramStates query_states{ctx.memory, ctx.index.size()}; @@ -70,8 +64,6 @@ filter::prepared::ptr by_ngram_similarity::prepare( field_collectors field_stats{ctx.scorers}; term_collectors term_stats{ctx.scorers, terms_count}; - const std::string_view field_name = this->field(); - for (const auto& segment : ctx.index) { // get term dictionary for field const term_reader* field = segment.field(field_name); @@ -129,7 +121,7 @@ filter::prepared::ptr by_ngram_similarity::prepare( return memory::make_tracked( ctx.memory, min_match_count, std::move(query_states), std::move(stats), - sub_boost); + ctx.boost); } } // namespace irs diff --git a/core/search/ngram_similarity_filter.hpp b/core/search/ngram_similarity_filter.hpp index 973d788f8..6980bd860 100644 --- a/core/search/ngram_similarity_filter.hpp +++ b/core/search/ngram_similarity_filter.hpp @@ -51,7 +51,13 @@ struct by_ngram_similarity_options { class by_ngram_similarity : public filter_base { public: - filter::prepared::ptr prepare(const PrepareContext& ctx) const final; + static prepared::ptr Prepare(const PrepareContext& ctx, + std::string_view field_name, + const options_type& options); + + prepared::ptr prepare(const PrepareContext& ctx) const final { + return Prepare(ctx.Boost(boost()), field(), options()); + } }; } // namespace irs diff --git a/tests/tests_main.cpp b/tests/tests_main.cpp index f58ce6ad8..f2f91996c 100644 --- a/tests/tests_main.cpp +++ b/tests/tests_main.cpp @@ -51,6 +51,7 @@ #include +#include #include #include #include @@ -295,6 +296,8 @@ int test_env::initialize(int argc, char* argv[]) { ::testing::AddGlobalTestEnvironment(new IterationTracker()); ::testing::InitGoogleTest(&argc_, argv_); + irs::analysis::MinHashTokenStream::init(); + return RUN_ALL_TESTS(); }