diff --git a/core/CMakeLists.txt b/core/CMakeLists.txt
index f0cc7901d..28cf67a46 100644
--- a/core/CMakeLists.txt
+++ b/core/CMakeLists.txt
@@ -474,6 +474,7 @@ set_target_properties(iresearch-analyzer-multi-delimiter-static
 target_link_libraries(iresearch-analyzer-multi-delimiter-static
   iresearch-static
+  iresearch-ofst
 )
 ################################################################################
diff --git a/core/analysis/multi_delimited_token_stream.cpp b/core/analysis/multi_delimited_token_stream.cpp
index 03819459f..6628e0dfa 100644
--- a/core/analysis/multi_delimited_token_stream.cpp
+++ b/core/analysis/multi_delimited_token_stream.cpp
@@ -20,10 +20,14 @@
 #include "multi_delimited_token_stream.hpp"
+#include
+#include
 #include
 #include
+#include "utils/automaton_utils.hpp"
+#include "utils/fstext/fst_draw.hpp"
 #include "utils/vpack_utils.hpp"
 #include "velocypack/Builder.h"
 #include "velocypack/Parser.h"
@@ -150,15 +154,138 @@ class multi_delimited_token_stream_generic_single_chars final
   std::array bytes_;
 };
+// TODO move to automaton_utils
+/// Builds an automaton with states 0..str.length() in which state k means
+/// "the last k consumed bytes equal the first k bytes of `str`". State 0 is
+/// the start state and state str.length() is the only final state; no arcs
+/// leave the final state.
+/// @param str byte sequence to recognise
+/// @return the automaton described above
+automaton make_string(bytes_view str) {
+  // If we read a byte c that we don't expect, we have to find the longest
+  // prefix of `str` that is a suffix of the already matched text including c,
+  // and continue from the state that stands for that prefix.
+
+  // byte value -> every index at which it occurs in `str`
+  std::unordered_multimap<bytes_view::value_type, int> positions;
+  const int length = static_cast<int>(str.length());
+
+  for (int i = 0; i < length; i++) {
+    positions.emplace(str[i], i);
+  }
+
+  automaton a;
+  a.AddStates(str.length() + 1);
+  a.SetStart(0);
+  a.SetFinal(str.length());
+
+  for (int i = 0; i < length; i++) {
+    auto expected = str[i];
+    // first byte of the pending run of bytes that lead back to state 0
+    int last_no_match = -1;
+
+    // text matched in state i plus one slot for the byte being examined
+    bstring matched;
+    matched.reserve(i + 1);
+    matched.assign(str.begin(), str.begin() + i);
+    matched.push_back(0);
+
+    for (int c = 0; c <= UCHAR_MAX; c++) {
+      if (c == expected) {
+        // add reset edges
+        if (last_no_match != -1) {
+          a.EmplaceArc(i, range_label::fromRange(last_no_match, c - 1), 0);
+          last_no_match = -1;
+        }
+        // add forward edge
+        a.EmplaceArc(i, range_label::fromRange(c), i + 1);
+
+      } else if (auto [pos, last] = positions.equal_range(c); pos != last) {
+        // add reset edges
+        if (last_no_match != -1) {
+          a.EmplaceArc(i, range_label::fromRange(last_no_match, c - 1), 0);
+          last_no_match = -1;
+        }
+
+        matched.back() = static_cast<bstring::value_type>(c);
+
+        // Each occurrence str[j] == c gives the candidate prefix str[0..j]
+        // (j + 1 bytes, i.e. state j + 1); keep the longest candidate that
+        // `matched` ends with.
+        size_t best = 0;
+        for (; pos != last; ++pos) {
+          const auto len = static_cast<size_t>(pos->second) + 1;
+          auto view = bytes_view{str.begin(), str.begin() + len};
+          if (len > best && matched.ends_with(view)) {
+            best = len;
+          }
+        }
+
+        a.EmplaceArc(i, range_label::fromRange(c), best);
+
+      } else if (last_no_match == -1) {
+        last_no_match = c;
+      }
+    }
+
+    if (last_no_match != -1) {
+      a.EmplaceArc(i, range_label::fromRange(last_no_match, UCHAR_MAX), 0);
+      last_no_match = -1;
+    }
+  }
+
+  return a;
+}
+
 class multi_delimited_token_stream_generic final
   : public multi_delimited_token_stream {
  public:
-  explicit multi_delimited_token_stream_generic(options&& opts)
-    : options_(std::move(opts)) {}
+  /// Builds one automaton per delimiter with make_string(), unions them into
+  /// an NFA, then determinizes and minimizes the result.
+  /// NOTE(review): state counts are still printed to std::cout and the
+  /// resulting automaton is only kept in local variables - presumably work in
+  /// progress, confirm before merging.
+  explicit multi_delimited_token_stream_generic(options&& opts) {
+    automaton nfa;
+    nfa.SetStart(nfa.AddState());
+    nfa.SetFinal(0, true);
+
+    for (const auto& str : opts.delimiters) {
+      irs::automaton a = make_string(str);
+      std::cout << "Automaton for " << ViewCast(bytes_view{str});
+      ///fst::drawFst(a, std::cout);
+      std::cout << "number of states = " << a.NumStates() << std::endl;
+      fst::Union(&nfa, a);
+
+      std::cout << "number of states (union) = " << nfa.NumStates()
+                << std::endl;
+    }
-  bool next() override { return false; }
+    ///fst::drawFst(nfa, std::cout);
+
+#ifdef IRESEARCH_DEBUG
+    // ensure nfa is sorted
+    static constexpr auto EXPECTED_NFA_PROPERTIES =
+      fst::kILabelSorted | fst::kOLabelSorted | fst::kAcceptor | fst::kUnweighted;
+
+    IRS_ASSERT(EXPECTED_NFA_PROPERTIES ==
+               nfa.Properties(EXPECTED_NFA_PROPERTIES, true));
+#endif
+
+    automaton dfa;
+    fst::DeterminizeStar(nfa, &dfa);
+    // report the size of the determinized automaton, not of the input NFA
+    std::cout << "number of states (dfa) = " << dfa.NumStates() << std::endl;
+
+    fst::Minimize(&dfa);
-  options options_;
+    std::cout << "number of states = " << dfa.NumStates() << std::endl;
+
+    auto matcher = make_automaton_matcher(dfa);
+    [[maybe_unused]] auto result = match(matcher, std::string_view{"foobar"});
+  }
+
+  bool next() override { return false; }
 };
 class multi_delimited_token_stream_single final
@@ -313,7 +440,6 @@ bool normalize_vpack_config(std::string_view args, std::string& definition) {
   return res;
 }
-
 REGISTER_ANALYZER_VPACK(irs::analysis::multi_delimited_token_stream, make_vpack,
                         normalize_vpack_config);
 /*
diff --git a/core/utils/fstext/fst_draw.hpp b/core/utils/fstext/fst_draw.hpp
index 4f7d40132..15d5597a0 100644
--- a/core/utils/fstext/fst_draw.hpp
+++ b/core/utils/fstext/fst_draw.hpp
@@ -18,9 +18,11 @@ namespace fst {
 template
 struct LabelToString {
-  std::string operator()(const Arc&, typename Arc::Label label,
+  std::string operator()(const Arc& arc, typename Arc::Label label,
                          std::string_view) const {
-    return std::to_string(label);
+    std::stringstream ss;
+    ss << arc;
+    return ss.str();
   }
 };
@@ -245,7 +247,7 @@ class FstDrawer {
 };
 template>
+          typename LabelToString = fst::LabelToString>
 inline void drawFst(
   const Fst& fst, std::ostream& strm, const LabelToString&
label_to_string = {},
   const std::string& dest = "", const SymbolTable* isyms = nullptr,
diff --git a/tests/analysis/multi_delimited_token_stream_tests.cpp b/tests/analysis/multi_delimited_token_stream_tests.cpp
index ec3a9c538..60e9a7d1f 100644
--- a/tests/analysis/multi_delimited_token_stream_tests.cpp
+++ b/tests/analysis/multi_delimited_token_stream_tests.cpp
@@ -55,116 +55,144 @@ TEST_F(multi_delimited_token_stream_tests, consts) {
 }
 TEST_F(multi_delimited_token_stream_tests, test_delimiter) {
-  // test delimiter std::string_view{}
-  {
-    auto stream =
-      irs::analysis::multi_delimited_token_stream::make({.delimiters = {"a"_b}});
-    ASSERT_EQ(irs::type::id(),
-              stream->type());
-
-    ASSERT_TRUE(stream->reset("baccaad"));
-
-    auto* payload = irs::get(*stream);
-    ASSERT_EQ(nullptr, payload);
-    auto* term = irs::get(*stream);
-
-    ASSERT_TRUE(stream->next());
-    ASSERT_EQ("b", irs::ViewCast(term->value));
-    ASSERT_TRUE(stream->next());
-    ASSERT_EQ("cc", irs::ViewCast(term->value));
-    ASSERT_TRUE(stream->next());
-    ASSERT_EQ("d", irs::ViewCast(term->value));
-    ASSERT_FALSE(stream->next());
-  }
+  auto stream =
+    irs::analysis::multi_delimited_token_stream::make({.delimiters = {"a"_b}});
+  ASSERT_EQ(irs::type::id(),
+            stream->type());
+
+  ASSERT_TRUE(stream->reset("baccaad"));
+
+  auto* payload = irs::get(*stream);
+  ASSERT_EQ(nullptr, payload);
+  auto* term = irs::get(*stream);
+
+  ASSERT_TRUE(stream->next());
+  ASSERT_EQ("b", irs::ViewCast(term->value));
+  ASSERT_TRUE(stream->next());
+  ASSERT_EQ("cc", irs::ViewCast(term->value));
+  ASSERT_TRUE(stream->next());
+  ASSERT_EQ("d", irs::ViewCast(term->value));
+  ASSERT_FALSE(stream->next());
 }
 TEST_F(multi_delimited_token_stream_tests, test_delimiter_empty_match) {
-  // test delimiter std::string_view{}
-  {
-    auto stream =
-      irs::analysis::multi_delimited_token_stream::make({.delimiters = {"."_b}});
-    ASSERT_EQ(irs::type::id(),
-              stream->type());
+  auto stream =
+    irs::analysis::multi_delimited_token_stream::make({.delimiters = {"."_b}});
+  ASSERT_EQ(irs::type::id(),
+            stream->type());
-    ASSERT_TRUE(stream->reset(".."));
+  ASSERT_TRUE(stream->reset(".."));
-    auto* payload = irs::get(*stream);
-    ASSERT_EQ(nullptr, payload);
+  auto* payload = irs::get(*stream);
+  ASSERT_EQ(nullptr, payload);
-    ASSERT_FALSE(stream->next());
-  }
+  ASSERT_FALSE(stream->next());
 }
 TEST_F(multi_delimited_token_stream_tests, test_delimiter_5) {
-  // test delimiter std::string_view{}
-  {
-    auto stream = irs::analysis::multi_delimited_token_stream::make(
-      {.delimiters = {";"_b, ","_b, "|"_b, "."_b, ":"_b}});
-    ASSERT_EQ(irs::type::id(),
-              stream->type());
-
-    ASSERT_TRUE(stream->reset("a:b||c.d,ff."));
-
-    auto* payload = irs::get(*stream);
-    ASSERT_EQ(nullptr, payload);
-    auto* term = irs::get(*stream);
-
-    ASSERT_TRUE(stream->next());
-    ASSERT_EQ("a", irs::ViewCast(term->value));
-    ASSERT_TRUE(stream->next());
-    ASSERT_EQ("b", irs::ViewCast(term->value));
-    ASSERT_TRUE(stream->next());
-    ASSERT_EQ("c", irs::ViewCast(term->value));
-    ASSERT_TRUE(stream->next());
-    ASSERT_EQ("d", irs::ViewCast(term->value));
-    ASSERT_TRUE(stream->next());
-    ASSERT_EQ("ff", irs::ViewCast(term->value));
-    ASSERT_FALSE(stream->next());
-  }
+  auto stream = irs::analysis::multi_delimited_token_stream::make(
+    {.delimiters = {";"_b, ","_b, "|"_b, "."_b, ":"_b}});
+  ASSERT_EQ(irs::type::id(),
+            stream->type());
+
+  ASSERT_TRUE(stream->reset("a:b||c.d,ff."));
+
+  auto* payload = irs::get(*stream);
+  ASSERT_EQ(nullptr, payload);
+  auto* term = irs::get(*stream);
+
+  ASSERT_TRUE(stream->next());
+  ASSERT_EQ("a", irs::ViewCast(term->value));
+  ASSERT_TRUE(stream->next());
+  ASSERT_EQ("b", irs::ViewCast(term->value));
+  ASSERT_TRUE(stream->next());
+  ASSERT_EQ("c", irs::ViewCast(term->value));
+  ASSERT_TRUE(stream->next());
+  ASSERT_EQ("d", irs::ViewCast(term->value));
+  ASSERT_TRUE(stream->next());
+  ASSERT_EQ("ff", irs::ViewCast(term->value));
+  ASSERT_FALSE(stream->next());
 }
 TEST_F(multi_delimited_token_stream_tests, test_delimiter_single_long) {
-  // test delimiter std::string_view{}
-  {
-    auto stream = irs::analysis::multi_delimited_token_stream::make(
-      {.delimiters = {"foo"_b}});
-    ASSERT_EQ(irs::type::id(),
-              stream->type());
-
-    ASSERT_TRUE(stream->reset("foobarfoobazbarfoobar"));
-
-    auto* payload = irs::get(*stream);
-    ASSERT_EQ(nullptr, payload);
-    auto* term = irs::get(*stream);
-
-    ASSERT_TRUE(stream->next());
-    ASSERT_EQ("bar", irs::ViewCast(term->value));
-    ASSERT_TRUE(stream->next());
-    ASSERT_EQ("bazbar", irs::ViewCast(term->value));
-    ASSERT_TRUE(stream->next());
-    ASSERT_EQ("bar", irs::ViewCast(term->value));
-    ASSERT_FALSE(stream->next());
-  }
+  auto stream = irs::analysis::multi_delimited_token_stream::make(
+    {.delimiters = {"foo"_b}});
+  ASSERT_EQ(irs::type::id(),
+            stream->type());
+
+  ASSERT_TRUE(stream->reset("foobarfoobazbarfoobar"));
+
+  auto* payload = irs::get(*stream);
+  ASSERT_EQ(nullptr, payload);
+  auto* term = irs::get(*stream);
+
+  ASSERT_TRUE(stream->next());
+  ASSERT_EQ("bar", irs::ViewCast(term->value));
+  ASSERT_TRUE(stream->next());
+  ASSERT_EQ("bazbar", irs::ViewCast(term->value));
+  ASSERT_TRUE(stream->next());
+  ASSERT_EQ("bar", irs::ViewCast(term->value));
+  ASSERT_FALSE(stream->next());
 }
 TEST_F(multi_delimited_token_stream_tests, no_delimiter) {
-  // test delimiter std::string_view{}
-  {
-    auto stream = irs::analysis::multi_delimited_token_stream::make(
-      {.delimiters = {}});
-    ASSERT_EQ(irs::type::id(),
-              stream->type());
-
-    ASSERT_TRUE(stream->reset("foobar"));
-
-    auto* payload = irs::get(*stream);
-    ASSERT_EQ(nullptr, payload);
-    auto* term = irs::get(*stream);
-
-    ASSERT_TRUE(stream->next());
-    ASSERT_EQ("foobar", irs::ViewCast(term->value));
-    ASSERT_FALSE(stream->next());
-  }
+  auto stream =
+    irs::analysis::multi_delimited_token_stream::make({.delimiters = {}});
+  ASSERT_EQ(irs::type::id(),
+            stream->type());
+
+  ASSERT_TRUE(stream->reset("foobar"));
+
+  auto* payload =
+    irs::get(*stream);
+  ASSERT_EQ(nullptr, payload);
+  auto* term = irs::get(*stream);
+
+  ASSERT_TRUE(stream->next());
+  ASSERT_EQ("foobar", irs::ViewCast(term->value));
+  ASSERT_FALSE(stream->next());
+}
+
+TEST_F(multi_delimited_token_stream_tests, multi_words) {
+  // None of the active delimiters occurs in the input, so the whole input is
+  // expected back as a single token.
+  // NOTE(review): with the commented-out delimiters the tokens would
+  // presumably be "xyz" and "z" - confirm which variant is intended.
+  auto stream = irs::analysis::multi_delimited_token_stream::make(
+    //{.delimiters = {"foo"_b, "bar"_b, "baz"_b}});
+    {.delimiters = {"fab1"_b, "goo2"_b, "puh3"_b}});
+  ASSERT_EQ(irs::type::id(),
+            stream->type());
+
+  ASSERT_TRUE(stream->reset("fooxyzbarbazz"));
+
+  auto* payload = irs::get(*stream);
+  ASSERT_EQ(nullptr, payload);
+  auto* term = irs::get(*stream);
+
+  ASSERT_TRUE(stream->next());
+  ASSERT_EQ("fooxyzbarbazz", irs::ViewCast(term->value));
+  ASSERT_FALSE(stream->next());
+}
+
+TEST_F(multi_delimited_token_stream_tests, trick_matching_1) {
+  // "ffoo" must still end in a match of "foo": after reading "ff" the second
+  // 'f' has to be kept as the start of a new candidate match.
+  auto stream = irs::analysis::multi_delimited_token_stream::make(
+    {.delimiters = {"foo"_b, "ffa"_b}});
+  ASSERT_EQ(irs::type::id(),
+            stream->type());
+
+  ASSERT_TRUE(stream->reset("abcffoobar"));
+
+  auto* payload = irs::get(*stream);
+  ASSERT_EQ(nullptr, payload);
+  auto* term = irs::get(*stream);
+
+  ASSERT_TRUE(stream->next());
+  ASSERT_EQ("abcf", irs::ViewCast(term->value));
+  ASSERT_TRUE(stream->next());
+  ASSERT_EQ("bar", irs::ViewCast(term->value));
+  ASSERT_FALSE(stream->next());
 }
 #endif