Skip to content
This repository has been archived by the owner on May 3, 2024. It is now read-only.

Commit

Permalink
Still not working.
Browse files Browse the repository at this point in the history
  • Loading branch information
maierlars committed Dec 7, 2023
1 parent 570d9b3 commit 8d53cd1
Show file tree
Hide file tree
Showing 2 changed files with 63 additions and 81 deletions.
141 changes: 62 additions & 79 deletions core/analysis/multi_delimited_token_stream.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -155,102 +155,90 @@ class multi_delimited_token_stream_generic_single_chars final
};

// TODO move to automaton_utils
void make_string(automaton& a, bytes_view str) {
automaton make_strings(const std::vector<bstring>& strings) {
// If we read a character c that is not the expected one, we have to find
// the longest prefix of `str` that is also a suffix of the already matched
// text followed by c, and then transition to the state for that prefix.
automaton a;
a.SetStart(a.AddState());
a.SetFinal(a.AddState(), true);

std::unordered_multimap<byte_type, int> positions;
for (const auto& str : strings) {
std::unordered_multimap<byte_type, int> positions;

for (int i = 0; i < str.length(); i++) {
positions.emplace(str[i], i);
}
for (int i = 0; i < str.length(); i++) {
positions.emplace(str[i], i);
}

auto first_state = a.NumStates();
a.AddStates(str.length());
auto first_state = a.NumStates();
a.AddStates(str.length());

for (int i = 0; i < str.length(); i++) {
auto expected = str[i];
int last_no_match = -1;
for (int i = 0; i < str.length(); i++) {
auto expected = str[i];
int last_no_match = -1;

auto current_state = int32_t{i + first_state};
auto current_state = int32_t{i + first_state};

for (int c = 1; c <= UCHAR_MAX; c++) {
if (c == expected) {
// add reset edges
if (last_no_match != -1) {
a.EmplaceArc(current_state,
range_label::fromRange(last_no_match, c - 1), 0);
last_no_match = -1;
}
// add forward edge
a.EmplaceArc(current_state, range_label::fromRange(c),
i == str.length() - 1 ? 1 : (current_state + 1));

} else if (auto iter = positions.find(c); iter != positions.end()) {
// add reset edges
if (last_no_match != -1) {
a.EmplaceArc(current_state,
range_label::fromRange(last_no_match, c - 1), 0);
last_no_match = -1;
}
for (int c = 1; c <= UCHAR_MAX; c++) {
if (c == expected) {
// add reset edges
if (last_no_match != -1) {
a.EmplaceArc(current_state,
range_label::fromRange(last_no_match, c - 1), 0);
last_no_match = -1;
}
// add forward edge
a.EmplaceArc(current_state, range_label::fromRange(c),
i == str.length() - 1 ? 1 : (current_state + 1));

} else if (auto iter = positions.find(c); iter != positions.end()) {
// add reset edges
if (last_no_match != -1) {
a.EmplaceArc(current_state,
range_label::fromRange(last_no_match, c - 1), 0);
last_no_match = -1;
}

// find the biggest prefix of `str`
// TODO pull this out of the loop
bstring matched;
matched.reserve(i + 1);
matched.assign(str.begin(), str.begin() + i);
matched.push_back(c);

size_t best = 0;
while (iter != positions.end() && iter->first == c) {
auto view = bytes_view{str.begin(), str.begin() + iter->second};
if (matched.ends_with(view) && iter->second > best) {
best = iter->second;
// find the biggest prefix of `str`
// TODO pull this out of the loop
bstring matched;
matched.reserve(i + 1);
matched.assign(str.begin(), str.begin() + i);
matched.push_back(c);

size_t best = 0;
while (iter != positions.end() && iter->first == c) {
auto view = bytes_view{str.begin(), str.begin() + iter->second};
if (matched.ends_with(view) && iter->second > best) {
best = iter->second;
}
++iter;
}
++iter;
}

a.EmplaceArc(current_state, range_label::fromRange(c),
first_state + best);
a.EmplaceArc(current_state, range_label::fromRange(c),
first_state + best);

} else if (last_no_match == -1) {
last_no_match = c;
} else if (last_no_match == -1) {
last_no_match = c;
}
}
}

if (last_no_match != -1) {
a.EmplaceArc(current_state,
range_label::fromRange(last_no_match, UCHAR_MAX), 0);
last_no_match = -1;
if (last_no_match != -1) {
a.EmplaceArc(current_state,
range_label::fromRange(last_no_match, UCHAR_MAX), 0);
last_no_match = -1;
}
}
a.EmplaceArc(0, range_label::fromRange(0), first_state);
}

// a.EmplaceArc(first_state + str.length(), range_label::fromRange(0), 1);

a.EmplaceArc(0, range_label::fromRange(0), first_state);
return a;
}

class multi_delimited_token_stream_generic final
: public multi_delimited_token_stream {
public:
explicit multi_delimited_token_stream_generic(options&& opts) {
automaton nfa;
nfa.SetStart(nfa.AddState());
nfa.SetFinal(nfa.AddState(), true);

std::vector<irs::automaton> parts;
parts.reserve(opts.delimiters.size());

for (const auto& str : opts.delimiters) {
make_string(nfa, str);
std::cout << "Automaton for " << ViewCast<char>(bytes_view{str})
<< std::endl;

std::cout << "number of states (union) = " << nfa.NumStates()
<< std::endl;
}

automaton nfa = make_strings(opts.delimiters);
fst::drawFst(nfa, std::cout);

#ifdef IRESEARCH_DEBUG
Expand All @@ -265,14 +253,9 @@ class multi_delimited_token_stream_generic final

automaton dfa;
fst::DeterminizeStar(nfa, &dfa);
std::cout << "number of states (dfa) = " << dfa.NumStates() << std::endl;

// fst::Minimize(&dfa);
std::cout << "HUI\n";
fst::drawFst(dfa, std::cout);
std::cout << "HUI\n";

std::cout << "number of states = " << dfa.NumStates() << std::endl;
//fst::Minimize(&dfa);

auto matcher = make_automaton_matcher(dfa);
auto result = match(matcher, std::string_view{"foobar"});
Expand Down
3 changes: 1 addition & 2 deletions tests/analysis/multi_delimited_token_stream_tests.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -154,8 +154,7 @@ TEST_F(multi_delimited_token_stream_tests, no_delimiter) {

TEST_F(multi_delimited_token_stream_tests, multi_words) {
auto stream = irs::analysis::multi_delimited_token_stream::make(
{.delimiters = {"foo"_b, "bar"_b, "bas"_b}});
//{.delimiters = {"f"_b, "g"_b, "h"_b, "j"_b}});
{.delimiters = {"foo"_b, "bar"_b, "baz"_b}});
ASSERT_EQ(irs::type<irs::analysis::multi_delimited_token_stream>::id(),
stream->type());

Expand Down

0 comments on commit 8d53cd1

Please sign in to comment.