Skip to content
This repository has been archived by the owner on May 3, 2024. It is now read-only.

Commit

Permalink
Fixes for utf-8 (#582)
Browse files Browse the repository at this point in the history
* Small fixes

* Adjust test for ArangoDB

* Fix

* Fix

* Fix

* Fix
  • Loading branch information
MBkkt authored Dec 16, 2023
1 parent 6aacbcc commit 5d6061d
Show file tree
Hide file tree
Showing 37 changed files with 1,552 additions and 2,103 deletions.
7 changes: 6 additions & 1 deletion .clang-tidy
Original file line number Diff line number Diff line change
Expand Up @@ -10,17 +10,22 @@ Checks: '*,
-fuchsia-default-arguments-*,
-fuchsia-overloaded-operator,
-readability-identifier-length,
-readability-suspicious-call-argument,
-readability-function-cognitive-complexity,
-cppcoreguidelines-owning-memory,
-cppcoreguidelines-avoid-do-while,
-cppcoreguidelines-avoid-c-arrays,
-cppcoreguidelines-avoid-do-while,
-cppcoreguidelines-avoid-magic-numbers,
-cppcoreguidelines-pro-type-union-access,
-cppcoreguidelines-pro-bounds-pointer-arithmetic,
-cppcoreguidelines-pro-bounds-constant-array-index,
-cppcoreguidelines-pro-bounds-array-to-pointer-decay,
-modernize-use-nodiscard,
-modernize-avoid-c-arrays,
-modernize-use-trailing-return-type,
-hicpp-avoid-c-arrays,
-hicpp-no-array-decay,
-bugprone-easily-swappable-parameters,
-llvm-header-guard,
-cert-err58-cpp,
-google-build-using-namespace'
Expand Down
58 changes: 21 additions & 37 deletions core/analysis/ngram_token_stream.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -464,27 +464,15 @@ template<irs::analysis::ngram_token_stream_base::InputType StreamType>
bool ngram_token_stream<StreamType>::next_symbol(
const byte_type*& it) const noexcept {
IRS_ASSERT(it);
if (it < data_end_) {
if constexpr (StreamType == InputType::Binary) {
++it;
} else if constexpr (StreamType == InputType::UTF8) {
const uint32_t cp_start = *it++;
if (IRS_UNLIKELY(cp_start >= 0b1000'0000)) {
if (cp_start < 0b1110'0000) {
++it;
} else if (cp_start < 0b1111'0000) {
it += 2;
} else if (cp_start < 0b1111'1000) {
it += 3;
}
if (it > data_end_) {
it = data_end_;
}
}
}
return true;
if (IRS_UNLIKELY(it == data_end_)) {
return false;
}
return false;
if constexpr (StreamType == InputType::Binary) {
++it;
} else if constexpr (StreamType == InputType::UTF8) {
it = utf8_utils::Next(it, data_end_);
}
return true;
}

template<irs::analysis::ngram_token_stream_base::InputType StreamType>
Expand Down Expand Up @@ -543,25 +531,21 @@ bool ngram_token_stream<StreamType>::next() noexcept {
}
return true;
}
} else {
} else if (EmitOriginal::None == emit_original_) {
// need to move to next position
if (EmitOriginal::None == emit_original_) {
if (next_symbol(begin_)) {
next_inc_val_ = 1;
length_ = 0;
ngram_end_ = begin_;
offset.start =
static_cast<uint32_t>(std::distance(data_.data(), begin_));
} else {
return false; // stream exhausted
}
} else {
// the stream's increment attribute is unsigned, so we cannot go back:
// we must emit the original before we leave the start position
// in the stream (as it starts from pos=0 in the stream)
emit_original();
return true;
if (IRS_UNLIKELY(!next_symbol(begin_))) {
return false; // stream exhausted
}
next_inc_val_ = 1;
length_ = 0;
ngram_end_ = begin_;
offset.start = static_cast<uint32_t>(std::distance(data_.data(), begin_));
} else {
// the stream's increment attribute is unsigned, so we cannot go back:
// we must emit the original before we leave the start position
// in the stream (as it starts from pos=0 in the stream)
emit_original();
return true;
}
}
return false;
Expand Down
59 changes: 28 additions & 31 deletions core/analysis/segmentation_token_stream.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -41,13 +41,13 @@ namespace {

using namespace irs;

constexpr std::string_view CASE_CONVERT_PARAM_NAME{"case"};
constexpr std::string_view BREAK_PARAM_NAME{"break"};
constexpr std::string_view kCaseConvertParamName{"case"};
constexpr std::string_view kBreakParamName{"break"};

constexpr frozen::unordered_map<
std::string_view,
analysis::segmentation_token_stream::options_t::case_convert_t, 3>
CASE_CONVERT_MAP = {
kCaseConvertMap = {
{"lower",
analysis::segmentation_token_stream::options_t::case_convert_t::LOWER},
{"none",
Expand All @@ -59,7 +59,7 @@ constexpr frozen::unordered_map<
constexpr frozen::unordered_map<
std::string_view,
analysis::segmentation_token_stream::options_t::word_break_t, 3>
BREAK_CONVERT_MAP = {
kBreakConvertMap = {
{"all", analysis::segmentation_token_stream::options_t::word_break_t::ALL},
{"alpha",
analysis::segmentation_token_stream::options_t::word_break_t::ALPHA},
Expand All @@ -74,43 +74,43 @@ bool parse_vpack_options(
IRS_LOG_ERROR("Slice for segmentation_token_stream is not an object");
return false;
}
if (auto case_convert_slice = slice.get(CASE_CONVERT_PARAM_NAME);
if (auto case_convert_slice = slice.get(kCaseConvertParamName);
!case_convert_slice.isNone()) {
if (!case_convert_slice.isString()) {
IRS_LOG_WARN(
absl::StrCat("Invalid type '", CASE_CONVERT_PARAM_NAME,
absl::StrCat("Invalid type '", kCaseConvertParamName,
"' (string expected) for segmentation_token_stream from "
"VPack arguments"));
return false;
}
auto case_convert = case_convert_slice.stringView();
auto itr = CASE_CONVERT_MAP.find(
auto itr = kCaseConvertMap.find(
std::string_view(case_convert.data(), case_convert.size()));

if (itr == CASE_CONVERT_MAP.end()) {
if (itr == kCaseConvertMap.end()) {
IRS_LOG_WARN(
absl::StrCat("Invalid value in '", CASE_CONVERT_PARAM_NAME,
absl::StrCat("Invalid value in '", kCaseConvertParamName,
"' for segmentation_token_stream from VPack arguments"));
return false;
}
options.case_convert = itr->second;
}
if (auto break_type_slice = slice.get(BREAK_PARAM_NAME);
if (auto break_type_slice = slice.get(kBreakParamName);
!break_type_slice.isNone()) {
if (!break_type_slice.isString()) {
IRS_LOG_WARN(
absl::StrCat("Invalid type '", BREAK_PARAM_NAME,
absl::StrCat("Invalid type '", kBreakParamName,
"' (string expected) for segmentation_token_stream from "
"VPack arguments"));
return false;
}
auto break_type = break_type_slice.stringView();
auto itr = BREAK_CONVERT_MAP.find(
auto itr = kBreakConvertMap.find(
std::string_view(break_type.data(), break_type.size()));

if (itr == BREAK_CONVERT_MAP.end()) {
if (itr == kBreakConvertMap.end()) {
IRS_LOG_WARN(
absl::StrCat("Invalid value in '", BREAK_PARAM_NAME,
absl::StrCat("Invalid value in '", kBreakParamName,
"' for segmentation_token_stream from VPack arguments"));
return false;
}
Expand All @@ -124,32 +124,32 @@ bool make_vpack_config(
VPackBuilder* builder) {
VPackObjectBuilder object(builder);
{
auto it = std::find_if(CASE_CONVERT_MAP.begin(), CASE_CONVERT_MAP.end(),
auto it = std::find_if(kCaseConvertMap.begin(), kCaseConvertMap.end(),
[v = options.case_convert](
const decltype(CASE_CONVERT_MAP)::value_type& m) {
const decltype(kCaseConvertMap)::value_type& m) {
return m.second == v;
});
if (it != CASE_CONVERT_MAP.end()) {
builder->add(CASE_CONVERT_PARAM_NAME, VPackValue(it->first));
if (it != kCaseConvertMap.end()) {
builder->add(kCaseConvertParamName, VPackValue(it->first));
} else {
IRS_LOG_WARN(absl::StrCat(
"Invalid value in '", CASE_CONVERT_PARAM_NAME,
"Invalid value in '", kCaseConvertParamName,
"' for normalizing segmentation_token_stream from Value is: ",
options.case_convert));
return false;
}
}
{
auto it = std::find_if(BREAK_CONVERT_MAP.begin(), BREAK_CONVERT_MAP.end(),
auto it = std::find_if(kBreakConvertMap.begin(), kBreakConvertMap.end(),
[v = options.word_break](
const decltype(BREAK_CONVERT_MAP)::value_type& m) {
const decltype(kBreakConvertMap)::value_type& m) {
return m.second == v;
});
if (it != BREAK_CONVERT_MAP.end()) {
builder->add(BREAK_PARAM_NAME, VPackValue(it->first));
if (it != kBreakConvertMap.end()) {
builder->add(kBreakParamName, VPackValue(it->first));
} else {
IRS_LOG_WARN(absl::StrCat(
"Invalid value in '", BREAK_PARAM_NAME,
"Invalid value in '", kBreakParamName,
"' for normalizing segmentation_token_stream from Value is: ",
options.word_break));
return false;
Expand Down Expand Up @@ -272,10 +272,9 @@ bool accept_token(Iterator begin, Iterator end, word_break_t wb) {
case word_break_t::ALL:
return true;
case word_break_t::GRAPHIC:
return std::find_if_not(begin, end, utf8_utils::char_is_white_space) !=
end;
return std::find_if_not(begin, end, utf8_utils::CharIsWhiteSpace) != end;
case word_break_t::ALPHA:
return std::find_if(begin, end, utf8_utils::char_is_alphanumeric) != end;
return std::find_if(begin, end, utf8_utils::CharIsAlphanumeric) != end;
default:
IRS_ASSERT(false);
return false;
Expand All @@ -284,8 +283,7 @@ bool accept_token(Iterator begin, Iterator end, word_break_t wb) {

} // namespace

namespace irs {
namespace analysis {
namespace irs::analysis {

using namespace boost::text;

Expand Down Expand Up @@ -378,5 +376,4 @@ bool segmentation_token_stream::reset(std::string_view data) {
return true;
}

} // namespace analysis
} // namespace irs
} // namespace irs::analysis
15 changes: 7 additions & 8 deletions core/analysis/text_token_stream.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1053,18 +1053,17 @@ bool text_token_stream::next_word() {
continue;
}

// TODO(MBkkt) simdutf::utf8_length_from_utf16
auto utf8_length = [data = &state_->data](uint32_t begin,
uint32_t end) noexcept {
uint32_t length = 0;
while (begin < end) {
const auto cp = data->char32At(begin);

// icu::UnicodeString::kInvalidUChar is private
if (IRS_UNLIKELY(0xFFFF == cp)) {
return uint32_t{0};
if (cp == utf8_utils::kInvalidChar32) {
IRS_ASSERT(length == 0);
return 0U;
}

length += utf8_utils::cp_length(cp);
length += utf8_utils::LengthFromChar32(cp);
begin += 1U + uint32_t{!U_IS_BMP(cp)};
}
return length;
Expand Down Expand Up @@ -1092,13 +1091,13 @@ bool text_token_stream::next_ngram() {
inc.value = 1;
// find the first ngram > min
do {
state_->ngram.it = utf8_utils::next(state_->ngram.it, end);
state_->ngram.it = utf8_utils::Next(state_->ngram.it, end);
} while (++state_->ngram.length < state_->options.min_gram &&
state_->ngram.it != end);
} else {
// not first ngram in a word
inc.value = 0; // staying on the current pos
state_->ngram.it = utf8_utils::next(state_->ngram.it, end);
state_->ngram.it = utf8_utils::Next(state_->ngram.it, end);
++state_->ngram.length;
}

Expand Down
15 changes: 7 additions & 8 deletions core/analysis/token_attributes.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -24,23 +24,22 @@

#include "shared.hpp"

namespace irs {
namespace {

struct empty_position final : irs::position {
void reset() final {}
bool next() final { return false; }
attribute* get_mutable(irs::type_info::type_id) noexcept final {
struct EmptyPosition final : position {
attribute* get_mutable(type_info::type_id /*type*/) noexcept final {
return nullptr;
}

bool next() final { return false; }
};

empty_position NO_POSITION;
EmptyPosition kNoPosition;

} // namespace

namespace irs {

irs::position* position::empty() noexcept { return &NO_POSITION; }
position& position::empty() noexcept { return kNoPosition; }

REGISTER_ATTRIBUTE(frequency);
REGISTER_ATTRIBUTE(position);
Expand Down
16 changes: 6 additions & 10 deletions core/analysis/token_attributes.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -110,26 +110,22 @@ class position : public attribute, public attribute_provider {
// DO NOT CHANGE NAME
static constexpr std::string_view type_name() noexcept { return "position"; }

static position* empty() noexcept;
static position& empty() noexcept;

template<typename Provider>
static position& get_mutable(Provider& attrs) {
auto* pos = irs::get_mutable<position>(&attrs);
return pos ? *pos : *empty();
}

virtual value_t seek(value_t target) {
while ((value_ < target) && next())
;
return value_;
return pos ? *pos : empty();
}

value_t value() const noexcept { return value_; }

virtual void reset() = 0;

virtual bool next() = 0;

virtual value_t seek(value_t /*target*/) { return pos_limits::invalid(); }

virtual void reset() {}

protected:
value_t value_{pos_limits::invalid()};
};
Expand Down
Loading

0 comments on commit 5d6061d

Please sign in to comment.