Fixes for utf-8 (#582)

* Small fixes * Adjust test for ArangoDB * Fix * Fix * Fix * Fix
iresearch-toolkit · Dec 16, 2023 · 5d6061d · 5d6061d
1 parent 6aacbcc
commit 5d6061d
Show file tree

Hide file tree

Showing 37 changed files with 1,552 additions and 2,103 deletions.
diff --git a/.clang-tidy b/.clang-tidy
@@ -10,17 +10,22 @@ Checks: '*,
          -fuchsia-default-arguments-*,
          -fuchsia-overloaded-operator,
          -readability-identifier-length,
+         -readability-suspicious-call-argument,
+         -readability-function-cognitive-complexity,
          -cppcoreguidelines-owning-memory,
-         -cppcoreguidelines-avoid-do-while,
          -cppcoreguidelines-avoid-c-arrays,
+         -cppcoreguidelines-avoid-do-while,
+         -cppcoreguidelines-avoid-magic-numbers,
          -cppcoreguidelines-pro-type-union-access,
+         -cppcoreguidelines-pro-bounds-pointer-arithmetic,
          -cppcoreguidelines-pro-bounds-constant-array-index,
          -cppcoreguidelines-pro-bounds-array-to-pointer-decay,
          -modernize-use-nodiscard,
          -modernize-avoid-c-arrays,
          -modernize-use-trailing-return-type,
          -hicpp-avoid-c-arrays,
          -hicpp-no-array-decay,
+         -bugprone-easily-swappable-parameters,
          -llvm-header-guard,
          -cert-err58-cpp,
          -google-build-using-namespace'

diff --git a/core/analysis/ngram_token_stream.cpp b/core/analysis/ngram_token_stream.cpp
@@ -464,27 +464,15 @@ template<irs::analysis::ngram_token_stream_base::InputType StreamType>
 bool ngram_token_stream<StreamType>::next_symbol(
   const byte_type*& it) const noexcept {
   IRS_ASSERT(it);
-  if (it < data_end_) {
-    if constexpr (StreamType == InputType::Binary) {
-      ++it;
-    } else if constexpr (StreamType == InputType::UTF8) {
-      const uint32_t cp_start = *it++;
-      if (IRS_UNLIKELY(cp_start >= 0b1000'0000)) {
-        if (cp_start < 0b1110'0000) {
-          ++it;
-        } else if (cp_start < 0b1111'0000) {
-          it += 2;
-        } else if (cp_start < 0b1111'1000) {
-          it += 3;
-        }
-        if (it > data_end_) {
-          it = data_end_;
-        }
-      }
-    }
-    return true;
+  if (IRS_UNLIKELY(it == data_end_)) {
+    return false;
   }
-  return false;
+  if constexpr (StreamType == InputType::Binary) {
+    ++it;
+  } else if constexpr (StreamType == InputType::UTF8) {
+    it = utf8_utils::Next(it, data_end_);
+  }
+  return true;
 }
 
 template<irs::analysis::ngram_token_stream_base::InputType StreamType>
@@ -543,25 +531,21 @@ bool ngram_token_stream<StreamType>::next() noexcept {
         }
         return true;
       }
-    } else {
+    } else if (EmitOriginal::None == emit_original_) {
       // need to move to next position
-      if (EmitOriginal::None == emit_original_) {
-        if (next_symbol(begin_)) {
-          next_inc_val_ = 1;
-          length_ = 0;
-          ngram_end_ = begin_;
-          offset.start =
-            static_cast<uint32_t>(std::distance(data_.data(), begin_));
-        } else {
-          return false;  // stream exhausted
-        }
-      } else {
-        // as stream has unsigned incremet attribute
-        // we cannot go back, so we must emit original before we leave start pos
-        // in stream (as it starts from pos=0 in stream)
-        emit_original();
-        return true;
+      if (IRS_UNLIKELY(!next_symbol(begin_))) {
+        return false;  // stream exhausted
       }
+      next_inc_val_ = 1;
+      length_ = 0;
+      ngram_end_ = begin_;
+      offset.start = static_cast<uint32_t>(std::distance(data_.data(), begin_));
+    } else {
+      // as stream has unsigned increment attribute
+      // we cannot go back, so we must emit original before we leave start pos
+      // in stream (as it starts from pos=0 in stream)
+      emit_original();
+      return true;
     }
   }
   return false;

diff --git a/core/analysis/segmentation_token_stream.cpp b/core/analysis/segmentation_token_stream.cpp
@@ -41,13 +41,13 @@ namespace {
 
 using namespace irs;
 
-constexpr std::string_view CASE_CONVERT_PARAM_NAME{"case"};
-constexpr std::string_view BREAK_PARAM_NAME{"break"};
+constexpr std::string_view kCaseConvertParamName{"case"};
+constexpr std::string_view kBreakParamName{"break"};
 
 constexpr frozen::unordered_map<
   std::string_view,
   analysis::segmentation_token_stream::options_t::case_convert_t, 3>
-  CASE_CONVERT_MAP = {
+  kCaseConvertMap = {
     {"lower",
      analysis::segmentation_token_stream::options_t::case_convert_t::LOWER},
     {"none",
@@ -59,7 +59,7 @@ constexpr frozen::unordered_map<
 constexpr frozen::unordered_map<
   std::string_view,
   analysis::segmentation_token_stream::options_t::word_break_t, 3>
-  BREAK_CONVERT_MAP = {
+  kBreakConvertMap = {
     {"all", analysis::segmentation_token_stream::options_t::word_break_t::ALL},
     {"alpha",
      analysis::segmentation_token_stream::options_t::word_break_t::ALPHA},
@@ -74,43 +74,43 @@ bool parse_vpack_options(
     IRS_LOG_ERROR("Slice for segmentation_token_stream is not an object");
     return false;
   }
-  if (auto case_convert_slice = slice.get(CASE_CONVERT_PARAM_NAME);
+  if (auto case_convert_slice = slice.get(kCaseConvertParamName);
       !case_convert_slice.isNone()) {
     if (!case_convert_slice.isString()) {
       IRS_LOG_WARN(
-        absl::StrCat("Invalid type '", CASE_CONVERT_PARAM_NAME,
+        absl::StrCat("Invalid type '", kCaseConvertParamName,
                      "' (string expected) for segmentation_token_stream from "
                      "VPack arguments"));
       return false;
     }
     auto case_convert = case_convert_slice.stringView();
-    auto itr = CASE_CONVERT_MAP.find(
+    auto itr = kCaseConvertMap.find(
       std::string_view(case_convert.data(), case_convert.size()));
 
-    if (itr == CASE_CONVERT_MAP.end()) {
+    if (itr == kCaseConvertMap.end()) {
       IRS_LOG_WARN(
-        absl::StrCat("Invalid value in '", CASE_CONVERT_PARAM_NAME,
+        absl::StrCat("Invalid value in '", kCaseConvertParamName,
                      "' for segmentation_token_stream from VPack arguments"));
       return false;
     }
     options.case_convert = itr->second;
   }
-  if (auto break_type_slice = slice.get(BREAK_PARAM_NAME);
+  if (auto break_type_slice = slice.get(kBreakParamName);
       !break_type_slice.isNone()) {
     if (!break_type_slice.isString()) {
       IRS_LOG_WARN(
-        absl::StrCat("Invalid type '", BREAK_PARAM_NAME,
+        absl::StrCat("Invalid type '", kBreakParamName,
                      "' (string expected) for segmentation_token_stream from "
                      "VPack arguments"));
       return false;
     }
     auto break_type = break_type_slice.stringView();
-    auto itr = BREAK_CONVERT_MAP.find(
+    auto itr = kBreakConvertMap.find(
       std::string_view(break_type.data(), break_type.size()));
 
-    if (itr == BREAK_CONVERT_MAP.end()) {
+    if (itr == kBreakConvertMap.end()) {
       IRS_LOG_WARN(
-        absl::StrCat("Invalid value in '", BREAK_PARAM_NAME,
+        absl::StrCat("Invalid value in '", kBreakParamName,
                      "' for segmentation_token_stream from VPack arguments"));
       return false;
     }
@@ -124,32 +124,32 @@ bool make_vpack_config(
   VPackBuilder* builder) {
   VPackObjectBuilder object(builder);
   {
-    auto it = std::find_if(CASE_CONVERT_MAP.begin(), CASE_CONVERT_MAP.end(),
+    auto it = std::find_if(kCaseConvertMap.begin(), kCaseConvertMap.end(),
                            [v = options.case_convert](
-                             const decltype(CASE_CONVERT_MAP)::value_type& m) {
+                             const decltype(kCaseConvertMap)::value_type& m) {
                              return m.second == v;
                            });
-    if (it != CASE_CONVERT_MAP.end()) {
-      builder->add(CASE_CONVERT_PARAM_NAME, VPackValue(it->first));
+    if (it != kCaseConvertMap.end()) {
+      builder->add(kCaseConvertParamName, VPackValue(it->first));
     } else {
       IRS_LOG_WARN(absl::StrCat(
-        "Invalid value in '", CASE_CONVERT_PARAM_NAME,
+        "Invalid value in '", kCaseConvertParamName,
         "' for normalizing segmentation_token_stream from Value is: ",
         options.case_convert));
       return false;
     }
   }
   {
-    auto it = std::find_if(BREAK_CONVERT_MAP.begin(), BREAK_CONVERT_MAP.end(),
+    auto it = std::find_if(kBreakConvertMap.begin(), kBreakConvertMap.end(),
                            [v = options.word_break](
-                             const decltype(BREAK_CONVERT_MAP)::value_type& m) {
+                             const decltype(kBreakConvertMap)::value_type& m) {
                              return m.second == v;
                            });
-    if (it != BREAK_CONVERT_MAP.end()) {
-      builder->add(BREAK_PARAM_NAME, VPackValue(it->first));
+    if (it != kBreakConvertMap.end()) {
+      builder->add(kBreakParamName, VPackValue(it->first));
     } else {
       IRS_LOG_WARN(absl::StrCat(
-        "Invalid value in '", BREAK_PARAM_NAME,
+        "Invalid value in '", kBreakParamName,
         "' for normalizing segmentation_token_stream from Value is: ",
         options.word_break));
       return false;
@@ -272,10 +272,9 @@ bool accept_token(Iterator begin, Iterator end, word_break_t wb) {
     case word_break_t::ALL:
       return true;
     case word_break_t::GRAPHIC:
-      return std::find_if_not(begin, end, utf8_utils::char_is_white_space) !=
-             end;
+      return std::find_if_not(begin, end, utf8_utils::CharIsWhiteSpace) != end;
     case word_break_t::ALPHA:
-      return std::find_if(begin, end, utf8_utils::char_is_alphanumeric) != end;
+      return std::find_if(begin, end, utf8_utils::CharIsAlphanumeric) != end;
     default:
       IRS_ASSERT(false);
       return false;
@@ -284,8 +283,7 @@ bool accept_token(Iterator begin, Iterator end, word_break_t wb) {
 
 }  // namespace
 
-namespace irs {
-namespace analysis {
+namespace irs::analysis {
 
 using namespace boost::text;
 
@@ -378,5 +376,4 @@ bool segmentation_token_stream::reset(std::string_view data) {
   return true;
 }
 
-}  // namespace analysis
-}  // namespace irs
+}  // namespace irs::analysis
diff --git a/core/analysis/text_token_stream.cpp b/core/analysis/text_token_stream.cpp
@@ -1053,18 +1053,17 @@ bool text_token_stream::next_word() {
       continue;
     }
 
+    // TODO(MBkkt) simdutf::utf8_length_from_utf16
     auto utf8_length = [data = &state_->data](uint32_t begin,
                                               uint32_t end) noexcept {
       uint32_t length = 0;
       while (begin < end) {
         const auto cp = data->char32At(begin);
-
-        // icu::UnicodeString::kInvalidUChar is private
-        if (IRS_UNLIKELY(0xFFFF == cp)) {
-          return uint32_t{0};
+        if (cp == utf8_utils::kInvalidChar32) {
+          IRS_ASSERT(length == 0);
+          return 0U;
         }
-
-        length += utf8_utils::cp_length(cp);
+        length += utf8_utils::LengthFromChar32(cp);
         begin += 1U + uint32_t{!U_IS_BMP(cp)};
       }
       return length;
@@ -1092,13 +1091,13 @@ bool text_token_stream::next_ngram() {
     inc.value = 1;
     // find the first ngram > min
     do {
-      state_->ngram.it = utf8_utils::next(state_->ngram.it, end);
+      state_->ngram.it = utf8_utils::Next(state_->ngram.it, end);
     } while (++state_->ngram.length < state_->options.min_gram &&
              state_->ngram.it != end);
   } else {
     // not first ngram in a word
     inc.value = 0;  // staying on the current pos
-    state_->ngram.it = utf8_utils::next(state_->ngram.it, end);
+    state_->ngram.it = utf8_utils::Next(state_->ngram.it, end);
     ++state_->ngram.length;
   }
 

diff --git a/core/analysis/token_attributes.cpp b/core/analysis/token_attributes.cpp
@@ -24,23 +24,22 @@
 
 #include "shared.hpp"
 
+namespace irs {
 namespace {
 
-struct empty_position final : irs::position {
-  void reset() final {}
-  bool next() final { return false; }
-  attribute* get_mutable(irs::type_info::type_id) noexcept final {
+struct EmptyPosition final : position {
+  attribute* get_mutable(type_info::type_id /*type*/) noexcept final {
     return nullptr;
   }
+
+  bool next() final { return false; }
 };
 
-empty_position NO_POSITION;
+EmptyPosition kNoPosition;
 
 }  // namespace
 
-namespace irs {
-
-irs::position* position::empty() noexcept { return &NO_POSITION; }
+position& position::empty() noexcept { return kNoPosition; }
 
 REGISTER_ATTRIBUTE(frequency);
 REGISTER_ATTRIBUTE(position);

diff --git a/core/analysis/token_attributes.hpp b/core/analysis/token_attributes.hpp
@@ -110,26 +110,22 @@ class position : public attribute, public attribute_provider {
   // DO NOT CHANGE NAME
   static constexpr std::string_view type_name() noexcept { return "position"; }
 
-  static position* empty() noexcept;
+  static position& empty() noexcept;
 
   template<typename Provider>
   static position& get_mutable(Provider& attrs) {
     auto* pos = irs::get_mutable<position>(&attrs);
-    return pos ? *pos : *empty();
-  }
-
-  virtual value_t seek(value_t target) {
-    while ((value_ < target) && next())
-      ;
-    return value_;
+    return pos ? *pos : empty();
   }
 
   value_t value() const noexcept { return value_; }
 
-  virtual void reset() = 0;
-
   virtual bool next() = 0;
 
+  virtual value_t seek(value_t /*target*/) { return pos_limits::invalid(); }
+
+  virtual void reset() {}
+
  protected:
   value_t value_{pos_limits::invalid()};
 };