From fbb3f4982fac57299c7dedfa7c03e581e0ff69a3 Mon Sep 17 00:00:00 2001
From: Yagiz Nizipli
Date: Sun, 6 Oct 2024 21:37:19 -0400
Subject: [PATCH 1/2] add neon simd version of find_authority_delimiter

---
Review notes (commentary placed after the "---" separator; it is not part of
the commit message or of the diff):

* This patch adds an ADA_NEON variant of find_authority_delimiter that scans
  16 bytes per iteration and falls back to the existing table lookup for the
  remaining bytes; the previous table-based function is kept under #else.
* vqtbl1q_u8(lookup, chunk) uses the raw input bytes as indices into the 16
  bytes loaded from the start of authority_delimiter. The TBL instruction
  yields 0 for any index outside 0..15, so bytes such as '/' (0x2f),
  '?' (0x3f) and '@' (0x40) cannot select a table entry in this loop.
* __builtin_ctzll returns a bit position, whereas the function returns a byte
  offset. Lane k of a 64-bit half starts at bit 8*k and the high half starts
  at byte 8, so `i + ctz(low_bits)` and `i + 64 + ctz(high_bits)` do not name
  the matching byte.
* PATCH 2/2 of this series removes these loop lines and replaces them; the
  added lines below are therefore left exactly as submitted so that the
  series still applies.
* Several tokens appear without their angle-bracket part in this copy
  (#include, template, reinterpret_cast, std::array); they are reproduced as
  found.

 src/helpers.cpp | 41 +++++++++++++++++++++++++++++++++++++++++
 1 file changed, 41 insertions(+)

diff --git a/src/helpers.cpp b/src/helpers.cpp
index b84b533ec..a07264abf 100644
--- a/src/helpers.cpp
+++ b/src/helpers.cpp
@@ -6,6 +6,10 @@
 #include
 #include
 
+#if ADA_NEON
+#include
+#endif // ADA_NEON
+
 namespace ada::helpers {
 
 template
@@ -762,6 +766,42 @@ static constexpr std::array authority_delimiter = []() consteval {
   }
   return result;
 }();
+#if ADA_NEON
+ada_really_inline size_t
+find_authority_delimiter(std::string_view view) noexcept {
+  const auto* data = reinterpret_cast(view.data());
+  size_t length = view.size();
+  size_t i = 0;
+
+  // Prepare NEON registers
+  uint8x16_t lookup = vld1q_u8(authority_delimiter.data());
+
+  // SIMD processing for 16-byte chunks
+  for (; i + 16 <= length; i += 16) {
+    uint8x16_t chunk = vld1q_u8(data + i);
+    uint8x16_t result = vqtbl1q_u8(lookup, chunk);
+
+    uint64x2_t mask64 = vreinterpretq_u64_u8(result);
+    uint64_t low_bits = vgetq_lane_u64(mask64, 0);
+    uint64_t high_bits = vgetq_lane_u64(mask64, 1);
+
+    if (low_bits != 0) {
+      return i + __builtin_ctzll(low_bits);
+    } else if (high_bits != 0) {
+      return i + 64 + __builtin_ctzll(high_bits);
+    }
+  }
+
+  // Handle remaining bytes
+  for (; i < length; ++i) {
+    if (authority_delimiter[data[i]]) {
+      return i;
+    }
+  }
+
+  return length;
+}
+#else
 // credit: @the-moisrex recommended a table-based approach
 ada_really_inline size_t
 find_authority_delimiter(std::string_view view) noexcept {
@@ -774,6 +814,7 @@ find_authority_delimiter(std::string_view view) noexcept {
   }
   return size_t(view.size());
 }
+#endif
 
 } // namespace ada::helpers
 

From ffe4b5ec1ae41b1718f1f1b87429c4f722184221 Mon Sep 17 00:00:00 2001
From: Daniel Lemire
Date: Mon, 7 Oct 2024 14:33:10 -0400
Subject: [PATCH 2/2] lint

---
Review notes (commentary placed after the "---" separator; it is not part of
the commit message or of the diff):

* This patch adds an ADA_NEON variant of find_authority_delimiter_special and
  rewrites the NEON loop of find_authority_delimiter. Both compare each input
  byte x with lookup[x >> 4]: the table holds, at the slot of each delimiter's
  high nibble, the delimiter itself, so equality means "x is a delimiter".
* nibblemask: vshrn_n_u16(..., 4) narrows the 16 compare results to 4 bits
  per input byte packed in one 64-bit value, which is why
  countr_zero(nibblemask) >> 2 is the byte offset inside the chunk.
* find_authority_delimiter_special, scalar tail: the tail loop now resumes at
  `i`, the first byte the 16-byte loop did not examine, instead of restarting
  at view.begin() and rescanning bytes already cleared. It indexes the table
  through `data`, as the tail loop added in PATCH 1/2 does, which also removes
  the C-style (uint8_t) cast. This assumes authority_delimiter_special marks
  exactly the four bytes encoded in `lookup` (0x2f, 0x3f, 0x40, 0x5c); its
  definition is not visible in this patch -- TODO confirm.
* The in-loop comment of find_authority_delimiter_special listed only '@',
  '/' and '?'; its table also encodes 0x5c (backslash), so the comment now
  names all four.
* The number of added lines is unchanged, so the hunk headers and the
  diffstat still hold; the blob ids on the index line are kept from the
  original submission.
* reinterpret_cast and std::array appear without their angle-bracket part in
  this copy; they are reproduced as found.

 src/helpers.cpp | 58 ++++++++++++++++++++++++++++++++++++++-----------
 1 file changed, 45 insertions(+), 13 deletions(-)

diff --git a/src/helpers.cpp b/src/helpers.cpp
index a07264abf..2d2cd145c 100644
--- a/src/helpers.cpp
+++ b/src/helpers.cpp
@@ -745,6 +745,38 @@ static constexpr std::array authority_delimiter_special =
   }
   return result;
 }();
+#if ADA_NEON
+ada_really_inline size_t
+find_authority_delimiter_special(std::string_view view) noexcept {
+  const auto* data = reinterpret_cast(view.data());
+  size_t length = view.size();
+  size_t i = 0;
+  // lookup[n] = delimiter whose high nibble is n; filler slots hold '@' (0x40).
+  uint8x16_t lookup =
+      ada_make_uint8x16_t(0x40, 0x40, 0x2f, 0x3f, 0x40, 0x5c, 0x40, 0x40, 0x40,
+                          0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40);
+
+  // SIMD processing for 16-byte chunks
+  for (; i + 16 <= length; i += 16) {
+    // Here we can do (x == '@' || x == '/' || x == '?' || x == '\\') with one
+    // compare per delimiter. Or lookup(x>>4) == x, which is 3 instructions.
+    uint8x16_t chunk = vld1q_u8(data + i);
+    uint8x16_t result = vqtbl1q_u8(lookup, vshrq_n_u8(chunk, 4));
+    uint8x16_t match = vceqq_u8(result, chunk);
+    uint8x8_t narrow_match = vshrn_n_u16(vreinterpretq_u16_u8(match), 4);
+    uint64_t nibblemask = vget_lane_u64(vreinterpret_u64_u8(narrow_match), 0);
+    if (nibblemask != 0) {
+      return i + (std::countr_zero(nibblemask) >> 2);
+    }
+  }
+  for (; i < length; ++i) {
+    if (authority_delimiter_special[data[i]]) {
+      return i;
+    }
+  }
+  return length;
+}
+#else
 // credit: @the-moisrex recommended a table-based approach
 ada_really_inline size_t
 find_authority_delimiter_special(std::string_view view) noexcept {
@@ -757,6 +789,7 @@ find_authority_delimiter_special(std::string_view view) noexcept {
   }
   return size_t(view.size());
 }
+#endif
 
 // @ / ?
 static constexpr std::array authority_delimiter = []() consteval {
@@ -772,23 +805,22 @@ find_authority_delimiter(std::string_view view) noexcept {
   const auto* data = reinterpret_cast(view.data());
   size_t length = view.size();
   size_t i = 0;
-
-  // Prepare NEON registers
-  uint8x16_t lookup = vld1q_u8(authority_delimiter.data());
+  // Prepare NEON register
+  uint8x16_t lookup =
+      ada_make_uint8x16_t(0x40, 0x40, 0x2f, 0x3f, 0x40, 0x40, 0x40, 0x40, 0x40,
+                          0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40);
 
   // SIMD processing for 16-byte chunks
   for (; i + 16 <= length; i += 16) {
+    // Here we can do (x == '@' || x == '/' || x == '?') which is 5
+    // instructions. Or we can do lookup(x>>4) == x which is 3 instructions.
     uint8x16_t chunk = vld1q_u8(data + i);
-    uint8x16_t result = vqtbl1q_u8(lookup, chunk);
-
-    uint64x2_t mask64 = vreinterpretq_u64_u8(result);
-    uint64_t low_bits = vgetq_lane_u64(mask64, 0);
-    uint64_t high_bits = vgetq_lane_u64(mask64, 1);
-
-    if (low_bits != 0) {
-      return i + __builtin_ctzll(low_bits);
-    } else if (high_bits != 0) {
-      return i + 64 + __builtin_ctzll(high_bits);
+    uint8x16_t result = vqtbl1q_u8(lookup, vshrq_n_u8(chunk, 4));
+    uint8x16_t match = vceqq_u8(result, chunk);
+    uint8x8_t narrow_match = vshrn_n_u16(vreinterpretq_u16_u8(match), 4);
+    uint64_t nibblemask = vget_lane_u64(vreinterpret_u64_u8(narrow_match), 0);
+    if (nibblemask != 0) {
+      return i + (std::countr_zero(nibblemask) >> 2);
     }
   }
 