Skip to content

Commit

Permalink
optimize has_tabs_or_newline for NEON (#639)
Browse files Browse the repository at this point in the history
* optimize has_tabs_or_newline for NEON

* Update unicode.cpp

* adding description

* fix: replace vmaxvq_u8 by vmaxvq_u32 (for performance)

* fix: rnt_array was wrong

* fix: linting
  • Loading branch information
lemire authored May 1, 2024
1 parent 02c389e commit 08aad0b
Show file tree
Hide file tree
Showing 2 changed files with 25 additions and 14 deletions.
8 changes: 4 additions & 4 deletions src/helpers.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -243,7 +243,7 @@ ada_really_inline size_t find_next_host_delimiter_special(
uint8x16_t lowpart = vqtbl1q_u8(low_mask, vandq_u8(word, fmask));
uint8x16_t highpart = vqtbl1q_u8(high_mask, vshrq_n_u8(word, 4));
uint8x16_t classify = vandq_u8(lowpart, highpart);
if (vmaxvq_u8(classify) != 0) {
if (vmaxvq_u32(vreinterpretq_u32_u8(classify)) != 0) {
uint8x16_t is_zero = vceqq_u8(classify, zero);
uint16_t is_non_zero = ~to_bitmask(is_zero);
return i + trailing_zeroes(is_non_zero);
Expand All @@ -256,7 +256,7 @@ ada_really_inline size_t find_next_host_delimiter_special(
uint8x16_t lowpart = vqtbl1q_u8(low_mask, vandq_u8(word, fmask));
uint8x16_t highpart = vqtbl1q_u8(high_mask, vshrq_n_u8(word, 4));
uint8x16_t classify = vandq_u8(lowpart, highpart);
if (vmaxvq_u8(classify) != 0) {
if (vmaxvq_u32(vreinterpretq_u32_u8(classify)) != 0) {
uint8x16_t is_zero = vceqq_u8(classify, zero);
uint16_t is_non_zero = ~to_bitmask(is_zero);
return view.length() - 16 + trailing_zeroes(is_non_zero);
Expand Down Expand Up @@ -381,7 +381,7 @@ ada_really_inline size_t find_next_host_delimiter(std::string_view view,
uint8x16_t lowpart = vqtbl1q_u8(low_mask, vandq_u8(word, fmask));
uint8x16_t highpart = vqtbl1q_u8(high_mask, vshrq_n_u8(word, 4));
uint8x16_t classify = vandq_u8(lowpart, highpart);
if (vmaxvq_u8(classify) != 0) {
if (vmaxvq_u32(vreinterpretq_u32_u8(classify)) != 0) {
uint8x16_t is_zero = vceqq_u8(classify, zero);
uint16_t is_non_zero = ~to_bitmask(is_zero);
return i + trailing_zeroes(is_non_zero);
Expand All @@ -394,7 +394,7 @@ ada_really_inline size_t find_next_host_delimiter(std::string_view view,
uint8x16_t lowpart = vqtbl1q_u8(low_mask, vandq_u8(word, fmask));
uint8x16_t highpart = vqtbl1q_u8(high_mask, vshrq_n_u8(word, 4));
uint8x16_t classify = vandq_u8(lowpart, highpart);
if (vmaxvq_u8(classify) != 0) {
if (vmaxvq_u32(vreinterpretq_u32_u8(classify)) != 0) {
uint8x16_t is_zero = vceqq_u8(classify, zero);
uint16_t is_non_zero = ~to_bitmask(is_zero);
return view.length() - 16 + trailing_zeroes(is_non_zero);
Expand Down
31 changes: 21 additions & 10 deletions src/unicode.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -60,24 +60,34 @@ ada_really_inline bool has_tabs_or_newline(
}
// fast path for long strings (expected to be common)
size_t i = 0;
const uint8x16_t mask1 = vmovq_n_u8('\r');
const uint8x16_t mask2 = vmovq_n_u8('\n');
const uint8x16_t mask3 = vmovq_n_u8('\t');
/**
* The fastest way to check for `\t` (== 9), `\n` (== 10) and `\r` (== 13)
* relies on a table lookup instruction. We notice that these are all unique
* numbers between 0..15. Let's prepare a special register, where we put `\t`
* in the 9th position, `\n` in the 10th and `\r` in the 13th. Then we shuffle
* this register by the input register. If the input had `\t` in position X,
* then the shuffled register will also have `\t` in that position. Comparing
* the input with this shuffled register marks all the interesting characters
* in the input.
*
* Position 0 of the table holds 1 rather than 0 so that a NUL byte in the
* input (which looks up position 0) does not compare equal to itself. Input
* bytes of 16 or more index outside the table, for which the lookup yields 0,
* so they never compare equal to themselves either.
*
* credit for algorithmic idea: @aqrit, credit for description:
* @DenisYaroshevskiy
*/
static uint8_t rnt_array[16] = {1, 0, 0, 0, 0, 0, 0, 0,
0, 9, 10, 0, 0, 13, 0, 0};
const uint8x16_t rnt = vld1q_u8(rnt_array);
// m['0xd', '0xa', '0x9']
uint8x16_t running{0};
for (; i + 15 < user_input.size(); i += 16) {
uint8x16_t word = vld1q_u8((const uint8_t*)user_input.data() + i);
running = vorrq_u8(vorrq_u8(running, vorrq_u8(vceqq_u8(word, mask1),
vceqq_u8(word, mask2))),
vceqq_u8(word, mask3));

running = vorrq_u8(running, vceqq_u8(vqtbl1q_u8(rnt, word), word));
}
if (i < user_input.size()) {
uint8x16_t word =
vld1q_u8((const uint8_t*)user_input.data() + user_input.length() - 16);
running = vorrq_u8(vorrq_u8(running, vorrq_u8(vceqq_u8(word, mask1),
vceqq_u8(word, mask2))),
vceqq_u8(word, mask3));
running = vorrq_u8(running, vceqq_u8(vqtbl1q_u8(rnt, word), word));
}
return vmaxvq_u8(running) != 0;
return vmaxvq_u32(vreinterpretq_u32_u8(running)) != 0;
}
#elif ADA_SSE2
ada_really_inline bool has_tabs_or_newline(
Expand All @@ -97,6 +107,7 @@ ada_really_inline bool has_tabs_or_newline(
const __m128i mask1 = _mm_set1_epi8('\r');
const __m128i mask2 = _mm_set1_epi8('\n');
const __m128i mask3 = _mm_set1_epi8('\t');
// If we supported SSSE3, we could use the algorithm that we use for NEON.
__m128i running{0};
for (; i + 15 < user_input.size(); i += 16) {
__m128i word = _mm_loadu_si128((const __m128i*)(user_input.data() + i));
Expand Down

0 comments on commit 08aad0b

Please sign in to comment.