From 08aad0b7c86784519d3c1826587eda2cfe23dbda Mon Sep 17 00:00:00 2001 From: Daniel Lemire Date: Wed, 1 May 2024 14:53:09 -0400 Subject: [PATCH] optimize has_tabs_or_newline for NEON (#639) * optimize has_tabs_or_newline for NEON * Update unicode.cpp * adding description * fix: replace vmaxvq_u8 by vmaxvq_u32 (for performance) * fix: rnt_array was wrong * fix: linting --- src/helpers.cpp | 8 ++++---- src/unicode.cpp | 31 +++++++++++++++++++++---------- 2 files changed, 25 insertions(+), 14 deletions(-) diff --git a/src/helpers.cpp b/src/helpers.cpp index 6c91e31bf..e1491bd08 100644 --- a/src/helpers.cpp +++ b/src/helpers.cpp @@ -243,7 +243,7 @@ ada_really_inline size_t find_next_host_delimiter_special( uint8x16_t lowpart = vqtbl1q_u8(low_mask, vandq_u8(word, fmask)); uint8x16_t highpart = vqtbl1q_u8(high_mask, vshrq_n_u8(word, 4)); uint8x16_t classify = vandq_u8(lowpart, highpart); - if (vmaxvq_u8(classify) != 0) { + if (vmaxvq_u32(vreinterpretq_u32_u8(classify)) != 0) { uint8x16_t is_zero = vceqq_u8(classify, zero); uint16_t is_non_zero = ~to_bitmask(is_zero); return i + trailing_zeroes(is_non_zero); @@ -256,7 +256,7 @@ ada_really_inline size_t find_next_host_delimiter_special( uint8x16_t lowpart = vqtbl1q_u8(low_mask, vandq_u8(word, fmask)); uint8x16_t highpart = vqtbl1q_u8(high_mask, vshrq_n_u8(word, 4)); uint8x16_t classify = vandq_u8(lowpart, highpart); - if (vmaxvq_u8(classify) != 0) { + if (vmaxvq_u32(vreinterpretq_u32_u8(classify)) != 0) { uint8x16_t is_zero = vceqq_u8(classify, zero); uint16_t is_non_zero = ~to_bitmask(is_zero); return view.length() - 16 + trailing_zeroes(is_non_zero); @@ -381,7 +381,7 @@ ada_really_inline size_t find_next_host_delimiter(std::string_view view, uint8x16_t lowpart = vqtbl1q_u8(low_mask, vandq_u8(word, fmask)); uint8x16_t highpart = vqtbl1q_u8(high_mask, vshrq_n_u8(word, 4)); uint8x16_t classify = vandq_u8(lowpart, highpart); - if (vmaxvq_u8(classify) != 0) { + if (vmaxvq_u32(vreinterpretq_u32_u8(classify)) != 0) { uint8x16_t is_zero = vceqq_u8(classify, zero); uint16_t is_non_zero = ~to_bitmask(is_zero); return i + trailing_zeroes(is_non_zero); @@ -394,7 +394,7 @@ ada_really_inline size_t find_next_host_delimiter(std::string_view view, uint8x16_t lowpart = vqtbl1q_u8(low_mask, vandq_u8(word, fmask)); uint8x16_t highpart = vqtbl1q_u8(high_mask, vshrq_n_u8(word, 4)); uint8x16_t classify = vandq_u8(lowpart, highpart); - if (vmaxvq_u8(classify) != 0) { + if (vmaxvq_u32(vreinterpretq_u32_u8(classify)) != 0) { uint8x16_t is_zero = vceqq_u8(classify, zero); uint16_t is_non_zero = ~to_bitmask(is_zero); return view.length() - 16 + trailing_zeroes(is_non_zero); diff --git a/src/unicode.cpp b/src/unicode.cpp index 554195871..3f3a84c21 100644 --- a/src/unicode.cpp +++ b/src/unicode.cpp @@ -60,24 +60,34 @@ ada_really_inline bool has_tabs_or_newline( } // fast path for long strings (expected to be common) size_t i = 0; - const uint8x16_t mask1 = vmovq_n_u8('\r'); - const uint8x16_t mask2 = vmovq_n_u8('\n'); - const uint8x16_t mask3 = vmovq_n_u8('\t'); + /** + * The fastest way to check for `\t` (==9), '\n'(== 10) and `\r` (==13) relies + * on table lookup instruction. We notice that these are all unique numbers + * between 0..15. Let's prepare a special register, where we put '\t' in the + * 9th position, '\n' - 10th and '\r' - 13th. Then we shuffle this register by + * input register. If the input had `\t` in position X then this shuffled + * register will also have '\t' in that position. Comparing input with this + * shuffled register will mark us all interesting characters in the input. + * + * credit for algorithmic idea: @aqrit, credit for description: + * @DenisYaroshevskiy + */ + static uint8_t rnt_array[16] = {1, 0, 0, 0, 0, 0, 0, 0, + 0, 9, 10, 0, 0, 13, 0, 0}; + const uint8x16_t rnt = vld1q_u8(rnt_array); + // m['0xd', '0xa', '0x9'] uint8x16_t running{0}; for (; i + 15 < user_input.size(); i += 16) { uint8x16_t word = vld1q_u8((const uint8_t*)user_input.data() + i); - running = vorrq_u8(vorrq_u8(running, vorrq_u8(vceqq_u8(word, mask1), - vceqq_u8(word, mask2))), - vceqq_u8(word, mask3)); + + running = vorrq_u8(running, vceqq_u8(vqtbl1q_u8(rnt, word), word)); } if (i < user_input.size()) { uint8x16_t word = vld1q_u8((const uint8_t*)user_input.data() + user_input.length() - 16); - running = vorrq_u8(vorrq_u8(running, vorrq_u8(vceqq_u8(word, mask1), - vceqq_u8(word, mask2))), - vceqq_u8(word, mask3)); + running = vorrq_u8(running, vceqq_u8(vqtbl1q_u8(rnt, word), word)); } - return vmaxvq_u8(running) != 0; + return vmaxvq_u32(vreinterpretq_u32_u8(running)) != 0; } #elif ADA_SSE2 ada_really_inline bool has_tabs_or_newline( @@ -97,6 +107,7 @@ ada_really_inline bool has_tabs_or_newline( const __m128i mask1 = _mm_set1_epi8('\r'); const __m128i mask2 = _mm_set1_epi8('\n'); const __m128i mask3 = _mm_set1_epi8('\t'); + // If we supported SSSE3, we could use the algorithm that we use for NEON. __m128i running{0}; for (; i + 15 < user_input.size(); i += 16) { __m128i word = _mm_loadu_si128((const __m128i*)(user_input.data() + i));