diff --git a/third-party/folly/src/folly/algorithm/simd/Movemask.h b/third-party/folly/src/folly/algorithm/simd/Movemask.h index 88d8012b73b1a1..3f0c6407b97e08 100644 --- a/third-party/folly/src/folly/algorithm/simd/Movemask.h +++ b/third-party/folly/src/folly/algorithm/simd/Movemask.h @@ -91,22 +91,7 @@ struct movemask_fn { auto operator()(Reg reg) const; template - FOLLY_ERASE auto operator()(Reg reg, Ignore ignore) const { - auto [bits, bitsPerElement] = operator()(reg); - - if constexpr (std::is_same_v) { - return std::pair{bits, bitsPerElement}; - } else { - static constexpr int kCardinal = sizeof(Reg) / sizeof(Scalar); - - int bitsToKeep = (kCardinal - ignore.last) * bitsPerElement; - - bits = - clear_n_least_significant_bits(bits, ignore.first * bitsPerElement); - bits = clear_n_most_significant_bits(bits, sizeof(bits) * 8 - bitsToKeep); - return std::pair{bits, bitsPerElement}; - } - } + auto operator()(Reg reg, Ignore ignore) const; }; template @@ -116,7 +101,7 @@ inline constexpr movemask_fn movemask; template template -auto movemask_fn::operator()(Reg reg) const { +FOLLY_ERASE auto movemask_fn::operator()(Reg reg) const { std::integral_constant bitsPerElement; @@ -154,7 +139,7 @@ auto movemask_fn::operator()(Reg reg) const { namespace detail { -inline auto movemaskChars16Aarch64(uint8x16_t reg) { +FOLLY_ERASE auto movemaskChars16Aarch64(uint8x16_t reg) { uint16x8_t u16s = vreinterpretq_u16_u8(reg); u16s = vshrq_n_u16(u16s, 4); uint8x8_t packed = vmovn_u16(u16s); @@ -163,7 +148,7 @@ inline auto movemaskChars16Aarch64(uint8x16_t reg) { } template -uint64x1_t asUint64x1Aarch64(Reg reg) { +FOLLY_ERASE uint64x1_t asUint64x1Aarch64(Reg reg) { if constexpr (std::is_same_v) { return vreinterpret_u64_u32(reg); } else if constexpr (std::is_same_v) { @@ -177,7 +162,7 @@ uint64x1_t asUint64x1Aarch64(Reg reg) { template template -auto movemask_fn::operator()(Reg reg) const { +FOLLY_ERASE auto movemask_fn::operator()(Reg reg) const { if constexpr (std::is_same_v) { return movemask(vmovn_u64(reg)); } else if constexpr (std::is_same_v) { @@ -195,6 +180,28 @@ auto movemask_fn::operator()(Reg reg) const { #endif +#if FOLLY_X64 || FOLLY_AARCH64 + +template +template +FOLLY_ERASE auto movemask_fn::operator()(Reg reg, Ignore ignore) const { + auto [bits, bitsPerElement] = operator()(reg); + + if constexpr (std::is_same_v) { + return std::pair{bits, bitsPerElement}; + } else { + static constexpr int kCardinal = sizeof(Reg) / sizeof(Scalar); + + int bitsToKeep = (kCardinal - ignore.last) * bitsPerElement; + + bits = clear_n_least_significant_bits(bits, ignore.first * bitsPerElement); + bits = clear_n_most_significant_bits(bits, sizeof(bits) * 8 - bitsToKeep); + return std::pair{bits, bitsPerElement}; + } +} + +#endif + } // namespace folly::simd FOLLY_POP_WARNING diff --git a/third-party/folly/src/folly/algorithm/simd/detail/ContainsImpl.h b/third-party/folly/src/folly/algorithm/simd/detail/ContainsImpl.h index 66d049f4545f24..e1bfdc3a82c43c 100644 --- a/third-party/folly/src/folly/algorithm/simd/detail/ContainsImpl.h +++ b/third-party/folly/src/folly/algorithm/simd/detail/ContainsImpl.h @@ -23,7 +23,7 @@ #include #include -#include +#include #include namespace folly::simd::detail { @@ -62,20 +62,20 @@ FOLLY_ERASE bool containsImplStd(folly::span haystack, T needle) { template constexpr bool hasHandwrittenContains() { - return std::is_same_v && - !std::is_same_v; + return !std::is_same_v, void> && + (std::is_same_v || std::is_same_v || + std::is_same_v || std::is_same_v); } -template +template > FOLLY_ERASE bool containsImplHandwritten( folly::span haystack, T needle) { - static_assert(std::is_same_v, ""); - auto as_chars = folly::reinterpret_span_cast(haystack); - return simdAnyOf( - as_chars.data(), - as_chars.data() + as_chars.size(), - [&](SimdCharPlatform::reg_t x) { - return SimdCharPlatform::equal(x, static_cast(needle)); + static_assert(!std::is_same_v, ""); + return simdAnyOf( + haystack.data(), + haystack.data() + haystack.size(), + [&](typename Platform::reg_t x) { + return Platform::equal(x, static_cast(needle)); }); } diff --git a/third-party/folly/src/folly/algorithm/simd/detail/SimdCharPlatform.h b/third-party/folly/src/folly/algorithm/simd/detail/SimdCharPlatform.h deleted file mode 100644 index 9d1254acf9d343..00000000000000 --- a/third-party/folly/src/folly/algorithm/simd/detail/SimdCharPlatform.h +++ /dev/null @@ -1,267 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include -#include -#include -#include -#include - -#include - -#if FOLLY_X64 -#include -#endif - -#if FOLLY_AARCH64 -#include -#endif - -namespace folly { -namespace simd::detail { - -/** - * SimdCharPlatform - * - * Common interface for some SIMD operations on chars between: sse2, avx2, - * arm-neon. (maybe we will move to sse4.2 at some point, we don't care much for - * pure sse2). - * - * If it's not one of the supported platforms, std::same_as. - * There is also a macro: FOLLY_DETAIL_HAS_SIMD_CHAR_PLATFORM set to 1 or 0 - * - * Nested types: - * - reg_t - type of a simd register (__m128i) - * - logical_t - type of a simd logical register (matches reg_t so far) - * - * Nested constants: - * - kCardinal - number of elements in a register - * - * loads: - * - loadu(const char*, ignore_none) - * - unsafeLoadU(const char*, ignore_none) - * - loada(const char*, ignore) - * - * a/u stand for aligned/unaligned. Ignored values can be garbage. unsafe - * disables sanitizers. - * - * reg ops: - * - equal(reg_t, char) - by lane comparison against a char. - * - le_unsigned(reg_t, char) - by lane less than or equal to char. - * - * logical ops: - * - any(logical_t, ignore) - return true if any the lanes are true - * - logical_or(logical_t, logical_t) - by lane logical or - * - */ - -#if FOLLY_X64 || FOLLY_AARCH64 - -template -struct SimdCharPlatformCommon : Platform { - using logical_t = typename Platform::logical_t; - - // These are aligned loads but there is no point in generating - // aligned load instructions, so we call loadu. - FOLLY_ALWAYS_INLINE - static auto loada(const char* ptr, ignore_none) { - return Platform::loadu(ptr, ignore_none{}); - } - - FOLLY_ALWAYS_INLINE - static auto loada(const char* ptr, ignore_extrema) { - return Platform::unsafeLoadu(ptr, ignore_none{}); - } - - using Platform::any; - - FOLLY_ALWAYS_INLINE - static bool any(typename Platform::logical_t log, ignore_extrema ignore) { - std::pair mmask = movemask(log, ignore); - return mmask.first; - } - - static auto toArray(typename Platform::reg_t x) { - std::array buf; - std::memcpy(buf.data(), &x, Platform::kCardinal); - return buf; - } -}; - -#endif - -#if FOLLY_X64 - -struct SimdCharSse2PlatformSpecific { - using reg_t = __m128i; - using logical_t = reg_t; - - static constexpr int kCardinal = 16; - - // Even for aligned loads intel people don't recommend using - // aligned load instruction - FOLLY_ALWAYS_INLINE - static reg_t loadu(const char* p, ignore_none) { - return _mm_loadu_si128(reinterpret_cast(p)); - } - - FOLLY_DISABLE_SANITIZERS - FOLLY_ALWAYS_INLINE - static reg_t unsafeLoadu(const char* p, ignore_none) { - return _mm_loadu_si128(reinterpret_cast(p)); - } - - FOLLY_ALWAYS_INLINE - static logical_t equal(reg_t reg, char x) { - return _mm_cmpeq_epi8(reg, _mm_set1_epi8(x)); - } - - FOLLY_ALWAYS_INLINE - static logical_t le_unsigned(reg_t reg, char x) { - // No unsigned comparisons on x86 - // less equal <=> equal (min) - reg_t min = _mm_min_epu8(reg, _mm_set1_epi8(x)); - return _mm_cmpeq_epi8(reg, min); - } - - FOLLY_ALWAYS_INLINE - static logical_t logical_or(logical_t x, logical_t y) { - return _mm_or_si128(x, y); - } - - FOLLY_ALWAYS_INLINE - static bool any(logical_t log, ignore_none) { - return movemask(log).first; - } -}; - -#define FOLLY_DETAIL_HAS_SIMD_CHAR_PLATFORM 1 - -using SimdCharSse2Platform = - SimdCharPlatformCommon; - -#if defined(__AVX2__) - -struct SimdCharAvx2PlatformSpecific { - using reg_t = __m256i; - using logical_t = reg_t; - - static constexpr int kCardinal = 32; - - // We can actually use aligned loads but our Intel people don't recommend - FOLLY_ALWAYS_INLINE - static reg_t loadu(const char* p, ignore_none) { - return _mm256_loadu_si256(reinterpret_cast(p)); - } - - FOLLY_DISABLE_SANITIZERS - FOLLY_ALWAYS_INLINE - static reg_t unsafeLoadu(const char* p, ignore_none) { - return _mm256_loadu_si256(reinterpret_cast(p)); - } - - FOLLY_ALWAYS_INLINE - static logical_t equal(reg_t reg, char x) { - return _mm256_cmpeq_epi8(reg, _mm256_set1_epi8(x)); - } - - FOLLY_ALWAYS_INLINE - static logical_t le_unsigned(reg_t reg, char x) { - // See SSE comment - reg_t min = _mm256_min_epu8(reg, _mm256_set1_epi8(x)); - return _mm256_cmpeq_epi8(reg, min); - } - - FOLLY_ALWAYS_INLINE - static logical_t logical_or(logical_t x, logical_t y) { - return _mm256_or_si256(x, y); - } - - FOLLY_ALWAYS_INLINE - static bool any(logical_t log, ignore_none) { - return simd::movemask(log).first; - } -}; - -using SimdCharAvx2Platform = - SimdCharPlatformCommon; - -using SimdCharPlatform = SimdCharAvx2Platform; - -#else -using SimdCharPlatform = SimdCharSse2Platform; -#endif - -#elif FOLLY_AARCH64 - -struct SimdCharAarch64PlatformSpecific { - using reg_t = uint8x16_t; - using logical_t = reg_t; - - static constexpr int kCardinal = 16; - - FOLLY_ALWAYS_INLINE - static reg_t loadu(const char* p, ignore_none) { - return vld1q_u8(reinterpret_cast(p)); - } - - FOLLY_DISABLE_SANITIZERS - FOLLY_ALWAYS_INLINE - static reg_t unsafeLoadu(const char* p, ignore_none) { - return vld1q_u8(reinterpret_cast(p)); - } - - FOLLY_ALWAYS_INLINE - static logical_t equal(reg_t reg, char x) { - return vceqq_u8(reg, vdupq_n_u8(static_cast(x))); - } - - FOLLY_ALWAYS_INLINE - static logical_t le_unsigned(reg_t reg, char x) { - return vcleq_u8(reg, vdupq_n_u8(static_cast(x))); - } - - FOLLY_ALWAYS_INLINE - static logical_t logical_or(logical_t x, logical_t y) { - return vorrq_u8(x, y); - } - - FOLLY_ALWAYS_INLINE - static bool any(logical_t log, ignore_none) { return vmaxvq_u8(log); } -}; - -#define FOLLY_DETAIL_HAS_SIMD_CHAR_PLATFORM 1 - -using SimdCharAarch64Platform = - SimdCharPlatformCommon; - -using SimdCharPlatform = SimdCharAarch64Platform; - -#define FOLLY_DETAIL_HAS_SIMD_CHAR_PLATFORM 1 - -#else - -#define FOLLY_DETAIL_HAS_SIMD_CHAR_PLATFORM 0 - -using SimdCharPlatform = void; - -#endif - -} // namespace simd::detail -} // namespace folly diff --git a/third-party/folly/src/folly/algorithm/simd/detail/SimdForEach.h b/third-party/folly/src/folly/algorithm/simd/detail/SimdForEach.h index 71f80ea4c71c65..3dd8b42dbd8168 100644 --- a/third-party/folly/src/folly/algorithm/simd/detail/SimdForEach.h +++ b/third-party/folly/src/folly/algorithm/simd/detail/SimdForEach.h @@ -20,6 +20,7 @@ #include #include #include +#include #include #include @@ -67,15 +68,12 @@ FOLLY_ALWAYS_INLINE void simdForEachAligning( /** * previousAlignedAddress * - * Given a pointer returns a closest pointer aligned to a given size. - * (it just masks out some lower bits) + * Given a pointer returns a closest pointer aligned to a given size + * (in elements). */ template FOLLY_ALWAYS_INLINE T* previousAlignedAddress(T* ptr, int to) { - std::uintptr_t uptr = reinterpret_cast(ptr); - std::uintptr_t uto = static_cast(to); - uptr &= ~(uto - 1); - return reinterpret_cast(uptr); + return align_floor(ptr, sizeof(T) * to); } /** diff --git a/third-party/folly/src/folly/algorithm/simd/detail/SimdPlatform.h b/third-party/folly/src/folly/algorithm/simd/detail/SimdPlatform.h new file mode 100644 index 00000000000000..027f8bfe1cbcc0 --- /dev/null +++ b/third-party/folly/src/folly/algorithm/simd/detail/SimdPlatform.h @@ -0,0 +1,445 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include +#include +#include +#include + +#include + +#if FOLLY_X64 && FOLLY_SSE_PREREQ(4, 2) +#include +#endif + +#if FOLLY_AARCH64 +#include +#endif + +namespace folly { +namespace simd::detail { + +/** + * SimdPlatform + * + * Common interface for some SIMD operations between: sse4.2, avx2, + * arm-neon. + * + * Supported types for T at the moment are uint8_16/uint16_t/uint32_t/uint64_t + * + * If it's not one of the supported platforms: + * std::same_as, void> + * There is also a macro: FOLLY_DETAIL_HAS_SIMD_PLATFORM set to 1 or 0 + * + **/ + +#if FOLLY_X64 && FOLLY_SSE_PREREQ(4, 2) || FOLLY_AARCH64 + +template +struct SimdPlatformCommon { + /** + * sclar_t - type of scalar we operate on (uint8_t, uint16_t etc) + * reg_t - type of a simd register (__m128i) + * logical_t - type of a simd logical register (matches reg_t so far) + **/ + using scalar_t = typename Platform::scalar_t; + using reg_t = typename Platform::reg_t; + using logical_t = typename Platform::logical_t; + + static constexpr int kCardinal = sizeof(reg_t) / sizeof(scalar_t); + + /** + * loads: + * precondition: at least one element should be not ignored. + * + * loada - load from an aligned (to sizeof(reg_t)) address + * loadu - load from an unaligned address + * unsafeLoadu - load from an unaligned address that disables sanitizers. + * This is for reading a register within a page + * but maybe outside of the array's boundary. + * + * Ignored values can be garbage. + **/ + template + static reg_t loada(const scalar_t* ptr, Ignore); + static reg_t loadu(const scalar_t* ptr, ignore_none); + static reg_t unsafeLoadu(const scalar_t* ptr, ignore_none); + + /** + * Comparing reg_t against the scalar. + * + * NOTE: less_equal only implemented for uint8_t + * for now. + **/ + static logical_t equal(reg_t reg, scalar_t x); + static logical_t less_equal(reg_t reg, scalar_t x); + + /** + * logical reduction + **/ + template + static bool any(logical_t logical, Ignore ignore); + + /** + * logical operations + **/ + static logical_t logical_or(logical_t x, logical_t y); + + /** + * Converting register to an array for debugging + **/ + static auto toArray(reg_t x); +}; + +template +template +FOLLY_ERASE auto SimdPlatformCommon::loada( + const scalar_t* ptr, Ignore) -> reg_t { + if constexpr (std::is_same_v) { + // There is not point to aligned load instructions + // on modern cpus. Arm doesn't even have any. + return loadu(ptr, ignore_none{}); + } else { + // We have a precondition: at least one element is loaded. + // From this we can prove that we can unsafely load from + // and aligned address. + // + // Here is an explanation from Stephen Canon: + // https://stackoverflow.com/questions/25566302/vectorized-strlen-getting-away-with-reading-unallocated-memory + return unsafeLoadu(ptr, ignore_none{}); + } +} + +template +FOLLY_ERASE auto SimdPlatformCommon::loadu( + const scalar_t* ptr, ignore_none) -> reg_t { + return Platform::loadu(ptr); +} + +template +FOLLY_ERASE auto SimdPlatformCommon::unsafeLoadu( + const scalar_t* ptr, ignore_none) -> reg_t { + return Platform::unsafeLoadu(ptr); +} + +template +FOLLY_ERASE auto SimdPlatformCommon::equal(reg_t reg, scalar_t x) + -> logical_t { + return Platform::equal(reg, Platform::broadcast(x)); +} + +template +FOLLY_ERASE auto SimdPlatformCommon::less_equal(reg_t reg, scalar_t x) + -> logical_t { + static_assert(std::is_same_v, "not implemented"); + return Platform::less_equal(reg, Platform::broadcast(x)); +} + +template +template +FOLLY_ERASE bool SimdPlatformCommon::any( + logical_t logical, Ignore ignore) { + if constexpr (std::is_same_v) { + return Platform::any(logical); + } else { + return movemask(logical, ignore).first; + } +} + +template +FOLLY_ERASE auto SimdPlatformCommon::logical_or( + logical_t x, logical_t y) -> logical_t { + return Platform::logical_or(x, y); +} + +template +FOLLY_ERASE auto SimdPlatformCommon::toArray(reg_t x) { + std::array res; + std::memcpy(&res, &x, sizeof(x)); + return res; +} + +#endif + +#if FOLLY_X64 && FOLLY_SSE_PREREQ(4, 2) + +template +struct SimdSse42PlatformSpecific { + using scalar_t = T; + using reg_t = __m128i; + using logical_t = reg_t; + + FOLLY_ERASE + static reg_t loadu(const scalar_t* p) { + return _mm_loadu_si128(reinterpret_cast(p)); + } + + FOLLY_DISABLE_SANITIZERS + FOLLY_ERASE + static reg_t unsafeLoadu(const scalar_t* p) { + return _mm_loadu_si128(reinterpret_cast(p)); + } + + FOLLY_ERASE + static reg_t broadcast(scalar_t x) { + if constexpr (std::is_same_v) { + return _mm_set1_epi8(x); + } else if constexpr (std::is_same_v) { + return _mm_set1_epi16(x); + } else if constexpr (std::is_same_v) { + return _mm_set1_epi32(x); + } else if constexpr (std::is_same_v) { + return _mm_set1_epi64x(x); + } + } + + FOLLY_ERASE + static logical_t equal(reg_t x, reg_t y) { + if constexpr (std::is_same_v) { + return _mm_cmpeq_epi8(x, y); + } else if constexpr (std::is_same_v) { + return _mm_cmpeq_epi16(x, y); + } else if constexpr (std::is_same_v) { + return _mm_cmpeq_epi32(x, y); + } else if constexpr (std::is_same_v) { + return _mm_cmpeq_epi64(x, y); + } + } + + FOLLY_ERASE + static logical_t less_equal(reg_t x, reg_t y) { + static_assert( + std::is_same_v, "other types not implemented"); + // No unsigned comparisons on x86 + // less equal <=> equal (min) + reg_t min = _mm_min_epu8(x, y); + return equal(x, min); + } + + FOLLY_ERASE + static logical_t logical_or(logical_t x, logical_t y) { + return _mm_or_si128(x, y); + } + + FOLLY_ERASE + static bool any(logical_t log) { return movemask(log).first; } +}; + +#define FOLLY_DETAIL_HAS_SIMD_PLATFORM 1 + +template +using SimdSse42Platform = SimdPlatformCommon>; + +#if defined(__AVX2__) + +template +struct SimdAvx2PlatformSpecific { + using scalar_t = T; + using reg_t = __m256i; + using logical_t = reg_t; + + FOLLY_ERASE + static reg_t loadu(const scalar_t* p) { + return _mm256_loadu_si256(reinterpret_cast(p)); + } + + FOLLY_DISABLE_SANITIZERS + FOLLY_ERASE + static reg_t unsafeLoadu(const scalar_t* p) { + return _mm256_loadu_si256(reinterpret_cast(p)); + } + + FOLLY_ERASE + static reg_t broadcast(scalar_t x) { + if constexpr (std::is_same_v) { + return _mm256_set1_epi8(x); + } else if constexpr (std::is_same_v) { + return _mm256_set1_epi16(x); + } else if constexpr (std::is_same_v) { + return _mm256_set1_epi32(x); + } else if constexpr (std::is_same_v) { + return _mm256_set1_epi64x(x); + } + } + + FOLLY_ERASE + static logical_t equal(reg_t x, reg_t y) { + if constexpr (std::is_same_v) { + return _mm256_cmpeq_epi8(x, y); + } else if constexpr (std::is_same_v) { + return _mm256_cmpeq_epi16(x, y); + } else if constexpr (std::is_same_v) { + return _mm256_cmpeq_epi32(x, y); + } else if constexpr (std::is_same_v) { + return _mm256_cmpeq_epi64(x, y); + } + } + + FOLLY_ERASE + static logical_t less_equal(reg_t x, reg_t y) { + static_assert( + std::is_same_v, "other types not implemented"); + // See SSE comment + reg_t min = _mm256_min_epu8(x, y); + return _mm256_cmpeq_epi8(x, min); + } + + FOLLY_ERASE + static logical_t logical_or(logical_t x, logical_t y) { + return _mm256_or_si256(x, y); + } + + FOLLY_ERASE + static bool any(logical_t log) { + return simd::movemask(log).first; + } +}; + +template +using SimdAvx2Platform = SimdPlatformCommon>; + +template +using SimdPlatform = SimdAvx2Platform; + +#else + +template +using SimdPlatform = SimdPlatformCommon>; + +#endif + +#elif FOLLY_AARCH64 + +template +struct SimdAarch64PlatformSpecific { + using scalar_t = T; + + FOLLY_ERASE + static auto loadu(const scalar_t* p) { + if constexpr (std::is_same_v) { + return vld1q_u8(p); + } else if constexpr (std::is_same_v) { + return vld1q_u16(p); + } else if constexpr (std::is_same_v) { + return vld1q_u32(p); + } else if constexpr (std::is_same_v) { + return vld1q_u64(p); + } + } + + using reg_t = decltype(loadu(nullptr)); + using logical_t = reg_t; + + FOLLY_DISABLE_SANITIZERS + FOLLY_ERASE + static reg_t unsafeLoadu(const scalar_t* p) { + if constexpr (std::is_same_v) { + return vld1q_u8(p); + } else if constexpr (std::is_same_v) { + return vld1q_u16(p); + } else if constexpr (std::is_same_v) { + return vld1q_u32(p); + } else if constexpr (std::is_same_v) { + return vld1q_u64(p); + } + } + + FOLLY_ERASE + static reg_t broadcast(scalar_t x) { + if constexpr (std::is_same_v) { + return vdupq_n_u8(x); + } else if constexpr (std::is_same_v) { + return vdupq_n_u16(x); + } else if constexpr (std::is_same_v) { + return vdupq_n_u32(x); + } else if constexpr (std::is_same_v) { + return vdupq_n_u64(x); + } + } + + FOLLY_ERASE + static logical_t equal(reg_t x, reg_t y) { + if constexpr (std::is_same_v) { + return vceqq_u8(x, y); + } else if constexpr (std::is_same_v) { + return vceqq_u16(x, y); + } else if constexpr (std::is_same_v) { + return vceqq_u32(x, y); + } else if constexpr (std::is_same_v) { + return vceqq_u64(x, y); + } + } + + FOLLY_ERASE + static logical_t less_equal(reg_t x, reg_t y) { + if constexpr (std::is_same_v) { + return vcleq_u8(x, y); + } else if constexpr (std::is_same_v) { + return vcleq_u16(x, y); + } else if constexpr (std::is_same_v) { + return vcleq_u32(x, y); + } else if constexpr (std::is_same_v) { + return vcleq_u64(x, y); + } + } + + FOLLY_ALWAYS_INLINE + static logical_t logical_or(logical_t x, logical_t y) { + if constexpr (std::is_same_v) { + return vorrq_u8(x, y); + } else if constexpr (std::is_same_v) { + return vorrq_u16(x, y); + } else if constexpr (std::is_same_v) { + return vorrq_u32(x, y); + } else if constexpr (std::is_same_v) { + return vorrq_u64(x, y); + } + } + + FOLLY_ALWAYS_INLINE + static bool any(logical_t log) { + // https://github.com/dotnet/runtime/pull/75864 + auto u32 = bit_cast(log); + u32 = vpmaxq_u32(u32, u32); + auto u64 = bit_cast(u32); + return vgetq_lane_u64(u64, 0); + } +}; + +#define FOLLY_DETAIL_HAS_SIMD_PLATFORM 1 + +template +using SimdAarch64Platform = SimdPlatformCommon>; + +template +using SimdPlatform = SimdAarch64Platform; + +#define FOLLY_DETAIL_HAS_SIMD_PLATFORM 1 + +#else + +#define FOLLY_DETAIL_HAS_SIMD_PLATFORM 0 + +template +using SimdPlatform = void; + +#endif + +} // namespace simd::detail +} // namespace folly diff --git a/third-party/folly/src/folly/algorithm/simd/detail/test/SimdAnyOfTest.cpp b/third-party/folly/src/folly/algorithm/simd/detail/test/SimdAnyOfTest.cpp index 8ea006f0eae731..f4fd2510a4f4ac 100644 --- a/third-party/folly/src/folly/algorithm/simd/detail/test/SimdAnyOfTest.cpp +++ b/third-party/folly/src/folly/algorithm/simd/detail/test/SimdAnyOfTest.cpp @@ -17,27 +17,31 @@ #include #include -#include +#include +#include #include #include -#if FOLLY_DETAIL_HAS_SIMD_CHAR_PLATFORM +#if FOLLY_DETAIL_HAS_SIMD_PLATFORM namespace folly { namespace simd::detail { template -void anySpacesTestForPlatformUnrolling(folly::StringPiece s, bool expected) { +void anySpacesTestForPlatformUnrolling( + folly::span s, bool expected) { bool actual = simdAnyOf( s.data(), s.data() + s.size(), [](typename Platform::reg_t x) { return Platform::equal(x, ' '); }); - ASSERT_EQ(expected, actual) << s; + ASSERT_EQ(expected, actual) + << folly::StringPiece(folly::reinterpret_span_cast(s)); } template -void anySpacesTestForPlatform(folly::StringPiece s, bool expected) { +void anySpacesTestForPlatform( + folly::span s, bool expected) { ASSERT_NO_FATAL_FAILURE( (anySpacesTestForPlatformUnrolling(s, expected))); ASSERT_NO_FATAL_FAILURE( @@ -48,20 +52,23 @@ void anySpacesTestForPlatform(folly::StringPiece s, bool expected) { (anySpacesTestForPlatformUnrolling(s, expected))); } -void anySpacesTest(folly::StringPiece s, bool expected) { +void anySpacesTest(folly::StringPiece sChars, bool expected) { + auto s = + folly::reinterpret_span_cast(folly::span(sChars)); + ASSERT_NO_FATAL_FAILURE( - anySpacesTestForPlatform(s, expected)); -#if FOLLY_X64 + anySpacesTestForPlatform>(s, expected)); +#if FOLLY_X64 && FOLLY_SSE_PREREQ(4, 2) ASSERT_NO_FATAL_FAILURE( - anySpacesTestForPlatform(s, expected)); + anySpacesTestForPlatform>(s, expected)); #if defined(__AVX2__) ASSERT_NO_FATAL_FAILURE( - anySpacesTestForPlatform(s, expected)); + anySpacesTestForPlatform>(s, expected)); #endif #endif #if FOLLY_AARCH64 ASSERT_NO_FATAL_FAILURE( - anySpacesTestForPlatform(s, expected)); + anySpacesTestForPlatform>(s, expected)); #endif } @@ -114,4 +121,4 @@ TEST(SimdAnyOfSimple, BigChunk) { } // namespace simd::detail } // namespace folly -#endif // FOLLY_DETAIL_HAS_SIMD_CHAR_PLATFORM +#endif // FOLLY_DETAIL_HAS_SIMD_PLATFORM diff --git a/third-party/folly/src/folly/algorithm/simd/test/ContainsTest.cpp b/third-party/folly/src/folly/algorithm/simd/test/ContainsTest.cpp index 0b628b4ee76093..55f1966d458a72 100644 --- a/third-party/folly/src/folly/algorithm/simd/test/ContainsTest.cpp +++ b/third-party/folly/src/folly/algorithm/simd/test/ContainsTest.cpp @@ -64,7 +64,7 @@ void testSimdContainsVerify(folly::span haystack, T needle, bool expected) { ASSERT_EQ(expected, actual2) << " haystack.size(): " << haystack.size(); } - if constexpr (std::is_same_v) { + if constexpr (simd::detail::hasHandwrittenContains()) { bool actual3 = simd::detail::containsImplHandwritten(const_haystack, needle); ASSERT_EQ(expected, actual3) << " haystack.size(): " << haystack.size(); @@ -80,11 +80,13 @@ TYPED_TEST(ContainsTest, Basic) { ++offset) { folly::span haystack(buf.data() + offset, buf.data() + buf.size()); T needle{1}; - testSimdContainsVerify(haystack, needle, /*expected*/ false); + ASSERT_NO_FATAL_FAILURE( + testSimdContainsVerify(haystack, needle, /*expected*/ false)); for (auto& x : haystack) { x = needle; - testSimdContainsVerify(haystack, needle, /*expected*/ true); + ASSERT_NO_FATAL_FAILURE( + testSimdContainsVerify(haystack, needle, /*expected*/ true)); x = 0; } } diff --git a/third-party/folly/src/folly/detail/SimpleSimdStringUtils.cpp b/third-party/folly/src/folly/detail/SimpleSimdStringUtils.cpp index f426ebc1e9096f..8ef8deeff61448 100644 --- a/third-party/folly/src/folly/detail/SimpleSimdStringUtils.cpp +++ b/third-party/folly/src/folly/detail/SimpleSimdStringUtils.cpp @@ -16,7 +16,7 @@ #include -#include +#include #include namespace folly { @@ -24,7 +24,7 @@ namespace detail { bool simdHasSpaceOrCntrlSymbols(folly::StringPiece s) { return SimpleSimdStringUtilsImpl< - simd::detail::SimdCharPlatform>::hasSpaceOrCntrlSymbols(s); + simd::detail::SimdPlatform>::hasSpaceOrCntrlSymbols(s); } } // namespace detail diff --git a/third-party/folly/src/folly/detail/SimpleSimdStringUtilsImpl.h b/third-party/folly/src/folly/detail/SimpleSimdStringUtilsImpl.h index d0e2e73efe8d38..8a99a9a323c3cf 100644 --- a/third-party/folly/src/folly/detail/SimpleSimdStringUtilsImpl.h +++ b/third-party/folly/src/folly/detail/SimpleSimdStringUtilsImpl.h @@ -18,7 +18,7 @@ #include #include -#include +#include namespace folly { namespace detail { @@ -35,14 +35,16 @@ struct SimpleSimdStringUtilsImpl { logical_t operator()(reg_t reg) { // This happens to be equivalent to std::isspace(c) || std::iscntrl(c) return Platform::logical_or( - Platform::le_unsigned(reg, 0x20), Platform::equal(reg, 0x7F)); + Platform::less_equal(reg, 0x20), Platform::equal(reg, 0x7F)); } }; FOLLY_ALWAYS_INLINE static bool hasSpaceOrCntrlSymbols(folly::StringPiece s) { return simd::detail::simdAnyOf( - s.data(), s.data() + s.size(), HasSpaceOrCntrlSymbolsLambda{}); + reinterpret_cast(s.data()), + reinterpret_cast(s.data() + s.size()), + HasSpaceOrCntrlSymbolsLambda{}); } }; diff --git a/third-party/folly/src/folly/detail/SplitStringSimd.cpp b/third-party/folly/src/folly/detail/SplitStringSimd.cpp index b658f12059374f..e868a2f9638e83 100644 --- a/third-party/folly/src/folly/detail/SplitStringSimd.cpp +++ b/third-party/folly/src/folly/detail/SplitStringSimd.cpp @@ -28,7 +28,7 @@ template void SimdSplitByCharImpl::keepEmpty( char sep, folly::StringPiece what, Container& res) { PlatformSimdSplitByChar< - simd::detail::SimdCharPlatform, + simd::detail::SimdPlatform, /*ignoreEmpty*/ false>{}(sep, what, res); } @@ -36,7 +36,7 @@ template void SimdSplitByCharImpl::dropEmpty( char sep, folly::StringPiece what, Container& res) { PlatformSimdSplitByChar< - simd::detail::SimdCharPlatform, + simd::detail::SimdPlatform, /*ignoreEmpty*/ true>{}(sep, what, res); } @@ -44,7 +44,7 @@ template void SimdSplitByCharImplToStrings::keepEmpty( char sep, folly::StringPiece what, Container& res) { PlatformSimdSplitByChar< - simd::detail::SimdCharPlatform, + simd::detail::SimdPlatform, /*ignoreEmpty*/ false>{}(sep, what, res); } @@ -52,7 +52,7 @@ template void SimdSplitByCharImplToStrings::dropEmpty( char sep, folly::StringPiece what, Container& res) { PlatformSimdSplitByChar< - simd::detail::SimdCharPlatform, + simd::detail::SimdPlatform, /*ignoreEmpty*/ true>{}(sep, what, res); } diff --git a/third-party/folly/src/folly/detail/SplitStringSimdImpl.h b/third-party/folly/src/folly/detail/SplitStringSimdImpl.h index 2120527ebe2fe3..34b26b6c247cf1 100644 --- a/third-party/folly/src/folly/detail/SplitStringSimdImpl.h +++ b/third-party/folly/src/folly/detail/SplitStringSimdImpl.h @@ -20,8 +20,8 @@ #include #include #include -#include #include +#include #include #if FOLLY_X64 @@ -73,18 +73,18 @@ struct PlatformSimdSplitByChar { template FOLLY_ALWAYS_INLINE void emplaceBack( - Container& res, const char* f, const char* l) const { + Container& res, const std::uint8_t* f, const std::uint8_t* l) const { if (ignoreEmpty && f == l) { return; } - res.emplace_back(f, l - f); + res.emplace_back(reinterpret_cast(f), l - f); } template FOLLY_ALWAYS_INLINE void outputStringsFoMmask( std::pair mmask, - const char* pos, - const char*& prev, + const std::uint8_t* pos, + const std::uint8_t*& prev, Container& res) const { // reserve was not beneficial on benchmarks. Uint mmaskBits = mmask.first; @@ -94,7 +94,7 @@ struct PlatformSimdSplitByChar { mmaskBits >>= BitsPerElement{}; auto firstSet = counted / BitsPerElement{}; - const char* split = pos + firstSet; + const std::uint8_t* split = pos + firstSet; pos = split + 1; emplaceBack(res, prev, split); prev = pos; @@ -104,13 +104,13 @@ struct PlatformSimdSplitByChar { template struct ForEachDelegate { const PlatformSimdSplitByChar& self; - char sep; - const char*& prev; + std::uint8_t sep; + const std::uint8_t*& prev; Container& res; template FOLLY_ALWAYS_INLINE bool step( - const char* ptr, Ignore ignore, UnrollIndex) const { + const std::uint8_t* ptr, Ignore ignore, UnrollIndex) const { reg_t loaded = Platform::loada(ptr, ignore); auto mmask = simd::movemask(Platform::equal(loaded, sep), ignore); @@ -122,11 +122,17 @@ struct PlatformSimdSplitByChar { template FOLLY_ALWAYS_INLINE void operator()( char sep, folly::StringPiece what, Container& res) const { - const char* prev = what.data(); - ForEachDelegate delegate{*this, sep, prev, res}; + const std::uint8_t* what_f = + reinterpret_cast(what.data()); + const std::uint8_t* what_l = what_f + what.size(); + + const std::uint8_t* prev = what_f; + + ForEachDelegate delegate{ + *this, static_cast(sep), prev, res}; simd::detail::simdForEachAligning( - Platform::kCardinal, what.data(), what.data() + what.size(), delegate); - emplaceBack(res, prev, what.data() + what.size()); + Platform::kCardinal, what_f, what_l, delegate); + emplaceBack(res, prev, what_l); } }; diff --git a/third-party/folly/src/folly/detail/test/SimpleSimdStringUtilsTest.cpp b/third-party/folly/src/folly/detail/test/SimpleSimdStringUtilsTest.cpp index cd3ba2337382b4..3b1d3409518261 100644 --- a/third-party/folly/src/folly/detail/test/SimpleSimdStringUtilsTest.cpp +++ b/third-party/folly/src/folly/detail/test/SimpleSimdStringUtilsTest.cpp @@ -16,7 +16,7 @@ #include -#include +#include #include #include @@ -35,15 +35,21 @@ void testHasSpaceOrCntrlSymbols(folly::StringPiece s, bool r) { using namespace simd::detail; ASSERT_EQ(r, hasSpaceOrCntrlSymbolsForPlatform(s)) << s; -#if FOLLY_X64 - ASSERT_EQ(r, hasSpaceOrCntrlSymbolsForPlatform(s)) << s; +#if FOLLY_SSE_PREREQ(4, 2) + ASSERT_EQ( + r, hasSpaceOrCntrlSymbolsForPlatform>(s)) + << s; #if defined(__AVX2__) - ASSERT_EQ(r, hasSpaceOrCntrlSymbolsForPlatform(s)) << s; + ASSERT_EQ( + r, hasSpaceOrCntrlSymbolsForPlatform>(s)) + << s; #endif #endif #if FOLLY_AARCH64 - ASSERT_EQ(r, hasSpaceOrCntrlSymbolsForPlatform(s)) + ASSERT_EQ( + r, + hasSpaceOrCntrlSymbolsForPlatform>(s)) << s; #endif } diff --git a/third-party/folly/src/folly/detail/test/SplitStringSimdTest.cpp b/third-party/folly/src/folly/detail/test/SplitStringSimdTest.cpp index ec0c6009754c3f..adbe2951ac3518 100644 --- a/third-party/folly/src/folly/detail/test/SplitStringSimdTest.cpp +++ b/third-party/folly/src/folly/detail/test/SplitStringSimdTest.cpp @@ -144,21 +144,24 @@ void runTestStringSplitOneType(folly::StringPiece s) { actuals.emplace_back(); simdSplitByChar(',', s, actuals.back(), ignoreEmpty); -#if FOLLY_X64 +#if FOLLY_SSE_PREREQ(4, 2) actuals.emplace_back(); - PlatformSimdSplitByChar{}( - ',', s, actuals.back()); + PlatformSimdSplitByChar< + simd::detail::SimdSse42Platform, + ignoreEmpty>{}(',', s, actuals.back()); #if defined(__AVX2__) actuals.emplace_back(); - PlatformSimdSplitByChar{}( - ',', s, actuals.back()); + PlatformSimdSplitByChar< + simd::detail::SimdAvx2Platform, + ignoreEmpty>{}(',', s, actuals.back()); #endif #endif #if FOLLY_AARCH64 actuals.emplace_back(); - PlatformSimdSplitByChar{}( - ',', s, actuals.back()); + PlatformSimdSplitByChar< + simd::detail::SimdAarch64Platform, + ignoreEmpty>{}(',', s, actuals.back()); #endif for (const auto& actual : actuals) {