From 6999172b83a52057636859b59031fcbcab74b020 Mon Sep 17 00:00:00 2001 From: Michael van der Westhuizen Date: Sun, 29 Sep 2024 16:55:52 -0700 Subject: [PATCH] Improve large buffer crc32c performance on Arm [3/3] Summary: Integrate neon and neon+eor3+sha3 crc32c implementations into the Folly hash library. Reviewed By: skrueger Differential Revision: D59322056 fbshipit-source-id: cb6fba0ec6677f439235d90e2d95d248ba7c47e2 --- third-party/folly/src/folly/hash/Checksum.cpp | 38 +++++++ .../src/folly/hash/detail/ChecksumDetail.h | 12 ++ .../src/folly/hash/test/ChecksumTest.cpp | 104 ++++++++++++++++++ 3 files changed, 154 insertions(+) diff --git a/third-party/folly/src/folly/hash/Checksum.cpp b/third-party/folly/src/folly/hash/Checksum.cpp index d2aaa1e6cafaa8..7fa8fa55cc2563 100644 --- a/third-party/folly/src/folly/hash/Checksum.cpp +++ b/third-party/folly/src/folly/hash/Checksum.cpp @@ -24,6 +24,8 @@ #include #include #include // @manual +#include // @manual +#include // @manual #include // @manual #include @@ -90,6 +92,14 @@ bool crc32_hw_supported() { return id.sse42(); } +bool crc32c_hw_supported_neon() { + return false; +} + +bool crc32c_hw_supported_neon_eor3_sha3() { + return false; +} + #elif FOLLY_ARM_FEATURE_CRC32 // crc32_hw is defined in folly/external/nvidia/hash/Checksum.cpp @@ -106,6 +116,16 @@ bool crc32c_hw_supported_avx512() { return false; } +bool crc32c_hw_supported_neon() { + static bool has_neon = has_neon_crc32c_v3s4x2e_v2(); + return has_neon; +} + +bool crc32c_hw_supported_neon_eor3_sha3() { + static bool has_neon_eor3 = has_neon_eor3_crc32c_v8s2x4_s3(); + return has_neon_eor3; +} + bool crc32_hw_supported() { return true; } @@ -134,6 +154,14 @@ bool crc32c_hw_supported_avx512() { bool crc32_hw_supported() { return false; } + +bool crc32c_hw_supported_neon() { + return false; +} + +bool crc32c_hw_supported_neon_eor3_sha3() { + return false; +} #endif template @@ -179,6 +207,16 @@ uint32_t crc32c(const uint8_t* data, size_t nbytes, uint32_t startingChecksum) { } #endif +#if FOLLY_AARCH64 + if (nbytes >= 2048 && detail::crc32c_hw_supported_neon_eor3_sha3()) { + return detail::neon_eor3_crc32c_v8s2x4_s3(data, nbytes, startingChecksum); + } + + if (nbytes >= 4096 && detail::crc32c_hw_supported_neon()) { + return detail::neon_crc32c_v3s4x2e_v2(data, nbytes, startingChecksum); + } +#endif + if (detail::crc32c_hw_supported()) { #if defined(FOLLY_ENABLE_SSE42_CRC32C_V8S3X3) if (nbytes > 4096) { diff --git a/third-party/folly/src/folly/hash/detail/ChecksumDetail.h b/third-party/folly/src/folly/hash/detail/ChecksumDetail.h index b0d525ad8d99a5..66bf258a22f683 100644 --- a/third-party/folly/src/folly/hash/detail/ChecksumDetail.h +++ b/third-party/folly/src/folly/hash/detail/ChecksumDetail.h @@ -60,6 +60,18 @@ bool crc32c_hw_supported(); */ bool crc32c_hw_supported_avx512(); +/** + * Check whether a NEON hardware-accelerated CRC-32C implementation is + * supported on the current CPU. + */ +bool crc32c_hw_supported_neon(); + +/** + * Check whether a NEON+EOR3+SHA3 hardware-accelerated CRC-32C implementation + * is supported on the current CPU. + */ +bool crc32c_hw_supported_neon_eor3_sha3(); + /** * Compute a CRC-32C checksum of a buffer using a portable, * software-only implementation. diff --git a/third-party/folly/src/folly/hash/test/ChecksumTest.cpp b/third-party/folly/src/folly/hash/test/ChecksumTest.cpp index 4e7253dff6de62..65c013fed97bb3 100644 --- a/third-party/folly/src/folly/hash/test/ChecksumTest.cpp +++ b/third-party/folly/src/folly/hash/test/ChecksumTest.cpp @@ -19,8 +19,11 @@ #include #include +#include #include #include +#include +#include #include #include #include @@ -119,8 +122,10 @@ TEST(Checksum, crc32cHardware) { if (folly::detail::crc32c_hw_supported()) { testCRC32C(folly::detail::crc32c_hw); } else { +#if FOLLY_X64 LOG(WARNING) << "skipping hardware-accelerated CRC-32C tests" << " (not supported on this CPU)"; +#endif } } @@ -132,8 +137,10 @@ TEST(Checksum, crc32cHardwareEq) { EXPECT_EQ(sw, hw); } } else { +#if FOLLY_X64 LOG(WARNING) << "skipping hardware-accelerated CRC-32C tests" << " (not supported on this CPU)"; +#endif } } @@ -141,8 +148,10 @@ TEST(Checksum, crc32cContinuationHardware) { if (folly::detail::crc32c_hw_supported()) { testCRC32CContinuation(folly::detail::crc32c_hw); } else { +#if FOLLY_X64 LOG(WARNING) << "skipping hardware-accelerated CRC-32C tests" << " (not supported on this CPU)"; +#endif } } @@ -150,8 +159,10 @@ TEST(Checksum, crc32cHardwareSse42) { if (folly::detail::crc32c_hw_supported_sse42()) { testCRC32C(folly::detail::sse_crc32c_v8s3x3); } else { +#if FOLLY_X64 LOG(WARNING) << "skipping SSE4.2 hardware-accelerated CRC-32C tests" << " (not supported on this CPU)"; +#endif } } @@ -163,8 +174,10 @@ TEST(Checksum, crc32cHardwareEqSse42) { ASSERT_EQ(sw, hw); } } else { +#if FOLLY_X64 LOG(WARNING) << "skipping SSE4.2 hardware-accelerated CRC-32C tests" << " (not supported on this CPU)"; +#endif } } @@ -172,8 +185,10 @@ TEST(Checksum, crc32cContinuationHardwareSse42) { if (folly::detail::crc32c_hw_supported_sse42()) { testCRC32CContinuation(folly::detail::sse_crc32c_v8s3x3); } else { +#if FOLLY_X64 LOG(WARNING) << "skipping SSE4.2 hardware-accelerated CRC-32C tests" << " (not supported on this CPU)"; +#endif } } @@ -181,8 +196,10 @@ TEST(Checksum, crc32cHardwareAvx512) { if (folly::detail::crc32c_hw_supported_avx512()) { testCRC32C(folly::detail::avx512_crc32c_v8s3x4); } else { +#if FOLLY_X64 LOG(WARNING) << "skipping AVX512 hardware-accelerated CRC-32C tests" << " (not supported on this CPU)"; +#endif } } @@ -194,8 +211,10 @@ TEST(Checksum, crc32cHardwareEqAvx512) { ASSERT_EQ(sw, hw); } } else { +#if FOLLY_X64 LOG(WARNING) << "skipping AVX512 hardware-accelerated CRC-32C tests" << " (not supported on this CPU)"; +#endif } } @@ -203,8 +222,84 @@ TEST(Checksum, crc32cContinuationHardwareAvx512) { if (folly::detail::crc32c_hw_supported_avx512()) { testCRC32CContinuation(folly::detail::avx512_crc32c_v8s3x4); } else { +#if FOLLY_X64 LOG(WARNING) << "skipping AVX512 hardware-accelerated CRC-32C tests" << " (not supported on this CPU)"; +#endif + } +} + +TEST(Checksum, crc32cHardwareNeon) { + if (folly::detail::crc32c_hw_supported_neon()) { + testCRC32C(folly::detail::neon_crc32c_v3s4x2e_v2); + } else { +#if FOLLY_AARCH64 + LOG(WARNING) << "skipping NEON hardware-accelerated CRC-32C tests" + << " (not supported on this CPU)"; +#endif + } +} + +TEST(Checksum, crc32cHardwareEqNeon) { + if (folly::detail::crc32c_hw_supported_neon()) { + for (size_t i = 0; i < 1000; i++) { + auto sw = folly::detail::crc32c_sw(buffer, i, 0); + auto hw = folly::detail::neon_crc32c_v3s4x2e_v2(buffer, i, 0); + ASSERT_EQ(sw, hw); + } + } else { +#if FOLLY_AARCH64 + LOG(WARNING) << "skipping NEON hardware-accelerated CRC-32C tests" + << " (not supported on this CPU)"; +#endif + } +} + +TEST(Checksum, crc32cContinuationHardwareNeon) { + if (folly::detail::crc32c_hw_supported_neon()) { + testCRC32CContinuation(folly::detail::neon_crc32c_v3s4x2e_v2); + } else { +#if FOLLY_AARCH64 + LOG(WARNING) << "skipping NEON hardware-accelerated CRC-32C tests" + << " (not supported on this CPU)"; +#endif + } +} + +TEST(Checksum, crc32cHardwareNeonEor3Sha3) { + if (folly::detail::crc32c_hw_supported_neon_eor3_sha3()) { + testCRC32C(folly::detail::neon_eor3_crc32c_v8s2x4_s3); + } else { +#if FOLLY_AARCH64 + LOG(WARNING) << "skipping NEON+EOR3+SHA3 hardware-accelerated CRC-32C tests" + << " (not supported on this CPU)"; +#endif + } +} + +TEST(Checksum, crc32cHardwareEqNeonEor3Sha3) { + if (folly::detail::crc32c_hw_supported_neon_eor3_sha3()) { + for (size_t i = 0; i < 1000; i++) { + auto sw = folly::detail::crc32c_sw(buffer, i, 0); + auto hw = folly::detail::neon_eor3_crc32c_v8s2x4_s3(buffer, i, 0); + ASSERT_EQ(sw, hw); + } + } else { +#if FOLLY_AARCH64 + LOG(WARNING) << "skipping NEON+EOR3+SHA3 hardware-accelerated CRC-32C tests" + << " (not supported on this CPU)"; +#endif + } +} + +TEST(Checksum, crc32cContinuationHardwareNeonEor3Sha3) { + if (folly::detail::crc32c_hw_supported_neon_eor3_sha3()) { + testCRC32CContinuation(folly::detail::neon_eor3_crc32c_v8s2x4_s3); + } else { +#if FOLLY_AARCH64 + LOG(WARNING) << "skipping NEON+EOR3+SHA3 hardware-accelerated CRC-32C tests" + << " (not supported on this CPU)"; +#endif } } @@ -230,6 +325,15 @@ TEST(Checksum, crc32clargeBuffers) { auto crcAvx = folly::detail::avx512_crc32c_v8s3x4(bufp, kLargeBufSz, ~0); ASSERT_EQ(kCrc, crcAvx); } + if (folly::detail::crc32c_hw_supported_neon()) { + auto crcHw = folly::detail::neon_crc32c_v3s4x2e_v2(bufp, kLargeBufSz, ~0); + ASSERT_EQ(kCrc, crcHw); + } + if (folly::detail::crc32c_hw_supported_neon_eor3_sha3()) { + auto crcHw = + folly::detail::neon_eor3_crc32c_v8s2x4_s3(bufp, kLargeBufSz, ~0); + ASSERT_EQ(kCrc, crcHw); + } } #endif