From 5eb6704d1ac67aede5c72635945a27aca303dd29 Mon Sep 17 00:00:00 2001 From: Yanfei Guo Date: Tue, 24 Sep 2024 16:10:09 -0500 Subject: [PATCH] configure: reenable SSE2 and AVX optimization options for MPICH Previous PR#7074 consolidated SSE2 and AVX related optimization options into MPL's configure because only MPL explicitly use them. This change showed no performance degradation with GNU compiler. But, with Intel compilers, this does results in some performance degradation. Therefore, we should add them back in the main configure. Currently, the main configure checks for availability of SSE2, AVX and AVX512F, and add them to CFLAGS. The MPL configure will further check for specific instructions that is used in MPL. --- configure.ac | 84 ++++++++++++- src/mpl/configure.ac | 278 +++++++++++++++---------------------------- 2 files changed, 180 insertions(+), 182 deletions(-) diff --git a/configure.ac b/configure.ac index bdc8392af71..5c1f1b93050 100644 --- a/configure.ac +++ b/configure.ac @@ -906,11 +906,14 @@ for option in $enable_fast ; do ;; alwaysinline) # No op in MPICH. See mpl/configure.ac ;; - sse2) # No op in MPICH. See mpl/configure.ac + sse2) + enable_fast_sse2_instr=yes ;; - avx) # No op in MPICH. See mpl/configure.ac + avx) + enable_fast_avx_instr=yes ;; - avx512f) # No op in MPICH. See mpl/configure.ac + avx512f) + enable_fast_avx512f_instr=yes ;; all|yes) enable_fast_ndebug=yes @@ -955,6 +958,81 @@ if test -z "$enable_fast_no_strict_alignment" ; then AC_DEFINE(NEEDS_STRICT_ALIGNMENT,1,[Define if strict alignment memory access is required]) fi +if test "$enable_fast_sse2_instr" = "yes" ; then + AC_CACHE_CHECK([whether -msse2 is supported], pac_cv_found_sse2, + [PAC_C_CHECK_COMPILER_OPTION([-msse2],pac_cv_found_sse2=yes,pac_cv_found_sse2=no)], + pac_cv_found_sse2=no,pac_cv_found_sse2=yes) + PAC_PUSH_FLAG([CFLAGS]) + PAC_APPEND_FLAG([-msse2],[CFLAGS]) + AC_CACHE_CHECK([whether SSE2 is supported by the CPU], pac_cv_found_sse2_runnable,[ + AC_RUN_IFELSE([AC_LANG_SOURCE([[ + #include + + int main() { + __m128i a = _mm_set1_epi32(1); + __asm__ volatile("" : : "x" (a) : "memory"); + return 0; + } + ]])], pac_cv_found_sse2_runnable="yes", + pac_cv_found_sse2_runnable="no", + pac_cv_found_sse2_runnable="unknown") + ]) + PAC_POP_FLAG([CFLAGS]) + if test "$pac_cv_found_sse2" = "yes" && test "$pac_cv_found_sse2_runnable" = "yes"; then + PAC_APPEND_FLAG([-msse2],[CFLAGS]) + fi +fi + +if test "$enable_fast_avx_instr" = "yes" ; then + AC_CACHE_CHECK([whether -mavx is supported], pac_cv_found_avx, + [PAC_C_CHECK_COMPILER_OPTION([-mavx],pac_cv_found_avx=yes,pac_cv_found_avx=no)], + pac_cv_found_avx=no,pac_cv_found_avx=yes) + PAC_PUSH_FLAG([CFLAGS]) + PAC_APPEND_FLAG([-mavx],[CFLAGS]) + AC_CACHE_CHECK([whether AVX is supported by the CPU], pac_cv_found_avx_runnable,[ + AC_RUN_IFELSE([AC_LANG_SOURCE([[ + #include + + int main() { + __m256i a = _mm256_set1_epi32(1); + __asm__ volatile("" : : "x" (a) : "memory"); + return 0; + } + ]])], pac_cv_found_avx_runnable="yes", + pac_cv_found_avx_runnable="no", + pac_cv_found_avx_runnable="unknown") + ]) + PAC_POP_FLAG([CFLAGS]) + if test "$pac_cv_found_avx" = "yes" && test "$pac_cv_found_avx_runnable" = "yes"; then + PAC_APPEND_FLAG([-mavx],[CFLAGS]) + fi +fi + +if test "$enable_fast_avx512f_instr" = "yes" ; then + AC_CACHE_CHECK([whether -mavx512f is supported], pac_cv_found_avx512f, + [PAC_C_CHECK_COMPILER_OPTION([-mavx512f],pac_cv_found_avx512f=yes,pac_cv_found_avx512f=no)], + pac_cv_found_avx512f=no,pac_cv_found_avx512f=yes) + PAC_PUSH_FLAG([CFLAGS]) + PAC_APPEND_FLAG([-mavx512f],[CFLAGS]) + AC_CACHE_CHECK([whether AVX512F is supported by the CPU], pac_cv_found_avx512f_runnable,[ + AC_RUN_IFELSE([AC_LANG_SOURCE([[ + #include + + int main() { + __m512i a = _mm512_set1_epi32(1); + __asm__ volatile("" : : "x" (a) : "memory"); + return 0; + } + ]])], pac_cv_found_avx512f_runnable="yes", + pac_cv_found_avx512f_runnable="no", + pac_cv_found_avx512f_runnable="unknown") + ]) + PAC_POP_FLAG([CFLAGS]) + if test "$pac_cv_found_avx512f" = "yes" && test "$pac_cv_found_avx512f_runnable" = "yes"; then + PAC_APPEND_FLAG([-mavx512f],[CFLAGS]) + fi +fi + # error-checking # Change default into the specific value of the default if test "$enable_error_checking" = "yes" ; then diff --git a/src/mpl/configure.ac b/src/mpl/configure.ac index 3986121fdeb..e85c9aa6d1f 100644 --- a/src/mpl/configure.ac +++ b/src/mpl/configure.ac @@ -164,15 +164,6 @@ for option in $enable_fast ; do alwaysinline) enable_fast_alwaysinline=yes ;; - sse2) - enable_fast_sse2_instr=yes - ;; - avx) - enable_fast_avx_instr=yes - ;; - avx512f) - enable_fast_avx512f_instr=yes - ;; all|yes) enable_fast_sse2_instr=yes enable_fast_avx_instr=yes @@ -189,182 +180,111 @@ if test "$enable_fast_alwaysinline" = "yes"; then AC_DEFINE(ENABLE_ALWAYS_INLINE,1,[Define if force compiler to always inline functions with MPL_STATIC_INLINE_PREFIX|SUFFIX]) fi -if test "$enable_fast_sse2_instr" = "yes" ; then - AC_CACHE_CHECK([whether -msse2 is supported], pac_cv_found_sse2, - [PAC_C_CHECK_COMPILER_OPTION([-msse2],pac_cv_found_sse2=yes,pac_cv_found_sse2=no)], - pac_cv_found_sse2=no,pac_cv_found_sse2=yes) - PAC_PUSH_FLAG([CFLAGS]) - PAC_APPEND_FLAG([-msse2],[CFLAGS]) - AC_CACHE_CHECK([whether SSE2 is supported by the CPU], pac_cv_found_sse2_runnable,[ - AC_RUN_IFELSE([AC_LANG_SOURCE([[ - #include - - int main() { - __m128i a = _mm_set1_epi32(1); - __asm__ volatile("" : : "x" (a) : "memory"); - return 0; - } - ]])], pac_cv_found_sse2_runnable="yes", - pac_cv_found_sse2_runnable="no", - pac_cv_found_sse2_runnable="unknown") - ]) - PAC_POP_FLAG([CFLAGS]) - if test "$pac_cv_found_sse2" = "yes" && test "$pac_cv_found_sse2_runnable" = "yes"; then - PAC_APPEND_FLAG([-msse2],[CFLAGS]) - AC_CACHE_CHECK([whether _mm_stream_si128 is supported], pac_cv_found__mm_stream_si128,[ - AC_RUN_IFELSE([AC_LANG_SOURCE([[ - #include - #include - - int main(int argc, char **argv) { - int ret = 0; - char *source = NULL, *dest = NULL; - posix_memalign(&source, 64, 64); - posix_memalign(&dest, 64, 64); - for (int i = 0; i < 64; i++) source[i] = 'a'; - - __m128i xmm0 = _mm_loadu_si128((__m128i const *) source); - _mm_stream_si128((__m128i *) dest, xmm0); - _mm_sfence(); - - if (dest[0] == source[0]) ret = 0; - else ret = 1; - - free(source); - free(dest); - - return ret; - } - ]])], pac_cv_found__mm_stream_si128="yes", - pac_cv_found__mm_stream_si128"no", - pac_cv_found__mm_stream_si128="unknown") - ]) - if test "$pac_cv_found__mm_stream_si128" = "yes" ; then - AC_DEFINE(HAVE__MM_STREAM_SI128,1,[Define if _mm_stream_si128 is available]) - fi - fi +# checking specific SSE2 and AVX support +AC_CACHE_CHECK([whether _mm_stream_si128 is supported], pac_cv_found__mm_stream_si128,[ + AC_RUN_IFELSE([AC_LANG_SOURCE([[ + #include + #include + + int main(int argc, char **argv) { + int ret = 0; + char *source = NULL, *dest = NULL; + posix_memalign(&source, 64, 64); + posix_memalign(&dest, 64, 64); + for (int i = 0; i < 64; i++) source[i] = 'a'; + + __m128i xmm0 = _mm_loadu_si128((__m128i const *) source); + _mm_stream_si128((__m128i *) dest, xmm0); + _mm_sfence(); + + if (dest[0] == source[0]) ret = 0; + else ret = 1; + + free(source); + free(dest); + + return ret; + } + ]])], pac_cv_found__mm_stream_si128="yes", + pac_cv_found__mm_stream_si128"no", + pac_cv_found__mm_stream_si128="unknown") + ]) +if test "$pac_cv_found__mm_stream_si128" = "yes" ; then + AC_DEFINE(HAVE__MM_STREAM_SI128,1,[Define if _mm_stream_si128 is available]) fi -if test "$enable_fast_avx_instr" = "yes" ; then - AC_CACHE_CHECK([whether -mavx is supported], pac_cv_found_avx, - [PAC_C_CHECK_COMPILER_OPTION([-mavx],pac_cv_found_avx=yes,pac_cv_found_avx=no)], - pac_cv_found_avx=no,pac_cv_found_avx=yes) - PAC_PUSH_FLAG([CFLAGS]) - PAC_APPEND_FLAG([-mavx],[CFLAGS]) - AC_CACHE_CHECK([whether AVX is supported by the CPU], pac_cv_found_avx_runnable,[ - AC_RUN_IFELSE([AC_LANG_SOURCE([[ - #include - - int main() { - __m256i a = _mm256_set1_epi32(1); - __asm__ volatile("" : : "x" (a) : "memory"); - return 0; - } - ]])], pac_cv_found_avx_runnable="yes", - pac_cv_found_avx_runnable="no", - pac_cv_found_avx_runnable="unknown") - ]) - PAC_POP_FLAG([CFLAGS]) - if test "$pac_cv_found_avx" = "yes" && test "$pac_cv_found_avx_runnable" = "yes"; then - PAC_APPEND_FLAG([-mavx],[CFLAGS]) - AC_CACHE_CHECK([whether _mm256_storeu_si256 is supported], pac_cv_found__mm256_storeu_si256,[ - AC_RUN_IFELSE([AC_LANG_SOURCE([[ - #include - - int main(int argc, char **argv) { - char source[1024], dest[1024]; - for (int i = 0; i < 1024; i++) source[i] = 'a'; - - _mm256_storeu_si256((__m256i *) dest, _mm256_loadu_si256((__m256i const *) source)); - - if (dest[0] == source[0]) return 0; - else return 1; - } - ]])], pac_cv_found__mm256_storeu_si256="yes", - pac_cv_found__mm256_storeu_si256="no", - pac_cv_found__mm256_storeu_si256="unknown") - ]) - if test "$pac_cv_found__mm256_storeu_si256" = "yes" ; then - AC_DEFINE(HAVE__MM256_STOREU_SI256,1,[Define if _mm256_storeu_si256 is available]) - fi +AC_CACHE_CHECK([whether _mm256_storeu_si256 is supported], pac_cv_found__mm256_storeu_si256,[ + AC_RUN_IFELSE([AC_LANG_SOURCE([[ + #include + + int main(int argc, char **argv) { + char source[1024], dest[1024]; + for (int i = 0; i < 1024; i++) source[i] = 'a'; + + _mm256_storeu_si256((__m256i *) dest, _mm256_loadu_si256((__m256i const *) source)); + + if (dest[0] == source[0]) return 0; + else return 1; + } + ]])], pac_cv_found__mm256_storeu_si256="yes", + pac_cv_found__mm256_storeu_si256="no", + pac_cv_found__mm256_storeu_si256="unknown") + ]) +if test "$pac_cv_found__mm256_storeu_si256" = "yes" ; then + AC_DEFINE(HAVE__MM256_STOREU_SI256,1,[Define if _mm256_storeu_si256 is available]) +fi - AC_CACHE_CHECK([whether _mm256_stream_si256 is supported], pac_cv_found__mm256_stream_si256,[ - AC_RUN_IFELSE([AC_LANG_SOURCE([[ - #include - #include - - int main(int argc, char **argv) { - int ret = 0; - char *source = NULL, *dest = NULL; - posix_memalign(&source, 64, 64); - posix_memalign(&dest, 64, 64); - for (int i = 0; i < 64; i++) source[i] = 'a'; - - __m256i ymm0 = _mm256_loadu_si256((__m256i const *) source); - _mm256_stream_si256((__m256i *) dest, ymm0); - _mm_sfence(); - - if (dest[0] == source[0]) ret = 0; - else ret = 1; - - free(source); - free(dest); - - return ret; - } - ]])], pac_cv_found__mm256_stream_si256="yes", - pac_cv_found__mm256_stream_si256="no", - pac_cv_found__mm256_stream_si256="unknown") - ]) - if test "$pac_cv_found__mm256_stream_si256" = "yes" ; then - AC_DEFINE(HAVE__MM256_STREAM_SI256,1,[Define if _mm256_stream_si256 is available]) - fi - fi +AC_CACHE_CHECK([whether _mm256_stream_si256 is supported], pac_cv_found__mm256_stream_si256,[ + AC_RUN_IFELSE([AC_LANG_SOURCE([[ + #include + #include + + int main(int argc, char **argv) { + int ret = 0; + char *source = NULL, *dest = NULL; + posix_memalign(&source, 64, 64); + posix_memalign(&dest, 64, 64); + for (int i = 0; i < 64; i++) source[i] = 'a'; + + __m256i ymm0 = _mm256_loadu_si256((__m256i const *) source); + _mm256_stream_si256((__m256i *) dest, ymm0); + _mm_sfence(); + + if (dest[0] == source[0]) ret = 0; + else ret = 1; + + free(source); + free(dest); + + return ret; + } + ]])], pac_cv_found__mm256_stream_si256="yes", + pac_cv_found__mm256_stream_si256="no", + pac_cv_found__mm256_stream_si256="unknown") + ]) +if test "$pac_cv_found__mm256_stream_si256" = "yes" ; then + AC_DEFINE(HAVE__MM256_STREAM_SI256,1,[Define if _mm256_stream_si256 is available]) fi -if test "$enable_fast_avx512f_instr" = "yes" ; then - AC_CACHE_CHECK([whether -mavx512f is supported], pac_cv_found_avx512f, - [PAC_C_CHECK_COMPILER_OPTION([-mavx512f],pac_cv_found_avx512f=yes,pac_cv_found_avx512f=no)], - pac_cv_found_avx512f=no,pac_cv_found_avx512f=yes) - PAC_PUSH_FLAG([CFLAGS]) - PAC_APPEND_FLAG([-mavx512f],[CFLAGS]) - AC_CACHE_CHECK([whether AVX512F is supported by the CPU], pac_cv_found_avx512f_runnable,[ - AC_RUN_IFELSE([AC_LANG_SOURCE([[ - #include - - int main() { - __m512i a = _mm512_set1_epi32(1); - __asm__ volatile("" : : "x" (a) : "memory"); - return 0; +AC_CACHE_CHECK([whether _mm512_storeu_si512 is supported], pac_cv_found__mm512_storeu_si512,[ + AC_RUN_IFELSE([AC_LANG_SOURCE([[ + #include + + int main(int argc, char **argv) { + char source[1024], dest[1024]; + for (int i = 0; i < 1024; i++) source[i] = 'a'; + + _mm512_storeu_si512((__m512i *) dest, _mm512_loadu_si512((__m512i const *) source)); + + if (dest[0] == source[0]) return 0; + else return 1; } - ]])], pac_cv_found_avx512f_runnable="yes", - pac_cv_found_avx512f_runnable="no", - pac_cv_found_avx512f_runnable="unknown") - ]) - PAC_POP_FLAG([CFLAGS]) - if test "$pac_cv_found_avx512f" = "yes" && test "$pac_cv_found_avx512f_runnable" = "yes"; then - PAC_APPEND_FLAG([-mavx512f],[CFLAGS]) - AC_CACHE_CHECK([whether _mm512_storeu_si512 is supported], pac_cv_found__mm512_storeu_si512,[ - AC_RUN_IFELSE([AC_LANG_SOURCE([[ - #include - - int main(int argc, char **argv) { - char source[1024], dest[1024]; - for (int i = 0; i < 1024; i++) source[i] = 'a'; - - _mm512_storeu_si512((__m512i *) dest, _mm512_loadu_si512((__m512i const *) source)); - - if (dest[0] == source[0]) return 0; - else return 1; - } - ]])], pac_cv_found__mm512_storeu_si512="yes", - pac_cv_found__mm512_storeu_si512="no", - pac_cv_found__mm512_storeu_si512="unknown") - ]) - if test "$pac_cv_found__mm512_storeu_si512" = "yes" ; then - AC_DEFINE(HAVE__MM512_STOREU_SI512,1,[Define if _mm512_storeu_si512 is available]) - fi - fi + ]])], pac_cv_found__mm512_storeu_si512="yes", + pac_cv_found__mm512_storeu_si512="no", + pac_cv_found__mm512_storeu_si512="unknown") + ]) +if test "$pac_cv_found__mm512_storeu_si512" = "yes" ; then + AC_DEFINE(HAVE__MM512_STOREU_SI512,1,[Define if _mm512_storeu_si512 is available]) fi #######################################################################