From 33e5e35145626f3dede2dc6ec4c26c5443775788 Mon Sep 17 00:00:00 2001 From: flouris Date: Mon, 15 May 2017 03:04:57 +0200 Subject: [PATCH] added run-time hardware detection and check - issue #136 and #138 --- ChangeLog.md | 34 ++++++++++++ configure.ac | 80 +++++++++++++++++---------- man/libpll.3 | 10 ++++ src/Makefile.am | 4 +- src/core_derivatives.c | 18 +++--- src/core_likelihood.c | 24 ++++---- src/core_partials.c | 52 +++++++++--------- src/core_pmatrix.c | 6 +- src/fast_parsimony.c | 30 +++++----- src/hardware.c | 121 +++++++++++++++++++++++++++++++++++++++++ src/init.c | 34 ++++++++++++ src/pll.c | 16 ++++-- src/pll.h | 33 +++++++++++ 13 files changed, 361 insertions(+), 101 deletions(-) create mode 100644 src/hardware.c create mode 100644 src/init.c diff --git a/ChangeLog.md b/ChangeLog.md index 207a1db..ed3e43e 100644 --- a/ChangeLog.md +++ b/ChangeLog.md @@ -2,6 +2,40 @@ All notable changes to `libpll` will be documented in this file. This project adheres to [Semantic Versioning](http://semver.org/). 
+## [0.3.0] - 2017-05-15 +### Added + - Run-time detection of cpu features + - Vectorized (AVX) computation of 20-state transition probability matrices + - Faster tip-inner kernels for 20-state models + - Improved AVX vectorization of derivatives + - Faster PHYLIP parser + - vectorized scaling for 20-state and arbitrary-state models + - AVX2 vectorizations for partials, likelihood and derivatives + - Unweighted parsimony functions including SSE, AVX and AVX2 vectorizations + - Randomized stepwise addition + - Portable functions for parsing trees from a C-string + - Optional per-rate category scalers to prevent numerical underflows on large + trees + - Setting of identity matrix if all exponentiations of eigenvalues multiplied + by branch length and rate are approximately equal to one + - Re-entrant cross-platform pseudo-random number generator + - Wrapper tree structures + - Custom exporting of tree structures using a callback function + - Support for median category rates in discrete gamma model + +### Fixed + - Derivatives computation + - Parsing of branch lengths in newick trees + - Invariant sites computation + - Multiplication of log-likelihood with pattern weight after scaling term + - Added destructors for eliminating memory leaks when tree parsing fails + - Sumtable computation when having multiple substitution matrices + - Ascertainment bias computation + - Per-site log-likelihood computation + - Uninitialized values in testing framework + + + ## [0.2.0] - 2016-09-09 ### Added - Methods for ascertainment bias correction (Lewis, Felsenstein, Stamatakis) diff --git a/configure.ac b/configure.ac index 98860e4..62ba2db 100644 --- a/configure.ac +++ b/configure.ac @@ -2,7 +2,7 @@ # Process this file with autoconf to produce a configure script. 
AC_PREREQ([2.63]) -AC_INIT([libpll], [0.2.0], [Tomas.Flouri@h-its.org]) +AC_INIT([libpll], [0.3.0], [Tomas.Flouri@h-its.org]) AM_INIT_AUTOMAKE([subdir-objects]) AC_LANG([C]) AC_CONFIG_SRCDIR([src/pll.c]) @@ -52,37 +52,39 @@ AC_CHECK_FUNCS([asprintf memcpy memset posix_memalign]) have_avx2=no have_avx=no -have_sse=no +have_sse3=no have_ps2pdf=no -AX_EXT - -if test "x${ax_cv_have_fma3_ext}" = "xyes"; then - have_avx2=yes -fi - -if test "x${ax_cv_have_avx_ext}" = "xyes"; then - have_avx=yes -fi - -if test "x${ax_cv_have_sse3_ext}" = "xyes"; then - have_sse3=yes -fi - -AC_ARG_ENABLE(avx2, AS_HELP_STRING([--disable-avx2], [Build without AVX2/FMA support])) -AS_IF([test "x${ax_cv_have_fma3_ext}" = "xyes"], [ - have_avx2=yes -]) - -AC_ARG_ENABLE(avx, AS_HELP_STRING([--disable-avx], [Build without AVX support])) -AS_IF([test "x${ax_cv_have_avx_ext}" = "xyes"], [ - have_avx=yes -]) - -AC_ARG_ENABLE(sse,i AS_HELP_STRING([--disable-sse],[Build without SSE support])) -AS_IF([test "x${ax_cv_have_sse3_ext}" = "xyes"], [ - have_sse3=yes -]) +# Compile-time detection of processor features - now disabled +#AX_EXT +# +#if test "x${ax_cv_have_fma3_ext}" = "xyes"; then +# have_avx2=yes +#fi +# +#if test "x${ax_cv_have_avx_ext}" = "xyes"; then +# have_avx=yes +#fi +# +#if test "x${ax_cv_have_sse3_ext}" = "xyes"; then +# have_sse3=yes +#fi +# +#AC_ARG_ENABLE(avx2, AS_HELP_STRING([--disable-avx2], [Build without AVX2/FMA support])) +#AS_IF([test "x${ax_cv_have_fma3_ext}" = "xyes"], [ +# have_avx2=yes +#]) +# +#AC_ARG_ENABLE(avx, AS_HELP_STRING([--disable-avx], [Build without AVX support])) +#AS_IF([test "x${ax_cv_have_avx_ext}" = "xyes"], [ +# have_avx=yes +#]) +# +#AC_ARG_ENABLE(sse, AS_HELP_STRING([--disable-sse],[Build without SSE support])) +#AS_IF([test "x${ax_cv_have_sse3_ext}" = "xyes"], [ +# have_sse3=yes +#]) +# AC_ARG_ENABLE(pdfman, AS_HELP_STRING([--disable-pdfman], [Disable PDF manual creation])) AS_IF([test "x$enable_pdfman" != "xno"], [ @@ -94,6 +96,24 @@ AS_IF([test 
"x$enable_pdfman" != "xno"], [ fi ]) +AC_ARG_ENABLE(sse, AS_HELP_STRING([--disable-sse], [Build without SSE support])) +AS_IF([test "x$enable_sse" != "xno"], [ + have_sse3=yes + AC_DEFINE([HAVE_SSE3], [1], [Define to 1 to support Streaming SIMD Extensions 3]) +]) + +AC_ARG_ENABLE(avx, AS_HELP_STRING([--disable-avx], [Build without AVX support])) +AS_IF([test "x$enable_avx" != "xno"], [ + have_avx=yes + AC_DEFINE([HAVE_AVX], [1], [Define to 1 to support Advanced Vector Extensions]) +]) + +AC_ARG_ENABLE(avx2, AS_HELP_STRING([--disable-avx2], [Build without AVX2/FMA support])) +AS_IF([test "x$enable_avx2" != "xno"], [ + have_avx2=yes + AC_DEFINE([HAVE_AVX2], [1], [Define to 1 to support Advanced Vector Extensions 2]) +]) + AM_CONDITIONAL(HAVE_AVX2, test "x${have_avx2}" = "xyes") AM_CONDITIONAL(HAVE_AVX, test "x${have_avx}" = "xyes") AM_CONDITIONAL(HAVE_SSE3, test "x${have_sse3}" = "xyes") diff --git a/man/libpll.3 b/man/libpll.3 index 43084ee..2ff6344 100644 --- a/man/libpll.3 +++ b/man/libpll.3 @@ -576,5 +576,15 @@ bug releases may not be mentioned): .TP .BR v0.2.0\~ "released September 9th, 2016" First public release. +.TP +.BR v0.3.0\~ "released May 15th, 2017" +Added faster vectorizations for 20-state and arbitrary-state models, unweighted +parsimony functions, randomized stepwise addition, portable functions for +parsing trees from C-strings, per-rate category scalers for preventing +numerical underflows. Modified newick exporting function to accept callbacks +for custom printing. Fixed derivatives computation, parsing of branch lengths, +invariant sites computation, log-likelihood computation for cases where we have +scaling and patterns, ascertainment bias computation, per-site log-likelihood +computation, memory leaks. Added run-time detection of hardware. 
.RE .LP diff --git a/src/Makefile.am b/src/Makefile.am index 7f7ee8b..23c16ed 100644 --- a/src/Makefile.am +++ b/src/Makefile.am @@ -36,7 +36,9 @@ lex_rtree.l \ fast_parsimony.c \ stepwise.c \ random.c \ -phylip.c +phylip.c \ +hardware.c \ +init.c libpll_la_CFLAGS = $(AM_CFLAGS) diff --git a/src/core_derivatives.c b/src/core_derivatives.c index 8c8860b..a5fea48 100644 --- a/src/core_derivatives.c +++ b/src/core_derivatives.c @@ -147,7 +147,7 @@ PLL_EXPORT int pll_core_update_sumtable_ii(unsigned int states, const double * t_freqs; #ifdef HAVE_SSE3 - if (attrib & PLL_ATTRIB_ARCH_SSE) + if (attrib & PLL_ATTRIB_ARCH_SSE && PLL_STAT(sse3_present)) { return pll_core_update_sumtable_ii_sse(states, sites, @@ -164,7 +164,7 @@ PLL_EXPORT int pll_core_update_sumtable_ii(unsigned int states, } #endif #ifdef HAVE_AVX - if (attrib & PLL_ATTRIB_ARCH_AVX) + if (attrib & PLL_ATTRIB_ARCH_AVX && PLL_STAT(avx_present)) { return pll_core_update_sumtable_ii_avx(states, sites, @@ -181,7 +181,7 @@ PLL_EXPORT int pll_core_update_sumtable_ii(unsigned int states, } #endif #ifdef HAVE_AVX2 - if (attrib & PLL_ATTRIB_ARCH_AVX2) + if (attrib & PLL_ATTRIB_ARCH_AVX2 && PLL_STAT(avx2_present)) { return pll_core_update_sumtable_ii_avx2(states, sites, @@ -299,7 +299,7 @@ PLL_EXPORT int pll_core_update_sumtable_ti(unsigned int states, unsigned int states_padded = states; #ifdef HAVE_SSE3 - if (attrib & PLL_ATTRIB_ARCH_SSE) + if (attrib & PLL_ATTRIB_ARCH_SSE && PLL_STAT(sse3_present)) { return pll_core_update_sumtable_ti_sse(states, sites, @@ -316,7 +316,7 @@ PLL_EXPORT int pll_core_update_sumtable_ti(unsigned int states, } #endif #ifdef HAVE_AVX - if (attrib & PLL_ATTRIB_ARCH_AVX) + if (attrib & PLL_ATTRIB_ARCH_AVX && PLL_STAT(avx_present)) { return pll_core_update_sumtable_ti_avx(states, sites, @@ -334,7 +334,7 @@ PLL_EXPORT int pll_core_update_sumtable_ti(unsigned int states, } #endif #ifdef HAVE_AVX2 - if (attrib & PLL_ATTRIB_ARCH_AVX2) + if (attrib & PLL_ATTRIB_ARCH_AVX2 && 
PLL_STAT(avx2_present)) { return pll_core_update_sumtable_ti_avx2(states, sites, @@ -530,14 +530,14 @@ PLL_EXPORT int pll_core_likelihood_derivatives(unsigned int states, // SSE3 vectorization in missing as of now #ifdef HAVE_SSE3 - if (attrib & PLL_ATTRIB_ARCH_SSE) + if (attrib & PLL_ATTRIB_ARCH_SSE && PLL_STAT(sse3_present)) { states_padded = (states+1) & 0xFFFFFFFE; } #endif #ifdef HAVE_AVX2 - if (attrib & PLL_ATTRIB_ARCH_AVX2) + if (attrib & PLL_ATTRIB_ARCH_AVX2 && PLL_STAT(avx2_present)) { states_padded = (states+3) & 0xFFFFFFFC; @@ -558,7 +558,7 @@ PLL_EXPORT int pll_core_likelihood_derivatives(unsigned int states, else #endif #ifdef HAVE_AVX - if (attrib & PLL_ATTRIB_ARCH_AVX) + if (attrib & PLL_ATTRIB_ARCH_AVX && PLL_STAT(avx_present)) { states_padded = (states+3) & 0xFFFFFFFC; diff --git a/src/core_likelihood.c b/src/core_likelihood.c index b2c632a..392bdc0 100644 --- a/src/core_likelihood.c +++ b/src/core_likelihood.c @@ -48,7 +48,7 @@ PLL_EXPORT double pll_core_root_loglikelihood(unsigned int states, unsigned int states_padded = states; #ifdef HAVE_SSE3 - if (attrib & PLL_ATTRIB_ARCH_SSE) + if (attrib & PLL_ATTRIB_ARCH_SSE && PLL_STAT(sse3_present)) { if (states == 4) { @@ -85,7 +85,7 @@ PLL_EXPORT double pll_core_root_loglikelihood(unsigned int states, } #endif #ifdef HAVE_AVX - if (attrib & PLL_ATTRIB_ARCH_AVX) + if (attrib & PLL_ATTRIB_ARCH_AVX && PLL_STAT(avx_present)) { if (states == 4) { @@ -122,7 +122,7 @@ PLL_EXPORT double pll_core_root_loglikelihood(unsigned int states, } #endif #ifdef HAVE_AVX2 - if (attrib & PLL_ATTRIB_ARCH_AVX2) + if (attrib & PLL_ATTRIB_ARCH_AVX2 && PLL_STAT(avx2_present)) { if (states == 4) { @@ -241,7 +241,7 @@ double pll_core_edge_loglikelihood_ti_4x4(unsigned int sites, unsigned int states_padded = states; #ifdef HAVE_SSE3 - if (attrib & PLL_ATTRIB_ARCH_SSE) + if (attrib & PLL_ATTRIB_ARCH_SSE && PLL_STAT(sse3_present)) { return pll_core_edge_loglikelihood_ti_4x4_sse(sites, rate_cats, @@ -260,7 +260,7 @@ double 
pll_core_edge_loglikelihood_ti_4x4(unsigned int sites, } #endif #ifdef HAVE_AVX - if (attrib & PLL_ATTRIB_ARCH_AVX) + if (attrib & PLL_ATTRIB_ARCH_AVX && PLL_STAT(avx_present)) { return pll_core_edge_loglikelihood_ti_4x4_avx(sites, rate_cats, @@ -279,7 +279,7 @@ double pll_core_edge_loglikelihood_ti_4x4(unsigned int sites, } #endif #ifdef HAVE_AVX2 - if (attrib & PLL_ATTRIB_ARCH_AVX2) + if (attrib & PLL_ATTRIB_ARCH_AVX2 && PLL_STAT(avx2_present)) { return pll_core_edge_loglikelihood_ti_4x4_avx(sites, rate_cats, @@ -445,7 +445,7 @@ double pll_core_edge_loglikelihood_ti(unsigned int states, unsigned int states_padded = states; #ifdef HAVE_SSE3 - if (attrib & PLL_ATTRIB_ARCH_SSE) + if (attrib & PLL_ATTRIB_ARCH_SSE && PLL_STAT(sse3_present)) { if (states == 4) { @@ -488,7 +488,7 @@ double pll_core_edge_loglikelihood_ti(unsigned int states, } #endif #ifdef HAVE_AVX - if (attrib & PLL_ATTRIB_ARCH_AVX) + if (attrib & PLL_ATTRIB_ARCH_AVX && PLL_STAT(avx_present)) { if (states == 4) { @@ -549,7 +549,7 @@ double pll_core_edge_loglikelihood_ti(unsigned int states, } #endif #ifdef HAVE_AVX2 - if (attrib & PLL_ATTRIB_ARCH_AVX2) + if (attrib & PLL_ATTRIB_ARCH_AVX2 && PLL_STAT(avx2_present)) { if (states == 4) { @@ -705,7 +705,7 @@ double pll_core_edge_loglikelihood_ii(unsigned int states, unsigned int states_padded = states; #ifdef HAVE_SSE3 - if (attrib & PLL_ATTRIB_ARCH_SSE) + if (attrib & PLL_ATTRIB_ARCH_SSE && PLL_STAT(sse3_present)) { if (states == 4) { @@ -749,7 +749,7 @@ double pll_core_edge_loglikelihood_ii(unsigned int states, } #endif #ifdef HAVE_AVX - if (attrib & PLL_ATTRIB_ARCH_AVX) + if (attrib & PLL_ATTRIB_ARCH_AVX && PLL_STAT(avx_present)) { if (states == 4) { @@ -793,7 +793,7 @@ double pll_core_edge_loglikelihood_ii(unsigned int states, } #endif #ifdef HAVE_AVX2 - if (attrib & PLL_ATTRIB_ARCH_AVX2) + if (attrib & PLL_ATTRIB_ARCH_AVX2 && PLL_STAT(avx2_present)) { if (states == 4) { diff --git a/src/core_partials.c b/src/core_partials.c index be9534c..e218c06 
100644 --- a/src/core_partials.c +++ b/src/core_partials.c @@ -95,7 +95,7 @@ PLL_EXPORT void pll_core_update_partial_tt(unsigned int states, const double * offset; #ifdef HAVE_SSE3 - if (attrib & PLL_ATTRIB_ARCH_SSE) + if (attrib & PLL_ATTRIB_ARCH_SSE && PLL_STAT(sse3_present)) { if (states == 4) pll_core_update_partial_tt_4x4_sse(sites, @@ -123,7 +123,7 @@ PLL_EXPORT void pll_core_update_partial_tt(unsigned int states, #endif #ifdef HAVE_AVX - if (attrib & PLL_ATTRIB_ARCH_AVX) + if (attrib & PLL_ATTRIB_ARCH_AVX && PLL_STAT(avx_present)) { if (states == 4) pll_core_update_partial_tt_4x4_avx(sites, @@ -150,7 +150,7 @@ PLL_EXPORT void pll_core_update_partial_tt(unsigned int states, } #endif #ifdef HAVE_AVX2 - if (attrib & PLL_ATTRIB_ARCH_AVX2) + if (attrib & PLL_ATTRIB_ARCH_AVX2 && PLL_STAT(avx2_present)) { if (states == 4) pll_core_update_partial_tt_4x4_avx(sites, @@ -222,7 +222,7 @@ PLL_EXPORT void pll_core_update_partial_ti_4x4(unsigned int sites, const double * rmat; #ifdef HAVE_SSE3 - if (attrib & PLL_ATTRIB_ARCH_SSE) + if (attrib & PLL_ATTRIB_ARCH_SSE && PLL_STAT(sse3_present)) { pll_core_update_partial_ti_4x4_sse(sites, rate_cats, @@ -238,7 +238,7 @@ PLL_EXPORT void pll_core_update_partial_ti_4x4(unsigned int sites, } #endif #ifdef HAVE_AVX - if (attrib & PLL_ATTRIB_ARCH_AVX) + if (attrib & PLL_ATTRIB_ARCH_AVX && PLL_STAT(avx_present)) { pll_core_update_partial_ti_4x4_avx(sites, rate_cats, @@ -254,7 +254,7 @@ PLL_EXPORT void pll_core_update_partial_ti_4x4(unsigned int sites, } #endif #ifdef HAVE_AVX2 - if (attrib & PLL_ATTRIB_ARCH_AVX2) + if (attrib & PLL_ATTRIB_ARCH_AVX2 && PLL_STAT(avx2_present)) { pll_core_update_partial_ti_4x4_avx(sites, rate_cats, @@ -375,7 +375,7 @@ PLL_EXPORT void pll_core_update_partial_ti(unsigned int states, const double * rmat; #ifdef HAVE_SSE3 - if ((attrib & PLL_ATTRIB_ARCH_SSE)) + if (attrib & PLL_ATTRIB_ARCH_SSE && PLL_STAT(sse3_present)) { if (states == 4) pll_core_update_partial_ti_4x4_sse(sites, @@ -406,7 +406,7 @@ PLL_EXPORT 
void pll_core_update_partial_ti(unsigned int states, } #endif #ifdef HAVE_AVX - if ((attrib & PLL_ATTRIB_ARCH_AVX)) + if (attrib & PLL_ATTRIB_ARCH_AVX && PLL_STAT(avx_present)) { pll_core_update_partial_ti_avx(states, sites, @@ -425,7 +425,7 @@ PLL_EXPORT void pll_core_update_partial_ti(unsigned int states, } #endif #ifdef HAVE_AVX2 - if ((attrib & PLL_ATTRIB_ARCH_AVX2)) + if (attrib & PLL_ATTRIB_ARCH_AVX2 && PLL_STAT(avx2_present)) { pll_core_update_partial_ti_avx(states, sites, @@ -532,7 +532,7 @@ PLL_EXPORT void pll_core_update_partial_ii(unsigned int states, unsigned int span = states * rate_cats; #ifdef HAVE_SSE3 - if (attrib & PLL_ATTRIB_ARCH_SSE) + if (attrib & PLL_ATTRIB_ARCH_SSE && PLL_STAT(sse3_present)) { pll_core_update_partial_ii_sse(states, sites, @@ -550,7 +550,7 @@ PLL_EXPORT void pll_core_update_partial_ii(unsigned int states, } #endif #ifdef HAVE_AVX - if (attrib & PLL_ATTRIB_ARCH_AVX) + if (attrib & PLL_ATTRIB_ARCH_AVX && PLL_STAT(avx_present)) { pll_core_update_partial_ii_avx(states, sites, @@ -568,20 +568,20 @@ PLL_EXPORT void pll_core_update_partial_ii(unsigned int states, } #endif #ifdef HAVE_AVX2 - if (attrib & PLL_ATTRIB_ARCH_AVX2) + if (attrib & PLL_ATTRIB_ARCH_AVX2 && PLL_STAT(avx2_present)) { pll_core_update_partial_ii_avx2(states, - sites, - rate_cats, - parent_clv, - parent_scaler, - left_clv, - right_clv, - left_matrix, - right_matrix, - left_scaler, - right_scaler, - attrib); + sites, + rate_cats, + parent_clv, + parent_scaler, + left_clv, + right_clv, + left_matrix, + right_matrix, + left_scaler, + right_scaler, + attrib); return; } #endif @@ -733,7 +733,7 @@ PLL_EXPORT void pll_core_create_lookup(unsigned int states, { #ifdef HAVE_SSE3 - if (attrib & PLL_ATTRIB_ARCH_SSE) + if (attrib & PLL_ATTRIB_ARCH_SSE && PLL_STAT(sse3_present)) { if (states == 4) pll_core_create_lookup_4x4_sse(rate_cats, @@ -752,7 +752,7 @@ PLL_EXPORT void pll_core_create_lookup(unsigned int states, } #endif #ifdef HAVE_AVX - if (attrib & PLL_ATTRIB_ARCH_AVX) + 
if (attrib & PLL_ATTRIB_ARCH_AVX && PLL_STAT(avx_present)) { if (states == 4) pll_core_create_lookup_4x4_avx(rate_cats, @@ -771,7 +771,7 @@ PLL_EXPORT void pll_core_create_lookup(unsigned int states, } #endif #ifdef HAVE_AVX2 - if (attrib & PLL_ATTRIB_ARCH_AVX2) + if (attrib & PLL_ATTRIB_ARCH_AVX2 && PLL_STAT(avx2_present)) { if (states == 4) pll_core_create_lookup_4x4_avx(rate_cats, diff --git a/src/core_pmatrix.c b/src/core_pmatrix.c index 739cd64..4fb1e84 100644 --- a/src/core_pmatrix.c +++ b/src/core_pmatrix.c @@ -48,7 +48,7 @@ PLL_EXPORT int pll_core_update_pmatrix(double ** pmatrix, #ifdef HAVE_SSE3 - if (attrib & PLL_ATTRIB_ARCH_SSE) + if (attrib & PLL_ATTRIB_ARCH_SSE && PLL_STAT(sse3_present)) { if (states == 4) { @@ -70,7 +70,7 @@ PLL_EXPORT int pll_core_update_pmatrix(double ** pmatrix, } #endif #ifdef HAVE_AVX - if (attrib & PLL_ATTRIB_ARCH_AVX) + if (attrib & PLL_ATTRIB_ARCH_AVX && PLL_STAT(avx_present)) { if (states == 4) { @@ -106,7 +106,7 @@ PLL_EXPORT int pll_core_update_pmatrix(double ** pmatrix, } #endif #ifdef HAVE_AVX2 - if (attrib & PLL_ATTRIB_ARCH_AVX2) + if (attrib & PLL_ATTRIB_ARCH_AVX2 && PLL_STAT(avx2_present)) { if (states == 4) { diff --git a/src/fast_parsimony.c b/src/fast_parsimony.c index c573884..564f7da 100644 --- a/src/fast_parsimony.c +++ b/src/fast_parsimony.c @@ -245,17 +245,17 @@ static int fill_parsimony_vectors(const pll_partition_t * partition, (bitcount % PLL_BITVECTOR_SIZE != 0); #ifdef HAVE_SSE3 - if (parsimony->attributes & PLL_ATTRIB_ARCH_SSE) + if (parsimony->attributes & PLL_ATTRIB_ARCH_SSE && PLL_STAT(sse3_present)) bitvectors = (bitvectors+3) & 0xFFFFFFFC; #endif #ifdef HAVE_AVX - if (parsimony->attributes & PLL_ATTRIB_ARCH_AVX) + if (parsimony->attributes & PLL_ATTRIB_ARCH_AVX && PLL_STAT(avx_present)) bitvectors = (bitvectors+7) & 0xFFFFFFF8; #endif #ifdef HAVE_AVX2 - if (parsimony->attributes & PLL_ATTRIB_ARCH_AVX2) + if (parsimony->attributes & PLL_ATTRIB_ARCH_AVX2 && PLL_STAT(avx2_present)) bitvectors = 
(bitvectors+7) & 0xFFFFFFF8; #endif @@ -651,17 +651,17 @@ static void fastparsimony_update_vectors_4x4(pll_parsimony_t * parsimony, { op = &(ops[i]); #ifdef HAVE_SSE3 - if (parsimony->attributes & PLL_ATTRIB_ARCH_SSE) + if (parsimony->attributes & PLL_ATTRIB_ARCH_SSE && PLL_STAT(sse3_present)) pll_fastparsimony_update_vector_4x4_sse(parsimony,op); else #endif #ifdef HAVE_AVX - if (parsimony->attributes & PLL_ATTRIB_ARCH_AVX) + if (parsimony->attributes & PLL_ATTRIB_ARCH_AVX && PLL_STAT(avx_present)) pll_fastparsimony_update_vector_4x4_avx(parsimony,op); else #endif #ifdef HAVE_AVX2 - if (parsimony->attributes & PLL_ATTRIB_ARCH_AVX2) + if (parsimony->attributes & PLL_ATTRIB_ARCH_AVX2 && PLL_STAT(avx2_present)) pll_fastparsimony_update_vector_4x4_avx2(parsimony,op); else #endif @@ -680,17 +680,17 @@ static int fastparsimony_update_vectors(pll_parsimony_t * parsimony, { op = &(ops[i]); #ifdef HAVE_SSE3 - if (parsimony->attributes & PLL_ATTRIB_ARCH_SSE) + if (parsimony->attributes & PLL_ATTRIB_ARCH_SSE && PLL_STAT(sse3_present)) pll_fastparsimony_update_vector_sse(parsimony,op); else #endif #ifdef HAVE_AVX - if (parsimony->attributes & PLL_ATTRIB_ARCH_AVX) + if (parsimony->attributes & PLL_ATTRIB_ARCH_AVX && PLL_STAT(avx_present)) pll_fastparsimony_update_vector_avx(parsimony,op); else #endif #ifdef HAVE_AVX2 - if (parsimony->attributes & PLL_ATTRIB_ARCH_AVX2) + if (parsimony->attributes & PLL_ATTRIB_ARCH_AVX2 && PLL_STAT(avx2_present)) pll_fastparsimony_update_vector_avx2(parsimony,op); else #endif @@ -716,19 +716,19 @@ PLL_EXPORT unsigned int pll_fastparsimony_edge_score(const pll_parsimony_t * par if (parsimony->states == 4) { #ifdef HAVE_SSE3 - if (parsimony->attributes & PLL_ATTRIB_ARCH_SSE) + if (parsimony->attributes & PLL_ATTRIB_ARCH_SSE && PLL_STAT(sse3_present)) return pll_fastparsimony_edge_score_4x4_sse(parsimony, node1_score_index, node2_score_index); #endif #ifdef HAVE_AVX - if (parsimony->attributes & PLL_ATTRIB_ARCH_AVX) + if (parsimony->attributes & 
PLL_ATTRIB_ARCH_AVX && PLL_STAT(avx_present)) return pll_fastparsimony_edge_score_4x4_avx(parsimony, node1_score_index, node2_score_index); #endif #ifdef HAVE_AVX2 - if (parsimony->attributes & PLL_ATTRIB_ARCH_AVX2) + if (parsimony->attributes & PLL_ATTRIB_ARCH_AVX2 && PLL_STAT(avx2_present)) return pll_fastparsimony_edge_score_4x4_avx2(parsimony, node1_score_index, node2_score_index); @@ -739,21 +739,21 @@ PLL_EXPORT unsigned int pll_fastparsimony_edge_score(const pll_parsimony_t * par } #ifdef HAVE_SSE3 - if (parsimony->attributes & PLL_ATTRIB_ARCH_SSE) + if (parsimony->attributes & PLL_ATTRIB_ARCH_SSE && PLL_STAT(sse3_present)) return pll_fastparsimony_edge_score_sse(parsimony, node1_score_index, node2_score_index); else #endif #ifdef HAVE_AVX - if (parsimony->attributes & PLL_ATTRIB_ARCH_AVX) + if (parsimony->attributes & PLL_ATTRIB_ARCH_AVX && PLL_STAT(avx_present)) return pll_fastparsimony_edge_score_avx(parsimony, node1_score_index, node2_score_index); else #endif #ifdef HAVE_AVX2 - if (parsimony->attributes & PLL_ATTRIB_ARCH_AVX2) + if (parsimony->attributes & PLL_ATTRIB_ARCH_AVX2 && PLL_STAT(avx2_present)) return pll_fastparsimony_edge_score_avx2(parsimony, node1_score_index, node2_score_index); diff --git a/src/hardware.c b/src/hardware.c new file mode 100644 index 0000000..49d414a --- /dev/null +++ b/src/hardware.c @@ -0,0 +1,121 @@ +/* + Copyright (C) 2017 Tomas Flouri + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License as + published by the Free Software Foundation, either version 3 of the + License, or (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. 
+ + You should have received a copy of the GNU Affero General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. + + Contact: Tomas Flouri <Tomas.Flouri@h-its.org>, + Exelixis Lab, Heidelberg Institute for Theoretical Studies + Schloss-Wolfsbrunnenweg 35, D-69118 Heidelberg, Germany +*/ + +#include "pll.h" + +#ifndef __PPC__ +#define cpuid(f1, f2, a, b, c, d) \ + __asm__ __volatile__ ("cpuid" \ + : "=a" (a), "=b" (b), "=c" (c), "=d" (d) \ + : "a" (f1), "c" (f2)) +#endif + +static void cpu_features_detect(void) +{ + unsigned int a,b,c,d; + + memset(pll_hardware,0,sizeof(pll_hardware_t)); + +#ifdef __PPC__ + pll_hardware->altivec_present = 1; +#else + + cpuid(0,0,a,b,c,d); + unsigned int maxlevel = a & 0xff; + + if (maxlevel >= 1) + { + cpuid(1,0,a,b,c,d); + pll_hardware->mmx_present = (d >> 23) & 1; + pll_hardware->sse_present = (d >> 25) & 1; + pll_hardware->sse2_present = (d >> 26) & 1; + pll_hardware->sse3_present = (c >> 0) & 1; + pll_hardware->ssse3_present = (c >> 9) & 1; + pll_hardware->sse41_present = (c >> 19) & 1; + pll_hardware->sse42_present = (c >> 20) & 1; + pll_hardware->popcnt_present = (c >> 23) & 1; + pll_hardware->avx_present = (c >> 28) & 1; /* NOTE: CPUID bit only; OSXSAVE/XGETBV (OS support) unchecked, avx2 too */ + + if (maxlevel >= 7) + { + cpuid(7,0,a,b,c,d); + pll_hardware->avx2_present = (b >> 5) & 1; + } + } +#endif +} + +static void cpu_features_show(void) +{ + if (!pll_hardware) + { + /* TODO: Add proper error control after we figure out + cross-platform compatibility */ + return; + } + + fprintf(stderr, "CPU features:"); + if (pll_hardware->altivec_present) + fprintf(stderr, " altivec"); + if (pll_hardware->mmx_present) + fprintf(stderr, " mmx"); + if (pll_hardware->sse_present) + fprintf(stderr, " sse"); + if (pll_hardware->sse2_present) + fprintf(stderr, " sse2"); + if (pll_hardware->sse3_present) + fprintf(stderr, " sse3"); + if (pll_hardware->ssse3_present) + fprintf(stderr, " ssse3"); + if (pll_hardware->sse41_present) + fprintf(stderr, " sse4.1"); + if (pll_hardware->sse42_present) + fprintf(stderr, " sse4.2"); + if 
(pll_hardware->popcnt_present) + fprintf(stderr, " popcnt"); + if (pll_hardware->avx_present) + fprintf(stderr, " avx"); + if (pll_hardware->avx2_present) + fprintf(stderr, " avx2"); + fprintf(stderr, "\n"); +} + +PLL_EXPORT int pll_hardware_probe(void) +{ + /* probe cpu features */ + if (!pll_hardware) + { + if (!(pll_hardware = (pll_hardware_t *)calloc(1,sizeof(pll_hardware_t)))) + { + pll_errno = PLL_ERROR_MEM_ALLOC; + snprintf(pll_errmsg, 200, "Unable to allocate enough memory."); + return PLL_FAILURE; + } + } + cpu_features_detect(); + + return PLL_SUCCESS; +} + +PLL_EXPORT void pll_hardware_dump(void) +{ + cpu_features_show(); +} diff --git a/src/init.c b/src/init.c new file mode 100644 index 0000000..90d5bd3 --- /dev/null +++ b/src/init.c @@ -0,0 +1,34 @@ +/* + Copyright (C) 2017 Tomas Flouri + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License as + published by the Free Software Foundation, either version 3 of the + License, or (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. 
+ + Contact: Tomas Flouri <Tomas.Flouri@h-its.org>, + Exelixis Lab, Heidelberg Institute for Theoretical Studies + Schloss-Wolfsbrunnenweg 35, D-69118 Heidelberg, Germany +*/ + +#include "pll.h" + +PLL_EXPORT void pll_init(void) +{ + pll_hardware_probe(); +} + +PLL_EXPORT void pll_fini(void) +{ + if (pll_hardware) + free(pll_hardware); + pll_hardware = NULL; +} diff --git a/src/pll.c b/src/pll.c index d0d64f1..50254aa 100644 --- a/src/pll.c +++ b/src/pll.c @@ -24,6 +24,8 @@ __thread int pll_errno; __thread char pll_errmsg[200] = {0}; +pll_hardware_t * pll_hardware = NULL; + static void dealloc_partition_data(pll_partition_t * partition); static void dealloc_partition_data(pll_partition_t * partition) @@ -240,7 +242,9 @@ static int update_charmap(pll_partition_t * partition, const unsigned int * map) (partition->states_padded * partition->rate_cats); /* for AVX we do not need to reallocate ttlookup as it has fixed size */ - if ((partition->states == 4) && (partition->attributes & PLL_ATTRIB_ARCH_AVX)) + if ((partition->states == 4) && + (partition->attributes & PLL_ATTRIB_ARCH_AVX) && + PLL_STAT(avx_present)) return PLL_SUCCESS; free(partition->ttlookup); @@ -337,7 +341,9 @@ static int create_charmap(pll_partition_t * partition, const unsigned int * user /* dedicated 4x4 function - if AVX is not used we can allocate less space in case not all 16 possible ambiguities are present */ - if ((partition->states == 4) && (partition->attributes & PLL_ATTRIB_ARCH_AVX)) + if ((partition->states == 4) && + (partition->attributes & PLL_ATTRIB_ARCH_AVX) && + PLL_STAT(avx_present)) { partition->ttlookup = pll_aligned_alloc(1024 * partition->rate_cats * sizeof(double), @@ -425,21 +431,21 @@ PLL_EXPORT pll_partition_t * pll_partition_create(unsigned int tips, partition->attributes = attributes; partition->states_padded = states; #ifdef HAVE_SSE3 - if (attributes & PLL_ATTRIB_ARCH_SSE) + if (attributes & PLL_ATTRIB_ARCH_SSE && PLL_STAT(sse3_present)) { partition->alignment = PLL_ALIGNMENT_SSE; 
partition->states_padded = (states+1) & 0xFFFFFFFE; } #endif #ifdef HAVE_AVX - if (attributes & PLL_ATTRIB_ARCH_AVX) + if (attributes & PLL_ATTRIB_ARCH_AVX && PLL_STAT(avx_present)) { partition->alignment = PLL_ALIGNMENT_AVX; partition->states_padded = (states+3) & 0xFFFFFFFC; } #endif #ifdef HAVE_AVX2 - if (attributes & PLL_ATTRIB_ARCH_AVX2) + if (attributes & PLL_ATTRIB_ARCH_AVX2 && PLL_STAT(avx2_present)) { partition->alignment = PLL_ALIGNMENT_AVX; partition->states_padded = (states+3) & 0xFFFFFFFC; diff --git a/src/pll.h b/src/pll.h index b84b746..fe38a76 100644 --- a/src/pll.h +++ b/src/pll.h @@ -49,6 +49,7 @@ #define PLL_MIN(a,b) ((a) < (b) ? (a) : (b)) #define PLL_MAX(a,b) ((a) > (b) ? (a) : (b)) #define PLL_SWAP(x,y) do { __typeof__ (x) _t = x; x = y; y = _t; } while(0) +#define PLL_STAT(x) (pll_hardware && pll_hardware->x) /* constants */ @@ -160,6 +161,24 @@ /* structures and data types */ +typedef struct pll_hardware_s +{ + /* cpu features */ + int altivec_present; + int mmx_present; + int sse_present; + int sse2_present; + int sse3_present; + int ssse3_present; + int sse41_present; + int sse42_present; + int popcnt_present; + int avx_present; + int avx2_present; + + /* TODO: add chip,core,mem info */ +} pll_hardware_t; + typedef struct pll_partition { unsigned int tips; @@ -432,6 +451,7 @@ struct pll_random_data PLL_EXPORT extern __thread int pll_errno; PLL_EXPORT extern __thread char pll_errmsg[200]; +PLL_EXPORT extern pll_hardware_t * pll_hardware; PLL_EXPORT extern const unsigned int pll_map_bin[256]; PLL_EXPORT extern const unsigned int pll_map_nt[256]; @@ -1849,6 +1869,19 @@ PLL_EXPORT extern int pll_initstate_r(unsigned int __seed, PLL_EXPORT extern int pll_setstate_r(char * __statebuf, struct pll_random_data * __buf); +/* functions in hardware.c */ + +PLL_EXPORT int pll_hardware_probe(void); + +PLL_EXPORT void pll_hardware_dump(void); + +/* functions in init.c */ + +PLL_EXPORT void pll_init(void) __attribute__((constructor)); + +PLL_EXPORT void 
pll_fini(void) __attribute__((destructor)); + + #ifdef __cplusplus } /* extern "C" */ #endif