diff --git a/ChangeLog.md b/ChangeLog.md
index 207a1db..ed3e43e 100644
--- a/ChangeLog.md
+++ b/ChangeLog.md
@@ -2,6 +2,40 @@
All notable changes to `libpll` will be documented in this file.
This project adheres to [Semantic Versioning](http://semver.org/).
+## [0.3.0] - 2017-05-15
+### Added
+ - Run-time detection of cpu features
+ - Vectorized (AVX) computation of 20-state transition probability matrices
+ - Faster tip-inner kernels for 20-state models
+ - Improved AVX vectorization of derivatives
+ - Faster PHYLIP parser
+ - Vectorized scaling for 20-state and arbitrary-state models
+ - AVX2 vectorizations for partials, likelihood and derivatives
+ - Unweighted parsimony functions including SSE, AVX and AVX2 vectorizations
+ - Randomized stepwise addition
+ - Portable functions for parsing trees from a C-string
+ - Optional per-rate category scalers to prevent numerical underflows on large
+ trees
+ - Setting of identity matrix if all exponentiations of eigenvalues multiplied
+ by branch length and rate are approximately equal to one
+ - Re-entrant cross-platform pseudo-random number generator
+ - Wrapper tree structures
+ - Custom exporting of tree structures using a callback function
+ - Support for median category rates in discrete gamma model
+
+### Fixed
+ - Derivatives computation
+ - Parsing of branch lengths in newick trees
+ - Invariant sites computation
+ - Multiplication of log-likelihood with pattern weight after scaling term
+ - Added destructors for eliminating memory leaks when tree parsing fails
+ - Sumtable computation when having multiple substitution matrices
+ - Ascertainment bias computation
+ - Per-site log-likelihood computation
+ - Uninitialized values in testing framework
+
+
+
## [0.2.0] - 2016-09-09
### Added
- Methods for ascertainment bias correction (Lewis, Felsenstein, Stamatakis)
diff --git a/configure.ac b/configure.ac
index 98860e4..62ba2db 100644
--- a/configure.ac
+++ b/configure.ac
@@ -2,7 +2,7 @@
# Process this file with autoconf to produce a configure script.
AC_PREREQ([2.63])
-AC_INIT([libpll], [0.2.0], [Tomas.Flouri@h-its.org])
+AC_INIT([libpll], [0.3.0], [Tomas.Flouri@h-its.org])
AM_INIT_AUTOMAKE([subdir-objects])
AC_LANG([C])
AC_CONFIG_SRCDIR([src/pll.c])
@@ -52,37 +52,39 @@ AC_CHECK_FUNCS([asprintf memcpy memset posix_memalign])
have_avx2=no
have_avx=no
-have_sse=no
+have_sse3=no
have_ps2pdf=no
-AX_EXT
-
-if test "x${ax_cv_have_fma3_ext}" = "xyes"; then
- have_avx2=yes
-fi
-
-if test "x${ax_cv_have_avx_ext}" = "xyes"; then
- have_avx=yes
-fi
-
-if test "x${ax_cv_have_sse3_ext}" = "xyes"; then
- have_sse3=yes
-fi
-
-AC_ARG_ENABLE(avx2, AS_HELP_STRING([--disable-avx2], [Build without AVX2/FMA support]))
-AS_IF([test "x${ax_cv_have_fma3_ext}" = "xyes"], [
- have_avx2=yes
-])
-
-AC_ARG_ENABLE(avx, AS_HELP_STRING([--disable-avx], [Build without AVX support]))
-AS_IF([test "x${ax_cv_have_avx_ext}" = "xyes"], [
- have_avx=yes
-])
-
-AC_ARG_ENABLE(sse,i AS_HELP_STRING([--disable-sse],[Build without SSE support]))
-AS_IF([test "x${ax_cv_have_sse3_ext}" = "xyes"], [
- have_sse3=yes
-])
+# Compile-time detection of processor features - now disabled
+#AX_EXT
+#
+#if test "x${ax_cv_have_fma3_ext}" = "xyes"; then
+# have_avx2=yes
+#fi
+#
+#if test "x${ax_cv_have_avx_ext}" = "xyes"; then
+# have_avx=yes
+#fi
+#
+#if test "x${ax_cv_have_sse3_ext}" = "xyes"; then
+# have_sse3=yes
+#fi
+#
+#AC_ARG_ENABLE(avx2, AS_HELP_STRING([--disable-avx2], [Build without AVX2/FMA support]))
+#AS_IF([test "x${ax_cv_have_fma3_ext}" = "xyes"], [
+# have_avx2=yes
+#])
+#
+#AC_ARG_ENABLE(avx, AS_HELP_STRING([--disable-avx], [Build without AVX support]))
+#AS_IF([test "x${ax_cv_have_avx_ext}" = "xyes"], [
+# have_avx=yes
+#])
+#
+#AC_ARG_ENABLE(sse, AS_HELP_STRING([--disable-sse],[Build without SSE support]))
+#AS_IF([test "x${ax_cv_have_sse3_ext}" = "xyes"], [
+# have_sse3=yes
+#])
+#
AC_ARG_ENABLE(pdfman, AS_HELP_STRING([--disable-pdfman], [Disable PDF manual creation]))
AS_IF([test "x$enable_pdfman" != "xno"], [
@@ -94,6 +96,24 @@ AS_IF([test "x$enable_pdfman" != "xno"], [
fi
])
+AC_ARG_ENABLE(sse, AS_HELP_STRING([--disable-sse], [Build without SSE support]))
+AS_IF([test "x$enable_sse" != "xno"], [
+ have_sse3=yes
+ AC_DEFINE([HAVE_SSE3], [1], [Define to 1 to support Streaming SIMD Extensions 3])
+])
+
+AC_ARG_ENABLE(avx, AS_HELP_STRING([--disable-avx], [Build without AVX support]))
+AS_IF([test "x$enable_avx" != "xno"], [
+ have_avx=yes
+ AC_DEFINE([HAVE_AVX], [1], [Define to 1 to support Advanced Vector Extensions])
+])
+
+AC_ARG_ENABLE(avx2, AS_HELP_STRING([--disable-avx2], [Build without AVX2/FMA support]))
+AS_IF([test "x$enable_avx2" != "xno"], [
+ have_avx2=yes
+ AC_DEFINE([HAVE_AVX2], [1], [Define to 1 to support Advanced Vector Extensions 2])
+])
+
AM_CONDITIONAL(HAVE_AVX2, test "x${have_avx2}" = "xyes")
AM_CONDITIONAL(HAVE_AVX, test "x${have_avx}" = "xyes")
AM_CONDITIONAL(HAVE_SSE3, test "x${have_sse3}" = "xyes")
diff --git a/man/libpll.3 b/man/libpll.3
index 43084ee..2ff6344 100644
--- a/man/libpll.3
+++ b/man/libpll.3
@@ -576,5 +576,15 @@ bug releases may not be mentioned):
.TP
.BR v0.2.0\~ "released September 9th, 2016"
First public release.
+.TP
+.BR v0.3.0\~ "released May 15th, 2017"
+Added faster vectorizations for 20-state and arbitrary-state models, unweighted
+parsimony functions, randomized stepwise addition, portable functions for
+parsing trees from C-strings, per-rate category scalers for preventing
+numerical underflows. Modified newick exporting function to accept callbacks
+for custom printing. Fixed derivatives computation, parsing of branch lengths,
+invariant sites computation, log-likelihood computation for cases where we have
+scaling and patterns, ascertainment bias computation, per-site log-likelihood
+computation, memory leaks. Added run-time detection of hardware.
.RE
.LP
diff --git a/src/Makefile.am b/src/Makefile.am
index 7f7ee8b..23c16ed 100644
--- a/src/Makefile.am
+++ b/src/Makefile.am
@@ -36,7 +36,9 @@ lex_rtree.l \
fast_parsimony.c \
stepwise.c \
random.c \
-phylip.c
+phylip.c \
+hardware.c \
+init.c
libpll_la_CFLAGS = $(AM_CFLAGS)
diff --git a/src/core_derivatives.c b/src/core_derivatives.c
index 8c8860b..a5fea48 100644
--- a/src/core_derivatives.c
+++ b/src/core_derivatives.c
@@ -147,7 +147,7 @@ PLL_EXPORT int pll_core_update_sumtable_ii(unsigned int states,
const double * t_freqs;
#ifdef HAVE_SSE3
- if (attrib & PLL_ATTRIB_ARCH_SSE)
+ if (attrib & PLL_ATTRIB_ARCH_SSE && PLL_STAT(sse3_present))
{
return pll_core_update_sumtable_ii_sse(states,
sites,
@@ -164,7 +164,7 @@ PLL_EXPORT int pll_core_update_sumtable_ii(unsigned int states,
}
#endif
#ifdef HAVE_AVX
- if (attrib & PLL_ATTRIB_ARCH_AVX)
+ if (attrib & PLL_ATTRIB_ARCH_AVX && PLL_STAT(avx_present))
{
return pll_core_update_sumtable_ii_avx(states,
sites,
@@ -181,7 +181,7 @@ PLL_EXPORT int pll_core_update_sumtable_ii(unsigned int states,
}
#endif
#ifdef HAVE_AVX2
- if (attrib & PLL_ATTRIB_ARCH_AVX2)
+ if (attrib & PLL_ATTRIB_ARCH_AVX2 && PLL_STAT(avx2_present))
{
return pll_core_update_sumtable_ii_avx2(states,
sites,
@@ -299,7 +299,7 @@ PLL_EXPORT int pll_core_update_sumtable_ti(unsigned int states,
unsigned int states_padded = states;
#ifdef HAVE_SSE3
- if (attrib & PLL_ATTRIB_ARCH_SSE)
+ if (attrib & PLL_ATTRIB_ARCH_SSE && PLL_STAT(sse3_present))
{
return pll_core_update_sumtable_ti_sse(states,
sites,
@@ -316,7 +316,7 @@ PLL_EXPORT int pll_core_update_sumtable_ti(unsigned int states,
}
#endif
#ifdef HAVE_AVX
- if (attrib & PLL_ATTRIB_ARCH_AVX)
+ if (attrib & PLL_ATTRIB_ARCH_AVX && PLL_STAT(avx_present))
{
return pll_core_update_sumtable_ti_avx(states,
sites,
@@ -334,7 +334,7 @@ PLL_EXPORT int pll_core_update_sumtable_ti(unsigned int states,
}
#endif
#ifdef HAVE_AVX2
- if (attrib & PLL_ATTRIB_ARCH_AVX2)
+ if (attrib & PLL_ATTRIB_ARCH_AVX2 && PLL_STAT(avx2_present))
{
return pll_core_update_sumtable_ti_avx2(states,
sites,
@@ -530,14 +530,14 @@ PLL_EXPORT int pll_core_likelihood_derivatives(unsigned int states,
// SSE3 vectorization in missing as of now
#ifdef HAVE_SSE3
- if (attrib & PLL_ATTRIB_ARCH_SSE)
+ if (attrib & PLL_ATTRIB_ARCH_SSE && PLL_STAT(sse3_present))
{
states_padded = (states+1) & 0xFFFFFFFE;
}
#endif
#ifdef HAVE_AVX2
- if (attrib & PLL_ATTRIB_ARCH_AVX2)
+ if (attrib & PLL_ATTRIB_ARCH_AVX2 && PLL_STAT(avx2_present))
{
states_padded = (states+3) & 0xFFFFFFFC;
@@ -558,7 +558,7 @@ PLL_EXPORT int pll_core_likelihood_derivatives(unsigned int states,
else
#endif
#ifdef HAVE_AVX
- if (attrib & PLL_ATTRIB_ARCH_AVX)
+ if (attrib & PLL_ATTRIB_ARCH_AVX && PLL_STAT(avx_present))
{
states_padded = (states+3) & 0xFFFFFFFC;
diff --git a/src/core_likelihood.c b/src/core_likelihood.c
index b2c632a..392bdc0 100644
--- a/src/core_likelihood.c
+++ b/src/core_likelihood.c
@@ -48,7 +48,7 @@ PLL_EXPORT double pll_core_root_loglikelihood(unsigned int states,
unsigned int states_padded = states;
#ifdef HAVE_SSE3
- if (attrib & PLL_ATTRIB_ARCH_SSE)
+ if (attrib & PLL_ATTRIB_ARCH_SSE && PLL_STAT(sse3_present))
{
if (states == 4)
{
@@ -85,7 +85,7 @@ PLL_EXPORT double pll_core_root_loglikelihood(unsigned int states,
}
#endif
#ifdef HAVE_AVX
- if (attrib & PLL_ATTRIB_ARCH_AVX)
+ if (attrib & PLL_ATTRIB_ARCH_AVX && PLL_STAT(avx_present))
{
if (states == 4)
{
@@ -122,7 +122,7 @@ PLL_EXPORT double pll_core_root_loglikelihood(unsigned int states,
}
#endif
#ifdef HAVE_AVX2
- if (attrib & PLL_ATTRIB_ARCH_AVX2)
+ if (attrib & PLL_ATTRIB_ARCH_AVX2 && PLL_STAT(avx2_present))
{
if (states == 4)
{
@@ -241,7 +241,7 @@ double pll_core_edge_loglikelihood_ti_4x4(unsigned int sites,
unsigned int states_padded = states;
#ifdef HAVE_SSE3
- if (attrib & PLL_ATTRIB_ARCH_SSE)
+ if (attrib & PLL_ATTRIB_ARCH_SSE && PLL_STAT(sse3_present))
{
return pll_core_edge_loglikelihood_ti_4x4_sse(sites,
rate_cats,
@@ -260,7 +260,7 @@ double pll_core_edge_loglikelihood_ti_4x4(unsigned int sites,
}
#endif
#ifdef HAVE_AVX
- if (attrib & PLL_ATTRIB_ARCH_AVX)
+ if (attrib & PLL_ATTRIB_ARCH_AVX && PLL_STAT(avx_present))
{
return pll_core_edge_loglikelihood_ti_4x4_avx(sites,
rate_cats,
@@ -279,7 +279,7 @@ double pll_core_edge_loglikelihood_ti_4x4(unsigned int sites,
}
#endif
#ifdef HAVE_AVX2
- if (attrib & PLL_ATTRIB_ARCH_AVX2)
+ if (attrib & PLL_ATTRIB_ARCH_AVX2 && PLL_STAT(avx2_present))
{
return pll_core_edge_loglikelihood_ti_4x4_avx(sites,
rate_cats,
@@ -445,7 +445,7 @@ double pll_core_edge_loglikelihood_ti(unsigned int states,
unsigned int states_padded = states;
#ifdef HAVE_SSE3
- if (attrib & PLL_ATTRIB_ARCH_SSE)
+ if (attrib & PLL_ATTRIB_ARCH_SSE && PLL_STAT(sse3_present))
{
if (states == 4)
{
@@ -488,7 +488,7 @@ double pll_core_edge_loglikelihood_ti(unsigned int states,
}
#endif
#ifdef HAVE_AVX
- if (attrib & PLL_ATTRIB_ARCH_AVX)
+ if (attrib & PLL_ATTRIB_ARCH_AVX && PLL_STAT(avx_present))
{
if (states == 4)
{
@@ -549,7 +549,7 @@ double pll_core_edge_loglikelihood_ti(unsigned int states,
}
#endif
#ifdef HAVE_AVX2
- if (attrib & PLL_ATTRIB_ARCH_AVX2)
+ if (attrib & PLL_ATTRIB_ARCH_AVX2 && PLL_STAT(avx2_present))
{
if (states == 4)
{
@@ -705,7 +705,7 @@ double pll_core_edge_loglikelihood_ii(unsigned int states,
unsigned int states_padded = states;
#ifdef HAVE_SSE3
- if (attrib & PLL_ATTRIB_ARCH_SSE)
+ if (attrib & PLL_ATTRIB_ARCH_SSE && PLL_STAT(sse3_present))
{
if (states == 4)
{
@@ -749,7 +749,7 @@ double pll_core_edge_loglikelihood_ii(unsigned int states,
}
#endif
#ifdef HAVE_AVX
- if (attrib & PLL_ATTRIB_ARCH_AVX)
+ if (attrib & PLL_ATTRIB_ARCH_AVX && PLL_STAT(avx_present))
{
if (states == 4)
{
@@ -793,7 +793,7 @@ double pll_core_edge_loglikelihood_ii(unsigned int states,
}
#endif
#ifdef HAVE_AVX2
- if (attrib & PLL_ATTRIB_ARCH_AVX2)
+ if (attrib & PLL_ATTRIB_ARCH_AVX2 && PLL_STAT(avx2_present))
{
if (states == 4)
{
diff --git a/src/core_partials.c b/src/core_partials.c
index be9534c..e218c06 100644
--- a/src/core_partials.c
+++ b/src/core_partials.c
@@ -95,7 +95,7 @@ PLL_EXPORT void pll_core_update_partial_tt(unsigned int states,
const double * offset;
#ifdef HAVE_SSE3
- if (attrib & PLL_ATTRIB_ARCH_SSE)
+ if (attrib & PLL_ATTRIB_ARCH_SSE && PLL_STAT(sse3_present))
{
if (states == 4)
pll_core_update_partial_tt_4x4_sse(sites,
@@ -123,7 +123,7 @@ PLL_EXPORT void pll_core_update_partial_tt(unsigned int states,
#endif
#ifdef HAVE_AVX
- if (attrib & PLL_ATTRIB_ARCH_AVX)
+ if (attrib & PLL_ATTRIB_ARCH_AVX && PLL_STAT(avx_present))
{
if (states == 4)
pll_core_update_partial_tt_4x4_avx(sites,
@@ -150,7 +150,7 @@ PLL_EXPORT void pll_core_update_partial_tt(unsigned int states,
}
#endif
#ifdef HAVE_AVX2
- if (attrib & PLL_ATTRIB_ARCH_AVX2)
+ if (attrib & PLL_ATTRIB_ARCH_AVX2 && PLL_STAT(avx2_present))
{
if (states == 4)
pll_core_update_partial_tt_4x4_avx(sites,
@@ -222,7 +222,7 @@ PLL_EXPORT void pll_core_update_partial_ti_4x4(unsigned int sites,
const double * rmat;
#ifdef HAVE_SSE3
- if (attrib & PLL_ATTRIB_ARCH_SSE)
+ if (attrib & PLL_ATTRIB_ARCH_SSE && PLL_STAT(sse3_present))
{
pll_core_update_partial_ti_4x4_sse(sites,
rate_cats,
@@ -238,7 +238,7 @@ PLL_EXPORT void pll_core_update_partial_ti_4x4(unsigned int sites,
}
#endif
#ifdef HAVE_AVX
- if (attrib & PLL_ATTRIB_ARCH_AVX)
+ if (attrib & PLL_ATTRIB_ARCH_AVX && PLL_STAT(avx_present))
{
pll_core_update_partial_ti_4x4_avx(sites,
rate_cats,
@@ -254,7 +254,7 @@ PLL_EXPORT void pll_core_update_partial_ti_4x4(unsigned int sites,
}
#endif
#ifdef HAVE_AVX2
- if (attrib & PLL_ATTRIB_ARCH_AVX2)
+ if (attrib & PLL_ATTRIB_ARCH_AVX2 && PLL_STAT(avx2_present))
{
pll_core_update_partial_ti_4x4_avx(sites,
rate_cats,
@@ -375,7 +375,7 @@ PLL_EXPORT void pll_core_update_partial_ti(unsigned int states,
const double * rmat;
#ifdef HAVE_SSE3
- if ((attrib & PLL_ATTRIB_ARCH_SSE))
+ if (attrib & PLL_ATTRIB_ARCH_SSE && PLL_STAT(sse3_present))
{
if (states == 4)
pll_core_update_partial_ti_4x4_sse(sites,
@@ -406,7 +406,7 @@ PLL_EXPORT void pll_core_update_partial_ti(unsigned int states,
}
#endif
#ifdef HAVE_AVX
- if ((attrib & PLL_ATTRIB_ARCH_AVX))
+ if (attrib & PLL_ATTRIB_ARCH_AVX && PLL_STAT(avx_present))
{
pll_core_update_partial_ti_avx(states,
sites,
@@ -425,7 +425,7 @@ PLL_EXPORT void pll_core_update_partial_ti(unsigned int states,
}
#endif
#ifdef HAVE_AVX2
- if ((attrib & PLL_ATTRIB_ARCH_AVX2))
+ if (attrib & PLL_ATTRIB_ARCH_AVX2 && PLL_STAT(avx2_present))
{
pll_core_update_partial_ti_avx(states,
sites,
@@ -532,7 +532,7 @@ PLL_EXPORT void pll_core_update_partial_ii(unsigned int states,
unsigned int span = states * rate_cats;
#ifdef HAVE_SSE3
- if (attrib & PLL_ATTRIB_ARCH_SSE)
+ if (attrib & PLL_ATTRIB_ARCH_SSE && PLL_STAT(sse3_present))
{
pll_core_update_partial_ii_sse(states,
sites,
@@ -550,7 +550,7 @@ PLL_EXPORT void pll_core_update_partial_ii(unsigned int states,
}
#endif
#ifdef HAVE_AVX
- if (attrib & PLL_ATTRIB_ARCH_AVX)
+ if (attrib & PLL_ATTRIB_ARCH_AVX && PLL_STAT(avx_present))
{
pll_core_update_partial_ii_avx(states,
sites,
@@ -568,20 +568,20 @@ PLL_EXPORT void pll_core_update_partial_ii(unsigned int states,
}
#endif
#ifdef HAVE_AVX2
- if (attrib & PLL_ATTRIB_ARCH_AVX2)
+ if (attrib & PLL_ATTRIB_ARCH_AVX2 && PLL_STAT(avx2_present))
{
pll_core_update_partial_ii_avx2(states,
- sites,
- rate_cats,
- parent_clv,
- parent_scaler,
- left_clv,
- right_clv,
- left_matrix,
- right_matrix,
- left_scaler,
- right_scaler,
- attrib);
+ sites,
+ rate_cats,
+ parent_clv,
+ parent_scaler,
+ left_clv,
+ right_clv,
+ left_matrix,
+ right_matrix,
+ left_scaler,
+ right_scaler,
+ attrib);
return;
}
#endif
@@ -733,7 +733,7 @@ PLL_EXPORT void pll_core_create_lookup(unsigned int states,
{
#ifdef HAVE_SSE3
- if (attrib & PLL_ATTRIB_ARCH_SSE)
+ if (attrib & PLL_ATTRIB_ARCH_SSE && PLL_STAT(sse3_present))
{
if (states == 4)
pll_core_create_lookup_4x4_sse(rate_cats,
@@ -752,7 +752,7 @@ PLL_EXPORT void pll_core_create_lookup(unsigned int states,
}
#endif
#ifdef HAVE_AVX
- if (attrib & PLL_ATTRIB_ARCH_AVX)
+ if (attrib & PLL_ATTRIB_ARCH_AVX && PLL_STAT(avx_present))
{
if (states == 4)
pll_core_create_lookup_4x4_avx(rate_cats,
@@ -771,7 +771,7 @@ PLL_EXPORT void pll_core_create_lookup(unsigned int states,
}
#endif
#ifdef HAVE_AVX2
- if (attrib & PLL_ATTRIB_ARCH_AVX2)
+ if (attrib & PLL_ATTRIB_ARCH_AVX2 && PLL_STAT(avx2_present))
{
if (states == 4)
pll_core_create_lookup_4x4_avx(rate_cats,
diff --git a/src/core_pmatrix.c b/src/core_pmatrix.c
index 739cd64..4fb1e84 100644
--- a/src/core_pmatrix.c
+++ b/src/core_pmatrix.c
@@ -48,7 +48,7 @@ PLL_EXPORT int pll_core_update_pmatrix(double ** pmatrix,
#ifdef HAVE_SSE3
- if (attrib & PLL_ATTRIB_ARCH_SSE)
+ if (attrib & PLL_ATTRIB_ARCH_SSE && PLL_STAT(sse3_present))
{
if (states == 4)
{
@@ -70,7 +70,7 @@ PLL_EXPORT int pll_core_update_pmatrix(double ** pmatrix,
}
#endif
#ifdef HAVE_AVX
- if (attrib & PLL_ATTRIB_ARCH_AVX)
+ if (attrib & PLL_ATTRIB_ARCH_AVX && PLL_STAT(avx_present))
{
if (states == 4)
{
@@ -106,7 +106,7 @@ PLL_EXPORT int pll_core_update_pmatrix(double ** pmatrix,
}
#endif
#ifdef HAVE_AVX2
- if (attrib & PLL_ATTRIB_ARCH_AVX2)
+ if (attrib & PLL_ATTRIB_ARCH_AVX2 && PLL_STAT(avx2_present))
{
if (states == 4)
{
diff --git a/src/fast_parsimony.c b/src/fast_parsimony.c
index c573884..564f7da 100644
--- a/src/fast_parsimony.c
+++ b/src/fast_parsimony.c
@@ -245,17 +245,17 @@ static int fill_parsimony_vectors(const pll_partition_t * partition,
(bitcount % PLL_BITVECTOR_SIZE != 0);
#ifdef HAVE_SSE3
- if (parsimony->attributes & PLL_ATTRIB_ARCH_SSE)
+ if (parsimony->attributes & PLL_ATTRIB_ARCH_SSE && PLL_STAT(sse3_present))
bitvectors = (bitvectors+3) & 0xFFFFFFFC;
#endif
#ifdef HAVE_AVX
- if (parsimony->attributes & PLL_ATTRIB_ARCH_AVX)
+ if (parsimony->attributes & PLL_ATTRIB_ARCH_AVX && PLL_STAT(avx_present))
bitvectors = (bitvectors+7) & 0xFFFFFFF8;
#endif
#ifdef HAVE_AVX2
- if (parsimony->attributes & PLL_ATTRIB_ARCH_AVX2)
+ if (parsimony->attributes & PLL_ATTRIB_ARCH_AVX2 && PLL_STAT(avx2_present))
bitvectors = (bitvectors+7) & 0xFFFFFFF8;
#endif
@@ -651,17 +651,17 @@ static void fastparsimony_update_vectors_4x4(pll_parsimony_t * parsimony,
{
op = &(ops[i]);
#ifdef HAVE_SSE3
- if (parsimony->attributes & PLL_ATTRIB_ARCH_SSE)
+ if (parsimony->attributes & PLL_ATTRIB_ARCH_SSE && PLL_STAT(sse3_present))
pll_fastparsimony_update_vector_4x4_sse(parsimony,op);
else
#endif
#ifdef HAVE_AVX
- if (parsimony->attributes & PLL_ATTRIB_ARCH_AVX)
+ if (parsimony->attributes & PLL_ATTRIB_ARCH_AVX && PLL_STAT(avx_present))
pll_fastparsimony_update_vector_4x4_avx(parsimony,op);
else
#endif
#ifdef HAVE_AVX2
- if (parsimony->attributes & PLL_ATTRIB_ARCH_AVX2)
+ if (parsimony->attributes & PLL_ATTRIB_ARCH_AVX2 && PLL_STAT(avx2_present))
pll_fastparsimony_update_vector_4x4_avx2(parsimony,op);
else
#endif
@@ -680,17 +680,17 @@ static int fastparsimony_update_vectors(pll_parsimony_t * parsimony,
{
op = &(ops[i]);
#ifdef HAVE_SSE3
- if (parsimony->attributes & PLL_ATTRIB_ARCH_SSE)
+ if (parsimony->attributes & PLL_ATTRIB_ARCH_SSE && PLL_STAT(sse3_present))
pll_fastparsimony_update_vector_sse(parsimony,op);
else
#endif
#ifdef HAVE_AVX
- if (parsimony->attributes & PLL_ATTRIB_ARCH_AVX)
+ if (parsimony->attributes & PLL_ATTRIB_ARCH_AVX && PLL_STAT(avx_present))
pll_fastparsimony_update_vector_avx(parsimony,op);
else
#endif
#ifdef HAVE_AVX2
- if (parsimony->attributes & PLL_ATTRIB_ARCH_AVX2)
+ if (parsimony->attributes & PLL_ATTRIB_ARCH_AVX2 && PLL_STAT(avx2_present))
pll_fastparsimony_update_vector_avx2(parsimony,op);
else
#endif
@@ -716,19 +716,19 @@ PLL_EXPORT unsigned int pll_fastparsimony_edge_score(const pll_parsimony_t * par
if (parsimony->states == 4)
{
#ifdef HAVE_SSE3
- if (parsimony->attributes & PLL_ATTRIB_ARCH_SSE)
+ if (parsimony->attributes & PLL_ATTRIB_ARCH_SSE && PLL_STAT(sse3_present))
return pll_fastparsimony_edge_score_4x4_sse(parsimony,
node1_score_index,
node2_score_index);
#endif
#ifdef HAVE_AVX
- if (parsimony->attributes & PLL_ATTRIB_ARCH_AVX)
+ if (parsimony->attributes & PLL_ATTRIB_ARCH_AVX && PLL_STAT(avx_present))
return pll_fastparsimony_edge_score_4x4_avx(parsimony,
node1_score_index,
node2_score_index);
#endif
#ifdef HAVE_AVX2
- if (parsimony->attributes & PLL_ATTRIB_ARCH_AVX2)
+ if (parsimony->attributes & PLL_ATTRIB_ARCH_AVX2 && PLL_STAT(avx2_present))
return pll_fastparsimony_edge_score_4x4_avx2(parsimony,
node1_score_index,
node2_score_index);
@@ -739,21 +739,21 @@ PLL_EXPORT unsigned int pll_fastparsimony_edge_score(const pll_parsimony_t * par
}
#ifdef HAVE_SSE3
- if (parsimony->attributes & PLL_ATTRIB_ARCH_SSE)
+ if (parsimony->attributes & PLL_ATTRIB_ARCH_SSE && PLL_STAT(sse3_present))
return pll_fastparsimony_edge_score_sse(parsimony,
node1_score_index,
node2_score_index);
else
#endif
#ifdef HAVE_AVX
- if (parsimony->attributes & PLL_ATTRIB_ARCH_AVX)
+ if (parsimony->attributes & PLL_ATTRIB_ARCH_AVX && PLL_STAT(avx_present))
return pll_fastparsimony_edge_score_avx(parsimony,
node1_score_index,
node2_score_index);
else
#endif
#ifdef HAVE_AVX2
- if (parsimony->attributes & PLL_ATTRIB_ARCH_AVX2)
+ if (parsimony->attributes & PLL_ATTRIB_ARCH_AVX2 && PLL_STAT(avx2_present))
return pll_fastparsimony_edge_score_avx2(parsimony,
node1_score_index,
node2_score_index);
diff --git a/src/hardware.c b/src/hardware.c
new file mode 100644
index 0000000..49d414a
--- /dev/null
+++ b/src/hardware.c
@@ -0,0 +1,121 @@
+/*
+ Copyright (C) 2017 Tomas Flouri
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU Affero General Public License as
+ published by the Free Software Foundation, either version 3 of the
+ License, or (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU Affero General Public License for more details.
+
+ You should have received a copy of the GNU Affero General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
+
+ Contact: Tomas Flouri <Tomas.Flouri@h-its.org>,
+ Exelixis Lab, Heidelberg Institute for Theoretical Studies
+ Schloss-Wolfsbrunnenweg 35, D-69118 Heidelberg, Germany
+*/
+
+#include "pll.h"
+/* x86 cpuid: leaf f1 (eax), subleaf f2 (ecx) -> a,b,c,d (eax,ebx,ecx,edx) */
+#ifndef __PPC__
+#define cpuid(f1, f2, a, b, c, d)                                \
+  __asm__ __volatile__ ("cpuid"                                  \
+                        : "=a" (a), "=b" (b), "=c" (c), "=d" (d) \
+                        : "a" (f1), "c" (f2))
+#endif
+/* Store the CPU features usable by this process in the allocated *pll_hardware */
+static void cpu_features_detect(void)
+{
+  unsigned int a,b,c,d,xcr0 = 0;
+
+  memset(pll_hardware,0,sizeof(pll_hardware_t));
+#ifdef __PPC__
+  pll_hardware->altivec_present = 1;
+#else
+  cpuid(0,0,a,b,c,d);
+  unsigned int maxlevel = a & 0xff;
+  if (maxlevel >= 1)
+  {
+    cpuid(1,0,a,b,c,d);
+    pll_hardware->mmx_present    = (d >> 23) & 1;
+    pll_hardware->sse_present    = (d >> 25) & 1;
+    pll_hardware->sse2_present   = (d >> 26) & 1;
+    pll_hardware->sse3_present   = (c >>  0) & 1;
+    pll_hardware->ssse3_present  = (c >>  9) & 1;
+    pll_hardware->sse41_present  = (c >> 19) & 1;
+    pll_hardware->sse42_present  = (c >> 20) & 1;
+    pll_hardware->popcnt_present = (c >> 23) & 1;
+    /* AVX/AVX2 need OS support for the YMM state: OSXSAVE (ecx bit 27) must
+       be set and XCR0 must have bits 1 (SSE) and 2 (AVX) enabled */
+    if ((c >> 27) & 1)
+      __asm__ __volatile__ ("xgetbv" : "=a" (xcr0) : "c" (0) : "edx");
+    pll_hardware->avx_present = ((c >> 28) & 1) && ((xcr0 & 6) == 6);
+    if (maxlevel >= 7)
+    {
+      cpuid(7,0,a,b,c,d);
+      pll_hardware->avx2_present = ((b >> 5) & 1) && pll_hardware->avx_present;
+    }
+  }
+#endif
+}
+/* Print the list of detected CPU features as a single line on stderr */
+static void cpu_features_show(void)
+{
+  if (!pll_hardware)
+  {
+    /* TODO: Add proper error control after we figure out
+       cross-platform compatibility */
+    return;
+  }
+
+  /* labels carry their leading separator and are listed in printing order */
+  const struct { const char * label; int present; } features[] =
+  {
+    { " altivec", pll_hardware->altivec_present },
+    { " mmx",     pll_hardware->mmx_present     },
+    { " sse",     pll_hardware->sse_present     },
+    { " sse2",    pll_hardware->sse2_present    },
+    { " sse3",    pll_hardware->sse3_present    },
+    { " ssse3",   pll_hardware->ssse3_present   },
+    { " sse4.1",  pll_hardware->sse41_present   },
+    { " sse4.2",  pll_hardware->sse42_present   },
+    { " popcnt",  pll_hardware->popcnt_present  },
+    { " avx",     pll_hardware->avx_present     },
+    { " avx2",    pll_hardware->avx2_present    }
+  };
+  unsigned int i;
+
+  fprintf(stderr, "CPU features:");
+  for (i = 0; i < sizeof(features)/sizeof(features[0]); ++i)
+  {
+    if (features[i].present)
+      fprintf(stderr, "%s", features[i].label);
+  }
+  fprintf(stderr, "\n");
+}
+/* Detect the CPU features; returns PLL_SUCCESS, or PLL_FAILURE if out of memory */
+PLL_EXPORT int pll_hardware_probe(void)
+{
+  /* the descriptor is allocated on first use and refreshed on later calls */
+  if (pll_hardware == NULL)
+  {
+    pll_hardware = (pll_hardware_t *)calloc(1,sizeof(pll_hardware_t));
+    if (pll_hardware == NULL)
+    {
+      pll_errno = PLL_ERROR_MEM_ALLOC;
+      snprintf(pll_errmsg, 200, "Unable to allocate enough memory.");
+      return PLL_FAILURE;
+    }
+  }
+  cpu_features_detect();
+  return PLL_SUCCESS;
+}
+/* Public entry point: print the detected CPU features to stderr */
+PLL_EXPORT void pll_hardware_dump(void)
+{
+  cpu_features_show();
+}
diff --git a/src/init.c b/src/init.c
new file mode 100644
index 0000000..90d5bd3
--- /dev/null
+++ b/src/init.c
@@ -0,0 +1,34 @@
+/*
+ Copyright (C) 2017 Tomas Flouri
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU Affero General Public License as
+ published by the Free Software Foundation, either version 3 of the
+ License, or (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU Affero General Public License for more details.
+
+ You should have received a copy of the GNU Affero General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
+
+ Contact: Tomas Flouri <Tomas.Flouri@h-its.org>,
+ Exelixis Lab, Heidelberg Institute for Theoretical Studies
+ Schloss-Wolfsbrunnenweg 35, D-69118 Heidelberg, Germany
+*/
+
+#include "pll.h"
+/* Library constructor: probes the CPU features before any kernel is selected */
+PLL_EXPORT void pll_init(void)
+{
+  (void)pll_hardware_probe(); /* on failure pll_hardware stays NULL: PLL_STAT() is 0 */
+}
+/* Library destructor: releases the descriptor allocated by pll_hardware_probe() */
+PLL_EXPORT void pll_fini(void)
+{
+  /* free(NULL) is a no-op, so no check whether the probe ever succeeded */
+  free(pll_hardware);
+  pll_hardware = NULL;
+}
diff --git a/src/pll.c b/src/pll.c
index d0d64f1..50254aa 100644
--- a/src/pll.c
+++ b/src/pll.c
@@ -24,6 +24,8 @@
__thread int pll_errno;
__thread char pll_errmsg[200] = {0};
+pll_hardware_t * pll_hardware = NULL;
+
static void dealloc_partition_data(pll_partition_t * partition);
static void dealloc_partition_data(pll_partition_t * partition)
@@ -240,7 +242,9 @@ static int update_charmap(pll_partition_t * partition, const unsigned int * map)
(partition->states_padded * partition->rate_cats);
/* for AVX we do not need to reallocate ttlookup as it has fixed size */
- if ((partition->states == 4) && (partition->attributes & PLL_ATTRIB_ARCH_AVX))
+ if ((partition->states == 4) &&
+ (partition->attributes & PLL_ATTRIB_ARCH_AVX) &&
+ PLL_STAT(avx_present))
return PLL_SUCCESS;
free(partition->ttlookup);
@@ -337,7 +341,9 @@ static int create_charmap(pll_partition_t * partition, const unsigned int * user
/* dedicated 4x4 function - if AVX is not used we can allocate less space
in case not all 16 possible ambiguities are present */
- if ((partition->states == 4) && (partition->attributes & PLL_ATTRIB_ARCH_AVX))
+ if ((partition->states == 4) &&
+ (partition->attributes & PLL_ATTRIB_ARCH_AVX) &&
+ PLL_STAT(avx_present))
{
partition->ttlookup = pll_aligned_alloc(1024 * partition->rate_cats *
sizeof(double),
@@ -425,21 +431,21 @@ PLL_EXPORT pll_partition_t * pll_partition_create(unsigned int tips,
partition->attributes = attributes;
partition->states_padded = states;
#ifdef HAVE_SSE3
- if (attributes & PLL_ATTRIB_ARCH_SSE)
+ if (attributes & PLL_ATTRIB_ARCH_SSE && PLL_STAT(sse3_present))
{
partition->alignment = PLL_ALIGNMENT_SSE;
partition->states_padded = (states+1) & 0xFFFFFFFE;
}
#endif
#ifdef HAVE_AVX
- if (attributes & PLL_ATTRIB_ARCH_AVX)
+ if (attributes & PLL_ATTRIB_ARCH_AVX && PLL_STAT(avx_present))
{
partition->alignment = PLL_ALIGNMENT_AVX;
partition->states_padded = (states+3) & 0xFFFFFFFC;
}
#endif
#ifdef HAVE_AVX2
- if (attributes & PLL_ATTRIB_ARCH_AVX2)
+ if (attributes & PLL_ATTRIB_ARCH_AVX2 && PLL_STAT(avx2_present))
{
partition->alignment = PLL_ALIGNMENT_AVX;
partition->states_padded = (states+3) & 0xFFFFFFFC;
diff --git a/src/pll.h b/src/pll.h
index b84b746..fe38a76 100644
--- a/src/pll.h
+++ b/src/pll.h
@@ -49,6 +49,7 @@
#define PLL_MIN(a,b) ((a) < (b) ? (a) : (b))
#define PLL_MAX(a,b) ((a) > (b) ? (a) : (b))
#define PLL_SWAP(x,y) do { __typeof__ (x) _t = x; x = y; y = _t; } while(0)
+#define PLL_STAT(x) ((pll_hardware != NULL) && (pll_hardware->x != 0)) /* 1 iff probed and x set */
/* constants */
@@ -160,6 +161,24 @@
/* structures and data types */
+typedef struct pll_hardware_s
+{
+ /* cpu features: 1 if detected by pll_hardware_probe(), 0 otherwise */
+ int altivec_present;  /* set when compiled for PowerPC (__PPC__) */
+ int mmx_present;      /* from CPUID leaf 1, EDX bit 23 */
+ int sse_present;      /* from CPUID leaf 1, EDX bit 25 */
+ int sse2_present;     /* from CPUID leaf 1, EDX bit 26 */
+ int sse3_present;     /* from CPUID leaf 1, ECX bit 0 */
+ int ssse3_present;    /* from CPUID leaf 1, ECX bit 9 */
+ int sse41_present;    /* from CPUID leaf 1, ECX bit 19 */
+ int sse42_present;    /* from CPUID leaf 1, ECX bit 20 */
+ int popcnt_present;   /* from CPUID leaf 1, ECX bit 23 */
+ int avx_present;      /* based on CPUID leaf 1, ECX bit 28 */
+ int avx2_present;     /* based on CPUID leaf 7, EBX bit 5 */
+
+ /* TODO: add chip,core,mem info */
+} pll_hardware_t;
+
typedef struct pll_partition
{
unsigned int tips;
@@ -432,6 +451,7 @@ struct pll_random_data
PLL_EXPORT extern __thread int pll_errno;
PLL_EXPORT extern __thread char pll_errmsg[200];
+PLL_EXPORT extern pll_hardware_t * pll_hardware;
PLL_EXPORT extern const unsigned int pll_map_bin[256];
PLL_EXPORT extern const unsigned int pll_map_nt[256];
@@ -1849,6 +1869,19 @@ PLL_EXPORT extern int pll_initstate_r(unsigned int __seed,
PLL_EXPORT extern int pll_setstate_r(char * __statebuf,
struct pll_random_data * __buf);
+/* functions in hardware.c */
+
+/* detect the CPU features into pll_hardware; PLL_SUCCESS or PLL_FAILURE */
+PLL_EXPORT int pll_hardware_probe(void);
+/* print the detected CPU features to stderr */
+PLL_EXPORT void pll_hardware_dump(void);
+
+/* functions in init.c */
+/* declared as constructor: calls pll_hardware_probe() */
+PLL_EXPORT void pll_init(void) __attribute__((constructor));
+/* declared as destructor: frees pll_hardware and resets it to NULL */
+PLL_EXPORT void pll_fini(void) __attribute__((destructor));
+
#ifdef __cplusplus
} /* extern "C" */
#endif