From 790a201cd229e589a71c1f482449ca6628fd2730 Mon Sep 17 00:00:00 2001 From: kozlov Date: Wed, 26 Apr 2017 02:43:18 +0200 Subject: [PATCH] implement per-rate scaling in SSE and CPU kernels (DNA only) --- src/core_derivatives.c | 99 ++++++++++++++++++- src/core_derivatives_avx.c | 98 ++++++++++--------- src/core_derivatives_sse.c | 105 +++++++++++++++++++- src/core_likelihood.c | 137 ++++++++++++++++++++++---- src/core_likelihood_avx.c | 141 ++++++++++++++------------- src/core_likelihood_sse.c | 133 ++++++++++++++++++++++--- src/core_partials.c | 146 ++++++++++++++++++++++------ src/core_partials_sse.c | 194 ++++++++++++++++++++++++++----------- src/pll.h | 42 +++++--- 9 files changed, 846 insertions(+), 249 deletions(-) diff --git a/src/core_derivatives.c b/src/core_derivatives.c index 3f962e6..8f9d322 100644 --- a/src/core_derivatives.c +++ b/src/core_derivatives.c @@ -19,12 +19,14 @@ Schloss-Wolfsbrunnenweg 35, D-69118 Heidelberg, Germany */ +#include #include "pll.h" PLL_EXPORT int pll_core_update_sumtable_ti_4x4(unsigned int sites, unsigned int rate_cats, const double * parent_clv, const unsigned char * left_tipchars, + const unsigned int * parent_scaler, double ** eigenvecs, double ** inv_eigenvecs, double ** freqs, @@ -45,9 +47,46 @@ PLL_EXPORT int pll_core_update_sumtable_ti_4x4(unsigned int sites, unsigned int states = 4; + unsigned int min_scaler; + unsigned int * rate_scalings = NULL; + int per_rate_scaling = (attrib & PLL_ATTRIB_RATE_SCALERS) ? 1 : 0; + + /* powers of scale threshold for undoing the scaling */ + double scale_minlh[PLL_SCALE_RATE_MAXDIFF]; + if (per_rate_scaling) + { + rate_scalings = (unsigned int*) calloc(rate_cats, sizeof(unsigned int)); + + double scale_factor = 1.0; + for (i = 0; i < PLL_SCALE_RATE_MAXDIFF; ++i) + { + scale_factor *= PLL_SCALE_THRESHOLD; + scale_minlh[i] = scale_factor; + } + } + /* build sumtable */ for (n = 0; n < sites; n++) { + if (per_rate_scaling) + { + /* compute minimum per-rate scaler -> common per-site scaler */ + min_scaler = UINT_MAX; + for (i = 0; i < rate_cats; ++i) + { + rate_scalings[i] = (parent_scaler) ? parent_scaler[n*rate_cats+i] : 0; + if (rate_scalings[i] < min_scaler) + min_scaler = rate_scalings[i]; + } + + /* compute relative capped per-rate scalers */ + for (i = 0; i < rate_cats; ++i) + { + rate_scalings[i] = PLL_MIN(rate_scalings[i] - min_scaler, + PLL_SCALE_RATE_MAXDIFF); + } + } + for (i = 0; i < rate_cats; ++i) { t_eigenvecs = eigenvecs[i]; @@ -67,6 +106,9 @@ PLL_EXPORT int pll_core_update_sumtable_ti_4x4(unsigned int sites, tipstate >>= 1; } sum[j] = lefterm * righterm; + + if (rate_scalings && rate_scalings[i] > 0) + sum[j] *= scale_minlh[rate_scalings[i]-1]; } t_clvc += states; @@ -74,6 +116,9 @@ PLL_EXPORT int pll_core_update_sumtable_ti_4x4(unsigned int sites, } } + if (rate_scalings) + free(rate_scalings); + return PLL_SUCCESS; } @@ -109,10 +154,13 @@ PLL_EXPORT int pll_core_update_sumtable_ii(unsigned int states, rate_cats, parent_clv, child_clv, + parent_scaler, + child_scaler, eigenvecs, inv_eigenvecs, freqs, - sumtable); + sumtable, + attrib); } #endif #ifdef HAVE_AVX @@ -150,9 +198,47 @@ PLL_EXPORT int pll_core_update_sumtable_ii(unsigned int states, } #endif + unsigned int min_scaler; + unsigned int * rate_scalings = NULL; + int per_rate_scaling = (attrib & PLL_ATTRIB_RATE_SCALERS) ? 1 : 0; + + /* powers of scale threshold for undoing the scaling */ + double scale_minlh[PLL_SCALE_RATE_MAXDIFF]; + if (per_rate_scaling) + { + rate_scalings = (unsigned int*) calloc(rate_cats, sizeof(unsigned int)); + + double scale_factor = 1.0; + for (i = 0; i < PLL_SCALE_RATE_MAXDIFF; ++i) + { + scale_factor *= PLL_SCALE_THRESHOLD; + scale_minlh[i] = scale_factor; + } + } + /* build sumtable */ for (n = 0; n < sites; n++) { + if (per_rate_scaling) + { + /* compute minimum per-rate scaler -> common per-site scaler */ + min_scaler = UINT_MAX; + for (i = 0; i < rate_cats; ++i) + { + rate_scalings[i] = (parent_scaler) ? parent_scaler[n*rate_cats+i] : 0; + rate_scalings[i] += (child_scaler) ? child_scaler[n*rate_cats+i] : 0; + if (rate_scalings[i] < min_scaler) + min_scaler = rate_scalings[i]; + } + + /* compute relative capped per-rate scalers */ + for (i = 0; i < rate_cats; ++i) + { + rate_scalings[i] = PLL_MIN(rate_scalings[i] - min_scaler, + PLL_SCALE_RATE_MAXDIFF); + } + } + for (i = 0; i < rate_cats; ++i) { t_eigenvecs = eigenvecs[i]; @@ -169,6 +255,9 @@ PLL_EXPORT int pll_core_update_sumtable_ii(unsigned int states, righterm += t_eigenvecs[j * states + k] * t_clvc[k]; } sum[j] = lefterm * righterm; + + if (rate_scalings && rate_scalings[i] > 0) + sum[j] *= scale_minlh[rate_scalings[i]-1]; } t_clvc += states; t_clvp += states; @@ -176,6 +265,9 @@ PLL_EXPORT int pll_core_update_sumtable_ii(unsigned int states, } } + if (rate_scalings) + free(rate_scalings); + return PLL_SUCCESS; } @@ -214,11 +306,13 @@ PLL_EXPORT int pll_core_update_sumtable_ti(unsigned int states, rate_cats, parent_clv, left_tipchars, + parent_scaler, eigenvecs, inv_eigenvecs, freqs, tipmap, - sumtable); + sumtable, + attrib); } #endif #ifdef HAVE_AVX @@ -265,6 +359,7 @@ PLL_EXPORT int pll_core_update_sumtable_ti(unsigned int states, rate_cats, parent_clv, left_tipchars, + parent_scaler, eigenvecs, inv_eigenvecs, freqs, diff --git a/src/core_derivatives_avx.c b/src/core_derivatives_avx.c index 7204f28..8611744 100644 --- a/src/core_derivatives_avx.c +++ b/src/core_derivatives_avx.c @@ -46,22 +46,22 @@ static int core_update_sumtable_ii_4x4_avx(unsigned int sites, unsigned int states = 4; unsigned int min_scaler = 0; - unsigned int * rate_scale_factors = NULL; + unsigned int * rate_scalings = NULL; int per_rate_scaling = (attrib & PLL_ATTRIB_RATE_SCALERS) ? 1 : 0; + /* powers of scale threshold for undoing the scaling */ + __m256d v_scale_minlh[PLL_SCALE_RATE_MAXDIFF]; if (per_rate_scaling) - rate_scale_factors = (unsigned int*) calloc(rate_cats, sizeof(unsigned int)); + { + rate_scalings = (unsigned int*) calloc(rate_cats, sizeof(unsigned int)); - /* powers of scale threshold for undoing the scaling */ - __m256d v_scale_minlh[5] = { - _mm256_set1_pd(1.0), - _mm256_set1_pd(PLL_SCALE_THRESHOLD), - _mm256_set1_pd(PLL_SCALE_THRESHOLD * PLL_SCALE_THRESHOLD), - _mm256_set1_pd(PLL_SCALE_THRESHOLD * PLL_SCALE_THRESHOLD * - PLL_SCALE_THRESHOLD), - _mm256_set1_pd(PLL_SCALE_THRESHOLD * PLL_SCALE_THRESHOLD * - PLL_SCALE_THRESHOLD * PLL_SCALE_THRESHOLD) - }; + double scale_factor = 1.0; + for (i = 0; i < PLL_SCALE_RATE_MAXDIFF; ++i) + { + scale_factor *= PLL_SCALE_THRESHOLD; + v_scale_minlh[i] = _mm256_set1_pd(scale_factor); + } + } /* transposed inv_eigenvecs */ double * tt_inv_eigenvecs = (double *) pll_aligned_alloc ( @@ -95,10 +95,17 @@ static int core_update_sumtable_ii_4x4_avx(unsigned int sites, min_scaler = UINT_MAX; for (i = 0; i < rate_cats; ++i) { - rate_scale_factors[i] = (parent_scaler) ? parent_scaler[n*rate_cats+i] : 0; - rate_scale_factors[i] += (child_scaler) ? child_scaler[n*rate_cats+i] : 0; - if (rate_scale_factors[i] < min_scaler) - min_scaler = rate_scale_factors[i]; + rate_scalings[i] = (parent_scaler) ? parent_scaler[n*rate_cats+i] : 0; + rate_scalings[i] += (child_scaler) ? child_scaler[n*rate_cats+i] : 0; + if (rate_scalings[i] < min_scaler) + min_scaler = rate_scalings[i]; + } + + /* compute relative capped per-rate scalers */ + for (i = 0; i < rate_cats; ++i) + { + rate_scalings[i] = PLL_MIN(rate_scalings[i] - min_scaler, + PLL_SCALE_RATE_MAXDIFF); } } @@ -170,12 +177,9 @@ static int core_update_sumtable_ii_4x4_avx(unsigned int sites, __m256d v_sum = _mm256_mul_pd (v_lefterm_sum, v_righterm_sum); /* apply per-rate scalers */ - if (per_rate_scaling) + if (rate_scalings && rate_scalings[i] > 0) { - int scalings = rate_scale_factors[i] - min_scaler > 4 ? - 4 : (rate_scale_factors[i] - min_scaler); - - v_sum = _mm256_mul_pd(v_sum, v_scale_minlh[scalings]); + v_sum = _mm256_mul_pd(v_sum, v_scale_minlh[rate_scalings[i]-1]); } _mm256_store_pd (sum, v_sum); @@ -188,8 +192,8 @@ static int core_update_sumtable_ii_4x4_avx(unsigned int sites, pll_aligned_free (tt_inv_eigenvecs); - if (rate_scale_factors) - free(rate_scale_factors); + if (rate_scalings) + free(rate_scalings); return PLL_SUCCESS; } @@ -413,22 +417,22 @@ static int core_update_sumtable_ti_4x4_avx(unsigned int sites, const double * t_eigenvecs_trans; unsigned int min_scaler = 0; - unsigned int * rate_scale_factors = NULL; + unsigned int * rate_scalings = NULL; int per_rate_scaling = (attrib & PLL_ATTRIB_RATE_SCALERS) ? 1 : 0; + /* powers of scale threshold for undoing the scaling */ + __m256d v_scale_minlh[PLL_SCALE_RATE_MAXDIFF]; if (per_rate_scaling) - rate_scale_factors = (unsigned int*) calloc(rate_cats, sizeof(unsigned int)); + { + rate_scalings = (unsigned int*) calloc(rate_cats, sizeof(unsigned int)); - /* powers of scale threshold for undoing the scaling */ - __m256d v_scale_minlh[5] = { - _mm256_set1_pd(1.0), - _mm256_set1_pd(PLL_SCALE_THRESHOLD), - _mm256_set1_pd(PLL_SCALE_THRESHOLD * PLL_SCALE_THRESHOLD), - _mm256_set1_pd(PLL_SCALE_THRESHOLD * PLL_SCALE_THRESHOLD * - PLL_SCALE_THRESHOLD), - _mm256_set1_pd(PLL_SCALE_THRESHOLD * PLL_SCALE_THRESHOLD * - PLL_SCALE_THRESHOLD * PLL_SCALE_THRESHOLD) - }; + double scale_factor = 1.0; + for (i = 0; i < PLL_SCALE_RATE_MAXDIFF; ++i) + { + scale_factor *= PLL_SCALE_THRESHOLD; + v_scale_minlh[i] = _mm256_set1_pd(scale_factor); + } + } double * eigenvecs_trans = (double *) pll_aligned_alloc ( (states * states * rate_cats) * sizeof(double), @@ -513,9 +517,16 @@ static int core_update_sumtable_ti_4x4_avx(unsigned int sites, min_scaler = UINT_MAX; for (i = 0; i < rate_cats; ++i) { - rate_scale_factors[i] = (parent_scaler) ? parent_scaler[n*rate_cats+i] : 0; - if (rate_scale_factors[i] < min_scaler) - min_scaler = rate_scale_factors[i]; + rate_scalings[i] = (parent_scaler) ? parent_scaler[n*rate_cats+i] : 0; + if (rate_scalings[i] < min_scaler) + min_scaler = rate_scalings[i]; + } + + /* compute relative capped per-rate scalers */ + for (i = 0; i < rate_cats; ++i) + { + rate_scalings[i] = PLL_MIN(rate_scalings[i] - min_scaler, + PLL_SCALE_RATE_MAXDIFF); } } @@ -541,12 +552,9 @@ static int core_update_sumtable_ti_4x4_avx(unsigned int sites, __m256d v_sum = _mm256_mul_pd(v_lefterm, v_righterm); /* apply per-rate scalers */ - if (per_rate_scaling) + if (rate_scalings && rate_scalings[i] > 0) { - int scalings = rate_scale_factors[i] - min_scaler > 4 ? - 4 : (rate_scale_factors[i] - min_scaler); - - v_sum = _mm256_mul_pd(v_sum, v_scale_minlh[scalings]); + v_sum = _mm256_mul_pd(v_sum, v_scale_minlh[rate_scalings[i]-1]); } _mm256_store_pd(sum, v_sum); @@ -561,8 +569,8 @@ static int core_update_sumtable_ti_4x4_avx(unsigned int sites, pll_aligned_free(eigenvecs_trans); pll_aligned_free(precomp_left); - if (rate_scale_factors) - free(rate_scale_factors); + if (rate_scalings) + free(rate_scalings); return PLL_SUCCESS; } diff --git a/src/core_derivatives_sse.c b/src/core_derivatives_sse.c index b05b684..817c3e1 100644 --- a/src/core_derivatives_sse.c +++ b/src/core_derivatives_sse.c @@ -19,6 +19,7 @@ Schloss-Wolfsbrunnenweg 35, D-69118 Heidelberg, Germany */ +#include #include "pll.h" /* @@ -30,11 +31,13 @@ static int core_update_sumtable_ti_4x4_sse(unsigned int sites, unsigned int rate_cats, const double * parent_clv, const unsigned char * left_tipchars, + const unsigned int * parent_scaler, double ** eigenvecs, double ** inv_eigenvecs, double ** freqs_indices, unsigned int * tipmap, - double * sumtable) + double * sumtable, + unsigned int attrib) { unsigned int i,j,k,n; unsigned int tipstate; @@ -49,10 +52,46 @@ static int core_update_sumtable_ti_4x4_sse(unsigned int sites, double * sum = sumtable; + unsigned int min_scaler; + unsigned int * rate_scalings = NULL; + int per_rate_scaling = (attrib & PLL_ATTRIB_RATE_SCALERS) ? 1 : 0; + + /* powers of scale threshold for undoing the scaling */ + double scale_minlh[PLL_SCALE_RATE_MAXDIFF]; + if (per_rate_scaling) + { + rate_scalings = (unsigned int*) calloc(rate_cats, sizeof(unsigned int)); + + double scale_factor = 1.0; + for (i = 0; i < PLL_SCALE_RATE_MAXDIFF; ++i) + { + scale_factor *= PLL_SCALE_THRESHOLD; + scale_minlh[i] = scale_factor; + } + } /* build sumtable */ for (n = 0; n < sites; n++) { + if (per_rate_scaling) + { + /* compute minimum per-rate scaler -> common per-site scaler */ + min_scaler = UINT_MAX; + for (i = 0; i < rate_cats; ++i) + { + rate_scalings[i] = (parent_scaler) ? parent_scaler[n*rate_cats+i] : 0; + if (rate_scalings[i] < min_scaler) + min_scaler = rate_scalings[i]; + } + + /* compute relative capped per-rate scalers */ + for (i = 0; i < rate_cats; ++i) + { + rate_scalings[i] = PLL_MIN(rate_scalings[i] - min_scaler, + PLL_SCALE_RATE_MAXDIFF); + } + } + for (i = 0; i < rate_cats; ++i) { ev = eigenvecs[i]; @@ -72,12 +111,19 @@ static int core_update_sumtable_ti_4x4_sse(unsigned int sites, tipstate >>= 1; } sum[j] = lterm*rterm; + + if (rate_scalings && rate_scalings[i] > 0) + sum[j] *= scale_minlh[rate_scalings[i]-1]; } clvc += states; sum += states; } } + + if (rate_scalings) + free(rate_scalings); + return PLL_SUCCESS; } @@ -86,10 +132,13 @@ PLL_EXPORT int pll_core_update_sumtable_ii_sse(unsigned int states, unsigned int rate_cats, const double * parent_clv, const double * child_clv, + const unsigned int * parent_scaler, + const unsigned int * child_scaler, double ** eigenvecs, double ** inv_eigenvecs, double ** freqs_indices, - double * sumtable) + double * sumtable, + unsigned int attrib) { unsigned int i, j, k, n; double lterm = 0; @@ -105,9 +154,47 @@ PLL_EXPORT int pll_core_update_sumtable_ii_sse(unsigned int states, unsigned int states_padded = (states+1) & 0xFFFFFFFE; + unsigned int min_scaler; + unsigned int * rate_scalings = NULL; + int per_rate_scaling = (attrib & PLL_ATTRIB_RATE_SCALERS) ? 1 : 0; + + /* powers of scale threshold for undoing the scaling */ + double scale_minlh[PLL_SCALE_RATE_MAXDIFF]; + if (per_rate_scaling) + { + rate_scalings = (unsigned int*) calloc(rate_cats, sizeof(unsigned int)); + + double scale_factor = 1.0; + for (i = 0; i < PLL_SCALE_RATE_MAXDIFF; ++i) + { + scale_factor *= PLL_SCALE_THRESHOLD; + scale_minlh[i] = scale_factor; + } + } + /* build sumtable */ for (n = 0; n < sites; n++) { + if (per_rate_scaling) + { + /* compute minimum per-rate scaler -> common per-site scaler */ + min_scaler = UINT_MAX; + for (i = 0; i < rate_cats; ++i) + { + rate_scalings[i] = (parent_scaler) ? parent_scaler[n*rate_cats+i] : 0; + rate_scalings[i] += (child_scaler) ? child_scaler[n*rate_cats+i] : 0; + if (rate_scalings[i] < min_scaler) + min_scaler = rate_scalings[i]; + } + + /* compute relative capped per-rate scalers */ + for (i = 0; i < rate_cats; ++i) + { + rate_scalings[i] = PLL_MIN(rate_scalings[i] - min_scaler, + PLL_SCALE_RATE_MAXDIFF); + } + } + for (i = 0; i < rate_cats; ++i) { ev = eigenvecs[i]; @@ -126,6 +213,9 @@ PLL_EXPORT int pll_core_update_sumtable_ii_sse(unsigned int states, } sum[j] = lterm*rterm; + + if (rate_scalings && rate_scalings[i] > 0) + sum[j] *= scale_minlh[rate_scalings[i]-1]; } clvc += states_padded; @@ -134,6 +224,9 @@ PLL_EXPORT int pll_core_update_sumtable_ii_sse(unsigned int states, } } + if (rate_scalings) + free(rate_scalings); + return PLL_SUCCESS; } @@ -142,11 +235,13 @@ PLL_EXPORT int pll_core_update_sumtable_ti_sse(unsigned int states, unsigned int rate_cats, const double * parent_clv, const unsigned char * left_tipchars, + const unsigned int * parent_scaler, double ** eigenvecs, double ** inv_eigenvecs, double ** freqs_indices, unsigned int * tipmap, - double * sumtable) + double * sumtable, + unsigned int attrib) { unsigned int i,j,k,n; unsigned int tipstate; @@ -165,11 +260,13 @@ PLL_EXPORT int pll_core_update_sumtable_ti_sse(unsigned int states, rate_cats, parent_clv, left_tipchars, + parent_scaler, eigenvecs, inv_eigenvecs, freqs_indices, tipmap, - sumtable); + sumtable, + attrib); } unsigned int states_padded = (states+1) & 0xFFFFFFFE; diff --git a/src/core_likelihood.c b/src/core_likelihood.c index 3f0eeab..3301553 100644 --- a/src/core_likelihood.c +++ b/src/core_likelihood.c @@ -19,6 +19,7 @@ Schloss-Wolfsbrunnenweg 35, D-69118 Heidelberg, Germany */ +#include #include "pll.h" PLL_EXPORT double pll_core_root_loglikelihood(unsigned int states, @@ -234,7 +235,6 @@ double pll_core_edge_loglikelihood_ti_4x4(unsigned int sites, double terma, terma_r, termb; double site_lk, inv_site_lk; - unsigned int scale_factors; unsigned int cstate; unsigned int states = 4; @@ -255,7 +255,8 @@ double pll_core_edge_loglikelihood_ti_4x4(unsigned int sites, invar_proportion, invar_indices, freqs_indices, - persite_lnl); + persite_lnl, + attrib); } #endif #ifdef HAVE_AVX @@ -297,10 +298,53 @@ double pll_core_edge_loglikelihood_ti_4x4(unsigned int sites, } #endif + unsigned int site_scalings; + unsigned int * rate_scalings = NULL; + int per_rate_scaling = (attrib & PLL_ATTRIB_RATE_SCALERS) ? 1 : 0; + + /* powers of scale threshold for undoing the scaling */ + double scale_minlh[PLL_SCALE_RATE_MAXDIFF]; + if (per_rate_scaling) + { + rate_scalings = (unsigned int*) calloc(rate_cats, sizeof(unsigned int)); + + double scale_factor = 1.0; + for (i = 0; i < PLL_SCALE_RATE_MAXDIFF; ++i) + { + scale_factor *= PLL_SCALE_THRESHOLD; + scale_minlh[i] = scale_factor; + } + } + for (n = 0; n < sites; ++n) { pmat = pmatrix; terma = 0; + + if (per_rate_scaling) + { + /* compute minimum per-rate scaler -> common per-site scaler */ + site_scalings = UINT_MAX; + for (i = 0; i < rate_cats; ++i) + { + rate_scalings[i] = (parent_scaler) ? parent_scaler[n*rate_cats+i] : 0; + if (rate_scalings[i] < site_scalings) + site_scalings = rate_scalings[i]; + } + + /* compute relative capped per-rate scalers */ + for (i = 0; i < rate_cats; ++i) + { + rate_scalings[i] = PLL_MIN(rate_scalings[i] - site_scalings, + PLL_SCALE_RATE_MAXDIFF); + } + } + else + { + /* count number of scaling factors to account for */ + site_scalings = (parent_scaler) ? parent_scaler[n] : 0; + } + for (i = 0; i < rate_cats; ++i) { freqs = frequencies[freqs_indices[i]]; @@ -320,6 +364,12 @@ double pll_core_edge_loglikelihood_ti_4x4(unsigned int sites, pmat += states_padded; } + /* apply per-rate scalers, if necessary */ + if (rate_scalings && rate_scalings[i] > 0) + { + terma_r *= scale_minlh[rate_scalings[i]-1]; + } + /* account for invariant sites */ if (prop_invar > 0) { @@ -337,13 +387,10 @@ double pll_core_edge_loglikelihood_ti_4x4(unsigned int sites, clvp += states_padded; } - /* count number of scaling factors to acount for */ - scale_factors = (parent_scaler) ? parent_scaler[n] : 0; - /* compute site log-likelihood and scale if necessary */ site_lk = log(terma); - if (scale_factors) - site_lk += scale_factors * log(PLL_SCALE_THRESHOLD); + if (site_scalings) + site_lk += site_scalings * log(PLL_SCALE_THRESHOLD); site_lk *= pattern_weights[n]; @@ -355,6 +402,10 @@ double pll_core_edge_loglikelihood_ti_4x4(unsigned int sites, tipchars++; } + + if (rate_scalings) + free(rate_scalings); + return logl; } @@ -410,7 +461,8 @@ double pll_core_edge_loglikelihood_ti(unsigned int states, invar_proportion, invar_indices, freqs_indices, - persite_lnl); + persite_lnl, + attrib); } else { @@ -648,8 +700,6 @@ double pll_core_edge_loglikelihood_ii(unsigned int states, double terma, terma_r, termb; double site_lk, inv_site_lk; - unsigned int scale_factors; - /* TODO: We need states_padded in the AVX/SSE implementations */ unsigned int states_padded = states; @@ -672,7 +722,8 @@ double pll_core_edge_loglikelihood_ii(unsigned int states, invar_proportion, invar_indices, freqs_indices, - persite_lnl); + persite_lnl, + attrib); } else { @@ -786,10 +837,55 @@ double pll_core_edge_loglikelihood_ii(unsigned int states, } #endif + unsigned int site_scalings; + unsigned int * rate_scalings = NULL; + int per_rate_scaling = (attrib & PLL_ATTRIB_RATE_SCALERS) ? 1 : 0; + + /* powers of scale threshold for undoing the scaling */ + double scale_minlh[PLL_SCALE_RATE_MAXDIFF]; + if (per_rate_scaling) + { + rate_scalings = (unsigned int*) calloc(rate_cats, sizeof(unsigned int)); + + double scale_factor = 1.0; + for (i = 0; i < PLL_SCALE_RATE_MAXDIFF; ++i) + { + scale_factor *= PLL_SCALE_THRESHOLD; + scale_minlh[i] = scale_factor; + } + } + for (n = 0; n < sites; ++n) { pmat = pmatrix; terma = 0; + + if (per_rate_scaling) + { + /* compute minimum per-rate scaler -> common per-site scaler */ + site_scalings = UINT_MAX; + for (i = 0; i < rate_cats; ++i) + { + rate_scalings[i] = (parent_scaler) ? parent_scaler[n*rate_cats+i] : 0; + rate_scalings[i] += (child_scaler) ? child_scaler[n*rate_cats+i] : 0; + if (rate_scalings[i] < site_scalings) + site_scalings = rate_scalings[i]; + } + + /* compute relative capped per-rate scalers */ + for (i = 0; i < rate_cats; ++i) + { + rate_scalings[i] = PLL_MIN(rate_scalings[i] - site_scalings, + PLL_SCALE_RATE_MAXDIFF); + } + } + else + { + /* count number of scaling factors to account for */ + site_scalings = (parent_scaler) ? parent_scaler[n] : 0; + site_scalings += (child_scaler) ? child_scaler[n] : 0; + } + for (i = 0; i < rate_cats; ++i) { freqs = frequencies[freqs_indices[i]]; @@ -801,10 +897,17 @@ double pll_core_edge_loglikelihood_ii(unsigned int states, { termb += pmat[k] * clvc[k]; } + terma_r += clvp[j] * freqs[j] * termb; pmat += states_padded; } + /* apply per-rate scalers, if necessary */ + if (rate_scalings && rate_scalings[i] > 0) + { + terma_r *= scale_minlh[rate_scalings[i]-1]; + } + /* account for invariant sites */ prop_invar = invar_proportion ? invar_proportion[freqs_indices[i]] : 0; if (prop_invar > 0) @@ -823,14 +926,10 @@ double pll_core_edge_loglikelihood_ii(unsigned int states, clvc += states_padded; } - /* count number of scaling factors to acount for */ - scale_factors = (parent_scaler) ? parent_scaler[n] : 0; - scale_factors += (child_scaler) ? child_scaler[n] : 0; - /* compute site log-likelihood and scale if necessary */ site_lk = log(terma); - if (scale_factors) - site_lk += scale_factors * log(PLL_SCALE_THRESHOLD); + if (site_scalings) + site_lk += site_scalings * log(PLL_SCALE_THRESHOLD); site_lk *= pattern_weights[n]; @@ -840,5 +939,9 @@ double pll_core_edge_loglikelihood_ii(unsigned int states, logl += site_lk; } + + if (rate_scalings) + free(rate_scalings); + return logl; } diff --git a/src/core_likelihood_avx.c b/src/core_likelihood_avx.c index b930fe1..25a370a 100644 --- a/src/core_likelihood_avx.c +++ b/src/core_likelihood_avx.c @@ -222,23 +222,23 @@ double pll_core_edge_loglikelihood_ti_4x4_avx(unsigned int sites, __m256d xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7; __m256i mask; - unsigned int site_scale_factors; - unsigned int * rate_scale_factors = NULL; - int per_rate_scaling = (attrib & PLL_ATTRIB_RATE_SCALERS) ? 1 : 0; + unsigned int site_scalings; + unsigned int * rate_scalings = NULL; + int per_rate_scaling = parent_scaler && (attrib & PLL_ATTRIB_RATE_SCALERS) ? 1 : 0; /* powers of scale threshold for undoing the scaling */ - __m256d v_scale_minlh[5] = { - _mm256_set1_pd(1.0), - _mm256_set1_pd(PLL_SCALE_THRESHOLD), - _mm256_set1_pd(PLL_SCALE_THRESHOLD * PLL_SCALE_THRESHOLD), - _mm256_set1_pd(PLL_SCALE_THRESHOLD * PLL_SCALE_THRESHOLD * - PLL_SCALE_THRESHOLD), - _mm256_set1_pd(PLL_SCALE_THRESHOLD * PLL_SCALE_THRESHOLD * - PLL_SCALE_THRESHOLD * PLL_SCALE_THRESHOLD) - }; - + double scale_minlh[PLL_SCALE_RATE_MAXDIFF]; if (per_rate_scaling) - rate_scale_factors = (unsigned int*) calloc(rate_cats, sizeof(unsigned int)); + { + rate_scalings = (unsigned int*) calloc(rate_cats, sizeof(unsigned int)); + + double scale_factor = 1.0; + for (i = 0; i < PLL_SCALE_RATE_MAXDIFF; ++i) + { + scale_factor *= PLL_SCALE_THRESHOLD; + scale_minlh[i] = scale_factor; + } + } /* precompute a lookup table of four values per entry (one for each state), for all 16 states (including ambiguities) and for each rate category. */ @@ -319,18 +319,27 @@ double pll_core_edge_loglikelihood_ti_4x4_avx(unsigned int sites, if (per_rate_scaling) { - site_scale_factors = UINT_MAX; + const unsigned int * pscaler = parent_scaler + n*rate_cats; + + /* compute minimum per-rate scaler -> common per-site scaler */ + site_scalings = UINT_MAX; + for (i = 0; i < rate_cats; ++i) + { + if (pscaler[i] < site_scalings) + site_scalings = pscaler[i]; + } + + /* compute relative capped per-rate scalers */ for (i = 0; i < rate_cats; ++i) { - rate_scale_factors[i] = (parent_scaler) ? parent_scaler[n*rate_cats+i] : 0; - if (rate_scale_factors[i] < site_scale_factors) - site_scale_factors = rate_scale_factors[i]; + rate_scalings[i] = PLL_MIN(pscaler[i] - site_scalings, + PLL_SCALE_RATE_MAXDIFF); } } else { /* count number of scaling factors to account for */ - site_scale_factors = (parent_scaler) ? parent_scaler[n] : 0; + site_scalings = (parent_scaler) ? parent_scaler[n] : 0; } for (i = 0; i < rate_cats; ++i) @@ -342,19 +351,16 @@ double pll_core_edge_loglikelihood_ti_4x4_avx(unsigned int sites, xmm2 = _mm256_load_pd(clvp); xmm0 = _mm256_mul_pd(xmm1,xmm2); - /* apply per-rate scalers, if necessary */ - if (per_rate_scaling) - { - int scalings = rate_scale_factors[i] - site_scale_factors > 4 ? - 4 : (rate_scale_factors[i] - site_scale_factors); - - xmm0 = _mm256_mul_pd(xmm0, v_scale_minlh[scalings]); - } - /* add up the elements of xmm0 */ xmm1 = _mm256_hadd_pd(xmm0,xmm0); terma_r = ((double *)&xmm1)[0] + ((double *)&xmm1)[2]; + /* apply per-rate scalers, if necessary */ + if (rate_scalings && rate_scalings[i] > 0) + { + terma_r *= scale_minlh[rate_scalings[i]-1]; + } + if (terma_r > 0.) { /* account for invariant sites */ @@ -380,8 +386,8 @@ double pll_core_edge_loglikelihood_ti_4x4_avx(unsigned int sites, site_lk = log(terma); /* apply per-site scaler, if necessary */ - if (site_scale_factors) - site_lk += site_scale_factors * log(PLL_SCALE_THRESHOLD); + if (site_scalings) + site_lk += site_scalings * log(PLL_SCALE_THRESHOLD); site_lk *= pattern_weights[n]; @@ -393,8 +399,8 @@ double pll_core_edge_loglikelihood_ti_4x4_avx(unsigned int sites, } pll_aligned_free(lookup); - if (rate_scale_factors) - free(rate_scale_factors); + if (rate_scalings) + free(rate_scalings); return logl; } @@ -930,23 +936,23 @@ double pll_core_edge_loglikelihood_ii_4x4_avx(unsigned int sites, __m256d xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6; - unsigned int site_scale_factors; - unsigned int * rate_scale_factors = NULL; + unsigned int site_scalings; + unsigned int * rate_scalings = NULL; int per_rate_scaling = (attrib & PLL_ATTRIB_RATE_SCALERS) ? 1 : 0; /* powers of scale threshold for undoing the scaling */ - __m256d v_scale_minlh[5] = { - _mm256_set1_pd(1.0), - _mm256_set1_pd(PLL_SCALE_THRESHOLD), - _mm256_set1_pd(PLL_SCALE_THRESHOLD * PLL_SCALE_THRESHOLD), - _mm256_set1_pd(PLL_SCALE_THRESHOLD * PLL_SCALE_THRESHOLD * - PLL_SCALE_THRESHOLD), - _mm256_set1_pd(PLL_SCALE_THRESHOLD * PLL_SCALE_THRESHOLD * - PLL_SCALE_THRESHOLD * PLL_SCALE_THRESHOLD) - }; - + double scale_minlh[PLL_SCALE_RATE_MAXDIFF]; if (per_rate_scaling) - rate_scale_factors = (unsigned int*) calloc(rate_cats, sizeof(unsigned int)); + { + rate_scalings = (unsigned int*) calloc(rate_cats, sizeof(unsigned int)); + + double scale_factor = 1.0; + for (i = 0; i < PLL_SCALE_RATE_MAXDIFF; ++i) + { + scale_factor *= PLL_SCALE_THRESHOLD; + scale_minlh[i] = scale_factor; + } + } for (n = 0; n < sites; ++n) { @@ -955,20 +961,28 @@ double pll_core_edge_loglikelihood_ii_4x4_avx(unsigned int sites, if (per_rate_scaling) { - site_scale_factors = UINT_MAX; + /* compute minimum per-rate scaler -> common per-site scaler */ + site_scalings = UINT_MAX; + for (i = 0; i < rate_cats; ++i) + { + rate_scalings[i] = (parent_scaler) ? parent_scaler[n*rate_cats+i] : 0; + rate_scalings[i] += (child_scaler) ? child_scaler[n*rate_cats+i] : 0; + if (rate_scalings[i] < site_scalings) + site_scalings = rate_scalings[i]; + } + + /* compute relative capped per-rate scalers */ for (i = 0; i < rate_cats; ++i) { - rate_scale_factors[i] = (parent_scaler) ? parent_scaler[n*rate_cats+i] : 0; - rate_scale_factors[i] += (child_scaler) ? child_scaler[n*rate_cats+i] : 0; - if (rate_scale_factors[i] < site_scale_factors) - site_scale_factors = rate_scale_factors[i]; + rate_scalings[i] = PLL_MIN(rate_scalings[i] - site_scalings, + PLL_SCALE_RATE_MAXDIFF); } } else { /* count number of scaling factors to account for */ - site_scale_factors = (parent_scaler) ? parent_scaler[n] : 0; - site_scale_factors += (child_scaler) ? child_scaler[n] : 0; + site_scalings = (parent_scaler) ? parent_scaler[n] : 0; + site_scalings += (child_scaler) ? child_scaler[n] : 0; } for (i = 0; i < rate_cats; ++i) @@ -1024,19 +1038,16 @@ double pll_core_edge_loglikelihood_ii_4x4_avx(unsigned int sites, xmm2 = _mm256_load_pd(clvp); xmm0 = _mm256_mul_pd(xmm1,xmm2); - /* apply per-rate scalers, if necessary */ - if (per_rate_scaling) - { - int scalings = rate_scale_factors[i] - site_scale_factors > 4 ? - 4 : (rate_scale_factors[i] - site_scale_factors); - - xmm0 = _mm256_mul_pd(xmm0, v_scale_minlh[scalings]); - } - /* add up the elements of xmm0 */ xmm1 = _mm256_hadd_pd(xmm0,xmm0); terma_r = ((double *)&xmm1)[0] + ((double *)&xmm1)[2]; + /* apply per-rate scalers, if necessary */ + if (rate_scalings && rate_scalings[i] > 0) + { + terma_r *= scale_minlh[rate_scalings[i]-1]; + } + if (terma_r > 0.) { /* account for invariant sites */ @@ -1062,8 +1073,8 @@ double pll_core_edge_loglikelihood_ii_4x4_avx(unsigned int sites, site_lk = log(terma); /* apply per-site scaler, if necessary */ - if (site_scale_factors) - site_lk += site_scale_factors * log(PLL_SCALE_THRESHOLD); + if (site_scalings) + site_lk += site_scalings * log(PLL_SCALE_THRESHOLD); site_lk *= pattern_weights[n]; @@ -1074,8 +1085,8 @@ double pll_core_edge_loglikelihood_ii_4x4_avx(unsigned int sites, logl += site_lk; } - if (rate_scale_factors) - free(rate_scale_factors); + if (rate_scalings) + free(rate_scalings); return logl; } diff --git a/src/core_likelihood_sse.c b/src/core_likelihood_sse.c index ee5574d..eb817b2 100644 --- a/src/core_likelihood_sse.c +++ b/src/core_likelihood_sse.c @@ -19,6 +19,7 @@ Schloss-Wolfsbrunnenweg 35, D-69118 Heidelberg, Germany */ +#include #include "pll.h" PLL_EXPORT double pll_core_root_loglikelihood_sse(unsigned int states, @@ -475,7 +476,8 @@ double pll_core_edge_loglikelihood_ii_4x4_sse(unsigned int sites, const double * invar_proportion, const int * invar_indices, const unsigned int * freqs_indices, - double * persite_lnl) + double * persite_lnl, + unsigned int attrib) { unsigned int n,i; double logl = 0; @@ -489,16 +491,60 @@ double pll_core_edge_loglikelihood_ii_4x4_sse(unsigned int sites, double terma, terma_r; double site_lk, inv_site_lk; - unsigned int scale_factors; unsigned int states = 4; unsigned int states_padded = 4; __m128d xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7; + unsigned int site_scalings; + unsigned int * rate_scalings = NULL; + int per_rate_scaling = (attrib & PLL_ATTRIB_RATE_SCALERS) ? 1 : 0; + + /* powers of scale threshold for undoing the scaling */ + double scale_minlh[PLL_SCALE_RATE_MAXDIFF]; + if (per_rate_scaling) + { + rate_scalings = (unsigned int*) calloc(rate_cats, sizeof(unsigned int)); + + double scale_factor = 1.0; + for (i = 0; i < PLL_SCALE_RATE_MAXDIFF; ++i) + { + scale_factor *= PLL_SCALE_THRESHOLD; + scale_minlh[i] = scale_factor; + } + } + for (n = 0; n < sites; ++n) { pmat = pmatrix; terma = 0; + + if (per_rate_scaling) + { + /* compute minimum per-rate scaler -> common per-site scaler */ + site_scalings = UINT_MAX; + for (i = 0; i < rate_cats; ++i) + { + rate_scalings[i] = (parent_scaler) ? parent_scaler[n*rate_cats+i] : 0; + rate_scalings[i] += (child_scaler) ? child_scaler[n*rate_cats+i] : 0; + if (rate_scalings[i] < site_scalings) + site_scalings = rate_scalings[i]; + } + + /* compute relative capped per-rate scalers */ + for (i = 0; i < rate_cats; ++i) + { + rate_scalings[i] = PLL_MIN(rate_scalings[i] - site_scalings, + PLL_SCALE_RATE_MAXDIFF); + } + } + else + { + /* count number of scaling factors to account for */ + site_scalings = (parent_scaler) ? parent_scaler[n] : 0; + site_scalings += (child_scaler) ? child_scaler[n] : 0; + } + for (i = 0; i < rate_cats; ++i) { freqs = frequencies[freqs_indices[i]]; @@ -585,6 +631,12 @@ double pll_core_edge_loglikelihood_ii_4x4_sse(unsigned int sites, xmm0 = _mm_hadd_pd(xmm6,xmm7); terma_r = ((double *)&xmm0)[0] + ((double *)&xmm0)[1]; + /* apply per-rate scalers, if necessary */ + if (rate_scalings && rate_scalings[i] > 0) + { + terma_r *= scale_minlh[rate_scalings[i]-1]; + } + /* account for invariant sites */ prop_invar = invar_proportion ? invar_proportion[freqs_indices[i]] : 0; if (prop_invar > 0) @@ -603,14 +655,10 @@ double pll_core_edge_loglikelihood_ii_4x4_sse(unsigned int sites, clvc += states_padded; } - /* count number of scaling factors to acount for */ - scale_factors = (parent_scaler) ? parent_scaler[n] : 0; - scale_factors += (child_scaler) ? child_scaler[n] : 0; - - /* compute site log-likelihood and scale if necessary */ + /* compute site log-likelihood and apply per-site scaler if necessary */ site_lk = log(terma); - if (scale_factors) - site_lk += scale_factors * log(PLL_SCALE_THRESHOLD); + if (site_scalings) + site_lk += site_scalings * log(PLL_SCALE_THRESHOLD); site_lk *= pattern_weights[n]; @@ -620,6 +668,10 @@ double pll_core_edge_loglikelihood_ii_4x4_sse(unsigned int sites, logl += site_lk; } + + if (rate_scalings) + free(rate_scalings); + return logl; } @@ -636,7 +688,8 @@ double pll_core_edge_loglikelihood_ti_4x4_sse(unsigned int sites, const double * invar_proportion, const int * invar_indices, const unsigned int * freqs_indices, - double * persite_lnl) + double * persite_lnl, + unsigned int attrib) { unsigned int i,k,n; double logl = 0; @@ -649,7 +702,6 @@ double pll_core_edge_loglikelihood_ti_4x4_sse(unsigned int sites, double terma, terma_r; double site_lk, inv_site_lk; - unsigned int scale_factors; unsigned int cstate; unsigned int states_padded = 4; unsigned int span = rate_cats*states_padded; @@ -657,6 +709,24 @@ double pll_core_edge_loglikelihood_ti_4x4_sse(unsigned int sites, __m128d xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7; __m128d ymm0,ymm1,ymm2,ymm3; + unsigned int site_scalings; + unsigned int * rate_scalings = NULL; + int per_rate_scaling = (attrib & PLL_ATTRIB_RATE_SCALERS) ? 1 : 0; + + /* powers of scale threshold for undoing the scaling */ + double scale_minlh[PLL_SCALE_RATE_MAXDIFF]; + if (per_rate_scaling) + { + rate_scalings = (unsigned int*) calloc(rate_cats, sizeof(unsigned int)); + + double scale_factor = 1.0; + for (i = 0; i < PLL_SCALE_RATE_MAXDIFF; ++i) + { + scale_factor *= PLL_SCALE_THRESHOLD; + scale_minlh[i] = scale_factor; + } + } + /* precompute a lookup table of four values per entry (one for each state), for all 16 states (including ambiguities) and for each rate category. */ double * lookup = pll_aligned_alloc(64*rate_cats*sizeof(double), @@ -763,6 +833,30 @@ double pll_core_edge_loglikelihood_ti_4x4_sse(unsigned int sites, pmat = pmatrix; terma = 0; + if (per_rate_scaling) + { + /* compute minimum per-rate scaler -> common per-site scaler */ + site_scalings = UINT_MAX; + for (i = 0; i < rate_cats; ++i) + { + rate_scalings[i] = (parent_scaler) ? parent_scaler[n*rate_cats+i] : 0; + if (rate_scalings[i] < site_scalings) + site_scalings = rate_scalings[i]; + } + + /* compute relative capped per-rate scalers */ + for (i = 0; i < rate_cats; ++i) + { + rate_scalings[i] = PLL_MIN(rate_scalings[i] - site_scalings, + PLL_SCALE_RATE_MAXDIFF); + } + } + else + { + /* count number of scaling factors to account for */ + site_scalings = (parent_scaler) ? parent_scaler[n] : 0; + } + cstate = tipchars[n]; unsigned int coffset = cstate*span; @@ -787,6 +881,12 @@ double pll_core_edge_loglikelihood_ti_4x4_sse(unsigned int sites, xmm1 = _mm_hadd_pd(xmm4,xmm5); terma_r = ((double *)&xmm1)[0] + ((double *)&xmm1)[1]; + /* apply per-rate scalers, if necessary */ + if (rate_scalings && rate_scalings[i] > 0) + { + terma_r *= scale_minlh[rate_scalings[i]-1]; + } + /* account for invariant sites */ prop_invar = invar_proportion ? invar_proportion[freqs_indices[i]] : 0; if (prop_invar > 0) @@ -805,13 +905,10 @@ double pll_core_edge_loglikelihood_ti_4x4_sse(unsigned int sites, coffset += 4; } - /* count number of scaling factors to acount for */ - scale_factors = (parent_scaler) ? parent_scaler[n] : 0; - /* compute site log-likelihood and scale if necessary */ site_lk = log(terma); - if (scale_factors) - site_lk += scale_factors * log(PLL_SCALE_THRESHOLD); + if (site_scalings) + site_lk += site_scalings * log(PLL_SCALE_THRESHOLD); site_lk *= pattern_weights[n]; @@ -821,6 +918,10 @@ double pll_core_edge_loglikelihood_ti_4x4_sse(unsigned int sites, logl += site_lk; } + pll_aligned_free(lookup); + if (rate_scalings) + free(rate_scalings); + return logl; } diff --git a/src/core_partials.c b/src/core_partials.c index 510b326..2ea686f 100644 --- a/src/core_partials.c +++ b/src/core_partials.c @@ -21,7 +21,7 @@ #include "pll.h" -static void fill_parent_scaler(unsigned int sites, +static void fill_parent_scaler(unsigned int scaler_size, unsigned int * parent_scaler, const unsigned int * left_scaler, const unsigned int * right_scaler) @@ -29,19 +29,19 @@ static void fill_parent_scaler(unsigned int sites, unsigned int i; if (!left_scaler && !right_scaler) - memset(parent_scaler, 0, sizeof(unsigned int) * sites); + memset(parent_scaler, 0, sizeof(unsigned int) * scaler_size); else if (left_scaler && right_scaler) { - memcpy(parent_scaler, left_scaler, sizeof(unsigned int) * sites); - for (i = 0; i < sites; ++i) + memcpy(parent_scaler, left_scaler, sizeof(unsigned int) * scaler_size); + for (i = 0; i < scaler_size; ++i) parent_scaler[i] += right_scaler[i]; } else { if (left_scaler) - memcpy(parent_scaler, left_scaler, sizeof(unsigned int) * sites); + memcpy(parent_scaler, left_scaler, sizeof(unsigned int) * scaler_size); else - memcpy(parent_scaler, right_scaler, sizeof(unsigned int) * sites); + memcpy(parent_scaler, right_scaler, sizeof(unsigned int) * scaler_size); } } @@ -51,15 +51,19 @@ PLL_EXPORT void pll_core_update_partial_tt_4x4(unsigned int sites, unsigned int * parent_scaler, const unsigned char * left_tipchars, const unsigned char * right_tipchars, - const double * lookup) + const double * lookup, + unsigned int attrib) { unsigned int j,k,n; unsigned int states = 4; unsigned int span = states * rate_cats; const double * offset; + size_t scaler_size = (attrib & PLL_ATTRIB_RATE_SCALERS) ? + sites*rate_cats : sites; + if (parent_scaler) - memset(parent_scaler, 0, sizeof(unsigned int) * sites); + memset(parent_scaler, 0, sizeof(unsigned int) * scaler_size); for (n = 0; n < sites; ++n) { @@ -100,7 +104,8 @@ PLL_EXPORT void pll_core_update_partial_tt(unsigned int states, parent_scaler, left_tipchars, right_tipchars, - lookup); + lookup, + attrib); else pll_core_update_partial_tt_sse(states, sites, @@ -110,7 +115,8 @@ PLL_EXPORT void pll_core_update_partial_tt(unsigned int states, left_tipchars, right_tipchars, lookup, - tipmap_size); + tipmap_size, + attrib); return; } @@ -173,9 +179,11 @@ PLL_EXPORT void pll_core_update_partial_tt(unsigned int states, unsigned int span = states * rate_cats; unsigned int log2_maxstates = (unsigned int)ceil(log2(tipmap_size)); + size_t scaler_size = (attrib & PLL_ATTRIB_RATE_SCALERS) ? + sites*rate_cats : sites; if (parent_scaler) - memset(parent_scaler, 0, sizeof(unsigned int) * sites); + memset(parent_scaler, 0, sizeof(unsigned int) * scaler_size); for (n = 0; n < sites; ++n) { @@ -203,10 +211,13 @@ PLL_EXPORT void pll_core_update_partial_ti_4x4(unsigned int sites, unsigned int attrib) { unsigned int states = 4; - unsigned int scaling; unsigned int i,j,k,n; unsigned int span = states * rate_cats; + unsigned int scale_mode; /* 0 = none, 1 = per-site, 2 = per-rate */ + unsigned int site_scale; + unsigned int init_mask; + const double * lmat; const double * rmat; @@ -221,7 +232,8 @@ PLL_EXPORT void pll_core_update_partial_ti_4x4(unsigned int sites, right_clv, left_matrix, right_matrix, - right_scaler); + right_scaler, + attrib); return; } #endif @@ -258,18 +270,33 @@ PLL_EXPORT void pll_core_update_partial_ti_4x4(unsigned int sites, } #endif + /* init scaling-related stuff */ if (parent_scaler) - fill_parent_scaler(sites, parent_scaler, NULL, right_scaler); + { + /* determine the scaling mode and init the vars accordingly */ + scale_mode = (attrib & PLL_ATTRIB_RATE_SCALERS) ? 2 : 1; + init_mask = (scale_mode == 1) ? 1 : 0; + const size_t scaler_size = (scale_mode == 2) ? sites * rate_cats : sites; + + /* update the parent scaler with the scaler of the right child */ + fill_parent_scaler(scaler_size, parent_scaler, NULL, right_scaler); + } + else + { + /* scaling disabled / not required */ + scale_mode = init_mask = 0; + } for (n = 0; n < sites; ++n) { lmat = left_matrix; rmat = right_matrix; - scaling = (parent_scaler) ? 1 : 0; + site_scale = init_mask; for (k = 0; k < rate_cats; ++k) { + unsigned int rate_scale = 1; for (i = 0; i < states; ++i) { double terma = 0; @@ -285,17 +312,35 @@ PLL_EXPORT void pll_core_update_partial_ti_4x4(unsigned int sites, lstate >>= 1; } parent_clv[i] = terma*termb; + + rate_scale &= (parent_clv[i] < PLL_SCALE_THRESHOLD); + lmat += states; rmat += states; + } - scaling = scaling && (parent_clv[i] < PLL_SCALE_THRESHOLD); + /* check if scaling is needed for the current rate category */ + if (scale_mode == 2) + { + /* PER-RATE SCALING: if *all* entries of the *rate* CLV were below + * the threshold then scale (all) entries by PLL_SCALE_FACTOR */ + if (rate_scale) + { + for (i = 0; i < states; ++i) + parent_clv[i] *= PLL_SCALE_FACTOR; + parent_scaler[n*rate_cats + k] += 1; + } } + else + site_scale = site_scale && rate_scale; + parent_clv += states; right_clv += states; } - /* if *all* entries of the site CLV were below the threshold then scale - (all) entries by PLL_SCALE_FACTOR */ - if (scaling) + + /* PER-SITE SCALING: if *all* entries of the *site* CLV were below + * the threshold then scale (all) entries by PLL_SCALE_FACTOR */ + if (site_scale) { parent_clv -= span; for (i = 0; i < span; ++i) @@ -323,6 +368,8 @@ PLL_EXPORT void pll_core_update_partial_ti(unsigned int states, int scaling; unsigned int i,j,k,n; unsigned int span = states * rate_cats; + size_t scaler_size = (attrib & PLL_ATTRIB_RATE_SCALERS) ? + sites*rate_cats : sites; const double * lmat; const double * rmat; @@ -339,7 +386,8 @@ PLL_EXPORT void pll_core_update_partial_ti(unsigned int states, right_clv, left_matrix, right_matrix, - right_scaler); + right_scaler, + attrib); else pll_core_update_partial_ti_sse(states, sites, @@ -352,7 +400,8 @@ PLL_EXPORT void pll_core_update_partial_ti(unsigned int states, right_matrix, right_scaler, tipmap, - tipmap_size); + tipmap_size, + attrib); return; } #endif @@ -411,7 +460,7 @@ PLL_EXPORT void pll_core_update_partial_ti(unsigned int states, } if (parent_scaler) - fill_parent_scaler(sites, parent_scaler, NULL, right_scaler); + fill_parent_scaler(scaler_size, parent_scaler, NULL, right_scaler); for (n = 0; n < sites; ++n) { @@ -472,7 +521,10 @@ PLL_EXPORT void pll_core_update_partial_ii(unsigned int states, unsigned int attrib) { unsigned int i,j,k,n; - unsigned int scaling; + + unsigned int scale_mode; /* 0 = none, 1 = per-site, 2 = per-rate */ + unsigned int site_scale; + unsigned int init_mask; const double * lmat; const double * rmat; @@ -492,7 +544,8 @@ PLL_EXPORT void pll_core_update_partial_ii(unsigned int states, left_matrix, right_matrix, left_scaler, - right_scaler); + right_scaler, + attrib); return; } #endif @@ -533,19 +586,33 @@ PLL_EXPORT void pll_core_update_partial_ii(unsigned int states, } #endif - /* add up the scale vectors of the two children if available */ + /* init scaling-related stuff */ if (parent_scaler) - fill_parent_scaler(sites, parent_scaler, left_scaler, right_scaler); + { + /* determine the scaling mode and init the vars accordingly */ + scale_mode = (attrib & PLL_ATTRIB_RATE_SCALERS) ? 2 : 1; + init_mask = (scale_mode == 1) ? 1 : 0; + const size_t scaler_size = (scale_mode == 2) ? sites * rate_cats : sites; + + /* add up the scale vectors of the two children if available */ + fill_parent_scaler(scaler_size, parent_scaler, left_scaler, right_scaler); + } + else + { + /* scaling disabled / not required */ + scale_mode = init_mask = 0; + } /* compute CLV */ for (n = 0; n < sites; ++n) { lmat = left_matrix; rmat = right_matrix; - scaling = (parent_scaler) ? 1 : 0; + site_scale = init_mask; for (k = 0; k < rate_cats; ++k) { + unsigned int rate_scale = 1; for (i = 0; i < states; ++i) { double terma = 0; @@ -556,18 +623,35 @@ PLL_EXPORT void pll_core_update_partial_ii(unsigned int states, termb += rmat[j] * right_clv[j]; } parent_clv[i] = terma*termb; + + rate_scale &= (parent_clv[i] < PLL_SCALE_THRESHOLD); + lmat += states; rmat += states; + } - scaling = scaling && (parent_clv[i] < PLL_SCALE_THRESHOLD); + /* check if scaling is needed for the current rate category */ + if (scale_mode == 2) + { + /* PER-RATE SCALING: if *all* entries of the *rate* CLV were below + * the threshold then scale (all) entries by PLL_SCALE_FACTOR */ + if (rate_scale) + { + for (i = 0; i < states; ++i) + parent_clv[i] *= PLL_SCALE_FACTOR; + parent_scaler[n*rate_cats + k] += 1; + } } + else + site_scale = site_scale && rate_scale; + parent_clv += states; left_clv += states; right_clv += states; } - /* if *all* entries of the site CLV were below the threshold then scale - (all) entries by PLL_SCALE_FACTOR */ - if (scaling) + /* PER-SITE SCALING: if *all* entries of the *site* CLV were below + * the threshold then scale (all) entries by PLL_SCALE_FACTOR */ + if (site_scale) { parent_clv -= span; for (i = 0; i < span; ++i) diff --git a/src/core_partials_sse.c b/src/core_partials_sse.c index 356e075..f7b41b4 100644 --- a/src/core_partials_sse.c +++ b/src/core_partials_sse.c @@ -21,7 +21,7 @@ #include "pll.h" -static void fill_parent_scaler(unsigned int sites, +static void fill_parent_scaler(unsigned int scaler_size, unsigned int * parent_scaler, const unsigned int * left_scaler, const unsigned int * right_scaler) @@ -29,19 +29,19 @@ static void fill_parent_scaler(unsigned int sites, unsigned int i; if (!left_scaler && !right_scaler) - memset(parent_scaler, 0, sizeof(unsigned int) * sites); + memset(parent_scaler, 0, sizeof(unsigned int) * scaler_size); else if (left_scaler && right_scaler) { - memcpy(parent_scaler, left_scaler, sizeof(unsigned int) * sites); - for (i = 0; i < sites; ++i) + memcpy(parent_scaler, left_scaler, sizeof(unsigned int) * scaler_size); + for (i = 0; i < scaler_size; ++i) parent_scaler[i] += right_scaler[i]; } else { if (left_scaler) - memcpy(parent_scaler, left_scaler, sizeof(unsigned int) * sites); + memcpy(parent_scaler, left_scaler, sizeof(unsigned int) * scaler_size); else - memcpy(parent_scaler, right_scaler, sizeof(unsigned int) * sites); + memcpy(parent_scaler, right_scaler, sizeof(unsigned int) * scaler_size); } } @@ -390,7 +390,8 @@ PLL_EXPORT void pll_core_update_partial_tt_sse(unsigned int states, const unsigned char * left_tipchars, const unsigned char * right_tipchars, const double * lookup, - unsigned int tipstates_count) + unsigned int tipstates_count, + unsigned int attrib) { unsigned int j,k,n; unsigned int log2_maxstates = (unsigned int)ceil(log2(tipstates_count)); @@ -406,7 +407,8 @@ PLL_EXPORT void pll_core_update_partial_tt_sse(unsigned int states, parent_scaler, left_tipchars, right_tipchars, - lookup); + lookup, + attrib); return; } @@ -433,15 +435,19 @@ PLL_EXPORT void pll_core_update_partial_tt_4x4_sse(unsigned int sites, unsigned int * parent_scaler, const unsigned char * left_tipchars, const unsigned char * right_tipchars, - const double * lookup) + const double * lookup, + unsigned int attrib) { unsigned int j,k,n; unsigned int states = 4; unsigned int span = states*rate_cats; const double * offset; + size_t scaler_size = (attrib & PLL_ATTRIB_RATE_SCALERS) ? + sites*rate_cats : sites; + if (parent_scaler) - memset(parent_scaler, 0, sizeof(unsigned int) * sites); + memset(parent_scaler, 0, sizeof(unsigned int) * scaler_size); for (n = 0; n < sites; ++n) { @@ -466,23 +472,39 @@ PLL_EXPORT void pll_core_update_partial_ii_4x4_sse(unsigned int sites, const double * left_matrix, const double * right_matrix, const unsigned int * left_scaler, - const unsigned int * right_scaler) + const unsigned int * right_scaler, + unsigned int attrib) { unsigned int states = 4; + unsigned int span = states * rate_cats; unsigned int n,k,i; - unsigned int scaling; const double * lmat; const double * rmat; - __m128d xmm0,xmm1,xmm2,xmm3,xmm4,xmm5,xmm6,xmm7,xmm8,xmm9; + unsigned int scale_mode; /* 0 = none, 1 = per-site, 2 = per-rate */ + unsigned int scale_mask; + unsigned int init_mask; - unsigned int span = states * rate_cats; + __m128d v_scale_threshold = _mm_set1_pd(PLL_SCALE_THRESHOLD); + __m128d v_scale_factor = _mm_set1_pd(PLL_SCALE_FACTOR); - /* add up the scale vector of the two children if available */ - if (parent_scaler) - fill_parent_scaler(sites, parent_scaler, left_scaler, right_scaler); + __m128d xmm0,xmm1,xmm2,xmm3,xmm4,xmm5,xmm6,xmm7,xmm8,xmm9,xmm10; + if (parent_scaler) + { + /* determine the scaling mode and init the vars accordingly */ + scale_mode = (attrib & PLL_ATTRIB_RATE_SCALERS) ? 2 : 1; + init_mask = (scale_mode == 1) ? 0x3 : 0; + const size_t scaler_size = (scale_mode == 2) ? sites * rate_cats : sites; + /* add up the scale vector of the two children if available */ + fill_parent_scaler(scaler_size, parent_scaler, left_scaler, right_scaler); + } + else + { + /* scaling disabled / not required */ + scale_mode = init_mask = 0; + } /* perform the following matrix multiplications: @@ -509,7 +531,7 @@ PLL_EXPORT void pll_core_update_partial_ii_4x4_sse(unsigned int sites, { lmat = left_matrix; rmat = right_matrix; - scaling = (parent_scaler) ? 1 : 0; + scale_mask = init_mask; for (k = 0; k < rate_cats; ++k) { @@ -565,7 +587,6 @@ PLL_EXPORT void pll_core_update_partial_ii_4x4_sse(unsigned int sites, xmm8 = _mm_hadd_pd(xmm2,xmm6); xmm2 = _mm_hadd_pd(xmm5,xmm7); xmm9 = _mm_mul_pd(xmm8,xmm2); - _mm_store_pd(parent_clv, xmm9); rmat += states; lmat += states; @@ -610,33 +631,50 @@ PLL_EXPORT void pll_core_update_partial_ii_4x4_sse(unsigned int sites, xmm8 = _mm_mul_pd(xmm3,xmm7); xmm7 = _mm_load_pd(rmat+2); - xmm9 = _mm_mul_pd(xmm4,xmm7); + xmm10 = _mm_mul_pd(xmm4,xmm7); /* calculate (b13*d1 + b15*d3 | b14*d2 + b16*d4) */ - xmm7 = _mm_add_pd(xmm8,xmm9); /* needed (4) */ + xmm7 = _mm_add_pd(xmm8,xmm10); /* needed (4) */ rmat += states; lmat += states; xmm8 = _mm_hadd_pd(xmm2,xmm6); xmm2 = _mm_hadd_pd(xmm5,xmm7); - xmm9 = _mm_mul_pd(xmm8,xmm2); - _mm_store_pd(parent_clv+2, xmm9); + xmm10 = _mm_mul_pd(xmm8,xmm2); - for (i = 0; i < states; ++i) - scaling = scaling && (parent_clv[i] < PLL_SCALE_THRESHOLD); + /* check if scaling is needed for the current rate category */ + __m128d v_cmp = _mm_cmplt_pd(xmm9, v_scale_threshold); + unsigned int rate_mask = _mm_movemask_pd(v_cmp); + v_cmp = _mm_cmplt_pd(xmm10, v_scale_threshold); + rate_mask = rate_mask & _mm_movemask_pd(v_cmp); + + if (scale_mode == 2) + { + /* PER-RATE SCALING: if *all* entries of the *rate* CLV were below + * the threshold then scale (all) entries by PLL_SCALE_FACTOR */ + if (rate_mask == 0x3) + { + xmm9 = _mm_mul_pd(xmm9,v_scale_factor); + xmm10 = _mm_mul_pd(xmm10,v_scale_factor); + parent_scaler[n*rate_cats + k] += 1; + } + } + else + scale_mask = scale_mask & rate_mask; + + _mm_store_pd(parent_clv, xmm9); + _mm_store_pd(parent_clv+2, xmm10); parent_clv += states; left_clv += states; right_clv += states; } - /* if *all* entries of the site CLV were below the threshold then scale - (all) entries by PLL_SCALE_FACTOR */ - if (scaling) - { - __m128d v_scale_factor = _mm_set_pd(PLL_SCALE_FACTOR, - PLL_SCALE_FACTOR); + /* PER-SITE SCALING: if *all* entries of the *site* CLV were below + * the threshold then scale (all) entries by PLL_SCALE_FACTOR */ + if (scale_mask == 0x3) + { parent_clv -= span; for (i = 0; i < span; i += 2) { @@ -660,7 +698,8 @@ PLL_EXPORT void pll_core_update_partial_ii_sse(unsigned int states, const double * left_matrix, const double * right_matrix, const unsigned int * left_scaler, - const unsigned int * right_scaler) + const unsigned int * right_scaler, + unsigned int attrib) { unsigned int i,j,k,n; unsigned int scaling; @@ -682,7 +721,8 @@ PLL_EXPORT void pll_core_update_partial_ii_sse(unsigned int states, left_matrix, right_matrix, left_scaler, - right_scaler); + right_scaler, + attrib); return; } @@ -798,19 +838,25 @@ PLL_EXPORT void pll_core_update_partial_ti_4x4_sse(unsigned int sites, const double * right_clv, const double * left_matrix, const double * right_matrix, - const unsigned int * right_scaler) + const unsigned int * right_scaler, + unsigned int attrib) { unsigned int states = 4; - unsigned int scaling; + unsigned int span = states * rate_cats; unsigned int i,k,n; + unsigned int lstate; const double * lmat; const double * rmat; - unsigned int span = states * rate_cats; - unsigned int lstate; + unsigned int scale_mode; /* 0 = none, 1 = per-site, 2 = per-rate */ + unsigned int scale_mask; + unsigned int init_mask; - __m128d xmm0,xmm1,xmm2,xmm3,xmm4,xmm5,xmm6,xmm7; + __m128d v_scale_threshold = _mm_set1_pd(PLL_SCALE_THRESHOLD); + __m128d v_scale_factor = _mm_set1_pd(PLL_SCALE_FACTOR); + + __m128d xmm0,xmm1,xmm2,xmm3,xmm4,xmm5,xmm6,xmm7,xmm8; __m128d ymm0,ymm1,ymm2,ymm3; /* precompute a lookup table of four values per entry (one for each state), @@ -903,16 +949,27 @@ PLL_EXPORT void pll_core_update_partial_ti_4x4_sse(unsigned int sites, } } - /* update the parent scaler with the scaler of the right child */ if (parent_scaler) - fill_parent_scaler(sites, parent_scaler, NULL, right_scaler); + { + /* determine the scaling mode and init the vars accordingly */ + scale_mode = (attrib & PLL_ATTRIB_RATE_SCALERS) ? 2 : 1; + init_mask = (scale_mode == 1) ? 0x3 : 0; + const size_t scaler_size = (scale_mode == 2) ? sites * rate_cats : sites; + /* update the parent scaler with the scaler of the right child */ + fill_parent_scaler(scaler_size, parent_scaler, NULL, right_scaler); + } + else + { + /* scaling disabled / not required */ + scale_mode = init_mask = 0; + } /* iterate over sites and compute CLV entries */ for (n = 0; n < sites; ++n) { rmat = right_matrix; - scaling = (parent_scaler) ? 1 : 0; + scale_mask = init_mask; lstate = left_tipchar[n]; @@ -950,24 +1007,23 @@ PLL_EXPORT void pll_core_update_partial_ti_4x4_sse(unsigned int sites, /* load precomputed lookup table into xmm2 */ xmm2 = _mm_load_pd(lookup+loffset); xmm3 = _mm_mul_pd(xmm4,xmm2); - _mm_store_pd(parent_clv,xmm3); /* load right pmatrix row2 */ xmm2 = _mm_load_pd(rmat); - xmm3 = _mm_load_pd(rmat+2); + xmm8 = _mm_load_pd(rmat+2); xmm4 = _mm_mul_pd(xmm0,xmm2); - xmm5 = _mm_mul_pd(xmm1,xmm3); + xmm5 = _mm_mul_pd(xmm1,xmm8); xmm6 = _mm_add_pd(xmm4,xmm5); /* a1*c1 + a3*c3 | a2*c2 + a4*c4 */ rmat += states; /* load right pmatrix row3 */ xmm2 = _mm_load_pd(rmat); - xmm3 = _mm_load_pd(rmat+2); + xmm8 = _mm_load_pd(rmat+2); xmm4 = _mm_mul_pd(xmm0,xmm2); - xmm5 = _mm_mul_pd(xmm1,xmm3); + xmm5 = _mm_mul_pd(xmm1,xmm8); xmm7 = _mm_add_pd(xmm4,xmm5); /* a5*c1 + a7*c3 | a6*c2 + a8*c4 */ rmat += states; @@ -977,23 +1033,43 @@ PLL_EXPORT void pll_core_update_partial_ti_4x4_sse(unsigned int sites, /* load precomputed lookup table into xmm2 */ xmm2 = _mm_load_pd(lookup+loffset+2); - xmm3 = _mm_mul_pd(xmm4,xmm2); - _mm_store_pd(parent_clv+2,xmm3); + xmm8 = _mm_mul_pd(xmm4,xmm2); + +// for (i = 0; i < states; ++i) +// scaling = scaling && (parent_clv[i] < PLL_SCALE_THRESHOLD); - for (i = 0; i < states; ++i) - scaling = scaling && (parent_clv[i] < PLL_SCALE_THRESHOLD); + /* check if scaling is needed for the current rate category */ + __m128d v_cmp = _mm_cmplt_pd(xmm3, v_scale_threshold); + unsigned int rate_mask = _mm_movemask_pd(v_cmp); + v_cmp = _mm_cmplt_pd(xmm8, v_scale_threshold); + rate_mask = rate_mask & _mm_movemask_pd(v_cmp); + + if (scale_mode == 2) + { + /* PER-RATE SCALING: if *all* entries of the *rate* CLV were below + * the threshold then scale (all) entries by PLL_SCALE_FACTOR */ + if (rate_mask == 0x3) + { + xmm3 = _mm_mul_pd(xmm3,v_scale_factor); + xmm8 = _mm_mul_pd(xmm8,v_scale_factor); + parent_scaler[n*rate_cats + k] += 1; + } + } + else + scale_mask = scale_mask & rate_mask; + + _mm_store_pd(parent_clv,xmm3); + _mm_store_pd(parent_clv+2,xmm8); parent_clv += states; right_clv += states; loffset += 4; } - /* if *all* entries of the site CLV were below the threshold then scale - (all) entries by PLL_SCALE_FACTOR */ - if (scaling) - { - __m128d v_scale_factor = _mm_set_pd(PLL_SCALE_FACTOR, - PLL_SCALE_FACTOR); + /* PER-SITE SCALING: if *all* entries of the *site* CLV were below + * the threshold then scale (all) entries by PLL_SCALE_FACTOR */ + if (scale_mask == 0x3) + { parent_clv -= span; for (i = 0; i < span; i += 2) { @@ -1019,7 +1095,8 @@ PLL_EXPORT void pll_core_update_partial_ti_sse(unsigned int states, const double * right_matrix, const unsigned int * right_scaler, const unsigned int * tipmap, - unsigned int tipmap_size) + unsigned int tipmap_size, + unsigned int attrib) { int scaling; unsigned int i,j,k,n; @@ -1042,7 +1119,8 @@ PLL_EXPORT void pll_core_update_partial_ti_sse(unsigned int states, right_clv, left_matrix, right_matrix, - right_scaler); + right_scaler, + attrib); return; } diff --git a/src/pll.h b/src/pll.h index f077612..d49a8b0 100644 --- a/src/pll.h +++ b/src/pll.h @@ -68,6 +68,11 @@ #define PLL_SCALE_FACTOR_SQRT 340282366920938463463374607431768211456.0 /* 2**128 */ #define PLL_SCALE_THRESHOLD_SQRT (1.0/PLL_SCALE_FACTOR_SQRT) #define PLL_SCALE_BUFFER_NONE -1 + +/* in per-rate scaling mode, maximum difference between scalers + * please see https://github.com/xflouris/libpll/issues/44 */ +#define PLL_SCALE_RATE_MAXDIFF 4 + #define PLL_MISC_EPSILON 1e-8 #define PLL_ONE_EPSILON 1e-15 #define PLL_ONE_MIN (1-PLL_ONE_EPSILON) @@ -804,7 +809,8 @@ PLL_EXPORT void pll_core_update_partial_tt_4x4(unsigned int sites, unsigned int * parent_scaler, const unsigned char * left_tipchars, const unsigned char * right_tipchars, - const double * lookup); + const double * lookup, + unsigned int attrib); PLL_EXPORT void pll_core_update_partial_ti_4x4(unsigned int sites, unsigned int rate_cats, @@ -823,6 +829,7 @@ PLL_EXPORT int pll_core_update_sumtable_ti_4x4(unsigned int sites, unsigned int rate_cats, const double * parent_clv, const unsigned char * left_tipchars, + const unsigned int * parent_scaler, double ** eigenvecs, double ** inv_eigenvecs, double ** freqs, @@ -965,7 +972,8 @@ PLL_EXPORT void pll_core_update_partial_tt_sse(unsigned int states, const unsigned char * left_tipchars, const unsigned char * right_tipchars, const double * lookup, - unsigned int tipstates_count); + unsigned int tipstates_count, + unsigned int attrib); PLL_EXPORT void pll_core_update_partial_tt_4x4_sse(unsigned int sites, unsigned int rate_cats, @@ -973,7 +981,8 @@ PLL_EXPORT void pll_core_update_partial_tt_4x4_sse(unsigned int sites, unsigned int * parent_scaler, const unsigned char * left_tipchars, const unsigned char * right_tipchars, - const double * lookup); + const double * lookup, + unsigned int attrib); PLL_EXPORT void pll_core_update_partial_ti_sse(unsigned int states, unsigned int sites, @@ -986,7 +995,8 @@ PLL_EXPORT void pll_core_update_partial_ti_sse(unsigned int states, const double * right_matrix, const unsigned int * right_scaler, const unsigned int * tipmap, - unsigned int tipmap_size); + unsigned int tipmap_size, + unsigned int attrib); PLL_EXPORT void pll_core_update_partial_ti_4x4_sse(unsigned int sites, @@ -997,7 +1007,8 @@ PLL_EXPORT void pll_core_update_partial_ti_4x4_sse(unsigned int sites, const double * right_clv, const double * left_matrix, const double * right_matrix, - const unsigned int * right_scaler); + const unsigned int * right_scaler, + unsigned int attrib); PLL_EXPORT void pll_core_update_partial_ii_sse(unsigned int states, unsigned int sites, @@ -1009,7 +1020,8 @@ PLL_EXPORT void pll_core_update_partial_ii_sse(unsigned int states, const double * left_matrix, const double * right_matrix, const unsigned int * left_scaler, - const unsigned int * right_scaler); + const unsigned int * right_scaler, + unsigned int attrib); PLL_EXPORT void pll_core_update_partial_ii_4x4_sse(unsigned int sites, unsigned int rate_cats, @@ -1020,7 +1032,8 @@ PLL_EXPORT void pll_core_update_partial_ii_4x4_sse(unsigned int sites, const double * left_matrix, const double * right_matrix, const unsigned int * left_scaler, - const unsigned int * right_scaler); + const unsigned int * right_scaler, + unsigned int attrib); #endif /* functions in core_partials_avx.c */ @@ -1182,21 +1195,26 @@ PLL_EXPORT int pll_core_update_sumtable_ii_sse(unsigned int states, unsigned int rate_cats, const double * parent_clv, const double * child_clv, + const unsigned int * parent_scaler, + const unsigned int * child_scaler, double ** eigenvecs, double ** inv_eigenvecs, double ** freqs, - double *sumtable); + double *sumtable, + unsigned int attrib); PLL_EXPORT int pll_core_update_sumtable_ti_sse(unsigned int states, unsigned int sites, unsigned int rate_cats, const double * parent_clv, const unsigned char * left_tipchars, + const unsigned int * parent_scaler, double ** eigenvecs, double ** inv_eigenvecs, double ** freqs, unsigned int * tipmap, - double *sumtable); + double *sumtable, + unsigned int attrib); #endif @@ -1327,7 +1345,8 @@ double pll_core_edge_loglikelihood_ii_4x4_sse(unsigned int sites, const double * invar_proportion, const int * invar_indices, const unsigned int * freqs_indices, - double * persite_lnl); + double * persite_lnl, + unsigned int attrib); PLL_EXPORT double pll_core_edge_loglikelihood_ti_sse(unsigned int states, @@ -1359,7 +1378,8 @@ double pll_core_edge_loglikelihood_ti_4x4_sse(unsigned int sites, const double * invar_proportion, const int * invar_indices, const unsigned int * freqs_indices, - double * persite_lnl); + double * persite_lnl, + unsigned int attrib); PLL_EXPORT double pll_core_root_loglikelihood_4x4_sse(unsigned int sites, unsigned int rate_cats,