diff --git a/src/core_derivatives.c b/src/core_derivatives.c index 2b6e205..d069843 100644 --- a/src/core_derivatives.c +++ b/src/core_derivatives.c @@ -371,9 +371,46 @@ PLL_EXPORT int pll_core_update_sumtable_ti(unsigned int states, attrib); } + unsigned int min_scaler; + unsigned int * rate_scalings = NULL; + int per_rate_scaling = (attrib & PLL_ATTRIB_RATE_SCALERS) ? 1 : 0; + + /* powers of scale threshold for undoing the scaling */ + double scale_minlh[PLL_SCALE_RATE_MAXDIFF]; + if (per_rate_scaling) + { + rate_scalings = (unsigned int*) calloc(rate_cats, sizeof(unsigned int)); + + double scale_factor = 1.0; + for (i = 0; i < PLL_SCALE_RATE_MAXDIFF; ++i) + { + scale_factor *= PLL_SCALE_THRESHOLD; + scale_minlh[i] = scale_factor; + } + } + /* build sumtable: non-vectorized version, general case */ for (n = 0; n < sites; n++) { + if (per_rate_scaling) + { + /* compute minimum per-rate scaler -> common per-site scaler */ + min_scaler = UINT_MAX; + for (i = 0; i < rate_cats; ++i) + { + rate_scalings[i] = (parent_scaler) ? 
parent_scaler[n*rate_cats+i] : 0; + if (rate_scalings[i] < min_scaler) + min_scaler = rate_scalings[i]; + } + + /* compute relative capped per-rate scalers */ + for (i = 0; i < rate_cats; ++i) + { + rate_scalings[i] = PLL_MIN(rate_scalings[i] - min_scaler, + PLL_SCALE_RATE_MAXDIFF); + } + } + for (i = 0; i < rate_cats; ++i) { t_eigenvecs = eigenvecs[i]; @@ -393,12 +430,18 @@ PLL_EXPORT int pll_core_update_sumtable_ti(unsigned int states, tipstate >>= 1; } sum[j] = lefterm * righterm; + + if (rate_scalings && rate_scalings[i] > 0) + sum[j] *= scale_minlh[rate_scalings[i]-1]; } t_clvc += states_padded; sum += states_padded; } } + if (rate_scalings) + free(rate_scalings); + return PLL_SUCCESS; } diff --git a/src/core_derivatives_avx.c b/src/core_derivatives_avx.c index 992e41b..0ddac53 100644 --- a/src/core_derivatives_avx.c +++ b/src/core_derivatives_avx.c @@ -45,6 +45,7 @@ static int core_update_sumtable_ii_4x4_avx(unsigned int sites, unsigned int states = 4; + /* scaling stuff*/ unsigned int min_scaler = 0; unsigned int * rate_scalings = NULL; int per_rate_scaling = (attrib & PLL_ATTRIB_RATE_SCALERS) ? 1 : 0; @@ -55,6 +56,13 @@ static int core_update_sumtable_ii_4x4_avx(unsigned int sites, { rate_scalings = (unsigned int*) calloc(rate_cats, sizeof(unsigned int)); + if (!rate_scalings) + { + pll_errno = PLL_ERROR_MEM_ALLOC; + snprintf (pll_errmsg, 200, "Cannot allocate memory for rate scalers"); + return PLL_FAILURE; + } + double scale_factor = 1.0; for (i = 0; i < PLL_SCALE_RATE_MAXDIFF; ++i) { @@ -238,6 +246,32 @@ PLL_EXPORT int pll_core_update_sumtable_ii_avx(unsigned int states, unsigned int states_padded = (states+3) & 0xFFFFFFFC; + /* scaling stuff */ + unsigned int min_scaler = 0; + unsigned int * rate_scalings = NULL; + int per_rate_scaling = (attrib & PLL_ATTRIB_RATE_SCALERS) ? 
1 : 0; + + /* powers of scale threshold for undoing the scaling */ + __m256d v_scale_minlh[PLL_SCALE_RATE_MAXDIFF]; + if (per_rate_scaling) + { + rate_scalings = (unsigned int*) calloc(rate_cats, sizeof(unsigned int)); + + if (!rate_scalings) + { + pll_errno = PLL_ERROR_MEM_ALLOC; + snprintf (pll_errmsg, 200, "Cannot allocate memory for rate scalers"); + return PLL_FAILURE; + } + + double scale_factor = 1.0; + for (i = 0; i < PLL_SCALE_RATE_MAXDIFF; ++i) + { + scale_factor *= PLL_SCALE_THRESHOLD; + v_scale_minlh[i] = _mm256_set1_pd(scale_factor); + } + } + /* padded eigenvecs */ double * tt_eigenvecs = (double *) pll_aligned_alloc ( (states_padded * states_padded * rate_cats) * sizeof(double), @@ -282,6 +316,26 @@ PLL_EXPORT int pll_core_update_sumtable_ii_avx(unsigned int states, /* vectorized loop from update_sumtable() */ for (n = 0; n < sites; n++) { + /* compute per-rate scalers and obtain minimum value (within site) */ + if (per_rate_scaling) + { + min_scaler = UINT_MAX; + for (i = 0; i < rate_cats; ++i) + { + rate_scalings[i] = (parent_scaler) ? parent_scaler[n*rate_cats+i] : 0; + rate_scalings[i] += (child_scaler) ? 
child_scaler[n*rate_cats+i] : 0; + if (rate_scalings[i] < min_scaler) + min_scaler = rate_scalings[i]; + } + + /* compute relative capped per-rate scalers */ + for (i = 0; i < rate_cats; ++i) + { + rate_scalings[i] = PLL_MIN(rate_scalings[i] - min_scaler, + PLL_SCALE_RATE_MAXDIFF); + } + } + const double * c_eigenvecs = tt_eigenvecs; const double * ct_inv_eigenvecs = tt_inv_eigenvecs; for (i = 0; i < rate_cats; ++i) @@ -381,6 +435,13 @@ PLL_EXPORT int pll_core_update_sumtable_ii_avx(unsigned int states, /* update sum */ __m256d v_prod = _mm256_mul_pd (v_lefterm_sum, v_righterm_sum); + + /* apply per-rate scalers */ + if (rate_scalings && rate_scalings[i] > 0) + { + v_prod = _mm256_mul_pd(v_prod, v_scale_minlh[rate_scalings[i]-1]); + } + _mm256_store_pd (sum + j, v_prod); } @@ -392,6 +453,8 @@ PLL_EXPORT int pll_core_update_sumtable_ii_avx(unsigned int states, pll_aligned_free (tt_inv_eigenvecs); pll_aligned_free (tt_eigenvecs); + if (rate_scalings) + free(rate_scalings); return PLL_SUCCESS; } @@ -426,6 +489,13 @@ static int core_update_sumtable_ti_4x4_avx(unsigned int sites, { rate_scalings = (unsigned int*) calloc(rate_cats, sizeof(unsigned int)); + if (!rate_scalings) + { + pll_errno = PLL_ERROR_MEM_ALLOC; + snprintf (pll_errmsg, 200, "Cannot allocate memory for rate scalers"); + return PLL_FAILURE; + } + double scale_factor = 1.0; for (i = 0; i < PLL_SCALE_RATE_MAXDIFF; ++i) { @@ -568,7 +638,6 @@ static int core_update_sumtable_ti_4x4_avx(unsigned int sites, pll_aligned_free(eigenvecs_trans); pll_aligned_free(precomp_left); - if (rate_scalings) free(rate_scalings); @@ -614,6 +683,32 @@ PLL_EXPORT int pll_core_update_sumtable_ti_avx(unsigned int states, const double * t_clvc = parent_clv; const double * t_eigenvecs_padded; + /* scaling stuff */ + unsigned int min_scaler = 0; + unsigned int * rate_scalings = NULL; + int per_rate_scaling = (attrib & PLL_ATTRIB_RATE_SCALERS) ? 
1 : 0; + + /* powers of scale threshold for undoing the scaling */ + __m256d v_scale_minlh[PLL_SCALE_RATE_MAXDIFF]; + if (per_rate_scaling) + { + rate_scalings = (unsigned int*) calloc(rate_cats, sizeof(unsigned int)); + + if (!rate_scalings) + { + pll_errno = PLL_ERROR_MEM_ALLOC; + snprintf (pll_errmsg, 200, "Cannot allocate memory for rate scalers"); + return PLL_FAILURE; + } + + double scale_factor = 1.0; + for (i = 0; i < PLL_SCALE_RATE_MAXDIFF; ++i) + { + scale_factor *= PLL_SCALE_THRESHOLD; + v_scale_minlh[i] = _mm256_set1_pd(scale_factor); + } + } + double * eigenvecs_padded = (double *) pll_aligned_alloc ( (states_padded * states_padded * rate_cats) * sizeof(double), PLL_ALIGNMENT_AVX); @@ -688,6 +783,25 @@ PLL_EXPORT int pll_core_update_sumtable_ti_avx(unsigned int states, /* build sumtable */ for (n = 0; n < sites; n++) { + /* compute per-rate scalers and obtain minimum value (within site) */ + if (per_rate_scaling) + { + min_scaler = UINT_MAX; + for (i = 0; i < rate_cats; ++i) + { + rate_scalings[i] = (parent_scaler) ? 
parent_scaler[n*rate_cats+i] : 0; + if (rate_scalings[i] < min_scaler) + min_scaler = rate_scalings[i]; + } + + /* compute relative capped per-rate scalers */ + for (i = 0; i < rate_cats; ++i) + { + rate_scalings[i] = PLL_MIN(rate_scalings[i] - min_scaler, + PLL_SCALE_RATE_MAXDIFF); + } + } + + tipstate = (unsigned int) left_tipchars[n]; unsigned int loffset = tipstate * span; @@ -757,6 +871,13 @@ PLL_EXPORT int pll_core_update_sumtable_ti_avx(unsigned int states, __m256d v_lefterm = _mm256_load_pd(t_precomp + j); __m256d v_sum = _mm256_mul_pd(v_lefterm, v_righterm); + + /* apply per-rate scalers */ + if (rate_scalings && rate_scalings[i] > 0) + { + v_sum = _mm256_mul_pd(v_sum, v_scale_minlh[rate_scalings[i]-1]); + } + _mm256_store_pd(sum + j, v_sum); } @@ -768,6 +889,8 @@ PLL_EXPORT int pll_core_update_sumtable_ti_avx(unsigned int states, pll_aligned_free(eigenvecs_padded); pll_aligned_free(precomp_left); + if (rate_scalings) + free(rate_scalings); return PLL_SUCCESS; } diff --git a/src/core_derivatives_avx2.c b/src/core_derivatives_avx2.c index bd345ec..a6ca175 100644 --- a/src/core_derivatives_avx2.c +++ b/src/core_derivatives_avx2.c @@ -18,7 +18,7 @@ Exelixis Lab, Heidelberg Instutute for Theoretical Studies Schloss-Wolfsbrunnenweg 35, D-69118 Heidelberg, Germany */ - +#include <limits.h> #include "pll.h" PLL_EXPORT int pll_core_update_sumtable_ii_avx2(unsigned int states, @@ -63,6 +63,32 @@ PLL_EXPORT int pll_core_update_sumtable_ii_avx2(unsigned int states, unsigned int states_padded = (states+3) & 0xFFFFFFFC; + /* scaling stuff */ + unsigned int min_scaler = 0; + unsigned int * rate_scalings = NULL; + int per_rate_scaling = (attrib & PLL_ATTRIB_RATE_SCALERS) ? 
1 : 0; + + /* powers of scale threshold for undoing the scaling */ + __m256d v_scale_minlh[PLL_SCALE_RATE_MAXDIFF]; + if (per_rate_scaling) + { + rate_scalings = (unsigned int*) calloc(rate_cats, sizeof(unsigned int)); + + if (!rate_scalings) + { + pll_errno = PLL_ERROR_MEM_ALLOC; + snprintf (pll_errmsg, 200, "Cannot allocate memory for rate scalers"); + return PLL_FAILURE; + } + + double scale_factor = 1.0; + for (i = 0; i < PLL_SCALE_RATE_MAXDIFF; ++i) + { + scale_factor *= PLL_SCALE_THRESHOLD; + v_scale_minlh[i] = _mm256_set1_pd(scale_factor); + } + } + /* padded eigenvecs */ double * tt_eigenvecs = (double *) pll_aligned_alloc ( (states_padded * states_padded * rate_cats) * sizeof(double), @@ -107,6 +133,26 @@ PLL_EXPORT int pll_core_update_sumtable_ii_avx2(unsigned int states, /* vectorized loop from update_sumtable() */ for (n = 0; n < sites; n++) { + /* compute per-rate scalers and obtain minimum value (within site) */ + if (per_rate_scaling) + { + min_scaler = UINT_MAX; + for (i = 0; i < rate_cats; ++i) + { + rate_scalings[i] = (parent_scaler) ? parent_scaler[n*rate_cats+i] : 0; + rate_scalings[i] += (child_scaler) ? 
child_scaler[n*rate_cats+i] : 0; + if (rate_scalings[i] < min_scaler) + min_scaler = rate_scalings[i]; + } + + /* compute relative capped per-rate scalers */ + for (i = 0; i < rate_cats; ++i) + { + rate_scalings[i] = PLL_MIN(rate_scalings[i] - min_scaler, + PLL_SCALE_RATE_MAXDIFF); + } + } + const double * c_eigenvecs = tt_eigenvecs; const double * ct_inv_eigenvecs = tt_inv_eigenvecs; for (i = 0; i < rate_cats; ++i) @@ -201,6 +247,13 @@ PLL_EXPORT int pll_core_update_sumtable_ii_avx2(unsigned int states, /* update sum */ __m256d v_prod = _mm256_mul_pd (v_lefterm_sum, v_righterm_sum); + + /* apply per-rate scalers */ + if (rate_scalings && rate_scalings[i] > 0) + { + v_prod = _mm256_mul_pd(v_prod, v_scale_minlh[rate_scalings[i]-1]); + } + _mm256_store_pd (sum + j, v_prod); } @@ -212,6 +265,8 @@ PLL_EXPORT int pll_core_update_sumtable_ii_avx2(unsigned int states, pll_aligned_free (tt_inv_eigenvecs); pll_aligned_free (tt_eigenvecs); + if (rate_scalings) + free(rate_scalings); return PLL_SUCCESS; } @@ -255,6 +310,31 @@ PLL_EXPORT int pll_core_update_sumtable_ti_avx2(unsigned int states, unsigned int i, j, k, n; unsigned int tipstate; + unsigned int min_scaler = 0; + unsigned int * rate_scalings = NULL; + int per_rate_scaling = (attrib & PLL_ATTRIB_RATE_SCALERS) ? 
1 : 0; + + /* powers of scale threshold for undoing the scaling */ + __m256d v_scale_minlh[PLL_SCALE_RATE_MAXDIFF]; + if (per_rate_scaling) + { + rate_scalings = (unsigned int*) calloc(rate_cats, sizeof(unsigned int)); + + if (!rate_scalings) + { + pll_errno = PLL_ERROR_MEM_ALLOC; + snprintf (pll_errmsg, 200, "Cannot allocate memory for rate scalers"); + return PLL_FAILURE; + } + + double scale_factor = 1.0; + for (i = 0; i < PLL_SCALE_RATE_MAXDIFF; ++i) + { + scale_factor *= PLL_SCALE_THRESHOLD; + v_scale_minlh[i] = _mm256_set1_pd(scale_factor); + } + } + double * sum = sumtable; const double * t_clvc = parent_clv; const double * t_eigenvecs_padded; @@ -332,6 +412,25 @@ PLL_EXPORT int pll_core_update_sumtable_ti_avx2(unsigned int states, /* build sumtable */ for (n = 0; n < sites; n++) { + /* compute per-rate scalers and obtain minimum value (within site) */ + if (per_rate_scaling) + { + min_scaler = UINT_MAX; + for (i = 0; i < rate_cats; ++i) + { + rate_scalings[i] = (parent_scaler) ? 
parent_scaler[n*rate_cats+i] : 0; + if (rate_scalings[i] < min_scaler) + min_scaler = rate_scalings[i]; + } + + /* compute relative capped per-rate scalers */ + for (i = 0; i < rate_cats; ++i) + { + rate_scalings[i] = PLL_MIN(rate_scalings[i] - min_scaler, + PLL_SCALE_RATE_MAXDIFF); + } + } + tipstate = (unsigned int) left_tipchars[n]; unsigned int loffset = tipstate * span; @@ -397,6 +496,13 @@ PLL_EXPORT int pll_core_update_sumtable_ti_avx2(unsigned int states, __m256d v_lefterm = _mm256_load_pd(t_precomp + j); __m256d v_sum = _mm256_mul_pd(v_lefterm, v_righterm); + + /* apply per-rate scalers */ + if (rate_scalings && rate_scalings[i] > 0) + { + v_sum = _mm256_mul_pd(v_sum, v_scale_minlh[rate_scalings[i]-1]); + } + _mm256_store_pd(sum + j, v_sum); } @@ -408,6 +514,8 @@ PLL_EXPORT int pll_core_update_sumtable_ti_avx2(unsigned int states, pll_aligned_free(eigenvecs_padded); pll_aligned_free(precomp_left); + if (rate_scalings) + free(rate_scalings); return PLL_SUCCESS; } diff --git a/src/core_derivatives_sse.c b/src/core_derivatives_sse.c index cf77d84..206994d 100644 --- a/src/core_derivatives_sse.c +++ b/src/core_derivatives_sse.c @@ -271,9 +271,46 @@ PLL_EXPORT int pll_core_update_sumtable_ti_sse(unsigned int states, unsigned int states_padded = (states+1) & 0xFFFFFFFE; + unsigned int min_scaler; + unsigned int * rate_scalings = NULL; + int per_rate_scaling = (attrib & PLL_ATTRIB_RATE_SCALERS) ? 
1 : 0; + + /* powers of scale threshold for undoing the scaling */ + double scale_minlh[PLL_SCALE_RATE_MAXDIFF]; + if (per_rate_scaling) + { + rate_scalings = (unsigned int*) calloc(rate_cats, sizeof(unsigned int)); + + double scale_factor = 1.0; + for (i = 0; i < PLL_SCALE_RATE_MAXDIFF; ++i) + { + scale_factor *= PLL_SCALE_THRESHOLD; + scale_minlh[i] = scale_factor; + } + } + /* build sumtable: non-vectorized version, general case */ for (n = 0; n < sites; n++) { + if (per_rate_scaling) + { + /* compute minimum per-rate scaler -> common per-site scaler */ + min_scaler = UINT_MAX; + for (i = 0; i < rate_cats; ++i) + { + rate_scalings[i] = (parent_scaler) ? parent_scaler[n*rate_cats+i] : 0; + if (rate_scalings[i] < min_scaler) + min_scaler = rate_scalings[i]; + } + + /* compute relative capped per-rate scalers */ + for (i = 0; i < rate_cats; ++i) + { + rate_scalings[i] = PLL_MIN(rate_scalings[i] - min_scaler, + PLL_SCALE_RATE_MAXDIFF); + } + } + for (i = 0; i < rate_cats; ++i) { ev = eigenvecs[i]; @@ -293,11 +330,18 @@ PLL_EXPORT int pll_core_update_sumtable_ti_sse(unsigned int states, tipstate >>= 1; } sum[j] = lterm*rterm; + + if (rate_scalings && rate_scalings[i] > 0) + sum[j] *= scale_minlh[rate_scalings[i]-1]; } clvc += states_padded; sum += states_padded; } } + + if (rate_scalings) + free(rate_scalings); + return PLL_SUCCESS; } diff --git a/src/core_likelihood.c b/src/core_likelihood.c index 392bdc0..ba7efed 100644 --- a/src/core_likelihood.c +++ b/src/core_likelihood.c @@ -439,7 +439,6 @@ double pll_core_edge_loglikelihood_ti(unsigned int states, double terma, terma_r, termb; double site_lk, inv_site_lk; - unsigned int scale_factors; unsigned int cstate; unsigned int states_padded = states; @@ -480,7 +479,8 @@ double pll_core_edge_loglikelihood_ti(unsigned int states, invar_proportion, invar_indices, freqs_indices, - persite_lnl); + persite_lnl, + attrib); } /* this line is never called, but should we disable the else case above, then states_padded must be 
set to this value */ @@ -523,7 +523,8 @@ double pll_core_edge_loglikelihood_ti(unsigned int states, invar_proportion, invar_indices, freqs_indices, - persite_lnl); + persite_lnl, + attrib); } else { @@ -541,7 +542,8 @@ double pll_core_edge_loglikelihood_ti(unsigned int states, invar_proportion, invar_indices, freqs_indices, - persite_lnl); + persite_lnl, + attrib); } /* this line is never called, but should we disable the else case above, then states_padded must be set to this value */ @@ -584,7 +586,8 @@ double pll_core_edge_loglikelihood_ti(unsigned int states, invar_proportion, invar_indices, freqs_indices, - persite_lnl); + persite_lnl, + attrib); } else { @@ -602,7 +605,8 @@ double pll_core_edge_loglikelihood_ti(unsigned int states, invar_proportion, invar_indices, freqs_indices, - persite_lnl); + persite_lnl, + attrib); } /* this line is never called, but should we disable the else case above, then states_padded must be set to this value */ @@ -610,8 +614,50 @@ double pll_core_edge_loglikelihood_ti(unsigned int states, } #endif + unsigned int site_scalings; + unsigned int * rate_scalings = NULL; + int per_rate_scaling = (attrib & PLL_ATTRIB_RATE_SCALERS) ? 1 : 0; + + /* powers of scale threshold for undoing the scaling */ + double scale_minlh[PLL_SCALE_RATE_MAXDIFF]; + if (per_rate_scaling) + { + rate_scalings = (unsigned int*) calloc(rate_cats, sizeof(unsigned int)); + + double scale_factor = 1.0; + for (i = 0; i < PLL_SCALE_RATE_MAXDIFF; ++i) + { + scale_factor *= PLL_SCALE_THRESHOLD; + scale_minlh[i] = scale_factor; + } + } + for (n = 0; n < sites; ++n) { + if (per_rate_scaling) + { + /* compute minimum per-rate scaler -> common per-site scaler */ + site_scalings = UINT_MAX; + for (i = 0; i < rate_cats; ++i) + { + rate_scalings[i] = (parent_scaler) ? 
parent_scaler[n*rate_cats+i] : 0; + if (rate_scalings[i] < site_scalings) + site_scalings = rate_scalings[i]; + } + + /* compute relative capped per-rate scalers */ + for (i = 0; i < rate_cats; ++i) + { + rate_scalings[i] = PLL_MIN(rate_scalings[i] - site_scalings, + PLL_SCALE_RATE_MAXDIFF); + } + } + else + { + /* count number of scaling factors to account for */ + site_scalings = (parent_scaler) ? parent_scaler[n] : 0; + } + pmat = pmatrix; terma = 0; for (i = 0; i < rate_cats; ++i) @@ -633,6 +679,12 @@ double pll_core_edge_loglikelihood_ti(unsigned int states, pmat += states_padded; } + /* apply per-rate scalers, if necessary */ + if (rate_scalings && rate_scalings[i] > 0) + { + terma_r *= scale_minlh[rate_scalings[i]-1]; + } + /* account for invariant sites */ if (prop_invar > 0) { @@ -649,13 +701,10 @@ double pll_core_edge_loglikelihood_ti(unsigned int states, clvp += states_padded; } - /* count number of scaling factors to acount for */ - scale_factors = (parent_scaler) ? parent_scaler[n] : 0; - /* compute site log-likelihood and scale if necessary */ site_lk = log(terma); - if (scale_factors) - site_lk += scale_factors * log(PLL_SCALE_THRESHOLD); + if (site_scalings) + site_lk += site_scalings * log(PLL_SCALE_THRESHOLD); site_lk *= pattern_weights[n]; @@ -667,6 +716,10 @@ double pll_core_edge_loglikelihood_ti(unsigned int states, tipchars++; } + + if (rate_scalings) + free(rate_scalings); + return logl; } @@ -741,7 +794,8 @@ double pll_core_edge_loglikelihood_ii(unsigned int states, invar_proportion, invar_indices, freqs_indices, - persite_lnl); + persite_lnl, + attrib); } /* this line is never called, but should we disable the else case above, then states_padded must be set to this value */ @@ -785,7 +839,8 @@ double pll_core_edge_loglikelihood_ii(unsigned int states, invar_proportion, invar_indices, freqs_indices, - persite_lnl); + persite_lnl, + attrib); } /* this line is never called, but should we disable the else case above, then states_padded must be 
set to this value */ @@ -829,7 +884,8 @@ double pll_core_edge_loglikelihood_ii(unsigned int states, invar_proportion, invar_indices, freqs_indices, - persite_lnl); + persite_lnl, + attrib); } /* this line is never called, but should we disable the else case above, then states_padded must be set to this value */ @@ -857,9 +913,6 @@ double pll_core_edge_loglikelihood_ii(unsigned int states, for (n = 0; n < sites; ++n) { - pmat = pmatrix; - terma = 0; - if (per_rate_scaling) { /* compute minimum per-rate scaler -> common per-site scaler */ @@ -886,6 +939,8 @@ double pll_core_edge_loglikelihood_ii(unsigned int states, site_scalings += (child_scaler) ? child_scaler[n] : 0; } + pmat = pmatrix; + terma = 0; for (i = 0; i < rate_cats; ++i) { freqs = frequencies[freqs_indices[i]]; diff --git a/src/core_likelihood_avx.c b/src/core_likelihood_avx.c index 26ded2d..7157f8c 100644 --- a/src/core_likelihood_avx.c +++ b/src/core_likelihood_avx.c @@ -420,7 +420,8 @@ double pll_core_edge_loglikelihood_ti_20x20_avx(unsigned int sites, const double * invar_proportion, const int * invar_indices, const unsigned int * freqs_indices, - double * persite_lnl) + double * persite_lnl, + unsigned int attrib) { unsigned int n,i,j,m; double logl = 0; @@ -434,7 +435,6 @@ double pll_core_edge_loglikelihood_ti_20x20_avx(unsigned int sites, double site_lk, inv_site_lk; unsigned int cstate; - unsigned int scale_factors; unsigned int states = 20; unsigned int states_padded = states; @@ -445,6 +445,32 @@ double pll_core_edge_loglikelihood_ti_20x20_avx(unsigned int sites, unsigned int span = states_padded * rate_cats; unsigned int maxstates = tipmap_size; + /* scaling stuff */ + unsigned int site_scalings; + unsigned int * rate_scalings = NULL; + int per_rate_scaling = (attrib & PLL_ATTRIB_RATE_SCALERS) ? 
1 : 0; + + /* powers of scale threshold for undoing the scaling */ + double scale_minlh[PLL_SCALE_RATE_MAXDIFF]; + if (per_rate_scaling) + { + rate_scalings = (unsigned int*) calloc(rate_cats, sizeof(unsigned int)); + + if (!rate_scalings) + { + pll_errno = PLL_ERROR_MEM_ALLOC; + snprintf(pll_errmsg, 200, "Cannot allocate space for rate scalers."); + return -INFINITY; + } + + double scale_factor = 1.0; + for (i = 0; i < PLL_SCALE_RATE_MAXDIFF; ++i) + { + scale_factor *= PLL_SCALE_THRESHOLD; + scale_minlh[i] = scale_factor; + } + } + /* precompute a lookup table of four values per entry (one for each state), for all 16 states (including ambiguities) and for each rate category. */ double * lookup = pll_aligned_alloc(maxstates*span*sizeof(double), @@ -510,6 +536,30 @@ double pll_core_edge_loglikelihood_ti_20x20_avx(unsigned int sites, cstate = (unsigned int) tipchars[n]; unsigned int loffset = cstate*span; + if (per_rate_scaling) + { + /* compute minimum per-rate scaler -> common per-site scaler */ + site_scalings = UINT_MAX; + for (i = 0; i < rate_cats; ++i) + { + rate_scalings[i] = (parent_scaler) ? parent_scaler[n*rate_cats+i] : 0; + if (rate_scalings[i] < site_scalings) + site_scalings = rate_scalings[i]; + } + + /* compute relative capped per-rate scalers */ + for (i = 0; i < rate_cats; ++i) + { + rate_scalings[i] = PLL_MIN(rate_scalings[i] - site_scalings, + PLL_SCALE_RATE_MAXDIFF); + } + } + else + { + /* count number of scaling factors to account for */ + site_scalings = (parent_scaler) ? parent_scaler[n] : 0; + } + for (i = 0; i < rate_cats; ++i) { xmm1 = _mm256_setzero_pd(); @@ -534,6 +584,12 @@ double pll_core_edge_loglikelihood_ti_20x20_avx(unsigned int sites, xmm0 = _mm256_hadd_pd(xmm1,xmm1); terma_r = ((double *)&xmm0)[0] + ((double *)&xmm0)[2]; + /* apply per-rate scalers, if necessary */ + if (rate_scalings && rate_scalings[i] > 0) + { + terma_r *= scale_minlh[rate_scalings[i]-1]; + } + /* account for invariant sites */ prop_invar = invar_proportion ? 
invar_proportion[freqs_indices[i]] : 0; if (prop_invar > 0) @@ -551,13 +607,11 @@ double pll_core_edge_loglikelihood_ti_20x20_avx(unsigned int sites, pmat -= displacement; } - /* count number of scaling factors to acount for */ - scale_factors = (parent_scaler) ? parent_scaler[n] : 0; /* compute site log-likelihood and scale if necessary */ site_lk = log(terma); - if (scale_factors) - site_lk += scale_factors * log(PLL_SCALE_THRESHOLD); + if (site_scalings) + site_lk += site_scalings * log(PLL_SCALE_THRESHOLD); site_lk *= pattern_weights[n]; @@ -569,6 +623,8 @@ double pll_core_edge_loglikelihood_ti_20x20_avx(unsigned int sites, } pll_aligned_free(lookup); + if (rate_scalings) + free(rate_scalings); return logl; } @@ -588,7 +644,8 @@ double pll_core_edge_loglikelihood_ti_avx(unsigned int states, const double * invar_proportion, const int * invar_indices, const unsigned int * freqs_indices, - double * persite_lnl) + double * persite_lnl, + unsigned int attrib) { unsigned int n,i,j,k; double logl = 0; @@ -602,7 +659,6 @@ double pll_core_edge_loglikelihood_ti_avx(unsigned int states, double site_lk, inv_site_lk; unsigned int cstate; - unsigned int scale_factors; unsigned int states_padded = (states+3) & 0xFFFFFFFC; __m256d xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7; @@ -610,6 +666,32 @@ double pll_core_edge_loglikelihood_ti_avx(unsigned int states, size_t displacement = (states_padded - states) * (states_padded); + /* scaling stuff */ + unsigned int site_scalings; + unsigned int * rate_scalings = NULL; + int per_rate_scaling = (attrib & PLL_ATTRIB_RATE_SCALERS) ? 
1 : 0; + + /* powers of scale threshold for undoing the scaling */ + double scale_minlh[PLL_SCALE_RATE_MAXDIFF]; + if (per_rate_scaling) + { + rate_scalings = (unsigned int*) calloc(rate_cats, sizeof(unsigned int)); + + if (!rate_scalings) + { + pll_errno = PLL_ERROR_MEM_ALLOC; + snprintf(pll_errmsg, 200, "Cannot allocate space for rate scalers."); + return -INFINITY; + } + + double scale_factor = 1.0; + for (i = 0; i < PLL_SCALE_RATE_MAXDIFF; ++i) + { + scale_factor *= PLL_SCALE_THRESHOLD; + scale_minlh[i] = scale_factor; + } + } + for (n = 0; n < sites; ++n) { pmat = pmatrix; @@ -617,6 +699,30 @@ double pll_core_edge_loglikelihood_ti_avx(unsigned int states, cstate = tipmap[tipchars[n]]; + if (per_rate_scaling) + { + /* compute minimum per-rate scaler -> common per-site scaler */ + site_scalings = UINT_MAX; + for (i = 0; i < rate_cats; ++i) + { + rate_scalings[i] = (parent_scaler) ? parent_scaler[n*rate_cats+i] : 0; + if (rate_scalings[i] < site_scalings) + site_scalings = rate_scalings[i]; + } + + /* compute relative capped per-rate scalers */ + for (i = 0; i < rate_cats; ++i) + { + rate_scalings[i] = PLL_MIN(rate_scalings[i] - site_scalings, + PLL_SCALE_RATE_MAXDIFF); + } + } + else + { + /* count number of scaling factors to account for */ + site_scalings = (parent_scaler) ? parent_scaler[n] : 0; + } + for (i = 0; i < rate_cats; ++i) { freqs = frequencies[freqs_indices[i]]; @@ -706,6 +812,12 @@ double pll_core_edge_loglikelihood_ti_avx(unsigned int states, clvp += 4; } + /* apply per-rate scalers, if necessary */ + if (rate_scalings && rate_scalings[i] > 0) + { + terma_r *= scale_minlh[rate_scalings[i]-1]; + } + /* account for invariant sites */ prop_invar = invar_proportion ? invar_proportion[freqs_indices[i]] : 0; if (prop_invar > 0) @@ -723,13 +835,11 @@ double pll_core_edge_loglikelihood_ti_avx(unsigned int states, pmat -= displacement; } - /* count number of scaling factors to acount for */ - scale_factors = (parent_scaler) ? 
parent_scaler[n] : 0; /* compute site log-likelihood and scale if necessary */ site_lk = log(terma); - if (scale_factors) - site_lk += scale_factors * log(PLL_SCALE_THRESHOLD); + if (site_scalings) + site_lk += site_scalings * log(PLL_SCALE_THRESHOLD); site_lk *= pattern_weights[n]; @@ -739,6 +849,10 @@ double pll_core_edge_loglikelihood_ti_avx(unsigned int states, logl += site_lk; } + + if (rate_scalings) + free(rate_scalings); + return logl; } @@ -757,7 +871,8 @@ double pll_core_edge_loglikelihood_ii_avx(unsigned int states, const double * invar_proportion, const int * invar_indices, const unsigned int * freqs_indices, - double * persite_lnl) + double * persite_lnl, + unsigned int attrib) { unsigned int n,i,j,k; double logl = 0; @@ -771,17 +886,69 @@ double pll_core_edge_loglikelihood_ii_avx(unsigned int states, double terma, terma_r; double site_lk, inv_site_lk; - unsigned int scale_factors; unsigned int states_padded = (states+3) & 0xFFFFFFFC; __m256d xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7; size_t displacement = (states_padded - states) * (states_padded); + /* scaling stuff */ + unsigned int site_scalings; + unsigned int * rate_scalings = NULL; + int per_rate_scaling = (attrib & PLL_ATTRIB_RATE_SCALERS) ? 
1 : 0; + + /* powers of scale threshold for undoing the scaling */ + double scale_minlh[PLL_SCALE_RATE_MAXDIFF]; + if (per_rate_scaling) + { + rate_scalings = (unsigned int*) calloc(rate_cats, sizeof(unsigned int)); + + if (!rate_scalings) + { + pll_errno = PLL_ERROR_MEM_ALLOC; + snprintf(pll_errmsg, 200, "Cannot allocate space for precomputation."); + return -INFINITY; + } + + double scale_factor = 1.0; + for (i = 0; i < PLL_SCALE_RATE_MAXDIFF; ++i) + { + scale_factor *= PLL_SCALE_THRESHOLD; + scale_minlh[i] = scale_factor; + } + } + for (n = 0; n < sites; ++n) { pmat = pmatrix; terma = 0; + + if (per_rate_scaling) + { + /* compute minimum per-rate scaler -> common per-site scaler */ + site_scalings = UINT_MAX; + for (i = 0; i < rate_cats; ++i) + { + rate_scalings[i] = (parent_scaler) ? parent_scaler[n*rate_cats+i] : 0; + rate_scalings[i] += (child_scaler) ? child_scaler[n*rate_cats+i] : 0; + if (rate_scalings[i] < site_scalings) + site_scalings = rate_scalings[i]; + } + + /* compute relative capped per-rate scalers */ + for (i = 0; i < rate_cats; ++i) + { + rate_scalings[i] = PLL_MIN(rate_scalings[i] - site_scalings, + PLL_SCALE_RATE_MAXDIFF); + } + } + else + { + /* count number of scaling factors to account for */ + site_scalings = (parent_scaler) ? parent_scaler[n] : 0; + site_scalings += (child_scaler) ? child_scaler[n] : 0; + } + for (i = 0; i < rate_cats; ++i) { freqs = frequencies[freqs_indices[i]]; @@ -864,6 +1031,12 @@ double pll_core_edge_loglikelihood_ii_avx(unsigned int states, clvp += 4; } + /* apply per-rate scalers, if necessary */ + if (rate_scalings && rate_scalings[i] > 0) + { + terma_r *= scale_minlh[rate_scalings[i]-1]; + } + /* account for invariant sites */ prop_invar = invar_proportion ? 
invar_proportion[freqs_indices[i]] : 0; if (prop_invar > 0) @@ -882,14 +1055,11 @@ double pll_core_edge_loglikelihood_ii_avx(unsigned int states, clvc += states_padded; pmat -= displacement; } - /* count number of scaling factors to acount for */ - scale_factors = (parent_scaler) ? parent_scaler[n] : 0; - scale_factors += (child_scaler) ? child_scaler[n] : 0; /* compute site log-likelihood and scale if necessary */ site_lk = log(terma); - if (scale_factors) - site_lk += scale_factors * log(PLL_SCALE_THRESHOLD); + if (site_scalings) + site_lk += site_scalings * log(PLL_SCALE_THRESHOLD); site_lk *= pattern_weights[n]; @@ -899,6 +1069,10 @@ double pll_core_edge_loglikelihood_ii_avx(unsigned int states, logl += site_lk; } + + if (rate_scalings) + free(rate_scalings); + return logl; } diff --git a/src/core_likelihood_avx2.c b/src/core_likelihood_avx2.c index b730a35..552f12f 100644 --- a/src/core_likelihood_avx2.c +++ b/src/core_likelihood_avx2.c @@ -19,6 +19,7 @@ Schloss-Wolfsbrunnenweg 35, D-69118 Heidelberg, Germany */ +#include <limits.h> #include "pll.h" PLL_EXPORT double pll_core_root_loglikelihood_avx2(unsigned int states, @@ -122,7 +123,8 @@ double pll_core_edge_loglikelihood_ti_20x20_avx2(unsigned int sites, const double * invar_proportion, const int * invar_indices, const unsigned int * freqs_indices, - double * persite_lnl) + double * persite_lnl, + unsigned int attrib) { unsigned int n,i,j,m = 0; double logl = 0; @@ -136,7 +138,6 @@ double pll_core_edge_loglikelihood_ti_20x20_avx2(unsigned int sites, double site_lk, inv_site_lk; unsigned int cstate; - unsigned int scale_factors; unsigned int states = 20; unsigned int states_padded = states; @@ -144,6 +145,32 @@ double pll_core_edge_loglikelihood_ti_20x20_avx2(unsigned int sites, size_t displacement = (states_padded - states) * (states_padded); + /* scaling stuff */ + unsigned int site_scalings; + unsigned int * rate_scalings = NULL; + int per_rate_scaling = (attrib & PLL_ATTRIB_RATE_SCALERS) ? 
1 : 0; + + /* powers of scale threshold for undoing the scaling */ + double scale_minlh[PLL_SCALE_RATE_MAXDIFF]; + if (per_rate_scaling) + { + rate_scalings = (unsigned int*) calloc(rate_cats, sizeof(unsigned int)); + + if (!rate_scalings) + { + pll_errno = PLL_ERROR_MEM_ALLOC; + snprintf(pll_errmsg, 200, "Cannot allocate space for rate scalers."); + return -INFINITY; + } + + double scale_factor = 1.0; + for (i = 0; i < PLL_SCALE_RATE_MAXDIFF; ++i) + { + scale_factor *= PLL_SCALE_THRESHOLD; + scale_minlh[i] = scale_factor; + } + } + unsigned int span = states_padded * rate_cats; unsigned int maxstates = tipmap_size; @@ -212,6 +239,30 @@ double pll_core_edge_loglikelihood_ti_20x20_avx2(unsigned int sites, cstate = (unsigned int) tipchars[n]; unsigned int loffset = cstate*span; + if (per_rate_scaling) + { + /* compute minimum per-rate scaler -> common per-site scaler */ + site_scalings = UINT_MAX; + for (i = 0; i < rate_cats; ++i) + { + rate_scalings[i] = (parent_scaler) ? parent_scaler[n*rate_cats+i] : 0; + if (rate_scalings[i] < site_scalings) + site_scalings = rate_scalings[i]; + } + + /* compute relative capped per-rate scalers */ + for (i = 0; i < rate_cats; ++i) + { + rate_scalings[i] = PLL_MIN(rate_scalings[i] - site_scalings, + PLL_SCALE_RATE_MAXDIFF); + } + } + else + { + /* count number of scaling factors to account for */ + site_scalings = (parent_scaler) ? parent_scaler[n] : 0; + } + for (i = 0; i < rate_cats; ++i) { xmm1 = _mm256_setzero_pd(); @@ -234,6 +285,12 @@ double pll_core_edge_loglikelihood_ti_20x20_avx2(unsigned int sites, xmm0 = _mm256_hadd_pd(xmm1,xmm1); terma_r = ((double *)&xmm0)[0] + ((double *)&xmm0)[2]; + /* apply per-rate scalers, if necessary */ + if (rate_scalings && rate_scalings[i] > 0) + { + terma_r *= scale_minlh[rate_scalings[i]-1]; + } + /* account for invariant sites */ prop_invar = invar_proportion ? 
invar_proportion[freqs_indices[i]] : 0; if (prop_invar > 0) @@ -251,13 +308,11 @@ double pll_core_edge_loglikelihood_ti_20x20_avx2(unsigned int sites, pmat -= displacement; } - /* count number of scaling factors to acount for */ - scale_factors = (parent_scaler) ? parent_scaler[n] : 0; /* compute site log-likelihood and scale if necessary */ site_lk = log(terma); - if (scale_factors) - site_lk += scale_factors * log(PLL_SCALE_THRESHOLD); + if (site_scalings) + site_lk += site_scalings * log(PLL_SCALE_THRESHOLD); site_lk *= pattern_weights[n]; @@ -269,6 +324,8 @@ double pll_core_edge_loglikelihood_ti_20x20_avx2(unsigned int sites, } pll_aligned_free(lookup); + if (rate_scalings) + free(rate_scalings); return logl; } @@ -288,7 +345,8 @@ double pll_core_edge_loglikelihood_ii_avx2(unsigned int states, const double * invar_proportion, const int * invar_indices, const unsigned int * freqs_indices, - double * persite_lnl) + double * persite_lnl, + unsigned int attrib) { unsigned int n,i,j,k; double logl = 0; @@ -302,17 +360,69 @@ double pll_core_edge_loglikelihood_ii_avx2(unsigned int states, double terma, terma_r; double site_lk, inv_site_lk; - unsigned int scale_factors; unsigned int states_padded = (states+3) & 0xFFFFFFFC; __m256d xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7; size_t displacement = (states_padded - states) * (states_padded); + /* scaling stuff */ + unsigned int site_scalings; + unsigned int * rate_scalings = NULL; + int per_rate_scaling = (attrib & PLL_ATTRIB_RATE_SCALERS) ? 
1 : 0; + + /* powers of scale threshold for undoing the scaling */ + double scale_minlh[PLL_SCALE_RATE_MAXDIFF]; + if (per_rate_scaling) + { + rate_scalings = (unsigned int*) calloc(rate_cats, sizeof(unsigned int)); + + if (!rate_scalings) + { + pll_errno = PLL_ERROR_MEM_ALLOC; + snprintf(pll_errmsg, 200, "Cannot allocate space for rate scalers."); + return -INFINITY; + } + + double scale_factor = 1.0; + for (i = 0; i < PLL_SCALE_RATE_MAXDIFF; ++i) + { + scale_factor *= PLL_SCALE_THRESHOLD; + scale_minlh[i] = scale_factor; + } + } + for (n = 0; n < sites; ++n) { pmat = pmatrix; terma = 0; + + if (per_rate_scaling) + { + /* compute minimum per-rate scaler -> common per-site scaler */ + site_scalings = UINT_MAX; + for (i = 0; i < rate_cats; ++i) + { + rate_scalings[i] = (parent_scaler) ? parent_scaler[n*rate_cats+i] : 0; + rate_scalings[i] += (child_scaler) ? child_scaler[n*rate_cats+i] : 0; + if (rate_scalings[i] < site_scalings) + site_scalings = rate_scalings[i]; + } + + /* compute relative capped per-rate scalers */ + for (i = 0; i < rate_cats; ++i) + { + rate_scalings[i] = PLL_MIN(rate_scalings[i] - site_scalings, + PLL_SCALE_RATE_MAXDIFF); + } + } + else + { + /* count number of scaling factors to account for */ + site_scalings = (parent_scaler) ? parent_scaler[n] : 0; + site_scalings += (child_scaler) ? child_scaler[n] : 0; + } + for (i = 0; i < rate_cats; ++i) { freqs = frequencies[freqs_indices[i]]; @@ -391,6 +501,12 @@ double pll_core_edge_loglikelihood_ii_avx2(unsigned int states, clvp += 4; } + /* apply per-rate scalers, if necessary */ + if (rate_scalings && rate_scalings[i] > 0) + { + terma_r *= scale_minlh[rate_scalings[i]-1]; + } + /* account for invariant sites */ prop_invar = invar_proportion ? 
invar_proportion[freqs_indices[i]] : 0; if (prop_invar > 0) @@ -409,14 +525,11 @@ double pll_core_edge_loglikelihood_ii_avx2(unsigned int states, clvc += states_padded; pmat -= displacement; } - /* count number of scaling factors to acount for */ - scale_factors = (parent_scaler) ? parent_scaler[n] : 0; - scale_factors += (child_scaler) ? child_scaler[n] : 0; /* compute site log-likelihood and scale if necessary */ site_lk = log(terma); - if (scale_factors) - site_lk += scale_factors * log(PLL_SCALE_THRESHOLD); + if (site_scalings) + site_lk += site_scalings * log(PLL_SCALE_THRESHOLD); site_lk *= pattern_weights[n]; @@ -426,5 +539,9 @@ double pll_core_edge_loglikelihood_ii_avx2(unsigned int states, logl += site_lk; } + + if (rate_scalings) + free(rate_scalings); + return logl; } diff --git a/src/core_likelihood_sse.c b/src/core_likelihood_sse.c index 4549b09..6a14b2b 100644 --- a/src/core_likelihood_sse.c +++ b/src/core_likelihood_sse.c @@ -202,7 +202,8 @@ double pll_core_edge_loglikelihood_ti_sse(unsigned int states, const double * invar_proportion, const int * invar_indices, const unsigned int * freqs_indices, - double * persite_lnl) + double * persite_lnl, + unsigned int attrib) { unsigned int n,i,j,k; double logl = 0; @@ -216,7 +217,6 @@ double pll_core_edge_loglikelihood_ti_sse(unsigned int states, double site_lk, inv_site_lk; unsigned int cstate; - unsigned int scale_factors; unsigned int states_padded = (states+1) & 0xFFFFFFFE; __m128d xmm0, xmm1, xmm2, xmm3, xmm4, xmm5; @@ -225,6 +225,32 @@ double pll_core_edge_loglikelihood_ti_sse(unsigned int states, xmm5 = _mm_setzero_pd(); + /* scaling stuff */ + unsigned int site_scalings; + unsigned int * rate_scalings = NULL; + int per_rate_scaling = (attrib & PLL_ATTRIB_RATE_SCALERS) ? 
1 : 0; + + /* powers of scale threshold for undoing the scaling */ + double scale_minlh[PLL_SCALE_RATE_MAXDIFF]; + if (per_rate_scaling) + { + rate_scalings = (unsigned int*) calloc(rate_cats, sizeof(unsigned int)); + + if (!rate_scalings) + { + pll_errno = PLL_ERROR_MEM_ALLOC; + snprintf(pll_errmsg, 200, "Cannot allocate space for rate scalers."); + return -INFINITY; + } + + double scale_factor = 1.0; + for (i = 0; i < PLL_SCALE_RATE_MAXDIFF; ++i) + { + scale_factor *= PLL_SCALE_THRESHOLD; + scale_minlh[i] = scale_factor; + } + } + for (n = 0; n < sites; ++n) { pmat = pmatrix; @@ -232,6 +258,30 @@ double pll_core_edge_loglikelihood_ti_sse(unsigned int states, cstate = tipmap[tipchars[n]]; + if (per_rate_scaling) + { + /* compute minimum per-rate scaler -> common per-site scaler */ + site_scalings = UINT_MAX; + for (i = 0; i < rate_cats; ++i) + { + rate_scalings[i] = (parent_scaler) ? parent_scaler[n*rate_cats+i] : 0; + if (rate_scalings[i] < site_scalings) + site_scalings = rate_scalings[i]; + } + + /* compute relative capped per-rate scalers */ + for (i = 0; i < rate_cats; ++i) + { + rate_scalings[i] = PLL_MIN(rate_scalings[i] - site_scalings, + PLL_SCALE_RATE_MAXDIFF); + } + } + else + { + /* count number of scaling factors to account for */ + site_scalings = (parent_scaler) ? parent_scaler[n] : 0; + } + for (i = 0; i < rate_cats; ++i) { freqs = frequencies[freqs_indices[i]]; @@ -294,6 +344,12 @@ double pll_core_edge_loglikelihood_ti_sse(unsigned int states, clvp += 2; } + /* apply per-rate scalers, if necessary */ + if (rate_scalings && rate_scalings[i] > 0) + { + terma_r *= scale_minlh[rate_scalings[i]-1]; + } + /* account for invariant sites */ prop_invar = invar_proportion ? invar_proportion[freqs_indices[i]] : 0; if (prop_invar > 0) @@ -311,13 +367,11 @@ double pll_core_edge_loglikelihood_ti_sse(unsigned int states, pmat -= displacement; } - /* count number of scaling factors to acount for */ - scale_factors = (parent_scaler) ? 
parent_scaler[n] : 0; /* compute site log-likelihood and scale if necessary */ site_lk = log(terma); - if (scale_factors) - site_lk += scale_factors * log(PLL_SCALE_THRESHOLD); + if (site_scalings) + site_lk += site_scalings * log(PLL_SCALE_THRESHOLD); site_lk *= pattern_weights[n]; @@ -327,6 +381,10 @@ double pll_core_edge_loglikelihood_ti_sse(unsigned int states, logl += site_lk; } + + if (rate_scalings) + free(rate_scalings); + return logl; } @@ -345,7 +403,8 @@ double pll_core_edge_loglikelihood_ii_sse(unsigned int states, const double * invar_proportion, const int * invar_indices, const unsigned int * freqs_indices, - double * persite_lnl) + double * persite_lnl, + unsigned int attrib) { unsigned int n,i,j,k; double logl = 0; @@ -359,17 +418,69 @@ double pll_core_edge_loglikelihood_ii_sse(unsigned int states, double terma, terma_r; double site_lk, inv_site_lk; - unsigned int scale_factors; unsigned int states_padded = (states+1) & 0xFFFFFFFE; __m128d xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6; size_t displacement = (states_padded - states) * (states_padded); + /* scaling stuff */ + unsigned int site_scalings; + unsigned int * rate_scalings = NULL; + int per_rate_scaling = (attrib & PLL_ATTRIB_RATE_SCALERS) ? 
1 : 0; + + /* powers of scale threshold for undoing the scaling */ + double scale_minlh[PLL_SCALE_RATE_MAXDIFF]; + if (per_rate_scaling) + { + rate_scalings = (unsigned int*) calloc(rate_cats, sizeof(unsigned int)); + + if (!rate_scalings) + { + pll_errno = PLL_ERROR_MEM_ALLOC; + snprintf(pll_errmsg, 200, "Cannot allocate space for rate scalers."); + return -INFINITY; + } + + double scale_factor = 1.0; + for (i = 0; i < PLL_SCALE_RATE_MAXDIFF; ++i) + { + scale_factor *= PLL_SCALE_THRESHOLD; + scale_minlh[i] = scale_factor; + } + } + for (n = 0; n < sites; ++n) { pmat = pmatrix; terma = 0; + + if (per_rate_scaling) + { + /* compute minimum per-rate scaler -> common per-site scaler */ + site_scalings = UINT_MAX; + for (i = 0; i < rate_cats; ++i) + { + rate_scalings[i] = (parent_scaler) ? parent_scaler[n*rate_cats+i] : 0; + rate_scalings[i] += (child_scaler) ? child_scaler[n*rate_cats+i] : 0; + if (rate_scalings[i] < site_scalings) + site_scalings = rate_scalings[i]; + } + + /* compute relative capped per-rate scalers */ + for (i = 0; i < rate_cats; ++i) + { + rate_scalings[i] = PLL_MIN(rate_scalings[i] - site_scalings, + PLL_SCALE_RATE_MAXDIFF); + } + } + else + { + /* count number of scaling factors to account for */ + site_scalings = (parent_scaler) ? parent_scaler[n] : 0; + site_scalings += (child_scaler) ? child_scaler[n] : 0; + } + for (i = 0; i < rate_cats; ++i) { freqs = frequencies[freqs_indices[i]]; @@ -424,6 +535,12 @@ double pll_core_edge_loglikelihood_ii_sse(unsigned int states, clvp += 2; } + /* apply per-rate scalers, if necessary */ + if (rate_scalings && rate_scalings[i] > 0) + { + terma_r *= scale_minlh[rate_scalings[i]-1]; + } + /* account for invariant sites */ prop_invar = invar_proportion ? 
invar_proportion[freqs_indices[i]] : 0; if (prop_invar > 0) @@ -442,14 +559,11 @@ double pll_core_edge_loglikelihood_ii_sse(unsigned int states, clvc += states_padded; pmat -= displacement; } - /* count number of scaling factors to acount for */ - scale_factors = (parent_scaler) ? parent_scaler[n] : 0; - scale_factors += (child_scaler) ? child_scaler[n] : 0; /* compute site log-likelihood and scale if necessary */ site_lk = log(terma); - if (scale_factors) - site_lk += scale_factors * log(PLL_SCALE_THRESHOLD); + if (site_scalings) + site_lk += site_scalings * log(PLL_SCALE_THRESHOLD); site_lk *= pattern_weights[n]; @@ -459,6 +573,10 @@ double pll_core_edge_loglikelihood_ii_sse(unsigned int states, logl += site_lk; } + + if (rate_scalings) + free(rate_scalings); + return logl; } diff --git a/src/core_partials_avx.c b/src/core_partials_avx.c index d20c70d..640ca78 100644 --- a/src/core_partials_avx.c +++ b/src/core_partials_avx.c @@ -377,9 +377,6 @@ PLL_EXPORT void pll_core_update_partial_ii_4x4_avx(unsigned int sites, { unsigned int states = 4; unsigned int n,k,i; - unsigned int scale_mode; /* 0 = none, 1 = per-site, 2 = per-rate */ - unsigned int scale_mask; - unsigned int init_mask; const double * lmat; const double * rmat; @@ -389,6 +386,14 @@ PLL_EXPORT void pll_core_update_partial_ii_4x4_avx(unsigned int sites, unsigned int span = states * rate_cats; + + /* scaling-related stuff */ + unsigned int scale_mode; /* 0 = none, 1 = per-site, 2 = per-rate */ + unsigned int scale_mask; + unsigned int init_mask; + __m256d v_scale_threshold = _mm256_set1_pd(PLL_SCALE_THRESHOLD); + __m256d v_scale_factor = _mm256_set1_pd(PLL_SCALE_FACTOR); + if (!parent_scaler) { /* scaling disabled / not required */ @@ -404,9 +409,6 @@ PLL_EXPORT void pll_core_update_partial_ii_4x4_avx(unsigned int sites, fill_parent_scaler(scaler_size, parent_scaler, left_scaler, right_scaler); } - __m256d v_scale_threshold = _mm256_set1_pd(PLL_SCALE_THRESHOLD); - __m256d v_scale_factor = 
_mm256_set1_pd(PLL_SCALE_FACTOR); - for (n = 0; n < sites; ++n) { lmat = left_matrix; @@ -556,8 +558,11 @@ PLL_EXPORT void pll_core_update_partial_tt_avx(unsigned int states, return; } + size_t scaler_size = (attrib & PLL_ATTRIB_RATE_SCALERS) ? + sites*rate_cats : sites; + if (parent_scaler) - memset(parent_scaler, 0, sizeof(unsigned int) * sites); + memset(parent_scaler, 0, sizeof(unsigned int) * scaler_size); for (n = 0; n < sites; ++n) { @@ -627,7 +632,6 @@ PLL_EXPORT void pll_core_update_partial_ti_avx(unsigned int states, unsigned int attrib) { unsigned int i,j,k,n; - unsigned int scaling; const double * lmat; const double * rmat; @@ -666,18 +670,36 @@ PLL_EXPORT void pll_core_update_partial_ti_avx(unsigned int states, right_matrix, right_scaler, tipmap, - tipmap_size); + tipmap_size, + attrib); return; } - /* add up the scale vector of the two children if available */ - if (parent_scaler) - fill_parent_scaler(sites, parent_scaler, NULL, right_scaler); + /* scaling-related stuff */ + unsigned int scale_mode; /* 0 = none, 1 = per-site, 2 = per-rate */ + unsigned int scale_mask; + unsigned int init_mask; + __m256d v_scale_threshold = _mm256_set1_pd(PLL_SCALE_THRESHOLD); + __m256d v_scale_factor = _mm256_set1_pd(PLL_SCALE_FACTOR); + + if (!parent_scaler) + { + /* scaling disabled / not required */ + scale_mode = init_mask = 0; + } + else + { + /* determine the scaling mode and init the vars accordingly */ + scale_mode = (attrib & PLL_ATTRIB_RATE_SCALERS) ? 2 : 1; + init_mask = (scale_mode == 1) ? 0xF : 0; + const size_t scaler_size = (scale_mode == 2) ? 
sites * rate_cats : sites; + /* add up the scale vector of the two children if available */ + fill_parent_scaler(scaler_size, parent_scaler, NULL, right_scaler); + } size_t displacement = (states_padded - states) * (states_padded); __m256i mask; - __m256d v_scale_threshold = _mm256_set1_pd(PLL_SCALE_THRESHOLD); /* compute CLV */ for (n = 0; n < sites; ++n) @@ -685,16 +707,17 @@ PLL_EXPORT void pll_core_update_partial_ti_avx(unsigned int states, lmat = left_matrix; rmat = right_matrix; - scaling = (parent_scaler) ? 0xF : 0; + scale_mask = init_mask; lstate = tipmap[left_tipchars[n]]; for (k = 0; k < rate_cats; ++k) { + unsigned int rate_mask = 0xF; + /* iterate over quadruples of rows */ for (i = 0; i < states_padded; i += 4) { - __m256d v_terma0 = _mm256_setzero_pd(); __m256d v_termb0 = _mm256_setzero_pd(); __m256d v_terma1 = _mm256_setzero_pd(); @@ -820,14 +843,32 @@ PLL_EXPORT void pll_core_update_partial_ti_avx(unsigned int states, __m256d v_prod = _mm256_mul_pd(v_terma_sum,v_termb_sum); - /* check for scaling */ + /* check if scaling is needed for the current rate category */ __m256d v_cmp = _mm256_cmp_pd(v_prod, v_scale_threshold, _CMP_LT_OS); - scaling = scaling & _mm256_movemask_pd(v_cmp); + rate_mask = rate_mask & _mm256_movemask_pd(v_cmp); _mm256_store_pd(parent_clv+i, v_prod); } + if (scale_mode == 2) + { + /* PER-RATE SCALING: if *all* entries of the *rate* CLV were below + * the threshold then scale (all) entries by PLL_SCALE_FACTOR */ + if (rate_mask == 0xF) + { + for (i = 0; i < states_padded; i += 4) + { + __m256d v_prod = _mm256_load_pd(parent_clv + i); + v_prod = _mm256_mul_pd(v_prod, v_scale_factor); + _mm256_store_pd(parent_clv + i, v_prod); + } + parent_scaler[n*rate_cats + k] += 1; + } + } + else + scale_mask = scale_mask & rate_mask; + /* reset pointers to point to the start of the next p-matrix, as the vectorization assumes a square states_padded * states_padded matrix, even though the real matrix is states * states_padded */ @@ -837,12 
+878,11 @@ PLL_EXPORT void pll_core_update_partial_ti_avx(unsigned int states, parent_clv += states_padded; right_clv += states_padded; } + /* if *all* entries of the site CLV were below the threshold then scale (all) entries by PLL_SCALE_FACTOR */ - if (scaling == 0xF) + if (scale_mask == 0xF) { - __m256d v_scale_factor = _mm256_set1_pd(PLL_SCALE_FACTOR); - parent_clv -= span_padded; for (i = 0; i < span_padded; i += 4) { @@ -1064,25 +1104,25 @@ PLL_EXPORT void pll_core_update_partial_ti_20x20_avx(unsigned int sites, const double * right_matrix, const unsigned int * right_scaler, const unsigned int * tipmap, - unsigned int tipmap_size) + unsigned int tipmap_size, + unsigned int attrib) { unsigned int states = 20; unsigned int states_padded = states; unsigned int maxstates = tipmap_size; - unsigned int scaling; unsigned int i,j,k,n,m; const double * lmat; const double * rmat; - unsigned int span = states_padded * rate_cats; + unsigned int span_padded = states_padded * rate_cats; unsigned int lstate; __m256d xmm0,xmm1,xmm2,xmm3; /* precompute a lookup table of four values per entry (one for each state), for all 16 states (including ambiguities) and for each rate category. 
*/ - double * lookup = pll_aligned_alloc(maxstates*span*sizeof(double), + double * lookup = pll_aligned_alloc(maxstates*span_padded*sizeof(double), PLL_ALIGNMENT_AVX); if (!lookup) { @@ -1136,31 +1176,48 @@ PLL_EXPORT void pll_core_update_partial_ti_20x20_avx(unsigned int sites, } } - /* update the parent scaler with the scaler of the right child */ - if (parent_scaler) - fill_parent_scaler(sites, parent_scaler, NULL, right_scaler); + /* scaling-related stuff */ + unsigned int scale_mode; /* 0 = none, 1 = per-site, 2 = per-rate */ + unsigned int scale_mask; + unsigned int init_mask; + __m256d v_scale_threshold = _mm256_set1_pd(PLL_SCALE_THRESHOLD); + __m256d v_scale_factor = _mm256_set1_pd(PLL_SCALE_FACTOR); - size_t displacement = (states_padded - states) * (states_padded); + if (!parent_scaler) + { + /* scaling disabled / not required */ + scale_mode = init_mask = 0; + } + else + { + /* determine the scaling mode and init the vars accordingly */ + scale_mode = (attrib & PLL_ATTRIB_RATE_SCALERS) ? 2 : 1; + init_mask = (scale_mode == 1) ? 0xF : 0; + const size_t scaler_size = (scale_mode == 2) ? sites * rate_cats : sites; + /* add up the scale vector of the two children if available */ + fill_parent_scaler(scaler_size, parent_scaler, NULL, right_scaler); + } - __m256d v_scale_threshold = _mm256_set1_pd(PLL_SCALE_THRESHOLD); + size_t displacement = (states_padded - states) * (states_padded); /* iterate over sites and compute CLV entries */ for (n = 0; n < sites; ++n) { rmat = right_matrix; - scaling = (parent_scaler) ? 
0xF : 0; + scale_mask = init_mask; lstate = (unsigned int) left_tipchar[n]; - unsigned int loffset = lstate*span; + unsigned int loffset = lstate*span_padded; for (k = 0; k < rate_cats; ++k) { + unsigned int rate_mask = 0xF; + /* iterate over quadruples of rows */ for (i = 0; i < states_padded; i += 4) { - __m256d v_termb0 = _mm256_setzero_pd(); __m256d v_termb1 = _mm256_setzero_pd(); __m256d v_termb2 = _mm256_setzero_pd(); @@ -1230,13 +1287,31 @@ PLL_EXPORT void pll_core_update_partial_ti_20x20_avx(unsigned int sites, __m256d v_prod = _mm256_mul_pd(v_terma_sum,v_termb_sum); - /* check for scaling */ + /* check if scaling is needed for the current rate category */ __m256d v_cmp = _mm256_cmp_pd(v_prod, v_scale_threshold, _CMP_LT_OS); - scaling = scaling & _mm256_movemask_pd(v_cmp); + rate_mask = rate_mask & _mm256_movemask_pd(v_cmp); _mm256_store_pd(parent_clv+i, v_prod); } + if (scale_mode == 2) + { + /* PER-RATE SCALING: if *all* entries of the *rate* CLV were below + * the threshold then scale (all) entries by PLL_SCALE_FACTOR */ + if (rate_mask == 0xF) + { + for (i = 0; i < states_padded; i += 4) + { + __m256d v_prod = _mm256_load_pd(parent_clv + i); + v_prod = _mm256_mul_pd(v_prod, v_scale_factor); + _mm256_store_pd(parent_clv + i, v_prod); + } + parent_scaler[n*rate_cats + k] += 1; + } + } + else + scale_mask = scale_mask & rate_mask; + /* reset pointers to point to the start of the next p-matrix, as the vectorization assumes a square states_padded * states_padded matrix, even though the real matrix is states * states_padded */ @@ -1248,18 +1323,16 @@ PLL_EXPORT void pll_core_update_partial_ti_20x20_avx(unsigned int sites, /* if *all* entries of the site CLV were below the threshold then scale (all) entries by PLL_SCALE_FACTOR */ - if (scaling == 0xF) + if (scale_mask == 0xF) { - __m256d v_scale_factor = _mm256_set1_pd(PLL_SCALE_FACTOR); - - parent_clv -= span; - for (i = 0; i < span; i += 4) + parent_clv -= span_padded; + for (i = 0; i < span_padded; i += 4) 
{ __m256d v_prod = _mm256_load_pd(parent_clv + i); v_prod = _mm256_mul_pd(v_prod,v_scale_factor); _mm256_store_pd(parent_clv + i, v_prod); } - parent_clv += span; + parent_clv += span_padded; parent_scaler[n] += 1; } } @@ -1280,7 +1353,6 @@ PLL_EXPORT void pll_core_update_partial_ii_avx(unsigned int states, unsigned int attrib) { unsigned int i,j,k,n; - unsigned int scaling; const double * lmat; const double * rmat; @@ -1305,27 +1377,44 @@ PLL_EXPORT void pll_core_update_partial_ii_avx(unsigned int states, return; } - /* add up the scale vector of the two children if available */ - if (parent_scaler) - fill_parent_scaler(sites, parent_scaler, left_scaler, right_scaler); + /* scaling-related stuff */ + unsigned int scale_mode; /* 0 = none, 1 = per-site, 2 = per-rate */ + unsigned int scale_mask; + unsigned int init_mask; + __m256d v_scale_threshold = _mm256_set1_pd(PLL_SCALE_THRESHOLD); + __m256d v_scale_factor = _mm256_set1_pd(PLL_SCALE_FACTOR); - size_t displacement = (states_padded - states) * (states_padded); + if (!parent_scaler) + { + /* scaling disabled / not required */ + scale_mode = init_mask = 0; + } + else + { + /* determine the scaling mode and init the vars accordingly */ + scale_mode = (attrib & PLL_ATTRIB_RATE_SCALERS) ? 2 : 1; + init_mask = (scale_mode == 1) ? 0xF : 0; + const size_t scaler_size = (scale_mode == 2) ? sites * rate_cats : sites; + /* add up the scale vector of the two children if available */ + fill_parent_scaler(scaler_size, parent_scaler, left_scaler, right_scaler); + } - __m256d v_scale_threshold = _mm256_set1_pd(PLL_SCALE_THRESHOLD); + size_t displacement = (states_padded - states) * (states_padded); /* compute CLV */ for (n = 0; n < sites; ++n) { lmat = left_matrix; rmat = right_matrix; - scaling = (parent_scaler) ? 
0xF : 0; + scale_mask = init_mask; for (k = 0; k < rate_cats; ++k) { + unsigned int rate_mask = 0xF; + /* iterate over quadruples of rows */ for (i = 0; i < states_padded; i += 4) { - __m256d v_terma0 = _mm256_setzero_pd(); __m256d v_termb0 = _mm256_setzero_pd(); __m256d v_terma1 = _mm256_setzero_pd(); @@ -1436,13 +1525,31 @@ PLL_EXPORT void pll_core_update_partial_ii_avx(unsigned int states, __m256d v_prod = _mm256_mul_pd(v_terma_sum,v_termb_sum); - /* check for scaling */ + /* check if scaling is needed for the current rate category */ __m256d v_cmp = _mm256_cmp_pd(v_prod, v_scale_threshold, _CMP_LT_OS); - scaling = scaling & _mm256_movemask_pd(v_cmp); + rate_mask = rate_mask & _mm256_movemask_pd(v_cmp); _mm256_store_pd(parent_clv+i, v_prod); } + if (scale_mode == 2) + { + /* PER-RATE SCALING: if *all* entries of the *rate* CLV were below + * the threshold then scale (all) entries by PLL_SCALE_FACTOR */ + if (rate_mask == 0xF) + { + for (i = 0; i < states_padded; i += 4) + { + __m256d v_prod = _mm256_load_pd(parent_clv + i); + v_prod = _mm256_mul_pd(v_prod, v_scale_factor); + _mm256_store_pd(parent_clv + i, v_prod); + } + parent_scaler[n*rate_cats + k] += 1; + } + } + else + scale_mask = scale_mask & rate_mask; + /* reset pointers to point to the start of the next p-matrix, as the vectorization assumes a square states_padded * states_padded matrix, even though the real matrix is states * states_padded */ @@ -1456,10 +1563,8 @@ PLL_EXPORT void pll_core_update_partial_ii_avx(unsigned int states, /* if *all* entries of the site CLV were below the threshold then scale (all) entries by PLL_SCALE_FACTOR */ - if (scaling == 0xF) + if (scale_mask == 0xF) { - __m256d v_scale_factor = _mm256_set1_pd(PLL_SCALE_FACTOR); - parent_clv -= span_padded; for (i = 0; i < span_padded; i += 4) { diff --git a/src/core_partials_avx2.c b/src/core_partials_avx2.c index a16824b..eaad146 100644 --- a/src/core_partials_avx2.c +++ b/src/core_partials_avx2.c @@ -21,7 +21,7 @@ #include "pll.h" 
-static void fill_parent_scaler(unsigned int sites, +static void fill_parent_scaler(unsigned int scaler_size, unsigned int * parent_scaler, const unsigned int * left_scaler, const unsigned int * right_scaler) @@ -29,19 +29,19 @@ static void fill_parent_scaler(unsigned int sites, unsigned int i; if (!left_scaler && !right_scaler) - memset(parent_scaler, 0, sizeof(unsigned int) * sites); + memset(parent_scaler, 0, sizeof(unsigned int) * scaler_size); else if (left_scaler && right_scaler) { - memcpy(parent_scaler, left_scaler, sizeof(unsigned int) * sites); - for (i = 0; i < sites; ++i) + memcpy(parent_scaler, left_scaler, sizeof(unsigned int) * scaler_size); + for (i = 0; i < scaler_size; ++i) parent_scaler[i] += right_scaler[i]; } else { if (left_scaler) - memcpy(parent_scaler, left_scaler, sizeof(unsigned int) * sites); + memcpy(parent_scaler, left_scaler, sizeof(unsigned int) * scaler_size); else - memcpy(parent_scaler, right_scaler, sizeof(unsigned int) * sites); + memcpy(parent_scaler, right_scaler, sizeof(unsigned int) * scaler_size); } } @@ -61,7 +61,6 @@ PLL_EXPORT void pll_core_update_partial_ti_avx2(unsigned int states, unsigned int attrib) { unsigned int i,j,k,n; - unsigned int scaling; const double * lmat; const double * rmat; @@ -101,18 +100,36 @@ PLL_EXPORT void pll_core_update_partial_ti_avx2(unsigned int states, right_matrix, right_scaler, tipmap, - tipmap_size); + tipmap_size, + attrib); return; } - /* add up the scale vector of the two children if available */ - if (parent_scaler) - fill_parent_scaler(sites, parent_scaler, NULL, right_scaler); + /* scaling-related stuff */ + unsigned int scale_mode; /* 0 = none, 1 = per-site, 2 = per-rate */ + unsigned int scale_mask; + unsigned int init_mask; + __m256d v_scale_threshold = _mm256_set1_pd(PLL_SCALE_THRESHOLD); + __m256d v_scale_factor = _mm256_set1_pd(PLL_SCALE_FACTOR); + + if (!parent_scaler) + { + /* scaling disabled / not required */ + scale_mode = init_mask = 0; + } + else + { + /* determine the 
scaling mode and init the vars accordingly */ + scale_mode = (attrib & PLL_ATTRIB_RATE_SCALERS) ? 2 : 1; + init_mask = (scale_mode == 1) ? 0xF : 0; + const size_t scaler_size = (scale_mode == 2) ? sites * rate_cats : sites; + /* add up the scale vector of the two children if available */ + fill_parent_scaler(scaler_size, parent_scaler, NULL, right_scaler); + } size_t displacement = (states_padded - states) * (states_padded); __m256i mask; - __m256d v_scale_threshold = _mm256_set1_pd(PLL_SCALE_THRESHOLD); /* compute CLV */ for (n = 0; n < sites; ++n) @@ -120,12 +137,14 @@ PLL_EXPORT void pll_core_update_partial_ti_avx2(unsigned int states, lmat = left_matrix; rmat = right_matrix; - scaling = (parent_scaler) ? 0xF : 0; + scale_mask = init_mask; lstate = tipmap[left_tipchars[n]]; for (k = 0; k < rate_cats; ++k) { + unsigned int rate_mask = 0xF; + /* iterate over quadruples of rows */ for (i = 0; i < states_padded; i += 4) { @@ -251,14 +270,32 @@ PLL_EXPORT void pll_core_update_partial_ti_avx2(unsigned int states, __m256d v_prod = _mm256_mul_pd(v_terma_sum,v_termb_sum); - /* check for scaling */ + /* check if scaling is needed for the current rate category */ __m256d v_cmp = _mm256_cmp_pd(v_prod, v_scale_threshold, _CMP_LT_OS); - scaling = scaling & _mm256_movemask_pd(v_cmp); + rate_mask = rate_mask & _mm256_movemask_pd(v_cmp); _mm256_store_pd(parent_clv+i, v_prod); } + if (scale_mode == 2) + { + /* PER-RATE SCALING: if *all* entries of the *rate* CLV were below + * the threshold then scale (all) entries by PLL_SCALE_FACTOR */ + if (rate_mask == 0xF) + { + for (i = 0; i < states_padded; i += 4) + { + __m256d v_prod = _mm256_load_pd(parent_clv + i); + v_prod = _mm256_mul_pd(v_prod, v_scale_factor); + _mm256_store_pd(parent_clv + i, v_prod); + } + parent_scaler[n*rate_cats + k] += 1; + } + } + else + scale_mask = scale_mask & rate_mask; + /* reset pointers to point to the start of the next p-matrix, as the vectorization assumes a square states_padded * states_padded 
matrix, even though the real matrix is states * states_padded */ @@ -268,12 +305,11 @@ PLL_EXPORT void pll_core_update_partial_ti_avx2(unsigned int states, parent_clv += states_padded; right_clv += states_padded; } + /* if *all* entries of the site CLV were below the threshold then scale (all) entries by PLL_SCALE_FACTOR */ - if (scaling == 0xF) + if (scale_mask == 0xF) { - __m256d v_scale_factor = _mm256_set1_pd(PLL_SCALE_FACTOR); - parent_clv -= span_padded; for (i = 0; i < span_padded; i += 4) { @@ -298,25 +334,25 @@ void pll_core_update_partial_ti_20x20_avx2(unsigned int sites, const double * right_matrix, const unsigned int * right_scaler, const unsigned int * tipmap, - unsigned int tipmap_size) + unsigned int tipmap_size, + unsigned int attrib) { unsigned int states = 20; unsigned int states_padded = states; unsigned int maxstates = tipmap_size; - unsigned int scaling; unsigned int i,j,k,n,m; const double * lmat; const double * rmat; - unsigned int span = states_padded * rate_cats; + unsigned int span_padded = states_padded * rate_cats; unsigned int lstate; __m256d xmm0,xmm1,xmm2,xmm3; /* precompute a lookup table of four values per entry (one for each state), for all 16 states (including ambiguities) and for each rate category. 
*/ - double * lookup = pll_aligned_alloc(maxstates*span*sizeof(double), + double * lookup = pll_aligned_alloc(maxstates*span_padded*sizeof(double), PLL_ALIGNMENT_AVX); if (!lookup) { @@ -370,31 +406,48 @@ void pll_core_update_partial_ti_20x20_avx2(unsigned int sites, } } - /* update the parent scaler with the scaler of the right child */ - if (parent_scaler) - fill_parent_scaler(sites, parent_scaler, NULL, right_scaler); + /* scaling-related stuff */ + unsigned int scale_mode; /* 0 = none, 1 = per-site, 2 = per-rate */ + unsigned int scale_mask; + unsigned int init_mask; + __m256d v_scale_threshold = _mm256_set1_pd(PLL_SCALE_THRESHOLD); + __m256d v_scale_factor = _mm256_set1_pd(PLL_SCALE_FACTOR); - size_t displacement = (states_padded - states) * (states_padded); + if (!parent_scaler) + { + /* scaling disabled / not required */ + scale_mode = init_mask = 0; + } + else + { + /* determine the scaling mode and init the vars accordingly */ + scale_mode = (attrib & PLL_ATTRIB_RATE_SCALERS) ? 2 : 1; + init_mask = (scale_mode == 1) ? 0xF : 0; + const size_t scaler_size = (scale_mode == 2) ? sites * rate_cats : sites; + /* add up the scale vector of the two children if available */ + fill_parent_scaler(scaler_size, parent_scaler, NULL, right_scaler); + } - __m256d v_scale_threshold = _mm256_set1_pd(PLL_SCALE_THRESHOLD); + size_t displacement = (states_padded - states) * (states_padded); /* iterate over sites and compute CLV entries */ for (n = 0; n < sites; ++n) { rmat = right_matrix; - scaling = (parent_scaler) ? 
0xF : 0; + scale_mask = init_mask; lstate = (unsigned int) left_tipchar[n]; - unsigned int loffset = lstate*span; + unsigned int loffset = lstate*span_padded; for (k = 0; k < rate_cats; ++k) { + unsigned int rate_mask = 0xF; + /* iterate over quadruples of rows */ for (i = 0; i < states_padded; i += 4) { - __m256d v_termb0 = _mm256_setzero_pd(); __m256d v_termb1 = _mm256_setzero_pd(); __m256d v_termb2 = _mm256_setzero_pd(); @@ -460,13 +513,31 @@ void pll_core_update_partial_ti_20x20_avx2(unsigned int sites, __m256d v_prod = _mm256_mul_pd(v_terma_sum,v_termb_sum); - /* check for scaling */ + /* check if scaling is needed for the current rate category */ __m256d v_cmp = _mm256_cmp_pd(v_prod, v_scale_threshold, _CMP_LT_OS); - scaling = scaling & _mm256_movemask_pd(v_cmp); + rate_mask = rate_mask & _mm256_movemask_pd(v_cmp); _mm256_store_pd(parent_clv+i, v_prod); } + if (scale_mode == 2) + { + /* PER-RATE SCALING: if *all* entries of the *rate* CLV were below + * the threshold then scale (all) entries by PLL_SCALE_FACTOR */ + if (rate_mask == 0xF) + { + for (i = 0; i < states_padded; i += 4) + { + __m256d v_prod = _mm256_load_pd(parent_clv + i); + v_prod = _mm256_mul_pd(v_prod, v_scale_factor); + _mm256_store_pd(parent_clv + i, v_prod); + } + parent_scaler[n*rate_cats + k] += 1; + } + } + else + scale_mask = scale_mask & rate_mask; + /* reset pointers to point to the start of the next p-matrix, as the vectorization assumes a square states_padded * states_padded matrix, even though the real matrix is states * states_padded */ @@ -478,18 +549,16 @@ void pll_core_update_partial_ti_20x20_avx2(unsigned int sites, /* if *all* entries of the site CLV were below the threshold then scale (all) entries by PLL_SCALE_FACTOR */ - if (scaling == 0xF) + if (scale_mask == 0xF) { - __m256d v_scale_factor = _mm256_set1_pd(PLL_SCALE_FACTOR); - - parent_clv -= span; - for (i = 0; i < span; i += 4) + parent_clv -= span_padded; + for (i = 0; i < span_padded; i += 4) { __m256d v_prod = 
_mm256_load_pd(parent_clv + i); v_prod = _mm256_mul_pd(v_prod,v_scale_factor); _mm256_store_pd(parent_clv + i, v_prod); } - parent_clv += span; + parent_clv += span_padded; parent_scaler[n] += 1; } } @@ -510,7 +579,6 @@ PLL_EXPORT void pll_core_update_partial_ii_avx2(unsigned int states, unsigned int attrib) { unsigned int i,j,k,n; - unsigned int scaling; const double * lmat; const double * rmat; @@ -536,27 +604,44 @@ PLL_EXPORT void pll_core_update_partial_ii_avx2(unsigned int states, return; } - /* add up the scale vector of the two children if available */ - if (parent_scaler) - fill_parent_scaler(sites, parent_scaler, left_scaler, right_scaler); + /* scaling-related stuff */ + unsigned int scale_mode; /* 0 = none, 1 = per-site, 2 = per-rate */ + unsigned int scale_mask; + unsigned int init_mask; + __m256d v_scale_threshold = _mm256_set1_pd(PLL_SCALE_THRESHOLD); + __m256d v_scale_factor = _mm256_set1_pd(PLL_SCALE_FACTOR); - size_t displacement = (states_padded - states) * (states_padded); + if (!parent_scaler) + { + /* scaling disabled / not required */ + scale_mode = init_mask = 0; + } + else + { + /* determine the scaling mode and init the vars accordingly */ + scale_mode = (attrib & PLL_ATTRIB_RATE_SCALERS) ? 2 : 1; + init_mask = (scale_mode == 1) ? 0xF : 0; + const size_t scaler_size = (scale_mode == 2) ? sites * rate_cats : sites; + /* add up the scale vector of the two children if available */ + fill_parent_scaler(scaler_size, parent_scaler, left_scaler, right_scaler); + } - __m256d v_scale_threshold = _mm256_set1_pd(PLL_SCALE_THRESHOLD); + size_t displacement = (states_padded - states) * (states_padded); /* compute CLV */ for (n = 0; n < sites; ++n) { lmat = left_matrix; rmat = right_matrix; - scaling = (parent_scaler) ? 
0xF : 0; + scale_mask = init_mask; for (k = 0; k < rate_cats; ++k) { + unsigned int rate_mask = 0xF; + /* iterate over quadruples of rows */ for (i = 0; i < states_padded; i += 4) { - __m256d v_terma0 = _mm256_setzero_pd(); __m256d v_termb0 = _mm256_setzero_pd(); __m256d v_terma1 = _mm256_setzero_pd(); @@ -664,13 +749,31 @@ PLL_EXPORT void pll_core_update_partial_ii_avx2(unsigned int states, __m256d v_prod = _mm256_mul_pd(v_terma_sum,v_termb_sum); - /* check for scaling */ + /* check if scaling is needed for the current rate category */ __m256d v_cmp = _mm256_cmp_pd(v_prod, v_scale_threshold, _CMP_LT_OS); - scaling = scaling & _mm256_movemask_pd(v_cmp); + rate_mask = rate_mask & _mm256_movemask_pd(v_cmp); _mm256_store_pd(parent_clv+i, v_prod); } + if (scale_mode == 2) + { + /* PER-RATE SCALING: if *all* entries of the *rate* CLV were below + * the threshold then scale (all) entries by PLL_SCALE_FACTOR */ + if (rate_mask == 0xF) + { + for (i = 0; i < states_padded; i += 4) + { + __m256d v_prod = _mm256_load_pd(parent_clv + i); + v_prod = _mm256_mul_pd(v_prod, v_scale_factor); + _mm256_store_pd(parent_clv + i, v_prod); + } + parent_scaler[n*rate_cats + k] += 1; + } + } + else + scale_mask = scale_mask & rate_mask; + /* reset pointers to point to the start of the next p-matrix, as the vectorization assumes a square states_padded * states_padded matrix, even though the real matrix is states * states_padded */ @@ -684,10 +787,8 @@ PLL_EXPORT void pll_core_update_partial_ii_avx2(unsigned int states, /* if *all* entries of the site CLV were below the threshold then scale (all) entries by PLL_SCALE_FACTOR */ - if (scaling == 0xF) + if (scale_mask == 0xF) { - __m256d v_scale_factor = _mm256_set1_pd(PLL_SCALE_FACTOR); - parent_clv -= span_padded; for (i = 0; i < span_padded; i += 4) { diff --git a/src/core_partials_sse.c b/src/core_partials_sse.c index 7c128c6..ca77790 100644 --- a/src/core_partials_sse.c +++ b/src/core_partials_sse.c @@ -412,8 +412,11 @@ PLL_EXPORT void 
pll_core_update_partial_tt_sse(unsigned int states, return; } + size_t scaler_size = (attrib & PLL_ATTRIB_RATE_SCALERS) ? + sites*rate_cats : sites; + if (parent_scaler) - memset(parent_scaler, 0, sizeof(unsigned int) * sites); + memset(parent_scaler, 0, sizeof(unsigned int) * scaler_size); for (n = 0; n < sites; ++n) { @@ -702,7 +705,6 @@ PLL_EXPORT void pll_core_update_partial_ii_sse(unsigned int states, unsigned int attrib) { unsigned int i,j,k,n; - unsigned int scaling; const double * lmat; const double * rmat; @@ -731,19 +733,39 @@ PLL_EXPORT void pll_core_update_partial_ii_sse(unsigned int states, __m128d xmm0,xmm1,xmm2,xmm3,xmm4,xmm5,xmm6; - /* add up the scale vectors of the two children if available */ + /* scaling stuff */ + unsigned int scale_mode; /* 0 = none, 1 = per-site, 2 = per-rate */ + unsigned int scale_mask; + unsigned int init_mask; + __m128d v_scale_threshold = _mm_set1_pd(PLL_SCALE_THRESHOLD); + __m128d v_scale_factor = _mm_set1_pd(PLL_SCALE_FACTOR); + if (parent_scaler) - fill_parent_scaler(sites, parent_scaler, left_scaler, right_scaler); + { + /* determine the scaling mode and init the vars accordingly */ + scale_mode = (attrib & PLL_ATTRIB_RATE_SCALERS) ? 2 : 1; + init_mask = (scale_mode == 1) ? 0x3 : 0; + const size_t scaler_size = (scale_mode == 2) ? sites * rate_cats : sites; + /* add up the scale vector of the two children if available */ + fill_parent_scaler(scaler_size, parent_scaler, left_scaler, right_scaler); + } + else + { + /* scaling disabled / not required */ + scale_mode = init_mask = 0; + } /* compute CLV */ for (n = 0; n < sites; ++n) { lmat = left_matrix; rmat = right_matrix; - scaling = (parent_scaler) ? 
1 : 0; + scale_mask = init_mask; for (k = 0; k < rate_cats; ++k) { + unsigned int rate_mask = 0x3; + for (i = 0; i < states_padded; i += 2) { __m128d v_terma0 = _mm_setzero_pd(); @@ -794,6 +816,11 @@ PLL_EXPORT void pll_core_update_partial_ii_sse(unsigned int states, xmm4 = _mm_hadd_pd(v_terma0,v_terma1); xmm5 = _mm_hadd_pd(v_termb0,v_termb1); xmm6 = _mm_mul_pd(xmm4,xmm5); + + /* check if scaling is needed for the current rate category */ + __m128d v_cmp = _mm_cmplt_pd(xmm6, v_scale_threshold); + rate_mask = rate_mask & _mm_movemask_pd(v_cmp); + _mm_store_pd(parent_clv+i,xmm6); } @@ -803,20 +830,33 @@ PLL_EXPORT void pll_core_update_partial_ii_sse(unsigned int states, lmat -= displacement; rmat -= displacement; - for (j = 0; j < states; ++j) - scaling = scaling && (parent_clv[j] < PLL_SCALE_THRESHOLD); + if (scale_mode == 2) + { + /* PER-RATE SCALING: if *all* entries of the *rate* CLV were below + * the threshold then scale (all) entries by PLL_SCALE_FACTOR */ + if (rate_mask == 0x3) + { + for (i = 0; i < states_padded; i += 2) + { + __m128d v_prod = _mm_load_pd(parent_clv + i); + v_prod = _mm_mul_pd(v_prod, v_scale_factor); + _mm_store_pd(parent_clv + i, v_prod); + } + parent_scaler[n*rate_cats + k] += 1; + } + } + else + scale_mask = scale_mask & rate_mask; parent_clv += states_padded; left_clv += states_padded; right_clv += states_padded; } - /* if *all* entries of the site CLV were below the threshold then scale - (all) entries by PLL_SCALE_FACTOR */ - if (scaling) - { - __m128d v_scale_factor = _mm_set_pd(PLL_SCALE_FACTOR, - PLL_SCALE_FACTOR); + /* PER-SITE SCALING: if *all* entries of the *site* CLV were below + * the threshold then scale (all) entries by PLL_SCALE_FACTOR */ + if (scale_mask == 0x3) + { parent_clv -= span_padded; for (i = 0; i < span_padded; i += 2) { @@ -1098,7 +1138,6 @@ PLL_EXPORT void pll_core_update_partial_ti_sse(unsigned int states, unsigned int tipmap_size, unsigned int attrib) { - int scaling; unsigned int i,j,k,n; unsigned int 
states_padded = (states+1) & 0xFFFFFFFE; unsigned int span_padded = states_padded * rate_cats; @@ -1124,26 +1163,46 @@ PLL_EXPORT void pll_core_update_partial_ti_sse(unsigned int states, return; } - if (parent_scaler) - fill_parent_scaler(sites, parent_scaler, NULL, right_scaler); - size_t displacement = (states_padded - states) * (states_padded); __m128d xmm0,xmm1,xmm2,xmm3,xmm4,xmm5,xmm6,xmm7; xmm7 = _mm_setzero_pd(); + /* scaling stuff */ + unsigned int scale_mode; /* 0 = none, 1 = per-site, 2 = per-rate */ + unsigned int scale_mask; + unsigned int init_mask; + __m128d v_scale_threshold = _mm_set1_pd(PLL_SCALE_THRESHOLD); + __m128d v_scale_factor = _mm_set1_pd(PLL_SCALE_FACTOR); + + if (parent_scaler) + { + /* determine the scaling mode and init the vars accordingly */ + scale_mode = (attrib & PLL_ATTRIB_RATE_SCALERS) ? 2 : 1; + init_mask = (scale_mode == 1) ? 0x3 : 0; + const size_t scaler_size = (scale_mode == 2) ? sites * rate_cats : sites; + /* add up the scale vector of the two children if available */ + fill_parent_scaler(scaler_size, parent_scaler, NULL, right_scaler); + } + else + { + /* scaling disabled / not required */ + scale_mode = init_mask = 0; + } + /* compute CLV */ for (n = 0; n < sites; ++n) { lmat = left_matrix; rmat = right_matrix; - - scaling = (parent_scaler) ? 
1 : 0; + scale_mask = init_mask; lstate = tipmap[left_tipchars[n]]; for (k = 0; k < rate_cats; ++k) { + unsigned int rate_mask = 0x3; + /* iterate over quadruples of rows */ for (i = 0; i < states_padded; i += 2) { @@ -1204,6 +1263,11 @@ PLL_EXPORT void pll_core_update_partial_ti_sse(unsigned int states, xmm4 = _mm_hadd_pd(v_terma0,v_terma1); xmm5 = _mm_hadd_pd(v_termb0,v_termb1); xmm6 = _mm_mul_pd(xmm4,xmm5); + + /* check if scaling is needed for the current rate category */ + __m128d v_cmp = _mm_cmplt_pd(xmm6, v_scale_threshold); + rate_mask = rate_mask & _mm_movemask_pd(v_cmp); + _mm_store_pd(parent_clv+i,xmm6); } @@ -1213,19 +1277,32 @@ PLL_EXPORT void pll_core_update_partial_ti_sse(unsigned int states, lmat -= displacement; rmat -= displacement; - for (j = 0; j < states; ++j) - scaling = scaling && (parent_clv[j] < PLL_SCALE_THRESHOLD); + if (scale_mode == 2) + { + /* PER-RATE SCALING: if *all* entries of the *rate* CLV were below + * the threshold then scale (all) entries by PLL_SCALE_FACTOR */ + if (rate_mask == 0x3) + { + for (i = 0; i < states_padded; i += 2) + { + __m128d v_prod = _mm_load_pd(parent_clv + i); + v_prod = _mm_mul_pd(v_prod, v_scale_factor); + _mm_store_pd(parent_clv + i, v_prod); + } + parent_scaler[n*rate_cats + k] += 1; + } + } + else + scale_mask = scale_mask & rate_mask; parent_clv += states_padded; right_clv += states_padded; } - /* if *all* entries of the site CLV were below the threshold then scale - (all) entries by PLL_SCALE_FACTOR */ - if (scaling) - { - __m128d v_scale_factor = _mm_set_pd(PLL_SCALE_FACTOR, - PLL_SCALE_FACTOR); + /* PER-SITE SCALING: if *all* entries of the *site* CLV were below + * the threshold then scale (all) entries by PLL_SCALE_FACTOR */ + if (scale_mask == 0x3) + { parent_clv -= span_padded; for (i = 0; i < span_padded; i += 2) { diff --git a/src/pll.h b/src/pll.h index 83daa9c..8405768 100644 --- a/src/pll.h +++ b/src/pll.h @@ -1192,7 +1192,8 @@ PLL_EXPORT void pll_core_update_partial_ti_20x20_avx(unsigned 
int sites, const double * right_matrix, const unsigned int * right_scaler, const unsigned int * tipmap, - unsigned int tipmap_size); + unsigned int tipmap_size, + unsigned int attrib); PLL_EXPORT void pll_core_update_partial_ii_avx(unsigned int states, unsigned int sites, @@ -1248,7 +1249,8 @@ void pll_core_update_partial_ti_20x20_avx2(unsigned int sites, const double * right_matrix, const unsigned int * right_scaler, const unsigned int * tipmap, - unsigned int tipmap_size); + unsigned int tipmap_size, + unsigned int attrib); PLL_EXPORT void pll_core_update_partial_ii_avx2(unsigned int states, unsigned int sites, @@ -1407,7 +1409,8 @@ double pll_core_edge_loglikelihood_ii_sse(unsigned int states, const double * invar_proportion, const int * invar_indices, const unsigned int * freqs_indices, - double * persite_lnl); + double * persite_lnl, + unsigned int attrib); PLL_EXPORT double pll_core_edge_loglikelihood_ii_4x4_sse(unsigned int sites, @@ -1441,7 +1444,8 @@ double pll_core_edge_loglikelihood_ti_sse(unsigned int states, const double * invar_proportion, const int * invar_indices, const unsigned int * freqs_indices, - double * persite_lnl); + double * persite_lnl, + unsigned int attrib); PLL_EXPORT double pll_core_edge_loglikelihood_ti_4x4_sse(unsigned int sites, @@ -1502,7 +1506,8 @@ PLL_EXPORT double pll_core_edge_loglikelihood_ii_avx(unsigned int states, const double * invar_proportion, const int * invar_indices, const unsigned int * freqs_indices, - double * persite_lnl); + double * persite_lnl, + unsigned int attrib); PLL_EXPORT double pll_core_edge_loglikelihood_ii_4x4_avx(unsigned int sites, unsigned int rate_cats, @@ -1549,7 +1554,8 @@ PLL_EXPORT double pll_core_edge_loglikelihood_ti_20x20_avx(unsigned int sites, const double * invar_proportion, const int * invar_indices, const unsigned int * freqs_indices, - double * persite_lnl); + double * persite_lnl, + unsigned int attrib); PLL_EXPORT double pll_core_edge_loglikelihood_ti_avx(unsigned int states, 
unsigned int sites, @@ -1565,7 +1571,8 @@ PLL_EXPORT double pll_core_edge_loglikelihood_ti_avx(unsigned int states, const double * invar_proportion, const int * invar_indices, const unsigned int * freqs_indices, - double * persite_lnl); + double * persite_lnl, + unsigned int attrib); PLL_EXPORT double pll_core_root_loglikelihood_4x4_avx(unsigned int sites, unsigned int rate_cats, @@ -1626,7 +1633,8 @@ double pll_core_edge_loglikelihood_ti_20x20_avx2(unsigned int sites, const double * invar_proportion, const int * invar_indices, const unsigned int * freqs_indices, - double * persite_lnl); + double * persite_lnl, + unsigned int attrib); PLL_EXPORT @@ -1644,7 +1652,8 @@ double pll_core_edge_loglikelihood_ii_avx2(unsigned int states, const double * invar_proportion, const int * invar_indices, const unsigned int * freqs_indices, - double * persite_lnl); + double * persite_lnl, + unsigned int attrib); #endif /* functions in core_pmatrix.c */