From d53249a631ca1110bc58ff427ff5ffc15a392793 Mon Sep 17 00:00:00 2001 From: kozlov Date: Mon, 15 May 2017 17:54:45 +0200 Subject: [PATCH] use padding in eigenvecs arrays (fixes #139) --- src/core_derivatives.c | 11 +++++++---- src/core_derivatives_avx.c | 10 +++++----- src/core_derivatives_avx2.c | 10 +++++----- src/core_derivatives_sse.c | 8 ++++---- src/core_pmatrix.c | 4 ++-- src/models.c | 16 ++++++++++++---- src/pll.c | 3 +++ 7 files changed, 38 insertions(+), 24 deletions(-) diff --git a/src/core_derivatives.c b/src/core_derivatives.c index a5fea48..2b6e205 100644 --- a/src/core_derivatives.c +++ b/src/core_derivatives.c @@ -146,6 +146,8 @@ PLL_EXPORT int pll_core_update_sumtable_ii(unsigned int states, const double * t_inv_eigenvecs; const double * t_freqs; + unsigned int states_padded = states; + #ifdef HAVE_SSE3 if (attrib & PLL_ATTRIB_ARCH_SSE && PLL_STAT(sse3_present)) { @@ -251,8 +253,9 @@ PLL_EXPORT int pll_core_update_sumtable_ii(unsigned int states, righterm = 0; for (k = 0; k < states; ++k) { - lefterm += t_clvp[k] * t_freqs[k] * t_inv_eigenvecs[k * states + j]; - righterm += t_eigenvecs[j * states + k] * t_clvc[k]; + lefterm += t_clvp[k] * t_freqs[k] * + t_inv_eigenvecs[k * states_padded + j]; + righterm += t_eigenvecs[j * states_padded + k] * t_clvc[k]; } sum[j] = lefterm * righterm; @@ -385,8 +388,8 @@ PLL_EXPORT int pll_core_update_sumtable_ti(unsigned int states, for (k = 0; k < states; ++k) { lefterm += (tipstate & 1) * t_freqs[k] - * t_inv_eigenvecs[k * states + j]; - righterm += t_eigenvecs[j * states + k] * t_clvc[k]; + * t_inv_eigenvecs[k * states_padded + j]; + righterm += t_eigenvecs[j * states_padded + k] * t_clvc[k]; tipstate >>= 1; } sum[j] = lefterm * righterm; diff --git a/src/core_derivatives_avx.c b/src/core_derivatives_avx.c index 54f03ff..992e41b 100644 --- a/src/core_derivatives_avx.c +++ b/src/core_derivatives_avx.c @@ -273,9 +273,9 @@ PLL_EXPORT int pll_core_update_sumtable_ii_avx(unsigned int states, for (k = 0; k < states; ++k) { tt_inv_eigenvecs[i * states_padded * states_padded + j * states_padded - + k] = inv_eigenvecs[i][k * states + j] * t_freqs[k]; + + k] = inv_eigenvecs[i][k * states_padded + j] * t_freqs[k]; tt_eigenvecs[i * states_padded * states_padded + j * states_padded - + k] = eigenvecs[i][j * states + k]; + + k] = eigenvecs[i][j * states_padded + k]; } } @@ -636,7 +636,7 @@ PLL_EXPORT int pll_core_update_sumtable_ti_avx(unsigned int states, for (k = 0; k < states_padded; ++k) { eigenvecs_padded[i*states_padded*states_padded + j*states_padded + k] = - (j < states && k < states) ? eigenvecs[i][j*states + k] : 0.; + (j < states && k < states) ? eigenvecs[i][j*states_padded + k] : 0.; } } @@ -659,7 +659,7 @@ PLL_EXPORT int pll_core_update_sumtable_ti_avx(unsigned int states, /* special case for non-ambiguous state */ __m256d v_freqs = _mm256_set1_pd(freqs[i][ss]); __m256d v_eigen = _mm256_load_pd(inv_eigenvecs[i] + - ss*states + j); + ss*states_padded + j); v_lefterm = _mm256_mul_pd(v_eigen, v_freqs); } else @@ -671,7 +671,7 @@ PLL_EXPORT int pll_core_update_sumtable_ti_avx(unsigned int states, { __m256d v_freqs = _mm256_set1_pd(freqs[i][k]); __m256d v_eigen = _mm256_load_pd(inv_eigenvecs[i] + - k*states + j); + k*states_padded + j); v_lefterm = _mm256_add_pd(v_lefterm, _mm256_mul_pd(v_eigen, v_freqs)); diff --git a/src/core_derivatives_avx2.c b/src/core_derivatives_avx2.c index 20216d5..bd345ec 100644 --- a/src/core_derivatives_avx2.c +++ b/src/core_derivatives_avx2.c @@ -98,9 +98,9 @@ PLL_EXPORT int pll_core_update_sumtable_ii_avx2(unsigned int states, for (k = 0; k < states; ++k) { tt_inv_eigenvecs[i * states_padded * states_padded + j * states_padded - + k] = inv_eigenvecs[i][k * states + j] * t_freqs[k]; + + k] = inv_eigenvecs[i][k * states_padded + j] * t_freqs[k]; tt_eigenvecs[i * states_padded * states_padded + j * states_padded - + k] = eigenvecs[i][j * states + k]; + + k] = eigenvecs[i][j * states_padded + k]; } } @@ -281,7 +281,7 @@ PLL_EXPORT int pll_core_update_sumtable_ti_avx2(unsigned int states, for (k = 0; k < states_padded; ++k) { eigenvecs_padded[i*states_padded*states_padded + j*states_padded + k] = - (j < states && k < states) ? eigenvecs[i][j*states + k] : 0.; + (j < states && k < states) ? eigenvecs[i][j*states_padded + k] : 0.; } } @@ -304,7 +304,7 @@ PLL_EXPORT int pll_core_update_sumtable_ti_avx2(unsigned int states, /* special case for non-ambiguous state */ __m256d v_freqs = _mm256_set1_pd(freqs[i][ss]); __m256d v_eigen = _mm256_load_pd(inv_eigenvecs[i] + - ss*states + j); + ss*states_padded + j); v_lefterm = _mm256_mul_pd(v_eigen, v_freqs); } else @@ -316,7 +316,7 @@ PLL_EXPORT int pll_core_update_sumtable_ti_avx2(unsigned int states, { __m256d v_freqs = _mm256_set1_pd(freqs[i][k]); __m256d v_eigen = _mm256_load_pd(inv_eigenvecs[i] + - k*states + j); + k*states_padded + j); v_lefterm = _mm256_fmadd_pd(v_eigen, v_freqs, v_lefterm); } diff --git a/src/core_derivatives_sse.c b/src/core_derivatives_sse.c index f9849dd..cf77d84 100644 --- a/src/core_derivatives_sse.c +++ b/src/core_derivatives_sse.c @@ -208,8 +208,8 @@ PLL_EXPORT int pll_core_update_sumtable_ii_sse(unsigned int states, for (k = 0; k < states; ++k) { - lterm += clvp[k] * freqs[k] * invev[k*states+j]; - rterm += ev[j*states+k] * clvc[k]; + lterm += clvp[k] * freqs[k] * invev[k*states_padded+j]; + rterm += ev[j*states_padded+k] * clvc[k]; } sum[j] = lterm*rterm; @@ -288,8 +288,8 @@ PLL_EXPORT int pll_core_update_sumtable_ti_sse(unsigned int states, for (k = 0; k < states; ++k) { - lterm += (tipstate & 1) * freqs[k] * invev[k*states+j]; - rterm += ev[j*states+k] * clvc[k]; + lterm += (tipstate & 1) * freqs[k] * invev[k*states_padded+j]; + rterm += ev[j*states_padded+k] * clvc[k]; tipstate >>= 1; } sum[j] = lterm*rterm; diff --git a/src/core_pmatrix.c b/src/core_pmatrix.c index 4fb1e84..77aedb5 100644 --- a/src/core_pmatrix.c +++ b/src/core_pmatrix.c @@ -214,7 +214,7 @@ PLL_EXPORT int pll_core_update_pmatrix(double ** pmatrix, for (j = 0; j < states; ++j) for (k = 0; k < states; ++k) - temp[j*states+k] = inv_evecs[j*states+k] * expd[k]; + temp[j*states+k] = inv_evecs[j*states_padded+k] * expd[k]; for (j = 0; j < states; ++j) { @@ -224,7 +224,7 @@ PLL_EXPORT int pll_core_update_pmatrix(double ** pmatrix, for (m = 0; m < states; ++m) { pmat[j*states_padded+k] += - temp[j*states+m] * evecs[m*states+k]; + temp[j*states+m] * evecs[m*states_padded+k]; } } } diff --git a/src/models.c b/src/models.c index 6081283..0148f92 100644 --- a/src/models.c +++ b/src/models.c @@ -262,6 +262,7 @@ PLL_EXPORT int pll_update_eigen(pll_partition_t * partition, double * subst_params = partition->subst_params[params_index]; unsigned int states = partition->states; + unsigned int states_padded = partition->states_padded; a = create_ratematrix(subst_params, freqs, @@ -291,25 +292,32 @@ PLL_EXPORT int pll_update_eigen(pll_partition_t * partition, /* store eigen vectors */ for (i = 0; i < states; ++i) - memcpy(eigenvecs + i*states, a[i], states*sizeof(double)); + memcpy(eigenvecs + i*states_padded, a[i], states*sizeof(double)); /* store eigen values */ memcpy(eigenvals, d, states*sizeof(double)); /* store inverse eigen vectors */ for (k = 0, i = 0; i < states; ++i) - for (j = i; j < states*states; j += states) + { + for (j = i; j < states_padded*states; j += states_padded) inv_eigenvecs[k++] = eigenvecs[j]; + /* account for padding */ + k += states_padded - states; + } + + assert(k == states_padded*states); + /* multiply the inverse eigen vectors from the left with sqrt(pi)^-1 */ for (i = 0; i < states; ++i) for (j = 0; j < states; ++j) - inv_eigenvecs[i*states+ j] /= sqrt(freqs[i]); + inv_eigenvecs[i*states_padded+ j] /= sqrt(freqs[i]); /* multiply the eigen vectors from the right with sqrt(pi) */ for (i = 0; i < states; ++i) for (j = 0; j < states; ++j) - eigenvecs[i*states+j] *= sqrt(freqs[j]); + eigenvecs[i*states_padded+j] *= sqrt(freqs[j]); partition->eigen_decomp_valid[params_index] = 1; diff --git a/src/pll.c b/src/pll.c index 50254aa..0944761 100644 --- a/src/pll.c +++ b/src/pll.c @@ -604,6 +604,7 @@ PLL_EXPORT pll_partition_t * pll_partition_create(unsigned int tips, "Unable to allocate enough memory for eigenvectors."); return PLL_FAILURE; } + memset(partition->eigenvecs[i], 0, states * states_padded * sizeof(double)); /* TODO: don't forget to add code for SSE/AVX */ } @@ -632,6 +633,7 @@ PLL_EXPORT pll_partition_t * pll_partition_create(unsigned int tips, "Unable to allocate enough memory for inverse eigenvectors."); return PLL_FAILURE; } + memset(partition->inv_eigenvecs[i], 0, states * states_padded * sizeof(double)); /* TODO: don't forget to add code for SSE/AVX */ } @@ -660,6 +662,7 @@ PLL_EXPORT pll_partition_t * pll_partition_create(unsigned int tips, "Unable to allocate enough memory for eigenvalues."); return PLL_FAILURE; } + memset(partition->eigenvals[i], 0, states_padded * sizeof(double)); /* TODO: don't forget to add code for SSE/AVX */ }