Skip to content

Commit

Permalink
use padding in eigenvecs arrays (fixes #139)
Browse files Browse the repository at this point in the history
  • Loading branch information
amkozlov committed May 15, 2017
1 parent a948423 commit d53249a
Show file tree
Hide file tree
Showing 7 changed files with 38 additions and 24 deletions.
11 changes: 7 additions & 4 deletions src/core_derivatives.c
Original file line number Diff line number Diff line change
Expand Up @@ -146,6 +146,8 @@ PLL_EXPORT int pll_core_update_sumtable_ii(unsigned int states,
const double * t_inv_eigenvecs;
const double * t_freqs;

unsigned int states_padded = states;

#ifdef HAVE_SSE3
if (attrib & PLL_ATTRIB_ARCH_SSE && PLL_STAT(sse3_present))
{
Expand Down Expand Up @@ -251,8 +253,9 @@ PLL_EXPORT int pll_core_update_sumtable_ii(unsigned int states,
righterm = 0;
for (k = 0; k < states; ++k)
{
lefterm += t_clvp[k] * t_freqs[k] * t_inv_eigenvecs[k * states + j];
righterm += t_eigenvecs[j * states + k] * t_clvc[k];
lefterm += t_clvp[k] * t_freqs[k] *
t_inv_eigenvecs[k * states_padded + j];
righterm += t_eigenvecs[j * states_padded + k] * t_clvc[k];
}
sum[j] = lefterm * righterm;

Expand Down Expand Up @@ -385,8 +388,8 @@ PLL_EXPORT int pll_core_update_sumtable_ti(unsigned int states,
for (k = 0; k < states; ++k)
{
lefterm += (tipstate & 1) * t_freqs[k]
* t_inv_eigenvecs[k * states + j];
righterm += t_eigenvecs[j * states + k] * t_clvc[k];
* t_inv_eigenvecs[k * states_padded + j];
righterm += t_eigenvecs[j * states_padded + k] * t_clvc[k];
tipstate >>= 1;
}
sum[j] = lefterm * righterm;
Expand Down
10 changes: 5 additions & 5 deletions src/core_derivatives_avx.c
Original file line number Diff line number Diff line change
Expand Up @@ -273,9 +273,9 @@ PLL_EXPORT int pll_core_update_sumtable_ii_avx(unsigned int states,
for (k = 0; k < states; ++k)
{
tt_inv_eigenvecs[i * states_padded * states_padded + j * states_padded
+ k] = inv_eigenvecs[i][k * states + j] * t_freqs[k];
+ k] = inv_eigenvecs[i][k * states_padded + j] * t_freqs[k];
tt_eigenvecs[i * states_padded * states_padded + j * states_padded
+ k] = eigenvecs[i][j * states + k];
+ k] = eigenvecs[i][j * states_padded + k];
}
}

Expand Down Expand Up @@ -636,7 +636,7 @@ PLL_EXPORT int pll_core_update_sumtable_ti_avx(unsigned int states,
for (k = 0; k < states_padded; ++k)
{
eigenvecs_padded[i*states_padded*states_padded + j*states_padded + k] =
(j < states && k < states) ? eigenvecs[i][j*states + k] : 0.;
(j < states && k < states) ? eigenvecs[i][j*states_padded + k] : 0.;
}
}

Expand All @@ -659,7 +659,7 @@ PLL_EXPORT int pll_core_update_sumtable_ti_avx(unsigned int states,
/* special case for non-ambiguous state */
__m256d v_freqs = _mm256_set1_pd(freqs[i][ss]);
__m256d v_eigen = _mm256_load_pd(inv_eigenvecs[i] +
ss*states + j);
ss*states_padded + j);
v_lefterm = _mm256_mul_pd(v_eigen, v_freqs);
}
else
Expand All @@ -671,7 +671,7 @@ PLL_EXPORT int pll_core_update_sumtable_ti_avx(unsigned int states,
{
__m256d v_freqs = _mm256_set1_pd(freqs[i][k]);
__m256d v_eigen = _mm256_load_pd(inv_eigenvecs[i] +
k*states + j);
k*states_padded + j);

v_lefterm = _mm256_add_pd(v_lefterm,
_mm256_mul_pd(v_eigen, v_freqs));
Expand Down
10 changes: 5 additions & 5 deletions src/core_derivatives_avx2.c
Original file line number Diff line number Diff line change
Expand Up @@ -98,9 +98,9 @@ PLL_EXPORT int pll_core_update_sumtable_ii_avx2(unsigned int states,
for (k = 0; k < states; ++k)
{
tt_inv_eigenvecs[i * states_padded * states_padded + j * states_padded
+ k] = inv_eigenvecs[i][k * states + j] * t_freqs[k];
+ k] = inv_eigenvecs[i][k * states_padded + j] * t_freqs[k];
tt_eigenvecs[i * states_padded * states_padded + j * states_padded
+ k] = eigenvecs[i][j * states + k];
+ k] = eigenvecs[i][j * states_padded + k];
}
}

Expand Down Expand Up @@ -281,7 +281,7 @@ PLL_EXPORT int pll_core_update_sumtable_ti_avx2(unsigned int states,
for (k = 0; k < states_padded; ++k)
{
eigenvecs_padded[i*states_padded*states_padded + j*states_padded + k] =
(j < states && k < states) ? eigenvecs[i][j*states + k] : 0.;
(j < states && k < states) ? eigenvecs[i][j*states_padded + k] : 0.;
}
}

Expand All @@ -304,7 +304,7 @@ PLL_EXPORT int pll_core_update_sumtable_ti_avx2(unsigned int states,
/* special case for non-ambiguous state */
__m256d v_freqs = _mm256_set1_pd(freqs[i][ss]);
__m256d v_eigen = _mm256_load_pd(inv_eigenvecs[i] +
ss*states + j);
ss*states_padded + j);
v_lefterm = _mm256_mul_pd(v_eigen, v_freqs);
}
else
Expand All @@ -316,7 +316,7 @@ PLL_EXPORT int pll_core_update_sumtable_ti_avx2(unsigned int states,
{
__m256d v_freqs = _mm256_set1_pd(freqs[i][k]);
__m256d v_eigen = _mm256_load_pd(inv_eigenvecs[i] +
k*states + j);
k*states_padded + j);

v_lefterm = _mm256_fmadd_pd(v_eigen, v_freqs, v_lefterm);
}
Expand Down
8 changes: 4 additions & 4 deletions src/core_derivatives_sse.c
Original file line number Diff line number Diff line change
Expand Up @@ -208,8 +208,8 @@ PLL_EXPORT int pll_core_update_sumtable_ii_sse(unsigned int states,

for (k = 0; k < states; ++k)
{
lterm += clvp[k] * freqs[k] * invev[k*states+j];
rterm += ev[j*states+k] * clvc[k];
lterm += clvp[k] * freqs[k] * invev[k*states_padded+j];
rterm += ev[j*states_padded+k] * clvc[k];
}

sum[j] = lterm*rterm;
Expand Down Expand Up @@ -288,8 +288,8 @@ PLL_EXPORT int pll_core_update_sumtable_ti_sse(unsigned int states,

for (k = 0; k < states; ++k)
{
lterm += (tipstate & 1) * freqs[k] * invev[k*states+j];
rterm += ev[j*states+k] * clvc[k];
lterm += (tipstate & 1) * freqs[k] * invev[k*states_padded+j];
rterm += ev[j*states_padded+k] * clvc[k];
tipstate >>= 1;
}
sum[j] = lterm*rterm;
Expand Down
4 changes: 2 additions & 2 deletions src/core_pmatrix.c
Original file line number Diff line number Diff line change
Expand Up @@ -214,7 +214,7 @@ PLL_EXPORT int pll_core_update_pmatrix(double ** pmatrix,

for (j = 0; j < states; ++j)
for (k = 0; k < states; ++k)
temp[j*states+k] = inv_evecs[j*states+k] * expd[k];
temp[j*states+k] = inv_evecs[j*states_padded+k] * expd[k];

for (j = 0; j < states; ++j)
{
Expand All @@ -224,7 +224,7 @@ PLL_EXPORT int pll_core_update_pmatrix(double ** pmatrix,
for (m = 0; m < states; ++m)
{
pmat[j*states_padded+k] +=
temp[j*states+m] * evecs[m*states+k];
temp[j*states+m] * evecs[m*states_padded+k];
}
}
}
Expand Down
16 changes: 12 additions & 4 deletions src/models.c
Original file line number Diff line number Diff line change
Expand Up @@ -262,6 +262,7 @@ PLL_EXPORT int pll_update_eigen(pll_partition_t * partition,
double * subst_params = partition->subst_params[params_index];

unsigned int states = partition->states;
unsigned int states_padded = partition->states_padded;

a = create_ratematrix(subst_params,
freqs,
Expand Down Expand Up @@ -291,25 +292,32 @@ PLL_EXPORT int pll_update_eigen(pll_partition_t * partition,

/* store eigen vectors */
for (i = 0; i < states; ++i)
memcpy(eigenvecs + i*states, a[i], states*sizeof(double));
memcpy(eigenvecs + i*states_padded, a[i], states*sizeof(double));

/* store eigen values */
memcpy(eigenvals, d, states*sizeof(double));

/* store inverse eigen vectors */
for (k = 0, i = 0; i < states; ++i)
for (j = i; j < states*states; j += states)
{
for (j = i; j < states_padded*states; j += states_padded)
inv_eigenvecs[k++] = eigenvecs[j];

/* account for padding */
k += states_padded - states;
}

assert(k == states_padded*states);

/* multiply the inverse eigen vectors from the left with sqrt(pi)^-1 */
for (i = 0; i < states; ++i)
for (j = 0; j < states; ++j)
inv_eigenvecs[i*states+ j] /= sqrt(freqs[i]);
inv_eigenvecs[i*states_padded+ j] /= sqrt(freqs[i]);

/* multiply the eigen vectors from the right with sqrt(pi) */
for (i = 0; i < states; ++i)
for (j = 0; j < states; ++j)
eigenvecs[i*states+j] *= sqrt(freqs[j]);
eigenvecs[i*states_padded+j] *= sqrt(freqs[j]);

partition->eigen_decomp_valid[params_index] = 1;

Expand Down
3 changes: 3 additions & 0 deletions src/pll.c
Original file line number Diff line number Diff line change
Expand Up @@ -604,6 +604,7 @@ PLL_EXPORT pll_partition_t * pll_partition_create(unsigned int tips,
"Unable to allocate enough memory for eigenvectors.");
return PLL_FAILURE;
}
memset(partition->eigenvecs[i], 0, states * states_padded * sizeof(double));
/* TODO: don't forget to add code for SSE/AVX */
}

Expand Down Expand Up @@ -632,6 +633,7 @@ PLL_EXPORT pll_partition_t * pll_partition_create(unsigned int tips,
"Unable to allocate enough memory for inverse eigenvectors.");
return PLL_FAILURE;
}
memset(partition->inv_eigenvecs[i], 0, states * states_padded * sizeof(double));
/* TODO: don't forget to add code for SSE/AVX */
}

Expand Down Expand Up @@ -660,6 +662,7 @@ PLL_EXPORT pll_partition_t * pll_partition_create(unsigned int tips,
"Unable to allocate enough memory for eigenvalues.");
return PLL_FAILURE;
}
memset(partition->eigenvals[i], 0, states_padded * sizeof(double));
/* TODO: don't forget to add code for SSE/AVX */
}

Expand Down

0 comments on commit d53249a

Please sign in to comment.