fix negative pmatrix values by applying a nice trick suggested by @bredelings in #129

test output files were modified accordingly
amkozlov committed May 26, 2017
1 parent 23f80cc commit 4498719
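
The change rewrites P(t) = V diag(exp(lambda_j t)) V^-1 as P(t) = I + V diag(expm1(lambda_j t)) V^-1: since V I V^-1 = I, the two forms are mathematically identical, but expm1() stays accurate when lambda_j t is close to zero, so the entries of P(t) cannot drift slightly negative through cancellation. A minimal scalar sketch of the idea follows; the function name, signature, and row-major layout of V and V^-1 are assumptions made for illustration, not the libpll API.

#include <math.h>

/* Illustrative sketch: P(t) = I + V * diag(expm1(lambda_j * t)) * Vinv,
 * with V and Vinv stored as dense row-major states x states matrices. */
static void pmatrix_expm1_sketch(double *pmat,
                                 const double *V, const double *Vinv,
                                 const double *evals, double t,
                                 unsigned int states)
{
  double expd[64];                          /* assumes states <= 64 */
  for (unsigned int j = 0; j < states; ++j)
    expd[j] = expm1(evals[j] * t);          /* exp(lambda*t) - 1, computed accurately */

  for (unsigned int j = 0; j < states; ++j)
    for (unsigned int k = 0; k < states; ++k)
    {
      double sum = (j == k) ? 1.0 : 0.0;    /* the identity added back in */
      for (unsigned int m = 0; m < states; ++m)
        sum += V[j * states + m] * expd[m] * Vinv[m * states + k];
      pmat[j * states + k] = sum;
    }
}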
Showing 11 changed files with 6,172 additions and 123 deletions.
51 changes: 29 additions & 22 deletions src/core_pmatrix.c
@@ -179,38 +179,44 @@ PLL_EXPORT int pll_core_update_pmatrix(double ** pmatrix,
}
else
{
/* NOTE: in order to deal with numerical issues in cases when Qt -> 0, we
* use a trick suggested by Ben Redelings and explained here:
* https://github.com/xflouris/libpll/issues/129#issuecomment-304004005
* In short, we use expm1() to compute (exp(Qt) - I), and then correct
* for this by adding an identity matrix I in the very end */

/* exponentiate eigenvalues */
if (pinvar > PLL_MISC_EPSILON)
{
for (j = 0; j < states; ++j)
expd[j] = exp(evals[j] * rates[n] * branch_lengths[i]
expd[j] = expm1(evals[j] * rates[n] * branch_lengths[i]
/ (1.0 - pinvar));
}
else
{
for (j = 0; j < states; ++j)
expd[j] = exp(evals[j] * rates[n] * branch_lengths[i]);
expd[j] = expm1(evals[j] * rates[n] * branch_lengths[i]);
}

/* check if all values of expd are approximately one */
for (k=0, j=0; j < states; ++j)
if ((expd[j] > PLL_ONE_MIN) && (expd[j] < PLL_ONE_MAX))
++k;

/* if yes, it means we are multiplying the inverse eigenvectors matrix
by the eigenvectors matrix, and essentially the resulting pmatrix is
the identity matrix. This is done to prevent having numerical issues
(negative entries in the pmatrix) which can occur due to the
different floating point representations of one in expd */
if (k == states)
{
/* set identity matrix */
for (j = 0; j < states; ++j)
for (k = 0; k < states; ++k)
pmat[j*states_padded + k] = (j == k) ? 1 : 0;

continue;
}
// /* check if all values of expd are approximately one */
// for (k=0, j=0; j < states; ++j)
// if ((expd[j] > PLL_ONE_MIN) && (expd[j] < PLL_ONE_MAX))
// ++k;
//
// /* if yes, it means we are multiplying the inverse eigenvectors matrix
// by the eigenvectors matrix, and essentially the resulting pmatrix is
// the identity matrix. This is done to prevent having numerical issues
// (negative entries in the pmatrix) which can occur due to the
// different floating point representations of one in expd */
// if (k == states && 0)
// {
// /* set identity matrix */
// for (j = 0; j < states; ++j)
// for (k = 0; k < states; ++k)
// pmat[j*states_padded + k] = (j == k) ? 1 : 0;
//
// continue;
// }

for (j = 0; j < states; ++j)
for (k = 0; k < states; ++k)
@@ -220,7 +226,8 @@ PLL_EXPORT int pll_core_update_pmatrix(double ** pmatrix,
{
for (k = 0; k < states; ++k)
{
pmat[j*states_padded+k] = 0;
// pmat[j*states_padded+k] = 0;
pmat[j*states_padded+k] = (j==k) ? 1.0 : 0;
for (m = 0; m < states; ++m)
{
pmat[j*states_padded+k] +=
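
Why the old exp() path could leave tiny negative entries as Qt -> 0: for very small x, exp(x) rounds to exactly 1.0 in double precision, so the signal needed for P(t) ~ I + Qt is gone before the eigenvector products are multiplied out, and those products then only approximately cancel back to the identity (the "different floating point representations of one" mentioned in the old comment), which is how small negative off-diagonal entries could appear. expm1() keeps that small signal intact. A standalone check, illustrative only and not part of the commit:

#include <math.h>
#include <stdio.h>

int main(void)
{
  double x = 1e-20;   /* e.g. an eigenvalue times a very short branch length */
  printf("exp(x) - 1 = %g\n", exp(x) - 1.0);  /* prints 0: the signal is lost */
  printf("expm1(x)   = %g\n", expm1(x));      /* prints 1e-20: signal is kept */
  return 0;
}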
135 changes: 83 additions & 52 deletions src/core_pmatrix_avx.c
@@ -78,6 +78,11 @@ PLL_EXPORT int pll_core_update_pmatrix_4x4_avx(double ** pmatrix,
ymm8 = _mm256_set1_pd(PLL_ONE_MIN);
ymm9 = _mm256_set1_pd(PLL_ONE_MAX);

__m256d idm_row0 = _mm256_set_pd(0., 0., 0., 1.0);
__m256d idm_row1 = _mm256_set_pd(0., 0., 1.0, 0.);
__m256d idm_row2 = _mm256_set_pd(0., 1.0, 0., 0.);
__m256d idm_row3 = _mm256_set_pd(1.0, 0., 0., 0.);

for (i = 0; i < count; ++i)
{
assert(branch_lengths[i] >= 0);
@@ -129,10 +134,17 @@ PLL_EXPORT int pll_core_update_pmatrix_4x4_avx(double ** pmatrix,

/* for now exponentiate non-vectorized */
_mm256_store_pd(expd,xmm2);
xmm1 = _mm256_set_pd(exp(expd[3]),
exp(expd[2]),
exp(expd[1]),
exp(expd[0]));

/* NOTE: in order to deal with numerical issues in cases when Qt -> 0, we
* use a trick suggested by Ben Redelings and explained here:
* https://github.com/xflouris/libpll/issues/129#issuecomment-304004005
* In short, we use expm1() to compute (exp(Qt) - I), and then correct
* for this by adding an identity matrix I in the very end */

xmm1 = _mm256_set_pd(expm1(expd[3]),
expm1(expd[2]),
expm1(expd[1]),
expm1(expd[0]));


/* check if all values of expd are approximately one */
@@ -147,7 +159,7 @@ PLL_EXPORT int pll_core_update_pmatrix_4x4_avx(double ** pmatrix,
(negative entries in the pmatrix) which can occur due to the
different floating point representations of one in expd. Otherwise,
proceed as normal and multiply eigenvector matrices with expd */
if (_mm256_movemask_pd(xmm5) == 0xF)
if (_mm256_movemask_pd(xmm5) == 0xF && 0)
{
_mm256_store_pd(pmat+0,xmm0);
_mm256_store_pd(pmat+4,xmm0);
@@ -207,6 +219,7 @@ PLL_EXPORT int pll_core_update_pmatrix_4x4_avx(double ** pmatrix,
ymm3 = _mm256_blend_pd(ymm0,ymm1,12);
ymm0 = _mm256_add_pd(ymm2,ymm3);

ymm0 = _mm256_add_pd(ymm0,idm_row0);
_mm256_store_pd(pmat+0,ymm0);

/* pmat row 1 */
@@ -230,6 +243,7 @@ PLL_EXPORT int pll_core_update_pmatrix_4x4_avx(double ** pmatrix,
ymm3 = _mm256_blend_pd(ymm0,ymm1,12);
ymm0 = _mm256_add_pd(ymm2,ymm3);

ymm0 = _mm256_add_pd(ymm0,idm_row1);
_mm256_store_pd(pmat+4,ymm0);

/* pmat row 2 */
@@ -253,6 +267,7 @@ PLL_EXPORT int pll_core_update_pmatrix_4x4_avx(double ** pmatrix,
ymm3 = _mm256_blend_pd(ymm0,ymm1,12);
ymm0 = _mm256_add_pd(ymm2,ymm3);

ymm0 = _mm256_add_pd(ymm0,idm_row2);
_mm256_store_pd(pmat+8,ymm0);

/* pmat row 3 */
@@ -276,6 +291,7 @@ PLL_EXPORT int pll_core_update_pmatrix_4x4_avx(double ** pmatrix,
ymm3 = _mm256_blend_pd(ymm0,ymm1,12);
ymm0 = _mm256_add_pd(ymm2,ymm3);

ymm0 = _mm256_add_pd(ymm0,idm_row3);
_mm256_store_pd(pmat+12,ymm0);
}
}
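
A note on the idm_row constants introduced above: _mm256_set_pd() lists its arguments from the highest lane down to lane 0, so _mm256_set_pd(0., 0., 0., 1.0) is (1, 0, 0, 0) in memory order. The four vectors are therefore the rows of the 4x4 identity matrix, and adding them after the SIMD product turns V diag(expm1(lambda t)) V^-1 into the full P(t). A scalar equivalent of those four additions, illustrative only; the helper name is hypothetical and it assumes the row stride of 4 used by the stores above:

/* add I to a 4x4 row-major pmat block (scalar view of the idm_row additions) */
static void add_identity_4x4(double *pmat)
{
  for (int row = 0; row < 4; ++row)
    pmat[row * 4 + row] += 1.0;   /* bump each diagonal entry */
}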
@@ -382,8 +398,8 @@ PLL_EXPORT int pll_core_update_pmatrix_20x20_avx(double ** pmatrix,
__m256d ymm0,ymm1,ymm2,ymm3,ymm4,ymm5,ymm6,ymm7,ymm8,ymm9;
__m256d zmm0,zmm1,zmm2,zmm3;

ymm8 = _mm256_set1_pd(PLL_ONE_MIN);
ymm9 = _mm256_set1_pd(PLL_ONE_MAX);
// ymm8 = _mm256_set1_pd(PLL_ONE_MIN);
// ymm9 = _mm256_set1_pd(PLL_ONE_MAX);

double * tran = NULL;
for (i = 0; i < count; ++i)
@@ -446,8 +462,14 @@ PLL_EXPORT int pll_core_update_pmatrix_20x20_avx(double ** pmatrix,
_mm256_store_pd(expd+k*4,xmm5);
}

/* NOTE: in order to deal with numerical issues in cases when Qt -> 0, we
* use a trick suggested by Ben Redelings and explained here:
* https://github.com/xflouris/libpll/issues/129#issuecomment-304004005
* In short, we use expm1() to compute (exp(Qt) - I), and then correct
* for this by adding an identity matrix I in the very end */

for (k = 0; k < 20; ++k)
expd[k] = exp(expd[k]);
expd[k] = expm1(expd[k]);

/* load expd */
xmm4 = _mm256_load_pd(expd+0);
@@ -456,50 +478,50 @@ PLL_EXPORT int pll_core_update_pmatrix_20x20_avx(double ** pmatrix,
xmm7 = _mm256_load_pd(expd+12);
xmm8 = _mm256_load_pd(expd+16);

/* check if all values of expd are approximately one */
ymm0 = _mm256_cmp_pd(xmm4, ymm8, _CMP_GT_OS);
ymm1 = _mm256_cmp_pd(xmm4, ymm9, _CMP_LT_OS);
ymm2 = _mm256_and_pd(ymm0,ymm1); /* expd[0..3] */
ymm0 = _mm256_cmp_pd(xmm5, ymm8, _CMP_GT_OS);
ymm1 = _mm256_cmp_pd(xmm5, ymm9, _CMP_LT_OS);
ymm3 = _mm256_and_pd(ymm0,ymm1); /* expd[4..7] */
ymm0 = _mm256_cmp_pd(xmm6, ymm8, _CMP_GT_OS);
ymm1 = _mm256_cmp_pd(xmm6, ymm9, _CMP_LT_OS);
ymm4 = _mm256_and_pd(ymm0,ymm1); /* expd[8..11] */
ymm0 = _mm256_cmp_pd(xmm7, ymm8, _CMP_GT_OS);
ymm1 = _mm256_cmp_pd(xmm7, ymm9, _CMP_LT_OS);
ymm5 = _mm256_and_pd(ymm0,ymm1); /* expd[12..15] */
ymm0 = _mm256_cmp_pd(xmm8, ymm8, _CMP_GT_OS);
ymm1 = _mm256_cmp_pd(xmm8, ymm9, _CMP_LT_OS);
ymm6 = _mm256_and_pd(ymm0,ymm1); /* expd[16..19] */

/* AND the results of comparisons and check the mask to see if all
values of expd are approximately one. If they are, it means we are
multiplying the inverse eigenvectors matrix by the eigenvectors
matrix, and essentially the resulting pmatrix is the identity matrix.
This is done to prevent having numerical issues (negative entries in
the pmatrix) which can occur due to the different floating point
representations of one in expd. Otherwise, proceed as normal and
multiply eigenvector matrices with expd */
ymm0 = _mm256_and_pd(ymm2,ymm3);
ymm1 = _mm256_and_pd(ymm0,ymm4);
ymm0 = _mm256_and_pd(ymm1,ymm5);
ymm1 = _mm256_and_pd(ymm0,ymm6);
if (_mm256_movemask_pd(ymm1) == 0xF)
{
xmm0 = _mm256_setzero_pd();
for (j = 0; j < 20; ++j)
{
_mm256_store_pd(pmat+0,xmm0);
_mm256_store_pd(pmat+4,xmm0);
_mm256_store_pd(pmat+8,xmm0);
_mm256_store_pd(pmat+12,xmm0);
_mm256_store_pd(pmat+16,xmm0);
pmat[j] = 1;
pmat += 20;
}
continue;
}
// /* check if all values of expd are approximately one */
// ymm0 = _mm256_cmp_pd(xmm4, ymm8, _CMP_GT_OS);
// ymm1 = _mm256_cmp_pd(xmm4, ymm9, _CMP_LT_OS);
// ymm2 = _mm256_and_pd(ymm0,ymm1); /* expd[0..3] */
// ymm0 = _mm256_cmp_pd(xmm5, ymm8, _CMP_GT_OS);
// ymm1 = _mm256_cmp_pd(xmm5, ymm9, _CMP_LT_OS);
// ymm3 = _mm256_and_pd(ymm0,ymm1); /* expd[4..7] */
// ymm0 = _mm256_cmp_pd(xmm6, ymm8, _CMP_GT_OS);
// ymm1 = _mm256_cmp_pd(xmm6, ymm9, _CMP_LT_OS);
// ymm4 = _mm256_and_pd(ymm0,ymm1); /* expd[8..11] */
// ymm0 = _mm256_cmp_pd(xmm7, ymm8, _CMP_GT_OS);
// ymm1 = _mm256_cmp_pd(xmm7, ymm9, _CMP_LT_OS);
// ymm5 = _mm256_and_pd(ymm0,ymm1); /* expd[12..15] */
// ymm0 = _mm256_cmp_pd(xmm8, ymm8, _CMP_GT_OS);
// ymm1 = _mm256_cmp_pd(xmm8, ymm9, _CMP_LT_OS);
// ymm6 = _mm256_and_pd(ymm0,ymm1); /* expd[16..19] */
//
// /* AND the results of comparisons and check the mask to see if all
// values of expd are approximately one. If they are, it means we are
// multiplying the inverse eigenvectors matrix by the eigenvectors
// matrix, and essentially the resulting pmatrix is the identity matrix.
// This is done to prevent having numerical issues (negative entries in
// the pmatrix) which can occur due to the different floating point
// representations of one in expd. Otherwise, proceed as normal and
// multiply eigenvector matrices with expd */
// ymm0 = _mm256_and_pd(ymm2,ymm3);
// ymm1 = _mm256_and_pd(ymm0,ymm4);
// ymm0 = _mm256_and_pd(ymm1,ymm5);
// ymm1 = _mm256_and_pd(ymm0,ymm6);
// if (_mm256_movemask_pd(ymm1) == 0xF)
// {
// xmm0 = _mm256_setzero_pd();
// for (j = 0; j < 20; ++j)
// {
// _mm256_store_pd(pmat+0,xmm0);
// _mm256_store_pd(pmat+4,xmm0);
// _mm256_store_pd(pmat+8,xmm0);
// _mm256_store_pd(pmat+12,xmm0);
// _mm256_store_pd(pmat+16,xmm0);
// pmat[j] = 1;
// pmat += 20;
// }
// continue;
// }

/* compute temp matrix */
for (k = 0; k < 400; k += 20)
@@ -565,6 +587,15 @@ PLL_EXPORT int pll_core_update_pmatrix_20x20_avx(double ** pmatrix,
pmat += 4;
}
}

/* add identity matrix */
pmat -= 400;
for (j = 0; j < 20; ++j)
{
pmat[j] += 1.0;
pmat += 20;
}

#ifdef DEBUG
for (j = 0; j < 20; ++j)
for (k = 0; k < 20; ++k)
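
The "add identity matrix" block above exploits the fact that the preceding product loop has advanced pmat past the whole 20x20 matrix (400 doubles): pmat -= 400 rewinds to its start, and pmat[j] += 1.0 followed by pmat += 20 then touches entry j of row j, i.e. exactly the diagonal. A scalar equivalent, illustrative only, with a hypothetical helper name:

/* scalar view of: pmat -= 400; for (j = 0; j < 20; ++j) { pmat[j] += 1.0; pmat += 20; } */
static void add_identity_20x20(double *pmat_end)
{
  double *p = pmat_end - 400;      /* rewind to the start of this p-matrix */
  for (int j = 0; j < 20; ++j)
    p[j * 20 + j] += 1.0;          /* bump each diagonal entry             */
}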
17 changes: 16 additions & 1 deletion src/core_pmatrix_avx2.c
@@ -181,8 +188,14 @@ int pll_core_update_pmatrix_20x20_avx2(double ** pmatrix,
_mm256_store_pd(expd+k*4,xmm5);
}

/* NOTE: in order to deal with numerical issues in cases when Qt -> 0, we
* use a trick suggested by Ben Redelings and explained here:
* https://github.com/xflouris/libpll/issues/129#issuecomment-304004005
* In short, we use expm1() to compute (exp(Qt) - I), and then correct
* for this by adding an identity matrix I in the very end */

for (k = 0; k < 20; ++k)
expd[k] = exp(expd[k]);
expd[k] = expm1(expd[k]);

/* load expd */
xmm4 = _mm256_load_pd(expd+0);
@@ -255,6 +261,15 @@ int pll_core_update_pmatrix_20x20_avx2(double ** pmatrix,
pmat += 4;
}
}

/* add identity matrix */
pmat -= 400;
for (j = 0; j < 20; ++j)
{
pmat[j] += 1.0;
pmat += 20;
}

}
}
