diff --git a/test/out/scaling.out b/test/out/scaling.out new file mode 100644 index 0000000..11d335c --- /dev/null +++ b/test/out/scaling.out @@ -0,0 +1,64 @@ +scaling = per-site, alpha = 0.050000, rates = [ 0.000000 0.000001 0.005299 3.994700 ] +recompute P-matrices: 3997 +recompute CLVs: 1998 +scaler 1995: [ 8 6 21 21 4 ] +per-site logLH: [ -1565.4625 -1079.1468 -inf -inf -812.9101 ] +logLH INNER-INNER at edge (3997-3995): -inf, LH derivatives: -nan, -nan +logLH TIP-INNER at edge (3997-1999): -inf, LH derivatives: -nan, -nan + +scaling = per-rate, alpha = 0.050000, rates = [ 0.000000 0.000001 0.005299 3.994700 ] +recompute P-matrices: 3997 +recompute CLVs: 1998 +scaler 1995: [ 8+(50 22 7 0) 6+(50 25 9 0) 15+(48 16 0 5) 15+(47 16 0 5) 4+(50 27 11 0) ] +per-site logLH: [ -1565.4625 -1079.1468 -2838.4636 -2813.7556 -812.9101 ] +logLH INNER-INNER at edge (3997-3995): -9109.7386, LH derivatives: -2.7826, 2.9738 +logLH TIP-INNER at edge (3997-1999): -9109.7386, LH derivatives: -1.5936, 1.2950 + +scaling = per-site, alpha = 0.200000, rates = [ 0.000531 0.033775 0.383658 3.582035 ] +recompute P-matrices: 3997 +recompute CLVs: 1998 +scaler 1995: [ 8 6 21 21 4 ] +per-site logLH: [ -1567.6326 -1076.7894 -inf -inf -827.4514 ] +logLH INNER-INNER at edge (3997-3995): -inf, LH derivatives: -nan, -nan +logLH TIP-INNER at edge (3997-1999): -inf, LH derivatives: -nan, -nan + +scaling = per-rate, alpha = 0.200000, rates = [ 0.000531 0.033775 0.383658 3.582035 ] +recompute P-matrices: 3997 +recompute CLVs: 1998 +scaler 1995: [ 8+(11 3 1 0) 6+(14 6 1 0) 12+(8 0 2 8) 12+(7 0 2 8) 4+(15 7 3 0) ] +per-site logLH: [ -1567.6326 -1076.7894 -2277.3326 -2279.8917 -827.4514 ] +logLH INNER-INNER at edge (3997-3995): -8029.0976, LH derivatives: -2.9702, 3.0540 +logLH TIP-INNER at edge (3997-1999): -8029.0976, LH derivatives: -1.6910, 1.3293 + +scaling = per-site, alpha = 2.000000, rates = [ 0.293275 0.655014 1.069990 1.981722 ] +recompute P-matrices: 3997 +recompute CLVs: 1998 +scaler 1995: [ 9 6 13 13 5 ] +per-site logLH: [ -1603.6166 -1100.1947 -2388.7163 -2399.8925 -913.5515 ] +logLH INNER-INNER at edge (3997-3995): -8405.9717, LH derivatives: -3.0736, 3.5362 +logLH TIP-INNER at edge (3997-1999): -8405.9717, LH derivatives: -1.4613, 1.3836 + +scaling = per-rate, alpha = 2.000000, rates = [ 0.293275 0.655014 1.069990 1.981722 ] +recompute P-matrices: 3997 +recompute CLVs: 1998 +scaler 1995: [ 9+(0 0 0 0) 6+(2 1 0 0) 13+(0 4 6 7) 13+(0 4 6 7) 5+(2 1 0 0) ] +per-site logLH: [ -1603.6166 -1100.1947 -2388.7163 -2399.8925 -913.5515 ] +logLH INNER-INNER at edge (3997-3995): -8405.9717, LH derivatives: -3.0736, 3.5362 +logLH TIP-INNER at edge (3997-1999): -8405.9717, LH derivatives: -1.4613, 1.3836 + +scaling = per-site, alpha = 99.000000, rates = [ 0.875297 0.964547 1.029683 1.130473 ] +recompute P-matrices: 3997 +recompute CLVs: 1998 +scaler 1995: [ 9 6 18 18 5 ] +per-site logLH: [ -1658.3038 -1166.0014 -3313.6724 -3314.2351 -1008.3317 ] +logLH INNER-INNER at edge (3997-3995): -10460.5445, LH derivatives: -2.4820, 3.3850 +logLH TIP-INNER at edge (3997-1999): -10460.5445, LH derivatives: -1.0708, 1.1693 + +scaling = per-rate, alpha = 99.000000, rates = [ 0.875297 0.964547 1.029683 1.130473 ] +recompute P-matrices: 3997 +recompute CLVs: 1998 +scaler 1995: [ 9+(0 0 0 0) 6+(0 0 0 0) 18+(0 1 1 1) 18+(0 1 1 1) 5+(0 0 0 0) ] +per-site logLH: [ -1658.3038 -1166.0014 -3313.6724 -3314.2351 -1008.3317 ] +logLH INNER-INNER at edge (3997-3995): -10460.5445, LH derivatives: -2.4820, 3.3850 +logLH TIP-INNER at edge (3997-1999): -10460.5445, LH derivatives: -1.0708, 1.1693 + diff --git a/test/src/README.md b/test/src/README.md index b395f9d..db63f96 100644 --- a/test/src/README.md +++ b/test/src/README.md @@ -95,6 +95,10 @@ and 4 different proportions of invariant sites, from 0.0 to 0.9 Validate Nearest Neighbor Interchange moves. +## scaling + +Validate CLV scaling on large trees (per-site and per-rate scaling modes) + ## treemove-spr Validate Subtree Prunning and Regrafting moves. diff --git a/test/src/scaling.c b/test/src/scaling.c new file mode 100644 index 0000000..00a3f78 --- /dev/null +++ b/test/src/scaling.c @@ -0,0 +1,338 @@ +/* + Copyright (C) 2017 Alexey Kozlov, Diego Darriba, Tomas Flouri + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License as + published by the Free Software Foundation, either version 3 of the + License, or (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with this program. If not, see . + + Contact: Alexey Kozlov , + Exelixis Lab, Heidelberg Instutute for Theoretical Studies + Schloss-Wolfsbrunnenweg 35, D-69118 Heidelberg, Germany +*/ +#include "common.h" + +#define N_STATES_NT 4 +#define N_STATES_AA 4 +#define N_CAT_GAMMA 4 +#define N_SITES 5 +#define FLOAT_PRECISION 4 + +#define TREEFILE "testdata/2000.tree" + +#define MIN(a,b) (ascale_buffers > 0 && clv_idx >= p->tips) ? + clv_idx - p->tips : PLL_SCALE_BUFFER_NONE; +} + +void show_scaler(const pll_partition_t * p, unsigned int clv_idx) +{ + unsigned int scaler = scaler_idx(p, clv_idx); + if (scaler != PLL_SCALE_BUFFER_NONE) + { + unsigned int i, j; + printf("scaler %u: [ ", scaler); + for (i = 0; i < p->sites; ++i) + { + if (p->attributes & PLL_ATTRIB_RATE_SCALERS) + { + unsigned int * scalev = p->scale_buffer[scaler] + i*p->rate_cats; + unsigned int min_scaler = 1e6; + for (j = 0; j < p->rate_cats; ++j) + if (scalev[j] < min_scaler) + min_scaler = scalev[j]; + + printf("%d+(", min_scaler); + for (j = 0; j < p->rate_cats; ++j) + { + unsigned int s = scalev[j] - min_scaler; + printf("%u", MIN(50, s)); + if (j < p->rate_cats-1) + printf(" "); + } + printf(") "); + } + else + printf("%u ", p->scale_buffer[scaler][i]); + } + printf("]\n"); + } +} + +void show_clv(const pll_partition_t * p, unsigned int clv_idx, unsigned int site) +{ + unsigned int i; + unsigned int clv_span = p->states * p->rate_cats; + printf("CLV %u site %u (size=%u): [ ", clv_idx, site, clv_span); + for (i = (site-1)*clv_span; i < site*clv_span; ++i) + printf("%e ", p->clv[clv_idx][i]); + printf("]\n"); + +} + +pll_partition_t * init_partition(unsigned int attrs) +{ + unsigned int i,j; + pll_partition_t * p = pll_partition_create(tree->tip_count, + tree->inner_count, + N_STATES_NT, + N_SITES, + 1, /* rate matrices */ + 2*tree->tip_count - 3, + N_CAT_GAMMA, + tree->inner_count, + attrs); + + if (!p) + fatal("ERROR creating partition: %s\n", pll_errmsg); + + size_t len = strlen(nt_alphabet); + char * seq = (char *) calloc(N_SITES+1, sizeof(char)); + for (i = 0; i < tree->tip_count; ++i) + { + for (j = 0; j < N_SITES; ++j) + { + seq[j] = (i < 1500) ? nt_alphabet[j % len] : nt_alphabet[(i + j) % len]; + } + + pll_set_tip_states(p, tree->nodes[i]->clv_index, pll_map_nt, seq); + } + + free(seq); + + pll_set_frequencies(p, 0, frequencies); + pll_set_subst_params(p, 0, subst_params); + + return p; +} + +void init(unsigned int attrs) +{ + unsigned int i; + + tree = pll_utree_parse_newick(TREEFILE); + + if (!tree) + fatal("ERROR reading tree file: %s\n", pll_errmsg); + + part_sitescale = init_partition(attrs); + part_ratescale = init_partition(attrs | PLL_ATTRIB_RATE_SCALERS); + + /* build fixed structures */ + unsigned int nodes_count = tree->inner_count + tree->tip_count; + unsigned int branch_count = nodes_count - 1; + travbuffer = (pll_unode_t **)malloc(nodes_count * sizeof(pll_unode_t *)); + branch_lengths = (double *)malloc(branch_count * sizeof(double)); + matrix_indices = (unsigned int *)malloc(branch_count * sizeof(unsigned int)); + operations = (pll_operation_t *)malloc(tree->inner_count * + sizeof(pll_operation_t)); + persite_lnl = (double *)malloc(part_sitescale->sites * sizeof(double)); + sumtable = pll_aligned_alloc(part_sitescale->sites * + part_sitescale->rate_cats * + part_sitescale->states_padded * sizeof(double), + part_sitescale->alignment); + + root = tree->nodes[tree->tip_count+tree->inner_count-1]; + + /* get full traversal */ + pll_utree_traverse(root, + PLL_TREE_TRAVERSE_POSTORDER, + cb_full_traversal, + travbuffer, + &traversal_size); + + pll_utree_create_operations(travbuffer, + traversal_size, + branch_lengths, + matrix_indices, + operations, + &matrix_count, + &ops_count); + + for (i = 0; i < matrix_count; ++i) + branch_lengths[i] = (i % 2 == 0) ? 1.0 : 1e-6; +} + +void comp_derivatives(pll_partition_t * partition, pll_unode_t * r, + double brlen, double * d1, double * d2) +{ + if (!pll_update_sumtable(partition, r->clv_index, r->back->clv_index, + r->scaler_index, r->back->scaler_index, + params_indices, sumtable)) + { + fatal("ERROR computing sumtable: %s\n", pll_errmsg); + } + + if (!pll_compute_likelihood_derivatives(partition, + r->scaler_index, + r->back->scaler_index, + brlen, + params_indices, + sumtable, + d1, d2)) + { + fatal("ERROR computing derivatives: %s\n", pll_errmsg); + } +} + +int eval(pll_partition_t * partition, double alpha) +{ + double rate_cats[N_CAT_GAMMA]; + unsigned int i; + double d_f, dd_f; + + pll_compute_gamma_cats(alpha, n_cat_gamma, rate_cats); + pll_set_category_rates(partition, rate_cats); + + printf("scaling = "); + if (partition->attributes & PLL_ATTRIB_RATE_SCALERS) + printf("per-rate"); + else if (partition->scale_buffers > 0) + printf("per-site"); + else + printf("OFF"); + + printf(", alpha = %lf, rates = [ ", alpha); + for (i = 0; i < n_cat_gamma; ++i) + printf("%lf ", rate_cats[i]); + printf("]\n"); + + printf("recompute P-matrices: %d\n", matrix_count); + pll_update_prob_matrices(partition, + params_indices, + matrix_indices, + branch_lengths, + matrix_count); + + printf("recompute CLVs: %d\n", ops_count); + pll_update_partials(partition, operations, ops_count); + + show_scaler(partition, root->back->clv_index); +// show_clv(partition, root->back->clv_index, 53); +// pll_show_pmatrix(partition, root->pmatrix_index, 9); +// pll_show_pmatrix(partition, root->pmatrix_index-1, 9); + + // test derivatives + comp_derivatives(partition, root, 1.0, &d_f, &dd_f); + + double ii_loglh = pll_compute_edge_loglikelihood(partition, + root->clv_index, + root->scaler_index, + root->back->clv_index, + root->back->scaler_index, + root->pmatrix_index, + params_indices, + persite_lnl); + + printf("per-site logLH: [ "); + for (i = 0; i < partition->sites; ++i) + printf("%.4lf ", persite_lnl[i]); + printf("]\n"); + printf("logLH INNER-INNER at edge (%u-%u): %.4lf, LH derivatives: %.4lf, %.4lf\n", + root->clv_index, root->back->clv_index, ii_loglh, + d_f, dd_f); + + pll_unode_t * new_root = root->next->next; + pll_unode_t * tip = new_root->back; + assert(tip->clv_index < tree->tip_count); + + // change root CLV orientation + pll_operation_t op; + op.parent_clv_index = new_root->clv_index; + op.child1_clv_index = new_root->next->back->clv_index; + op.child2_clv_index = new_root->next->next->back->clv_index; + op.child1_matrix_index = new_root->next->pmatrix_index; + op.child2_matrix_index = new_root->next->next->pmatrix_index; + op.parent_scaler_index = scaler_idx(partition, op.parent_clv_index); + op.child1_scaler_index = scaler_idx(partition, op.child1_clv_index); + op.child2_scaler_index = scaler_idx(partition, op.child2_clv_index); + + pll_update_partials(partition, &op, 1); + + // test derivatives + comp_derivatives(partition, new_root, 1.0, &d_f, &dd_f); + + double ti_loglh = pll_compute_edge_loglikelihood(partition, + new_root->clv_index, + new_root->scaler_index, + tip->clv_index, + tip->scaler_index, + tip->pmatrix_index, + params_indices, + persite_lnl); + + printf("logLH TIP-INNER at edge (%u-%u): %.4lf, LH derivatives: %.4lf, %.4lf\n\n", + new_root->clv_index, new_root->back->clv_index, ti_loglh, + d_f, dd_f); + + if (fabs(ii_loglh - ti_loglh) > 1e-4) + { + printf("ERROR: Likelihood mismatch INNER-INNER vs TIP-INNER (see above)\n"); + } + + return 1; +} + +void cleanup() +{ + free(travbuffer); + free(branch_lengths); + free(operations); + free(matrix_indices); + free(persite_lnl); + free(sumtable); + pll_partition_destroy(part_noscale); + pll_partition_destroy(part_sitescale); + pll_partition_destroy(part_ratescale); + pll_utree_destroy(tree, NULL); +} + + +int main(int argc, char * argv[]) +{ + /* check attributes */ + unsigned int attributes = get_attributes(argc, argv); + + init(attributes); + + unsigned int i; + for (i = 0; i < alpha_count; ++i) + { + eval(part_sitescale, alphas[i]); + eval(part_ratescale, alphas[i]); + } + + cleanup(); + + return (0); +}