From 18ae95cf742e007ebc6fdd98f9550e4bbe8405b0 Mon Sep 17 00:00:00 2001 From: computations Date: Wed, 27 Jan 2021 13:41:28 +0100 Subject: [PATCH 1/3] Updates raxml to allow for floating point site weights --- libs/pll-modules | 2 +- src/MSA.cpp | 8 ++++---- src/MSA.hpp | 8 ++++---- src/Model.hpp | 4 ++-- src/PartitionInfo.cpp | 2 +- src/PartitionedMSA.cpp | 2 +- src/PartitionedMSAView.cpp | 6 +++--- src/PartitionedMSAView.hpp | 6 +++--- src/TreeInfo.cpp | 15 ++++++++------- src/TreeInfo.hpp | 7 ++++--- src/bootstrap/BootstrapGenerator.cpp | 8 ++++---- src/bootstrap/BootstrapGenerator.hpp | 6 +++--- src/io/binary_io.cpp | 4 ++-- src/main.cpp | 16 ++++++++-------- src/types.hpp | 5 +++++ test/src/PartitionedMSAViewTest.cpp | 8 ++++---- 16 files changed, 57 insertions(+), 50 deletions(-) diff --git a/libs/pll-modules b/libs/pll-modules index 7c6669e9..83b49af8 160000 --- a/libs/pll-modules +++ b/libs/pll-modules @@ -1 +1 @@ -Subproject commit 7c6669e9751e08d378b2f95c44b5b17ddd64637c +Subproject commit 83b49af85e7e1ffaa081cd3926f5b6d7007742f3 diff --git a/src/MSA.cpp b/src/MSA.cpp index 51908fa7..b4b46b66 100644 --- a/src/MSA.cpp +++ b/src/MSA.cpp @@ -130,13 +130,13 @@ void MSA::compress_patterns(const pll_state_t * charmap, bool store_backmap) _length = _pll_msa->length; if (_weights.empty()) - _weights = WeightVector(w, w + _pll_msa->length); + _weights = FloatWeightVector(w, w + _pll_msa->length); else { /* external weights specified -> use site_pattern_map to generate a compressed weight vector */ assert(_weights.size() == uncompressed_length); assert(!_site_pattern_map.empty()); - WeightVector new_weights(_length, 0); + FloatWeightVector new_weights(_length, 0); for (size_t i = 0; i < _site_pattern_map.size(); ++i) new_weights[_site_pattern_map[i]] += _weights[i]; _weights = std::move(new_weights); @@ -326,13 +326,13 @@ void MSA::update_num_sites() _num_sites = std::accumulate(_weights.begin(), _weights.end(), 0); } -void MSA::weights(const WeightVector& v) +void MSA::weights(const FloatWeightVector& v) { _weights = v; update_num_sites(); } -void MSA::weights(WeightVector&& v) +void MSA::weights(FloatWeightVector&& v) { _weights = std::move(v); update_num_sites(); diff --git a/src/MSA.hpp b/src/MSA.hpp index 9c367d99..35cc20c8 100644 --- a/src/MSA.hpp +++ b/src/MSA.hpp @@ -46,7 +46,7 @@ class MSA size_t length() const { return _length; } size_t num_sites() const { return _num_sites; } size_t num_patterns() const { return _weights.size(); } - const WeightVector& weights() const {return _weights; } + const FloatWeightVector& weights() const {return _weights; } const NameIdMap& label_id_map() const { return _label_id_map; } const WeightVector site_pattern_map() const { return _site_pattern_map; } const pll_msa_t * pll_msa() const; @@ -71,8 +71,8 @@ class MSA doubleVector state_freqs() const; void num_sites(const unsigned int sites) { _num_sites = sites; } - void weights(const WeightVector& v); - void weights(WeightVector&& v); + void weights(const FloatWeightVector& v); + void weights(FloatWeightVector&& v); void remove_sites(const std::vector& site_indices); @@ -97,7 +97,7 @@ class MSA container _sequences; container _labels; NameIdMap _label_id_map; - WeightVector _weights; + FloatWeightVector _weights; WeightVector _site_pattern_map; ProbVectorList _probs; RangeList _local_seq_ranges; diff --git a/src/Model.hpp b/src/Model.hpp index 95e90a7f..8a250d00 100644 --- a/src/Model.hpp +++ b/src/Model.hpp @@ -139,7 +139,7 @@ class Model bool param_estimated(int param) const; AscBiasCorrection ascbias_type() const { return _ascbias_type; } - const WeightVector& ascbias_weights() const { return _ascbias_weights; } + const FloatWeightVector& ascbias_weights() const { return _ascbias_weights; } /* per alignment site, given in elements (NOT in bytes) */ size_t clv_entry_size() const { return _num_states * _num_ratecats; } @@ -192,7 +192,7 @@ class Model double _brlen_scaler; AscBiasCorrection _ascbias_type; - WeightVector _ascbias_weights; + FloatWeightVector _ascbias_weights; std::vector _submodels; diff --git a/src/PartitionInfo.cpp b/src/PartitionInfo.cpp index 47affb3e..fbc93085 100644 --- a/src/PartitionInfo.cpp +++ b/src/PartitionInfo.cpp @@ -81,7 +81,7 @@ void PartitionInfo::compress_patterns(bool store_backmap) pllmod_msa_stats_t * PartitionInfo::compute_stats(unsigned long stats_mask) const { - const unsigned int * weights = _msa.weights().empty() ? nullptr : _msa.weights().data(); + const double * weights = _msa.weights().empty() ? nullptr : _msa.weights().data(); pllmod_msa_stats_t * stats = pllmod_msa_compute_stats(_msa.pll_msa(), _model.num_states(), _model.charmap(), weights, stats_mask); diff --git a/src/PartitionedMSA.cpp b/src/PartitionedMSA.cpp index 0f15bab0..b075a435 100644 --- a/src/PartitionedMSA.cpp +++ b/src/PartitionedMSA.cpp @@ -165,7 +165,7 @@ void PartitionedMSA::split_msa() if (!_full_msa.weights().empty()) { auto& msa = _part_list[p].msa(); - WeightVector w(msa.length()); + FloatWeightVector w(msa.length()); const auto full_weights = _full_msa.weights(); assert(full_weights.size() == site_part_map().size()); diff --git a/src/PartitionedMSAView.cpp b/src/PartitionedMSAView.cpp index aec7bf88..c4705e15 100644 --- a/src/PartitionedMSAView.cpp +++ b/src/PartitionedMSAView.cpp @@ -97,7 +97,7 @@ const IDSet& PartitionedMSAView::exclude_sites(size_t part_id) const return _excluded_sites.at(part_id); } -void PartitionedMSAView::site_weights(const WeightVectorList& weights) +void PartitionedMSAView::site_weights(const FloatWeightVectorList& weights) { if (weights.size() != part_count()) throw runtime_error("PartitionedMSAView: invalid weight vector size"); @@ -105,7 +105,7 @@ void PartitionedMSAView::site_weights(const WeightVectorList& weights) _site_weights.clear(); _site_weights.resize(part_count()); size_t part_id = 0; - for (const WeightVector& v: weights) + for (const auto& v: weights) { site_weights(part_id, v); part_id++; @@ -113,7 +113,7 @@ void PartitionedMSAView::site_weights(const WeightVectorList& weights) assert(_site_weights.size() == part_count()); } -void PartitionedMSAView::site_weights(size_t part_id, const WeightVector& weights) +void PartitionedMSAView::site_weights(size_t part_id, const FloatWeightVector& weights) { _site_weights.resize(part_count()); diff --git a/src/PartitionedMSAView.hpp b/src/PartitionedMSAView.hpp index cd7054bd..54f22ed0 100644 --- a/src/PartitionedMSAView.hpp +++ b/src/PartitionedMSAView.hpp @@ -36,8 +36,8 @@ class PartitionedMSAView void exclude_sites(size_t part_id, IDVector site_ids); const IDSet& exclude_sites(size_t part_id) const; - void site_weights(const WeightVectorList& site_weights); - void site_weights(size_t part_id, const WeightVector& site_weights); + void site_weights(const FloatWeightVectorList& site_weights); + void site_weights(size_t part_id, const FloatWeightVector& site_weights); private: const PartitionedMSA * _parted_msa; @@ -45,7 +45,7 @@ class PartitionedMSAView NameMap _taxon_name_map; IDSet _excluded_taxa; std::vector _excluded_sites; - WeightVectorList _site_weights; + FloatWeightVectorList _site_weights; mutable IDVector _orig_taxon_ids; diff --git a/src/TreeInfo.cpp b/src/TreeInfo.cpp index f0d80c6a..080e5947 100644 --- a/src/TreeInfo.cpp +++ b/src/TreeInfo.cpp @@ -2,6 +2,7 @@ #include "TreeInfo.hpp" #include "ParallelContext.hpp" +#include "types.hpp" using namespace std; @@ -9,13 +10,13 @@ TreeInfo::TreeInfo (const Options &opts, const Tree& tree, const PartitionedMSA& const IDVector& tip_msa_idmap, const PartitionAssignment& part_assign) { - init(opts, tree, parted_msa, tip_msa_idmap, part_assign, std::vector()); + init(opts, tree, parted_msa, tip_msa_idmap, part_assign, FloatWeightVectorList()); } TreeInfo::TreeInfo (const Options &opts, const Tree& tree, const PartitionedMSA& parted_msa, const IDVector& tip_msa_idmap, const PartitionAssignment& part_assign, - const std::vector& site_weights) + const FloatWeightVectorList& site_weights) { init(opts, tree, parted_msa, tip_msa_idmap, part_assign, site_weights); } @@ -23,7 +24,7 @@ TreeInfo::TreeInfo (const Options &opts, const Tree& tree, const PartitionedMSA& void TreeInfo::init(const Options &opts, const Tree& tree, const PartitionedMSA& parted_msa, const IDVector& tip_msa_idmap, const PartitionAssignment& part_assign, - const std::vector& site_weights) + const FloatWeightVectorList& site_weights) { _brlen_min = opts.brlen_min; _brlen_max = opts.brlen_max; @@ -458,7 +459,7 @@ void assign(Model& model, const TreeInfo& treeinfo, size_t partition_id) model.brlen_scaler(pll_treeinfo.brlen_scalers[partition_id]); } -void build_clv(ProbVector::const_iterator probs, size_t sites, WeightVector::const_iterator weights, +void build_clv(ProbVector::const_iterator probs, size_t sites, FloatWeightVector::const_iterator weights, pll_partition_t* partition, bool normalize, std::vector& clv) { const auto states = partition->states; @@ -535,7 +536,7 @@ void set_partition_tips(const Options& opts, const MSA& msa, const IDVector& tip void set_partition_tips(const Options& opts, const MSA& msa, const IDVector& tip_msa_idmap, const PartitionRange& part_region, pll_partition_t* partition, const pll_state_t * charmap, - const WeightVector& weights) + const FloatWeightVector& weights) { assert(!weights.empty()); @@ -544,7 +545,7 @@ void set_partition_tips(const Options& opts, const MSA& msa, const IDVector& tip const auto pend = pstart + plen; /* compress weights array by removing all zero entries */ - uintVector comp_weights; + FloatWeightVector comp_weights; for (size_t j = pstart; j < pend; ++j) { if (weights[j] > 0) @@ -595,7 +596,7 @@ void set_partition_tips(const Options& opts, const MSA& msa, const IDVector& tip pll_partition_t* create_pll_partition(const Options& opts, const PartitionInfo& pinfo, const IDVector& tip_msa_idmap, - const PartitionRange& part_region, const uintVector& weights) + const PartitionRange& part_region, const FloatWeightVector& weights) { const MSA& msa = pinfo.msa(); const Model& model = pinfo.model(); diff --git a/src/TreeInfo.hpp b/src/TreeInfo.hpp index ff7098f8..9256b707 100644 --- a/src/TreeInfo.hpp +++ b/src/TreeInfo.hpp @@ -6,6 +6,7 @@ #include "Options.hpp" #include "AncestralStates.hpp" #include "loadbalance/PartitionAssignment.hpp" +#include "types.hpp" struct spr_round_params { @@ -31,7 +32,7 @@ class TreeInfo const IDVector& tip_msa_idmap, const PartitionAssignment& part_assign); TreeInfo (const Options &opts, const Tree& tree, const PartitionedMSA& parted_msa, const IDVector& tip_msa_idmap, const PartitionAssignment& part_assign, - const std::vector& site_weights); + const FloatWeightVectorList& site_weights); virtual ~TreeInfo (); @@ -74,7 +75,7 @@ class TreeInfo void init(const Options &opts, const Tree& tree, const PartitionedMSA& parted_msa, const IDVector& tip_msa_idmap, const PartitionAssignment& part_assign, - const std::vector& site_weights); + const FloatWeightVectorList& site_weights); void assert_lh_improvement(double old_lh, double new_lh, const std::string& where = ""); }; @@ -85,6 +86,6 @@ void assign(Model& model, const TreeInfo& treeinfo, size_t partition_id); pll_partition_t* create_pll_partition(const Options& opts, const PartitionInfo& pinfo, const IDVector& tip_msa_idmap, - const PartitionRange& part_region, const uintVector& weights); + const PartitionRange& part_region, const FloatWeightVector& weights); #endif /* RAXML_TREEINFO_HPP_ */ diff --git a/src/bootstrap/BootstrapGenerator.cpp b/src/bootstrap/BootstrapGenerator.cpp index 2392761f..e52c833a 100644 --- a/src/bootstrap/BootstrapGenerator.cpp +++ b/src/bootstrap/BootstrapGenerator.cpp @@ -24,19 +24,19 @@ BootstrapReplicate BootstrapGenerator::generate(const PartitionedMSA& parted_msa return result; } -WeightVector BootstrapGenerator::generate(const MSA& msa, unsigned long random_seed) +FloatWeightVector BootstrapGenerator::generate(const MSA& msa, unsigned long random_seed) { RandomGenerator gen(random_seed); return generate(msa, gen); } -WeightVector BootstrapGenerator::generate(const MSA& msa, RandomGenerator& gen) +FloatWeightVector BootstrapGenerator::generate(const MSA& msa, RandomGenerator& gen) { unsigned int orig_len = msa.num_sites(); unsigned int comp_len = msa.length(); - WeightVector w_buf(orig_len, 0); + FloatWeightVector w_buf(orig_len, 0); std::uniform_int_distribution distr(0, orig_len-1); @@ -50,7 +50,7 @@ WeightVector BootstrapGenerator::generate(const MSA& msa, RandomGenerator& gen) return w_buf; else { - WeightVector result(comp_len, 0); + FloatWeightVector result(comp_len, 0); auto orig_weights = msa.weights(); assert(!orig_weights.empty()); diff --git a/src/bootstrap/BootstrapGenerator.hpp b/src/bootstrap/BootstrapGenerator.hpp index 707b9091..97e0a6fa 100644 --- a/src/bootstrap/BootstrapGenerator.hpp +++ b/src/bootstrap/BootstrapGenerator.hpp @@ -5,7 +5,7 @@ struct BootstrapReplicate { - WeightVectorList site_weights; + FloatWeightVectorList site_weights; }; typedef std::vector BootstrapReplicateList; @@ -20,10 +20,10 @@ class BootstrapGenerator ~BootstrapGenerator (); BootstrapReplicate generate(const PartitionedMSA& parted_msa, unsigned long random_seed); - WeightVector generate(const MSA& msa, unsigned long random_seed); + FloatWeightVector generate(const MSA& msa, unsigned long random_seed); private: - WeightVector generate(const MSA& msa, RandomGenerator& gen); + FloatWeightVector generate(const MSA& msa, RandomGenerator& gen); }; #endif /* RAXML_BOOTSTRAP_BOOTSTRAPGENERATOR_HPP_ */ diff --git a/src/io/binary_io.cpp b/src/io/binary_io.cpp index b9a23457..b43afb6c 100644 --- a/src/io/binary_io.cpp +++ b/src/io/binary_io.cpp @@ -312,7 +312,7 @@ BasicBinaryStream& operator>>(BasicBinaryStream& stream, MSA& m) m = MSA(pat_count); - m.weights(stream.get()); + m.weights(stream.get()); std::string seq(pat_count, 0); for (size_t i = 0; i < taxa_count; ++i) @@ -355,7 +355,7 @@ BasicBinaryStream& operator>>(BasicBinaryStream& stream, MSARange mr) assert(local_len <= pat_count); assert(local_len <= weight_count); - WeightVector w(local_len); + FloatWeightVector w(local_len); read_vector_range(stream, &w[0], rl, pat_count); m.weights(std::move(w)); diff --git a/src/main.cpp b/src/main.cpp index cd2b5091..98544854 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -985,12 +985,12 @@ void load_msa_weights(MSA& msa, const Options& opts) if (!f) throw runtime_error("Unable to open site weights file: " + opts.weights_file); - WeightVector w; + FloatWeightVector w; w.reserve(msa.length()); - const auto maxw = std::numeric_limits::max(); + const auto maxw = std::numeric_limits::max(); int fres; - intmax_t x; - while ((fres = fscanf(f,"%jd", &x)) != EOF) + double x; + while ((fres = fscanf(f,"%lf", &x)) != EOF) { if (!fres) { @@ -1000,7 +1000,7 @@ void load_msa_weights(MSA& msa, const Options& opts) fclose(f); throw runtime_error("Invalid site weight entry found near: " + string(buf)); } - else if (x <= 0) + else if (x < 0) { fclose(f); throw runtime_error("Non-positive site weight found: " + to_string(x) + @@ -1013,7 +1013,7 @@ void load_msa_weights(MSA& msa, const Options& opts) " (max: " + to_string(maxw) + ")"); } else - w.push_back((WeightType) x); + w.push_back((FloatWeightType) x); } fclose(f); @@ -1587,7 +1587,7 @@ void balance_load(RaxmlInstance& instance) LOG_VERB << endl << instance.proc_part_assign; } -PartitionAssignmentList balance_load(RaxmlInstance& instance, WeightVectorList part_site_weights) +PartitionAssignmentList balance_load(RaxmlInstance& instance, FloatWeightVectorList part_site_weights) { /* This function is used to re-distribute sites across processes for each bootstrap replicate. * Since during bootstrapping alignment sites are sampled with replacement, some sites will be @@ -1598,7 +1598,7 @@ PartitionAssignmentList balance_load(RaxmlInstance& instance, WeightVectorList p PartitionAssignmentList assign_list; PartitionAssignment part_sizes; - WeightVectorList comp_pos_map(part_site_weights.size()); + FloatWeightVectorList comp_pos_map(part_site_weights.size()); /* init list of partition sizes */ size_t i = 0; diff --git a/src/types.hpp b/src/types.hpp index 47da9b98..5177f40a 100644 --- a/src/types.hpp +++ b/src/types.hpp @@ -141,6 +141,11 @@ typedef std::vector WeightVector; typedef std::vector WeightVectorList; typedef std::unordered_map WeightVectorMap; +typedef double FloatWeightType; +typedef std::vector FloatWeightVector; +typedef std::vector FloatWeightVectorList; +typedef std::unordered_map FloatVectorMap; + typedef std::default_random_engine RandomGenerator; /* diff --git a/test/src/PartitionedMSAViewTest.cpp b/test/src/PartitionedMSAViewTest.cpp index 66017312..59e16412 100644 --- a/test/src/PartitionedMSAViewTest.cpp +++ b/test/src/PartitionedMSAViewTest.cpp @@ -99,7 +99,7 @@ TEST(PartitionedMSAViewTest, view_dummy_patcomp) } void check_vmsa_part(PartitionedMSA& pmsa, PartitionedMSAView& vmsa, size_t p, - size_t num_ex, size_t weight_ex, WeightVector weights = WeightVector()) + size_t num_ex, size_t weight_ex, FloatWeightVector weights = FloatWeightVector()) { auto& msa = pmsa.part_info(p).msa(); size_t num_sites = weights.empty() ? msa.num_sites() : @@ -233,7 +233,7 @@ void view_weight_test_p1(bool compress) auto pmsa = part_msa_p1(compress); PartitionedMSAView vmsa(pmsa); - WeightVector w(pmsa.part_info(0).msa().length(), 1); + FloatWeightVector w(pmsa.part_info(0).msa().length(), 1); for (size_t s = 0; s < pmsa.part_info(0).msa().length(); ++s) w[s] = s % 5; @@ -257,14 +257,14 @@ void view_weight_test_p3(bool compress) auto pmsa = part_msa_p3(compress); PartitionedMSAView vmsa(pmsa); - WeightVectorList ww(3); + FloatWeightVectorList ww(3); auto total_sites = 0; for (size_t p = 0; p < vmsa.part_count(); ++p) { auto& w = ww[p]; - w = WeightVector(pmsa.part_info(p).msa().length(), 1); + w = FloatWeightVector(pmsa.part_info(p).msa().length(), 1); for (size_t s = 0; s < w.size(); ++s) w[s] = s % 5; From 04c1c0fcca8909e1e514493e57bca02d80c3cb04 Mon Sep 17 00:00:00 2001 From: computations Date: Thu, 28 Jan 2021 11:47:13 +0100 Subject: [PATCH 2/3] update pllmodules to use the forked branch --- .gitmodules | 3 ++- libs/pll-modules | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/.gitmodules b/.gitmodules index c0cbe2a9..7fca7679 100644 --- a/.gitmodules +++ b/.gitmodules @@ -1,6 +1,7 @@ [submodule "libs/pll-modules"] path = libs/pll-modules - url = https://github.com/ddarriba/pll-modules.git + url = https://github.com/computations/pll-modules.git + branch = floating_weights [submodule "libs/terraphast"] path = libs/terraphast url = https://github.com/amkozlov/terraphast-one diff --git a/libs/pll-modules b/libs/pll-modules index 83b49af8..edf8c36f 160000 --- a/libs/pll-modules +++ b/libs/pll-modules @@ -1 +1 @@ -Subproject commit 83b49af85e7e1ffaa081cd3926f5b6d7007742f3 +Subproject commit edf8c36f256ca7f164ca774e98f1204c8139c492 From 103fe4803be3d8514e0fdace4a04b36c8bdb5e6d Mon Sep 17 00:00:00 2001 From: computations Date: Mon, 1 Mar 2021 11:28:37 +0100 Subject: [PATCH 3/3] Fixes a bug when trimming zero weight sites We erroniously converted the site weights to ints, even though they could be floats. This means that any sites with a weight less than 1 was trimmed. This change simply updates the type in the lambda. --- src/TreeInfo.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/TreeInfo.cpp b/src/TreeInfo.cpp index 080e5947..dbc52a43 100644 --- a/src/TreeInfo.cpp +++ b/src/TreeInfo.cpp @@ -608,7 +608,7 @@ pll_partition_t* create_pll_partition(const Options& opts, const PartitionInfo& const size_t part_length = weights.empty() ? part_region.length : std::count_if(weights.begin() + pstart, weights.begin() + pstart + part_region.length, - [](uintVector::value_type w) -> bool + [](FloatWeightVector::value_type w) -> bool { return w > 0; } );