diff --git a/README.md b/README.md index 2ede660e..094f3c7f 100644 --- a/README.md +++ b/README.md @@ -8,11 +8,15 @@ Requirements For Ubuntu 18.04 LTS the following commands installs the requirements: -`sudo apt install --no-install-recommends git cmake make g++ mpi-default-dev libprotobuf-dev libboost-dev libboost-program-options-dev libboost-filesystem-dev libboost-iostreams-dev libboost-date-time-dev protobuf-compiler automake autoconf libtool nasm` +```shell +sudo apt install --no-install-recommends git cmake make g++ mpi-default-dev libprotobuf-dev libboost-dev libboost-program-options-dev libboost-filesystem-dev libboost-iostreams-dev libboost-date-time-dev protobuf-compiler automake autoconf libtool nasm +``` To get a recent cmake, download from `https://cmake.org/download/`, for example: -`wget https://github.com/Kitware/CMake/releases/download/v3.23.1/cmake-3.23.1-linux-x86_64.tar.gz` +```shell +wget https://github.com/Kitware/CMake/releases/download/v3.23.1/cmake-3.23.1-linux-x86_64.tar.gz +``` Build Environments ------------------ @@ -24,30 +28,46 @@ Larch can be built utilizing a Singularity container or a Conda environment. To build Singularity image, use the definition provided: -`singularity build larch-singularity.sif larch-singularity.def` -`singularity shell larch-singularity.sif --net` +```shell +singularity build larch-singularity.sif larch-singularity.def +singularity shell larch-singularity.sif --net +``` To setup a conda environment capable of building Larch, use: -`conda create -n larch` -`conda activate larch` -`conda install --channel "conda-forge" --update-deps --override-channels cmake make cxx-compiler openmpi openmpi-mpicc openmpi-mpicxx boost-cpp automake autoconf libtool yasm ucx zlib` +```shell +conda create -n larch +conda activate larch +conda install --channel "conda-forge" --update-deps --override-channels cmake make cxx-compiler openmpi openmpi-mpicc openmpi-mpicxx boost-cpp automake autoconf libtool yasm ucx zlib +``` To setup a conda environment capable of building Larch including development tools, create `larch-dev` using the environment file provided: -`conda env create -f environment.yml` +```shell +conda env create -f environment.yml +``` Building -------- -- Note: If you run against memory limitations during the cmake step, you can regulate number of parallel threads with `export CMAKE_NUM_THREADS="8"` (reduce number as necessary). +There are 4 executables that are built automatically as part of the larch package and provide various methods for exploring tree space and manipulating DAGs/trees: +- `larch-test` is the suite of tests used to validate the various routines. +- `larch-usher` takes an input tree/DAG and explores tree space through SPR moves. +- `merge` utility is used to manipulate(e.g. combine, prune)DAGs/trees. +- `dag2dot` utility writes a provided protobuf file in dot format for easy viewing. -`git submodule update --init --recursive` -`mkdir build` -`cd build` -`cmake -DCMAKE_BUILD_TYPE=Debug ..` -`make -j16` +Note: If you run against memory limitations during the cmake step, you can regulate number of parallel threads with `export CMAKE_NUM_THREADS="8"` (reduce number as necessary). + +To build all from `larch/` directory, run: + +```shell +git submodule update --init --recursive +mkdir build +cd build +cmake -DCMAKE_BUILD_TYPE=Debug .. +make -j16 +``` Cmake build options: - add `-DCMAKE_CXX_CLANG_TIDY="clang-tidy"` to enable clang-tidy. @@ -56,10 +76,13 @@ Cmake build options: Running ------- -From the build directory: +### larch-test -`ln -s ../data` -`./larch-test` +From the `larch/build/` directory: +```shell +ln -s ../data +./larch-test +``` Passing *nocatch* to the tests executable will allow exceptions to escape, which is useful for debugging. A gdb session can be started with `gdb --args build/larch-test nocatch`. @@ -72,6 +95,65 @@ larch-test options: - `+tag` includes tests with a given tag. - For example, the `-tag "slow"` removes tests which require an long runtime to complete. +### larch-usher + +From the `larch/build/` directory: +```shell +./larch-usher -i ../data/testcase/tree_1.pb.gz -o output_dag.pb -c 10 +``` +This command runs 10 iterations of larch-usher on the provided tree, and writes the final result to the file `output_dag.pb` + +larch-usher options: +- `-i,--input` [REQUIRED] The name of the input tree/DAG (accepted file formats are: MADAG protobuf, MAT protobuf, JSON). +- `-o,--output` [REQUIRED] The file path to write the resulting DAG to. +- `-c,--count` [Default: 1] Number of larch-usher iterations to run. +- `-r,--MAT-refseq-file` [REQUIRED if provided input file is a MAT protobuf] Reference sequence file. +- `-v,--VCF-input-file` VCF file containing ambiguous sequence data. +- `-l,--logpath` [Default: `optimization_log`] Filepath to write log to. +- `-s,--switch-subtrees` [Default: never] Switch to optimizing subtrees after the specified number of iterations. +- `--min-subtree-clade-size` [Default: 100] The minimum number of leaves in a subtree sampled for optimization (ignored without option `-s`). +- `--max-subtree-clade-size` [Default: 1000] The maximum number of leaves in a subtree sampled for optimization (ignored without option `-s`). +- `--move-coeff-nodes` [Default: 1] New node coefficient for scoring moves. Set to 0 to apply only parsimony-optimal SPR moves. +- `--move-coeff-pscore` [Default: 1] Parsimony score coefficient for scoring moves. Set to 0 to apply only topologically novel SPR moves. +- `--sample-method` [Default: `parsimony`] Select method for sampling optimization tree from the DAG. Options are: (`parsimony`, `random`, `rf-minsum`, `rf-maxsum`). +- `--sample-uniformly` [Default: use natural distribution] Use a uniform distribution to sample trees for optimization. +- For example, if the sampling method is `parsimony` and `--sample-uniformly` is provided, then a uniform distribution on parsimony-optimal trees is sampled from. +- `--callback-option` [Default: `best-moves`] Specify which SPR moves are chosen and applied. Options are: (`all-moves`, `best-moves-fixed-tree`, `best-moves-treebased`, `best-moves`). +- `--trim` [Default: do not trim] Trim optimized dag to contain only parsimony-optimal trees before writing to protobuf. +- `--keep-fragment-uncollapsed` [Default: collapse] Do not collapse empty (non-mutation-bearing) edges in the optimization tree. +- `--quiet` [Default: write intermediate files] Do not write intermediate protobuf file at each iteration. + +### merge + +From the `larch/build/` directory: +```shell +./merge -i ../data/testcase/tree1.pb.gz -i ../data/testcase/tree2.pb.gz -d -o merged_trees.pb +``` +This executable takes a list of protobuf files and merges the resulting DAGs together into one. + +merge options: +- `-i,--input` Input protobuf files. +- `-o,--output` [Default: `merged.pb`] Save the output to filename. +- `-r,--refseq` [REQUIRED if input protobufs are MAT protobuf format] Read reference sequence from file. +- `-d,--dag` Input files are MADAG protobuf format\n"; +- `-t,--trim` Trim output (default trimming method is trim to best parsimony). +- `--rf` Trim output to minimize RF distance to the provided protobuf(Ignored if `-t` flag is not provided). +- `-s,--sample` Write a sampled single tree from DAG to file, rather than the whole DAG. + +### dag2dot + +From the `larch/build/` directory: +```shell +./dag2dot -d ../data/testcase/full_dag.pb +``` +This command writes the provided DAG in dot format to stdout. + +dag2dot options: +- `-t,--tree-pb` Input MAT protobuf filename. +- `-d,--dag-pb` Input DAG protobuf filename. +- `-j,--dag-json` Input DAG json filename. + + Third-party ----------- diff --git a/include/larch/impl/subtree/subtree_weight_impl.hpp b/include/larch/impl/subtree/subtree_weight_impl.hpp index bfad00fb..5d1aa01d 100644 --- a/include/larch/impl/subtree/subtree_weight_impl.hpp +++ b/include/larch/impl/subtree/subtree_weight_impl.hpp @@ -88,6 +88,7 @@ SubtreeWeight::TrimToMinWeight(const WeightOps& weight_ops) { }, mapped_node, result.View()); + result.View().BuildConnections(); return result; } diff --git a/tools/dag2dot.cpp b/tools/dag2dot.cpp index 8ffa28e7..95d4f00e 100644 --- a/tools/dag2dot.cpp +++ b/tools/dag2dot.cpp @@ -7,7 +7,6 @@ [[noreturn]] static void Usage() { std::cout << "Usage:\n"; std::cout << "dag2dot -t,--tree-pb file\n"; - std::cout << "dag2dot [-c,--cgs] -d,--dag-pb file\n"; std::cout << "dag2dot -j,--dag-json file\n"; std::cout << " -t,--tree-pb Input protobuf tree filename\n"; std::cout << " -d,--dag-pb Input protobuf DAG filename\n"; diff --git a/tools/larch-usher.cpp b/tools/larch-usher.cpp index b786ceb1..6c11849c 100644 --- a/tools/larch-usher.cpp +++ b/tools/larch-usher.cpp @@ -34,7 +34,7 @@ std::cout << " -i,--input Path to input DAG\n"; std::cout << " -r,--MAT-refseq-file Provide a path to a file containing a " "reference sequence\nif input points to MAT protobuf\n"; - std::cout << " -v,--vcf-file Provide a path to a vcf file containing " + std::cout << " -v,--VCF-input-file Provide a path to a vcf file containing " "ambiguous leaf sequence data\n"; std::cout << " -o,--output Path to output DAG\n"; std::cout << " -l,--logpath Path for logging\n"; @@ -721,7 +721,7 @@ int main(int argc, char** argv) { // NOLINT(bugprone-exception-escape) }; auto logger = [&merge, &logfile, &time_elapsed, - &write_intermediate_pb](size_t iteration) { + &write_intermediate_pb, &output_dag_path](size_t iteration) { SubtreeWeight parsimonyscorer{merge.GetResult()}; SubtreeWeight maxparsimonyscorer{ merge.GetResult()}; @@ -766,8 +766,7 @@ int main(int argc, char** argv) { // NOLINT(bugprone-exception-escape) << '\t' << max_rf_distance << '\t' << min_rf_count << '\t' << max_rf_count << '\t' << time_elapsed() << std::flush; if (write_intermediate_pb) { - std::string intermediate_dag_path = "intermediate_MADAG_untrimmed.pb"; - StoreDAGToProtobuf(merge.GetResult(), intermediate_dag_path); + StoreDAGToProtobuf(merge.GetResult(), output_dag_path+"_intermediate"); } }; logger(0); @@ -870,10 +869,10 @@ int main(int argc, char** argv) { // NOLINT(bugprone-exception-escape) return AddMATConversion(weight.MinWeightUniformSampleTree({}, subtree_node)); case SampleMethod::MinSumRFDistance: return AddMATConversion( - min_sum_rf_dist.SampleTree(min_rf_weight_ops, subtree_node)); + min_sum_rf_dist.MinWeightSampleTree(min_rf_weight_ops, subtree_node)); case SampleMethod::MaxSumRFDistance: return AddMATConversion( - max_sum_rf_dist.SampleTree(max_rf_weight_ops, subtree_node)); + max_sum_rf_dist.MinWeightSampleTree(max_rf_weight_ops, subtree_node)); default: std::cerr << "ERROR: Invalid SampleMethod" << std::endl; std::exit(EXIT_FAILURE); @@ -921,6 +920,7 @@ int main(int argc, char** argv) { // NOLINT(bugprone-exception-escape) optimized_view.RecomputeCompactGenomes(false); merge.AddDAG(optimized_view); logger(i + 1); + StoreDAGToProtobuf(merge.GetResult(), output_dag_path+"_intermediate"); } std::cout << "new node coefficient: " << move_coeff_nodes << "\n"; diff --git a/tools/merge.cpp b/tools/merge.cpp index fc2a7958..284158d4 100644 --- a/tools/merge.cpp +++ b/tools/merge.cpp @@ -6,6 +6,7 @@ #include "larch/subtree/subtree_weight.hpp" #include "larch/subtree/parsimony_score_binary.hpp" #include "arguments.hpp" +#include "larch/rf_distance.hpp" #include "larch/merge/merge.hpp" #include "larch/dag_loader.hpp" #include "larch/benchmark.hpp" @@ -18,7 +19,8 @@ std::cout << " -o,--output Save the output to filename (default is merged.pb)\n"; std::cout << " -r,--refseq Read reference sequence from Json file\n"; std::cout << " -d,--dag Input files are DAGs\n"; - std::cout << " -t,--trim Trim output to best parsimony\n"; + std::cout << " -t,--trim Trim output(default to best parsimony)\n"; + std::cout << " --rf Trim output to minimize RF distance to provided protobuf\n"; std::cout << " -s,--sample Sample a single tree from DAG\n"; std::exit(EXIT_SUCCESS); @@ -32,7 +34,7 @@ static int MergeTrees(const std::vector& paths, std::string_view refseq_json_path, std::string_view out_path, - bool dags, bool trim, bool sample_tree) { + bool dags, bool trim, bool sample_tree, std::string rf_file_path) { std::vector> trees; std::string reference_sequence = ""; if (not refseq_json_path.empty()) { @@ -68,17 +70,31 @@ static int MergeTrees(const std::vector& paths, std::cout << "DAG nodes(without trimming): " << merge.GetResult().GetNodesCount() << "\n"; std::cout << "DAG edges(without trimming): " << merge.GetResult().GetEdgesCount() << "\n"; + merge.ComputeResultEdgeMutations(); if (trim) { - merge.ComputeResultEdgeMutations(); - SubtreeWeight weight{merge.GetResult()}; - if (sample_tree) { - std::cout << "sampling a tree from the minweight options\n"; - StoreDAGToProtobuf(weight.MinWeightSampleTree({}).View(), out_path); + if (rf_file_path == "none") { + SubtreeWeight weight{merge.GetResult()}; + if (sample_tree) { + std::cout << "sampling a tree from the minweight options\n"; + StoreDAGToProtobuf(weight.MinWeightSampleTree({}).View(), out_path); + } else { + StoreDAGToProtobuf(weight.TrimToMinWeight({}).View(), out_path); + } } else { - StoreDAGToProtobuf(weight.TrimToMinWeight({}).View(), out_path); + auto tree = dags ? LoadDAGFromProtobuf(rf_file_path) : LoadTreeFromProtobuf(rf_file_path, reference_sequence); + Merge comparetree(tree.View().GetReferenceSequence()); + comparetree.AddDAG(tree.View()); + comparetree.ComputeResultEdgeMutations(); + SubtreeWeight min_sum_rf_dist{merge.GetResult()}; + SumRFDistance min_rf_weight_ops{comparetree, merge}; + if (sample_tree) { + std::cout << "sampling a tree from the minweight options\n"; + StoreDAGToProtobuf(min_sum_rf_dist.MinWeightSampleTree(min_rf_weight_ops, {}).View(), out_path); + } else { + StoreDAGToProtobuf(min_sum_rf_dist.TrimToMinWeight(min_rf_weight_ops).View(), out_path); + } } } else { - merge.ComputeResultEdgeMutations(); if (sample_tree) { std::cout << "sampling a tree from the merge DAG\n"; SubtreeWeight weight{merge.GetResult()}; @@ -87,8 +103,6 @@ static int MergeTrees(const std::vector& paths, StoreDAGToProtobuf(merge.GetResult(), out_path); } } - - return EXIT_SUCCESS; } @@ -98,6 +112,7 @@ int main(int argc, char** argv) try { std::vector input_filenames; std::string result_filename = "merged.pb"; std::string refseq_filename; + std::string rf_filename = "none"; bool dags = false; bool trim = false; bool sample_tree = false; @@ -123,6 +138,12 @@ int main(int argc, char** argv) try { dags = true; } else if (name == "-t" or name == "--trim") { trim = true; + } else if (name == "--rf") { + if (params.empty()) { + std::cerr << "Specify rf-trim protobuf file name.\n"; + Fail(); + } + rf_filename = *params.begin(); } else if (name == "-s" or name == "--sample") { sample_tree = true; } @@ -136,7 +157,7 @@ int main(int argc, char** argv) try { std::cout << pth << " to be merged\n"; } - return MergeTrees(input_filenames, refseq_filename, result_filename, dags, trim, sample_tree); + return MergeTrees(input_filenames, refseq_filename, result_filename, dags, trim, sample_tree, rf_filename); } catch (std::exception& e) { std::cerr << "Uncaught exception: " << e.what() << std::endl; std::terminate();