diff --git a/CMakeLists.txt b/CMakeLists.txt index b31c85e..4c138f6 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -128,7 +128,7 @@ include( "${CMAKE_CURRENT_LIST_DIR}/tools/cmake/DownloadDependency.cmake" ) # These are replaced by tools/cmake/update_dependencies.sh to the hashes that are currently checked out. # Thus, do not replace the hashes manually! SET( CLI11_COMMIT_HASH "13becaddb657eacd090537719a669d66d393b8b2" ) #CLI11_COMMIT_HASH# -SET( genesis_COMMIT_HASH "1bafc7b9b0ebfe07f366c818c524c3991490aa29" ) #genesis_COMMIT_HASH# +SET( genesis_COMMIT_HASH "d0d574ed8026b2ca557a3b8458102cc142ca2558" ) #genesis_COMMIT_HASH# SET( sparsepp_COMMIT_HASH "6bfe3b4bdb364993e612d6bb729d680cf4c77649" ) #sparsepp_COMMIT_HASH# # Call the github download function, which takes four arguments: diff --git a/doc/md/clean-tree.md b/doc/md/clean-tree.md new file mode 100644 index 0000000..e551742 --- /dev/null +++ b/doc/md/clean-tree.md @@ -0,0 +1,15 @@ +## Description + +The command cleans a tree in Newick format (and some of its extensions) by removing parts that might lead some downstream parsers to fail. + +The Newick file format for phylogenetic trees in its original standard only supports node names (taxa names) and branch lengths. Over the years, many ad-hoc and custom extensions have been suggested and used in practice, to compensate for missing flexibility of the format. +This however led to many downstream parsers not being able to work with all those dialects of the format, see + +> A Critical Review on the Use of Support Values in Tree Viewers and Bioinformatics Toolkits.
+> Czech L, Huerta-Cepas J, Stamatakis A.
+> Molecular Biology and Evolution, 34(6), 2017.
+> https://doi.org/10.1093/molbev/msx055 + +for some of the issues that might arise. + +This command can be used to clean some of those difficult extensions/annotations, by simply removing them. It is meant as a cleaning tool for other software packages that cannot read a given Newick tree. When all options are activated, all types of extra data (that we know of) are removed, leading to a tree with just node names at the terminal (leaf) nodes, and branch lengths. Note that branch lengths might slightly change even if nothing is removed, due to numerical rounding. diff --git a/libs/genesis b/libs/genesis index 1bafc7b..d0d574e 160000 --- a/libs/genesis +++ b/libs/genesis @@ -1 +1 @@ -Subproject commit 1bafc7b9b0ebfe07f366c818c524c3991490aa29 +Subproject commit d0d574ed8026b2ca557a3b8458102cc142ca2558 diff --git a/src/commands/prepare.hpp b/src/commands/prepare.hpp index c182570..a614cb3 100644 --- a/src/commands/prepare.hpp +++ b/src/commands/prepare.hpp @@ -3,7 +3,7 @@ /* gappa - Genesis Applications for Phylogenetic Placement Analysis - Copyright (C) 2017-2021 Lucas Czech + Copyright (C) 2017-2022 Lucas Czech This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -27,6 +27,7 @@ #include "CLI/CLI.hpp" #include "commands/prepare/chunkify.hpp" +#include "commands/prepare/clean_tree.hpp" #include "commands/prepare/extract.hpp" #include "commands/prepare/phat.hpp" #include "commands/prepare/taxonomy_tree.hpp" @@ -54,6 +55,7 @@ inline void setup_prepare( CLI::App& app ) // Add module subcommands. 
setup_chunkify( *sub ); + setup_clean_tree( *sub ); setup_extract( *sub ); setup_phat( *sub ); setup_taxonomy_tree( *sub ); diff --git a/src/commands/prepare/clean_tree.cpp b/src/commands/prepare/clean_tree.cpp new file mode 100644 index 0000000..ee359b7 --- /dev/null +++ b/src/commands/prepare/clean_tree.cpp @@ -0,0 +1,259 @@ +/* + gappa - Genesis Applications for Phylogenetic Placement Analysis + Copyright (C) 2017-2022 Lucas Czech + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . + + Contact: + Lucas Czech + Department of Plant Biology, Carnegie Institution For Science + 260 Panama Street, Stanford, CA 94305, USA +*/ + +#include "commands/prepare/clean_tree.hpp" + +#include "options/global.hpp" +#include "tools/cli_setup.hpp" + +#include "CLI/CLI.hpp" + +#include "genesis/tree/formats/newick/simple_reader.hpp" +#include "genesis/tree/formats/newick/simple_tree.hpp" +#include "genesis/tree/formats/newick/simple_writer.hpp" +#include "genesis/tree/function/functions.hpp" + +#include "genesis/utils/core/fs.hpp" + +#include +#include +#include + +// ================================================================================================= +// Setup +// ================================================================================================= + +void setup_clean_tree( CLI::App& app ) +{ + // Create the options and subcommand objects. 
+ auto opt = std::make_shared<CleanTreeOptions>(); + auto sub = app.add_subcommand( + "clean-tree", + "Clean a tree in Newick format by removing parts that other parsers have difficulties with." + ); + + // ----------------------------------------------------------- + // Input Data + // ----------------------------------------------------------- + + // Tree file + auto tree_file_opt = sub->add_option( + "--tree-file", + opt->tree_file, + "Tree file in Newick format." + ); + tree_file_opt->check( CLI::ExistingFile ); + tree_file_opt->group( "Input" ); + tree_file_opt->required(); + + // ----------------------------------------------------------- + // Settings + // ----------------------------------------------------------- + + // remove inner labels + sub->add_flag( + "--remove-inner-labels", + opt->remove_inner_labels, + "Some Newick trees contain inner node labels, which can confuse some parsers. " + "This option removes them." + )->group( "Settings" ); + + // replace invalid chars + sub->add_flag( + "--replace-invalid-chars", + opt->replace_invalid_chars, + "Replace invalid characters in node labels (` ,:;\"()[]`) by underscores. " + "The Newick format requires node labels to be wrapped in double quotation marks " + "if they contain these characters, but many parsers cannot handle this. " + "For such cases, replacing the characters can help." + )->group( "Settings" ); + + // remove comments and nhx + sub->add_flag( + "--remove-comments-and-nhx", + opt->remove_comments_and_nhx, + "The Newick format allows for comments in square brackets `[]`, " + "which are also often (mis-)used for ad-hoc and more established extensions such as the " + "New Hampshire eXtended (NHX) format `[&&NHX:key=value:...]`. " + "Many parsers cannot handle this; this option removes such annotations."
+ )->group( "Settings" ); + + // remove extra numbers + sub->add_flag( + "--remove-extra-numbers", + opt->remove_extra_numbers, + "The Rich/Rice Newick format extension allows to annotate bootstrap values and probabilities " + "per branch, by adding additional `:[bootstrap]:[prob]` fields after the branch length. " + "Many parsers cannot handle this; this option removes such annotations." + )->group( "Settings" ); + + // remove jplace tags + sub->add_flag( + "--remove-jplace-tags", + opt->remove_jplace_tags, + "The Jplace file format for phylogenetic placements also uses a custom Newick extension, " + "by introducing curly brackets to annotate edge numbers in the tree `{1}`. " + "We are not aware of any other Newick extension that uses this style, " + "but still, with this option, all annotations in curly brackets is removed." + )->group( "Settings" ); + + // ----------------------------------------------------------- + // Output Options + // ----------------------------------------------------------- + + opt->file_output.add_default_output_opts_to_app( sub ); + + // ----------------------------------------------------------- + // Callback + // ----------------------------------------------------------- + + // Set the run function as callback to be called when this subcommand is issued. + // Hand over the options by copy, so that their shared ptr stays alive in the lambda. + sub->callback( gappa_cli_callback( + sub, + {}, + [ opt ]() { + run_clean_tree( *opt ); + } + )); +} + +// ================================================================================================= +// Run +// ================================================================================================= + +void run_clean_tree( CleanTreeOptions const& options ) +{ + using namespace ::genesis; + using namespace ::genesis::tree; + using namespace ::genesis::utils; + + // Check if the output file name already exists. If so, fail early. 
+ options.file_output.check_output_files_nonexistence( "clean-tree", "newick" ); + + // If clean is given, read it. + LOG_MSG1 << "Reading input tree."; + auto tree = SimpleNewickTreeNewickReader().read( from_file( options.tree_file )); + LOG_MSG1 << "Tree contains " << leaf_node_count( tree ) << " taxa (terminal branches)."; + + // We want to warn the user if no cleaning option was provided. + bool ran_one = false; + + // Remove inner labels + if( options.remove_inner_labels ) { + size_t cnt = 0; + for( auto& node : tree.nodes() ) { + auto& data = node.data(); + if( is_inner( node ) && ! data.name.empty() ) { + data.name = ""; + ++cnt; + } + } + LOG_MSG1 << "Removed " << cnt << " inner node labels."; + ran_one = true; + } + + // Replace invalid chars + if( options.replace_invalid_chars ) { + auto is_valid_name_char = [&]( char c ){ + return ::isprint(c) + && ! ::isspace(c) + && c != ':' + && c != ';' + && c != '(' + && c != ')' + && c != '[' + && c != ']' + && c != ',' + && c != '"' + ; + }; + size_t cnt = 0; + for( auto& node : tree.nodes() ) { + auto& name = node.data().name; + bool valid_name = true; + for( size_t i = 0; i < name.size(); ++i ) { + if( ! 
is_valid_name_char( name[i] ) ) { + valid_name = false; + name[i] = '_'; + } + } + if( !valid_name ) { + ++cnt; + } + } + LOG_MSG1 << "Replaced invalid characters in " << cnt << " node labels."; + ran_one = true; + } + + // Remove comments and nhx + if( options.remove_comments_and_nhx ) { + size_t cnt = 0; + for( auto& node : tree.nodes() ) { + auto& data = node.data(); + if( data.comments.size() > 0 ) { + ++cnt; + } + data.comments.clear(); + } + LOG_MSG1 << "Removed comments (such as NHX information) from " << cnt << " nodes."; + ran_one = true; + } + + // Remove extra numbers + if( options.remove_extra_numbers ) { + size_t cnt = 0; + for( auto& edge : tree.edges() ) { + auto& data = edge.data(); + if( data.values.size() > 0 ) { + ++cnt; + } + data.values.clear(); + } + LOG_MSG1 << "Removed extra branch numbers on " << cnt << " branches."; + ran_one = true; + } + + // Remove jplace tags + if( options.remove_jplace_tags ) { + size_t cnt = 0; + for( auto& edge : tree.edges() ) { + auto& data = edge.data(); + if( data.tags.size() > 0 ) { + ++cnt; + } + data.tags.clear(); + } + LOG_MSG1 << "Removed (jplace) tags on " << cnt << " branches."; + ran_one = true; + } + + if( ! ran_one ) { + LOG_WARN << "No cleaning option was provided. Tree will be written as-is."; + } + + // Create a newick tree from it. 
+ LOG_MSG1 << "Writing output tree."; + auto nw = SimpleNewickTreeNewickWriter(); + nw.write( tree, options.file_output.get_output_target( "clean-tree", "newick" )); +} diff --git a/src/commands/prepare/clean_tree.hpp b/src/commands/prepare/clean_tree.hpp new file mode 100644 index 0000000..c6298c6 --- /dev/null +++ b/src/commands/prepare/clean_tree.hpp @@ -0,0 +1,65 @@ +#ifndef GAPPA_COMMANDS_PREPARE_CLEAN_TREE_H_ +#define GAPPA_COMMANDS_PREPARE_CLEAN_TREE_H_ + +/* + gappa - Genesis Applications for Phylogenetic Placement Analysis + Copyright (C) 2017-2022 Lucas Czech + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . + + Contact: + Lucas Czech + Department of Plant Biology, Carnegie Institution For Science + 260 Panama Street, Stanford, CA 94305, USA +*/ + +#include "CLI/CLI.hpp" + +#include "options/file_input.hpp" +#include "options/file_output.hpp" +#include "options/tree_output.hpp" + +#include +#include + +// ================================================================================================= +// Options +// ================================================================================================= + +class CleanTreeOptions +{ +public: + + // Input data. + std::string tree_file; + + // Settings. 
+ bool remove_inner_labels = false; + bool replace_invalid_chars = false; + bool remove_comments_and_nhx = false; + bool remove_extra_numbers = false; + bool remove_jplace_tags = false; + + // Output options. + FileOutputOptions file_output; +}; + +// ================================================================================================= +// Functions +// ================================================================================================= + +void setup_clean_tree( CLI::App& app ); +void run_clean_tree( CleanTreeOptions const& options ); + +#endif // include guard