diff --git a/CMakeLists.txt b/CMakeLists.txt
index b31c85e..4c138f6 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -128,7 +128,7 @@ include( "${CMAKE_CURRENT_LIST_DIR}/tools/cmake/DownloadDependency.cmake" )
# These are replaced by tools/cmake/update_dependencies.sh to the hashes that are currently checked out.
# Thus, do not replace the hashes manually!
SET( CLI11_COMMIT_HASH "13becaddb657eacd090537719a669d66d393b8b2" ) #CLI11_COMMIT_HASH#
-SET( genesis_COMMIT_HASH "1bafc7b9b0ebfe07f366c818c524c3991490aa29" ) #genesis_COMMIT_HASH#
+SET( genesis_COMMIT_HASH "d0d574ed8026b2ca557a3b8458102cc142ca2558" ) #genesis_COMMIT_HASH#
SET( sparsepp_COMMIT_HASH "6bfe3b4bdb364993e612d6bb729d680cf4c77649" ) #sparsepp_COMMIT_HASH#
# Call the github download function, which takes four arguments:
diff --git a/doc/md/clean-tree.md b/doc/md/clean-tree.md
new file mode 100644
index 0000000..e551742
--- /dev/null
+++ b/doc/md/clean-tree.md
@@ -0,0 +1,15 @@
+## Description
+
+The command cleans a tree in Newick format (and some of its extensions) by removing parts that might lead some downstream parsers to fail.
+
+The Newick file format for phylogenetic trees in its original standard only supports node names (taxa names) and branch lengths. Over the years, many ad-hoc and custom extensions have been suggested and used in practice, to compensate for missing flexibility of the format.
+This, however, led to many downstream parsers not being able to work with all those dialects of the format; see
+
+> A Critical Review on the Use of Support Values in Tree Viewers and Bioinformatics Toolkits.
+> Czech L, Huerta-Cepas J, Stamatakis A.
+> Molecular Biology and Evolution, 34(6), 2017.
+> https://doi.org/10.1093/molbev/msx055
+
+for some of the issues that might arise.
+
+This command can be used to clean some of those difficult extensions/annotations, by simply removing them. It is meant as a cleaning tool for other software packages that cannot read a given Newick tree. When all options are activated, all types of extra data (that we know of) are removed, leading to a tree with just node names at the terminal (leaf) nodes, and branch lengths. Note that branch lengths might slightly change even if nothing is removed, due to numerical rounding.
diff --git a/libs/genesis b/libs/genesis
index 1bafc7b..d0d574e 160000
--- a/libs/genesis
+++ b/libs/genesis
@@ -1 +1 @@
-Subproject commit 1bafc7b9b0ebfe07f366c818c524c3991490aa29
+Subproject commit d0d574ed8026b2ca557a3b8458102cc142ca2558
diff --git a/src/commands/prepare.hpp b/src/commands/prepare.hpp
index c182570..a614cb3 100644
--- a/src/commands/prepare.hpp
+++ b/src/commands/prepare.hpp
@@ -3,7 +3,7 @@
/*
gappa - Genesis Applications for Phylogenetic Placement Analysis
- Copyright (C) 2017-2021 Lucas Czech
+ Copyright (C) 2017-2022 Lucas Czech
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
@@ -27,6 +27,7 @@
#include "CLI/CLI.hpp"
#include "commands/prepare/chunkify.hpp"
+#include "commands/prepare/clean_tree.hpp"
#include "commands/prepare/extract.hpp"
#include "commands/prepare/phat.hpp"
#include "commands/prepare/taxonomy_tree.hpp"
@@ -54,6 +55,7 @@ inline void setup_prepare( CLI::App& app )
// Add module subcommands.
setup_chunkify( *sub );
+ setup_clean_tree( *sub );
setup_extract( *sub );
setup_phat( *sub );
setup_taxonomy_tree( *sub );
diff --git a/src/commands/prepare/clean_tree.cpp b/src/commands/prepare/clean_tree.cpp
new file mode 100644
index 0000000..ee359b7
--- /dev/null
+++ b/src/commands/prepare/clean_tree.cpp
@@ -0,0 +1,259 @@
+/*
+ gappa - Genesis Applications for Phylogenetic Placement Analysis
+ Copyright (C) 2017-2022 Lucas Czech
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
+
+ Contact:
+ Lucas Czech
+ Department of Plant Biology, Carnegie Institution For Science
+ 260 Panama Street, Stanford, CA 94305, USA
+*/
+
+#include "commands/prepare/clean_tree.hpp"
+
+#include "options/global.hpp"
+#include "tools/cli_setup.hpp"
+
+#include "CLI/CLI.hpp"
+
+#include "genesis/tree/formats/newick/simple_reader.hpp"
+#include "genesis/tree/formats/newick/simple_tree.hpp"
+#include "genesis/tree/formats/newick/simple_writer.hpp"
+#include "genesis/tree/function/functions.hpp"
+
+#include "genesis/utils/core/fs.hpp"
+
+#include <cctype>
+#include <memory>
+#include <string>
+
+// =================================================================================================
+// Setup
+// =================================================================================================
+
+void setup_clean_tree( CLI::App& app )
+{
+    // Register the `clean-tree` subcommand on `app`; its options are kept in a shared object.
+    auto opt = std::make_shared<CleanTreeOptions>();
+    auto sub = app.add_subcommand(
+        "clean-tree",
+        "Clean a tree in Newick format by removing parts that other parsers have difficulties with."
+    );
+
+    // -----------------------------------------------------------
+    //     Input Data
+    // -----------------------------------------------------------
+
+    // Tree file: a required option whose value has to pass the CLI::ExistingFile check.
+    auto tree_file_opt = sub->add_option(
+        "--tree-file",
+        opt->tree_file,
+        "Tree file in Newick format."
+    );
+    tree_file_opt->check( CLI::ExistingFile );
+    tree_file_opt->group( "Input" );
+    tree_file_opt->required();
+
+    // -----------------------------------------------------------
+    //     Settings
+    // -----------------------------------------------------------
+
+    // Remove inner labels. Like all flags below, this only sets a bool in the options object.
+    sub->add_flag(
+        "--remove-inner-labels",
+        opt->remove_inner_labels,
+        "Some Newick trees contain inner node labels, which can confuse some parsers. "
+        "This option removes them."
+    )->group( "Settings" );
+
+    // Replace invalid chars. run_clean_tree() also replaces other whitespace and non-printables.
+    sub->add_flag(
+        "--replace-invalid-chars",
+        opt->replace_invalid_chars,
+        "Replace invalid characters in node labels (` ,:;\"()[]`) by underscores. "
+        "The Newick format requires node labels to be wrapped in double quotation marks "
+        "if they contain these characters, but many parsers cannot handle this. "
+        "For such cases, replacing the characters can help."
+    )->group( "Settings" );
+
+    // Remove comments and nhx, i.e., everything in square brackets.
+    sub->add_flag(
+        "--remove-comments-and-nhx",
+        opt->remove_comments_and_nhx,
+        "The Newick format allows for comments in square brackets `[]`, "
+        "which are also often (mis-)used for ad-hoc and more established extensions such as the "
+        "New Hampshire eXtended (NHX) format `[&&NHX:key=value:...]`. "
+        "Many parsers cannot handle this; this option removes such annotations."
+    )->group( "Settings" );
+
+    // Remove extra numbers, i.e., additional fields after the branch length.
+    sub->add_flag(
+        "--remove-extra-numbers",
+        opt->remove_extra_numbers,
+        "The Rich/Rice Newick format extension allows to annotate bootstrap values and probabilities "
+        "per branch, by adding additional `:[bootstrap]:[prob]` fields after the branch length. "
+        "Many parsers cannot handle this; this option removes such annotations."
+    )->group( "Settings" );
+
+    // Remove jplace tags, i.e., annotations in curly brackets.
+    sub->add_flag(
+        "--remove-jplace-tags",
+        opt->remove_jplace_tags,
+        "The Jplace file format for phylogenetic placements also uses a custom Newick extension, "
+        "by introducing curly brackets to annotate edge numbers in the tree `{1}`. "
+        "We are not aware of any other Newick extension that uses this style, "
+        "but still, with this option, all annotations in curly brackets is removed."
+    )->group( "Settings" );
+
+    // -----------------------------------------------------------
+    //     Output Options
+    // -----------------------------------------------------------
+
+    opt->file_output.add_default_output_opts_to_app( sub );
+
+    // -----------------------------------------------------------
+    //     Callback
+    // -----------------------------------------------------------
+
+    // Run run_clean_tree() with these options whenever the subcommand is issued.
+    // The lambda captures the shared ptr by copy, which keeps the options alive for the callback.
+    sub->callback( gappa_cli_callback(
+        sub,
+        {},
+        [ opt ]() {
+            run_clean_tree( *opt );
+        }
+    ));
+}
+
+// =================================================================================================
+// Run
+// =================================================================================================
+
+void run_clean_tree( CleanTreeOptions const& options )
+{
+    // Read the input tree, apply every cleaning step that was requested, and write the result.
+    using namespace ::genesis;
+    using namespace ::genesis::tree;
+    using namespace ::genesis::utils;
+    // Check if the output file name already exists. If so, fail early.
+    options.file_output.check_output_files_nonexistence( "clean-tree", "newick" );
+
+    // Read the input tree from the given tree file.
+    LOG_MSG1 << "Reading input tree.";
+    auto tree = SimpleNewickTreeNewickReader().read( from_file( options.tree_file ));
+    LOG_MSG1 << "Tree contains " << leaf_node_count( tree ) << " taxa (terminal branches).";
+
+    // We want to warn the user if no cleaning option was provided.
+    bool ran_one = false;
+
+    // Remove inner labels: clear the name of every inner node that has one.
+    if( options.remove_inner_labels ) {
+        size_t cnt = 0;
+        for( auto& node : tree.nodes() ) {
+            auto& data = node.data(); // NOTE(review): confirm data() needs no template argument.
+            if( is_inner( node ) && ! data.name.empty() ) {
+                data.name = "";
+                ++cnt;
+            }
+        }
+        LOG_MSG1 << "Removed " << cnt << " inner node labels.";
+        ran_one = true;
+    }
+
+    // Replace invalid chars: every char rejected by the predicate below becomes an underscore.
+    if( options.replace_invalid_chars ) {
+        auto is_valid_name_char = []( char c ){
+            // The <cctype> functions are undefined for negative values, hence the cast.
+            auto const uc = static_cast<unsigned char>( c );
+            return ::isprint( uc ) && ! ::isspace( uc )
+                && c != ':'
+                && c != ';'
+                && c != '('
+                && c != ')'
+                && c != '['
+                && c != ']'
+                && c != ','
+                && c != '"';
+        };
+        size_t cnt = 0;
+        for( auto& node : tree.nodes() ) {
+            auto& name = node.data().name;
+            bool valid_name = true;
+            for( auto& c : name ) {
+                if( ! is_valid_name_char( c )) {
+                    valid_name = false;
+                    c = '_';
+                }
+            }
+            if( !valid_name ) {
+                ++cnt;
+            }
+        }
+        LOG_MSG1 << "Replaced invalid characters in " << cnt << " node labels.";
+        ran_one = true;
+    }
+
+    // Remove comments and nhx: clear the comments of every node, counting nodes that had some.
+    if( options.remove_comments_and_nhx ) {
+        size_t cnt = 0;
+        for( auto& node : tree.nodes() ) {
+            auto& data = node.data();
+            if( data.comments.size() > 0 ) {
+                ++cnt;
+            }
+            data.comments.clear();
+        }
+        LOG_MSG1 << "Removed comments (such as NHX information) from " << cnt << " nodes.";
+        ran_one = true;
+    }
+
+    // Remove extra numbers: clear the values of every edge, counting edges that had some.
+    if( options.remove_extra_numbers ) {
+        size_t cnt = 0;
+        for( auto& edge : tree.edges() ) {
+            auto& data = edge.data();
+            if( data.values.size() > 0 ) {
+                ++cnt;
+            }
+            data.values.clear();
+        }
+        LOG_MSG1 << "Removed extra branch numbers on " << cnt << " branches.";
+        ran_one = true;
+    }
+
+    // Remove jplace tags: clear the tags of every edge, counting edges that had some.
+    if( options.remove_jplace_tags ) {
+        size_t cnt = 0;
+        for( auto& edge : tree.edges() ) {
+            auto& data = edge.data();
+            if( data.tags.size() > 0 ) {
+                ++cnt;
+            }
+            data.tags.clear();
+        }
+        LOG_MSG1 << "Removed (jplace) tags on " << cnt << " branches.";
+        ran_one = true;
+    }
+
+    if( ! ran_one ) {
+        LOG_WARN << "No cleaning option was provided. Tree will be written as-is.";
+    }
+
+    // Write the tree to the output target, whether or not any cleaning step ran.
+    LOG_MSG1 << "Writing output tree.";
+    auto nw = SimpleNewickTreeNewickWriter();
+    nw.write( tree, options.file_output.get_output_target( "clean-tree", "newick" ));
+}
diff --git a/src/commands/prepare/clean_tree.hpp b/src/commands/prepare/clean_tree.hpp
new file mode 100644
index 0000000..c6298c6
--- /dev/null
+++ b/src/commands/prepare/clean_tree.hpp
@@ -0,0 +1,65 @@
+#ifndef GAPPA_COMMANDS_PREPARE_CLEAN_TREE_H_
+#define GAPPA_COMMANDS_PREPARE_CLEAN_TREE_H_
+
+/*
+ gappa - Genesis Applications for Phylogenetic Placement Analysis
+ Copyright (C) 2017-2022 Lucas Czech
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
+
+ Contact:
+ Lucas Czech
+ Department of Plant Biology, Carnegie Institution For Science
+ 260 Panama Street, Stanford, CA 94305, USA
+*/
+
+#include "CLI/CLI.hpp"
+
+#include "options/file_input.hpp"
+#include "options/file_output.hpp"
+#include "options/tree_output.hpp"
+
+#include
+#include
+
+// =================================================================================================
+// Options
+// =================================================================================================
+
+class CleanTreeOptions
+{
+public:
+
+    // Path of the Newick tree file to read, set via `--tree-file`.
+    std::string tree_file;
+
+    // Cleaning steps. Each one stays off unless its command line flag is given.
+    bool remove_inner_labels{ false };
+    bool replace_invalid_chars{ false };
+    bool remove_comments_and_nhx{ false };
+    bool remove_extra_numbers{ false };
+    bool remove_jplace_tags{ false };
+
+    // Options for the output file that the resulting tree is written to.
+    FileOutputOptions file_output;
+};
+
+// =================================================================================================
+// Functions
+// =================================================================================================
+
+void setup_clean_tree( CLI::App& app );
+void run_clean_tree( CleanTreeOptions const& options );
+
+#endif // include guard