From a3b57fadfe09e4f348e4f892961e8b56b505b0b9 Mon Sep 17 00:00:00 2001 From: Glenn Hickey Date: Thu, 19 Dec 2024 10:04:08 -0500 Subject: [PATCH] add --snarl-outlier option --- src/subcommand/stats_main.cpp | 102 +++++++++++++++++++++++++++------- 1 file changed, 83 insertions(+), 19 deletions(-) diff --git a/src/subcommand/stats_main.cpp b/src/subcommand/stats_main.cpp index 79145800a7..88daaff337 100644 --- a/src/subcommand/stats_main.cpp +++ b/src/subcommand/stats_main.cpp @@ -34,6 +34,7 @@ #include "../io/save_handle_graph.hpp" #include "../gbzgraph.hpp" #include "../traversal_finder.hpp" +#include "../traversal_clusters.hpp" using namespace std; using namespace vg; @@ -66,7 +67,8 @@ void help_stats(char** argv) { << " -O, --overlap-all print overlap table for the cartesian product of paths" << endl << " -R, --snarls print statistics for each snarl" << endl << " --snarl-contents print out a table of " << endl - << " --snarl-sample print out reference coordinates on given sample" << endl + << " --snarl-sample S print out reference coordinates on given sample S" << endl + << " --snarl-outlier print most diverged path from reference (--snarl-sample required)" << endl << " -C, --chains print statistics for each chain" << endl << " -F, --format graph format from {VG-Protobuf, PackedGraph, HashGraph, XG}. " << "Can't detect Protobuf if graph read from stdin" << endl @@ -109,6 +111,7 @@ int main_stats(int argc, char** argv) { bool chain_stats = false; bool snarl_contents = false; string snarl_sample; + bool snarl_outlier = false; bool format = false; bool degree_dist = false; string distance_index_filename; @@ -116,6 +119,7 @@ int main_stats(int argc, char** argv) { // Long options with no corresponding short options. constexpr int OPT_SNARL_CONTENTS = 1000; constexpr int OPT_SNARL_SAMPLE = 1001; + constexpr int OPT_SNARL_OUTLIER = 1002; constexpr int64_t snarl_search_context = 100; int c; @@ -146,6 +150,7 @@ int main_stats(int argc, char** argv) { {"snarls", no_argument, 0, 'R'}, {"snarl-contents", no_argument, 0, OPT_SNARL_CONTENTS}, {"snarl-sample", required_argument, 0, OPT_SNARL_SAMPLE}, + {"snarl-outlier", no_argument, 0, OPT_SNARL_OUTLIER}, {"chains", no_argument, 0, 'C'}, {"format", no_argument, 0, 'F'}, {"degree-dist", no_argument, 0, 'D'}, @@ -251,6 +256,10 @@ int main_stats(int argc, char** argv) { case OPT_SNARL_SAMPLE: snarl_sample = optarg; break; + + case OPT_SNARL_OUTLIER: + snarl_outlier = true; + break; case 'C': chain_stats = true; @@ -297,6 +306,11 @@ int main_stats(int argc, char** argv) { exit(1); } + if (snarl_outlier && snarl_sample.empty()) { + cerr << "error [vg stats]: --snarl-outlier can only be used with --snarl-sample" << endl; + exit(1); + } + bdsg::ReferencePathOverlayHelper overlay_helper; unique_ptr path_handle_graph; PathHandleGraph* graph = nullptr; @@ -1142,18 +1156,24 @@ int main_stats(int argc, char** argv) { if (!snarl_sample.empty()) { // optionally prefix with bed-like refpath coordinates if --snarl-sample given cout <<"Contig\tStartPos\tEndPos\t"; + if (snarl_outlier) { + cout << "OutlierName\tOutlierJaccard\tOutlierLen\t"; + } if (pp_graph == nullptr) { pp_graph = overlay_helper.apply(graph); } vector ref_path_names; pp_graph->for_each_path_of_sample(snarl_sample, [&](path_handle_t path_handle) { - ref_path_names.push_back(graph->get_path_name(path_handle)); + ref_path_names.push_back(pp_graph->get_path_name(path_handle)); }); if (ref_path_names.empty()) { cerr << "error [vg stats]: unable to find any paths of --snarl-sample " << snarl_sample << endl; exit(1); } + if (snarl_outlier) { + ref_path_names.clear(); // if searching for outliers, we want all traversals + } path_trav_finder = unique_ptr(new PathTraversalFinder(*pp_graph, ref_path_names)); } cout << "Start\tStart-Reversed\tEnd\tEnd-Reversed\tUltrabubble\tUnary\tShallow-Nodes\tShallow-Edges\tShallow-bases\tDeep-Nodes\tDeep-Edges\tDeep-Bases\tDepth\tChildren\tChains\tChains-Children\tNet-Graph-Size\n"; @@ -1164,15 +1184,26 @@ int main_stats(int argc, char** argv) { if (snarl_stats) { if (!snarl_sample.empty()) { // find the reference interval from path index if snarl lies directly on reference path - vector travs; - std::tie(std::ignore, travs) = path_trav_finder->find_path_traversals( + vector travs; + vector trav_intervals; + std::tie(travs, trav_intervals) = path_trav_finder->find_path_traversals( pp_graph->get_handle(snarl->start().node_id(), snarl->start().backward()), pp_graph->get_handle(snarl->end().node_id(), snarl->end().backward())); - if (travs.empty() && snarl_to_ref.count(manager.parent_of(snarl))) { + for (int64_t i = 0; i < trav_intervals.size(); ++i) { + // make sure ref trav is first + if (pp_graph->get_sample_name(pp_graph->get_path_handle_of_step(trav_intervals[i].first)) == snarl_sample) { + if (i > 1) { + swap(travs[0], travs[1]); + swap(trav_intervals[0], trav_intervals[i]); + } + break; + } + } + if (trav_intervals.empty() && snarl_to_ref.count(manager.parent_of(snarl))) { // snarl's not on the reference path, us its parent interval - travs.push_back(snarl_to_ref.at(manager.parent_of(snarl))); + trav_intervals.push_back(snarl_to_ref.at(manager.parent_of(snarl))); } - if (travs.empty()) { + if (trav_intervals.empty()) { // search toward the reference path unordered_map> start_steps; unordered_map> end_steps; @@ -1206,43 +1237,43 @@ int main_stats(int argc, char** argv) { step != graph->path_end(graph->get_path_handle_of_step(start_path_steps.second[0])); step = graph->get_next_step(step)) { if (end_ids.count(graph->get_id(graph->get_handle_of_step(step)))) { - travs.push_back(make_pair(start_path_steps.second[0], + trav_intervals.push_back(make_pair(start_path_steps.second[0], end_ids[graph->get_id(graph->get_handle_of_step(step))])); break; } } // search backward - if (travs.empty()) { + if (trav_intervals.empty()) { for (step_handle_t step = graph->get_previous_step(start_path_steps.second[0]); step != graph->path_front_end(graph->get_path_handle_of_step(start_path_steps.second[0])); step = graph->get_previous_step(step)) { if (end_ids.count(graph->get_id(graph->get_handle_of_step(step)))) { - travs.push_back(make_pair(start_path_steps.second[0], + trav_intervals.push_back(make_pair(start_path_steps.second[0], end_ids[graph->get_id(graph->get_handle_of_step(step))])); break; } } } } - if (!travs.empty()) { + if (!trav_intervals.empty()) { break; } } // unlikely, but maybe only one end of the snarl reaches the reference. in that case // we just consider it both ends of the interval - if (travs.empty() && !start_steps.empty()) { - travs.push_back(make_pair(start_steps.begin()->second.front(), start_steps.begin()->second.front())); + if (trav_intervals.empty() && !start_steps.empty()) { + trav_intervals.push_back(make_pair(start_steps.begin()->second.front(), start_steps.begin()->second.front())); } - if (travs.empty() && !end_steps.empty()) { - travs.push_back(make_pair(end_steps.begin()->second.front(), end_steps.begin()->second.front())); + if (trav_intervals.empty() && !end_steps.empty()) { + trav_intervals.push_back(make_pair(end_steps.begin()->second.front(), end_steps.begin()->second.front())); } } - if (travs.empty()) { + if (trav_intervals.empty()) { cout << ".\t.\t.\t"; } else { // note: in case of a cycle, we're just taking the first found - step_handle_t start_step = travs.front().first; - step_handle_t end_step = travs.front().second; + step_handle_t start_step = trav_intervals.front().first; + step_handle_t end_step = trav_intervals.front().second; int64_t start_pos = pp_graph->get_position_of_step(start_step); int64_t end_pos = pp_graph->get_position_of_step(end_step); if (end_pos < start_pos) { @@ -1254,7 +1285,40 @@ int main_stats(int argc, char** argv) { << end_pos << "\t"; // use this interval for off-reference children - snarl_to_ref[snarl] = travs.front(); + snarl_to_ref[snarl] = trav_intervals.front(); + + // print the outlier of snarl (only works if snarl is exactly on reference since it + // requires the full traversal list) + if (snarl_outlier) { + int64_t outlier_idx = -1; + double outlier_jaccard = numeric_limits::max(); + int64_t outlier_len = -1; + if (trav_intervals.size() > 1) { + vector jaccards(trav_intervals.size()); + multiset refset; + for (handle_t handle : travs[0]) { + refset.insert(handle); + } + for (int64_t i = 1; i < trav_intervals.size(); ++i) { + multiset travset; + int64_t travlen = 0; + for (handle_t handle : travs[i]) { + travset.insert(handle); + travlen += pp_graph->get_length(handle); + } + double jaccard = weighted_jaccard_coefficient(pp_graph, refset, travset); + if (jaccard < outlier_jaccard) { + outlier_idx = i; + outlier_jaccard = jaccard; + outlier_len = travlen; + } + } + cout << pp_graph->get_path_name(pp_graph->get_path_handle_of_step(trav_intervals[outlier_idx].first)) + << "\t" << outlier_jaccard << "\t" << outlier_len << "\t"; + } else { + cout << ".\t.\t.\t"; + } + } } }