Skip to content

Commit

Permalink
Adds method to prevent refinding traversal over nodes that are alread…
Browse files Browse the repository at this point in the history
…y part of called ORFs.
  • Loading branch information
samhorsfield96 committed Jul 28, 2022
1 parent 8a0e6f2 commit db2d0df
Show file tree
Hide file tree
Showing 5 changed files with 51 additions and 29 deletions.
11 changes: 10 additions & 1 deletion panaroo_runner/find_missing.py
Original file line number Diff line number Diff line change
Expand Up @@ -215,11 +215,20 @@ def search_graph(search_pair,

node_locs = {}

# keep track of regions already with genes to avoid re-traversal
to_avoid = set()

# mask regions that already have genes
for node, ORF_info in conflicts.items():
# determine sequence overlap of ORFs
total_overlap = 0
for i, node_coords in enumerate(ORF_info[1]):
node_overlap = (node_coords[1] - node_coords[0]) + 1

# if node is fully traversed for given strand, add to nodes to avoid
if node_overlap == graph_shd_arr[0].node_size(ORF_info[0][i]):
to_avoid.add(ORF_info[0][i])

if i != 0:
if node_coords[1] >= kmer - 1:
total_overlap += ((kmer - 1) - node_coords[0])
Expand All @@ -229,7 +238,7 @@ def search_graph(search_pair,

# get sequences to search
refind_map, is_ref = graph_shd_arr[0].refind_gene(member, node_search_dict, search_radius, kmer, fasta,
repeat)
repeat, to_avoid)

# search for matches
hits = []
Expand Down
52 changes: 30 additions & 22 deletions src/gene_refinding.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -135,7 +135,8 @@ RefindPathVector iter_nodes_length (const ColoredCDBG<MyUnitigMap>& ccdbg,
const bool& repeat,
const bool& is_ref,
const fm_index_coll& fm_idx,
const int overlap)
const int overlap,
const std::unordered_set<int>& to_avoid)
{
// generate path list, vector for path and the stack
RefindPathVector path_list;
Expand Down Expand Up @@ -196,23 +197,17 @@ RefindPathVector iter_nodes_length (const ColoredCDBG<MyUnitigMap>& ccdbg,
// update path_list to enable to return all nodes up to point where stop encountered
path_list.push_back({path_length, node_vector});
continue;
} else
{
// if not is_ref, check that unitig is shared in at least one other colour
if (!is_ref)
{
if (neighbour_colour.count() < 2)
{
// update path_list to enable to return all nodes up to point where stop encountered
path_list.push_back({path_length, node_vector});
continue;
}
}
}

// parse neighbour information. Frame is next stop codon, with first dictating orientation and second the stop codon index
const int neighbour_id = (neighbour_strand) ? neighbour_um_data->get_id() : neighbour_um_data->get_id() * -1;

// check if node is already fully traversed by another gene
if (to_avoid.find(neighbour_id) != to_avoid.end())
{
continue;
}

// check against fm-idx every node, pass if not present
if (is_ref)
{
Expand All @@ -225,11 +220,22 @@ RefindPathVector iter_nodes_length (const ColoredCDBG<MyUnitigMap>& ccdbg,
path_list.push_back({path_length, node_vector});
continue;
}
} else if (!is_ref && !repeat)
} else
{
// if using reads, check if unitig has already been traversed, and pass if repeat not specified
const bool is_in = std::find(node_vector.begin(), node_vector.end(), neighbour_id) != node_vector.end();
if (is_in)
if (!repeat)
{
// if using reads, check if unitig has already been traversed, and pass if repeat not specified
const bool is_in = std::find(node_vector.begin(), node_vector.end(), neighbour_id) != node_vector.end();
if (is_in)
{
// update path_list to enable to return all nodes up to point where stop encountered
path_list.push_back({path_length, node_vector});
continue;
}
}

// if not is_ref, check that unitig is shared in at least one other colour
if (neighbour_colour.count() < 2)
{
// update path_list to enable to return all nodes up to point where stop encountered
path_list.push_back({path_length, node_vector});
Expand Down Expand Up @@ -281,7 +287,8 @@ RefindTuple traverse_outward(const ColoredCDBG<MyUnitigMap>& ccdbg,
const bool is_ref,
const int kmer,
const fm_index_coll& fm_idx,
const bool repeat)
const bool repeat,
const std::unordered_set<int>& to_avoid)
{
// initialise upstream and downstream strings
std::string upstream_seq;
Expand Down Expand Up @@ -326,7 +333,7 @@ RefindTuple traverse_outward(const ColoredCDBG<MyUnitigMap>& ccdbg,
NodeTuple head_node_tuple(0, head_id, codon_arr, colour_arr, path_length);

// recur paths
unitig_complete_paths = iter_nodes_length(ccdbg, head_kmer_arr, head_node_tuple, colour_ID, radius, repeat, is_ref, fm_idx, kmer - 1);
unitig_complete_paths = iter_nodes_length(ccdbg, head_kmer_arr, head_node_tuple, colour_ID, radius, repeat, is_ref, fm_idx, kmer - 1, to_avoid);
}

if (!unitig_complete_paths.empty())
Expand Down Expand Up @@ -399,7 +406,7 @@ RefindTuple traverse_outward(const ColoredCDBG<MyUnitigMap>& ccdbg,
NodeTuple head_node_tuple(0, head_id, codon_arr, colour_arr, path_length);

// recur paths
unitig_complete_paths = iter_nodes_length(ccdbg, head_kmer_arr, head_node_tuple, colour_ID, radius, repeat, is_ref, fm_idx, kmer - 1);
unitig_complete_paths = iter_nodes_length(ccdbg, head_kmer_arr, head_node_tuple, colour_ID, radius, repeat, is_ref, fm_idx, kmer - 1, to_avoid);
}

if (!unitig_complete_paths.empty())
Expand Down Expand Up @@ -445,7 +452,8 @@ RefindMap refind_in_nodes(const ColoredCDBG<MyUnitigMap>& ccdbg,
const bool is_ref,
const int kmer,
const fm_index_coll& fm_idx,
const bool repeat)
const bool repeat,
const std::unordered_set<int>& to_avoid)
{
RefindMap refind_map;

Expand All @@ -459,7 +467,7 @@ RefindMap refind_in_nodes(const ColoredCDBG<MyUnitigMap>& ccdbg,
for (const auto& ORF_info : seq_search.second)
{
refind_map[node].push_back(std::move(traverse_outward(ccdbg, head_kmer_arr, colour_ID, ORF_info, radius, is_ref,
kmer, fm_idx, repeat)));
kmer, fm_idx, repeat, to_avoid)));
}
}
return refind_map;
Expand Down
9 changes: 6 additions & 3 deletions src/gene_refinding.h
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,8 @@ RefindPathVector iter_nodes_length (const ColoredCDBG<MyUnitigMap>& ccdbg,
const bool& repeat,
const bool& is_ref,
const fm_index_coll& fm_idx,
const int overlap);
const int overlap,
const std::unordered_set<int>& to_avoid);

RefindTuple traverse_outward(const ColoredCDBG<MyUnitigMap>& ccdbg,
const std::vector<Kmer>& head_kmer_arr,
Expand All @@ -37,7 +38,8 @@ RefindTuple traverse_outward(const ColoredCDBG<MyUnitigMap>& ccdbg,
const bool is_ref,
const int kmer,
const fm_index_coll& fm_idx,
const bool repeat);
const bool repeat,
const std::unordered_set<int>& to_avoid);

RefindMap refind_in_nodes(const ColoredCDBG<MyUnitigMap>& ccdbg,
const std::vector<Kmer>& head_kmer_arr,
Expand All @@ -47,6 +49,7 @@ RefindMap refind_in_nodes(const ColoredCDBG<MyUnitigMap>& ccdbg,
const bool is_ref,
const int kmer,
const fm_index_coll& fm_idx,
const bool repeat);
const bool repeat,
const std::unordered_set<int>& to_avoid);

#endif //GENE_REFINDING_H
5 changes: 3 additions & 2 deletions src/graph.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -709,7 +709,8 @@ std::pair<RefindMap, bool> Graph::refind_gene(const size_t& colour_ID,
const size_t radius,
const int kmer,
const std::string& FM_fasta_file,
const bool repeat)
const bool repeat,
const std::unordered_set<int>& to_avoid)
{
// get whether colour is reference or not
bool is_ref = ((bool)_RefSet[colour_ID]) ? true : false;
Expand All @@ -726,7 +727,7 @@ std::pair<RefindMap, bool> Graph::refind_gene(const size_t& colour_ID,
}

return {refind_in_nodes(_ccdbg, _KmerArray, colour_ID, node_search_dict, radius, is_ref,
kmer, fm_idx, repeat), is_ref};
kmer, fm_idx, repeat, to_avoid), is_ref};
}

std::string Graph::generate_sequence(const std::vector<int>& nodelist,
Expand Down
3 changes: 2 additions & 1 deletion src/graph.h
Original file line number Diff line number Diff line change
Expand Up @@ -74,7 +74,8 @@ class Graph {
const size_t radius,
const int kmer,
const std::string& FM_fasta_file,
const bool repeat);
const bool repeat,
const std::unordered_set<int>& to_avoid);

// generate sequences from ORF node_lists
std::string generate_sequence(const std::vector<int>& nodelist,
Expand Down

0 comments on commit db2d0df

Please sign in to comment.