diff --git a/src/call_ORFs.cpp b/src/call_ORFs.cpp index 83de06a..2d391ef 100644 --- a/src/call_ORFs.cpp +++ b/src/call_ORFs.cpp @@ -8,6 +8,7 @@ void generate_ORFs(const int& colour_ID, std::unordered_set& hashes_to_remove, const ColoredCDBG& ccdbg, const std::vector& head_kmer_arr, + const float& stop_codon_freq, const std::vector& stop_codons, const std::vector& start_codons, const std::vector& nodelist, @@ -135,7 +136,7 @@ void generate_ORFs(const int& colour_ID, // iterate over each stop codon for (const auto &codon : stop_codons) { - found_indices = findIndex(path_sequence, codon, 0, 0, false); + found_indices = findIndex(path_sequence, codon, 0, false); stop_codon_indices.insert(stop_codon_indices.end(), std::make_move_iterator(found_indices.begin()), std::make_move_iterator(found_indices.end())); found_indices.clear(); @@ -143,7 +144,7 @@ void generate_ORFs(const int& colour_ID, // iterate over each start codon for (const auto &codon : start_codons) { - found_indices = findIndex(path_sequence, codon, 0, 0, false); + found_indices = findIndex(path_sequence, codon, 0, false); start_codon_indices.insert(start_codon_indices.end(), std::make_move_iterator(found_indices.begin()), std::make_move_iterator(found_indices.end())); found_indices.clear(); @@ -253,7 +254,6 @@ void generate_ORFs(const int& colour_ID, if (!no_filter) { // Get TIS score for all entries - std::vector TIS_scores(stop_codon.second.size()); std::vector> TIS_list(stop_codon.second.size()); // create vector of all entries @@ -279,13 +279,12 @@ void generate_ORFs(const int& colour_ID, std::pair best_codon = {0,0}; size_t best_hash; bool best_TIS_present; - size_t best_ORF_len = 0; ORFCoords best_ORF_coords; float best_overall_score = 0; float best_TIS_score = 0; - // set maximum ORF length for stop codon frame - const size_t max_ORF_length = stop_codon.second.at(0).first; + // set best ORF length as longest possible ORF + size_t best_ORF_len = stop_codon.second.at(0).first; // unpack all ORFs with same stop codon for (int i = 0; i < stop_codon.second.size(); i++) @@ -313,11 +312,11 @@ void generate_ORFs(const int& colour_ID, auto& um_data = um_pair.second; float start_coverage = (float)um_data->full_colour().count() / (float)nb_colours; - // calculate length as proportion of previous ORF in list - float prev_len_prop = (float)ORF_len / (float)max_ORF_length; + // calculate delta length from max ORF in codon space + float delta_length = (float)(best_ORF_len / 3) - (float)(ORF_len / 3); // generate score based on start coverage multiplied by dataset size, TIS score and size proportion to previous ORF - const float overall_score = start_coverage * TIS_score * prev_len_prop; + const float overall_score = start_coverage * TIS_score * std::pow((1 - stop_codon_freq), delta_length); // determine if score is better and start site is better supported if (overall_score > best_overall_score) diff --git a/src/call_ORFs.h b/src/call_ORFs.h index 4475822..9d7230b 100644 --- a/src/call_ORFs.h +++ b/src/call_ORFs.h @@ -13,6 +13,7 @@ void generate_ORFs(const int& colour_ID, std::unordered_set& hashes_to_remove, const ColoredCDBG& ccdbg, const std::vector& head_kmer_arr, + const float& stop_codon_freq, const std::vector& stop_codons, const std::vector& start_codons, const std::vector& nodelist, diff --git a/src/graph.cpp b/src/graph.cpp index d528819..9388d6f 100644 --- a/src/graph.cpp +++ b/src/graph.cpp @@ -304,7 +304,7 @@ std::pair Graph::findGenes (const bool repeat, } // convert this to map to make removal easier - ORF_map = std::move(traverse_graph(_ccdbg, _KmerArray, colour_ID, node_ids, repeat, max_path_length, + ORF_map = std::move(traverse_graph(_ccdbg, _KmerArray, _stop_freq, colour_ID, node_ids, repeat, max_path_length, overlap, is_ref, _RefSet, fm_idx, stop_codons_for, start_codons_for, min_ORF_length, TIS_model, minimum_ORF_score, no_filter, all_TIS_scores)); @@ -807,7 +807,9 @@ void Graph::_index_graph (const std::vector& stop_codons_for, const size_t& nb_colours, const std::vector& input_colours) { - _NodeColourVector = std::move(index_graph(_KmerArray, _ccdbg, stop_codons_for, stop_codons_rev, kmer, nb_colours, input_colours, _RefSet)); + float stop_codon_freq = 0; + _NodeColourVector = std::move(index_graph(_KmerArray, _ccdbg, stop_codon_freq, stop_codons_for, stop_codons_rev, kmer, nb_colours, input_colours, _RefSet)); + _stop_freq= stop_codon_freq; } ORFClusterMap read_cluster_file(const std::string& cluster_file) diff --git a/src/graph.h b/src/graph.h index 099b4a8..9d554be 100644 --- a/src/graph.h +++ b/src/graph.h @@ -48,25 +48,25 @@ class Graph { void out(const std::string& outfile); // find ORFs -std::pair findGenes (const bool repeat, - const size_t overlap, - const size_t max_path_length, - bool no_filter, - const std::vector& stop_codons_for, - const std::vector& start_codons_for, - const size_t min_ORF_length, - const size_t max_overlap, - const std::vector& input_colours, - const std::string& ORF_model_file, - const std::string& TIS_model_file, - const float& minimum_ORF_score, - const float& minimum_path_score, - const size_t max_ORF_path_length, - const bool clustering, - const double& id_cutoff, - const double& len_diff_cutoff, - size_t num_threads, - const std::string& cluster_file); + std::pair findGenes (const bool repeat, + const size_t overlap, + const size_t max_path_length, + bool no_filter, + const std::vector& stop_codons_for, + const std::vector& start_codons_for, + const size_t min_ORF_length, + const size_t max_overlap, + const std::vector& input_colours, + const std::string& ORF_model_file, + const std::string& TIS_model_file, + const float& minimum_ORF_score, + const float& minimum_path_score, + const size_t max_ORF_path_length, + const bool clustering, + const double& id_cutoff, + const double& len_diff_cutoff, + size_t num_threads, + const std::string& cluster_file); std::pair refind_gene(const size_t& colour_ID, @@ -107,7 +107,6 @@ std::pair findGenes (const bool repeat, // clear graph object void clear() {_ccdbg.clear(); _KmerArray.clear();}; - private: // index graph void _index_graph (const std::vector& stop_codons_for, @@ -121,6 +120,9 @@ std::pair findGenes (const bool repeat, // stored bifrost DBG ColoredCDBG _ccdbg; + // stop codon frequency + float _stop_freq = 0; + // mapping of head kmers to nodes std::vector _KmerArray; diff --git a/src/indexing.cpp b/src/indexing.cpp index 037b7d0..d900d76 100644 --- a/src/indexing.cpp +++ b/src/indexing.cpp @@ -59,31 +59,26 @@ ColoredCDBG buildGraph (const std::string& infile_1, std::vector findIndex(const std::string& seq, const std::string& subseq, const int start_index, - const int overlap, const bool reverse) { - // search in forward direction from start index forwards - if (!reverse) + std::vector index_list; + size_t pos = seq.find(subseq, start_index); + + while(pos != string::npos) { - std::vector index_list; - size_t pos = seq.find(subseq, start_index); - while(pos != string::npos) - { - index_list.push_back(pos - overlap); - pos = seq.find(subseq,pos+1); - } - return index_list; - } else // search in reverse direction from start index backwards + index_list.push_back(pos); + pos = seq.find(subseq,pos+1); + } + + if (reverse) { - std::vector index_list; - size_t pos = seq.rfind(subseq, start_index); - while(pos != string::npos && pos != 0) + for (auto& entry : index_list) { - index_list.push_back(start_index - pos); - pos = seq.rfind(subseq,pos - 1); + // reverse by adding 3 (two to start from start of stop codon, 1 to make zero indexed) + entry = seq.size() - (entry + 3); } - return index_list; } + return index_list; } // calculate stop codon frames for each reading frame @@ -174,6 +169,8 @@ std::vector> get_neighbours (const T& neighbour_iterator) template void analyse_unitigs_binary (ColoredCDBG& ccdbg, UnitigMap, DataStorage, is_const> um, + size_t& num_stops, + size_t& num_codons, const std::vector& codon_for, const std::vector& codon_rev, const int& kmer, @@ -187,6 +184,10 @@ void analyse_unitigs_binary (ColoredCDBG& ccdbg, const std::string unitig = um.referenceUnitigToString(); const size_t unitig_len = unitig.size(); + // number of 3-mers is sequence length negate 2, multiplied by two for forward and reverse strand + #pragma omp atomic + num_codons += (unitig_len - 2) * 2; + // get head kmer for unitig, add to kmer dictionary const Kmer head_kmer_binary = um.getUnitigHead(); const std::string head_kmer = head_kmer_binary.toString(); @@ -209,40 +210,50 @@ void analyse_unitigs_binary (ColoredCDBG& ccdbg, std::vector full_indices_neg; std::vector part_indices_neg; - std::vector found_indices; + // use overlap - 2, taking into account unitigs where stop codon is split at end + const int overlap = kmer - 3; // find the full and part indices of each of the codons in the unitig in positive strand for (const auto& codon : codon_for) { // full positive strand - found_indices = findIndex(unitig, codon, 0, 0, false); - full_indices_pos.insert(full_indices_pos.end(), make_move_iterator(found_indices.begin()), make_move_iterator(found_indices.end())); - found_indices.clear(); - - // part positive strand, ensure that unitig length is sufficient to have at least 1 codon length once overlap negated - if (unitig_len > kmer + 2) { - found_indices = findIndex(unitig, codon, kmer - 1, kmer - 1,false); - part_indices_pos.insert(part_indices_pos.end(), make_move_iterator(found_indices.begin()), make_move_iterator(found_indices.end())); - found_indices.clear(); + std::vector found_indices = findIndex(unitig, codon, 0, false); + + #pragma omp atomic + num_stops += found_indices.size(); + + full_indices_pos.insert(full_indices_pos.end(), found_indices.begin(), found_indices.end()); + + // part positive strand, take off overlap - 2, as may be case that codon in upstream node is split at end of unitig + for (const auto& entry : found_indices) + { + if (entry >= (overlap)) + { + part_indices_pos.push_back(entry - (overlap)); + } } } // find the full and part indices of each of the codons in the unitig in negative strand + + // get this working, now working without rfind, need to determine correct coorindates for partial unitigs for (const auto& codon : codon_rev) { // full negative strand + std::vector found_indices = findIndex(unitig, codon, 0, true); + + #pragma omp atomic + num_stops += found_indices.size(); - // issue here when actually find a match - found_indices = findIndex(unitig, codon, unitig_len - 3, 0, true); - full_indices_neg.insert(full_indices_neg.end(), make_move_iterator(found_indices.begin()), make_move_iterator(found_indices.end())); - found_indices.clear(); + full_indices_neg.insert(full_indices_neg.end(), found_indices.begin(), found_indices.end()); - // part negative strand, ensure that unitig length is sufficient to have at least 1 codon length once overlap negated - if (unitig_len > kmer + 2) + // part negative strand, taking into account potential for stop codon to be cut off from previous unitig + for (const auto& entry : found_indices) { - found_indices = findIndex(unitig, codon, (unitig_len - 3) - (kmer - 1), 0, true); - part_indices_neg.insert(part_indices_neg.end(), make_move_iterator(found_indices.begin()), make_move_iterator(found_indices.end())); - found_indices.clear(); + if (entry >= (overlap)) + { + part_indices_neg.push_back(entry - (overlap)); + } } } @@ -360,6 +371,7 @@ void calculate_genome_paths(const std::vector& head_kmer_arr, NodeColourVector index_graph(std::vector& head_kmer_arr, ColoredCDBG& ccdbg, + float& stop_codon_freq, const std::vector& stop_codons_for, const std::vector& stop_codons_rev, const int kmer, @@ -379,6 +391,10 @@ NodeColourVector index_graph(std::vector& head_kmer_arr, NodeColourVector node_colour_vector(nb_colours); + // determine number of stops and codons + size_t num_stops = 0; + size_t num_codons = 0; + // run unitig indexing in parallel #pragma omp parallel { @@ -391,7 +407,7 @@ NodeColourVector index_graph(std::vector& head_kmer_arr, UnitigColorMap um = ccdbg.find(*it, true); // generate results per unitig - analyse_unitigs_binary(ccdbg, um, stop_codons_for, stop_codons_rev, kmer, nb_colours); + analyse_unitigs_binary(ccdbg, um, num_stops, num_codons, stop_codons_for, stop_codons_rev, kmer, nb_colours); DataAccessor* da = um.getData(); MyUnitigMap* um_data = da->getData(um); @@ -414,6 +430,9 @@ NodeColourVector index_graph(std::vector& head_kmer_arr, } } + // determine average stop codon frequency per codon + stop_codon_freq = (float)num_stops / (float)num_codons; + // generate index of references within ref_set std::vector ref_index; for (int i = 0; i < input_colours.size(); i++) diff --git a/src/indexing.h b/src/indexing.h index b091d36..787b0ec 100644 --- a/src/indexing.h +++ b/src/indexing.h @@ -16,7 +16,6 @@ ColoredCDBG buildGraph (const std::string& infile_1, std::vector findIndex(const std::string& seq, const std::string& subseq, const int start_index, - const int overlap, const bool reverse); std::bitset<3> calculateFrame_binary_full (const std::vector& index_list); @@ -34,6 +33,8 @@ std::vector> get_neighbours (const T& neighbour_iterator); template void analyse_unitigs_binary (ColoredCDBG& ccdbg, UnitigMap, DataStorage, is_const> um, + size_t& num_stops, + size_t& num_codons, const std::vector& codon_for, const std::vector& codon_rev, const int& kmer, @@ -48,6 +49,7 @@ void calculate_genome_paths(const std::vector& head_kmer_arr, NodeColourVector index_graph(std::vector& head_kmer_arr, ColoredCDBG& ccdbg, + float& stop_codon_freq, const std::vector& stop_codons_for, const std::vector& stop_codons_rev, const int kmer, diff --git a/src/traversal.cpp b/src/traversal.cpp index d02fa7a..d1bb694 100644 --- a/src/traversal.cpp +++ b/src/traversal.cpp @@ -138,8 +138,8 @@ PathVector iter_nodes_binary (const ColoredCDBG& ccdbg, continue; } - // calculate modulus for path and updated cached path reading frame - int modulus = path_length % 3; + // calculate modulus for path and updated cached path reading frame, adjust for case where stop codon is split at end of contig + int modulus = (path_length - 2) % 3; std::bitset<3> updated_codon_arr = (codon_arr & ~(neighbour_um_data->get_codon_arr(false, neighbour_strand, modulus))); // return path if end of contig found or if stop indexes paired @@ -189,6 +189,7 @@ PathVector iter_nodes_binary (const ColoredCDBG& ccdbg, ORFNodeRobMap traverse_graph(const ColoredCDBG& ccdbg, const std::vector& head_kmer_arr, + const float& stop_codon_freq, const size_t colour_ID, const std::vector& node_ids, const bool repeat, @@ -267,7 +268,7 @@ ORFNodeRobMap traverse_graph(const ColoredCDBG& ccdbg, for (int i = 0; i < unitig_complete_paths.size(); i++) { // generate all ORFs within the path for start and stop codon pairs - generate_ORFs(colour_ID, ORF_node_map, hashes_to_remove, ccdbg, head_kmer_arr, stop_codons_for, start_codons_for, unitig_complete_paths[i], overlap, min_ORF_length, is_ref, fm_idx, TIS_model, minimum_ORF_score, no_filter, nb_colours, all_TIS_scores); + generate_ORFs(colour_ID, ORF_node_map, hashes_to_remove, ccdbg, head_kmer_arr, stop_codon_freq, stop_codons_for, start_codons_for, unitig_complete_paths[i], overlap, min_ORF_length, is_ref, fm_idx, TIS_model, minimum_ORF_score, no_filter, nb_colours, all_TIS_scores); } } } @@ -324,7 +325,7 @@ ORFNodeRobMap traverse_graph(const ColoredCDBG& ccdbg, for (int i = 0; i < unitig_complete_paths.size(); i++) { // generate all ORFs within the path for start and stop codon pairs - generate_ORFs(colour_ID, ORF_node_map, hashes_to_remove, ccdbg, head_kmer_arr, stop_codons_for, start_codons_for, unitig_complete_paths[i], overlap, min_ORF_length, is_ref, fm_idx, TIS_model, minimum_ORF_score, no_filter, nb_colours, all_TIS_scores); + generate_ORFs(colour_ID, ORF_node_map, hashes_to_remove, ccdbg, head_kmer_arr, stop_codon_freq, stop_codons_for, start_codons_for, unitig_complete_paths[i], overlap, min_ORF_length, is_ref, fm_idx, TIS_model, minimum_ORF_score, no_filter, nb_colours, all_TIS_scores); } } } diff --git a/src/traversal.h b/src/traversal.h index 0624b79..110a5c9 100644 --- a/src/traversal.h +++ b/src/traversal.h @@ -20,6 +20,7 @@ PathVector iter_nodes_binary (const ColoredCDBG& ccdbg, ORFNodeRobMap traverse_graph(const ColoredCDBG& ccdbg, const std::vector& head_kmer_arr, + const float& stop_codon_freq, const size_t colour_ID, const std::vector& node_ids, const bool repeat,