Skip to content

Commit

Permalink
Adds probabilistic inference of ORF length based on stop codon propor…
Browse files Browse the repository at this point in the history
…tions within graph.
  • Loading branch information
samhorsfield96 committed Jul 24, 2022
1 parent 898e81e commit 8a0e6f2
Show file tree
Hide file tree
Showing 8 changed files with 101 additions and 74 deletions.
17 changes: 8 additions & 9 deletions src/call_ORFs.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ void generate_ORFs(const int& colour_ID,
std::unordered_set<size_t>& hashes_to_remove,
const ColoredCDBG<MyUnitigMap>& ccdbg,
const std::vector<Kmer>& head_kmer_arr,
const float& stop_codon_freq,
const std::vector<std::string>& stop_codons,
const std::vector<std::string>& start_codons,
const std::vector<int>& nodelist,
Expand Down Expand Up @@ -135,15 +136,15 @@ void generate_ORFs(const int& colour_ID,

// iterate over each stop codon
for (const auto &codon : stop_codons) {
found_indices = findIndex(path_sequence, codon, 0, 0, false);
found_indices = findIndex(path_sequence, codon, 0, false);
stop_codon_indices.insert(stop_codon_indices.end(), std::make_move_iterator(found_indices.begin()),
std::make_move_iterator(found_indices.end()));
found_indices.clear();
}

// iterate over each start codon
for (const auto &codon : start_codons) {
found_indices = findIndex(path_sequence, codon, 0, 0, false);
found_indices = findIndex(path_sequence, codon, 0, false);
start_codon_indices.insert(start_codon_indices.end(), std::make_move_iterator(found_indices.begin()),
std::make_move_iterator(found_indices.end()));
found_indices.clear();
Expand Down Expand Up @@ -253,7 +254,6 @@ void generate_ORFs(const int& colour_ID,
if (!no_filter)
{
// Get TIS score for all entries
std::vector<float> TIS_scores(stop_codon.second.size());
std::vector<std::tuple<std::string, std::string, size_t>> TIS_list(stop_codon.second.size());

// create vector of all entries
Expand All @@ -279,13 +279,12 @@ void generate_ORFs(const int& colour_ID,
std::pair<size_t, size_t> best_codon = {0,0};
size_t best_hash;
bool best_TIS_present;
size_t best_ORF_len = 0;
ORFCoords best_ORF_coords;
float best_overall_score = 0;
float best_TIS_score = 0;

// set maximum ORF length for stop codon frame
const size_t max_ORF_length = stop_codon.second.at(0).first;
// set best ORF length as longest possible ORF
size_t best_ORF_len = stop_codon.second.at(0).first;

// unpack all ORFs with same stop codon
for (int i = 0; i < stop_codon.second.size(); i++)
Expand Down Expand Up @@ -313,11 +312,11 @@ void generate_ORFs(const int& colour_ID,
auto& um_data = um_pair.second;
float start_coverage = (float)um_data->full_colour().count() / (float)nb_colours;

// calculate length as proportion of previous ORF in list
float prev_len_prop = (float)ORF_len / (float)max_ORF_length;
// calculate delta length from max ORF in codon space
float delta_length = (float)(best_ORF_len / 3) - (float)(ORF_len / 3);

// generate score based on start coverage multiplied by dataset size, TIS score and size proportion to previous ORF
const float overall_score = start_coverage * TIS_score * prev_len_prop;
const float overall_score = start_coverage * TIS_score * std::pow((1 - stop_codon_freq), delta_length);

// determine if score is better and start site is better supported
if (overall_score > best_overall_score)
Expand Down
1 change: 1 addition & 0 deletions src/call_ORFs.h
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ void generate_ORFs(const int& colour_ID,
std::unordered_set<size_t>& hashes_to_remove,
const ColoredCDBG<MyUnitigMap>& ccdbg,
const std::vector<Kmer>& head_kmer_arr,
const float& stop_codon_freq,
const std::vector<std::string>& stop_codons,
const std::vector<std::string>& start_codons,
const std::vector<int>& nodelist,
Expand Down
6 changes: 4 additions & 2 deletions src/graph.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -304,7 +304,7 @@ std::pair<ColourORFMap, ColourEdgeMap> Graph::findGenes (const bool repeat,
}

// convert this to map to make removal easier
ORF_map = std::move(traverse_graph(_ccdbg, _KmerArray, colour_ID, node_ids, repeat, max_path_length,
ORF_map = std::move(traverse_graph(_ccdbg, _KmerArray, _stop_freq, colour_ID, node_ids, repeat, max_path_length,
overlap, is_ref, _RefSet, fm_idx, stop_codons_for, start_codons_for, min_ORF_length,
TIS_model, minimum_ORF_score, no_filter, all_TIS_scores));

Expand Down Expand Up @@ -807,7 +807,9 @@ void Graph::_index_graph (const std::vector<std::string>& stop_codons_for,
const size_t& nb_colours,
const std::vector<std::string>& input_colours)
{
_NodeColourVector = std::move(index_graph(_KmerArray, _ccdbg, stop_codons_for, stop_codons_rev, kmer, nb_colours, input_colours, _RefSet));
float stop_codon_freq = 0;
_NodeColourVector = std::move(index_graph(_KmerArray, _ccdbg, stop_codon_freq, stop_codons_for, stop_codons_rev, kmer, nb_colours, input_colours, _RefSet));
_stop_freq= stop_codon_freq;
}

ORFClusterMap read_cluster_file(const std::string& cluster_file)
Expand Down
42 changes: 22 additions & 20 deletions src/graph.h
Original file line number Diff line number Diff line change
Expand Up @@ -48,25 +48,25 @@ class Graph {
void out(const std::string& outfile);

// find ORFs
std::pair<ColourORFMap, ColourEdgeMap> findGenes (const bool repeat,
const size_t overlap,
const size_t max_path_length,
bool no_filter,
const std::vector<std::string>& stop_codons_for,
const std::vector<std::string>& start_codons_for,
const size_t min_ORF_length,
const size_t max_overlap,
const std::vector<std::string>& input_colours,
const std::string& ORF_model_file,
const std::string& TIS_model_file,
const float& minimum_ORF_score,
const float& minimum_path_score,
const size_t max_ORF_path_length,
const bool clustering,
const double& id_cutoff,
const double& len_diff_cutoff,
size_t num_threads,
const std::string& cluster_file);
std::pair<ColourORFMap, ColourEdgeMap> findGenes (const bool repeat,
const size_t overlap,
const size_t max_path_length,
bool no_filter,
const std::vector<std::string>& stop_codons_for,
const std::vector<std::string>& start_codons_for,
const size_t min_ORF_length,
const size_t max_overlap,
const std::vector<std::string>& input_colours,
const std::string& ORF_model_file,
const std::string& TIS_model_file,
const float& minimum_ORF_score,
const float& minimum_path_score,
const size_t max_ORF_path_length,
const bool clustering,
const double& id_cutoff,
const double& len_diff_cutoff,
size_t num_threads,
const std::string& cluster_file);


std::pair<RefindMap, bool> refind_gene(const size_t& colour_ID,
Expand Down Expand Up @@ -107,7 +107,6 @@ std::pair<ColourORFMap, ColourEdgeMap> findGenes (const bool repeat,
// clear graph object
void clear() {_ccdbg.clear(); _KmerArray.clear();};


private:
// index graph
void _index_graph (const std::vector<std::string>& stop_codons_for,
Expand All @@ -121,6 +120,9 @@ std::pair<ColourORFMap, ColourEdgeMap> findGenes (const bool repeat,
// stored bifrost DBG
ColoredCDBG<MyUnitigMap> _ccdbg;

// stop codon frequency
float _stop_freq = 0;

// mapping of head kmers to nodes
std::vector<Kmer> _KmerArray;

Expand Down
95 changes: 57 additions & 38 deletions src/indexing.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -59,31 +59,26 @@ ColoredCDBG<MyUnitigMap> buildGraph (const std::string& infile_1,
std::vector<std::size_t> findIndex(const std::string& seq,
const std::string& subseq,
const int start_index,
const int overlap,
const bool reverse)
{
// search in forward direction from start index forwards
if (!reverse)
std::vector<std::size_t> index_list;
size_t pos = seq.find(subseq, start_index);

while(pos != string::npos)
{
std::vector<std::size_t> index_list;
size_t pos = seq.find(subseq, start_index);
while(pos != string::npos)
{
index_list.push_back(pos - overlap);
pos = seq.find(subseq,pos+1);
}
return index_list;
} else // search in reverse direction from start index backwards
index_list.push_back(pos);
pos = seq.find(subseq,pos+1);
}

if (reverse)
{
std::vector<std::size_t> index_list;
size_t pos = seq.rfind(subseq, start_index);
while(pos != string::npos && pos != 0)
for (auto& entry : index_list)
{
index_list.push_back(start_index - pos);
pos = seq.rfind(subseq,pos - 1);
// reverse by adding 3 (two to start from start of stop codon, 1 to make zero indexed)
entry = seq.size() - (entry + 3);
}
return index_list;
}
return index_list;
}

// calculate stop codon frames for each reading frame
Expand Down Expand Up @@ -174,6 +169,8 @@ std::vector<std::pair<Kmer, bool>> get_neighbours (const T& neighbour_iterator)
template <class T, class U, bool is_const>
void analyse_unitigs_binary (ColoredCDBG<MyUnitigMap>& ccdbg,
UnitigMap<DataAccessor<T>, DataStorage<U>, is_const> um,
size_t& num_stops,
size_t& num_codons,
const std::vector<std::string>& codon_for,
const std::vector<std::string>& codon_rev,
const int& kmer,
Expand All @@ -187,6 +184,10 @@ void analyse_unitigs_binary (ColoredCDBG<MyUnitigMap>& ccdbg,
const std::string unitig = um.referenceUnitigToString();
const size_t unitig_len = unitig.size();

// number of 3-mers is sequence length negate 2, multiplied by two for forward and reverse strand
#pragma omp atomic
num_codons += (unitig_len - 2) * 2;

// get head kmer for unitig, add to kmer dictionary
const Kmer head_kmer_binary = um.getUnitigHead();
const std::string head_kmer = head_kmer_binary.toString();
Expand All @@ -209,40 +210,50 @@ void analyse_unitigs_binary (ColoredCDBG<MyUnitigMap>& ccdbg,
std::vector<std::size_t> full_indices_neg;
std::vector<std::size_t> part_indices_neg;

std::vector<std::size_t> found_indices;
// use overlap - 2, taking into account unitigs where stop codon is split at end
const int overlap = kmer - 3;

// find the full and part indices of each of the codons in the unitig in positive strand
for (const auto& codon : codon_for)
{
// full positive strand
found_indices = findIndex(unitig, codon, 0, 0, false);
full_indices_pos.insert(full_indices_pos.end(), make_move_iterator(found_indices.begin()), make_move_iterator(found_indices.end()));
found_indices.clear();

// part positive strand, ensure that unitig length is sufficient to have at least 1 codon length once overlap negated
if (unitig_len > kmer + 2) {
found_indices = findIndex(unitig, codon, kmer - 1, kmer - 1,false);
part_indices_pos.insert(part_indices_pos.end(), make_move_iterator(found_indices.begin()), make_move_iterator(found_indices.end()));
found_indices.clear();
std::vector<std::size_t> found_indices = findIndex(unitig, codon, 0, false);

#pragma omp atomic
num_stops += found_indices.size();

full_indices_pos.insert(full_indices_pos.end(), found_indices.begin(), found_indices.end());

// part positive strand, take off overlap - 2, as may be case that codon in upstream node is split at end of unitig
for (const auto& entry : found_indices)
{
if (entry >= (overlap))
{
part_indices_pos.push_back(entry - (overlap));
}
}
}

// find the full and part indices of each of the codons in the unitig in negative strand

// get this working, now working without rfind, need to determine correct coorindates for partial unitigs
for (const auto& codon : codon_rev)
{
// full negative strand
std::vector<std::size_t> found_indices = findIndex(unitig, codon, 0, true);

#pragma omp atomic
num_stops += found_indices.size();

// issue here when actually find a match
found_indices = findIndex(unitig, codon, unitig_len - 3, 0, true);
full_indices_neg.insert(full_indices_neg.end(), make_move_iterator(found_indices.begin()), make_move_iterator(found_indices.end()));
found_indices.clear();
full_indices_neg.insert(full_indices_neg.end(), found_indices.begin(), found_indices.end());

// part negative strand, ensure that unitig length is sufficient to have at least 1 codon length once overlap negated
if (unitig_len > kmer + 2)
// part negative strand, taking into account potential for stop codon to be cut off from previous unitig
for (const auto& entry : found_indices)
{
found_indices = findIndex(unitig, codon, (unitig_len - 3) - (kmer - 1), 0, true);
part_indices_neg.insert(part_indices_neg.end(), make_move_iterator(found_indices.begin()), make_move_iterator(found_indices.end()));
found_indices.clear();
if (entry >= (overlap))
{
part_indices_neg.push_back(entry - (overlap));
}
}
}

Expand Down Expand Up @@ -360,6 +371,7 @@ void calculate_genome_paths(const std::vector<Kmer>& head_kmer_arr,

NodeColourVector index_graph(std::vector<Kmer>& head_kmer_arr,
ColoredCDBG<MyUnitigMap>& ccdbg,
float& stop_codon_freq,
const std::vector<std::string>& stop_codons_for,
const std::vector<std::string>& stop_codons_rev,
const int kmer,
Expand All @@ -379,6 +391,10 @@ NodeColourVector index_graph(std::vector<Kmer>& head_kmer_arr,

NodeColourVector node_colour_vector(nb_colours);

// determine number of stops and codons
size_t num_stops = 0;
size_t num_codons = 0;

// run unitig indexing in parallel
#pragma omp parallel
{
Expand All @@ -391,7 +407,7 @@ NodeColourVector index_graph(std::vector<Kmer>& head_kmer_arr,
UnitigColorMap<MyUnitigMap> um = ccdbg.find(*it, true);

// generate results per unitig
analyse_unitigs_binary(ccdbg, um, stop_codons_for, stop_codons_rev, kmer, nb_colours);
analyse_unitigs_binary(ccdbg, um, num_stops, num_codons, stop_codons_for, stop_codons_rev, kmer, nb_colours);
DataAccessor<MyUnitigMap>* da = um.getData();
MyUnitigMap* um_data = da->getData(um);

Expand All @@ -414,6 +430,9 @@ NodeColourVector index_graph(std::vector<Kmer>& head_kmer_arr,
}
}

// determine average stop codon frequency per codon
stop_codon_freq = (float)num_stops / (float)num_codons;

// generate index of references within ref_set
std::vector<size_t> ref_index;
for (int i = 0; i < input_colours.size(); i++)
Expand Down
4 changes: 3 additions & 1 deletion src/indexing.h
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,6 @@ ColoredCDBG<MyUnitigMap> buildGraph (const std::string& infile_1,
std::vector<std::size_t> findIndex(const std::string& seq,
const std::string& subseq,
const int start_index,
const int overlap,
const bool reverse);

std::bitset<3> calculateFrame_binary_full (const std::vector<std::size_t>& index_list);
Expand All @@ -34,6 +33,8 @@ std::vector<std::pair<Kmer, bool>> get_neighbours (const T& neighbour_iterator);
template <class T, class U, bool is_const>
void analyse_unitigs_binary (ColoredCDBG<MyUnitigMap>& ccdbg,
UnitigMap<DataAccessor<T>, DataStorage<U>, is_const> um,
size_t& num_stops,
size_t& num_codons,
const std::vector<std::string>& codon_for,
const std::vector<std::string>& codon_rev,
const int& kmer,
Expand All @@ -48,6 +49,7 @@ void calculate_genome_paths(const std::vector<Kmer>& head_kmer_arr,

NodeColourVector index_graph(std::vector<Kmer>& head_kmer_arr,
ColoredCDBG<MyUnitigMap>& ccdbg,
float& stop_codon_freq,
const std::vector<std::string>& stop_codons_for,
const std::vector<std::string>& stop_codons_rev,
const int kmer,
Expand Down
Loading

0 comments on commit 8a0e6f2

Please sign in to comment.