diff --git a/ChangeLog b/ChangeLog index 620c2a5..3b780e6 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,9 @@ +2019-05-04 Benjamin Jean-Marie Tremblay * Don't bother and try to fill edges of unconnected vertices (shuffle_euler) diff --git a/README b/README index 630b6a1..233f02f 100644 --- a/README +++ b/README @@ -45,10 +45,9 @@ This utility counts the total number of k-lets in the input sequence. Be aware that the total number of k-lets is n^k, where n is the alphabet length. For use cases involving memory constraints, providing the sequence alphabet ahead of time will allow countlets to count k-lets while only needing to load k + 1 -letters into memory at a time (at the cost of speed for higher k values). When -the alphabet is provided, it will typically never take up more than several MBs -of memory. Optionally, k-lets with counts of zero can be ommitted from the -output. +letters into memory at a time. When the alphabet is provided, it will typically +never take up more than several MBs of memory. Optionally, k-lets with counts of +zero can be ommitted from the output. Example usage: diff --git a/src/countlets.cpp b/src/countlets.cpp index 79cafc0..c5beca9 100644 --- a/src/countlets.cpp +++ b/src/countlets.cpp @@ -25,13 +25,13 @@ #include #include #include -#include +#include #include "klets.hpp" using namespace std; void usage() { printf( - "countlets v1.2 Copyright (C) 2019 Benjamin Jean-Marie Tremblay \n" + "countlets v1.3 Copyright (C) 2019 Benjamin Jean-Marie Tremblay \n" " \n" "Usage: countlets [options] -i [filename] -o [filename] \n" " echo [string] | countlets [options] > [filename] \n" @@ -42,15 +42,14 @@ void usage() { " format. \n" " -a A string containing all of the alphabet letters present in the \n" " sequence. This allows the program not to have to load the entire \n" - " sequence into memory to find all of the unique letters. The downside\n" - " is that runtime increases more with increasing k. \n" + " sequence into memory to find all of the unique letters. \n" " -k K-let size. Defaults to 1. \n" " -n Don't print k-lets with counts of zero. \n" " -h Show usage. \n" ); } -map count_stream(istream &input, vector klets, +unordered_map count_stream(istream &input, vector klets, unsigned int k) { char l; @@ -58,7 +57,8 @@ map count_stream(istream &input, vector klets, string let; let.reserve(k + 1); - map counts; + unordered_map counts; + counts.reserve(klets.size()); for (size_t i = 0; i < klets.size(); ++i) { counts[klets[i]] = 0; } @@ -199,7 +199,7 @@ int main(int argc, char **argv) { /* this version only keeps k+1 characters in memory */ - map counts; + unordered_map counts; if (alph.length() < 1) { cerr << "Error: could not parse -a option" << endl; @@ -228,17 +228,15 @@ int main(int argc, char **argv) { cerr << "Warning: foreign character(s) encountered" << endl; } - map::iterator it; - if (has_out) { - for (it = counts.begin(); it != counts.end(); ++it) { - if (it->second > 0 || !nozero) - outfile << it->first << "\t" << it->second << endl; + for (size_t i = 0; i < klets.size(); ++i) { + if (counts[klets[i]] > 0 || !nozero) + outfile << klets[i] << "\t" << counts[klets[i]] << "\t" << endl; } } else { - for (it = counts.begin(); it != counts.end(); ++it) { - if (it->second > 0 || !nozero) - cout << it->first << "\t" << it->second << endl; + for (size_t i = 0; i < klets.size(); ++i) { + if (counts[klets[i]] > 0 || !nozero) + cout << klets[i] << "\t" << counts[klets[i]] << "\t" << endl; } } diff --git a/src/klets.cpp b/src/klets.cpp index 9e7cae7..d03b39f 100644 --- a/src/klets.cpp +++ b/src/klets.cpp @@ -24,8 +24,14 @@ #include #include #include +#include using namespace std; +#ifdef ADD_TIMERS +#include +using Clock = chrono::high_resolution_clock; +#endif + vector make_klets(vector lets_uniq, unsigned int k) { size_t alphlen = lets_uniq.size(); @@ -68,25 +74,35 @@ vector count_klets(vector letters, vector lets_uniq, * sequence in memory. */ + #ifdef ADD_TIMERS + auto t0 = Clock::now(); + cerr << ">BEGIN count_klets()" << endl; + #endif + size_t seqlen = letters.size(); unsigned int nlets = pow(alphlen, k); - vector intletters; + unsigned int l, counter; vector let_counts(nlets, 0); + vector intletters; intletters.reserve(seqlen); + unordered_map let2int; + let2int.reserve(lets_uniq.size()); - for (size_t i = 0; i < seqlen; ++i) { - - for (size_t j = 0; j < alphlen; ++j) { - if (letters[i] == lets_uniq[j]) { - intletters.push_back(j); - break; - } - } + for (size_t i = 0; i < lets_uniq.size(); ++i) { + let2int[lets_uniq[i]] = (unsigned int)i; + } + for (size_t i = 0; i < seqlen; ++i) { + intletters.push_back(let2int[letters[i]]); } - unsigned int l; - unsigned int counter; + #ifdef ADD_TIMERS + auto t1 = Clock::now(); + cerr << " lets->ints\t" + << chrono::duration_cast(t1 - t0).count() + << " us" << endl; + #endif + for (size_t i = 0; i < seqlen - k + 1; ++i) { l = 0; counter = 0; @@ -98,6 +114,17 @@ vector count_klets(vector letters, vector lets_uniq, } + #ifdef ADD_TIMERS + auto t2 = Clock::now(); + cerr << " count loop\t" + << chrono::duration_cast(t2 - t1).count() + << " us" << endl; + cerr << " ---\n fun total\t" + << chrono::duration_cast(t2 - t0).count() + << " us" << endl; + cerr << ">END count_klets()" << endl; + #endif + return let_counts; } diff --git a/src/shuffle_euler.cpp b/src/shuffle_euler.cpp index ffe1c7c..1c38671 100644 --- a/src/shuffle_euler.cpp +++ b/src/shuffle_euler.cpp @@ -214,7 +214,7 @@ string shuffle_euler(vector letters, default_random_engine gen, unsigned i bool verbose) { #ifdef ADD_TIMERS - cerr << ">shuffler_euler()" << endl; + cerr << ">BEGIN shuffler_euler()" << endl; auto t0 = Clock::now(); #endif @@ -341,6 +341,7 @@ string shuffle_euler(vector letters, default_random_engine gen, unsigned i cerr << " ---\n fun total\t" << chrono::duration_cast(t16 - t0).count() << " us" << endl; + cerr << ">END shuffler_euler()" << endl; #endif return out;