Skip to content

Commit

Permalink
countlets v1.3, slight optimisation in klets.cpp
Browse files Browse the repository at this point in the history
  • Loading branch information
bjmt committed May 5, 2019
1 parent ae178c3 commit 5594898
Show file tree
Hide file tree
Showing 5 changed files with 62 additions and 31 deletions.
6 changes: 6 additions & 0 deletions ChangeLog
Original file line number Diff line number Diff line change
@@ -1,3 +1,9 @@
2019-05-04 Benjamin Jean-Marie Tremblay <benjmtremblay@gmail.com>

* faster char to int conversion in klets.cpp
* countlets is much faster when providing alphabet (using unordered_map)
* countlets version bumped to 1.3

2019-05-04 Benjamin Jean-Marie Tremblay <benjmtremblay@gmail.com>

* Don't bother and try to fill edges of unconnected vertices (shuffle_euler)
Expand Down
7 changes: 3 additions & 4 deletions README
Original file line number Diff line number Diff line change
Expand Up @@ -45,10 +45,9 @@ This utility counts the total number of k-lets in the input sequence. Be aware
that the total number of k-lets is n^k, where n is the alphabet length. For use
cases involving memory constraints, providing the sequence alphabet ahead of
time will allow countlets to count k-lets while only needing to load k + 1
letters into memory at a time (at the cost of speed for higher k values). When
the alphabet is provided, it will typically never take up more than several MBs
of memory. Optionally, k-lets with counts of zero can be omitted from the
output.
letters into memory at a time. When the alphabet is provided, it will typically
never take up more than several MBs of memory. Optionally, k-lets with counts of
zero can be omitted from the output.

Example usage:

Expand Down
28 changes: 13 additions & 15 deletions src/countlets.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -25,13 +25,13 @@
#include <string>
#include <set>
#include <unistd.h>
#include <map>
#include <unordered_map>
#include "klets.hpp"
using namespace std;

void usage() {
printf(
"countlets v1.2 Copyright (C) 2019 Benjamin Jean-Marie Tremblay \n"
"countlets v1.3 Copyright (C) 2019 Benjamin Jean-Marie Tremblay \n"
" \n"
"Usage: countlets [options] -i [filename] -o [filename] \n"
" echo [string] | countlets [options] > [filename] \n"
Expand All @@ -42,23 +42,23 @@ void usage() {
" format. \n"
" -a <str> A string containing all of the alphabet letters present in the \n"
" sequence. This allows the program not to have to load the entire \n"
" sequence into memory to find all of the unique letters. The downside\n"
" is that runtime increases more with increasing k. \n"
" sequence into memory to find all of the unique letters. \n"
" -k <int> K-let size. Defaults to 1. \n"
" -n Don't print k-lets with counts of zero. \n"
" -h Show usage. \n"
);
}

map<string, unsigned int> count_stream(istream &input, vector<string> klets,
unordered_map<string, unsigned int> count_stream(istream &input, vector<string> klets,
unsigned int k) {

char l;

string let;
let.reserve(k + 1);

map<string, unsigned int> counts;
unordered_map<string, unsigned int> counts;
counts.reserve(klets.size());
for (size_t i = 0; i < klets.size(); ++i) {
counts[klets[i]] = 0;
}
Expand Down Expand Up @@ -199,7 +199,7 @@ int main(int argc, char **argv) {

/* this version only keeps k+1 characters in memory */

map<string, unsigned int> counts;
unordered_map<string, unsigned int> counts;

if (alph.length() < 1) {
cerr << "Error: could not parse -a option" << endl;
Expand Down Expand Up @@ -228,17 +228,15 @@ int main(int argc, char **argv) {
cerr << "Warning: foreign character(s) encountered" << endl;
}

map<string, unsigned int>::iterator it;

if (has_out) {
for (it = counts.begin(); it != counts.end(); ++it) {
if (it->second > 0 || !nozero)
outfile << it->first << "\t" << it->second << endl;
for (size_t i = 0; i < klets.size(); ++i) {
if (counts[klets[i]] > 0 || !nozero)
outfile << klets[i] << "\t" << counts[klets[i]] << "\t" << endl;
}
} else {
for (it = counts.begin(); it != counts.end(); ++it) {
if (it->second > 0 || !nozero)
cout << it->first << "\t" << it->second << endl;
for (size_t i = 0; i < klets.size(); ++i) {
if (counts[klets[i]] > 0 || !nozero)
cout << klets[i] << "\t" << counts[klets[i]] << "\t" << endl;
}
}

Expand Down
49 changes: 38 additions & 11 deletions src/klets.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -24,8 +24,14 @@
#include <iomanip>
#include <cmath>
#include <algorithm>
#include <unordered_map>
using namespace std;

#ifdef ADD_TIMERS
#include <chrono>
using Clock = chrono::high_resolution_clock;
#endif

vector<string> make_klets(vector<char> lets_uniq, unsigned int k) {

size_t alphlen = lets_uniq.size();
Expand Down Expand Up @@ -68,25 +74,35 @@ vector<unsigned int> count_klets(vector<char> letters, vector<char> lets_uniq,
* sequence in memory.
*/

#ifdef ADD_TIMERS
auto t0 = Clock::now();
cerr << ">BEGIN count_klets()" << endl;
#endif

size_t seqlen = letters.size();
unsigned int nlets = pow(alphlen, k);
vector<unsigned int> intletters;
unsigned int l, counter;
vector<unsigned int> let_counts(nlets, 0);
vector<unsigned int> intletters;
intletters.reserve(seqlen);
unordered_map<char, unsigned int> let2int;
let2int.reserve(lets_uniq.size());

for (size_t i = 0; i < seqlen; ++i) {

for (size_t j = 0; j < alphlen; ++j) {
if (letters[i] == lets_uniq[j]) {
intletters.push_back(j);
break;
}
}
for (size_t i = 0; i < lets_uniq.size(); ++i) {
let2int[lets_uniq[i]] = (unsigned int)i;
}

for (size_t i = 0; i < seqlen; ++i) {
intletters.push_back(let2int[letters[i]]);
}

unsigned int l;
unsigned int counter;
#ifdef ADD_TIMERS
auto t1 = Clock::now();
cerr << " lets->ints\t"
<< chrono::duration_cast<chrono::microseconds>(t1 - t0).count()
<< " us" << endl;
#endif

for (size_t i = 0; i < seqlen - k + 1; ++i) {

l = 0; counter = 0;
Expand All @@ -98,6 +114,17 @@ vector<unsigned int> count_klets(vector<char> letters, vector<char> lets_uniq,

}

#ifdef ADD_TIMERS
auto t2 = Clock::now();
cerr << " count loop\t"
<< chrono::duration_cast<chrono::microseconds>(t2 - t1).count()
<< " us" << endl;
cerr << " ---\n fun total\t"
<< chrono::duration_cast<chrono::microseconds>(t2 - t0).count()
<< " us" << endl;
cerr << ">END count_klets()" << endl;
#endif

return let_counts;

}
3 changes: 2 additions & 1 deletion src/shuffle_euler.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -214,7 +214,7 @@ string shuffle_euler(vector<char> letters, default_random_engine gen, unsigned i
bool verbose) {

#ifdef ADD_TIMERS
cerr << ">shuffler_euler()" << endl;
cerr << ">BEGIN shuffler_euler()" << endl;
auto t0 = Clock::now();
#endif

Expand Down Expand Up @@ -341,6 +341,7 @@ string shuffle_euler(vector<char> letters, default_random_engine gen, unsigned i
cerr << " ---\n fun total\t"
<< chrono::duration_cast<chrono::microseconds>(t16 - t0).count()
<< " us" << endl;
cerr << ">END shuffler_euler()" << endl;
#endif

return out;
Expand Down

0 comments on commit 5594898

Please sign in to comment.