forked from lemire/rollinghashcpp
-
Notifications
You must be signed in to change notification settings - Fork 0
/
threewisehash.h
95 lines (80 loc) · 3.1 KB
/
threewisehash.h
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
#ifndef THREEWISEHASH
#define THREEWISEHASH
#include <deque>
#include <vector>
#include "characterhash.h"
using namespace std;
/**
* Each instance is a rolling hash function meant to hash streams of characters.
* Each new instance of this class comes with new random keys.
*
* Recommended usage to get L-bit hash values over n-grams:
* ThreeWiseHash<> hf(n,L );
* for(uint32 k = 0; k<n;++k) {
* unsigned char c = ... ; // grab some character
* hf.eat(c); // feed it to the hasher
* }
* while(...) { // go over your string
* hf.hashvalue; // at all times, this contains the hash value
* unsigned char c = ... ;// points to the next character
* unsigned char out = ...; // character we want to forget
* hf.update(out,c); // update hash value
* }
*/
template <typename hashvaluetype = uint32, typename chartype = unsigned char>
class ThreeWiseHash {
public:
// myn is the length of the sequences, e.g., 3 means that you want to hash sequences of 3 characters
// mywordsize is the number of bits you which to receive as hash values, e.g., 19 means that the hash values are 19-bit integers
ThreeWiseHash(int myn, int mywordsize=19) : n(myn), wordsize(mywordsize),
hashers(),hasher(0) {
if(static_cast<uint>(wordsize) > 8*sizeof(hashvaluetype)) {
cerr<<"Can't create "<<wordsize<<"-bit hash values"<<endl;
throw "abord";
}
for (int i=0; i < n; ++i) {
CharacterHash<hashvaluetype,chartype> ch(maskfnc<hashvaluetype>(wordsize));
hashers.push_back(ch);
}
}
// add inchar as an input, this is used typically only at the start
// the hash value is updated to that of a longer string (one where inchar was appended)
void eat(chartype inchar) {
ngram.push_back(inchar);
__updateHashValue();
}
// add inchar as an input and remove outchar, the hashvalue is updated
// this function can be used to update the hash value from the hash value of [outchar]ABC to the hash value of ABC[inchar]
void update(chartype outchar, chartype inchar) {
ngram.push_back(inchar);
ngram.pop_front();
__updateHashValue();
}
// prepare to process a new string, you will need to call "eat" again
void reset() {
hashvalue = 0;
ngram.clear();
}
void __updateHashValue() {
hashvalue = 0;
for(size_t k = 0; k<ngram.size(); ++k) {
hashvalue ^= hashers[k].hashvalues[ngram[k]];
}
}
// this is a convenience function, use eat,update and .hashvalue to use as a rolling hash function
template<class container>
hashvaluetype hash(container & c) {
hashvaluetype answer(0);
for(size_t k = 0; k<c.size(); ++k) {
answer ^= hashers[k].hashvalues[c[k]];
}
return answer;
}
hashvaluetype hashvalue;
int n;
const int wordsize;
deque<chartype> ngram;
vector<CharacterHash<hashvaluetype,chartype> > hashers;
CharacterHash<hashvaluetype,chartype> hasher;//placeholder
};
#endif