forked from lemire/rollinghashcpp
-
Notifications
You must be signed in to change notification settings - Fork 0
/
cyclichash.h
154 lines (128 loc) · 5.41 KB
/
cyclichash.h
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
#ifndef CYCLICHASH
#define CYCLICHASH
#include "characterhash.h"
/**
* Each instance is a rolling hash function meant to hash streams of characters.
* Each new instance of this class comes with new random keys.
*
* Recommended usage to get L-bit hash values over n-grams:
* CyclicHash<> hf(n,L );
* for(uint32 k = 0; k<n;++k) {
* unsigned char c = ... ; // grab some character
* hf.eat(c); // feed it to the hasher
* }
* while(...) { // go over your string
* hf.hashvalue; // at all times, this contains the hash value
* unsigned char c = ... ;// points to the next character
* unsigned char out = ...; // character we want to forget
* hf.update(out,c); // update hash value
* }
*/
template <typename hashvaluetype = uint32, typename chartype = unsigned char>
class CyclicHash {
public:
// myn is the length of the sequences, e.g., 3 means that you want to hash sequences of 3 characters
// mywordsize is the number of bits you which to receive as hash values, e.g., 19 means that the hash values are 19-bit integers
CyclicHash(int myn, int mywordsize=19) : hashvalue(0),
n(myn), wordsize(mywordsize),
hasher(maskfnc<hashvaluetype>(wordsize)),
mask1(maskfnc<hashvaluetype>(wordsize-1)),
myr(n%wordsize),
maskn(maskfnc<hashvaluetype>(wordsize-myr))
{
if(static_cast<uint>(wordsize) > 8*sizeof(hashvaluetype)) {
cerr<<"Can't create "<<wordsize<<"-bit hash values"<<endl;
throw "abord";
}
}
CyclicHash(int myn, uint32 seed1, uint32 seed2, int mywordsize=19) :
hashvalue(0),
n(myn), wordsize(mywordsize),
hasher(maskfnc<hashvaluetype>(wordsize), seed1, seed2),
mask1(maskfnc<hashvaluetype>(wordsize-1)),
myr(n%wordsize),
maskn(maskfnc<hashvaluetype>(wordsize-myr))
{
if(static_cast<uint>(wordsize) > 8*sizeof(hashvaluetype)) {
cerr<<"Can't create "<<wordsize<<"-bit hash values"<<endl;
throw "abord";
}
}
void fastleftshiftn(hashvaluetype & x) const {
x = ((x & maskn) << myr ) | (x >> (wordsize-myr)) ;
}
void fastleftshift1(hashvaluetype & x) const {
x = ((x & mask1) << 1 ) | (x >> (wordsize-1)) ;
}
void fastrightshift1(hashvaluetype & x) const {
x = (x >> 1 ) | ((x & 1)<< (wordsize-1)) ;
}
hashvaluetype getfastleftshift1(hashvaluetype x) const {
return ((x & mask1) << 1 ) | (x >> (wordsize-1)) ;
}
hashvaluetype getfastrightshift1(hashvaluetype x) const {
return (x >> 1 ) | ((x & 1)<< (wordsize-1)) ;
}
// this is a convenience function, use eat,update and .hashvalue to use as a rolling hash function
template<class container>
hashvaluetype hash(container & c) {
hashvaluetype answer(0);
for(uint k = 0; k<c.size(); ++k) {
fastleftshift1(answer);
answer ^= hasher.hashvalues[static_cast<unsigned int>(c[k])];
}
return answer;
}
hashvaluetype hashz(chartype outchar,uint n) {
hashvaluetype answer = hasher.hashvalues[static_cast<unsigned int>(outchar)];
for(uint k = 0; k<n; ++k) {
fastleftshift1(answer);
}
return answer;
}
// add inchar as an input and remove outchar, the hashvalue is updated
// this function can be used to update the hash value from the hash value of [outchar]ABC to the hash value of ABC[inchar]
void update(chartype outchar, chartype inchar) {
hashvaluetype z (hasher.hashvalues[outchar]);
fastleftshiftn(z);
hashvalue = getfastleftshift1(hashvalue)
^ z
^ hasher.hashvalues[inchar];
}
// this is the reverse of the update function.
// this function can be used to update the hash value from the hash value of ABC[inchar] to the hash value of [outchar]ABC
void reverse_update(chartype outchar, chartype inchar) {
hashvaluetype z (hasher.hashvalues[outchar]);
fastleftshiftn(z);
hashvalue ^= z ^ hasher.hashvalues[inchar];
hashvalue = getfastrightshift1(hashvalue);
}
// add inchar as an input, this is used typically only at the start
// the hash value is updated to that of a longer string (one where inchar was appended)
void eat(chartype inchar) {
fastleftshift1(hashvalue);
hashvalue ^= hasher.hashvalues[inchar];
}
//for an n-gram X it returns hash value of (n + 1)-gram XY without changing the object X. For example, if X = "ABC", then X.hash_extend("D") returns value of "ABCD" without changing the state of X
hashvaluetype hash_extend(chartype Y) {
return getfastleftshift1(hashvalue) ^ hasher.hashvalues[Y];
}
// same as hash_extend, but with prepending the n-gram with character Y. If X = "ABC", then X.hash_prepend("D") returns value of "DABC" without changing the state of X
hashvaluetype hash_prepend(chartype Y) {
hashvaluetype z (hasher.hashvalues[Y]);
fastleftshiftn(z);
return z ^ hashvalue;
}
// prepare to process a new string, you will need to call "eat" again
void reset() {
hashvalue = 0;
}
hashvaluetype hashvalue;
int n;
const int wordsize;
CharacterHash<hashvaluetype,chartype> hasher;
const hashvaluetype mask1;
const int myr;
const hashvaluetype maskn;
};
#endif