forked from wmorgan/whistlepig
-
Notifications
You must be signed in to change notification settings - Fork 0
/
entry.c
161 lines (126 loc) · 4.54 KB
/
entry.c
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
#include "whistlepig.h"
#include "tokenizer.lex.h"
// Hashes a NUL-terminated string. The hash is seeded with the first
// character; every following character c is folded in as
// h = (h << 5) - h + c (i.e. h * 31 + c when khint_t is an unsigned type).
// The empty string hashes to 0.
static inline khint_t khash_hash_string(const char *s) {
  khint_t hash = *s;
  if(hash == 0) return hash; // empty string: nothing to fold in

  const char* cursor = s + 1;
  while(*cursor) {
    hash = (hash << 5) - hash + *cursor;
    ++cursor;
  }
  return hash;
}
// Hash of a (field, term) pair: the XOR of the string hashes of its two
// components.
inline khint_t fielded_term_hash(fielded_term ft) {
  khint_t field_hash = khash_hash_string(ft.field);
  khint_t term_hash = khash_hash_string(ft.term);
  return field_hash ^ term_hash;
}
// Equality for (field, term) pairs: returns 1 when both the field strings and
// the term strings compare equal, 0 otherwise. The terms are only compared
// when the fields already match.
inline khint_t fielded_term_equals(fielded_term a, fielded_term b) {
  if(strcmp(a.field, b.field) != 0) return 0;
  return strcmp(a.term, b.term) == 0;
}
// Allocates an empty entry: a fresh (field, term) -> positions hash table and
// a position counter starting at 0.
//
// Returns the new entry, or NULL if either the entry itself or its hash table
// could not be allocated. The caller releases it with wp_entry_free().
wp_entry* wp_entry_new() {
  wp_entry* ret = malloc(sizeof *ret);
  if(ret == NULL) return NULL;

  ret->entries = kh_init(entries);
  if(ret->entries == NULL) { // table allocation failed; don't hand back a half-built entry
    free(ret);
    return NULL;
  }

  ret->next_offset = 0;
  return ret;
}
// Records one occurrence of (field, term) at the entry's current position and
// then advances the position counter by one.
//
// field/term:         source strings; only the first field_len/term_len
//                     characters are used, so they need not be NUL-terminated
//                     at that length.
// field_len/term_len: number of characters to take from field/term.
//
// Private NUL-terminated copies of both strings are made for use as the hash
// key. If the key is new, the table keeps the copies (they are released in
// wp_entry_free); if the key already exists, the copies are freed here.
//
// Always returns NO_ERROR.
// NOTE(review): the calloc results are not checked -- an allocation failure
// here will crash in strncpy. Confirm the project's OOM policy.
RAISING_STATIC(add_token(wp_entry* entry, const char* field, const char* term, size_t field_len, size_t term_len)) {
  fielded_term ft;
  int status;

  // copy field and term; calloc's zero fill supplies the terminating NUL
  ft.field = calloc(field_len + 1, sizeof(char));
  strncpy(ft.field, field, field_len);
  ft.term = calloc(term_len + 1, sizeof(char));
  strncpy(ft.term, term, term_len);

  khiter_t k = kh_put(entries, entry->entries, ft, &status);
  if(status == 0) { // key already present: just add the next offset to its array
    RARRAY_ADD(pos_t, kh_value(entry->entries, k), entry->next_offset);

    // the table kept its original key, so our copies aren't needed
    free(ft.field);
    free(ft.term);
  }
  else { // a slot was claimed for a new key (never-used or previously deleted bucket)
    // the slot's value is uninitialized at this point, so start a fresh array
    RARRAY(pos_t) pa;
    RARRAY_INIT(pos_t, pa);
    RARRAY_ADD(pos_t, pa, entry->next_offset);
    kh_value(entry->entries, k) = pa;
  }

  entry->next_offset++;

  return NO_ERROR;
}
// Returns the sum of RARRAY_NELEM over the position arrays of every occupied
// slot in the entry's hash table, i.e. the total number of recorded positions
// across all (field, term) pairs.
uint32_t wp_entry_size(wp_entry* entry) {
  uint32_t total = 0;

  for(khiter_t slot = kh_begin(entry->entries); slot < kh_end(entry->entries); slot++) {
    if(!kh_exist(entry->entries, slot)) continue; // unoccupied bucket

    RARRAY(pos_t) posarray = kh_val(entry->entries, slot);
    total += RARRAY_NELEM(posarray);
  }

  return total;
}
// tokens whose length exceeds this are not added to the entry
#define MAX_TOKEN_LENGTH 50

// Pulls tokens from the scanner until it reports TOK_DONE, adding each token
// of at most MAX_TOKEN_LENGTH characters to the entry under the given field.
// Longer tokens are skipped without being passed to add_token.
//
// Stops and relays the error as soon as add_token reports one; otherwise
// returns NO_ERROR.
RAISING_STATIC(add_from_lexer(wp_entry* entry, yyscan_t* scanner, const char* field)) {
  size_t field_len = strlen(field);

  for(;;) {
    if(yylex(*scanner) == TOK_DONE) break;
    if(yyget_leng(*scanner) > MAX_TOKEN_LENGTH) continue; // too long: skip

    RELAY_ERROR(add_token(entry, field, yyget_text(*scanner), field_len, yyget_leng(*scanner)));
  }

  return NO_ERROR;
}
// Adds a single, already-tokenized term to the entry under the given field.
// Both field and term must be NUL-terminated; their full lengths are used.
// Relays any error from add_token, otherwise returns NO_ERROR.
wp_error* wp_entry_add_token(wp_entry* entry, const char* field, const char* term) {
  size_t field_len = strlen(field);
  size_t term_len = strlen(term);

  RELAY_ERROR(add_token(entry, field, term, field_len, term_len));

  return NO_ERROR;
}
// tokenizes and adds everything under a single field
//
// Sets up a scanner over the NUL-terminated `string`, feeds every token it
// produces to the entry under `field`, and tears the scanner down again.
//
// Returns NO_ERROR on success, or the error reported while adding tokens. The
// scan buffer and the scanner are released on both paths.
// NOTE(review): the return value of yylex_init_extra is not checked.
wp_error* wp_entry_add_string(wp_entry* entry, const char* field, const char* string) {
  yyscan_t scanner;
  lexinfo charpos = {0, 0};

  yylex_init_extra(&charpos, &scanner);
  YY_BUFFER_STATE state = yy_scan_string(string, scanner);

  // hold on to the result so that cleanup happens before any early return
  wp_error* lex_status = add_from_lexer(entry, &scanner, field);

  yy_delete_buffer(state, scanner);
  yylex_destroy(scanner);

  RELAY_ERROR(lex_status);
  return NO_ERROR;
}
// tokenizes and adds everything from a file under a single field
//
// Sets up a scanner reading from `f`, feeds every token it produces to the
// entry under `field`, and destroys the scanner. `f` itself is not closed
// here.
//
// Returns NO_ERROR on success, or the error reported while adding tokens. The
// scanner is destroyed on both paths.
// NOTE(review): the return value of yylex_init_extra is not checked.
wp_error* wp_entry_add_file(wp_entry* entry, const char* field, FILE* f) {
  yyscan_t scanner;
  lexinfo charpos = {0, 0};

  yylex_init_extra(&charpos, &scanner);
  yyset_in(f, scanner);

  // hold on to the result so that cleanup happens before any early return
  wp_error* lex_status = add_from_lexer(entry, &scanner, field);

  yylex_destroy(scanner);

  RELAY_ERROR(lex_status);
  return NO_ERROR;
}
// Adds one posting to `seg` for every (field, term) pair held in the entry,
// passing along doc_id together with that pair's position count and
// positions.
//
// Stops at, and relays, the first error from wp_segment_add_posting;
// otherwise returns NO_ERROR.
wp_error* wp_entry_write_to_segment(wp_entry* entry, wp_segment* seg, docid_t doc_id) {
  for(khiter_t slot = kh_begin(entry->entries); slot < kh_end(entry->entries); slot++) {
    if(!kh_exist(entry->entries, slot)) continue; // unoccupied bucket

    fielded_term key = kh_key(entry->entries, slot);
    RARRAY(pos_t) posarray = kh_val(entry->entries, slot);

    RELAY_ERROR(wp_segment_add_posting(seg, key.field, key.term, doc_id, RARRAY_NELEM(posarray), RARRAY_ALL(posarray)));
  }

  return NO_ERROR;
}
// Computes into *size the sum, over every (field, term) pair in the entry, of
// the value wp_segment_sizeof_posarray reports for that pair's positions.
//
// currently this is a crazy overestimate (it's calculating the size without
// VBE) but that's fine. as long as we're not an underestimate, we should be ok.
//
// *size is reset to 0 up front and accumulated in place, so if an error is
// relayed part-way through it holds the partial sum so far.
wp_error* wp_entry_sizeof_postings_region(wp_entry* entry, wp_segment* seg, uint32_t* size) {
  *size = 0;

  khiter_t slot = kh_begin(entry->entries);
  for(; slot < kh_end(entry->entries); slot++) {
    if(!kh_exist(entry->entries, slot)) continue; // unoccupied bucket

    RARRAY(pos_t) posarray = kh_val(entry->entries, slot);
    uint32_t posarray_size;
    RELAY_ERROR(wp_segment_sizeof_posarray(seg, RARRAY_NELEM(posarray), RARRAY_ALL(posarray), &posarray_size));
    *size += posarray_size;
  }

  return NO_ERROR;
}
// Releases everything owned by the entry: for each occupied slot, the key's
// field and term strings and the position array; then the hash table; then
// the entry struct itself. The entry must not be used afterwards.
// Always returns NO_ERROR.
wp_error* wp_entry_free(wp_entry* entry) {
  for(khiter_t slot = kh_begin(entry->entries); slot < kh_end(entry->entries); slot++) {
    if(!kh_exist(entry->entries, slot)) continue; // unoccupied bucket

    fielded_term key = kh_key(entry->entries, slot);
    RARRAY(pos_t) posarray = kh_val(entry->entries, slot);

    free(key.term);
    free(key.field);
    RARRAY_FREE(pos_t, posarray);
  }

  kh_destroy(entries, entry->entries);
  free(entry);

  return NO_ERROR;
}