from trec_car.read_data import *
import json
import stop_words as stpw

# Global counter of written documents and the set of paragraph ids already
# written, used to skip duplicate paragraphs.
i = 0
id_set = set()
def parse_annotations(file):
    """
    A simple function to parse annotations from a cbor trec-car file
    similar to the one found on trec-car-tools github repository
    """
    j = 0
    for p in iter_annotations(open(file, 'rb')):
        j += 1
        headings = p.nested_headings()
        print(p.page_id)
        print('headings= ', [(str(section.heading), len(children)) for (section, children) in headings])
        print([len(t) for t in p.flat_headings_list()])
        if len(p.outline()) > 0:
            print('deep headings= ',
                  [(str(section.heading), len(children)) for (section, children) in p.deep_headings_list()])
            print('flat headings= ',
                  ["/".join([str(section.heading) for section in sectionpath]) for sectionpath in p.flat_headings_list()])
        print("\n----------------------------------------------------\n")
    print(j)
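# Illustrative output shape only; the headings below are invented, not taken
# from the collection:
#   deep headings=  [('History', 2), ('Reception', 0)]
#   flat headings=  ['History/Early years', 'History/Later use', 'Reception']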
def parse_paragraphs(file):
    """
    A simple function to parse paragraphs from a cbor trec-car file
    similar to the one found on trec-car-tools github repository
    """
    j = 0
    for p in iter_paragraphs(open(file, 'rb')):
        print('\n', p.para_id, ':')
        # Print just the text
        print('Simple text\n\n')
        texts = [elem.text if isinstance(elem, ParaText) else elem.anchor_text for elem in p.bodies]
        print(' '.join(texts))
        # Print just the linked entities
        print('Entities\n\n')
        entities = [elem.page for elem in p.bodies if isinstance(elem, ParaLink)]
        print(entities)
        # Print text interspersed with links as pairs (text, link)
        print('Mixed\n\n')
        mixed = [(elem.anchor_text, elem.page) if isinstance(elem, ParaLink) else (elem.text, None) for elem in p.bodies]
        print(mixed)
        print('all of it\n\n')
        print(p.get_text())
        print("\n---------------------------------------------------------------------------------------------------\n")
        j += 1
    print(j)
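# A sketch of what the 'Mixed' line prints for a paragraph whose bodies hold
# one ParaLink followed by one ParaText (the values are invented for illustration):
#   [('anchor text', 'Linked Page Title'), (' and some plain text.', None)]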
def make_article_corpus(file):
    """
    Function to create separate trectext documents for each article
    in the collection
    """
    i = 0
    for p in iter_annotations(open(file, 'rb')):
        fp = r'article_test200_corpus/articleDoc' + str(i) + '.trectext'
        wf = open(fp, 'wb')
        wf.write(b"<DOC>\n")
        wf.write(b"<DOCNO>")
        wf.write((p.page_id).encode('utf8'))
        wf.write(b"</DOCNO>\n")
        wf.write(b"<TITLE>")
        wf.write((p.page_name).encode('utf8'))
        wf.write(b"</TITLE>\n")
        wf.write(b"<TEXT>\n")
        article_text = (''.join(str(s) for s in p.skeleton)).encode('utf8')
        wf.write(article_text)
        wf.write(b"\n</TEXT>\n")
        wf.write(b"</DOC>\n")
        wf.close()
        i += 1
    print(i)
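# For reference, each article document written above follows this trectext
# layout (the DOCNO and TITLE values here are illustrative, not real ids):
# <DOC>
# <DOCNO>enwiki:Example%20Page</DOCNO>
# <TITLE>Example Page</TITLE>
# <TEXT>
# ...concatenated page skeleton text...
# </TEXT>
# </DOC>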
def make_big_corpus(file):
    """
    Function to create the big trectext corpus from the half-wiki
    collection
    """
    global i
    j = 0
    fp = r'train_big_corpus' + str(j) + '.trectext'
    wf = open(fp, 'wb')
    for p in iter_paragraphs(open(file, 'rb')):
        if p.para_id not in id_set:
            id_set.add(p.para_id)
            wf.write(b"<DOC>\n")
            wf.write(b"<DOCNO>")
            wf.write((p.para_id).encode('utf8'))
            wf.write(b"</DOCNO>\n")
            wf.write(b"<TEXT>\n")
            wf.write((p.get_text()).encode('utf8'))
            wf.write(b"\n</TEXT>\n")
            wf.write(b"</DOC>\n\n")
            i += 1
            # Start a new corpus file every 70000 paragraphs
            if i > 70000:
                wf.close()
                i = 0
                j += 1
                fp = r'train_big_corpus' + str(j) + '.trectext'
                wf = open(fp, 'wb')
    print(i)
    wf.close()
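# Assuming the paragraphs cbor yields more than 70000 unique paragraphs, the
# output above ends up split across consecutively numbered files, e.g.
#   train_big_corpus0.trectext, train_big_corpus1.trectext, ...
# each holding roughly 70000 <DOC> entries.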
def search_paras_no_headline(p, wf):
    """
    Recursive function used to extract all paragraphs for
    the small test200 corpus from the outlines cbor file
    """
    global i
    if isinstance(p, Para):
        if p.paragraph.para_id not in id_set:
            id_set.add(p.paragraph.para_id)
            wf.write(b"<DOC>\n")
            wf.write(b"<DOCNO>")
            wf.write((p.paragraph.para_id).encode('utf8'))
            wf.write(b"</DOCNO>\n")
            wf.write(b"<TEXT>\n")
            wf.write((p.get_text()).encode('utf8'))
            wf.write(b"\n</TEXT>\n")
            wf.write(b"</DOC>\n\n")
            i += 1
    elif isinstance(p, Section):
        for ch in p.children:
            search_paras_no_headline(ch, wf)
def make_paras(file):
    """
    Function used to create the small test200 corpus by calling
    the recursive search_paras_no_headline function
    """
    global i
    fp = r'paragraph_corpus_test200.trectext'
    wf = open(fp, 'wb')
    for p in iter_annotations(open(file, 'rb')):
        for s in p.skeleton:
            search_paras_no_headline(s, wf)
    print(i)
    wf.close()
def search_article(p, id, wf):
    """
    Old implementation of a recursive function used to search
    for articles in the cbor files
    """
    global i
    if isinstance(p, Para):
        if p.paragraph.para_id not in id_set:
            id_set.add(p.paragraph.para_id)
            wf.write(b"<DOC>\n")
            wf.write(b"<DOCNO>")
            wf.write((p.paragraph.para_id).encode('utf8'))
            wf.write(b"</DOCNO>\n")
            wf.write(b"<HEADLINE>")
            wf.write(id.encode('utf8'))
            wf.write(b"</HEADLINE>\n")
            wf.write(b"<TEXT>\n")
            wf.write(id.encode('utf8'))
            wf.write(b'\n')
            wf.write((p.get_text()).encode('utf8'))
            wf.write(b"\n</TEXT>\n")
            wf.write(b"</DOC>\n\n")
            i += 1
    elif isinstance(p, Section):
        for ch in p.children:
            search_article(ch, id, wf)
def make_article_corpus_v2(file):
    """
    Old implementation of the function that creates the article level
    corpus, which also uses a headline field
    """
    global i
    fp = r'article_corpus_test200.trectext'
    wf = open(fp, 'wb')
    for p in iter_annotations(open(file, 'rb')):
        for s in p.skeleton:
            search_article(s, p.page_name, wf)
    print(i)
    wf.close()
def search_paras(p, id, wf):
    """
    Old implementation of a recursive function used to search
    for paragraphs in the cbor files
    """
    global i
    if isinstance(p, Para):
        if p.paragraph.para_id not in id_set:
            id_set.add(p.paragraph.para_id)
            wf.write(b"<DOC>\n")
            wf.write(b"<DOCNO>")
            wf.write((p.paragraph.para_id).encode('utf8'))
            wf.write(b"</DOCNO>\n")
            wf.write(b"<HEADLINE>")
            wf.write(id.encode('utf8'))
            wf.write(b"</HEADLINE>\n")
            wf.write(b"<TEXT>\n")
            wf.write((id.replace('/', ' ')).encode('utf8'))
            wf.write(b'\n')
            wf.write((p.get_text()).encode('utf8'))
            wf.write(b"\n</TEXT>\n")
            wf.write(b"</DOC>\n\n")
            i += 1
    elif isinstance(p, Section):
        for ch in p.children:
            search_paras(ch, id + '/' + p.heading, wf)
def make_hierarchical_corpus(file):
    """
    Old implementation of the function that creates the hierarchical level
    corpus, which also uses a headline field
    """
    global i
    fp = r'hierarchical_corpus_test200.trectext'
    wf = open(fp, 'wb')
    for p in iter_annotations(open(file, 'rb')):
        for s in p.skeleton:
            search_paras(s, p.page_name, wf)
    print(i)
    wf.close()
def search_toplevel(p, id, level, wf):
    """
    Old implementation of a recursive function used to search
    for toplevel sections in the cbor files
    """
    global i
    if isinstance(p, Para):
        if p.paragraph.para_id not in id_set:
            id_set.add(p.paragraph.para_id)
            wf.write(b"<DOC>\n")
            wf.write(b"<DOCNO>")
            wf.write((p.paragraph.para_id).encode('utf8'))
            wf.write(b"</DOCNO>\n")
            wf.write(b"<HEADLINE>")
            wf.write(id.encode('utf8'))
            wf.write(b"</HEADLINE>\n")
            wf.write(b"<TEXT>\n")
            wf.write((id.replace('/', ' ')).encode('utf8'))
            wf.write(b'\n')
            wf.write((p.get_text()).encode('utf8'))
            wf.write(b"\n</TEXT>\n")
            wf.write(b"</DOC>\n\n")
            i += 1
    elif isinstance(p, Section):
        for ch in p.children:
            # Only the first (top) section level is appended to the id
            if level == 0:
                search_toplevel(ch, id + '/' + p.heading, level + 1, wf)
            else:
                search_toplevel(ch, id, level + 1, wf)
def make_toplevel_corpus(file):
    """
    Old implementation of the function that creates the toplevel
    corpus, which also uses a headline field
    """
    global i
    fp = r'toplevel_corpus_test200.trectext'
    wf = open(fp, 'wb')
    for p in iter_annotations(open(file, 'rb')):
        for s in p.skeleton:
            search_toplevel(s, p.page_name, 0, wf)
    print(i)
    wf.close()
def remove_stop_word(s):
    """
    Simple function to remove the words found in an English stopword list
    """
    stop = set(stpw.get_stop_words('en'))
    return ' '.join([p for p in s.split() if p not in stop])
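# Illustrative use of remove_stop_word (assuming 'of' and 'the' appear in the
# English stop word list returned by the stop_words package):
#   remove_stop_word('History of the Internet')  ->  'History Internet'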
def create_queries(file):
    """
    Function used to create the json file with all the queries needed
    at the hierarchical level. If the commented "for" lines are uncommented
    and the "for" lines above them are commented out, toplevel queries are
    created instead, while commenting out the whole inner for loop creates
    article level queries.
    """
    fp = r'train_queries_hierarchical.json'
    wf = open(fp, 'w')
    data = dict()
    data['index'] = r'C:\Users\Vasilis\Documents\galagoTrials\train_paragraph'
    data['requested'] = 10
    data['processingModel'] = 'org.lemurproject.galago.core.retrieval.processing.RankedDocumentModel'
    data['scorer'] = 'bm25'
    data['queries'] = []
    j = 1
    for p in iter_annotations(open(file, 'rb')):
        data['queries'].append({'number': str(p.page_id), 'text': '#combine(' +
                                remove_stop_word(p.page_name) + ')'})
        print(j)
        j += 1
        for (q, q_id) in [(" ".join([str(section.heading) for section in sectionpath]),
                           "/".join([str(section.headingId) for section in sectionpath]))
                          for sectionpath in p.flat_headings_list()]:
            # for (q, q_id) in [(str(section.heading), str(section.headingId))
            #                   for (section, children) in p.deep_headings_list()]:
            # data['queries'].append({'number': str(j), 'text': '#combine('+p.page_name+' '+q+')'})
            data['queries'].append({'number': str(p.page_id + '/' + q_id), 'text': '#combine('
                                    + remove_stop_word(p.page_name) + ' ' + remove_stop_word(q) + ')'})
            j += 1
            print(j)
    json.dump(data, wf)
    wf.close()
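# A sketch of one hierarchical query entry produced above (the page and
# heading ids are invented for illustration):
#   {"number": "enwiki:Example%20Page/History",
#    "text": "#combine(Example Page History)"}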
def create_queries_for_expansion(file):
    """
    Function used to create the json file with all the queries needed
    at the hierarchical level, with query expansion included (if rm3 is
    the relevance model then the weight should be specified)
    """
    fp = r'train_paragraph_exp3_hierarchical.json'
    wf = open(fp, 'w')
    data = dict()
    data['index'] = r'C:\Users\Vasilis\Documents\galagoTrials\train_paragraph'
    data['requested'] = 10
    data['processingModel'] = 'org.lemurproject.galago.core.retrieval.processing.RankedDocumentModel'
    data['relevanceModel'] = 'org.lemurproject.galago.core.retrieval.prf.RelevanceModel3'
    data['scorer'] = 'bm25'
    data['queries'] = []
    j = 1
    for p in iter_annotations(open(file, 'rb')):
        data['queries'].append({'number': str(p.page_id), 'text': '#rm:bOrigWt=0.8:fbDocs=10:fbTerm=10(' +
                                remove_stop_word(p.page_name) + ')'})
        j += 1
        print(j)
        for (q, q_id) in [(" ".join([str(section.heading) for section in sectionpath]),
                           "/".join([str(section.headingId) for section in sectionpath]))
                          for sectionpath in p.flat_headings_list()]:
            data['queries'].append({'number': str(p.page_id + '/' + q_id), 'text': '#rm:bOrigWt=0.8:fbDocs=10:fbTerm=10('
                                    + remove_stop_word(p.page_name) + ' ' + remove_stop_word(q) + ')'})
            j += 1
            print(j)
    json.dump(data, wf)
    wf.close()
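# A sketch of one expanded query entry produced above (ids invented); the only
# difference from create_queries is the #rm pseudo-relevance-feedback operator
# with the parameters used throughout this script:
#   {"number": "enwiki:Example%20Page/History",
#    "text": "#rm:bOrigWt=0.8:fbDocs=10:fbTerm=10(Example Page History)"}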
def create_queries_for_expansion_article(file):
    """
    Function used to create the json file with all the queries needed
    at the article level, with query expansion included (if rm3 is
    the relevance model then the weight should be specified)
    """
    fp = r'train_paragraph_exp3_article.json'
    wf = open(fp, 'w')
    data = dict()
    data['index'] = r'C:\Users\Vasilis\Documents\galagoTrials\train_paragraph'
    data['requested'] = 20
    data['processingModel'] = 'org.lemurproject.galago.core.retrieval.processing.RankedDocumentModel'
    data['relevanceModel'] = 'org.lemurproject.galago.core.retrieval.prf.RelevanceModel3'
    data['scorer'] = 'bm25'
    data['queries'] = []
    j = 1
    for p in iter_annotations(open(file, 'rb')):
        data['queries'].append({'number': str(p.page_id), 'text': '#rm:bOrigWt=0.8:fbDocs=10:fbTerm=10(' +
                                remove_stop_word(p.page_name) + ')'})
        j += 1
        print(j)
    json.dump(data, wf)
    wf.close()
def create_queries_for_expansion_toplevel(file):
    """
    Function used to create the json file with all the queries needed
    at the toplevel, with query expansion included (if rm3 is
    the relevance model then the weight should be specified)
    """
    fp = r'train_paragraph_exp3_toplevel.json'
    wf = open(fp, 'w')
    data = dict()
    data['index'] = r'C:\Users\Vasilis\Documents\galagoTrials\train_paragraph'
    data['requested'] = 20
    data['processingModel'] = 'org.lemurproject.galago.core.retrieval.processing.RankedDocumentModel'
    data['relevanceModel'] = 'org.lemurproject.galago.core.retrieval.prf.RelevanceModel3'
    data['scorer'] = 'bm25'
    data['queries'] = []
    j = 1
    for p in iter_annotations(open(file, 'rb')):
        data['queries'].append({'number': str(p.page_id), 'text': '#rm:bOrigWt=0.8:fbDocs=10:fbTerm=10(' +
                                remove_stop_word(p.page_name) + ')'})
        j += 1
        print(j)
        for (q, q_id) in [(str(section.heading), str(section.headingId))
                          for (section, children) in p.deep_headings_list()]:
            data['queries'].append({'number': str(p.page_id + '/' + q_id), 'text': '#rm:bOrigWt=0.8:fbDocs=10:fbTerm=10('
                                    + remove_stop_word(p.page_name) + ' ' + remove_stop_word(q) + ')'})
            j += 1
            print(j)
    json.dump(data, wf)
    wf.close()
# Example function calls used to process the cbor files and create the corpora or the query json files
file = r'test200-train\train.pages.cbor'
file = r'train\base.train.cbor-paragraphs.cbor'
#parse_annotations(file)
#make_article_corpus_v2(file)
#make_hierarchical_corpus(file)
#make_toplevel_corpus(file)
file = r'train\base.train.cbor-outlines.cbor'
#create_queries(file)
create_queries_for_expansion(file)
#create_queries_for_expansion_article(file)
#create_queries_for_expansion_toplevel(file)
#parse_paragraphs(file)
#make_big_corpus(file)
file = r'test200-train\train.pages.cbor-paragraphs.cbor'
#parse_paragraphs(file)