# read_data_test.py
from trec_car.read_data import iter_annotations, iter_paragraphs, ParaText, ParaLink
import json
# Data file locations; the test200 files are the small test sample of the TREC CAR dataset.
# path = "../corpus/train/train.fold0.cbor"
articles = "../corpus/train/train.fold0.cbor.article.qrels"
# outlines = "../corpus/train/train.fold0.cbor.outlines"
paragraphs = "test200/train.test200.fold0.cbor.paragraphs"
outlines = "test200/train.pages.cbor-outlines.cbor"


def print_headers(file=outlines):
"""
Print article headings.
Based on: https://github.com/TREMA-UNH/trec-car-tools
"""
with open(file, 'rb') as f:
for p in iter_annotations(f):
print('\npagename:', p.page_name)
            print(p)
            # Get one data structure with nested (heading, [children]) pairs
            headings = p.nested_headings()
print('headings= ', [(str(section.heading), len(children)) for (section, children) in headings])
if len(p.outline()) > 2:
print('heading 1=', p.outline()[0])
print('deep headings= ',
[(str(section.heading), len(children)) for (section, children) in p.deep_headings_list()])
print('flat headings= ', ["/".join([str(section.heading) for section in sectionpath]) for sectionpath in
p.flat_headings_list()])
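
# For illustration (a hypothetical page, not taken from the data): a page with
# top-level sections "History" and "Uses", where "Uses" has a subsection
# "Medicine", would print roughly
#   headings=  [('History', 0), ('Uses', 1)]
#   flat headings=  ['History', 'Uses', 'Uses/Medicine']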


def print_paragraphs(file=paragraphs):
    """
    Print the content of each article's paragraphs.
    Based on: https://github.com/TREMA-UNH/trec-car-tools
    """
i = 0
with open(file, 'rb') as f:
for p in iter_paragraphs(f):
i += 1
print('\n', p.para_id, ':')
# Print just the text
texts = [elem.text if isinstance(elem, ParaText)
else elem.anchor_text
for elem in p.bodies]
print(' '.join(texts))
# Print just the linked entities
entities = [elem.page
for elem in p.bodies
if isinstance(elem, ParaLink)]
print(entities)
# Print text interspersed with links as pairs (text, link)
mixed = [(elem.anchor_text, elem.page) if isinstance(elem, ParaLink)
else (elem.text, None)
for elem in p.bodies]
print(mixed)
    print(i)  # total number of paragraphs read
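
# A minimal sketch of reusing the same API to collect paragraph text keyed by
# paragraph id (the helper name and the `limit` parameter are illustrative,
# not part of the original script):


def collect_paragraph_texts(file=paragraphs, limit=10):
    """Return a {para_id: plain text} dict for the first `limit` paragraphs."""
    texts = {}
    with open(file, 'rb') as f:
        for p in iter_paragraphs(f):
            texts[p.para_id] = p.get_text()
            if len(texts) >= limit:
                break
    return texts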


def create_corpus_galago(file):
    """
    Map the paragraphs from CBOR format to trectext format, suitable for Galago indexing.
    Format based on https://github.com/jiepujiang/cs646_tutorials#installation
    """
    cnt = 0
    file_index = 0
    file_size = 50000  # maximum number of documents per output file
    output_file = 'paragraph_corpus_' + str(file_index) + '.trectext'
    stream = open(output_file, 'wb')
    with open(file, 'rb') as f:
        for p in iter_paragraphs(f):
            stream.write(b"<DOC>\n")
            stream.write(b"<DOCNO>")
            stream.write(p.para_id.encode('utf8'))
            stream.write(b"</DOCNO>\n")
            stream.write(b"<TEXT>\n")
            # We only need the paragraph text here (links are not used)
            stream.write(p.get_text().encode('utf8'))
            stream.write(b"\n</TEXT>\n")
            stream.write(b"</DOC>\n\n")
            cnt += 1
            if cnt >= file_size:
                stream.close()
                print("Filled up file number " + str(file_index) + " with documents.")
                file_index += 1
                cnt = 0
                output_file = 'paragraph_corpus_' + str(file_index) + '.trectext'
                stream = open(output_file, 'wb')
    stream.close()
    print("DONE!")
# create_corpus_galago(paragraphs)
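
# Each paragraph becomes one trectext document, i.e.:
#   <DOC>
#   <DOCNO>{para_id}</DOCNO>
#   <TEXT>
#   {paragraph text}
#   </TEXT>
#   </DOC>
# A sketch of indexing the output with Galago (assuming galago is on the PATH
# and the index location matches the one used in the queries below):
#   galago build --indexPath=../ir_core/index --inputPath=paragraph_corpus_0.trectext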


def create_queries(file):
    """
    Create queries based on the headings hierarchy.
    Nice source: https://www.inf.ed.ac.uk/teaching/courses/tts/handouts2017/galago_tutorial.pdf
    To run the queries, execute (--verbose=true is optional, for more informative output):
    galago/bin/galago batch-search (--verbose=true) PATH_TO_FILE/queries.json
    """
    out = 'queries.json'
    queries = dict()
    queries['index'] = '../ir_core/index'
    # queries['requested'] = 100
    # queries['processingModel'] = 'org.lemurproject.galago.core.retrieval.processing.RankedDocumentModel'
    # queries['scorer'] = 'bm25'
    queries['queries'] = []
    with open(file, 'rb') as f:
        for p in iter_annotations(f):
            # One query per page title
            queries['queries'].append({'number': str(p.page_id), 'text': p.page_name})
            # queries['queries'].append({'number': str(p.page_id), 'text': '#combine(' + p.page_name + ')'})
            flattened_heading_list = p.flat_headings_list()
            # One query per heading path: the id joins the headingIds with '/',
            # the text is the page name followed by the headings on the path
            for query, query_id in [((" ".join([str(headings.heading) for headings in heading_path])),
                                     "/".join([str(headings.headingId) for headings in heading_path]))
                                    for heading_path in flattened_heading_list]:
                queries['queries'].append({'number': str(p.page_id + '/' + query_id),
                                           'text': p.page_name + ' ' + query})
                # 'text': '#combine(' + p.page_name + ' ' + query + ')'})
    with open(out, 'w') as out_stream:
        json.dump(queries, out_stream)
    print("Done creating queries.")
# create_queries(outlines)
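
# The resulting queries.json has roughly this shape (ids and text depend on the
# outlines file):
#   {"index": "../ir_core/index",
#    "queries": [{"number": "<page_id>", "text": "<page name>"},
#                {"number": "<page_id>/<headingId>/...",
#                 "text": "<page name> <heading> <subheading> ..."}, ...]}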


def create_queries_relevance(file, rm_version=1):
    """
    Create queries based on the headings hierarchy, for querying with pseudo-relevance feedback.
    Nice source: https://www.inf.ed.ac.uk/teaching/courses/tts/handouts2017/galago_tutorial.pdf
    To run the queries, execute (--verbose=true is optional, for more informative output):
    galago/bin/galago batch-search (--verbose=true) PATH_TO_FILE/queries_relevance.json
    """
    out = 'queries_relevance_test.json'
    queries = dict()
    queries['index'] = '../ir_core/index'
    queries['requested'] = 100
    queries['processingModel'] = 'org.lemurproject.galago.core.retrieval.processing.RankedDocumentModel'
    queries['relevanceModel'] = 'org.lemurproject.galago.core.retrieval.prf.RelevanceModel' + str(rm_version)
    # queries['fbDocs'] = 10
    # queries['fbTerm'] = 5
    # queries['fbOrigWeight'] = 0.75
    # queries['scorer'] = 'bm25'
    queries['queries'] = []
    with open(file, 'rb') as f:
        for p in iter_annotations(f):
            # Wrap each query in Galago's #rm operator to trigger relevance-model expansion
            queries['queries'].append({'number': str(p.page_id), 'text': '#rm(' + p.page_name + ')'})
            flattened_heading_list = p.flat_headings_list()
            for query, query_id in [((" ".join([str(headings.heading) for headings in heading_path])),
                                     "/".join([str(headings.headingId) for headings in heading_path]))
                                    for heading_path in flattened_heading_list]:
                queries['queries'].append({'number': str(p.page_id + '/' + query_id),
                                           'text': '#rm(' + p.page_name + ' ' + query + ')'})
    with open(out, 'w') as out_stream:
        json.dump(queries, out_stream)
    print("Done creating queries for relevance model.")
# create_queries_relevance(outlines)


if __name__ == '__main__':
    create_queries(outlines)