# test_queries.py
import pickle
import numpy as np
import pandas as pd
import spacy
import string
import pkg_resources
from symspellpy import SymSpell, Verbosity
nlp = spacy.load('en_core_web_sm')
sym_spell = SymSpell(max_dictionary_edit_distance=2, prefix_length=7)
dictionary_path = pkg_resources.resource_filename("symspellpy", "frequency_dictionary_en_82_765.txt")
sym_spell.load_dictionary(dictionary_path, term_index=0, count_index=1)
def read_query():
    '''
    :returns:
        query read from the terminal
    '''
    q = input('Enter query: ')
    return q
def load_files(bi=False):
    '''
    :param bi: whether to also load the bigram doc lengths
    :returns:
        inverted index, unigram doc lengths, doc-id-to-title map, [bigram doc lengths]
    '''
    with open('inv_index.pkl', 'rb') as f:
        index = pickle.load(f)
    with open('doc_lengths.pkl', 'rb') as f:
        doc_lengths = pickle.load(f)
    if bi:
        with open('doc_bi_lengths.pkl', 'rb') as f:
            doc_bi_lengths = pickle.load(f)
    with open('doc_id_2_title.pkl', 'rb') as f:
        doc_id_2_title = pickle.load(f)
    if bi:
        return [index, doc_lengths, doc_id_2_title, doc_bi_lengths]
    return [index, doc_lengths, doc_id_2_title]
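# Sketch of the pickled structures as this script uses them (inferred from the code
# below, not from the indexing script itself):
#   inv_index.pkl       maps term (or bigram tuple) -> {doc_id: raw term frequency}
#   doc_lengths.pkl     maps doc_id -> a document-length value whose square root is
#                       used for normalisation in retrieve_documents
#   doc_id_2_title.pkl  maps doc_id -> document title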
def spell_correct(vocab, raw):
    '''
    :param vocab: vocabulary to check against (the inverted index)
    :param raw: string to be checked and corrected for spelling
    :returns:
        corrected string
    '''
    raw = raw.strip().lower().translate(str.maketrans('', '', string.punctuation))
    raw_doc = nlp(raw)
    corrected_list = []
    for tok in raw_doc:
        word = tok.text
        if word in vocab:
            # the word is already in the vocabulary, so leave it unchanged
            corrected_list.append(word)
        else:
            suggestions = sym_spell.lookup(word, Verbosity.CLOSEST, max_edit_distance=2)
            # take the best suggestion if one is found, otherwise keep the original word
            suggestion = suggestions[0].term if suggestions else word
            corrected_list.append(suggestion)
    return ' '.join(corrected_list)
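# Illustrative usage with a hypothetical query (not taken from the collection):
# assuming 'information' is in the vocabulary and 'retreival' is not, SymSpell's
# closest suggestion within edit distance 2 should replace the misspelled term:
#   spell_correct(index, 'information retreival')  ->  'information retrieval'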
def retrieve_documents(q, files, bi=False):
    '''
    :param q: query to be searched
    :param files: list of files read from disk
    :param bi: whether to search over bigrams of the query
    :returns:
        dictionary of (doc_id, score) pairs sorted by descending score
    '''
    index, doc_lengths, doc_id_2_title = files[0], files[1], files[2]
    query = q.strip().lower().translate(str.maketrans('', '', string.punctuation))
    query_words = query.split()
    if bi:
        # turn the query into consecutive word pairs and switch to the bigram doc lengths
        query_words = [(query_words[i], query_words[i + 1]) for i in range(len(query_words) - 1)]
        doc_lengths = files[3]
    # store query terms with their frequencies
    query_dict = {}
    for w in query_words:
        query_dict[w] = query_dict.get(w, 0) + 1
    N = len(doc_lengths)
    score_dict = {}
    for t, f in query_dict.items():
        if t not in index:
            # this query word/bigram does not occur in the vocabulary
            continue
        posting = index[t]
        # tf value for the query term
        tfq = 1 + np.log10(f)
        # tf-idf weight wrt the query (ltc scheme)
        wtq = tfq * np.log10(N / len(posting))
        for doc_id, tf in posting.items():
            # tf weight wrt the document (lnc scheme)
            tfd = 1 + np.log10(tf)
            score_dict[doc_id] = score_dict.get(doc_id, 0) + wtq * tfd
    for doc_id, score in score_dict.items():
        # normalise by the document length
        score_dict[doc_id] = score / np.sqrt(doc_lengths[doc_id])
    sorted_scores = {k: v for k, v in sorted(score_dict.items(), key=lambda x: -x[1])}
    return sorted_scores
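# Worked example of the lnc.ltc scoring above, with made-up numbers: suppose the
# collection has N = 100 documents and a query term appears once in the query (f = 1)
# and in 10 documents (len(posting) = 10). Then
#   tfq = 1 + log10(1) = 1,   wtq = 1 * log10(100 / 10) = 1
# For a document where that term's raw frequency is 10, tfd = 1 + log10(10) = 2, so the
# term contributes wtq * tfd = 2 to the document's score before length normalisation.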
def retrieve_spell(q, files):
    '''
    :param q: query to be searched
    :param files: list of files read from disk
    :returns:
        dictionary of (doc_id, score) pairs sorted by descending score, combining the
        results of the original query and the spell-corrected query
    '''
    docs_before_correction = retrieve_documents(q, files)
    corrected_q = spell_correct(files[0], q)
    docs_after_correction = retrieve_documents(corrected_q, files)
    top_docs = doc_join(docs_before_correction, docs_after_correction)
    return top_docs
def retrieve_bi(q, files):
    '''
    :param q: query to be searched
    :param files: list of files read from disk
    :returns:
        dictionary of (doc_id, score) pairs sorted by descending score, combining the
        results of the unigram query and the bigram query
    '''
    docs_uni = retrieve_documents(q, files)
    docs_bi = retrieve_documents(q, files, bi=True)
    top_docs = doc_join(docs_uni, docs_bi, bi=True)
    return top_docs
def doc_join(doc1, doc2, bi=False):
    '''
    :param doc1: dictionary of scores for the original query
    :param doc2: dictionary of scores for the transformed query
    :param bi: if True, combine overlapping scores by adding them; otherwise take the maximum
    :returns:
        final dictionary of scores after merging the two input dictionaries
    '''
    # copy so the caller's dictionary is not mutated
    final_scores = dict(doc1)
    for doc_id, score in doc2.items():
        if doc_id in final_scores:
            if bi:
                final_scores[doc_id] = final_scores[doc_id] + score
            else:
                final_scores[doc_id] = max(final_scores[doc_id], score)
        else:
            final_scores[doc_id] = score
    # sort by descending score, breaking ties on doc_id
    final_scores = {k: v for k, v in sorted(final_scores.items(), key=lambda kv: (-kv[1], kv[0]))}
    return final_scores
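# Small illustration with made-up scores: for doc1 = {7: 0.5} and doc2 = {7: 0.8, 9: 0.3},
#   doc_join(doc1, doc2)           -> {7: 0.8, 9: 0.3}   (max of overlapping scores)
#   doc_join(doc1, doc2, bi=True)  -> {7: 1.3, 9: 0.3}   (sum of overlapping scores)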
def main():
    q = read_query()
    inp = input('Enter 1 for Part1\nEnter 2 for Part2, Improvement1\nEnter 3 for Part2, Improvement2\nResponse: ')
    k = 10
    if inp in ['1', '2']:
        files = load_files()
    else:
        files = load_files(bi=True)
    if inp == '1':
        top_docs = retrieve_documents(q, files)
    elif inp == '2':
        top_docs = retrieve_spell(q, files)
    elif inp == '3':
        top_docs = retrieve_bi(q, files)
    else:
        print('This option is not supported.')
        return
    # keep only the top k documents
    top_k_docs = list(top_docs.items())[:k]
    if len(top_k_docs) == 0:
        print('No relevant documents found/query terms do not exist in vocabulary')
        return
    docs = [files[2][doc_id] for doc_id, _ in top_k_docs]
    scores = [score for _, score in top_k_docs]
    df = pd.DataFrame({'Document': docs, 'Score': scores})
    print(df)


if __name__ == '__main__':
    main()